[PATCH] Fix PR80846, change vectorizer reduction epilogue (on x86)

Richard Biener <rguenther@suse.de>
Wed Jan 10 08:23:00 GMT 2018


On Tue, 9 Jan 2018, Jeff Law wrote:

> On 01/05/2018 02:01 AM, Richard Biener wrote:
> > On Tue, 28 Nov 2017, Richard Biener wrote:
> > 
> >>
> >> The following adds a new target hook, targetm.vectorize.split_reduction,
> >> which allows the target to specify a preferred mode to perform the
> >> final reduction in, using either vector shifts or scalar extractions.
> >> Up to that mode the vector reduction result is reduced by combining
> >> lowparts and highparts recursively.  This avoids lane-crossing operations
> >> when doing AVX256 on Zen and Bulldozer and also speeds up things on
> >> Haswell (I verified ~20% speedup on Broadwell).
> >>
> >> Thus the patch implements the target hook on x86 to _always_ prefer
> >> SSE modes for the final reduction.
> >>
> >> For the testcase in the bugzilla
> >>
> >> int sumint(const int arr[]) {
> >>     arr = __builtin_assume_aligned(arr, 64);
> >>     int sum=0;
> >>     for (int i=0 ; i<1024 ; i++)
> >>       sum+=arr[i];
> >>     return sum;
> >> }
> >>
> >> this changes -O3 -mavx512f code from
> >>
> >> sumint:
> >> .LFB0:
> >>         .cfi_startproc
> >>         vpxord  %zmm0, %zmm0, %zmm0
> >>         leaq    4096(%rdi), %rax
> >>         .p2align 4,,10
> >>         .p2align 3
> >> .L2:
> >>         vpaddd  (%rdi), %zmm0, %zmm0
> >>         addq    $64, %rdi
> >>         cmpq    %rdi, %rax
> >>         jne     .L2
> >>         vpxord  %zmm1, %zmm1, %zmm1
> >>         vshufi32x4      $78, %zmm1, %zmm0, %zmm2
> >>         vpaddd  %zmm2, %zmm0, %zmm0
> >>         vmovdqa64       .LC0(%rip), %zmm2
> >>         vpermi2d        %zmm1, %zmm0, %zmm2
> >>         vpaddd  %zmm2, %zmm0, %zmm0
> >>         vmovdqa64       .LC1(%rip), %zmm2
> >>         vpermi2d        %zmm1, %zmm0, %zmm2
> >>         vpaddd  %zmm2, %zmm0, %zmm0
> >>         vmovdqa64       .LC2(%rip), %zmm2
> >>         vpermi2d        %zmm1, %zmm0, %zmm2
> >>         vpaddd  %zmm2, %zmm0, %zmm0
> >>         vmovd   %xmm0, %eax
> >>
> >> to
> >>
> >> sumint:
> >> .LFB0:
> >>         .cfi_startproc
> >>         vpxord  %zmm0, %zmm0, %zmm0
> >>         leaq    4096(%rdi), %rax
> >>         .p2align 4,,10
> >>         .p2align 3
> >> .L2:
> >>         vpaddd  (%rdi), %zmm0, %zmm0
> >>         addq    $64, %rdi
> >>         cmpq    %rdi, %rax
> >>         jne     .L2
> >>         vextracti64x4   $0x1, %zmm0, %ymm1
> >>         vpaddd  %ymm0, %ymm1, %ymm1
> >>         vmovdqa %xmm1, %xmm0
> >>         vextracti128    $1, %ymm1, %xmm1
> >>         vpaddd  %xmm1, %xmm0, %xmm0
> >>         vpsrldq $8, %xmm0, %xmm1
> >>         vpaddd  %xmm1, %xmm0, %xmm0
> >>         vpsrldq $4, %xmm0, %xmm1
> >>         vpaddd  %xmm1, %xmm0, %xmm0
> >>         vmovd   %xmm0, %eax
> >>
> >> and for -O3 -mavx2 from
> >>
> >> sumint:
> >> .LFB0:
> >>         .cfi_startproc
> >>         vpxor   %xmm0, %xmm0, %xmm0
> >>         leaq    4096(%rdi), %rax
> >>         .p2align 4,,10
> >>         .p2align 3
> >> .L2:
> >>         vpaddd  (%rdi), %ymm0, %ymm0
> >>         addq    $32, %rdi
> >>         cmpq    %rdi, %rax
> >>         jne     .L2
> >>         vpxor   %xmm1, %xmm1, %xmm1
> >>         vperm2i128      $33, %ymm1, %ymm0, %ymm2
> >>         vpaddd  %ymm2, %ymm0, %ymm0
> >>         vperm2i128      $33, %ymm1, %ymm0, %ymm2
> >>         vpalignr        $8, %ymm0, %ymm2, %ymm2
> >>         vpaddd  %ymm2, %ymm0, %ymm0
> >>         vperm2i128      $33, %ymm1, %ymm0, %ymm1
> >>         vpalignr        $4, %ymm0, %ymm1, %ymm1
> >>         vpaddd  %ymm1, %ymm0, %ymm0
> >>         vmovd   %xmm0, %eax
> >>
> >> to
> >>
> >> sumint:
> >> .LFB0:
> >>         .cfi_startproc
> >>         vpxor   %xmm0, %xmm0, %xmm0
> >>         leaq    4096(%rdi), %rax
> >>         .p2align 4,,10
> >>         .p2align 3
> >> .L2:
> >>         vpaddd  (%rdi), %ymm0, %ymm0
> >>         addq    $32, %rdi
> >>         cmpq    %rdi, %rax
> >>         jne     .L2
> >>         vmovdqa %xmm0, %xmm1
> >>         vextracti128    $1, %ymm0, %xmm0
> >>         vpaddd  %xmm0, %xmm1, %xmm0
> >>         vpsrldq $8, %xmm0, %xmm1
> >>         vpaddd  %xmm1, %xmm0, %xmm0
> >>         vpsrldq $4, %xmm0, %xmm1
> >>         vpaddd  %xmm1, %xmm0, %xmm0
> >>         vmovd   %xmm0, %eax
> >>         vzeroupper
> >>         ret
> >>
> >> which besides being faster is also smaller (fewer prefixes).
> >>
> >> SPEC 2k6 results on Haswell (thus AVX2) are neutral.  As it merely
> >> affects reduction vectorization epilogues I didn't expect big effects,
> >> except for loops that do not run many iterations (more likely with
> >> AVX512).
> >>
> >> Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
> >>
> >> Ok for trunk?
> > 
> > Ping?
> > 
> > Richard.
> > 
> >> The PR mentions some more tricks to optimize the sequence but
> >> those look like backend-only optimizations.
> >>
> >> Thanks,
> >> Richard.
> >>
> >> 2017-11-28  Richard Biener  <rguenther@suse.de>
> >>
> >> 	PR tree-optimization/80846
> >> 	* target.def (split_reduction): New target hook.
> >> 	* targhooks.c (default_split_reduction): New function.
> >> 	* targhooks.h (default_split_reduction): Declare.
> >> 	* tree-vect-loop.c (vect_create_epilog_for_reduction): If the
> >> 	target requests first reduce vectors by combining low and high
> >> 	parts.
> >> 	* tree-vect-stmts.c (vect_gen_perm_mask_any): Adjust.
> >> 	(get_vectype_for_scalar_type_and_size): Export.
> >> 	* tree-vectorizer.h (get_vectype_for_scalar_type_and_size): Declare.
> >>
> >> 	* doc/tm.texi.in (TARGET_VECTORIZE_SPLIT_REDUCTION): Document.
> >> 	* doc/tm.texi: Regenerate.
> >>
> >> 	i386/
> >> 	* config/i386/i386.c (ix86_split_reduction): Implement
> >> 	TARGET_VECTORIZE_SPLIT_REDUCTION.
> >>
> >> 	* gcc.target/i386/pr80846-1.c: New testcase.
> >> 	* gcc.target/i386/pr80846-2.c: Likewise.
> I've got no objections here and you know this code far better than I.

I was really looking for an x86 maintainer ack for the target hook
implementation, which I quote here again for reference:

+/* All CPUs prefer to avoid cross-lane operations, so reduce upper against
+   lower halves down to SSE register size.  */
+
+static machine_mode
+ix86_split_reduction (machine_mode mode)
+{
+  /* Reduce lowpart against highpart until we reach SSE reg width to
+     avoid cross-lane operations.  */
+  switch (mode)
+    {
+    case E_V16SImode:
+    case E_V8SImode:
+      return V4SImode;
+    case E_V32HImode:
+    case E_V16HImode:
+      return V8HImode;
+    case E_V64QImode:
+    case E_V32QImode:
+      return V16QImode;
+    case E_V16SFmode:
+    case E_V8SFmode:
+      return V4SFmode;
+    case E_V8DFmode:
+    case E_V4DFmode:
+      return V2DFmode;
+    default:
+      return mode;
+    }
+}
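
For completeness (not part of the hunk quoted above), the remaining plumbing
implied by the ChangeLog is presumably the usual target-hook boilerplate: a
default in targhooks.c that declines to split by returning the input mode,
plus the hook registration in i386.c.  A sketch, under that assumption:

/* targhooks.c -- presumed default: do not split, keep performing the
   epilogue reduction in the original vector mode.  */
machine_mode
default_split_reduction (machine_mode mode)
{
  return mode;
}

/* i386.c -- registering the hook in the usual way.  */
#undef TARGET_VECTORIZE_SPLIT_REDUCTION
#define TARGET_VECTORIZE_SPLIT_REDUCTION ix86_split_reduction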

The quoted switch means we'll do [zmm -> ymm] -> xmm (it looks like I forgot
the VnDImode cases there; consider them added).
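
To make the halving scheme concrete, here is a minimal standalone scalar
model (mine, not code from the patch) of the new epilogue for a V16SImode
accumulator: add the high half to the low half until SSE width (four ints)
is reached, which models the vextracti64x4/vextracti128 + vpaddd steps, and
then finish with the usual shift-style reduction (the vpsrldq/vpaddd tail):

#include <stdio.h>

/* Scalar model of the split-reduction epilogue for 16 ints (V16SImode).  */
static int
reduce_v16si_model (const int v[16])
{
  int tmp[16];
  for (int i = 0; i < 16; i++)
    tmp[i] = v[i];

  /* zmm -> ymm -> xmm: add the highpart to the lowpart recursively
     until we are down to SSE width (4 ints), avoiding cross-lane ops.  */
  for (int width = 16; width > 4; width /= 2)
    for (int i = 0; i < width / 2; i++)
      tmp[i] += tmp[i + width / 2];

  /* Final in-register reduction at SSE width.  */
  for (int width = 4; width > 1; width /= 2)
    for (int i = 0; i < width / 2; i++)
      tmp[i] += tmp[i + width / 2];

  return tmp[0];
}

int
main (void)
{
  int v[16];
  for (int i = 0; i < 16; i++)
    v[i] = i + 1;
  /* 1 + 2 + ... + 16 = 136.  */
  printf ("%d\n", reduce_v16si_model (v));
  return 0;
}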

Richard.


