[Bug target/79722] New: Missed opportunity for fused multiply/add with avx2

tkoenig at gcc dot gnu.org gcc-bugzilla@gcc.gnu.org
Mon Feb 27 11:54:00 GMT 2017


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79722

            Bug ID: 79722
           Summary: Missed opportunity for fused multiply/add with avx2
           Product: gcc
           Version: unknown
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: tkoenig at gcc dot gnu.org
  Target Milestone: ---

Created attachment 40835
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=40835&action=edit
Output of gcc -Ofast -mavx2 -S -o bar-gcc.s bar.c

The test case is the same as PR 79709:

/* 32-byte vector types built with the GCC vector_size attribute; the test
   case below only touches lanes 0..3 of each.  */
typedef double v4do __attribute__((vector_size (32)));
typedef long int v4i __attribute__((vector_size (32)));

/* Store VAL into lanes 0..3 of VECT.  VAL is evaluated once per lane, so it
   should be free of side effects.  The body is wrapped in do/while (0) so the
   macro expands to a single statement; every line of the definition needs a
   trailing backslash, otherwise the closing "} while (0)" ends up outside
   the macro.  */
#define VSET(vect,val) do { \
    (vect)[0] = (val); \
    (vect)[1] = (val); \
    (vect)[2] = (val); \
    (vect)[3] = (val); \
  } while (0)
/* Reproducer for the missed fused multiply/add.

   Starting from x = CX and y = CY, repeatedly computes, lane by lane,
       xn = x*x - y*y + cx
       yn = 2*x*y + cy
   accumulates the per-iteration comparison mask into RES, and stores RES
   through R once the loop exits.  The multiply-then-add/subtract chains in
   the loop body are the expressions of interest for FMA generation.  */
void foo(v4do cx, v4do cy, v4i *r)
{
  v4do x, y, xn, yn;
  v4i add, res;
  v4do two, four;
  /* NOTE(review): `done' is assigned below but never read.  */
  long int done;

  /* Broadcast the constants into all four lanes.  */
  VSET(res, 0L);
  VSET(two, 2.0);
  VSET(four, 4.0);
  x = cx;
  y = cy;
  done = 0;
  while (1)
    {
      xn = x*x - y*y + cx;
      yn = two*x*y + cy;
      /* Parsed as (xn+xn + yn*yn) < four.  With GCC vector extensions an
         element-wise comparison yields -1 in lanes where it holds and 0
         elsewhere.
         NOTE(review): the first term is xn+xn, not xn*xn -- confirm whether
         that is intended; the quoted compiler listings correspond to the
         code exactly as written here.  */
      add = xn+xn + yn*yn < four;
      /* Adding the mask changes a lane by -1 each time its comparison
         held.  */
      res += add;
      /* Leave the loop when lane 0 or lane 1 of the mask is zero, or when
         lane 2 or lane 3 is non-zero.
         NOTE(review): lanes 2 and 3 are tested with the opposite sense to
         lanes 0 and 1 -- confirm whether `== 0' was meant for all four.  */
      if (add[0] == 0 || add[1] == 0 || add[2] || add[3])
        break;
      x = xn;
      y = yn;
    }
  *r = res;
}

With gcc, the inner loop is translated into

.L13:
        vpextrq $1, %xmm2, %rax
        testq   %rax, %rax
        je      .L2
        vextracti128    $0x1, %ymm2, %xmm2
        vmovq   %xmm2, %rax
        testq   %rax, %rax
        jne     .L2
        vpextrq $1, %xmm2, %rax
        vmovapd %ymm4, %ymm3
        testq   %rax, %rax
        jne     .L2
.L3:
        vmulpd  %ymm3, %ymm3, %ymm4
        vmulpd  %ymm9, %ymm3, %ymm3
        vaddpd  %ymm0, %ymm4, %ymm4
        vmulpd  %ymm7, %ymm3, %ymm3
        vsubpd  %ymm10, %ymm4, %ymm4
        vaddpd  %ymm1, %ymm3, %ymm9
        vaddpd  %ymm4, %ymm4, %ymm2
        vmulpd  %ymm9, %ymm9, %ymm10
        vaddpd  %ymm10, %ymm2, %ymm2
        vcmpltpd        %ymm6, %ymm2, %ymm2
        vmovq   %xmm2, %rax
        vpaddq  %ymm2, %ymm8, %ymm8
        testq   %rax, %rax
        jne     .L13

icc -O3 -march=core-avx2 -S results in

..B1.6:                         # Preds ..B1.5
                                # Execution count [6.94e-01]
        vmovdqa   %ymm11, %ymm5                                 #27.7
                                # LOE rbx rdi r12 r13 r14 r15 ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm7
..B1.2:                         # Preds ..B1.6 ..B1.1
                                # Execution count [1.69e+00]
        vmovaps   %ymm4, %ymm11                                 #21.24
        vfmsub213pd %ymm6, %ymm4, %ymm11                        #21.24
        vfmsub231pd %ymm5, %ymm5, %ymm11                        #21.24
        vmulpd    %ymm3, %ymm5, %ymm5                           #22.16
        vaddpd    %ymm11, %ymm11, %ymm8                         #23.16
        vfmadd213pd %ymm7, %ymm5, %ymm4                         #22.22
        vfmadd231pd %ymm4, %ymm4, %ymm8                         #23.24
        vcmpltpd  %ymm2, %ymm8, %ymm9                           #23.29
        vandpd    %ymm9, %ymm1, %ymm10                          #23.29
        vmovups   %ymm10, -64(%rsp)                             #23.7
        vpaddq    %ymm10, %ymm0, %ymm0                          #24.7
        cmpq      $0, -64(%rsp)                                 #25.21
        je        ..B1.7        # Prob 20%                      #25.21
                                # LOE rbx rdi r12 r13 r14 r15 ymm0 ymm1 ymm2 ymm3 ymm4 ymm6 ymm7 ymm11
..B1.3:                         # Preds ..B1.2
                                # Execution count [1.36e+00]
        cmpq      $0, -56(%rsp)                                 #25.36
        je        ..B1.7        # Prob 20%                      #25.36
                                # LOE rbx rdi r12 r13 r14 r15 ymm0 ymm1 ymm2 ymm3 ymm4 ymm6 ymm7 ymm11
..B1.4:                         # Preds ..B1.3
                                # Execution count [1.08e+00]
        cmpq      $0, -48(%rsp)                                 #25.45
        jne       ..B1.7        # Prob 20%                      #25.45
                                # LOE rbx rdi r12 r13 r14 r15 ymm0 ymm1 ymm2 ymm3 ymm4 ymm6 ymm7 ymm11
..B1.5:                         # Preds ..B1.4
                                # Execution count [8.67e-01]
        cmpq      $0, -40(%rsp)                                 #25.55
        je        ..B1.6        # Prob 80%                      #25.55
                                # LOE rbx rdi r12 r13 r14 r15 ymm0 ymm1 ymm2 ymm3 ymm4 ymm6 ymm7 ymm11

where icc uses eight double precision floating point operations
vs. gcc's ten.


More information about the Gcc-bugs mailing list