79102 – gcc fails to auto-vectorise the multiplicative reduction of an array of complex floats

Bug 79102 - gcc fails to auto-vectorise the multiplicative reduction of an array of complex floats

Summary: gcc fails to auto-vectorise the multiplicative reduction of an array of complex floats

Status:	NEW

Alias:	None

Product:	gcc
Classification:	Unclassified
Component:	tree-optimization (show other bugs)
Version:	7.0

Importance:	P3 normal
Target Milestone:	---
Assignee:	Not yet assigned to anyone

URL:
Keywords:	missed-optimization

Depends on:
Blocks:	vectorizer
	Show dependency tree / graph

Reported:	2017-01-16 14:10 UTC by Raphael C
Modified:	2023-07-27 13:48 UTC (History)
CC List:	1 user (show)

See Also:
Host:
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed:	2023-07-27 00:00:00

Attachments
Add an attachment (proposed patch, testcase, etc.)

Note: You need to log in before you can comment on or make changes to this bug.

Description Raphael C 2017-01-16 14:10:44 UTC

Consider this simple piece of code.

#include <complex.h>
complex float f(complex float x[]) {
  complex float p = 1.0;
  for (int i = 0; i < 128; i++)
    p *= x[i];
  return p;
}

If I compile it with -O3 -march=bdver2 -ffast-math, I get

f:
        vmovss  xmm2, DWORD PTR .LC1[rip]
        vxorps  xmm1, xmm1, xmm1
        lea     rax, [rdi+256]
.L2:
        vmovss  xmm0, DWORD PTR [rdi+4]
        add     rdi, 8
        vmulss  xmm3, xmm0, xmm2
        vmulss  xmm0, xmm0, xmm1
        vfmadd132ss     xmm1, xmm3, DWORD PTR [rdi-8]
        vfmsub132ss     xmm2, xmm0, DWORD PTR [rdi-8]
        cmp     rax, rdi
        jne     .L2
        vmovss  DWORD PTR [rsp-8], xmm2
        vmovss  DWORD PTR [rsp-4], xmm1
        vmovq   xmm0, QWORD PTR [rsp-8]
        ret
.LC1:
        .long   1065353216


This is unvectorised code. However, if I do the same using float instead, that is, with:

float f(float x[], int n ) {
  float p = 1.0;
  for (int i = 0; i < 32; i++)
    p *= x[i];
  return p;
}

I get

        vmovups xmm2, XMMWORD PTR [rdi]
        vmulps  xmm0, xmm2, XMMWORD PTR [rdi+16]
        vmulps  xmm0, xmm0, XMMWORD PTR [rdi+32]
        vmulps  xmm0, xmm0, XMMWORD PTR [rdi+48]
        vmulps  xmm0, xmm0, XMMWORD PTR [rdi+64]
        vmulps  xmm0, xmm0, XMMWORD PTR [rdi+80]
        vmulps  xmm0, xmm0, XMMWORD PTR [rdi+96]
        vmulps  xmm0, xmm0, XMMWORD PTR [rdi+112]
        vpsrldq xmm1, xmm0, 8
        vmulps  xmm0, xmm0, xmm1
        vpsrldq xmm1, xmm0, 4
        vmulps  xmm0, xmm0, xmm1
        ret

This is vectorised.

As a test I also tried the Intel C compiler version 17. In this case, however, the assembly you get using complex float is vectorised, giving:

f:
        mov       rdx, rdi                                      #4.3
        and       rdx, 15                                       #4.3
        movsd     xmm0, QWORD PTR p.152.0.0.1[rip]              #3.19
        test      dl, dl                                        #4.3
        je        ..B1.4        # Prob 50%                      #4.3
        test      dl, 7                                         #4.3
        jne       ..B1.12       # Prob 10%                      #4.3
        movsd     xmm0, QWORD PTR [rdi]                         #5.10
        mov       dl, 1                                         #4.3
..B1.4:                         # Preds ..B1.3 ..B1.1
        movzx     eax, dl                                       #4.3
        neg       dl                                            #4.3
        and       dl, 3                                         #4.3
        movzx     edx, dl                                       #4.3
        movss     xmm1, DWORD PTR .L_2il0floatpacket.0[rip]     #3.19
        neg       rdx                                           #4.3
        movlhps   xmm0, xmm1                                    #3.19
        add       rdx, 128                                      #4.3
..B1.5:                         # Preds ..B1.5 ..B1.4
        movaps    xmm2, xmm0                                    #5.5
        movups    xmm1, XMMWORD PTR [rdi+rax*8]                 #5.10
        shufps    xmm2, xmm0, 160                               #5.5
        mulps     xmm2, xmm1                                    #5.5
        xorps     xmm1, XMMWORD PTR .L_2il0floatpacket.1[rip]   #5.5
        shufps    xmm1, xmm1, 177                               #5.5
        shufps    xmm0, xmm0, 245                               #5.5
        mulps     xmm1, xmm0                                    #5.5
        movups    xmm3, XMMWORD PTR [16+rdi+rax*8]              #5.10
        add       rax, 4                                        #4.3
        addps     xmm2, xmm1                                    #5.5
        movaps    xmm0, xmm2                                    #5.5
        shufps    xmm0, xmm2, 160                               #5.5
        mulps     xmm0, xmm3                                    #5.5
        xorps     xmm3, XMMWORD PTR .L_2il0floatpacket.1[rip]   #5.5
        shufps    xmm3, xmm3, 177                               #5.5
        shufps    xmm2, xmm2, 245                               #5.5
        mulps     xmm3, xmm2                                    #5.5
        addps     xmm0, xmm3                                    #5.5
        cmp       rax, rdx                                      #4.3
        jb        ..B1.5        # Prob 99%                      #4.3
        movaps    xmm1, xmm0                                    #3.19
        movhlps   xmm1, xmm0                                    #3.19
        movaps    xmm2, xmm1                                    #3.19
        shufps    xmm2, xmm1, 160                               #3.19
        mulps     xmm2, xmm0                                    #3.19
        xorps     xmm0, XMMWORD PTR .L_2il0floatpacket.1[rip]   #3.19
        shufps    xmm0, xmm0, 177                               #3.19
        shufps    xmm1, xmm1, 245                               #3.19
        mulps     xmm0, xmm1                                    #3.19
        addps     xmm0, xmm2                                    #3.19
..B1.7:                         # Preds ..B1.6 ..B1.12
        cmp       rdx, 128                                      #4.3
        jae       ..B1.11       # Prob 0%                       #4.3
..B1.9:                         # Preds ..B1.7 ..B1.9
        movsd     xmm1, QWORD PTR [rdi+rdx*8]                   #5.10
        inc       rdx                                           #4.3
        movaps    xmm2, xmm1                                    #5.5
        shufps    xmm2, xmm1, 160                               #5.5
        mulps     xmm2, xmm0                                    #5.5
        xorps     xmm0, XMMWORD PTR .L_2il0floatpacket.1[rip]   #5.5
        shufps    xmm0, xmm0, 177                               #5.5
        shufps    xmm1, xmm1, 245                               #5.5
        mulps     xmm0, xmm1                                    #5.5
        addps     xmm0, xmm2                                    #5.5
        cmp       rdx, 128                                      #4.3
        jb        ..B1.9        # Prob 99%                      #4.3
..B1.11:                        # Preds ..B1.9 ..B1.7
        ret                                                     #6.10
..B1.12:                        # Preds ..B1.2
        xor       edx, edx                                      #4.3
        jmp       ..B1.7        # Prob 100%                     #4.3
p.152.0.0.1:
        .long   0x3f800000,0x00000000
.L_2il0floatpacket.1:
        .long   0x00000000,0x80000000,0x00000000,0x80000000
.L_2il0floatpacket.0:
        .long   0x3f800000

Comment 1 Richard Biener 2017-01-17 09:07:37 UTC

The issue is we do not support reduction of _Complex vars.  The vectorizer sees

  <bb 3> [99.00%]:
  # i_16 = PHI <i_11(4), 0(2)>
  # p$real_13 = PHI <_21(4), 1.0e+0(2)>
  # p$imag_14 = PHI <_22(4), 0.0(2)>
  # ivtmp_48 = PHI <ivtmp_47(4), 128(2)>
  _1 = (long unsigned int) i_16;
  _2 = _1 * 8;
  _3 = x_9(D) + _2;
  _7 = REALPART_EXPR <*_3>;
  _12 = IMAGPART_EXPR <*_3>;
  _17 = _7 * p$real_13;
  _18 = _12 * p$imag_14;
  _19 = _7 * p$imag_14;
  _20 = _12 * p$real_13;
  _21 = _17 - _18;
  _22 = _19 + _20;
  i_11 = i_16 + 1;
  ivtmp_47 = ivtmp_48 - 1;
  if (ivtmp_47 != 0)
    goto <bb 4>; [98.99%]
  else
    goto <bb 5>; [1.01%]

  <bb 4> [98.00%]:
  goto <bb 3>; [100.00%]

  <bb 5> [1.00%]:
  # _50 = PHI <_21(3)>
  # _49 = PHI <_22(3)>
  p_10 = COMPLEX_EXPR <_50, _49>;
  return p_10;

which would need to be detected as a single reduction with two components.  I guess not lowering complex operations would help here (with its own complications, of course).  Not sinking the COMPLEX_EXPR and having it as a loop-carried dependence might help as well.