[Bug target/94298] New: x86 duplicates loads

Tue Mar 24 10:17:34 GMT 2020

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94298

            Bug ID: 94298
           Summary: x86 duplicates loads
           Product: gcc
           Version: 10.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: rguenth at gcc dot gnu.org
  Target Milestone: ---

For the following testcase, compiled with -O3 -fgimple (it is a GIMPLE testcase
because the vectorizer-generated code depends on patches that are not yet
committed), we somehow duplicate the load from y:

typedef double v2df __attribute__((vector_size(16)));
typedef long v2di __attribute__((vector_size(16)));
double x[1024], y[1024];
void __GIMPLE (ssa,guessed_local(10737416))
foo ()
{
  v2df * vectp_x7;
  v2df vect__56;
  v2df vect__45;
  v2df * vectp_y3;
  v2df _12;
  v2df _13;
  unsigned int _19;
  unsigned int _24;

  __BB(2,guessed_local(10737416)):
  goto __BB3(precise(134217728));

  __BB(3,loop_header(1),guessed_local(1063004409)):
  vectp_y3_21 = __PHI (__BB2: &y, __BB3: vectp_y3_17);
  vectp_x7_6 = __PHI (__BB2: &x, __BB3: vectp_x7_20);
  _19 = __PHI (__BB2: 0u, __BB3: _24);
  vect__45_14 = __MEM <v2df> ((double *)vectp_y3_21);
  _13 = __VEC_PERM (vect__45_14, vect__45_14, _Literal (v2di) { 1l, 1l });
  _12 = __VEC_PERM (vect__45_14, vect__45_14, _Literal (v2di) { 0l, 0l });
  vect__56_7 = _12 + _13;
  __MEM <v2df> ((double *)vectp_x7_6) = vect__56_7;
  vectp_y3_17 = vectp_y3_21 + 16ul;
  vectp_x7_20 = vectp_x7_6 + 16ul;
  _24 = _19 + 1u;
  if (_24 != 512u)
    goto __BB3(adjusted(132875551));
  else
    goto __BB4(adjusted(1342177));

  __BB(4,guessed_local(10737416)):
  return;

}

results in

foo:
.LFB0:
        .cfi_startproc
        xorl    %eax, %eax
        .p2align 4,,10
        .p2align 3
.L2:
        movapd  y(%rax), %xmm1
        movapd  y(%rax), %xmm0
        addq    $16, %rax
        unpcklpd        %xmm1, %xmm1
        unpckhpd        %xmm0, %xmm0
        addpd   %xmm1, %xmm0
        movaps  %xmm0, x-16(%rax)
        cmpq    $8192, %rax
        jne     .L2
        ret

The duplication happens in IRA/LRA, but I suspect that either the x86 costing
or the operand constraints make them think this is cheaper.  LRA is fed with

(insn 8 6 11 3 (set (reg/v:V2DF 85 [ vect__45 ])
        (mem:V2DF (plus:DI (reg:DI 86 [ ivtmp.6 ])
                (symbol_ref:DI ("y") [flags 0x2]  <var_decl 0x7f2a94b8dbd0 y>))
[1 MEM[symbol: y, index: ivtmp.6_7, offset: 0B]+0 S16 A128])) 1338
{movv2df_internal}
     (expr_list:REG_EQUIV (mem:V2DF (plus:DI (reg:DI 86 [ ivtmp.6 ])
                (symbol_ref:DI ("y") [flags 0x2]  <var_decl 0x7f2a94b8dbd0 y>))
[1 MEM[symbol: y, index: ivtmp.6_7, offset: 0B]+0 S16 A128])
        (nil)))
(insn 11 8 12 3 (set (reg:V2DF 89)
        (vec_select:V2DF (vec_concat:V4DF (reg/v:V2DF 85 [ vect__45 ])
                (reg/v:V2DF 85 [ vect__45 ]))
            (parallel [
                    (const_int 0 [0])
                    (const_int 2 [0x2])
                ]))) "t2.c":27:3 2995 {*vec_interleave_lowv2df}
     (nil))
(insn 12 11 13 3 (set (reg:V2DF 90)
        (vec_select:V2DF (vec_concat:V4DF (reg/v:V2DF 85 [ vect__45 ])
                (reg/v:V2DF 85 [ vect__45 ]))
            (parallel [
                    (const_int 1 [0x1])
                    (const_int 3 [0x3])
                ]))) "t2.c":27:3 2989 {*vec_interleave_highv2df}
     (expr_list:REG_DEAD (reg/v:V2DF 85 [ vect__45 ])
        (nil)))
(insn 13 12 14 3 (set (reg:V2DF 91 [ vect__56 ])
        (plus:V2DF (reg:V2DF 89)
            (reg:V2DF 90))) "t2.c":27:3 1519 {*addv2df3}
     (expr_list:REG_DEAD (reg:V2DF 90)
        (expr_list:REG_DEAD (reg:V2DF 89)
            (expr_list:REG_EQUIV (mem:V2DF (plus:DI (reg:DI 86 [ ivtmp.6 ])
                        (symbol_ref:DI ("x") [flags 0x2]  <var_decl
0x7f2a94b8db40 x>)) [1 MEM[symbol: x, index: ivtmp.6_7, offset: 0B]+0 S16
A128])
                (nil)))))
(insn 14 13 15 3 (set (mem:V2DF (plus:DI (reg:DI 86 [ ivtmp.6 ])
                (symbol_ref:DI ("x") [flags 0x2]  <var_decl 0x7f2a94b8db40 x>))
[1 MEM[symbol: x, index: ivtmp.6_7, offset: 0B]+0 S16 A128])
        (reg:V2DF 91 [ vect__56 ])) "t2.c":27:3 1338 {movv2df_internal}
     (expr_list:REG_DEAD (reg:V2DF 91 [ vect__56 ])
        (nil)))

and the LRA dump shows:

         Choosing alt 3 in insn 11:  (0) x  (1) 0  (2) m
{*vec_interleave_lowv2df}
      Creating newreg=92 from oldreg=89, assigning class SSE_REGS to r92
   11: r92:V2DF=vec_select(vec_concat(r92:V2DF,[r86:DI+`y']),parallel)
    Inserting insn reload before:
   26: r92:V2DF=[r86:DI+`y']
    Inserting insn reload after:
   27: r89:V2DF=r92:V2DF

and postreload CSE cannot do anything because the shuffle clobbers the
register we loaded into.  (Only sched2 moves things in a way that would make
CSE possible again, but there is no CSE pass after sched2.)