Bug 39821 - 120% slowdown with vectorizer
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: target
Version: 4.4.0
Importance: P3 normal
Target Milestone: 12.0
Assignee: Not yet assigned to anyone
Keywords: missed-optimization
Blocks: vectorizer
Reported: 2009-04-20 00:23 UTC by Ramiro Polla
Modified: 2021-09-03 02:41 UTC
CC: 8 users

Target: x86_64-linux-gnu
Last reconfirmed: 2009-04-20 09:26:17


Attachments
tarball of a simple testcase (655 bytes, application/x-bzip), 2009-04-21 00:08 UTC, Ramiro Polla

Description Ramiro Polla 2009-04-20 00:23:00 UTC
The vectorizer produces horrible code with this testcase:

$ cat dotproduct.c 
#include "inttypes.h"

int64_t dotproduct(int32_t *v1, int32_t *v2, int order)
{
    int64_t accum = 0;
    while (order--)
        accum += (int64_t) *v1++ * *v2++;
    return accum;
}

int64_t dotproduct_order4(int32_t *v1, int32_t *v2, int order)
{
    return dotproduct(v1, v2, 4);
}
$ gcc-4.4rc1 -o dotproduct.o -c dotproduct.c -O3
$ gcc-4.4rc1 -o dotproduct-no-vectorize.o -c dotproduct.c -O3 -fno-tree-vectorize
$ objdump -d dotproduct.o

dotproduct.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <dotproduct>:
   0:   31 c0                   xor    %eax,%eax
   2:   85 d2                   test   %edx,%edx
   4:   0f 84 4e 01 00 00       je     158 <dotproduct+0x158>
   a:   41 89 d0                mov    %edx,%r8d
   d:   44 8d 52 ff             lea    -0x1(%rdx),%r10d
  11:   41 c1 e8 02             shr    $0x2,%r8d
  15:   83 fa 03                cmp    $0x3,%edx
  18:   46 8d 0c 85 00 00 00    lea    0x0(,%r8,4),%r9d
  1f:   00 
  20:   76 05                   jbe    27 <dotproduct+0x27>
  22:   45 85 c9                test   %r9d,%r9d
  25:   75 09                   jne    30 <dotproduct+0x30>
  27:   31 c0                   xor    %eax,%eax
  29:   e9 fc 00 00 00          jmpq   12a <dotproduct+0x12a>
  2e:   66 90                   xchg   %ax,%ax
  30:   66 0f ef c0             pxor   %xmm0,%xmm0
  34:   31 c0                   xor    %eax,%eax
  36:   66 45 0f ef c9          pxor   %xmm9,%xmm9
  3b:   31 c9                   xor    %ecx,%ecx
  3d:   0f 1f 00                nopl   (%rax)
  40:   f3 0f 6f 14 07          movdqu (%rdi,%rax,1),%xmm2
  45:   83 c1 01                add    $0x1,%ecx
  48:   66 41 0f 6f d9          movdqa %xmm9,%xmm3
  4d:   f3 0f 6f 24 06          movdqu (%rsi,%rax,1),%xmm4
  52:   66 45 0f 6f c1          movdqa %xmm9,%xmm8
  57:   66 0f 6f ea             movdqa %xmm2,%xmm5
  5b:   48 83 c0 10             add    $0x10,%rax
  5f:   66 0f 66 dc             pcmpgtd %xmm4,%xmm3
  63:   66 0f 6f fc             movdqa %xmm4,%xmm7
  67:   66 44 0f 66 c2          pcmpgtd %xmm2,%xmm8
  6c:   41 39 c8                cmp    %ecx,%r8d
  6f:   66 0f 62 fb             punpckldq %xmm3,%xmm7
  73:   66 41 0f 62 e8          punpckldq %xmm8,%xmm5
  78:   66 0f 6a e3             punpckhdq %xmm3,%xmm4
  7c:   66 41 0f 6a d0          punpckhdq %xmm8,%xmm2
  81:   66 0f 6f cf             movdqa %xmm7,%xmm1
  85:   66 0f 6f f5             movdqa %xmm5,%xmm6
  89:   66 44 0f 6f d7          movdqa %xmm7,%xmm10
  8e:   66 0f f4 cd             pmuludq %xmm5,%xmm1
  92:   66 0f 6f da             movdqa %xmm2,%xmm3
  96:   66 0f 73 d6 20          psrlq  $0x20,%xmm6
  9b:   66 0f f4 f7             pmuludq %xmm7,%xmm6
  9f:   66 41 0f 73 d2 20       psrlq  $0x20,%xmm10
  a5:   66 0f 73 f6 20          psllq  $0x20,%xmm6
  aa:   66 41 0f f4 ea          pmuludq %xmm10,%xmm5
  af:   66 0f d4 ce             paddq  %xmm6,%xmm1
  b3:   66 0f 73 f5 20          psllq  $0x20,%xmm5
  b8:   66 0f d4 cd             paddq  %xmm5,%xmm1
  bc:   66 0f 6f ec             movdqa %xmm4,%xmm5
  c0:   66 0f d4 c8             paddq  %xmm0,%xmm1
  c4:   66 0f 73 d3 20          psrlq  $0x20,%xmm3
  c9:   66 0f 6f c4             movdqa %xmm4,%xmm0
  cd:   66 0f f4 dc             pmuludq %xmm4,%xmm3
  d1:   66 0f 73 f3 20          psllq  $0x20,%xmm3
  d6:   66 0f 73 d5 20          psrlq  $0x20,%xmm5
  db:   66 0f f4 c2             pmuludq %xmm2,%xmm0
  df:   66 0f f4 d5             pmuludq %xmm5,%xmm2
  e3:   66 0f d4 c3             paddq  %xmm3,%xmm0
  e7:   66 0f 73 f2 20          psllq  $0x20,%xmm2
  ec:   66 0f d4 c2             paddq  %xmm2,%xmm0
  f0:   66 0f d4 c1             paddq  %xmm1,%xmm0
  f4:   0f 87 46 ff ff ff       ja     40 <dotproduct+0x40>
  fa:   42 8d 0c 8d 00 00 00    lea    0x0(,%r9,4),%ecx
 101:   00 
 102:   66 0f 6f c8             movdqa %xmm0,%xmm1
 106:   45 29 ca                sub    %r9d,%r10d
 109:   89 c9                   mov    %ecx,%ecx
 10b:   66 0f 73 d9 08          psrldq $0x8,%xmm1
 110:   66 0f d4 c1             paddq  %xmm1,%xmm0
 114:   48 01 cf                add    %rcx,%rdi
 117:   48 01 ce                add    %rcx,%rsi
 11a:   44 39 ca                cmp    %r9d,%edx
 11d:   66 0f d6 44 24 f8       movq   %xmm0,-0x8(%rsp)
 123:   48 8b 44 24 f8          mov    -0x8(%rsp),%rax
 128:   74 2e                   je     158 <dotproduct+0x158>
 12a:   45 89 d2                mov    %r10d,%r10d
 12d:   31 d2                   xor    %edx,%edx
 12f:   4e 8d 0c 95 04 00 00    lea    0x4(,%r10,4),%r9
 136:   00 
 137:   66 0f 1f 84 00 00 00    nopw   0x0(%rax,%rax,1)
 13e:   00 00 
 140:   48 63 0c 16             movslq (%rsi,%rdx,1),%rcx
 144:   4c 63 04 17             movslq (%rdi,%rdx,1),%r8
 148:   48 83 c2 04             add    $0x4,%rdx
 14c:   49 0f af c8             imul   %r8,%rcx
 150:   48 01 c8                add    %rcx,%rax
 153:   4c 39 ca                cmp    %r9,%rdx
 156:   75 e8                   jne    140 <dotproduct+0x140>
 158:   f3 c3                   repz retq 
 15a:   66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)

0000000000000160 <dotproduct_order4>:
 160:   66 0f ef c0             pxor   %xmm0,%xmm0
 164:   f3 0f 6f 0f             movdqu (%rdi),%xmm1
 168:   f3 0f 6f 1e             movdqu (%rsi),%xmm3
 16c:   66 0f 6f d0             movdqa %xmm0,%xmm2
 170:   66 0f 6f f1             movdqa %xmm1,%xmm6
 174:   66 0f 66 c1             pcmpgtd %xmm1,%xmm0
 178:   66 0f 6f fb             movdqa %xmm3,%xmm7
 17c:   66 0f 66 d3             pcmpgtd %xmm3,%xmm2
 180:   66 0f 62 f0             punpckldq %xmm0,%xmm6
 184:   66 0f 62 fa             punpckldq %xmm2,%xmm7
 188:   66 0f 6a da             punpckhdq %xmm2,%xmm3
 18c:   66 0f 6a c8             punpckhdq %xmm0,%xmm1
 190:   66 0f 6f ee             movdqa %xmm6,%xmm5
 194:   66 44 0f 6f c7          movdqa %xmm7,%xmm8
 199:   66 0f 6f e7             movdqa %xmm7,%xmm4
 19d:   66 0f 6f c3             movdqa %xmm3,%xmm0
 1a1:   66 0f 73 d5 20          psrlq  $0x20,%xmm5
 1a6:   66 44 0f f4 c6          pmuludq %xmm6,%xmm8
 1ab:   66 0f f4 ef             pmuludq %xmm7,%xmm5
 1af:   66 0f 6f d1             movdqa %xmm1,%xmm2
 1b3:   66 0f 73 d4 20          psrlq  $0x20,%xmm4
 1b8:   66 0f 73 f5 20          psllq  $0x20,%xmm5
 1bd:   66 0f f4 e6             pmuludq %xmm6,%xmm4
 1c1:   66 41 0f d4 e8          paddq  %xmm8,%xmm5
 1c6:   66 0f 73 f4 20          psllq  $0x20,%xmm4
 1cb:   66 0f d4 e5             paddq  %xmm5,%xmm4
 1cf:   66 0f 6f eb             movdqa %xmm3,%xmm5
 1d3:   66 0f f4 c1             pmuludq %xmm1,%xmm0
 1d7:   66 0f 73 d2 20          psrlq  $0x20,%xmm2
 1dc:   66 0f f4 d3             pmuludq %xmm3,%xmm2
 1e0:   66 0f 73 f2 20          psllq  $0x20,%xmm2
 1e5:   66 0f d4 c2             paddq  %xmm2,%xmm0
 1e9:   66 0f 73 d5 20          psrlq  $0x20,%xmm5
 1ee:   66 0f f4 cd             pmuludq %xmm5,%xmm1
 1f2:   66 0f 73 f1 20          psllq  $0x20,%xmm1
 1f7:   66 0f d4 c1             paddq  %xmm1,%xmm0
 1fb:   66 0f d4 c4             paddq  %xmm4,%xmm0
 1ff:   66 0f 6f c8             movdqa %xmm0,%xmm1
 203:   66 0f 73 d9 08          psrldq $0x8,%xmm1
 208:   66 0f d4 c1             paddq  %xmm1,%xmm0
 20c:   66 0f d6 44 24 f8       movq   %xmm0,-0x8(%rsp)
 212:   48 8b 44 24 f8          mov    -0x8(%rsp),%rax
 217:   c3                      retq   
$ objdump -d dotproduct-no-vectorize.o

dotproduct-no-vectorize.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <dotproduct>:
   0:   31 c0                   xor    %eax,%eax
   2:   85 d2                   test   %edx,%edx
   4:   74 2a                   je     30 <dotproduct+0x30>
   6:   83 ea 01                sub    $0x1,%edx
   9:   4c 8d 0c 95 04 00 00    lea    0x4(,%rdx,4),%r9
  10:   00 
  11:   31 d2                   xor    %edx,%edx
  13:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  18:   48 63 0c 16             movslq (%rsi,%rdx,1),%rcx
  1c:   4c 63 04 17             movslq (%rdi,%rdx,1),%r8
  20:   48 83 c2 04             add    $0x4,%rdx
  24:   49 0f af c8             imul   %r8,%rcx
  28:   48 01 c8                add    %rcx,%rax
  2b:   4c 39 ca                cmp    %r9,%rdx
  2e:   75 e8                   jne    18 <dotproduct+0x18>
  30:   f3 c3                   repz retq 
  32:   66 66 66 66 66 2e 0f    nopw   %cs:0x0(%rax,%rax,1)
  39:   1f 84 00 00 00 00 00 

0000000000000040 <dotproduct_order4>:
  40:   48 63 07                movslq (%rdi),%rax
  43:   48 63 16                movslq (%rsi),%rdx
  46:   48 63 4f 04             movslq 0x4(%rdi),%rcx
  4a:   48 0f af d0             imul   %rax,%rdx
  4e:   48 63 46 04             movslq 0x4(%rsi),%rax
  52:   48 0f af c1             imul   %rcx,%rax
  56:   48 63 4f 08             movslq 0x8(%rdi),%rcx
  5a:   48 01 c2                add    %rax,%rdx
  5d:   48 63 46 08             movslq 0x8(%rsi),%rax
  61:   48 0f af c1             imul   %rcx,%rax
  65:   48 63 4f 0c             movslq 0xc(%rdi),%rcx
  69:   48 01 c2                add    %rax,%rdx
  6c:   48 63 46 0c             movslq 0xc(%rsi),%rax
  70:   48 0f af c1             imul   %rcx,%rax
  74:   48 01 d0                add    %rdx,%rax
  77:   c3                      retq
Comment 1 Richard Biener 2009-04-20 09:26:17 UTC
The vectorizer creates

  vect_var_.128_46 = M*vect_p.123_44{misalignment: 0};
  vect_var_.129_47 = [vec_unpack_lo_expr] vect_var_.128_46;
  vect_var_.129_48 = [vec_unpack_hi_expr] vect_var_.128_46;
  vect_var_.135_53 = M*vect_p.130_51{misalignment: 0};
  vect_var_.136_54 = [vec_unpack_lo_expr] vect_var_.135_53;
  vect_var_.136_55 = [vec_unpack_hi_expr] vect_var_.135_53;
  vect_var_.137_56 = vect_var_.136_54 * vect_var_.129_47;
  vect_var_.137_57 = vect_var_.136_55 * vect_var_.129_48;
  vect_var_.138_59 = vect_var_.137_56 + vect_var_.138_58;
  vect_var_.138_60 = vect_var_.137_57 + vect_var_.138_59;
  v1_14 = v1_26 + 4;

but the widening unpacking results in abysmal generated code.  Where are
all the shifts coming from?
Comment 2 Uroš Bizjak 2009-04-20 20:52:06 UTC
(In reply to comment #1)

> but the widening unpacking results in abysmal generated code.  Where are
> all the shifts coming from?

Not from the unpacking, but from the mulv2di pattern in sse.md.

Can you please attach the full source for an executable testcase? IIRC, execution times depend on the target processor, and perhaps the vectorizer cost model should be updated for this case.
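
For context, a minimal intrinsics sketch (illustration only, not the literal sse.md pattern; the helper name is made up) of the kind of emulation a generic V2DI multiply has to do without a widening multiply instruction, which is where the psrlq/pmuludq/psllq/paddq runs in the disassembly above come from:

#include <emmintrin.h>  /* SSE2 */

/* Illustrative only: a full 64x64->64-bit multiply per lane built from
   pmuludq (unsigned 32x32->64) plus shifts:
     a*b mod 2^64 = lo(a)*lo(b) + ((hi(a)*lo(b) + lo(a)*hi(b)) << 32)  */
static __m128i mul_v2di_emulated(__m128i a, __m128i b)
{
    __m128i lo_lo = _mm_mul_epu32(a, b);                       /* lo(a)*lo(b) */
    __m128i a_hi  = _mm_srli_epi64(a, 32);
    __m128i b_hi  = _mm_srli_epi64(b, 32);
    __m128i cross = _mm_add_epi64(_mm_mul_epu32(a_hi, b),      /* hi(a)*lo(b) */
                                  _mm_mul_epu32(a, b_hi));     /* lo(a)*hi(b) */
    return _mm_add_epi64(lo_lo, _mm_slli_epi64(cross, 32));
}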
Comment 3 Ramiro Polla 2009-04-21 00:08:54 UTC
Created attachment 17660 [details]
tarball of a simple testcase
Comment 4 Ramiro Polla 2009-04-21 00:10:03 UTC
I've attached a simple testcase. The system I'm running this on is a Q6600 with 64-bit Linux.
Comment 5 Andrew Pinski 2021-07-26 06:19:30 UTC
The code generation for aarch64 looks fine:
dotproduct_order4:
.LFB1:
        .cfi_startproc
        ldr     q1, [x0]
        ldr     q2, [x1]
        smull   v0.2d, v2.2s, v1.2s
        smlal2  v0.2d, v2.4s, v1.4s
        addp    d0, v0.2d
        fmov    x0, d0
        ret
  vect__6.41_18 = MEM <vector(4) int> [(int32_t *)v1_2(D)];
  vect__10.44_13 = MEM <vector(4) int> [(int32_t *)v2_3(D)];
  vect_patt_25.45_8 = WIDEN_MULT_LO_EXPR <vect__10.44_13, vect__6.41_18>;
  vect_patt_25.45_4 = WIDEN_MULT_HI_EXPR <vect__10.44_13, vect__6.41_18>;
  vect_accum_14.46_31 = vect_patt_25.45_4 + vect_patt_25.45_8;
  _33 = .REDUC_PLUS (vect_accum_14.46_31); [tail call]
---- CUT ----
Even the gimple level for x86_64 looks ok:
  vect__6.41_18 = MEM <vector(4) int> [(int32_t *)v1_2(D)];
  vect__10.44_13 = MEM <vector(4) int> [(int32_t *)v2_3(D)];
  vect_patt_25.45_8 = WIDEN_MULT_LO_EXPR <vect__10.44_13, vect__6.41_18>;
  vect_patt_25.45_4 = WIDEN_MULT_HI_EXPR <vect__10.44_13, vect__6.41_18>;
  vect_accum_14.46_31 = vect_patt_25.45_4 + vect_patt_25.45_8;
  _33 = VEC_PERM_EXPR <vect_accum_14.46_31, { 0, 0 }, { 1, 2 }>;
  _34 = vect_accum_14.46_31 + _33;
  stmp_accum_14.47_35 = BIT_FIELD_REF <_34, 64, 0>;

But the expansion looks bad.
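
(For reference, a hedged scalar reading of what the vectorized GIMPLE above computes for order == 4; the function name and lane grouping are illustrative, not taken from the testcase:)

#include <stdint.h>

/* The two WIDEN_MULT halves plus the final reduction amount to widening
   each 32-bit product to 64 bits and summing all four of them.  */
int64_t dotproduct4_ref(const int32_t *v1, const int32_t *v2)
{
    int64_t lo = (int64_t) v1[0] * v2[0] + (int64_t) v1[1] * v2[1];
    int64_t hi = (int64_t) v1[2] * v2[2] + (int64_t) v1[3] * v2[3];
    return lo + hi;   /* the .REDUC_PLUS / VEC_PERM + add epilogue */
}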
Comment 6 Richard Biener 2021-07-27 07:24:08 UTC
0x398f310 _2 * _4 1 times scalar_stmt costs 12 in body
...
0x392b3f0 _1 w* _3 2 times vec_promote_demote costs 8 in body
...
t4.c:4:12: note:  Cost model analysis:
  Vector inside of loop cost: 40
  Vector prologue cost: 4
  Vector epilogue cost: 108
  Scalar iteration cost: 40
  Scalar outside cost: 32
  Vector outside cost: 112
  prologue iterations: 0
  epilogue iterations: 2
  Calculated minimum iters for profitability: 3
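
(As a hedged aside, the minimum-iterations figure follows roughly from the numbers above, assuming a vectorization factor of 4 and ignoring the prologue/epilogue bookkeeping of the real model; the helper below is only a sketch, not GCC's formula:)

/* Rough break-even check: 40*n + 32 > (40/4)*n + 112 first holds at n = 3. */
static int min_profitable_iters(int scalar_iter, int scalar_outside,
                                int vec_inside, int vec_outside, int vf)
{
    int n = 1;
    while (scalar_iter * n + scalar_outside
           <= vec_inside * n / vf + vec_outside)
        n++;
    return n;   /* 3 for the costs quoted above */
}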

so clearly the widening multiplication is not costed correctly.  With SSE 4.2
we can do better:

.L4:
        movdqu  (%rcx,%rax), %xmm0
        movdqu  (%rsi,%rax), %xmm1
        addq    $16, %rax
        movdqa  %xmm0, %xmm3
        movdqa  %xmm1, %xmm4
        punpckldq       %xmm0, %xmm3
        punpckldq       %xmm1, %xmm4
        punpckhdq       %xmm0, %xmm0
        pmuldq  %xmm4, %xmm3
        punpckhdq       %xmm1, %xmm1
        pmuldq  %xmm1, %xmm0
        paddq   %xmm3, %xmm2
        paddq   %xmm0, %xmm2
        cmpq    %rdi, %rax
        jne     .L4

but even there the costing is imprecise.  The vectorizer unhelpfully
categorizes the widening multiply as vec_promote_demote, which then fails to
run into

        case MULT_EXPR:
        case WIDEN_MULT_EXPR:
        case MULT_HIGHPART_EXPR:
          stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
          break;

fixing that yields

0x392b3f0 _1 w* _3 2 times vector_stmt costs 136 in body

for SSE2, SSE4.2 and AVX2 alike, so the cost is then over-estimated via

      /* V*DImode is emulated with 5-8 insns.  */
      else if (mode == V2DImode || mode == V4DImode)
        {
          if (TARGET_XOP && mode == V2DImode)
            return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 3);
          else
            return ix86_vec_cost (mode, cost->mulss * 3 + cost->sse_op * 5);
        }

with cost->mulss == 16.  I suppose it is somehow failing to realize it's
doing a widening multiply.
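
(A hedged cross-check of the 136: if the non-XOP branch above is taken and cost->sse_op is around 4, which is an assumption here, each statement is costed at mulss*3 + sse_op*5 = 16*3 + 4*5 = 68, and the widening multiply is counted as two such statements, giving the 136 in the dump.)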
Comment 7 GCC Commits 2021-07-27 08:42:17 UTC
The master branch has been updated by Richard Biener <rguenth@gcc.gnu.org>:

https://gcc.gnu.org/g:c8ce54c6e67295b70052d1b9f9a2f7ce9e2f8f0d

commit r12-2524-gc8ce54c6e67295b70052d1b9f9a2f7ce9e2f8f0d
Author: Richard Biener <rguenther@suse.de>
Date:   Tue Jul 27 09:24:57 2021 +0200

    tree-optimization/39821 - fix cost classification for widening arith
    
    This adjusts the vectorizer to cost vector_stmt for widening
    arithmetic instead of vec_promote_demote in the line of telling
    the target that stmt_info->stmt is the meaningful piece we cost.
    
    2021-07-27  Richard Biener  <rguenther@suse.de>
    
            PR tree-optimization/39821
            * tree-vect-stmts.c (vect_model_promotion_demotion_cost): Use
            vector_stmt for widening arithmetic.
            (vectorizable_conversion): Adjust.
Comment 8 Richard Biener 2021-07-27 08:43:23 UTC
I've pushed the change that makes us run into ix86_multiplication_cost, but as said that doesn't differentiate highpart or widening multiplies yet, so we're now missing optimizations because of too conservative costing.
Comment 9 Hongtao.liu 2021-07-28 05:36:07 UTC
(In reply to Richard Biener from comment #8)
> I've pushed the change that makes us run into ix86_multiplication_cost but
> as said that doesn't differentiate highpart or widening multiply yet and
> thus we're now missing optimizations because of too conservative costing.

For MULT_HIGHPART_EXPR, x86 only has pmulhw, so it's probably OK to go through ix86_multiplication_cost.

For WIDEN_MULT_EXPR, we need a separate cost function which should also take signedness into account, since we have pmuludq under SSE2 but pmuldq only under SSE4.1.


I.e. we should vectorize udotproduct under SSE2, but sdotproduct only under SSE4.1 (see the intrinsics sketch after the two testcases below):

#include <stdint.h>
uint64_t udotproduct(uint32_t *v1, uint32_t *v2, int order)
{
    uint64_t accum = 0;
    while (order--)
        accum += (uint64_t) *v1++ * *v2++;
    return accum;
}

#include <stdint.h>
int64_t sdotproduct(int32_t *v1, int32_t *v2, int order)
{
    int64_t accum = 0;
    while (order--)
        accum += (int64_t) *v1++ * *v2++;
    return accum;
}
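
For illustration, a hedged intrinsics sketch of the ISA difference: pmuldq (_mm_mul_epi32, SSE4.1) is a signed 32x32->64 widening multiply of the even lanes, while SSE2 only offers the unsigned pmuludq, so signed products there need the extra pcmpgtd/punpck fixups seen in the original disassembly. The helper names are made up, not GCC code:

#include <smmintrin.h>  /* SSE4.1 */

/* Sketch only: widen-multiply a V4SI pair into two V2DI halves.  Even
   lanes (0,2) come straight from pmuldq; odd lanes (1,3) are shifted
   into the even positions first.  */
static __m128i widen_smul_even(__m128i a, __m128i b)
{
    return _mm_mul_epi32(a, b);
}
static __m128i widen_smul_odd(__m128i a, __m128i b)
{
    return _mm_mul_epi32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
}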
Comment 10 GCC Commits 2021-07-29 01:06:42 UTC
The master branch has been updated by hongtao Liu <liuhongt@gcc.gnu.org>:

https://gcc.gnu.org/g:231bcc77b953406b8381c7f55a3ec181da67d1e7

commit r12-2586-g231bcc77b953406b8381c7f55a3ec181da67d1e7
Author: liuhongt <hongtao.liu@intel.com>
Date:   Wed Jul 28 16:24:52 2021 +0800

    Add a separate function to calculate cost for WIDEN_MULT_EXPR.
    
    gcc/ChangeLog:
    
            PR target/39821
            * config/i386/i386.c (ix86_widen_mult_cost): New function.
            (ix86_add_stmt_cost): Use ix86_widen_mult_cost for
            WIDEN_MULT_EXPR.
    
    gcc/testsuite/ChangeLog:
    
            PR target/39821
            * gcc.target/i386/sse2-pr39821.c: New test.
            * gcc.target/i386/sse4-pr39821.c: New test.
Comment 11 Hongtao.liu 2021-07-29 01:12:55 UTC
Fixed in GCC 12.
Comment 12 Richard Biener 2021-08-26 12:03:33 UTC
Fixed.