The vectorizer produces horrible code with this testcase:

$ cat dotproduct.c
#include "inttypes.h"

int64_t dotproduct(int32_t *v1, int32_t *v2, int order)
{
    int64_t accum = 0;

    while (order--)
        accum += (int64_t) *v1++ * *v2++;

    return accum;
}

int64_t dotproduct_order4(int32_t *v1, int32_t *v2, int order)
{
    return dotproduct(v1, v2, 4);
}

$ gcc-4.4rc1 -o dotproduct.o -c dotproduct.c -O3
$ gcc-4.4rc1 -o dotproduct-no-vectorize.o -c dotproduct.c -O3 -fno-tree-vectorize
$ objdump -d dotproduct.o

dotproduct.o:     file format elf64-x86-64

Disassembly of section .text:

0000000000000000 <dotproduct>:
   0:   31 c0                   xor    %eax,%eax
   2:   85 d2                   test   %edx,%edx
   4:   0f 84 4e 01 00 00       je     158 <dotproduct+0x158>
   a:   41 89 d0                mov    %edx,%r8d
   d:   44 8d 52 ff             lea    -0x1(%rdx),%r10d
  11:   41 c1 e8 02             shr    $0x2,%r8d
  15:   83 fa 03                cmp    $0x3,%edx
  18:   46 8d 0c 85 00 00 00    lea    0x0(,%r8,4),%r9d
  1f:   00
  20:   76 05                   jbe    27 <dotproduct+0x27>
  22:   45 85 c9                test   %r9d,%r9d
  25:   75 09                   jne    30 <dotproduct+0x30>
  27:   31 c0                   xor    %eax,%eax
  29:   e9 fc 00 00 00          jmpq   12a <dotproduct+0x12a>
  2e:   66 90                   xchg   %ax,%ax
  30:   66 0f ef c0             pxor   %xmm0,%xmm0
  34:   31 c0                   xor    %eax,%eax
  36:   66 45 0f ef c9          pxor   %xmm9,%xmm9
  3b:   31 c9                   xor    %ecx,%ecx
  3d:   0f 1f 00                nopl   (%rax)
  40:   f3 0f 6f 14 07          movdqu (%rdi,%rax,1),%xmm2
  45:   83 c1 01                add    $0x1,%ecx
  48:   66 41 0f 6f d9          movdqa %xmm9,%xmm3
  4d:   f3 0f 6f 24 06          movdqu (%rsi,%rax,1),%xmm4
  52:   66 45 0f 6f c1          movdqa %xmm9,%xmm8
  57:   66 0f 6f ea             movdqa %xmm2,%xmm5
  5b:   48 83 c0 10             add    $0x10,%rax
  5f:   66 0f 66 dc             pcmpgtd %xmm4,%xmm3
  63:   66 0f 6f fc             movdqa %xmm4,%xmm7
  67:   66 44 0f 66 c2          pcmpgtd %xmm2,%xmm8
  6c:   41 39 c8                cmp    %ecx,%r8d
  6f:   66 0f 62 fb             punpckldq %xmm3,%xmm7
  73:   66 41 0f 62 e8          punpckldq %xmm8,%xmm5
  78:   66 0f 6a e3             punpckhdq %xmm3,%xmm4
  7c:   66 41 0f 6a d0          punpckhdq %xmm8,%xmm2
  81:   66 0f 6f cf             movdqa %xmm7,%xmm1
  85:   66 0f 6f f5             movdqa %xmm5,%xmm6
  89:   66 44 0f 6f d7          movdqa %xmm7,%xmm10
  8e:   66 0f f4 cd             pmuludq %xmm5,%xmm1
  92:   66 0f 6f da             movdqa %xmm2,%xmm3
  96:   66 0f 73 d6 20          psrlq  $0x20,%xmm6
  9b:   66 0f f4 f7             pmuludq %xmm7,%xmm6
  9f:   66 41 0f 73 d2 20       psrlq  $0x20,%xmm10
  a5:   66 0f 73 f6 20          psllq  $0x20,%xmm6
  aa:   66 41 0f f4 ea          pmuludq %xmm10,%xmm5
  af:   66 0f d4 ce             paddq  %xmm6,%xmm1
  b3:   66 0f 73 f5 20          psllq  $0x20,%xmm5
  b8:   66 0f d4 cd             paddq  %xmm5,%xmm1
  bc:   66 0f 6f ec             movdqa %xmm4,%xmm5
  c0:   66 0f d4 c8             paddq  %xmm0,%xmm1
  c4:   66 0f 73 d3 20          psrlq  $0x20,%xmm3
  c9:   66 0f 6f c4             movdqa %xmm4,%xmm0
  cd:   66 0f f4 dc             pmuludq %xmm4,%xmm3
  d1:   66 0f 73 f3 20          psllq  $0x20,%xmm3
  d6:   66 0f 73 d5 20          psrlq  $0x20,%xmm5
  db:   66 0f f4 c2             pmuludq %xmm2,%xmm0
  df:   66 0f f4 d5             pmuludq %xmm5,%xmm2
  e3:   66 0f d4 c3             paddq  %xmm3,%xmm0
  e7:   66 0f 73 f2 20          psllq  $0x20,%xmm2
  ec:   66 0f d4 c2             paddq  %xmm2,%xmm0
  f0:   66 0f d4 c1             paddq  %xmm1,%xmm0
  f4:   0f 87 46 ff ff ff       ja     40 <dotproduct+0x40>
  fa:   42 8d 0c 8d 00 00 00    lea    0x0(,%r9,4),%ecx
 101:   00
 102:   66 0f 6f c8             movdqa %xmm0,%xmm1
 106:   45 29 ca                sub    %r9d,%r10d
 109:   89 c9                   mov    %ecx,%ecx
 10b:   66 0f 73 d9 08          psrldq $0x8,%xmm1
 110:   66 0f d4 c1             paddq  %xmm1,%xmm0
 114:   48 01 cf                add    %rcx,%rdi
 117:   48 01 ce                add    %rcx,%rsi
 11a:   44 39 ca                cmp    %r9d,%edx
 11d:   66 0f d6 44 24 f8       movq   %xmm0,-0x8(%rsp)
 123:   48 8b 44 24 f8          mov    -0x8(%rsp),%rax
 128:   74 2e                   je     158 <dotproduct+0x158>
 12a:   45 89 d2                mov    %r10d,%r10d
 12d:   31 d2                   xor    %edx,%edx
 12f:   4e 8d 0c 95 04 00 00    lea    0x4(,%r10,4),%r9
 136:   00
 137:   66 0f 1f 84 00 00 00    nopw   0x0(%rax,%rax,1)
 13e:   00 00
 140:   48 63 0c 16             movslq (%rsi,%rdx,1),%rcx
 144:   4c 63 04 17             movslq (%rdi,%rdx,1),%r8
 148:   48 83 c2 04             add    $0x4,%rdx
 14c:   49 0f af c8             imul   %r8,%rcx
 150:   48 01 c8                add    %rcx,%rax
 153:   4c 39 ca                cmp    %r9,%rdx
 156:   75 e8                   jne    140 <dotproduct+0x140>
 158:   f3 c3                   repz retq
 15a:   66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)

0000000000000160 <dotproduct_order4>:
 160:   66 0f ef c0             pxor   %xmm0,%xmm0
 164:   f3 0f 6f 0f             movdqu (%rdi),%xmm1
 168:   f3 0f 6f 1e             movdqu (%rsi),%xmm3
 16c:   66 0f 6f d0             movdqa %xmm0,%xmm2
 170:   66 0f 6f f1             movdqa %xmm1,%xmm6
 174:   66 0f 66 c1             pcmpgtd %xmm1,%xmm0
 178:   66 0f 6f fb             movdqa %xmm3,%xmm7
 17c:   66 0f 66 d3             pcmpgtd %xmm3,%xmm2
 180:   66 0f 62 f0             punpckldq %xmm0,%xmm6
 184:   66 0f 62 fa             punpckldq %xmm2,%xmm7
 188:   66 0f 6a da             punpckhdq %xmm2,%xmm3
 18c:   66 0f 6a c8             punpckhdq %xmm0,%xmm1
 190:   66 0f 6f ee             movdqa %xmm6,%xmm5
 194:   66 44 0f 6f c7          movdqa %xmm7,%xmm8
 199:   66 0f 6f e7             movdqa %xmm7,%xmm4
 19d:   66 0f 6f c3             movdqa %xmm3,%xmm0
 1a1:   66 0f 73 d5 20          psrlq  $0x20,%xmm5
 1a6:   66 44 0f f4 c6          pmuludq %xmm6,%xmm8
 1ab:   66 0f f4 ef             pmuludq %xmm7,%xmm5
 1af:   66 0f 6f d1             movdqa %xmm1,%xmm2
 1b3:   66 0f 73 d4 20          psrlq  $0x20,%xmm4
 1b8:   66 0f 73 f5 20          psllq  $0x20,%xmm5
 1bd:   66 0f f4 e6             pmuludq %xmm6,%xmm4
 1c1:   66 41 0f d4 e8          paddq  %xmm8,%xmm5
 1c6:   66 0f 73 f4 20          psllq  $0x20,%xmm4
 1cb:   66 0f d4 e5             paddq  %xmm5,%xmm4
 1cf:   66 0f 6f eb             movdqa %xmm3,%xmm5
 1d3:   66 0f f4 c1             pmuludq %xmm1,%xmm0
 1d7:   66 0f 73 d2 20          psrlq  $0x20,%xmm2
 1dc:   66 0f f4 d3             pmuludq %xmm3,%xmm2
 1e0:   66 0f 73 f2 20          psllq  $0x20,%xmm2
 1e5:   66 0f d4 c2             paddq  %xmm2,%xmm0
 1e9:   66 0f 73 d5 20          psrlq  $0x20,%xmm5
 1ee:   66 0f f4 cd             pmuludq %xmm5,%xmm1
 1f2:   66 0f 73 f1 20          psllq  $0x20,%xmm1
 1f7:   66 0f d4 c1             paddq  %xmm1,%xmm0
 1fb:   66 0f d4 c4             paddq  %xmm4,%xmm0
 1ff:   66 0f 6f c8             movdqa %xmm0,%xmm1
 203:   66 0f 73 d9 08          psrldq $0x8,%xmm1
 208:   66 0f d4 c1             paddq  %xmm1,%xmm0
 20c:   66 0f d6 44 24 f8       movq   %xmm0,-0x8(%rsp)
 212:   48 8b 44 24 f8          mov    -0x8(%rsp),%rax
 217:   c3                      retq

$ objdump -d dotproduct-no-vectorize.o

dotproduct-no-vectorize.o:     file format elf64-x86-64

Disassembly of section .text:

0000000000000000 <dotproduct>:
   0:   31 c0                   xor    %eax,%eax
   2:   85 d2                   test   %edx,%edx
   4:   74 2a                   je     30 <dotproduct+0x30>
   6:   83 ea 01                sub    $0x1,%edx
   9:   4c 8d 0c 95 04 00 00    lea    0x4(,%rdx,4),%r9
  10:   00
  11:   31 d2                   xor    %edx,%edx
  13:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  18:   48 63 0c 16             movslq (%rsi,%rdx,1),%rcx
  1c:   4c 63 04 17             movslq (%rdi,%rdx,1),%r8
  20:   48 83 c2 04             add    $0x4,%rdx
  24:   49 0f af c8             imul   %r8,%rcx
  28:   48 01 c8                add    %rcx,%rax
  2b:   4c 39 ca                cmp    %r9,%rdx
  2e:   75 e8                   jne    18 <dotproduct+0x18>
  30:   f3 c3                   repz retq
  32:   66 66 66 66 66 2e 0f    nopw   %cs:0x0(%rax,%rax,1)
  39:   1f 84 00 00 00 00 00

0000000000000040 <dotproduct_order4>:
  40:   48 63 07                movslq (%rdi),%rax
  43:   48 63 16                movslq (%rsi),%rdx
  46:   48 63 4f 04             movslq 0x4(%rdi),%rcx
  4a:   48 0f af d0             imul   %rax,%rdx
  4e:   48 63 46 04             movslq 0x4(%rsi),%rax
  52:   48 0f af c1             imul   %rcx,%rax
  56:   48 63 4f 08             movslq 0x8(%rdi),%rcx
  5a:   48 01 c2                add    %rax,%rdx
  5d:   48 63 46 08             movslq 0x8(%rsi),%rax
  61:   48 0f af c1             imul   %rcx,%rax
  65:   48 63 4f 0c             movslq 0xc(%rdi),%rcx
  69:   48 01 c2                add    %rax,%rdx
  6c:   48 63 46 0c             movslq 0xc(%rsi),%rax
  70:   48 0f af c1             imul   %rcx,%rax
  74:   48 01 d0                add    %rdx,%rax
  77:   c3                      retq
The vectorizer creates

  vect_var_.128_46 = M*vect_p.123_44{misalignment: 0};
  vect_var_.129_47 = [vec_unpack_lo_expr] vect_var_.128_46;
  vect_var_.129_48 = [vec_unpack_hi_expr] vect_var_.128_46;
  vect_var_.135_53 = M*vect_p.130_51{misalignment: 0};
  vect_var_.136_54 = [vec_unpack_lo_expr] vect_var_.135_53;
  vect_var_.136_55 = [vec_unpack_hi_expr] vect_var_.135_53;
  vect_var_.137_56 = vect_var_.136_54 * vect_var_.129_47;
  vect_var_.137_57 = vect_var_.136_55 * vect_var_.129_48;
  vect_var_.138_59 = vect_var_.137_56 + vect_var_.138_58;
  vect_var_.138_60 = vect_var_.137_57 + vect_var_.138_59;
  v1_14 = v1_26 + 4;

but the widening unpacking results in abysmal generated code. Where are all the shifts coming from?
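For reference, each [vec_unpack_lo_expr]/[vec_unpack_hi_expr] pair sign-extends one half of a V4SI vector into a V2DI vector. A scalar sketch of the semantics (illustration only):

  #include <stdint.h>

  /* What the unpack pair computes: four 32-bit elements become two
     vectors of two sign-extended 64-bit elements each.  */
  void vec_unpack_s32(const int32_t in[4], int64_t lo[2], int64_t hi[2])
  {
      lo[0] = in[0]; lo[1] = in[1];   /* vec_unpack_lo_expr */
      hi[0] = in[2]; hi[1] = in[3];   /* vec_unpack_hi_expr */
  }

SSE2 has no sign-extending unpack, so in the disassembly this becomes pcmpgtd against zero (building the sign mask) interleaved via punpckldq/punpckhdq; those are cheap, though, and not where the shifts come from.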
(In reply to comment #1)
> but the widening unpacking results in abysmal generated code. Where are
> all the shifts coming from?

Not from the unpacking, but from the mulv2di pattern in sse.md.

Can you please attach the full source so we have an executable testcase? IIRC, execution times depend on the target processor, and perhaps the vect cost model should be updated for this case.
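To make that concrete: SSE2 has no 64-bit element multiply, only pmuludq (unsigned 32x32->64), so the mulv2di expansion assembles each V2DI lane product from three 32-bit multiplies plus the psrlq/psllq shifts visible in the disassembly. A scalar sketch of the per-lane computation (illustration only, not the actual sse.md RTL):

  #include <stdint.h>

  /* One 64x64->64 lane product built from 32x32->64 pieces, mirroring
     the pmuludq/psrlq/psllq/paddq sequence in the vectorized loop.  */
  static uint64_t mul64_emulated(uint64_t a, uint64_t b)
  {
      uint64_t a_lo = (uint32_t) a, a_hi = a >> 32;       /* psrlq $0x20 */
      uint64_t b_lo = (uint32_t) b, b_hi = b >> 32;
      uint64_t cross = (a_lo * b_hi + a_hi * b_lo) << 32; /* psllq $0x20 */
      return a_lo * b_lo + cross;                         /* pmuludq/paddq */
  }

After the unpacks each 64-bit lane actually holds a sign-extended 32-bit value, so a single widening multiply per lane would suffice, but by expansion time the information that this is a widening multiply has been lost and the full emulation is emitted.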
Created attachment 17660: tarball of a simple testcase
I've attached a simple testcase. The system I'm running this on is a Q6600 with 64-bit Linux.
The code generation for aarch64 looks fine:

dotproduct_order4:
.LFB1:
        .cfi_startproc
        ldr     q1, [x0]
        ldr     q2, [x1]
        smull   v0.2d, v2.2s, v1.2s
        smlal2  v0.2d, v2.4s, v1.4s
        addp    d0, v0.2d
        fmov    x0, d0
        ret

  vect__6.41_18 = MEM <vector(4) int> [(int32_t *)v1_2(D)];
  vect__10.44_13 = MEM <vector(4) int> [(int32_t *)v2_3(D)];
  vect_patt_25.45_8 = WIDEN_MULT_LO_EXPR <vect__10.44_13, vect__6.41_18>;
  vect_patt_25.45_4 = WIDEN_MULT_HI_EXPR <vect__10.44_13, vect__6.41_18>;
  vect_accum_14.46_31 = vect_patt_25.45_4 + vect_patt_25.45_8;
  _33 = .REDUC_PLUS (vect_accum_14.46_31); [tail call]

---- CUT ----

Even the gimple level for x86_64 looks ok:

  vect__6.41_18 = MEM <vector(4) int> [(int32_t *)v1_2(D)];
  vect__10.44_13 = MEM <vector(4) int> [(int32_t *)v2_3(D)];
  vect_patt_25.45_8 = WIDEN_MULT_LO_EXPR <vect__10.44_13, vect__6.41_18>;
  vect_patt_25.45_4 = WIDEN_MULT_HI_EXPR <vect__10.44_13, vect__6.41_18>;
  vect_accum_14.46_31 = vect_patt_25.45_4 + vect_patt_25.45_8;
  _33 = VEC_PERM_EXPR <vect_accum_14.46_31, { 0, 0 }, { 1, 2 }>;
  _34 = vect_accum_14.46_31 + _33;
  stmp_accum_14.47_35 = BIT_FIELD_REF <_34, 64, 0>;

But the expansion looks bad.
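For comparison, a rough intrinsics rendering of the aarch64 sequence (a sketch assuming exactly four elements; the function name is made up):

  #include <arm_neon.h>
  #include <stdint.h>

  /* Widening multiply of the low halves, widening multiply-accumulate
     of the high halves, then a horizontal add of the two 64-bit lanes.  */
  int64_t dotproduct_order4_neon(const int32_t *v1, const int32_t *v2)
  {
      int32x4_t a = vld1q_s32(v1);                 /* ldr q1, [x0] */
      int32x4_t b = vld1q_s32(v2);                 /* ldr q2, [x1] */
      int64x2_t acc = vmull_s32(vget_low_s32(b),   /* smull        */
                                vget_low_s32(a));
      acc = vmlal_high_s32(acc, b, a);             /* smlal2       */
      return vaddvq_s64(acc);                      /* addp + fmov  */
  }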
0x398f310 _2 * _4 1 times scalar_stmt costs 12 in body
...
0x392b3f0 _1 w* _3 2 times vec_promote_demote costs 8 in body
...
t4.c:4:12: note:  Cost model analysis:
  Vector inside of loop cost: 40
  Vector prologue cost: 4
  Vector epilogue cost: 108
  Scalar iteration cost: 40
  Scalar outside cost: 32
  Vector outside cost: 112
  prologue iterations: 0
  epilogue iterations: 2
  Calculated minimum iters for profitability: 3

so clearly the widening multiplication is not costed correctly.

With SSE 4.2 we can do better:

.L4:
        movdqu  (%rcx,%rax), %xmm0
        movdqu  (%rsi,%rax), %xmm1
        addq    $16, %rax
        movdqa  %xmm0, %xmm3
        movdqa  %xmm1, %xmm4
        punpckldq       %xmm0, %xmm3
        punpckldq       %xmm1, %xmm4
        punpckhdq       %xmm0, %xmm0
        pmuldq  %xmm4, %xmm3
        punpckhdq       %xmm1, %xmm1
        pmuldq  %xmm1, %xmm0
        paddq   %xmm3, %xmm2
        paddq   %xmm0, %xmm2
        cmpq    %rdi, %rax
        jne     .L4

but even there the costing is imprecise. The vectorizer unhelpfully categorizes the widening multiply as vec_promote_demote, which then fails to reach

      case MULT_EXPR:
      case WIDEN_MULT_EXPR:
      case MULT_HIGHPART_EXPR:
        stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
        break;

Fixing that yields

0x392b3f0 _1 w* _3 2 times vector_stmt costs 136 in body

for SSE2, SSE4.2 and AVX2 alike, so now we over-estimate the cost via

  /* V*DImode is emulated with 5-8 insns.  */
  else if (mode == V2DImode || mode == V4DImode)
    {
      if (TARGET_XOP && mode == V2DImode)
        return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 3);
      else
        return ix86_vec_cost (mode, cost->mulss * 3 + cost->sse_op * 5);
    }

with cost->mulss == 16. I suppose it is somehow failing to realize it's doing a widening multiply.
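For illustration, a hand-written intrinsics version of that strategy (a sketch assuming SSE4.1, which provides pmuldq as _mm_mul_epi32, and order a multiple of 4; the function name is made up):

  #include <stdint.h>
  #include <smmintrin.h>

  /* pmuldq sign-extends the even 32-bit element of each 64-bit lane
     and produces a full 64-bit product, so one shift per operand
     replaces the whole pcmpgtd/psrlq/psllq emulation dance.  */
  int64_t dotproduct_sse41(const int32_t *v1, const int32_t *v2, int order)
  {
      __m128i acc = _mm_setzero_si128();
      for (int i = 0; i < order; i += 4) {
          __m128i a = _mm_loadu_si128((const __m128i *) (v1 + i));
          __m128i b = _mm_loadu_si128((const __m128i *) (v2 + i));
          __m128i even = _mm_mul_epi32(a, b);      /* products 0 and 2 */
          __m128i odd  = _mm_mul_epi32(_mm_srli_epi64(a, 32),
                                       _mm_srli_epi64(b, 32)); /* 1, 3 */
          acc = _mm_add_epi64(acc, _mm_add_epi64(even, odd));
      }
      int64_t lanes[2];
      _mm_storeu_si128((__m128i *) lanes, acc);
      return lanes[0] + lanes[1];
  }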
The master branch has been updated by Richard Biener <rguenth@gcc.gnu.org>:

https://gcc.gnu.org/g:c8ce54c6e67295b70052d1b9f9a2f7ce9e2f8f0d

commit r12-2524-gc8ce54c6e67295b70052d1b9f9a2f7ce9e2f8f0d
Author: Richard Biener <rguenther@suse.de>
Date:   Tue Jul 27 09:24:57 2021 +0200

    tree-optimization/39821 - fix cost classification for widening arith

    This adjusts the vectorizer to cost vector_stmt for widening
    arithmetic instead of vec_promote_demote in the line of
    telling the target that stmt_info->stmt is the meaningful
    piece we cost.

    2021-07-27  Richard Biener  <rguenther@suse.de>

            PR tree-optimization/39821
            * tree-vect-stmts.c (vect_model_promotion_demotion_cost): Use
            vector_stmt for widening arithmetic.
            (vectorizable_conversion): Adjust.
I've pushed the change that makes us run into ix86_multiplication_cost, but as said that doesn't differentiate highpart or widening multiplies yet, so we're now missing optimizations because of too-conservative costing.
(In reply to Richard Biener from comment #8)
> I've pushed the change that makes us run into ix86_multiplication_cost,
> but as said that doesn't differentiate highpart or widening multiplies
> yet, so we're now missing optimizations because of too-conservative
> costing.

For MULT_HIGHPART_EXPR, x86 only has pmulhw, so it's probably ok to go into ix86_multiplication_cost.

For WIDEN_MULT_EXPR, we need a separate cost function which should also accept sign information, since we have pmuludq under SSE2 but pmuldq only under SSE4.1. I.e. we should vectorize udotproduct under SSE2, but sdotproduct only under SSE4.1:

#include <stdint.h>

uint64_t udotproduct(uint32_t *v1, uint32_t *v2, int order)
{
    uint64_t accum = 0;
    while (order--)
        accum += (uint64_t) *v1++ * *v2++;
    return accum;
}

#include <stdint.h>

int64_t sdotproduct(int32_t *v1, int32_t *v2, int order)
{
    int64_t accum = 0;
    while (order--)
        accum += (int64_t) *v1++ * *v2++;
    return accum;
}
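And the unsigned counterpart that plain SSE2 can already handle, showing why the signedness distinction matters for costing (again a sketch with a made-up name, assuming order is a multiple of 4):

  #include <stdint.h>
  #include <emmintrin.h>

  /* pmuludq (_mm_mul_epu32) zero-extends the even 32-bit element of
     each 64-bit lane, so the unsigned widening multiply needs no
     emulation at all on SSE2.  */
  uint64_t udotproduct_sse2(const uint32_t *v1, const uint32_t *v2, int order)
  {
      __m128i acc = _mm_setzero_si128();
      for (int i = 0; i < order; i += 4) {
          __m128i a = _mm_loadu_si128((const __m128i *) (v1 + i));
          __m128i b = _mm_loadu_si128((const __m128i *) (v2 + i));
          __m128i even = _mm_mul_epu32(a, b);
          __m128i odd  = _mm_mul_epu32(_mm_srli_epi64(a, 32),
                                       _mm_srli_epi64(b, 32));
          acc = _mm_add_epi64(acc, _mm_add_epi64(even, odd));
      }
      uint64_t lanes[2];
      _mm_storeu_si128((__m128i *) lanes, acc);
      return lanes[0] + lanes[1];
  }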
The master branch has been updated by hongtao Liu <liuhongt@gcc.gnu.org>:

https://gcc.gnu.org/g:231bcc77b953406b8381c7f55a3ec181da67d1e7

commit r12-2586-g231bcc77b953406b8381c7f55a3ec181da67d1e7
Author: liuhongt <hongtao.liu@intel.com>
Date:   Wed Jul 28 16:24:52 2021 +0800

    Add a separate function to calculate cost for WIDEN_MULT_EXPR.

    gcc/ChangeLog:

            PR target/39821
            * config/i386/i386.c (ix86_widen_mult_cost): New function.
            (ix86_add_stmt_cost): Use ix86_widen_mult_cost for
            WIDEN_MULT_EXPR.

    gcc/testsuite/ChangeLog:

            PR target/39821
            * gcc.target/i386/sse2-pr39821.c: New test.
            * gcc.target/i386/sse4-pr39821.c: New test.
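The idea behind the new hook, as an illustrative sketch (this is not the actual ix86_widen_mult_cost from i386.c; the constants are borrowed from the V2DImode emulation costs quoted in comment #7):

  /* Hypothetical signedness-aware cost for a widening V4SI -> V2DI
     multiply: unsigned (pmuludq, SSE2) or signed with SSE4.1 (pmuldq)
     needs one multiply plus a couple of shuffles; signed without
     SSE4.1 falls back to the shift-based mulv2di emulation.  */
  static int
  widen_mult_cost_sketch (bool uns_p, bool have_sse4_1,
                          int mul_cost, int sse_op_cost)
  {
    if (uns_p || have_sse4_1)
      return mul_cost + 2 * sse_op_cost;
    return 3 * mul_cost + 5 * sse_op_cost;
  }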
Fixed in GCC 12.
Fixed.