This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug tree-optimization/39821] New: 120% slowdown with vectorizer
- From: "ramiro86 at hotmail dot com" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: 20 Apr 2009 00:23:01 -0000
- Subject: [Bug tree-optimization/39821] New: 120% slowdown with vectorizer
- Reply-to: gcc-bugzilla at gcc dot gnu dot org
The vectorizer produces horrible code with this testcase:
$ cat dotproduct.c
#include "inttypes.h"
int64_t dotproduct(int32_t *v1, int32_t *v2, int order)
{
    /* Sum of pairwise products of two int32 vectors of length 'order'.
       Each product is widened to 64 bits BEFORE the multiply (the cast
       applies to the left operand, promoting the whole multiplication),
       so intermediate results cannot overflow 32 bits. */
    int64_t sum = 0;
    while (order--) {
        int64_t product = (int64_t) *v1 * *v2;
        sum += product;
        ++v1;
        ++v2;
    }
    return sum;
}
int64_t dotproduct_order4(int32_t *v1, int32_t *v2, int order)
{
return dotproduct(v1, v2, 4);
}
$ gcc-4.4rc1 -o dotproduct.o -c dotproduct.c -O3
$ gcc-4.4rc1 -o dotproduct-no-vectorize.o -c dotproduct.c -O3
-fno-tree-vectorize
$ objdump -d dotproduct.o
dotproduct.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <dotproduct>:
0: 31 c0 xor %eax,%eax
2: 85 d2 test %edx,%edx
4: 0f 84 4e 01 00 00 je 158 <dotproduct+0x158>
a: 41 89 d0 mov %edx,%r8d
d: 44 8d 52 ff lea -0x1(%rdx),%r10d
11: 41 c1 e8 02 shr $0x2,%r8d
15: 83 fa 03 cmp $0x3,%edx
18: 46 8d 0c 85 00 00 00 lea 0x0(,%r8,4),%r9d
1f: 00
20: 76 05 jbe 27 <dotproduct+0x27>
22: 45 85 c9 test %r9d,%r9d
25: 75 09 jne 30 <dotproduct+0x30>
27: 31 c0 xor %eax,%eax
29: e9 fc 00 00 00 jmpq 12a <dotproduct+0x12a>
2e: 66 90 xchg %ax,%ax
30: 66 0f ef c0 pxor %xmm0,%xmm0
34: 31 c0 xor %eax,%eax
36: 66 45 0f ef c9 pxor %xmm9,%xmm9
3b: 31 c9 xor %ecx,%ecx
3d: 0f 1f 00 nopl (%rax)
40: f3 0f 6f 14 07 movdqu (%rdi,%rax,1),%xmm2
45: 83 c1 01 add $0x1,%ecx
48: 66 41 0f 6f d9 movdqa %xmm9,%xmm3
4d: f3 0f 6f 24 06 movdqu (%rsi,%rax,1),%xmm4
52: 66 45 0f 6f c1 movdqa %xmm9,%xmm8
57: 66 0f 6f ea movdqa %xmm2,%xmm5
5b: 48 83 c0 10 add $0x10,%rax
5f: 66 0f 66 dc pcmpgtd %xmm4,%xmm3
63: 66 0f 6f fc movdqa %xmm4,%xmm7
67: 66 44 0f 66 c2 pcmpgtd %xmm2,%xmm8
6c: 41 39 c8 cmp %ecx,%r8d
6f: 66 0f 62 fb punpckldq %xmm3,%xmm7
73: 66 41 0f 62 e8 punpckldq %xmm8,%xmm5
78: 66 0f 6a e3 punpckhdq %xmm3,%xmm4
7c: 66 41 0f 6a d0 punpckhdq %xmm8,%xmm2
81: 66 0f 6f cf movdqa %xmm7,%xmm1
85: 66 0f 6f f5 movdqa %xmm5,%xmm6
89: 66 44 0f 6f d7 movdqa %xmm7,%xmm10
8e: 66 0f f4 cd pmuludq %xmm5,%xmm1
92: 66 0f 6f da movdqa %xmm2,%xmm3
96: 66 0f 73 d6 20 psrlq $0x20,%xmm6
9b: 66 0f f4 f7 pmuludq %xmm7,%xmm6
9f: 66 41 0f 73 d2 20 psrlq $0x20,%xmm10
a5: 66 0f 73 f6 20 psllq $0x20,%xmm6
aa: 66 41 0f f4 ea pmuludq %xmm10,%xmm5
af: 66 0f d4 ce paddq %xmm6,%xmm1
b3: 66 0f 73 f5 20 psllq $0x20,%xmm5
b8: 66 0f d4 cd paddq %xmm5,%xmm1
bc: 66 0f 6f ec movdqa %xmm4,%xmm5
c0: 66 0f d4 c8 paddq %xmm0,%xmm1
c4: 66 0f 73 d3 20 psrlq $0x20,%xmm3
c9: 66 0f 6f c4 movdqa %xmm4,%xmm0
cd: 66 0f f4 dc pmuludq %xmm4,%xmm3
d1: 66 0f 73 f3 20 psllq $0x20,%xmm3
d6: 66 0f 73 d5 20 psrlq $0x20,%xmm5
db: 66 0f f4 c2 pmuludq %xmm2,%xmm0
df: 66 0f f4 d5 pmuludq %xmm5,%xmm2
e3: 66 0f d4 c3 paddq %xmm3,%xmm0
e7: 66 0f 73 f2 20 psllq $0x20,%xmm2
ec: 66 0f d4 c2 paddq %xmm2,%xmm0
f0: 66 0f d4 c1 paddq %xmm1,%xmm0
f4: 0f 87 46 ff ff ff ja 40 <dotproduct+0x40>
fa: 42 8d 0c 8d 00 00 00 lea 0x0(,%r9,4),%ecx
101: 00
102: 66 0f 6f c8 movdqa %xmm0,%xmm1
106: 45 29 ca sub %r9d,%r10d
109: 89 c9 mov %ecx,%ecx
10b: 66 0f 73 d9 08 psrldq $0x8,%xmm1
110: 66 0f d4 c1 paddq %xmm1,%xmm0
114: 48 01 cf add %rcx,%rdi
117: 48 01 ce add %rcx,%rsi
11a: 44 39 ca cmp %r9d,%edx
11d: 66 0f d6 44 24 f8 movq %xmm0,-0x8(%rsp)
123: 48 8b 44 24 f8 mov -0x8(%rsp),%rax
128: 74 2e je 158 <dotproduct+0x158>
12a: 45 89 d2 mov %r10d,%r10d
12d: 31 d2 xor %edx,%edx
12f: 4e 8d 0c 95 04 00 00 lea 0x4(,%r10,4),%r9
136: 00
137: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1)
13e: 00 00
140: 48 63 0c 16 movslq (%rsi,%rdx,1),%rcx
144: 4c 63 04 17 movslq (%rdi,%rdx,1),%r8
148: 48 83 c2 04 add $0x4,%rdx
14c: 49 0f af c8 imul %r8,%rcx
150: 48 01 c8 add %rcx,%rax
153: 4c 39 ca cmp %r9,%rdx
156: 75 e8 jne 140 <dotproduct+0x140>
158: f3 c3 repz retq
15a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
0000000000000160 <dotproduct_order4>:
160: 66 0f ef c0 pxor %xmm0,%xmm0
164: f3 0f 6f 0f movdqu (%rdi),%xmm1
168: f3 0f 6f 1e movdqu (%rsi),%xmm3
16c: 66 0f 6f d0 movdqa %xmm0,%xmm2
170: 66 0f 6f f1 movdqa %xmm1,%xmm6
174: 66 0f 66 c1 pcmpgtd %xmm1,%xmm0
178: 66 0f 6f fb movdqa %xmm3,%xmm7
17c: 66 0f 66 d3 pcmpgtd %xmm3,%xmm2
180: 66 0f 62 f0 punpckldq %xmm0,%xmm6
184: 66 0f 62 fa punpckldq %xmm2,%xmm7
188: 66 0f 6a da punpckhdq %xmm2,%xmm3
18c: 66 0f 6a c8 punpckhdq %xmm0,%xmm1
190: 66 0f 6f ee movdqa %xmm6,%xmm5
194: 66 44 0f 6f c7 movdqa %xmm7,%xmm8
199: 66 0f 6f e7 movdqa %xmm7,%xmm4
19d: 66 0f 6f c3 movdqa %xmm3,%xmm0
1a1: 66 0f 73 d5 20 psrlq $0x20,%xmm5
1a6: 66 44 0f f4 c6 pmuludq %xmm6,%xmm8
1ab: 66 0f f4 ef pmuludq %xmm7,%xmm5
1af: 66 0f 6f d1 movdqa %xmm1,%xmm2
1b3: 66 0f 73 d4 20 psrlq $0x20,%xmm4
1b8: 66 0f 73 f5 20 psllq $0x20,%xmm5
1bd: 66 0f f4 e6 pmuludq %xmm6,%xmm4
1c1: 66 41 0f d4 e8 paddq %xmm8,%xmm5
1c6: 66 0f 73 f4 20 psllq $0x20,%xmm4
1cb: 66 0f d4 e5 paddq %xmm5,%xmm4
1cf: 66 0f 6f eb movdqa %xmm3,%xmm5
1d3: 66 0f f4 c1 pmuludq %xmm1,%xmm0
1d7: 66 0f 73 d2 20 psrlq $0x20,%xmm2
1dc: 66 0f f4 d3 pmuludq %xmm3,%xmm2
1e0: 66 0f 73 f2 20 psllq $0x20,%xmm2
1e5: 66 0f d4 c2 paddq %xmm2,%xmm0
1e9: 66 0f 73 d5 20 psrlq $0x20,%xmm5
1ee: 66 0f f4 cd pmuludq %xmm5,%xmm1
1f2: 66 0f 73 f1 20 psllq $0x20,%xmm1
1f7: 66 0f d4 c1 paddq %xmm1,%xmm0
1fb: 66 0f d4 c4 paddq %xmm4,%xmm0
1ff: 66 0f 6f c8 movdqa %xmm0,%xmm1
203: 66 0f 73 d9 08 psrldq $0x8,%xmm1
208: 66 0f d4 c1 paddq %xmm1,%xmm0
20c: 66 0f d6 44 24 f8 movq %xmm0,-0x8(%rsp)
212: 48 8b 44 24 f8 mov -0x8(%rsp),%rax
217: c3 retq
$ objdump -d dotproduct-no-vectorize.o
dotproduct-no-vectorize.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <dotproduct>:
0: 31 c0 xor %eax,%eax
2: 85 d2 test %edx,%edx
4: 74 2a je 30 <dotproduct+0x30>
6: 83 ea 01 sub $0x1,%edx
9: 4c 8d 0c 95 04 00 00 lea 0x4(,%rdx,4),%r9
10: 00
11: 31 d2 xor %edx,%edx
13: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
18: 48 63 0c 16 movslq (%rsi,%rdx,1),%rcx
1c: 4c 63 04 17 movslq (%rdi,%rdx,1),%r8
20: 48 83 c2 04 add $0x4,%rdx
24: 49 0f af c8 imul %r8,%rcx
28: 48 01 c8 add %rcx,%rax
2b: 4c 39 ca cmp %r9,%rdx
2e: 75 e8 jne 18 <dotproduct+0x18>
30: f3 c3 repz retq
32: 66 66 66 66 66 2e 0f nopw %cs:0x0(%rax,%rax,1)
39: 1f 84 00 00 00 00 00
0000000000000040 <dotproduct_order4>:
40: 48 63 07 movslq (%rdi),%rax
43: 48 63 16 movslq (%rsi),%rdx
46: 48 63 4f 04 movslq 0x4(%rdi),%rcx
4a: 48 0f af d0 imul %rax,%rdx
4e: 48 63 46 04 movslq 0x4(%rsi),%rax
52: 48 0f af c1 imul %rcx,%rax
56: 48 63 4f 08 movslq 0x8(%rdi),%rcx
5a: 48 01 c2 add %rax,%rdx
5d: 48 63 46 08 movslq 0x8(%rsi),%rax
61: 48 0f af c1 imul %rcx,%rax
65: 48 63 4f 0c movslq 0xc(%rdi),%rcx
69: 48 01 c2 add %rax,%rdx
6c: 48 63 46 0c movslq 0xc(%rsi),%rax
70: 48 0f af c1 imul %rcx,%rax
74: 48 01 d0 add %rdx,%rax
77: c3 retq
--
Summary: 120% slowdown with vectorizer
Product: gcc
Version: 4.4.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: ramiro86 at hotmail dot com
GCC target triplet: x86_64-linux-gnu
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39821