[Bug target/51179] New: poor vectorization on interlagos.
Joost.VandeVondele at mat dot ethz.ch
gcc-bugzilla@gcc.gnu.org
Wed Nov 16 19:25:00 GMT 2011
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=51179
Bug #: 51179
Summary: poor vectorization on interlagos.
Classification: Unclassified
Product: gcc
Version: 4.6.1
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
AssignedTo: unassigned@gcc.gnu.org
ReportedBy: Joost.VandeVondele@mat.ethz.ch
The following code executes significantly faster when compiled with the Cray
compiler than with gcc (gcc: 43.4s, Cray: 7.7s, for 100000000 calls):
SUBROUTINE smm_dnn_4_10_10_4_1_2_1(A,B,C)
REAL(KIND=KIND(0.0D0)) :: C(4,10), B(10,10), A(4,10)
INTEGER ::i,j,l
DO j= 1 , 10 , 2
DO l= 1 , 10 , 1
DO i= 1 , 4 , 1
C(i+0,j+0)=C(i+0,j+0)+A(i+0,l+0)*B(l+0,j+0)
C(i+0,j+1)=C(i+0,j+1)+A(i+0,l+0)*B(l+0,j+1)
ENDDO
ENDDO
ENDDO
END SUBROUTINE
cray options: -h noomp -e m -F -ra -O2 -Oipa1 -v tst.f90
gfortran: -O3 -march=native -ffast-math
where, for gfortran, -march=native expands to the following target flags:
-march=bdver1 -mcx16 -msahf -mno-movbe -maes -mpclmul -mpopcnt -mabm -mlwp
-mno-fma -mfma4 -mxop -mno-bmi -mno-tbm -mavx -msse4.2 -msse4.1
The Cray-generated code looks clean and efficient (broadcast + FMA in a tight loop):
0000000000000000 <smm_dnn_4_10_10_4_1_2_1_>:
0: 48 89 7c 24 f8 mov %rdi,-0x8(%rsp)
5: 48 89 74 24 f0 mov %rsi,-0x10(%rsp)
a: 48 89 54 24 e8 mov %rdx,-0x18(%rsp)
f: c5 fc 10 02 vmovups (%rdx),%ymm0
13: c5 fc 10 4a 20 vmovups 0x20(%rdx),%ymm1
18: c5 fc 10 52 40 vmovups 0x40(%rdx),%ymm2
1d: c5 fc 10 5a 60 vmovups 0x60(%rdx),%ymm3
22: c5 fc 10 a2 80 00 00 vmovups 0x80(%rdx),%ymm4
29: 00
2a: c5 fc 10 aa a0 00 00 vmovups 0xa0(%rdx),%ymm5
31: 00
32: c5 fc 10 b2 c0 00 00 vmovups 0xc0(%rdx),%ymm6
39: 00
3a: c5 fc 10 ba e0 00 00 vmovups 0xe0(%rdx),%ymm7
41: 00
42: c5 7c 10 82 00 01 00 vmovups 0x100(%rdx),%ymm8
49: 00
4a: c5 7c 10 8a 20 01 00 vmovups 0x120(%rdx),%ymm9
51: 00
52: 31 c0 xor %eax,%eax
54: 48 89 c1 mov %rax,%rcx
57: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1)
5e: 00 00
60: c4 62 7d 19 94 c6 d0 vbroadcastsd 0x2d0(%rsi,%rax,8),%ymm10
67: 02 00 00
6a: c5 7c 10 1c 0f vmovups (%rdi,%rcx,1),%ymm11
6f: c4 43 a5 69 c9 a0 vfmaddpd %ymm9,%ymm10,%ymm11,%ymm9
75: c4 62 7d 19 94 c6 80 vbroadcastsd 0x280(%rsi,%rax,8),%ymm10
7c: 02 00 00
7f: c4 43 a5 69 c0 a0 vfmaddpd %ymm8,%ymm10,%ymm11,%ymm8
85: c4 62 7d 19 94 c6 30 vbroadcastsd 0x230(%rsi,%rax,8),%ymm10
8c: 02 00 00
8f: c4 e3 a5 69 ff a0 vfmaddpd %ymm7,%ymm10,%ymm11,%ymm7
95: c4 62 7d 19 94 c6 e0 vbroadcastsd 0x1e0(%rsi,%rax,8),%ymm10
9c: 01 00 00
9f: c4 e3 a5 69 f6 a0 vfmaddpd %ymm6,%ymm10,%ymm11,%ymm6
a5: c4 62 7d 19 94 c6 90 vbroadcastsd 0x190(%rsi,%rax,8),%ymm10
ac: 01 00 00
af: c4 e3 a5 69 ed a0 vfmaddpd %ymm5,%ymm10,%ymm11,%ymm5
b5: c4 62 7d 19 94 c6 40 vbroadcastsd 0x140(%rsi,%rax,8),%ymm10
bc: 01 00 00
bf: c4 e3 a5 69 e4 a0 vfmaddpd %ymm4,%ymm10,%ymm11,%ymm4
c5: c4 62 7d 19 94 c6 f0 vbroadcastsd 0xf0(%rsi,%rax,8),%ymm10
cc: 00 00 00
cf: c4 e3 a5 69 db a0 vfmaddpd %ymm3,%ymm10,%ymm11,%ymm3
d5: c4 62 7d 19 94 c6 a0 vbroadcastsd 0xa0(%rsi,%rax,8),%ymm10
dc: 00 00 00
df: c4 e3 a5 69 d2 a0 vfmaddpd %ymm2,%ymm10,%ymm11,%ymm2
e5: c4 62 7d 19 54 c6 50 vbroadcastsd 0x50(%rsi,%rax,8),%ymm10
ec: c4 e3 a5 69 c9 a0 vfmaddpd %ymm1,%ymm10,%ymm11,%ymm1
f2: c4 62 7d 19 14 c6 vbroadcastsd (%rsi,%rax,8),%ymm10
f8: c4 e3 a5 69 c0 a0 vfmaddpd %ymm0,%ymm10,%ymm11,%ymm0
fe: 48 83 c1 20 add $0x20,%rcx
102: 48 ff c0 inc %rax
105: 48 83 f8 0a cmp $0xa,%rax
109: 0f 8c 51 ff ff ff jl 60 <smm_dnn_4_10_10_4_1_2_1_+0x60>
10f: c5 78 11 8a 20 01 00 vmovups %xmm9,0x120(%rdx)
116: 00
117: c4 63 7d 19 8a 30 01 vextractf128 $0x1,%ymm9,0x130(%rdx)
11e: 00 00 01
121: c5 78 11 82 00 01 00 vmovups %xmm8,0x100(%rdx)
128: 00
129: c4 63 7d 19 82 10 01 vextractf128 $0x1,%ymm8,0x110(%rdx)
130: 00 00 01
133: c5 f8 11 ba e0 00 00 vmovups %xmm7,0xe0(%rdx)
13a: 00
13b: c4 e3 7d 19 ba f0 00 vextractf128 $0x1,%ymm7,0xf0(%rdx)
142: 00 00 01
145: c5 f8 11 b2 c0 00 00 vmovups %xmm6,0xc0(%rdx)
14c: 00
14d: c4 e3 7d 19 b2 d0 00 vextractf128 $0x1,%ymm6,0xd0(%rdx)
154: 00 00 01
157: c5 f8 11 aa a0 00 00 vmovups %xmm5,0xa0(%rdx)
15e: 00
15f: c4 e3 7d 19 aa b0 00 vextractf128 $0x1,%ymm5,0xb0(%rdx)
166: 00 00 01
169: c5 f8 11 a2 80 00 00 vmovups %xmm4,0x80(%rdx)
170: 00
171: c4 e3 7d 19 a2 90 00 vextractf128 $0x1,%ymm4,0x90(%rdx)
178: 00 00 01
17b: c5 f8 11 5a 60 vmovups %xmm3,0x60(%rdx)
180: c4 e3 7d 19 5a 70 01 vextractf128 $0x1,%ymm3,0x70(%rdx)
187: c5 f8 11 52 40 vmovups %xmm2,0x40(%rdx)
18c: c4 e3 7d 19 52 50 01 vextractf128 $0x1,%ymm2,0x50(%rdx)
193: c5 f8 11 4a 20 vmovups %xmm1,0x20(%rdx)
198: c4 e3 7d 19 4a 30 01 vextractf128 $0x1,%ymm1,0x30(%rdx)
19f: c5 f8 11 02 vmovups %xmm0,(%rdx)
1a3: c4 e3 7d 19 42 10 01 vextractf128 $0x1,%ymm0,0x10(%rdx)
1aa: c5 f8 77 vzeroupper
1ad: c3 retq
1ae: 66 90 xchg %ax,%ax
gcc's generated code is considerably more involved, with many cross-lane shuffles (vperm2f128/vunpck*pd) and stack spills inside the loop:
smm_dnn_4_10_10_4_1_2_1_:
.LFB0:
pushq %rbp
.LCFI0:
movl $1, %eax
movq %rsp, %rbp
.LCFI1:
andq $-32, %rsp
subq $616, %rsp
.LCFI2:
vmovupd 96(%rdi), %ymm0
vmovupd (%rdi), %ymm3
vmovupd 32(%rdi), %ymm1
vmovsd 280(%rdi), %xmm13
vmovupd 64(%rdi), %ymm2
vmovsd 288(%rdi), %xmm15
vmovsd 256(%rdi), %xmm4
vmovsd 264(%rdi), %xmm6
vmovsd 272(%rdi), %xmm7
vmovupd 128(%rdi), %ymm12
vmovsd %xmm13, 296(%rsp)
vmovupd 160(%rdi), %ymm11
vperm2f128 $32, %ymm1, %ymm3, %ymm13
vmovsd %xmm15, 288(%rsp)
vperm2f128 $49, %ymm1, %ymm3, %ymm1
vmovsd %xmm4, 320(%rsp)
vperm2f128 $32, %ymm0, %ymm2, %ymm15
vmovsd 296(%rdi), %xmm4
vperm2f128 $49, %ymm0, %ymm2, %ymm2
vmovsd %xmm6, 312(%rsp)
vmovaps %ymm1, 40(%rsp)
vunpcklpd %ymm1, %ymm13, %ymm1
vmovsd 304(%rdi), %xmm6
vunpcklpd %ymm2, %ymm15, %ymm0
vmovsd %xmm7, 304(%rsp)
vmovsd 312(%rdi), %xmm7
vmovaps %ymm2, -24(%rsp)
vperm2f128 $32, %ymm0, %ymm1, %ymm2
vmovupd 192(%rdi), %ymm10
vperm2f128 $49, %ymm0, %ymm1, %ymm0
vmovsd %xmm4, 280(%rsp)
vmovsd %xmm6, 336(%rsp)
vmovaps %ymm13, %ymm4
vmovsd %xmm7, 328(%rsp)
vmovaps %ymm15, %ymm6
vmovaps %ymm2, %ymm7
vunpcklpd %ymm0, %ymm2, %ymm8
vmovupd 224(%rdi), %ymm9
vmovaps %ymm13, 72(%rsp)
vmovaps %ymm15, 8(%rsp)
vmovaps %ymm2, -56(%rsp)
vmovaps %ymm0, -88(%rsp)
vxorps %xmm0, %xmm0, %xmm0
.L3:
vunpckhpd 40(%rsp), %ymm4, %ymm3
vmovupd (%rsi), %ymm4
vunpckhpd -24(%rsp), %ymm6, %ymm1
vunpckhpd -88(%rsp), %ymm7, %ymm5
vperm2f128 $32, %ymm1, %ymm3, %ymm2
vperm2f128 $49, %ymm1, %ymm3, %ymm1
vfmaddpd %ymm0, %ymm5, %ymm4, %ymm15
vfmaddpd %ymm0, %ymm8, %ymm4, %ymm3
vunpcklpd %ymm1, %ymm2, %ymm6
vunpckhpd %ymm1, %ymm2, %ymm2
vmovupd 80(%rsi), %ymm1
vfmaddpd %ymm0, %ymm6, %ymm4, %ymm13
vfmaddpd %ymm0, %ymm2, %ymm4, %ymm4
vmovaps %ymm15, 200(%rsp)
vmovsd 320(%rsp), %xmm15
vfmaddpd %ymm0, %ymm8, %ymm1, %ymm14
vfmaddpd %ymm0, %ymm6, %ymm1, %ymm6
vfmaddpd %ymm0, %ymm5, %ymm1, %ymm5
vfmaddpd %ymm0, %ymm2, %ymm1, %ymm1
vperm2f128 $32, %ymm11, %ymm12, %ymm2
vmovaps %ymm13, -120(%rsp)
vmovsd 64(%rsi), %xmm13
vmovaps %ymm4, 136(%rsp)
vmovaps %ymm6, 232(%rsp)
vfmaddsd (%rdx), %xmm15, %xmm13, %xmm15
vmovaps %ymm1, 104(%rsp)
vperm2f128 $49, %ymm11, %ymm12, %ymm1
vmovaps %ymm5, 168(%rsp)
vperm2f128 $32, %ymm9, %ymm10, %ymm5
vunpcklpd %ymm1, %ymm2, %ymm6
vmovsd %xmm13, 344(%rsp)
vunpckhpd %ymm1, %ymm2, %ymm2
vperm2f128 $49, %ymm9, %ymm10, %ymm1
vunpcklpd %ymm1, %ymm5, %ymm4
vmovsd %xmm15, 352(%rsp)
vunpckhpd %ymm1, %ymm5, %ymm1
vperm2f128 $32, %ymm4, %ymm6, %ymm5
vperm2f128 $49, %ymm4, %ymm6, %ymm4
vunpcklpd %ymm4, %ymm5, %ymm7
vunpckhpd %ymm4, %ymm5, %ymm5
vperm2f128 $32, %ymm1, %ymm2, %ymm4
vperm2f128 $49, %ymm1, %ymm2, %ymm1
vmovupd 32(%rsi), %ymm2
vunpcklpd %ymm1, %ymm4, %ymm6
vunpckhpd %ymm1, %ymm4, %ymm4
vmovupd 112(%rsi), %ymm1
vfmaddpd %ymm3, %ymm7, %ymm2, %ymm3
vfmaddpd %ymm14, %ymm7, %ymm1, %ymm7
vhaddpd %ymm3, %ymm3, %ymm3
vhaddpd %ymm7, %ymm7, %ymm7
vperm2f128 $1, %ymm3, %ymm3, %ymm15
vaddpd %ymm15, %ymm3, %ymm3
vmovaps %ymm3, 584(%rsp)
vmovsd 352(%rsp), %xmm3
vaddsd 584(%rsp), %xmm3, %xmm3
vmovsd 144(%rsi), %xmm15
vmovsd %xmm3, 264(%rsp)
vmovsd 320(%rsp), %xmm3
vfmaddsd 32(%rdx), %xmm3, %xmm15, %xmm13
vperm2f128 $1, %ymm7, %ymm7, %ymm3
vaddpd %ymm3, %ymm7, %ymm3
vmovaps %ymm3, 552(%rsp)
vmovsd 312(%rsp), %xmm3
vaddsd 552(%rsp), %xmm13, %xmm13
vmovsd %xmm13, 272(%rsp)
vmovsd 344(%rsp), %xmm13
vfmaddsd 8(%rdx), %xmm3, %xmm13, %xmm7
vfmaddpd -120(%rsp), %ymm6, %ymm2, %ymm13
vhaddpd %ymm13, %ymm13, %ymm13
vperm2f128 $1, %ymm13, %ymm13, %ymm3
vaddpd %ymm3, %ymm13, %ymm3
vmovaps %ymm3, 520(%rsp)
vaddsd 520(%rsp), %xmm7, %xmm7
vmovsd %xmm7, 352(%rsp)
vfmaddpd 232(%rsp), %ymm6, %ymm1, %ymm6
vmovsd 312(%rsp), %xmm13
vfmaddsd 40(%rdx), %xmm13, %xmm15, %xmm7
vhaddpd %ymm6, %ymm6, %ymm6
vperm2f128 $1, %ymm6, %ymm6, %ymm3
vaddpd %ymm3, %ymm6, %ymm3
vmovsd 304(%rsp), %xmm6
vmovaps %ymm3, 488(%rsp)
vmovsd 344(%rsp), %xmm3
vaddsd 488(%rsp), %xmm7, %xmm13
vfmaddsd 16(%rdx), %xmm6, %xmm3, %xmm7
vfmaddpd 200(%rsp), %ymm5, %ymm2, %ymm3
vfmaddpd 168(%rsp), %ymm5, %ymm1, %ymm5
vfmaddpd 136(%rsp), %ymm4, %ymm2, %ymm2
vfmaddpd 104(%rsp), %ymm4, %ymm1, %ymm1
vmovsd 288(%rsp), %xmm4
vhaddpd %ymm3, %ymm3, %ymm3
vhaddpd %ymm5, %ymm5, %ymm5
vhaddpd %ymm2, %ymm2, %ymm2
vhaddpd %ymm1, %ymm1, %ymm1
vperm2f128 $1, %ymm3, %ymm3, %ymm6
vaddpd %ymm6, %ymm3, %ymm3
vmovaps %ymm3, 456(%rsp)
vperm2f128 $1, %ymm5, %ymm5, %ymm3
vaddpd %ymm3, %ymm5, %ymm3
vaddsd 456(%rsp), %xmm7, %xmm14
vmovsd 304(%rsp), %xmm7
vfmaddsd 48(%rdx), %xmm7, %xmm15, %xmm6
vmovsd 296(%rsp), %xmm7
vmovaps %ymm3, 424(%rsp)
vmovsd 344(%rsp), %xmm3
vfmaddsd 24(%rdx), %xmm7, %xmm3, %xmm5
vperm2f128 $1, %ymm2, %ymm2, %ymm3
vaddpd %ymm3, %ymm2, %ymm2
vaddsd 424(%rsp), %xmm6, %xmm6
vmovaps %ymm2, 392(%rsp)
vperm2f128 $1, %ymm1, %ymm1, %ymm2
vaddpd %ymm2, %ymm1, %ymm1
vfmaddsd 56(%rdx), %xmm7, %xmm15, %xmm15
vaddsd 392(%rsp), %xmm5, %xmm5
vmovaps %ymm1, 360(%rsp)
vmovsd 72(%rsi), %xmm2
vmovsd 152(%rsi), %xmm1
addq $160, %rsi
vaddsd 360(%rsp), %xmm15, %xmm15
vfmaddsd 264(%rsp), %xmm4, %xmm2, %xmm3
vmovsd %xmm3, (%rdx)
vfmaddsd 272(%rsp), %xmm4, %xmm1, %xmm3
vmovsd %xmm3, 32(%rdx)
vmovsd 280(%rsp), %xmm3
vfmaddsd 352(%rsp), %xmm3, %xmm2, %xmm7
vmovsd %xmm7, 8(%rdx)
vfmaddsd %xmm13, %xmm3, %xmm1, %xmm7
vfmaddsd %xmm6, 336(%rsp), %xmm1, %xmm6
vfmaddsd %xmm5, 328(%rsp), %xmm2, %xmm5
vfmaddsd %xmm15, 328(%rsp), %xmm1, %xmm1
vmovsd %xmm7, 40(%rdx)
vfmaddsd %xmm14, 336(%rsp), %xmm2, %xmm7
vmovsd %xmm6, 48(%rdx)
vmovsd %xmm5, 24(%rdx)
vmovsd %xmm1, 56(%rdx)
vmovsd %xmm7, 16(%rdx)
addq $64, %rdx
cmpl $9, %eax
je .L1
addl $2, %eax
vmovaps 72(%rsp), %ymm4
vmovaps 8(%rsp), %ymm6
vmovaps -56(%rsp), %ymm7
jmp .L3
.p2align 5,,7
.p2align 3
.L1:
leave
.LCFI3:
More information about the Gcc-bugs
mailing list