This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug tree-optimization/51499] vectorizer missing simple case
- From: "dominiq at lps dot ens.fr" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Mon, 12 Dec 2011 12:47:54 +0000
- Subject: [Bug tree-optimization/51499] vectorizer missing simple case
- Auto-submitted: auto-generated
- References: <bug-51499-4@http.gcc.gnu.org/bugzilla/>
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=51499
--- Comment #12 from Dominique d'Humieres <dominiq at lps dot ens.fr> 2011-12-12 12:47:54 UTC ---
> > even when the above loops are unrolled. How can the loop L5 be unrolled if it
> > is only there for a "scalar epilogue"?
>
> It can't be unrolled, since the alignment is unknown, so we don't know the
> number of iterations of the prologue loop, and, therefore, we don't know the
> number of iterations of the epilogue.
Well, it is unrolled with -funroll-loops: for instance, if I compile with
'-Ofast -funroll-loops --param max-unroll-times=4', I get
L3:
movsd (%r8,%r11), %xmm3
addq $4, %r10
movsd 16(%r8,%r11), %xmm5
movsd 32(%r8,%r11), %xmm7
movhpd 8(%r8,%r11), %xmm3
movsd 48(%r8,%r11), %xmm9
movhpd 24(%r8,%r11), %xmm5
movapd (%r9,%r11), %xmm4
movhpd 40(%r8,%r11), %xmm7
movapd 16(%r9,%r11), %xmm6
movhpd 56(%r8,%r11), %xmm9
movapd 32(%r9,%r11), %xmm8
mulpd %xmm3, %xmm4
movapd 48(%r9,%r11), %xmm10
mulpd %xmm5, %xmm6
mulpd %xmm7, %xmm8
mulpd %xmm9, %xmm10
movlpd %xmm4, (%rcx,%r11)
movhpd %xmm4, 8(%rcx,%r11)
movlpd %xmm6, 16(%rcx,%r11)
movhpd %xmm6, 24(%rcx,%r11)
movlpd %xmm8, 32(%rcx,%r11)
movhpd %xmm8, 40(%rcx,%r11)
movlpd %xmm10, 48(%rcx,%r11)
movhpd %xmm10, 56(%rcx,%r11)
addq $64, %r11
cmpq $504156, %r10
jbe L3
and
L5:
movsd -8(%rdi,%r9,8), %xmm15
leaq 1(%r9), %rbx
leaq 2(%r9), %r8
movsd -8(%rdi,%rbx,8), %xmm0
leaq 3(%r9), %rcx
movsd -8(%rdi,%r8,8), %xmm1
mulsd -8(%rdx,%r9,8), %xmm15
movsd -8(%rdi,%rcx,8), %xmm2
mulsd -8(%rdx,%rbx,8), %xmm0
mulsd -8(%rdx,%r8,8), %xmm1
mulsd -8(%rdx,%rcx,8), %xmm2
movsd %xmm15, -8(%rsi,%r9,8)
addq $4, %r9
cmpq %r12, %r9
movsd %xmm0, -8(%rsi,%rbx,8)
movsd %xmm1, -8(%rsi,%r8,8)
movsd %xmm2, -8(%rsi,%rcx,8)
jne L5
So both the vectorized and the unvectorized loops are unrolled four times. This
does not seem logical to me if the L5 loop is there only to handle a leftover
scalar (AFAIU each %xmm* register holds only one or two doubles, so there is at
most one element left over: if the length is odd, or if the length is even and
the first element has been peeled for alignment).
I am also puzzled by the way the vectors are stored back as a pair
movlpd %xmm4, (%rcx,%r11)
movhpd %xmm4, 8(%rcx,%r11)
Why not a 'movapd' instead?