Bug 53355 - Autovectorization of a simple loop could be improved.
Summary: Autovectorization of a simple loop could be improved.
Status: ASSIGNED
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization (show other bugs)
Version: 4.7.0
: P3 enhancement
Target Milestone: ---
Assignee: Richard Biener
URL:
Keywords:
: 18557 (view as bug list)
Depends on:
Blocks: vectorizer
  Show dependency treegraph
 
Reported: 2012-05-15 01:49 UTC by Steven Fuerst
Modified: 2013-04-19 22:03 UTC (History)
3 users (show)

See Also:
Host:
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed: 2012-05-15 00:00:00


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Steven Fuerst 2012-05-15 01:49:21 UTC
The following simple function gets autovectorized by gcc-4.7 -O3

void foo(double *x)
{
	int i;

	for (i = 0; i < 100000; i++)
	{
		x[i] += 2.0;
	}
}

However, the generated code is rather poor:

foo:
.LFB0:
	.cfi_startproc
	movq	%rdi, %rax
	salq	$60, %rax
	sarq	$63, %rax
	movq	%rax, %rdx
	andl	$1, %edx
	testq	%rdx, %rdx
	movl	%edx, %ecx
	je	.L7
	movsd	.LC0(%rip), %xmm0
	movl	$99999, %r9d
	movl	$1, %r10d
	addsd	(%rdi), %xmm0
	movsd	%xmm0, (%rdi)
.L2:
	movl	$100000, %r8d
	andl	$1, %eax
	subl	%ecx, %r8d
	movl	%r8d, %esi
	shrl	%esi
	movl	%esi, %r11d
	addl	%r11d, %r11d
	je	.L3
	movapd	.LC1(%rip), %xmm1
	leaq	(%rdi,%rax,8), %rcx
	xorl	%edx, %edx
	xorl	%eax, %eax
	.p2align 4,,10
	.p2align 3
.L4:
	movapd	(%rcx,%rax), %xmm0
	addl	$1, %edx
	addpd	%xmm1, %xmm0
	movapd	%xmm0, (%rcx,%rax)
	addq	$16, %rax
	cmpl	%esi, %edx
	jb	.L4
	addl	%r11d, %r10d
	subl	%r11d, %r9d
	cmpl	%r11d, %r8d
	je	.L1
.L3:
	leal	-1(%r9), %edx
	movslq	%r10d, %r10
	leaq	(%rdi,%r10,8), %rax
	movsd	.LC0(%rip), %xmm1
	addq	%rdx, %r10
	leaq	8(%rdi,%r10,8), %rdx
	.p2align 4,,10
	.p2align 3
.L6:
	movsd	(%rax), %xmm0
	addsd	%xmm1, %xmm0
	movsd	%xmm0, (%rax)
	addq	$8, %rax
	cmpq	%rdx, %rax
	jne	.L6
.L1:
	rep
	ret
.L7:
	movl	$100000, %r9d
	xorl	%r10d, %r10d
	jmp	.L2


The first thing wrong is that the alignment test is rather clunky.  Instead, we can just do a "testq $8, %rdi" followed by a jump to the misaligned case.

The second thing wrong is that the code, instead of falling through to the aligned case, jumps down to L7.  Forward jumps are predicted as not taken.

The inner loop is also longer than it should be.  It is possible to have one less increment statement inside.

Finally, after the main loop, the code again tests for misalignment.  This pessimizes the fast path.  Instead, it is possible to move the slow misaligned case completely out with very little size penalty.

Doing the above optimizations yields:

	movapd 2_s, %xmm1
	testq $8, %rdi
	jne 2f
	
	lea 800000(%rdi), %rax

# The "Hot" inner loop
1:	movapd	(%rdi), %xmm0
	addq	$16, %rdi
	addpd	%xmm1, %xmm0
	movapd	%xmm0, (%rdi)
	cmpq	%rdi, %rax
	jne 1b
	rep ret

# The slow misaligned case	
2:      movsd	(%rdi), %xmm0
	addsd	%xmm1, %xmm0
	movsd	%xmm0, (%rdi)

	leaq 	(800000 - 16) (%rdi), %rax
	addq	$8, %rdi

3:	movapd	(%rdi), %xmm0
	addq	$16, %rdi
	addpd	%xmm1, %xmm0
	movapd	%xmm0, (%rdi)
	cmpq	%rdi, %rax
	jne 3b

	movsd	(%rdi), %xmm0
	addsd	%xmm1, %xmm0
	movsd	%xmm0, (%rdi)
	ret
Comment 1 Richard Biener 2012-05-15 09:40:45 UTC
Confirmed.  Btw, it does not check for misalignment again but for whether
there is a remaining scalar iteration to be done.  On trunk (for GCC 4.8)
we already improve somewhat - but the odd induction variable choices
remain.

foo:
.LFB0:
        .cfi_startproc
        movq    %rdi, %rax
        salq    $60, %rax
        shrq    $63, %rax
        testl   %eax, %eax
        je      .L2
        movsd   (%rdi), %xmm0
        movl    $100000, %r8d
        movsd   .LC0(%rip), %xmm1
        subl    %eax, %r8d
        movl    %r8d, %esi
        movl    $99999, %r11d
        addsd   %xmm1, %xmm0
        shrl    %esi
        movl    %esi, %r9d
        addl    %r9d, %r9d
        movsd   %xmm0, (%rdi)
        je      .L9
        movl    $1, %r10d
.L8:
        movapd  .LC1(%rip), %xmm1
        leaq    (%rdi,%rax,8), %rcx
        xorl    %edx, %edx
        xorl    %eax, %eax
        .p2align 4,,10
        .p2align 3
.L7:
        movapd  (%rcx,%rax), %xmm0
        addl    $1, %edx
        addpd   %xmm1, %xmm0
        movapd  %xmm0, (%rcx,%rax)
        addq    $16, %rax
        cmpl    %esi, %edx
        jb      .L7
        subl    %r9d, %r11d
        cmpl    %r9d, %r8d
        leal    (%r10,%r9), %edx
        je      .L1
        movsd   .LC0(%rip), %xmm1
.L3:
        leal    -1(%r11), %ecx
        movslq  %edx, %rdx
        leaq    0(,%rdx,8), %rax
        leaq    1(%rdx,%rcx), %rdx
        salq    $3, %rdx
        .p2align 4,,10
        .p2align 3
.L6:
        movsd   (%rdi,%rax), %xmm0
        addsd   %xmm1, %xmm0
        movsd   %xmm0, (%rdi,%rax)
        addq    $8, %rax
        cmpq    %rdx, %rax
        jne     .L6
        rep
        ret
.L1:
        ret
.L2:
        movl    $100000, %r8d
        movl    $50000, %esi
        movl    $100000, %r9d
        movl    $100000, %r11d
        xorl    %r10d, %r10d
        jmp     .L8
.L9:
        movl    $1, %edx
        jmp     .L3

what we also fail to optimize are the entry checks of the vectorized loops
after the prologue loop:

  # BLOCK 2 loop_depth:0 freq:100
  # PRED: ENTRY [100.0%]  (fallthru,exec)
  D.1738_23 = (unsigned long) x_2(D);
  D.1739_24 = D.1738_23 & 15;
  D.1740_25 = D.1739_24 >> 3;
  D.1741_26 = -D.1740_25;
  D.1742_27 = (unsigned int) D.1741_26;
  D.1802_18 = D.1741_26 & 1;
  prolog_loop_niters.16_28 = (unsigned int) D.1802_18;
  if (prolog_loop_niters.16_28 == 0)
    goto <bb 12>;
  else
    goto <bb 3>;
  # SUCC: 3 [100.0%]  (false,exec) 12 (true,exec)

  # BLOCK 3 loop_depth:0 freq:100
  # PRED: 2 [100.0%]  (false,exec)
  x_65 = x_2(D);
  D.1718_37 = MEM[base: x_65, offset: 0B];
  D.1719_38 = D.1718_37 + 2.0e+0;
  MEM[base: x_65, offset: 0B] = D.1719_38;
  prolog_loop_adjusted_niters.18_52 = D.1741_26 & 1;
  niters.19_53 = 100000 - prolog_loop_niters.16_28;
  bnd.20_54 = niters.19_53 >> 1;
  ratio_mult_vf.21_55 = bnd.20_54 << 1;
  if (ratio_mult_vf.21_55 == 0)
    goto <bb 7>;
  else
    goto <bb 4>;
  # SUCC: 4 [100.0%]  (false,exec) 7 (true,exec)

we fail to see that ratio_mult_vf.21_55 is never zero.  Of course that is
because of the awkward representation of the checks.

I will look into this.
Comment 2 Richard Biener 2012-05-15 13:18:42 UTC
Author: rguenth
Date: Tue May 15 13:18:32 2012
New Revision: 187535

URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=187535
Log:
2012-05-15  Richard Guenther  <rguenther@suse.de>

	PR tree-optimization/53355
	* tree-vrp.c (extract_range_from_binary_expr_1): Handle LSHIFT_EXPRs
	by constants.

	* gcc.dg/tree-ssa/vrp67.c: New testcase.

Added:
    trunk/gcc/testsuite/gcc.dg/tree-ssa/vrp67.c
Modified:
    trunk/gcc/ChangeLog
    trunk/gcc/testsuite/ChangeLog
    trunk/gcc/tree-vrp.c
Comment 3 Richard Biener 2012-05-15 13:19:06 UTC
With the patch we now emit

foo:
.LFB0:
        .cfi_startproc
        movq    %rdi, %rax
        salq    $60, %rax
        sarq    $63, %rax
        movq    %rax, %rdx
        andl    $1, %edx
        testl   %edx, %edx
        je      .L7
        movsd   .LC0(%rip), %xmm0
        movl    $99999, %r10d
        movl    $1, %r11d
        addsd   (%rdi), %xmm0
        movsd   %xmm0, (%rdi)
.L2:
        movl    $100000, %r8d
        andl    $1, %eax
        subl    %edx, %r8d
        movapd  .LC1(%rip), %xmm1
        movl    %r8d, %esi
        leaq    (%rdi,%rax,8), %rcx
        xorl    %edx, %edx
        shrl    %esi
        xorl    %eax, %eax
        leal    (%rsi,%rsi), %r9d
        .p2align 4,,10
        .p2align 3
.L6:
        movapd  (%rcx,%rax), %xmm0
        addl    $1, %edx
        addpd   %xmm1, %xmm0
        movapd  %xmm0, (%rcx,%rax)
        addq    $16, %rax
        cmpl    %esi, %edx
        jb      .L6
        subl    %r9d, %r10d
        cmpl    %r9d, %r8d
        leal    (%r11,%r9), %edx
        je      .L1
        leal    -1(%r10), %ecx
        movslq  %edx, %rdx
        leaq    0(,%rdx,8), %rax
        movsd   .LC0(%rip), %xmm1
        leaq    1(%rdx,%rcx), %rdx
        salq    $3, %rdx
        .p2align 4,,10
        .p2align 3
.L5:
        movsd   (%rdi,%rax), %xmm0
        addsd   %xmm1, %xmm0
        movsd   %xmm0, (%rdi,%rax)
        addq    $8, %rax
        cmpq    %rdx, %rax
        jne     .L5
.L1:
        rep
        ret
.L7:
        movl    $100000, %r10d
        xorl    %r11d, %r11d
        jmp     .L2
Comment 4 Richard Biener 2012-05-16 14:46:33 UTC
One major remaining issue is that the entry checks for the versioned loops
are all based on the number of iterations of said loop instead of on the
feature (in the case of peeling for alignment we don't check for alignment,
but for whether the alignment loop runs zero times).  That causes unnecessary
code to be executed in the path that does not need it.

Another remaining issue is that we allow for

  if (unaligned)
    for (;;)
  if (any-vectorized-iterations)
    for (;;)
  if (any-remaining-iterations)
    for (;;)

thus any-vectorized-iterations may be false when we leave the alignment
loop.  That sounds bogus - this case should drop into a single purely
scalar loop instead, avoiding the check and making all cases faster.
In the testcase this gets optimized away (because we have a known number
of overall iterations), but with a variable upper bound cost considerations
should not allow this case.

In general the scalar loop version, the epilogue and only then the prologue
loop should be considered as fallback for the non profitable case (in this
order).  Not the prologue loop first.
Comment 5 Richard Biener 2012-05-18 12:32:11 UTC
The following testcase (extracted from PR53346, which stores zero though) shows
another case of slowness

void foo (int *p, int n)
{
  int i;
  for (i = 0; i < n; ++i)
    p[i] = 1;
}
Comment 6 Richard Biener 2012-07-13 13:52:25 UTC
*** Bug 18557 has been marked as a duplicate of this bug. ***