Bug 88492 - SLP optimization generates ugly code
Summary: SLP optimization generates ugly code
Status: ASSIGNED
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization (show other bugs)
Version: 9.0
: P3 normal
Target Milestone: ---
Assignee: Tamar Christina
URL:
Keywords: missed-optimization
Depends on:
Blocks: spec vectorizer
  Show dependency treegraph
 
Reported: 2018-12-14 05:16 UTC by Jiangning Liu
Modified: 2021-04-15 01:46 UTC (History)
4 users (show)

See Also:
Host:
Target: aarch64
Build:
Known to work:
Known to fail:
Last reconfirmed: 2018-12-14 00:00:00


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Jiangning Liu 2018-12-14 05:16:54 UTC
For aarch64, SLP optimization generates ugly code for the case below,

int test_slp( unsigned char *b )
{
        unsigned int tmp[4][4];
        int sum = 0;
        for( int i = 0; i < 4; i++, b += 4 )
        {
                tmp[i][0] = b[0];
                tmp[i][2] = b[1];
                tmp[i][1] = b[2];
                tmp[i][3] = b[3];
        }
        for( int i = 0; i < 4; i++ )
        {
                sum += tmp[0][i] + tmp[1][i] + tmp[2][i] + tmp[3][i];
        }
        return sum;
}

With command line "gcc -O3", the following code is generated,

0000000000000000 <test_slp>:
   0:   90000001        adrp    x1, 0 <test_slp>
   4:   d10103ff        sub     sp, sp, #0x40
   8:   3dc00001        ldr     q1, [x0]
   c:   3dc00020        ldr     q0, [x1]
  10:   4e000021        tbl     v1.16b, {v1.16b}, v0.16b
  14:   2f08a422        uxtl    v2.8h, v1.8b
  18:   6f08a421        uxtl2   v1.8h, v1.16b
  1c:   2f10a443        uxtl    v3.4s, v2.4h
  20:   6f10a442        uxtl2   v2.4s, v2.8h
  24:   2f10a420        uxtl    v0.4s, v1.4h
  28:   6f10a421        uxtl2   v1.4s, v1.8h
  2c:   9e660060        fmov    x0, d3
  30:   ad000be3        stp     q3, q2, [sp]
  34:   b9401be8        ldr     w8, [sp, #24]
  38:   ad0107e0        stp     q0, q1, [sp, #32]
  3c:   9e660022        fmov    x2, d1
  40:   d360fc01        lsr     x1, x0, #32
  44:   9e660040        fmov    x0, d2
  48:   294117e6        ldp     w6, w5, [sp, #8]
  4c:   d360fc43        lsr     x3, x2, #32
  50:   b9402be2        ldr     w2, [sp, #40]
  54:   d360fc07        lsr     x7, x0, #32
  58:   9e660000        fmov    x0, d0
  5c:   0ea18400        add     v0.2s, v0.2s, v1.2s
  60:   0b0100e7        add     w7, w7, w1
  64:   0b0800c6        add     w6, w6, w8
  68:   b9401fe8        ldr     w8, [sp, #28]
  6c:   d360fc00        lsr     x0, x0, #32
  70:   1e260001        fmov    w1, s0
  74:   0ea28460        add     v0.2s, v3.2s, v2.2s
  78:   0b000063        add     w3, w3, w0
  7c:   0b070063        add     w3, w3, w7
  80:   29471fe0        ldp     w0, w7, [sp, #56]
  84:   1e260004        fmov    w4, s0
  88:   0b000042        add     w2, w2, w0
  8c:   b9402fe0        ldr     w0, [sp, #44]
  90:   0b060042        add     w2, w2, w6
  94:   0b040021        add     w1, w1, w4
  98:   0b070000        add     w0, w0, w7
  9c:   0b030021        add     w1, w1, w3
  a0:   0b0800a3        add     w3, w5, w8
  a4:   0b020021        add     w1, w1, w2
  a8:   0b030000        add     w0, w0, w3
  ac:   0b000020        add     w0, w1, w0
  b0:   910103ff        add     sp, sp, #0x40
  b4:   d65f03c0        ret

In the code, vectorization code is generated, but there are ugly instructions generated as well, e.g. memory store and register copy from SIMD register to general purpose register.

With command line "gcc -O3 -fno-tree-slp-vectorize", the following code can be generated, and it looks pretty clean. Usually, this code sequence is friendly to hardware prefetch.

0000000000000000 <test_slp>:
   0:   39402004        ldrb    w4, [x0, #8]
   4:   39401002        ldrb    w2, [x0, #4]
   8:   39403001        ldrb    w1, [x0, #12]
   c:   39400003        ldrb    w3, [x0]
  10:   39402806        ldrb    w6, [x0, #10]
  14:   0b040021        add     w1, w1, w4
  18:   39401805        ldrb    w5, [x0, #6]
  1c:   0b020063        add     w3, w3, w2
  20:   39403804        ldrb    w4, [x0, #14]
  24:   0b030021        add     w1, w1, w3
  28:   39400802        ldrb    w2, [x0, #2]
  2c:   39400403        ldrb    w3, [x0, #1]
  30:   0b060084        add     w4, w4, w6
  34:   39402407        ldrb    w7, [x0, #9]
  38:   0b050042        add     w2, w2, w5
  3c:   39401406        ldrb    w6, [x0, #5]
  40:   0b020084        add     w4, w4, w2
  44:   39403405        ldrb    w5, [x0, #13]
  48:   0b040021        add     w1, w1, w4
  4c:   0b060063        add     w3, w3, w6
  50:   39400c02        ldrb    w2, [x0, #3]
  54:   0b0700a5        add     w5, w5, w7
  58:   39403c04        ldrb    w4, [x0, #15]
  5c:   0b050063        add     w3, w3, w5
  60:   39401c06        ldrb    w6, [x0, #7]
  64:   39402c05        ldrb    w5, [x0, #11]
  68:   0b030021        add     w1, w1, w3
  6c:   0b060040        add     w0, w2, w6
  70:   0b050082        add     w2, w4, w5
  74:   0b020000        add     w0, w0, w2
  78:   0b000020        add     w0, w1, w0
  7c:   d65f03c0        ret

Anyway, it looks like the heuristic rule to enable SLP optimization needs to be improved.
Comment 1 ktkachov 2018-12-14 09:25:56 UTC
Confirmed. Don't know if the vectoriser can do anything better here, but if not, the cost models should be disabling it.
Comment 2 Richard Biener 2018-12-14 10:57:54 UTC
IIRC we have a duplicate for this.  The issue is the SLP vectorizer doesn't
handle reductions (not implemented) and thus the vector results need
to be decomposed for the scalar reduction tail.  On x86 we get with -mavx2

        vmovdqu (%rdi), %xmm0
        vpshufb .LC0(%rip), %xmm0, %xmm0
        vpmovzxbw       %xmm0, %xmm1
        vpsrldq $8, %xmm0, %xmm0
        vpmovzxwd       %xmm1, %xmm2
        vpsrldq $8, %xmm1, %xmm1
        vpmovzxbw       %xmm0, %xmm0
        vpmovzxwd       %xmm1, %xmm1
        vmovaps %xmm2, -72(%rsp)
        movl    -68(%rsp), %eax
        vmovaps %xmm1, -56(%rsp)
        vpmovzxwd       %xmm0, %xmm1
        vpsrldq $8, %xmm0, %xmm0
        addl    -52(%rsp), %eax
        vpmovzxwd       %xmm0, %xmm0
        vmovaps %xmm1, -40(%rsp)
        movl    -56(%rsp), %edx
        addl    -36(%rsp), %eax
        vmovaps %xmm0, -24(%rsp)
        addl    -72(%rsp), %edx
        addl    -20(%rsp), %eax
        addl    -40(%rsp), %edx
        addl    -24(%rsp), %edx
        addl    %edx, %eax
        movl    -48(%rsp), %edx
        addl    -64(%rsp), %edx
        addl    -32(%rsp), %edx
        addl    -16(%rsp), %edx
        addl    %edx, %eax
        movl    -44(%rsp), %edx
        addl    -60(%rsp), %edx
        addl    -28(%rsp), %edx
        addl    -12(%rsp), %edx
        addl    %edx, %eax
        ret

the main issue of course that we fail to elide the stack temporary.
Re-running FRE after loop opts might help here but of course
SLP vectorization handling the reduction would be best (though the
tail loop is structured badly, not matching up with the head one).

Whether vectorizing this specific testcase's head loop is profitable
or not is questionable on its own of course (but you can easily make
it so and still get similarly ugly code in the tail).
Comment 3 Tamar Christina 2019-04-09 18:13:16 UTC
I'll be taking a look at this one as a part of GCC 10 as well.
Comment 4 Hao Liu 2019-07-12 11:02:17 UTC
It seems Richard Biener's patch (r272843) can remove the redundant load/store. The r272843 commit message is as follows: 
>     2019-07-01  Richard Biener  <rguenther@suse.de>
>    
>            * tree-ssa-sccvn.c (class pass_fre): Add may_iterate
>            pass parameter.
>            (pass_fre::execute): Honor it.
>            * passes.def: Adjust pass_fre invocations to allow iterating,
>            add non-iterating pass_fre before late threading/dom.
>    
>            * gcc.dg/tree-ssa/pr77445-2.c: Adjust.

Tested with Jiangning's case with "gcc -O3", the following code is generated:

  test_slp:
  .LFB0:
	.cfi_startproc
	adrp	x1, .LC0
	ldr	q0, [x0]
	ldr	q1, [x1, #:lo12:.LC0]
	tbl	v0.16b, {v0.16b}, v1.16b
	uxtl	v1.8h, v0.8b
	uxtl2	v0.8h, v0.16b
	uxtl	v4.4s, v1.4h
	uxtl	v2.4s, v0.4h
	uxtl2	v0.4s, v0.8h
	uxtl2	v1.4s, v1.8h
	dup	s21, v4.s[0]
	dup	s22, v2.s[1]
	dup	s3, v0.s[1]
	dup	s6, v1.s[0]
	dup	s23, v4.s[1]
	dup	s16, v2.s[0]
	add	v3.2s, v3.2s, v22.2s
	dup	s20, v0.s[0]
	dup	s17, v1.s[1]
	dup	s5, v0.s[2]
	fmov	w0, s3
	add	v3.2s, v6.2s, v21.2s
	dup	s19, v2.s[2]
	add	v17.2s, v17.2s, v23.2s
	dup	s7, v4.s[2]
	fmov	w1, s3
	add	v3.2s, v16.2s, v20.2s
	dup	s18, v1.s[2]
	fmov	w3, s17
	dup	s2, v2.s[3]
	fmov	w2, s3
	add	v3.2s, v5.2s, v19.2s
	dup	s0, v0.s[3]
	dup	s4, v4.s[3]
	add	w0, w0, w3
	dup	s1, v1.s[3]
	fmov	w3, s3
	add	v3.2s, v7.2s, v18.2s
	add	v0.2s, v2.2s, v0.2s
	add	w1, w1, w2
	add	w0, w0, w1
	fmov	w2, s3
	add	w3, w3, w2
	fmov	w2, s0
	add	v0.2s, v1.2s, v4.2s
	add	w0, w0, w3
	fmov	w1, s0
	add	w1, w2, w1
	add	w0, w0, w1
	ret

Although SLP still generates SIMD code, it looks much better than the previous code with memory load/store. Performance is expected to be better as there is no redundant load/store.
Comment 5 Richard Biener 2019-07-12 11:13:16 UTC
Yeah.  Again, on x86 with -mavx2 we now have right after late FRE:

  <bb 2> [local count: 214748371]:
  vect__1.6_64 = MEM <vector(16) unsigned char> [(unsigned char *)b_24(D)];
  vect__1.7_63 = VEC_PERM_EXPR <vect__1.6_64, vect__1.6_64, { 0, 2, 1, 3, 4, 6, 5, 7, 8, 10, 9, 11, 12, 14, 13, 15 }>;
  vect__2.9_62 = [vec_unpack_lo_expr] vect__1.7_63;
  vect__2.9_59 = [vec_unpack_hi_expr] vect__1.7_63;
  vect__2.8_57 = [vec_unpack_lo_expr] vect__2.9_62;
  vect__2.8_56 = [vec_unpack_hi_expr] vect__2.9_62;
  vect__2.8_55 = [vec_unpack_lo_expr] vect__2.9_59;
  vect__2.8_54 = [vec_unpack_hi_expr] vect__2.9_59;
  MEM <vector(4) unsigned int> [(unsigned int *)&tmp] = vect__2.8_57;
  MEM <vector(4) unsigned int> [(unsigned int *)&tmp + 16B] = vect__2.8_56;
  MEM <vector(4) unsigned int> [(unsigned int *)&tmp + 32B] = vect__2.8_55;
  MEM <vector(4) unsigned int> [(unsigned int *)&tmp + 48B] = vect__2.8_54;
  vectp_b.4_65 = b_24(D) + 16;
  _8 = BIT_FIELD_REF <vect__2.8_57, 32, 0>;
  _22 = BIT_FIELD_REF <vect__2.8_56, 32, 0>;
  _30 = _8 + _22;
  _14 = BIT_FIELD_REF <vect__2.8_55, 32, 0>;
  _81 = BIT_FIELD_REF <vect__2.8_54, 32, 0>;
  _43 = _14 + _30;
  _45 = _43 + _81;
  sum_34 = (int) _45;
  _58 = BIT_FIELD_REF <vect__2.8_57, 32, 32>;
  _38 = BIT_FIELD_REF <vect__2.8_56, 32, 32>;
  _72 = _38 + _58;
  _68 = BIT_FIELD_REF <vect__2.8_55, 32, 32>;
  _53 = BIT_FIELD_REF <vect__2.8_54, 32, 32>;
  _29 = _68 + _72;
  _31 = _29 + _53;
  _7 = _31 + _45;
  sum_61 = (int) _7;
  _47 = BIT_FIELD_REF <vect__2.8_57, 32, 64>;
  _88 = BIT_FIELD_REF <vect__2.8_56, 32, 64>;
  _44 = _47 + _88;
  _74 = _44 + _90;
  _73 = _74 + _92;
  _83 = _7 + _73;
  sum_84 = (int) _83;
  _94 = BIT_FIELD_REF <vect__2.8_57, 32, 96>;
  _96 = BIT_FIELD_REF <vect__2.8_56, 32, 96>;
  _71 = _94 + _96;
  _98 = BIT_FIELD_REF <vect__2.8_55, 32, 96>;
  _100 = BIT_FIELD_REF <vect__2.8_54, 32, 96>;
  _70 = _71 + _98;
  _46 = _70 + _100;
  _18 = _46 + _83;
  sum_27 = (int) _18;
  tmp ={v} {CLOBBER};
  return sum_27;

I know the "real" code this testcase is from has actual operations
in place of the b[N] reads, for the above vectorization looks somewhat
pointless given we end up decomposing the result again.

So the appropriate fix would of course be to vectorize the reduction
loop (but that hits the sign-changing reduction issue).
Comment 6 ptomsich 2021-04-14 16:36:11 UTC
With the current master, the test case generates (with -mcpu=neoverse-n1):

	.arch armv8.2-a+crc+fp16+rcpc+dotprod+profile
	.file	"pr88492.c"
	.text
	.align	2
	.p2align 5,,15
	.global	test_slp
	.type	test_slp, %function
test_slp:
.LFB0:
	.cfi_startproc
	ldr	q2, [x0]
	adrp	x1, .LC0
	ldr	q16, [x1, #:lo12:.LC0]
	uxtl	v4.8h, v2.8b
	uxtl2	v2.8h, v2.16b
	uxtl	v0.4s, v4.4h
	uxtl	v6.4s, v2.4h
	uxtl2	v4.4s, v4.8h
	uxtl2	v2.4s, v2.8h
	mov	v1.16b, v0.16b
	mov	v7.16b, v6.16b
	mov	v5.16b, v4.16b
	mov	v3.16b, v2.16b
	tbl	v0.16b, {v0.16b - v1.16b}, v16.16b
	tbl	v6.16b, {v6.16b - v7.16b}, v16.16b
	tbl	v4.16b, {v4.16b - v5.16b}, v16.16b
	tbl	v2.16b, {v2.16b - v3.16b}, v16.16b
	add	v0.4s, v0.4s, v4.4s
	add	v6.4s, v6.4s, v2.4s
	add	v0.4s, v0.4s, v6.4s
	addv	s0, v0.4s
	fmov	w0, s0
	ret
	.cfi_endproc
.LFE0:
	.size	test_slp, .-test_slp

which contrasts with LLVM13 (with -mcpu=neoverse-n1):

test_slp:                               // @test_slp
	.cfi_startproc
// %bb.0:                               // %entry
	ldr	q0, [x0]
	movi	v1.16b, #1
	movi	v2.2d, #0000000000000000
	udot	v2.4s, v0.16b, v1.16b
	addv	s0, v2.4s
	fmov	w0, s0
	ret
.Lfunc_end0:
	.size	test_slp, .Lfunc_end0-test_slp

or (LLVM13 w/o the mcpu-option):

	.type	test_slp,@function
test_slp:                               // @test_slp
	.cfi_startproc
// %bb.0:                               // %entry
	ldr	q0, [x0]
	ushll2	v1.8h, v0.16b, #0
	ushll	v0.8h, v0.8b, #0
	uaddl2	v2.4s, v0.8h, v1.8h
	uaddl	v0.4s, v0.4h, v1.4h
	add	v0.4s, v0.4s, v2.4s
	addv	s0, v0.4s
	fmov	w0, s0
	ret
.Lfunc_end0:
	.size	test_slp, .Lfunc_end0-test_slp
Comment 7 Tamar Christina 2021-04-14 17:12:04 UTC
(In reply to ptomsich from comment #6)
> With the current master, the test case generates (with -mcpu=neoverse-n1):

> which contrasts with LLVM13 (with -mcpu=neoverse-n1):
> 
> test_slp:                               // @test_slp
> 	.cfi_startproc
> // %bb.0:                               // %entry
> 	ldr	q0, [x0]
> 	movi	v1.16b, #1
> 	movi	v2.2d, #0000000000000000
> 	udot	v2.4s, v0.16b, v1.16b
> 	addv	s0, v2.4s
> 	fmov	w0, s0
> 	ret
> .Lfunc_end0:
> 	.size	test_slp, .Lfunc_end0-test_slp
> 
> or (LLVM13 w/o the mcpu-option):
> 
> 	.type	test_slp,@function
> test_slp:                               // @test_slp
> 	.cfi_startproc
> // %bb.0:                               // %entry
> 	ldr	q0, [x0]
> 	ushll2	v1.8h, v0.16b, #0
> 	ushll	v0.8h, v0.8b, #0
> 	uaddl2	v2.4s, v0.8h, v1.8h
> 	uaddl	v0.4s, v0.4h, v1.4h
> 	add	v0.4s, v0.4s, v2.4s
> 	addv	s0, v0.4s
> 	fmov	w0, s0
> 	ret
> .Lfunc_end0:
> 	.size	test_slp, .Lfunc_end0-test_slp

It's definitely a neat trick, but correct me if I'm wrong: it's only possible because addition is commutative.

Clang has just simply reordered the loads because the loop is very simple to just

    for( int i = 0; i < 4; i++, b += 4 )
    {
            tmp[i][0] = b[0];
            tmp[i][1] = b[1];
            tmp[i][2] = b[2];
            tmp[i][3] = b[3];
    }

Which GCC also handles fine. 

As Richi mentioned before

>I know the "real" code this testcase is from has actual operations
> in place of the b[N] reads, for the above vectorization looks somewhat
> pointless given we end up decomposing the result again.

It seems a bit too narrow a focus to optimize for this particular example, as the real code does "other" things.

i.e.

Both GCC and Clang fall apart with

int test_slp( unsigned char *b )
{
    unsigned int tmp[4][4];
    int sum = 0;
    for( int i = 0; i < 4; i++, b += 4 )
    {
            tmp[i][0] = b[0] - b[4];
            tmp[i][2] = b[1] + b[5];
            tmp[i][1] = b[2] - b[6];
            tmp[i][3] = b[3] + b[7];
    }
    for( int i = 0; i < 4; i++ )
    {
            sum += tmp[0][i] + tmp[1][i] + tmp[2][i] + tmp[3][i];
    }
    return sum;
}

which has about the same access pattern as the real code.

If you change the operations you'll notice that for others examples like

int test_slp( unsigned char *b )
{
    unsigned int tmp[4][4];
    int sum = 0;
    for( int i = 0; i < 4; i++, b += 4 )
    {
            tmp[i][0] = b[0] - b[4];
            tmp[i][2] = b[1] - b[5];
            tmp[i][1] = b[2] - b[6];
            tmp[i][3] = b[3] - b[7];
    }
    for( int i = 0; i < 4; i++ )
    {
            sum += tmp[0][i] + tmp[1][i] + tmp[2][i] + tmp[3][i];
    }
    return sum;
}

GCC handles this better (but we are let down by register allocation).

To me it seems quite unlikely that actual code would be written like that, but I guess there could be a case to be made to try to reassoc loads as well.