[Bug tree-optimization/88492] New: SLP optimization generates ugly code

Fri Dec 14 05:17:00 GMT 2018

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88492

            Bug ID: 88492
           Summary: SLP optimization generates ugly code
           Product: gcc
           Version: 9.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: jiangning.liu at amperecomputing dot com
  Target Milestone: ---

On aarch64, SLP optimization generates ugly code for the test case below:

/*
 * Reproducer: copies 16 consecutive bytes starting at b into a 4x4 array of
 * unsigned int (one row per group of 4 bytes), then returns the sum of all
 * 16 array elements.
 *
 * b must point to at least 16 readable bytes (4 iterations, 4 bytes each).
 */
int test_slp( unsigned char *b )
{
        unsigned int tmp[4][4];
        int sum = 0;
        /* Fill one row per iteration; b advances by 4 bytes each time. */
        for( int i = 0; i < 4; i++, b += 4 )
        {
                tmp[i][0] = b[0];
                /* Note the swapped middle columns: b[1] goes to column 2 and
                 * b[2] goes to column 1, so a row is not a straight copy of
                 * the input bytes.  NOTE(review): presumably intentional for
                 * this test case -- confirm with the reporter. */
                tmp[i][2] = b[1];
                tmp[i][1] = b[2];
                tmp[i][3] = b[3];
        }
        /* Accumulate column i across all four rows; over the four iterations
         * every element of tmp is added exactly once. */
        for( int i = 0; i < 4; i++ )
        {
                sum += tmp[0][i] + tmp[1][i] + tmp[2][i] + tmp[3][i];
        }
        return sum;
}

With command line "gcc -O3", the following code is generated:

0000000000000000 <test_slp>:
   0:   90000001        adrp    x1, 0 <test_slp>
   4:   d10103ff        sub     sp, sp, #0x40
   8:   3dc00001        ldr     q1, [x0]
   c:   3dc00020        ldr     q0, [x1]
  10:   4e000021        tbl     v1.16b, {v1.16b}, v0.16b
  14:   2f08a422        uxtl    v2.8h, v1.8b
  18:   6f08a421        uxtl2   v1.8h, v1.16b
  1c:   2f10a443        uxtl    v3.4s, v2.4h
  20:   6f10a442        uxtl2   v2.4s, v2.8h
  24:   2f10a420        uxtl    v0.4s, v1.4h
  28:   6f10a421        uxtl2   v1.4s, v1.8h
  2c:   9e660060        fmov    x0, d3
  30:   ad000be3        stp     q3, q2, [sp]
  34:   b9401be8        ldr     w8, [sp, #24]
  38:   ad0107e0        stp     q0, q1, [sp, #32]
  3c:   9e660022        fmov    x2, d1
  40:   d360fc01        lsr     x1, x0, #32
  44:   9e660040        fmov    x0, d2
  48:   294117e6        ldp     w6, w5, [sp, #8]
  4c:   d360fc43        lsr     x3, x2, #32
  50:   b9402be2        ldr     w2, [sp, #40]
  54:   d360fc07        lsr     x7, x0, #32
  58:   9e660000        fmov    x0, d0
  5c:   0ea18400        add     v0.2s, v0.2s, v1.2s
  60:   0b0100e7        add     w7, w7, w1
  64:   0b0800c6        add     w6, w6, w8
  68:   b9401fe8        ldr     w8, [sp, #28]
  6c:   d360fc00        lsr     x0, x0, #32
  70:   1e260001        fmov    w1, s0
  74:   0ea28460        add     v0.2s, v3.2s, v2.2s
  78:   0b000063        add     w3, w3, w0
  7c:   0b070063        add     w3, w3, w7
  80:   29471fe0        ldp     w0, w7, [sp, #56]
  84:   1e260004        fmov    w4, s0
  88:   0b000042        add     w2, w2, w0
  8c:   b9402fe0        ldr     w0, [sp, #44]
  90:   0b060042        add     w2, w2, w6
  94:   0b040021        add     w1, w1, w4
  98:   0b070000        add     w0, w0, w7
  9c:   0b030021        add     w1, w1, w3
  a0:   0b0800a3        add     w3, w5, w8
  a4:   0b020021        add     w1, w1, w2
  a8:   0b030000        add     w0, w0, w3
  ac:   0b000020        add     w0, w1, w0
  b0:   910103ff        add     sp, sp, #0x40
  b4:   d65f03c0        ret

The code is vectorized, but some ugly instructions are generated as well,
e.g. stores to memory and register copies from SIMD registers to
general-purpose registers.

With command line "gcc -O3 -fno-tree-slp-vectorize", the following code is
generated instead, and it looks pretty clean. Usually, a code sequence like
this is friendly to the hardware prefetcher.

0000000000000000 <test_slp>:
   0:   39402004        ldrb    w4, [x0, #8]
   4:   39401002        ldrb    w2, [x0, #4]
   8:   39403001        ldrb    w1, [x0, #12]
   c:   39400003        ldrb    w3, [x0]
  10:   39402806        ldrb    w6, [x0, #10]
  14:   0b040021        add     w1, w1, w4
  18:   39401805        ldrb    w5, [x0, #6]
  1c:   0b020063        add     w3, w3, w2
  20:   39403804        ldrb    w4, [x0, #14]
  24:   0b030021        add     w1, w1, w3
  28:   39400802        ldrb    w2, [x0, #2]
  2c:   39400403        ldrb    w3, [x0, #1]
  30:   0b060084        add     w4, w4, w6
  34:   39402407        ldrb    w7, [x0, #9]
  38:   0b050042        add     w2, w2, w5
  3c:   39401406        ldrb    w6, [x0, #5]
  40:   0b020084        add     w4, w4, w2
  44:   39403405        ldrb    w5, [x0, #13]
  48:   0b040021        add     w1, w1, w4
  4c:   0b060063        add     w3, w3, w6
  50:   39400c02        ldrb    w2, [x0, #3]
  54:   0b0700a5        add     w5, w5, w7
  58:   39403c04        ldrb    w4, [x0, #15]
  5c:   0b050063        add     w3, w3, w5
  60:   39401c06        ldrb    w6, [x0, #7]
  64:   39402c05        ldrb    w5, [x0, #11]
  68:   0b030021        add     w1, w1, w3
  6c:   0b060040        add     w0, w2, w6
  70:   0b050082        add     w2, w4, w5
  74:   0b020000        add     w0, w0, w2
  78:   0b000020        add     w0, w1, w0
  7c:   d65f03c0        ret

Anyway, it looks like the heuristic that decides whether to enable SLP
optimization needs to be improved.