Created attachment 55183 [details]
Open-source STREAM benchmark

The STREAM benchmark's performance deteriorates. A change to the loop peeling policy in the vect_enhance_data_refs_alignment function degrades the benchmark's performance, which can be demonstrated with the example from the attachment. Alternatively, you can obtain it from https://github.com/jeffhammond/stream/archive/master.zip.

Compiling & running:

gcc -fopenmp -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream
./stream

The change to the loop peeling policy of the vect_enhance_data_refs_alignment function is here:
https://gcc.gnu.org/git/?p=gcc.git&a=commit;h=49ab46214e9288ee1268f87ddcd64dacfd21c31d

With OpenMP enabled, the generated code for the Add kernel looks as follows.

Before the modification:

ldr d0, [x5, x1, lsl #3]
fadd d0, d0, d1
str d0, [x4, x1, lsl #3]
mov w4, w2
sub w7, w7, w2
add x4, x4, x1
ldr x1, [x10, #888]
lsl x4, x4, #3
lsr w8, w7, #1
add x6, x4, x6
add x5, x4, x5
mov w2, #0x0
add x4, x4, x1
mov x1, #0x0
ldr q0, [x5, x1]
add w2, w2, #0x1
ldr q1, [x6, x1]
cmp w2, w8
fadd v0.2d, v0.2d, v1.2d
str q0, [x4, x1]
add x1, x1, #0x10
b.cc 4012d8 <main._omp_fn.4+0xd8>
and w1, w7, #0xfffffffe
add w0, w0, w1
cmp w7, w1
b.eq 401348 <main._omp_fn.4+0x148>
ldr x5, [x9, #880]
sxtw x1, w0
ldr x4, [x11, #896]
add w0, w0, #0x1
ldr d1, [x5, x1, lsl #3]
cmp w3, w0
ldr x2, [x10, #888]
ldr d0, [x4, x1, lsl #3]
fadd d0, d0, d1
str d0, [x2, x1, lsl #3]
b.le 401348 <main._omp_fn.4+0x148>
sxtw x0, w0
ldr d0, [x5, x0, lsl #3]
ldr d1, [x4, x0, lsl #3]
fadd d0, d0, d1
str d0, [x2, x0, lsl #3]
ldr x19, [sp, #16]
ldp x29, x30, [sp], #32

After the modification:

mov x29, sp
str x19, [sp, #16]
bl 4006e0 <omp_get_num_threads@plt>
mov w19, w0
bl 4006b0 <omp_get_thread_num@plt>
mov w2, #0x8000
movk w2, #0x61a, lsl #16
sdiv w1, w2, w19
msub w2, w1, w19, w2
cmp w0, w2
b.ge 401238 <main._omp_fn.4+0x38>
add w1, w1, #0x1
mov w2, #0x0
madd w0, w1, w0, w2
add w1, w1, w0
cmp w0, w1
b.ge 4012d8 <main._omp_fn.4+0xd8>
sub w2, w1, w0
adrp x8, 401000 <main._omp_fn.3+0x100>
adrp x9, 401000 <main._omp_fn.3+0x100>
adrp x7, 401000 <main._omp_fn.3+0x100>
cmp w2, #0x1
b.eq 4012b8 <main._omp_fn.4+0xb8>
ldr x1, [x7, #760]
sbfiz x4, x0, #3, #32
ldr x6, [x8, #744]
lsr w10, w2, #1
ldr x5, [x9, #752]
mov w3, #0x0
add x6, x4, x6
add x5, x4, x5
add x4, x4, x1
mov x1, #0x0
ldr q0, [x6, x1]
add w3, w3, #0x1
ldr q1, [x5, x1]
cmp w3, w10
fadd v0.2d, v0.2d, v1.2d
str q0, [x4, x1]
add x1, x1, #0x10
b.cc 401288 <main._omp_fn.4+0x88>
and w1, w2, #0xfffffffe
add w0, w0, w1
cmp w2, w1
b.eq 4012d8 <main._omp_fn.4+0xd8>
ldr x3, [x9, #752]
sxtw x0, w0
ldr x2, [x8, #744]
ldr x1, [x7, #760]
ldr d0, [x3, x0, lsl #3]
ldr d1, [x2, x0, lsl #3]
fadd d0, d0, d1
str d0, [x1, x0, lsl #3]
ldr x19, [sp, #16]
ldp x29, x30, [sp], #32
ret

After the peeling policy change, vectorization of the for loop in the Add kernel no longer peels the loop, and performance degrades.
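For reference, the loop compiled above is the STREAM Add kernel. In stream.c it is essentially the following (my simplified sketch: the arrays really use STREAM_TYPE, which defaults to double, and STREAM_ARRAY_SIZE is normally supplied via -D as in the command line above):

#include <sys/types.h>   /* ssize_t */

#ifndef STREAM_ARRAY_SIZE
#define STREAM_ARRAY_SIZE 100000000
#endif

static double a[STREAM_ARRAY_SIZE], b[STREAM_ARRAY_SIZE], c[STREAM_ARRAY_SIZE];

/* Simplified Add kernel, as it appears inside main() in stream.c. */
void add_kernel(void)
{
#pragma omp parallel for
  for (ssize_t j = 0; j < STREAM_ARRAY_SIZE; j++)
    c[j] = a[j] + b[j];
}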
This is almost definitely an aarch64 cost model issue ...
(In reply to Andrew Pinski from comment #1)
> This is almost definitely an aarch64 cost model issue ...

Do you mean that the aarch64 vectorization cost model is what causes the no-peeling decision to be chosen after r247544? Then why does peeling the loop improve performance? As far as I understand, the following code is a completely standard loop that vectorizes well:

for (j=0; j<STREAM_ARRAY_SIZE; j++)
    c[j] = a[j]+b[j];
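To make the question concrete, here is a rough C sketch of what peeling for alignment means for this loop (my illustration of the transformation vect_enhance_data_refs_alignment decides about, not GCC's actual output): a scalar prologue runs until the store pointer is 16-byte aligned, then the 2-lane vector body works on aligned stores, then a scalar tail handles any leftover element.

#include <stddef.h>
#include <stdint.h>

/* Illustrative sketch only.  Assumes 128-bit vectors (2 doubles) and
   that the vectorizer peels on the store to c[], which is one common
   choice; the real decision is made by the cost model. */
void add_peeled(const double *a, const double *b, double *c, size_t n)
{
  size_t j = 0;

  /* Peeled scalar prologue: iterate until &c[j] is 16-byte aligned. */
  while (j < n && ((uintptr_t)&c[j] & 15) != 0)
    {
      c[j] = a[j] + b[j];
      j++;
    }

  /* Main vector body: 2 elements per iteration, stores to c aligned
     (this corresponds to the ldr q/fadd v0.2d/str q loop in the asm). */
  for (; j + 2 <= n; j += 2)
    {
      c[j]     = a[j]     + b[j];
      c[j + 1] = a[j + 1] + b[j + 1];
    }

  /* Scalar epilogue for a possible remaining element. */
  for (; j < n; j++)
    c[j] = a[j] + b[j];
}

The question is why dropping the aligning prologue (and so allowing unaligned vector accesses in the main body) ends up slower on this hardware.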