https://gcc.godbolt.org/z/x7GGzezGh

#include <stdio.h>

#define LEN 32000
#define ntimes 200000
#define TYPE float
#define lll LEN
#define LEN2 256
#define ALIGNMENT 16

__attribute__((aligned(ALIGNMENT))) TYPE X[lll], Y[lll], Z[lll], U[lll], V[lll];

struct GlobalData {
  __attribute__((aligned(ALIGNMENT))) TYPE a[LEN];
  int pad1[3];
  __attribute__((aligned(ALIGNMENT))) TYPE b[LEN];
  int pad2[5];
  __attribute__((aligned(ALIGNMENT))) TYPE c[LEN];
  int pad3[7];
  __attribute__((aligned(ALIGNMENT))) TYPE d[LEN];
  int pad4[11];
  __attribute__((aligned(ALIGNMENT))) TYPE e[LEN];
  int pad5[13];
  __attribute__((aligned(ALIGNMENT))) TYPE aa[LEN2][LEN2];
  int pad6[17];
  __attribute__((aligned(ALIGNMENT))) TYPE bb[LEN2][LEN2];
  int pad7[19];
  __attribute__((aligned(ALIGNMENT))) TYPE cc[LEN2][LEN2];
  int pad8[23];
  __attribute__((aligned(ALIGNMENT))) TYPE tt[LEN2][LEN2];
} global_data;

__attribute__((aligned(ALIGNMENT))) TYPE * const a = global_data.a;
__attribute__((aligned(ALIGNMENT))) TYPE * const b = global_data.b;
__attribute__((aligned(ALIGNMENT))) TYPE * const c = global_data.c;
__attribute__((aligned(ALIGNMENT))) TYPE * const d = global_data.d;
__attribute__((aligned(ALIGNMENT))) TYPE * const e = global_data.e;
__attribute__((aligned(ALIGNMENT))) TYPE (* const aa)[LEN2] = global_data.aa;
__attribute__((aligned(ALIGNMENT))) TYPE (* const bb)[LEN2] = global_data.bb;
__attribute__((aligned(ALIGNMENT))) TYPE (* const cc)[LEN2] = global_data.cc;
__attribute__((aligned(ALIGNMENT))) TYPE (* const tt)[LEN2] = global_data.tt;

int foo() {
  // linear dependence testing
  // no dependence - vectorizable
  for (int nl = 0; nl < 2*ntimes; nl++) {
    // #pragma vector always
    for (int i = 1; i < LEN; i += 2) {
      a[i] = a[i - 1] + b[i];
    }
  }
  return 0;
}

Both RVV and ARM SVE fail to vectorize it, whereas Clang can vectorize it.
I suspect it is an SRA issue again?
Reduced case:

#include <stdio.h>

#define LEN 32000
#define ntimes 200000
#define TYPE int
#define lll LEN
#define LEN2 256
#define ALIGNMENT 16

__attribute__((aligned(ALIGNMENT))) TYPE X[lll], Y[lll], Z[lll], U[lll], V[lll];

struct GlobalData {
  __attribute__((aligned(ALIGNMENT))) TYPE a[LEN];
  int pad1[3];
  __attribute__((aligned(ALIGNMENT))) TYPE b[LEN];
  int pad2[5];
  __attribute__((aligned(ALIGNMENT))) TYPE c[LEN];
  int pad3[7];
  __attribute__((aligned(ALIGNMENT))) TYPE d[LEN];
  int pad4[11];
  __attribute__((aligned(ALIGNMENT))) TYPE e[LEN];
  int pad5[13];
  __attribute__((aligned(ALIGNMENT))) TYPE aa[LEN2][LEN2];
  int pad6[17];
  __attribute__((aligned(ALIGNMENT))) TYPE bb[LEN2][LEN2];
  int pad7[19];
  __attribute__((aligned(ALIGNMENT))) TYPE cc[LEN2][LEN2];
  int pad8[23];
  __attribute__((aligned(ALIGNMENT))) TYPE tt[LEN2][LEN2];
} global_data;

__attribute__((aligned(ALIGNMENT))) TYPE * const a = global_data.a;
__attribute__((aligned(ALIGNMENT))) TYPE * const b = global_data.b;
__attribute__((aligned(ALIGNMENT))) TYPE * const c = global_data.c;
__attribute__((aligned(ALIGNMENT))) TYPE * const d = global_data.d;
__attribute__((aligned(ALIGNMENT))) TYPE * const e = global_data.e;
__attribute__((aligned(ALIGNMENT))) TYPE (* const aa)[LEN2] = global_data.aa;
__attribute__((aligned(ALIGNMENT))) TYPE (* const bb)[LEN2] = global_data.bb;
__attribute__((aligned(ALIGNMENT))) TYPE (* const cc)[LEN2] = global_data.cc;
__attribute__((aligned(ALIGNMENT))) TYPE (* const tt)[LEN2] = global_data.tt;

int s111() {
  for (int nl = 0; nl < 2*ntimes; nl++) {
    for (int i = 0; i < lll; i++) {
      X[i] = Y[i] + 1;
    }
  }
  return 0;
}

Also failed to vectorize.
Well, the "issue" is that we are performing loop interchange on this benchmark loop and the vectorizer doesn't like the zero step in the then-innermost loop. It's not a practical example; nobody would write such an outer loop in practice. There's a missed optimization in that we fail to elide the then-inner loop.

The solution is to insert a use of 'a' after the inner loop, like TSVC benchmarks usually have:

real_t s111(struct args_t * func_args)
{
//    linear dependence testing
//    no dependence - vectorizable

    initialise_arrays(__func__);

    for (int nl = 0; nl < 2*iterations; nl++) {
        for (int i = 1; i < LEN_1D; i += 2) {
            a[i] = a[i - 1] + b[i];
        }
        dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }

    return calc_checksum(__func__);
}

then it just works(TM). WONTFIX (in the vectorizer). In "theory" the interchanged loop could be vectorized by outer loop vectorization. But as said, IMHO it is a waste of time to cater to badly written benchmarks.
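To make the zero-step point concrete, here is a hand-written sketch of roughly what the reduced s111 nest looks like after loop interchange. This is an assumption about the transformed shape, not GCC's actual dump output: the nl loop becomes innermost and the addresses of X[i] and Y[i] never change inside it, which is the zero DR step the vectorizer rejects.

/* Hand-written illustration only (assumed shape of the interchanged nest,
   not taken from a GCC dump): the nl loop is now innermost and X[i]/Y[i]
   are invariant in it, i.e. their data references have zero step.  */
int s111_interchanged() {
  for (int i = 0; i < lll; i++) {
    for (int nl = 0; nl < 2*ntimes; nl++) {
      X[i] = Y[i] + 1;   /* same addresses on every nl iteration */
    }
  }
  return 0;
}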
I see. GCC does vectorize it with -fno-vect-cost-model -fno-loop-interchange: https://gcc.godbolt.org/z/8EEWcPro3. The codegen is the same as LLVM's. I'm going to revisit it in GCC 15 (GCC 14 stage 1 is closing soon). Thanks a lot!
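For anyone reproducing this, an invocation along these lines should show the effect of those two flags. The file name, cross-compiler name and -march string are assumptions here, not taken from the Compiler Explorer link:

  riscv64-unknown-linux-gnu-gcc -O3 -march=rv64gcv -fno-vect-cost-model -fno-loop-interchange -S s111.c

With -fno-loop-interchange the i loop stays innermost, so the usual inner-loop vectorization path applies.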
I'm not sure what the problem is with a zero DR step for an inner loop reference (possibly dependence analysis runs into some unhandled cases - who knows). The following vectorizes the inner loop (the load is hoisted as invariant, but the store is not sunk - there's no sinking phase after interchange).

diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index d5c9c4a11c2..7d1f0697fe7 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -2944,6 +2944,7 @@ vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
           DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
           if (!nested_in_vect_loop_p (loop, stmt_info))
             return DR_IS_READ (dr);
+#if 0
           /* Allow references with zero step for outer loops marked
              with pragma omp simd only - it guarantees absence of
              loop-carried dependencies between inner loop iterations.  */
@@ -2954,6 +2955,7 @@ vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
                                  "zero step in inner loop of nest\n");
               return false;
             }
+#endif
         }

       if (loop && nested_in_vect_loop_p (loop, stmt_info))

Note that when we don't vectorize we elide the inner loop later; when we vectorize we don't.

unvectorized:

s111:
.LFB0:
        .cfi_startproc
        xorl    %eax, %eax
.L2:
        movl    Y(%rax), %ecx
        addq    $4, %rax
        leal    1(%rcx), %edx
        movl    %edx, X-4(%rax)
        cmpq    $128000, %rax
        jne     .L2
        xorl    %eax, %eax
        ret

vectorized:

s111:
.LFB0:
        .cfi_startproc
        movdqa  .LC0(%rip), %xmm1
        xorl    %ecx, %ecx
.L2:
        movdqa  Y(%rcx), %xmm0
        leaq    X(%rcx), %rdx
        movl    $400000, %eax
        paddd   %xmm1, %xmm0
        .p2align 4,,10
        .p2align 3
.L3:
        movaps  %xmm0, (%rdx)
        subl    $2, %eax
        jne     .L3
        addq    $16, %rcx
        cmpq    $128000, %rcx
        jne     .L2
        xorl    %eax, %eax
        ret
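As an aside, one way to observe the rejection that the #if 0 above disables is to look at the vectorizer dump. The exact invocation below is an assumption (the file name is hypothetical and the dump file suffix depends on the pass number), but the flags themselves are standard GCC options:

  gcc -O3 -fdump-tree-vect-details -S s111.c
  grep "zero step in inner loop of nest" s111.c.*.vect

The grepped string is the dump_printf_loc message from the guarded-out block in the patch.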