This testcase (from TSVC) is about 4 times faster on zen3 when built with clang. typedef float real_t; #define iterations 100000 #define LEN_1D 32000 #define LEN_2D 256 // array definitions real_t flat_2d_array[LEN_2D*LEN_2D]; real_t x[LEN_1D]; real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D], bb[LEN_2D][LEN_2D],cc[LEN_2D][LEN_2D],tt[LEN_2D][LEN_2D]; int indx[LEN_1D]; real_t* __restrict__ xx; real_t* yy; real_t s243(void) { // node splitting // false dependence cycle breaking for (int nl = 0; nl < iterations; nl++) { for (int i = 0; i < LEN_1D-1; i++) { a[i] = b[i] + c[i ] * d[i]; b[i] = a[i] + d[i ] * e[i]; a[i] = b[i] + a[i+1] * d[i]; } } } The inner loop from clang is: .LBB0_2: # Parent Loop BB0_1 Depth=1 # => This Inner Loop Header: Depth=2 vmovups c(%rcx), %ymm12 vmovups c+32(%rcx), %ymm14 vmovups d(%rcx), %ymm0 vmovups d+32(%rcx), %ymm7 vfmadd213ps b(%rcx), %ymm0, %ymm12 # ymm12 = (ymm0 * ymm12) + mem vfmadd213ps b+32(%rcx), %ymm7, %ymm14 # ymm14 = (ymm7 * ymm14) + mem vfmadd231ps e(%rcx), %ymm0, %ymm12 # ymm12 = (ymm0 * mem) + ymm12 vfmadd231ps e+32(%rcx), %ymm7, %ymm14 # ymm14 = (ymm7 * mem) + ymm14 vmovups %ymm12, b(%rcx) vmovups %ymm14, b+32(%rcx) vfmadd231ps a+4(%rcx), %ymm0, %ymm12 # ymm12 = (ymm0 * mem) + ymm12 vfmadd231ps a+36(%rcx), %ymm7, %ymm14 # ymm14 = (ymm7 * mem) + ymm14 vmovups %ymm12, a(%rcx) vmovups %ymm14, a+32(%rcx) addq $64, %rcx cmpq $127936, %rcx # imm = 0x1F3C0 jne .LBB0_2
Here we get: s243.c:27:18: missed: not vectorized, possible dependence between data-refs a[i_29] and a[_9] s243.c:26:27: missed: bad data dependence. s243.c:26:27: note: ***** Analysis failed with vector mode V8QI <bb 6> [local count: 1052266997]: <bb 3> [local count: 1063004410]: # i_29 = PHI <_9(6), 0(4)> # ivtmp_43 = PHI <ivtmp_42(6), 31999(4)> _1 = b[i_29]; _2 = c[i_29]; _3 = d[i_29]; _4 = _2 * _3; _5 = _1 + _4; a[i_29] = _5; _6 = e[i_29]; _7 = _3 * _6; _8 = _5 + _7; b[i_29] = _8; _9 = i_29 + 1; _10 = a[_9]; _11 = _3 * _10; _12 = _8 + _11; a[i_29] = _12; ivtmp_42 = ivtmp_43 - 1; if (ivtmp_42 != 0) goto <bb 6>; [98.99%] else goto <bb 5>; [1.01%]
Hmm, wonder why DSE didn't remove the first a[i] store. Ah, because DSE doesn't use data-ref analysis and thus cannot disambiguate the variable offset. Manually applying DSE produces .L4: vmovaps c(%rax), %ymm1 vaddps e(%rax), %ymm1, %ymm0 addq $32, %rax vmovups a-28(%rax), %ymm1 vmulps d-32(%rax), %ymm1, %ymm1 vmulps d-32(%rax), %ymm0, %ymm0 vaddps b-32(%rax), %ymm0, %ymm0 vmovaps %ymm0, b-32(%rax) vaddps %ymm0, %ymm1, %ymm0 vmovaps %ymm0, a-32(%rax) cmpq $127968, %rax jne .L4 manually DSEd loop: for (int nl = 0; nl < iterations; nl++) { for (int i = 0; i < LEN_1D-1; i++) { real_t tem = b[i] + c[i ] * d[i]; b[i] = tem + d[i ] * e[i]; a[i] = b[i] + a[i+1] * d[i]; } }
The master branch has been updated by Richard Biener <rguenth@gcc.gnu.org>: https://gcc.gnu.org/g:4bdf739f835520ccbc433dc9eac461895741f317 commit r13-2769-g4bdf739f835520ccbc433dc9eac461895741f317 Author: Richard Biener <rguenther@suse.de> Date: Thu Sep 22 09:40:40 2022 +0200 tree-optimization/99407 - DSE with data-ref analysis The following resolves the issue that DSE cannot handle references with variable offsets well when identifying possible uses of a store. Instead of just relying on ref_maybe_used_by_stmt_p we use data-ref analysis, making sure to perform that at most once per stmt. The new mode is only exercised by the DSE pass before loop optimization as specified by a new pass parameter and when expensive optimizations are enabled, so it's disabled below -O2. PR tree-optimization/99407 * tree-ssa-dse.cc (dse_stmt_to_dr_map): New global. (dse_classify_store): Use data-ref analysis to disambiguate more uses. (pass_dse::use_dr_analysis_p): New pass parameter. (pass_dse::set_pass_param): Implement. (pass_dse::execute): Allocate and deallocate dse_stmt_to_dr_map. * passes.def: Allow DR analysis for the DSE pass before loop. * gcc.dg/vect/tsvc/vect-tsvc-s243.c: Remove XFAIL.
Fixed for GCC 13.
*** Bug 106989 has been marked as a duplicate of this bug. ***