Created attachment 53118 [details] preprocessed file An SVE FDIV instruction is generated with a predicate register in which all bits are active. This FDIV unnecessarily sets the divide-by-zero flag (bit 1, DZC, of the FPSR register). Should this instruction be executed with the appropriate predicate bits? Beyond FDIV, other FPSR flags (IOC, OFC, UFC, IXC, etc.) could likewise be set when FADD/FSUB/FMUL operate on SIMD lanes that do not contain the intended values. /* float a[7]; float b[7]; for(int i=0; i<7; i++) { a[i] = some_initial_values; } for(int i=0; i<7; i++) { b[i] = COEF / a[i]; } */ (p0 = {0x11, 0x11, 0x11, 0x1, 0x0, 0x0, 0x0, 0x0} (p1 = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff} (z1.s = {3, 3.29999995, 3.5999999, 3.9000001, 4.19999981, 4.5, 4.80000019, 0, 0, 0, 0, 0, 0, 0, 0, 0} add x0, sp, 64 fdiv z0.s, p0/m, z0.s, z1.s # divide-by-zero exception raised st1w z0.s, p1, [x0] % gcc -v -save-temps -O3 -g -march=armv8.2-a+sve main.c Using built-in specs. COLLECT_GCC=gcc COLLECT_LTO_WRAPPER=/opt/causal/gcc-12.1.0_gcc-11.3.0/libexec/gcc/aarch64-unknown-linux-gnu/12.1.0/lto-wrapper Target: aarch64-unknown-linux-gnu Configured with: ./configure --prefix=/opt/causal/gcc-12.1.0_gcc-11.3.0 Thread model: posix Supported LTO compression algorithms: zlib gcc version 12.1.0 (GCC) COLLECT_GCC_OPTIONS='-v' '-save-temps' '-O3' '-g' '-march=armv8.2-a+sve' '-mlittle-endian' '-mabi=lp64' '-dumpdir' 'a-' /opt/causal/gcc-12.1.0_gcc-11.3.0/libexec/gcc/aarch64-unknown-linux-gnu/12.1.0/cc1 -E -quiet -v main.c -march=armv8.2-a+sve -mlittle-endian -mabi=lp64 -g -fworking-directory -O3 -fpch-preprocess -o a-main.i ignoring nonexistent directory "/opt/causal/gcc-12.1.0_gcc-11.3.0/lib/gcc/aarch64-unknown-linux-gnu/12.1.0/../../../../aarch64-unknown-linux-gnu/include" #include "..." 
search starts here: #include <...> search starts here: /opt/causal/gcc-12.1.0_gcc-11.3.0/lib/gcc/aarch64-unknown-linux-gnu/12.1.0/include /usr/local/include /opt/causal/gcc-12.1.0_gcc-11.3.0/include /opt/causal/gcc-12.1.0_gcc-11.3.0/lib/gcc/aarch64-unknown-linux-gnu/12.1.0/include-fixed /usr/include End of search list. COLLECT_GCC_OPTIONS='-v' '-save-temps' '-O3' '-g' '-march=armv8.2-a+sve' '-mlittle-endian' '-mabi=lp64' '-dumpdir' 'a-' /opt/causal/gcc-12.1.0_gcc-11.3.0/libexec/gcc/aarch64-unknown-linux-gnu/12.1.0/cc1 -fpreprocessed a-main.i -quiet -dumpdir a- -dumpbase main.c -dumpbase-ext .c -march=armv8.2-a+sve -mlittle-endian -mabi=lp64 -g -O3 -version -o a-main.s GNU C17 (GCC) version 12.1.0 (aarch64-unknown-linux-gnu) compiled by GNU C version 12.1.0, GMP version 6.1.2, MPFR version 3.1.6-p2, MPC version 1.0.2, isl version none GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072 GNU C17 (GCC) version 12.1.0 (aarch64-unknown-linux-gnu) compiled by GNU C version 12.1.0, GMP version 6.1.2, MPFR version 3.1.6-p2, MPC version 1.0.2, isl version none GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072 Compiler executable checksum: 44e79a65ea64887de47cdc9a11ff4739 COLLECT_GCC_OPTIONS='-v' '-save-temps' '-O3' '-g' '-march=armv8.2-a+sve' '-mlittle-endian' '-mabi=lp64' '-dumpdir' 'a-' as -v --gdwarf-5 -EL -march=armv8.2-a+sve -mabi=lp64 -o a-main.o a-main.s GNU assembler version 2.38 (aarch64-unknown-linux-gnu) using BFD version (GNU Binutils) 2.38 COMPILER_PATH=/opt/causal/gcc-12.1.0_gcc-11.3.0/libexec/gcc/aarch64-unknown-linux-gnu/12.1.0/:/opt/causal/gcc-12.1.0_gcc-11.3.0/libexec/gcc/aarch64-unknown-linux-gnu/12.1.0/:/opt/causal/gcc-12.1.0_gcc-11.3.0/libexec/gcc/aarch64-unknown-linux-gnu/:/opt/causal/gcc-12.1.0_gcc-11.3.0/lib/gcc/aarch64-unknown-linux-gnu/12.1.0/:/opt/causal/gcc-12.1.0_gcc-11.3.0/lib/gcc/aarch64-unknown-linux-gnu/ 
LIBRARY_PATH=/opt/causal/gcc-12.1.0_gcc-11.3.0/lib/gcc/aarch64-unknown-linux-gnu/12.1.0/:/opt/causal/gcc-12.1.0_gcc-11.3.0/lib/gcc/aarch64-unknown-linux-gnu/12.1.0/../../../../lib64/:/lib/../lib64/:/usr/lib/../lib64/:/opt/causal/gcc-12.1.0_gcc-11.3.0/lib/gcc/aarch64-unknown-linux-gnu/12.1.0/../../../:/lib/:/usr/lib/ COLLECT_GCC_OPTIONS='-v' '-save-temps' '-O3' '-g' '-march=armv8.2-a+sve' '-mlittle-endian' '-mabi=lp64' '-dumpdir' 'a.' /opt/causal/gcc-12.1.0_gcc-11.3.0/libexec/gcc/aarch64-unknown-linux-gnu/12.1.0/collect2 -plugin /opt/causal/gcc-12.1.0_gcc-11.3.0/libexec/gcc/aarch64-unknown-linux-gnu/12.1.0/liblto_plugin.so -plugin-opt=/opt/causal/gcc-12.1.0_gcc-11.3.0/libexec/gcc/aarch64-unknown-linux-gnu/12.1.0/lto-wrapper -plugin-opt=-fresolution=a.res -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lgcc_s --eh-frame-hdr -dynamic-linker /lib/ld-linux-aarch64.so.1 -X -EL -maarch64linux /lib/../lib64/crt1.o /lib/../lib64/crti.o /opt/causal/gcc-12.1.0_gcc-11.3.0/lib/gcc/aarch64-unknown-linux-gnu/12.1.0/crtbegin.o -L/opt/causal/gcc-12.1.0_gcc-11.3.0/lib/gcc/aarch64-unknown-linux-gnu/12.1.0 -L/opt/causal/gcc-12.1.0_gcc-11.3.0/lib/gcc/aarch64-unknown-linux-gnu/12.1.0/../../../../lib64 -L/lib/../lib64 -L/usr/lib/../lib64 -L/opt/causal/gcc-12.1.0_gcc-11.3.0/lib/gcc/aarch64-unknown-linux-gnu/12.1.0/../../.. a-main.o -lgcc --push-state --as-needed -lgcc_s --pop-state -lc -lgcc --push-state --as-needed -lgcc_s --pop-state /opt/causal/gcc-12.1.0_gcc-11.3.0/lib/gcc/aarch64-unknown-linux-gnu/12.1.0/crtend.o /lib/../lib64/crtn.o COLLECT_GCC_OPTIONS='-v' '-save-temps' '-O3' '-g' '-march=armv8.2-a+sve' '-mlittle-endian' '-mabi=lp64' '-dumpdir' 'a.'
Confirmed. The division should have been predicated on the same predicate as the load/store, but GCC currently does not do that. GCC also does not really support taking FPU status bits or exceptions into account while vectorizing.
(In reply to Andrew Pinski from comment #1) > Confirmed. The division should have been predicated on the same as the > load/store but currently GCC does not do that. > > GCC does not really support looking into fpu status bits or exceptions while > vectorizing either. It effectively "supports" it by failing to vectorize when exception-state builtins are used in the vectorized region; otherwise it just accumulates exception bits (but it doesn't support in-order traps if you enable exceptions to trap). Note there's a bit of confusion as to what exactly controls FP exception-bit correctness, and the documentation should probably be clarified.
Dup of bug 96373. *** This bug has been marked as a duplicate of bug 96373 ***