Created attachment 60227 [details] pr43879-3.c: C test case from test suite gcc.dg/torture/pr43879-3.c is failing with -mlra: $ avr-gcc -mmcu=attiny40 -dumpbase "" -save-temps -dp -Os -o pr43879-3.elf -mlra pr43879-3.c Target: avr Configured with: --target=avr --disable-nls --with-dwarf2 --with-gnu-as --with-gnu-ld --with-long-double=64 --disable-libcc1 --disable-shared --enable-languages=c,c++ gcc version 15.0.1 20250121 (experimental) (GCC) The assembly reads: main: ... /* prologue: function */ /* frame size = 8 */ /* stack size = 8 */ .L__stack_usage = 8 # Y[5]:SI = 5 looks ok. ldi r20,lo8(5) ; 173 [c=4 l=1] movqi_insn/1 subi r28,-5 ; 120 [c=8 l=2] *addhi3/3 sbci r29,-1 st Y+,r20 ; 169 [c=4 l=1] movqi_insn/2 st Y+,__zero_reg__ ; 177 [c=4 l=1] movqi_insn/2 st Y+,__zero_reg__ ; 178 [c=4 l=1] movqi_insn/2 st Y+,__zero_reg__ ; 179 [c=4 l=1] movqi_insn/2 subi r28,9 ; 122 [c=8 l=2] *addhi3/3 sbc r29,__zero_reg__ .L9: # Y[1]:SI = 5 looks ok. ldi r20,lo8(5) ; 180 [c=4 l=1] movqi_insn/1 ld __tmp_reg__,Y+ ; 73 [c=8 l=1] *addhi3/3 st Y+,r20 ; 143 [c=4 l=1] movqi_insn/2 st Y+,__zero_reg__ ; 184 [c=4 l=1] movqi_insn/2 st Y+,__zero_reg__ ; 185 [c=4 l=1] movqi_insn/2 st Y+,__zero_reg__ ; 186 [c=4 l=1] movqi_insn/2 subi r28,5 ; 75 [c=8 l=2] *addhi3/3 sbc r29,__zero_reg__ .L10: # Reding 4 times R20:QI = Y[4] ??? subi r28,-4 ; 88 [c=8 l=2] *addhi3/3 sbci r29,-1 ld r20,Y ; 89 [c=4 l=1] movqi_insn/3 push r20 ; 11 [c=4 l=1] pushqi1/0 ld r20,Y ; 93 [c=4 l=1] movqi_insn/3 push r20 ; 13 [c=4 l=1] pushqi1/0 ld r20,Y ; 97 [c=4 l=1] movqi_insn/3 push r20 ; 15 [c=4 l=1] pushqi1/0 ld r20,Y ; 101 [c=4 l=1] movqi_insn/3 push r20 ; 17 [c=4 l=1] pushqi1/0 subi r28,-5 ; 132 [c=8 l=2] *addhi3/3 sbci r29,-1 # Reading from R22:SI = Y[9] but was never initialized. Note frame size is 8. ld r22,Y+ ; 155 [c=4 l=1] movqi_insn/3 ld r23,Y+ ; 156 [c=4 l=1] movqi_insn/3 ld r24,Y+ ; 157 [c=4 l=1] movqi_insn/3 ld r25,Y+ ; 158 [c=4 l=1] movqi_insn/3 rcall f1 ; 19 [c=0 l=1] call_insn/1 The code after label .L10 looks very wrong. R23 (and maybe other regs) are never properly initialized since they read garbage from the stack. FYI, the error persists with -mno-fuse-add and is a bit more complicated then, but the problem remains: .L10: subi r28,lo8(-(4)) ; 78 [c=4 l=5] movqi_insn/3 sbci r29,hi8(-(4)) ld r20,Y subi r28,lo8((4)) sbci r29,hi8((4)) push r20 ; 11 [c=4 l=1] pushqi1/0 subi r28,lo8(-(4)) ; 79 [c=4 l=5] movqi_insn/3 sbci r29,hi8(-(4)) ld r20,Y subi r28,lo8((4)) sbci r29,hi8((4)) push r20 ; 13 [c=4 l=1] pushqi1/0 subi r28,lo8(-(4)) ; 80 [c=4 l=5] movqi_insn/3 sbci r29,hi8(-(4)) ld r20,Y subi r28,lo8((4)) sbci r29,hi8((4)) push r20 ; 15 [c=4 l=1] pushqi1/0 subi r28,lo8(-(4)) ; 81 [c=4 l=5] movqi_insn/3 sbci r29,hi8(-(4)) ld r20,Y subi r28,lo8((4)) sbci r29,hi8((4)) push r20 ; 17 [c=4 l=1] pushqi1/0 subi r28,lo8(-(9)) ; 82 [c=16 l=8] *movsi/2 sbci r29,hi8(-(9)) ld r22,Y+ ld r23,Y+ ld r24,Y+ ld r25,Y subi r28,lo8((9+3)) sbci r29,hi8((9+3)) Running the testsuite from $builddir/gcc: $ make -k check-gcc RUNTESTFLAGS="--target_board=attiny40-sim dg-torture.exp=pr43879-3.c -tool_opts='-mlra'" ... FAIL: gcc.dg/torture/pr43879-3.c -O1 execution test FAIL: gcc.dg/torture/pr43879-3.c -O2 execution test FAIL: gcc.dg/torture/pr43879-3.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions execution test FAIL: gcc.dg/torture/pr43879-3.c -O3 -g execution test FAIL: gcc.dg/torture/pr43879-3.c -Os execution test FAIL: gcc.dg/torture/pr43879-3.c -O2 -flto -fno-use-linker-plugin -flto-partition=none execution test
Created attachment 60230 [details] reduced C99 test case Here is a reduced test case that fails with -mlra -mmcu=attiny40 for any optimization level > 0.
(In reply to Georg-Johann Lay from comment #1) > Created attachment 60230 [details] > reduced C99 test case In that test case: __attribute__((noipa)) void func2 (long a, long b) { static unsigned char count = 0; if (b != count++) __builtin_abort (); } int main (void) { for (long b = 0; b < 5; ++b) func2 (0, b); return 0; } With -mlra -Os -mmcu=attiny40 -dp, main prepares argument b as follows (it lives in Y[1]:SI): subi r28,-4 ; 48 [c=8 l=2] *addhi3/3 sbci r29,-1 ld r20,Y ; 49 [c=4 l=1] movqi_insn/3 push r20 ; 8 [c=4 l=1] pushqi1/0 ld r20,Y ; 53 [c=4 l=1] movqi_insn/3 push r20 ; 10 [c=4 l=1] pushqi1/0 ld r20,Y ; 57 [c=4 l=1] movqi_insn/3 push r20 ; 12 [c=4 l=1] pushqi1/0 ld r20,Y+ ; 96 [c=4 l=1] movqi_insn/3 push r20 ; 14 [c=4 l=1] pushqi1/0 which is complete garbage and looks like elimination going bananas. With -mno-lra, all is fine: subi r28,-4 ; 47 [c=8 l=2] *addhi3/3 sbci r29,-1 ld r20,Y ; 48 [c=4 l=1] movqi_insn/3 push r20 ; 8 [c=4 l=1] pushqi1/0 ld r21,-Y ; 91 [c=4 l=1] movqi_insn/3 push r21 ; 10 [c=4 l=1] pushqi1/0 ld r22,-Y ; 93 [c=4 l=1] movqi_insn/3 push r22 ; 12 [c=4 l=1] pushqi1/0 ld r23,-Y ; 95 [c=4 l=1] movqi_insn/3 push r23 ; 14 [c=4 l=1] pushqi1/0 (FYI, the FP in Y is adjusted later (after the call) due to -mfuse-add, which also tidies the byte loads to LD -Y.)
Created attachment 60238 [details] C99 test case that fails on ordinary AVRs (not avrtiny) This test case fails on ordinary AVRs like -mmcu=atmega128. It passes more arguments and clobbers some regs so that b lives in a stack slot: __attribute__((noipa)) void func2 (long long a1, long long a2, long b) { static unsigned char count = 0; if (b != count++) __builtin_abort (); } int main (void) { for (long b = 0; b < 5; ++b) { __asm ("" ::: "r5", "r9"); func2 (0, 0, b); } return 0; } $ avr-gcc -mmcu=atmega128 -dumpbase "" -save-temps -dp -Os -mlra ... The argument preparation for b reads: .L4: ldd r24,Y+4 ; 63 [c=4 l=1] movqi_insn/3 push r24 ; 9 [c=4 l=1] pushqi1/0 ldd r24,Y+4 ; 64 [c=4 l=1] movqi_insn/3 push r24 ; 11 [c=4 l=1] pushqi1/0 ldd r24,Y+4 ; 65 [c=4 l=1] movqi_insn/3 push r24 ; 13 [c=4 l=1] pushqi1/0 ldd r24,Y+4 ; 66 [c=4 l=1] movqi_insn/3 push r24 ; 15 [c=4 l=1] pushqi1/0 What's also strange is that the code has a frame size of 4 and consequently, b lives in Y[1]:SI. However, after the call of func2, the code accesses Y[5]:SI that was never initialized: /* prologue: function */ /* frame size = 4 */ /* stack size = 4 */ .L__stack_usage = 4 # long b = 0; std Y+1,__zero_reg__ ; 115 [c=4 l=1] movqi_insn/2 std Y+2,__zero_reg__ ; 116 [c=4 l=1] movqi_insn/2 std Y+3,__zero_reg__ ; 117 [c=4 l=1] movqi_insn/2 std Y+4,__zero_reg__ ; 118 [c=4 l=1] movqi_insn/2 .L4: # Crippled passing of b. Seems tlike LRA thinks that PUSH changes Y. ldd r24,Y+4 ; 63 [c=4 l=1] movqi_insn/3 push r24 ; 9 [c=4 l=1] pushqi1/0 ldd r24,Y+4 ; 64 [c=4 l=1] movqi_insn/3 push r24 ; 11 [c=4 l=1] pushqi1/0 ldd r24,Y+4 ; 65 [c=4 l=1] movqi_insn/3 push r24 ; 13 [c=4 l=1] pushqi1/0 ldd r24,Y+4 ; 66 [c=4 l=1] movqi_insn/3 push r24 ; 15 [c=4 l=1] pushqi1/0 # Prepare long long args a1 and a2. ... rcall func2 ; 32 [c=0 l=1] call_insn/1 # Following access is wrong. As it seems, regalloc # thinks that the PUSHes above changed the frame pointer? ldd r24,Y+5 ; 102 [c=4 l=1] movqi_insn/3 ldd r25,Y+6 ; 103 [c=4 l=1] movqi_insn/3 ldd r26,Y+7 ; 104 [c=4 l=1] movqi_insn/3 ldd r27,Y+8 ; 105 [c=4 l=1] movqi_insn/3 adiw r24,1 ; 84 [c=16 l=3] *addsi3/1 adc r26,__zero_reg__ adc r27,__zero_reg__ std Y+5,r24 ; 106 [c=4 l=1] movqi_insn/2 std Y+6,r25 ; 107 [c=4 l=1] movqi_insn/2 std Y+7,r26 ; 108 [c=4 l=1] movqi_insn/2 std Y+8,r27 ; 109 [c=4 l=1] movqi_insn/2 ; SP += 4 ; 35 [c=8 l=4] *addhi3_sp pop __tmp_reg__ pop __tmp_reg__ pop __tmp_reg__ pop __tmp_reg__ # After popping b, loading b is from the correct offset: ldd r24,Y+1 ; 110 [c=4 l=1] movqi_insn/3 ldd r25,Y+2 ; 111 [c=4 l=1] movqi_insn/3 ldd r26,Y+3 ; 112 [c=4 l=1] movqi_insn/3 ldd r27,Y+4 ; 113 [c=4 l=1] movqi_insn/3 sbiw r26,0 ; 87 [c=28 l=3] *cmpsi/2 sbci r25,hi8(5) sbci r24,lo8(5) brne .L4 ; 88 [c=4 l=1] branch ...
The master branch has been updated by Georg-Johann Lay <gjl@gcc.gnu.org>: https://gcc.gnu.org/g:6f4592ae95eed53dc3a370f98c04a8f25f007811 commit r15-7123-g6f4592ae95eed53dc3a370f98c04a8f25f007811 Author: Georg-Johann Lay <avr@gjlay.de> Date: Wed Jan 22 12:02:16 2025 +0100 AVR: Add test cases for PR118591. gcc/testsuite/ PR rtl-optimization/118591 * gcc.target/avr/torture/pr118591-1.c: New test. * gcc.target/avr/torture/pr118591-2.c: New test.
The patch is simple. diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc index 7dbc7fe1e00..6c86e4f8f6c 100644 --- a/gcc/lra-constraints.cc +++ b/gcc/lra-constraints.cc @@ -5883,8 +5883,8 @@ static bool inherit_reload_reg (bool def_p, int original_regno, enum reg_class cl, rtx_insn *insn, rtx next_usage_insns) { - if (optimize_function_for_size_p (cfun)) - return false; + // if (optimize_function_for_size_p (cfun)) + // return false; enum reg_class rclass = lra_get_allocno_class (original_regno); rtx original_reg = regno_reg_rtx[original_regno];
Proposed patch https://gcc.gnu.org/pipermail/gcc-patches/2025-April/681478.html
The test case works now after the patches for PR120424 are upstream.