Bug 118591 - [lra][avr] Wrong code with -mlra in pr43879-3.c
Summary: [lra][avr] Wrong code with -mlra in pr43879-3.c
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: rtl-optimization (show other bugs)
Version: 15.0
: P3 normal
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: ra, wrong-code
Depends on:
Blocks: avr+ra 113934
  Show dependency treegraph
 
Reported: 2025-01-21 16:52 UTC by Georg-Johann Lay
Modified: 2025-06-27 13:02 UTC (History)
2 users (show)

See Also:
Host:
Target: avr
Build:
Known to work:
Known to fail:
Last reconfirmed: 2025-03-26 00:00:00


Attachments
pr43879-3.c: C test case from test suite (182 bytes, text/plain)
2025-01-21 16:52 UTC, Georg-Johann Lay
Details
reduced C99 test case (166 bytes, text/plain)
2025-01-21 19:32 UTC, Georg-Johann Lay
Details
C99 test case that fails on ordinary AVRs (not avrtiny) (197 bytes, text/plain)
2025-01-22 10:33 UTC, Georg-Johann Lay
Details

Note You need to log in before you can comment on or make changes to this bug.
Description Georg-Johann Lay 2025-01-21 16:52:58 UTC
Created attachment 60227 [details]
pr43879-3.c: C test case from test suite

gcc.dg/torture/pr43879-3.c is failing with -mlra:

$ avr-gcc -mmcu=attiny40  -dumpbase "" -save-temps -dp -Os -o pr43879-3.elf -mlra pr43879-3.c

Target: avr
Configured with: --target=avr --disable-nls --with-dwarf2 --with-gnu-as --with-gnu-ld --with-long-double=64 --disable-libcc1 --disable-shared --enable-languages=c,c++
gcc version 15.0.1 20250121 (experimental) (GCC)

The assembly reads:
main:
	...
/* prologue: function */
/* frame size = 8 */
/* stack size = 8 */
.L__stack_usage = 8
# Y[5]:SI = 5 looks ok.
	ldi r20,lo8(5)	 ;  173	[c=4 l=1]  movqi_insn/1
	subi r28,-5	 ;  120	[c=8 l=2]  *addhi3/3
	sbci r29,-1
	st Y+,r20		 ;  169	[c=4 l=1]  movqi_insn/2
	st Y+,__zero_reg__		 ;  177	[c=4 l=1]  movqi_insn/2
	st Y+,__zero_reg__		 ;  178	[c=4 l=1]  movqi_insn/2
	st Y+,__zero_reg__		 ;  179	[c=4 l=1]  movqi_insn/2
	subi r28,9	 ;  122	[c=8 l=2]  *addhi3/3
	sbc r29,__zero_reg__
.L9:
# Y[1]:SI = 5 looks ok.
	ldi r20,lo8(5)	 ;  180	[c=4 l=1]  movqi_insn/1
	ld __tmp_reg__,Y+	 ;  73	[c=8 l=1]  *addhi3/3
	st Y+,r20		 ;  143	[c=4 l=1]  movqi_insn/2
	st Y+,__zero_reg__		 ;  184	[c=4 l=1]  movqi_insn/2
	st Y+,__zero_reg__		 ;  185	[c=4 l=1]  movqi_insn/2
	st Y+,__zero_reg__		 ;  186	[c=4 l=1]  movqi_insn/2
	subi r28,5	 ;  75	[c=8 l=2]  *addhi3/3
	sbc r29,__zero_reg__
.L10:
# Reding 4 times R20:QI = Y[4] ???
	subi r28,-4	 ;  88	[c=8 l=2]  *addhi3/3
	sbci r29,-1
	ld r20,Y		 ;  89	[c=4 l=1]  movqi_insn/3
	push r20		 ;  11	[c=4 l=1]  pushqi1/0
	ld r20,Y		 ;  93	[c=4 l=1]  movqi_insn/3
	push r20		 ;  13	[c=4 l=1]  pushqi1/0
	ld r20,Y		 ;  97	[c=4 l=1]  movqi_insn/3
	push r20		 ;  15	[c=4 l=1]  pushqi1/0
	ld r20,Y		 ;  101	[c=4 l=1]  movqi_insn/3
	push r20		 ;  17	[c=4 l=1]  pushqi1/0
	subi r28,-5	 ;  132	[c=8 l=2]  *addhi3/3
	sbci r29,-1
# Reading from R22:SI = Y[9] but was never initialized.  Note frame size is 8.
	ld r22,Y+		 ;  155	[c=4 l=1]  movqi_insn/3
	ld r23,Y+		 ;  156	[c=4 l=1]  movqi_insn/3
	ld r24,Y+		 ;  157	[c=4 l=1]  movqi_insn/3
	ld r25,Y+		 ;  158	[c=4 l=1]  movqi_insn/3
	rcall f1	 ;  19	[c=0 l=1]  call_insn/1

The code after label .L10 looks very wrong.  R23 (and maybe other regs) are never properly initialized since they read garbage from the stack.

FYI, the error persists with -mno-fuse-add and is a bit more complicated then, but the problem remains:
.L10:
	subi r28,lo8(-(4))	 ;  78	[c=4 l=5]  movqi_insn/3
	sbci r29,hi8(-(4))
	ld r20,Y
	subi r28,lo8((4))
	sbci r29,hi8((4))
	push r20		 ;  11	[c=4 l=1]  pushqi1/0
	subi r28,lo8(-(4))	 ;  79	[c=4 l=5]  movqi_insn/3
	sbci r29,hi8(-(4))
	ld r20,Y
	subi r28,lo8((4))
	sbci r29,hi8((4))
	push r20		 ;  13	[c=4 l=1]  pushqi1/0
	subi r28,lo8(-(4))	 ;  80	[c=4 l=5]  movqi_insn/3
	sbci r29,hi8(-(4))
	ld r20,Y
	subi r28,lo8((4))
	sbci r29,hi8((4))
	push r20		 ;  15	[c=4 l=1]  pushqi1/0
	subi r28,lo8(-(4))	 ;  81	[c=4 l=5]  movqi_insn/3
	sbci r29,hi8(-(4))
	ld r20,Y
	subi r28,lo8((4))
	sbci r29,hi8((4))
	push r20		 ;  17	[c=4 l=1]  pushqi1/0
	subi r28,lo8(-(9))	 ;  82	[c=16 l=8]  *movsi/2
	sbci r29,hi8(-(9))
	ld r22,Y+
	ld r23,Y+
	ld r24,Y+
	ld r25,Y
	subi r28,lo8((9+3))
	sbci r29,hi8((9+3))

Running the testsuite from $builddir/gcc:

$ make -k check-gcc RUNTESTFLAGS="--target_board=attiny40-sim  dg-torture.exp=pr43879-3.c -tool_opts='-mlra'"
...
FAIL: gcc.dg/torture/pr43879-3.c   -O1  execution test
FAIL: gcc.dg/torture/pr43879-3.c   -O2  execution test
FAIL: gcc.dg/torture/pr43879-3.c   -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  execution test
FAIL: gcc.dg/torture/pr43879-3.c   -O3 -g  execution test
FAIL: gcc.dg/torture/pr43879-3.c   -Os  execution test
FAIL: gcc.dg/torture/pr43879-3.c   -O2 -flto -fno-use-linker-plugin -flto-partition=none  execution test
Comment 1 Georg-Johann Lay 2025-01-21 19:32:51 UTC
Created attachment 60230 [details]
reduced C99 test case

Here is a reduced test case that fails with -mlra -mmcu=attiny40 for any optimization level > 0.
Comment 2 Georg-Johann Lay 2025-01-21 20:16:37 UTC
(In reply to Georg-Johann Lay from comment #1)
> Created attachment 60230 [details]
> reduced C99 test case

In that test case:

__attribute__((noipa))
void func2 (long a, long b)
{
  static unsigned char count = 0;
  if (b != count++)
    __builtin_abort ();
}

int main (void)
{
  for (long b = 0; b < 5; ++b)
    func2 (0, b);

  return 0;
}

With -mlra -Os -mmcu=attiny40 -dp, main prepares argument b as follows (it lives in Y[1]:SI):

	subi r28,-4	 ;  48	[c=8 l=2]  *addhi3/3
	sbci r29,-1
	ld r20,Y		 ;  49	[c=4 l=1]  movqi_insn/3
	push r20		 ;  8	[c=4 l=1]  pushqi1/0
	ld r20,Y		 ;  53	[c=4 l=1]  movqi_insn/3
	push r20		 ;  10	[c=4 l=1]  pushqi1/0
	ld r20,Y		 ;  57	[c=4 l=1]  movqi_insn/3
	push r20		 ;  12	[c=4 l=1]  pushqi1/0
	ld r20,Y+		 ;  96	[c=4 l=1]  movqi_insn/3
	push r20		 ;  14	[c=4 l=1]  pushqi1/0

which is complete garbage and looks like elimination going bananas.  With -mno-lra, all is fine:

	subi r28,-4	 ;  47	[c=8 l=2]  *addhi3/3
	sbci r29,-1
	ld r20,Y		 ;  48	[c=4 l=1]  movqi_insn/3
	push r20		 ;  8	[c=4 l=1]  pushqi1/0
	ld r21,-Y		 ;  91	[c=4 l=1]  movqi_insn/3
	push r21		 ;  10	[c=4 l=1]  pushqi1/0
	ld r22,-Y		 ;  93	[c=4 l=1]  movqi_insn/3
	push r22		 ;  12	[c=4 l=1]  pushqi1/0
	ld r23,-Y		 ;  95	[c=4 l=1]  movqi_insn/3
	push r23		 ;  14	[c=4 l=1]  pushqi1/0

(FYI, the FP in Y is adjusted later (after the call) due to -mfuse-add, which also tidies the byte loads to LD -Y.)
Comment 3 Georg-Johann Lay 2025-01-22 10:33:12 UTC
Created attachment 60238 [details]
C99 test case that fails on ordinary AVRs (not avrtiny)

This test case fails on ordinary AVRs like -mmcu=atmega128.  It passes more arguments and clobbers some regs so that b lives in a stack slot:

__attribute__((noipa))
void func2 (long long a1, long long a2, long b)
{
  static unsigned char count = 0;
  if (b != count++)
    __builtin_abort ();
}

int main (void)
{
  for (long b = 0; b < 5; ++b)
    {
      __asm ("" ::: "r5", "r9");
      func2 (0, 0, b);
    }

  return 0;
}

$ avr-gcc -mmcu=atmega128 -dumpbase "" -save-temps -dp -Os -mlra ...

The argument preparation for b reads:

.L4:
	ldd r24,Y+4	 ;  63	[c=4 l=1]  movqi_insn/3
	push r24	 ;  9	[c=4 l=1]  pushqi1/0
	ldd r24,Y+4	 ;  64	[c=4 l=1]  movqi_insn/3
	push r24	 ;  11	[c=4 l=1]  pushqi1/0
	ldd r24,Y+4	 ;  65	[c=4 l=1]  movqi_insn/3
	push r24	 ;  13	[c=4 l=1]  pushqi1/0
	ldd r24,Y+4	 ;  66	[c=4 l=1]  movqi_insn/3
	push r24	 ;  15	[c=4 l=1]  pushqi1/0

What's also strange is that the code has a frame size of 4 and consequently, b lives in Y[1]:SI.  However, after the call of func2, the code accesses Y[5]:SI that was never initialized:

/* prologue: function */
/* frame size = 4 */
/* stack size = 4 */
.L__stack_usage = 4
# long b = 0;
	std Y+1,__zero_reg__	 ;  115	[c=4 l=1]  movqi_insn/2
	std Y+2,__zero_reg__	 ;  116	[c=4 l=1]  movqi_insn/2
	std Y+3,__zero_reg__	 ;  117	[c=4 l=1]  movqi_insn/2
	std Y+4,__zero_reg__	 ;  118	[c=4 l=1]  movqi_insn/2
.L4:
# Crippled passing of b.  Seems tlike LRA thinks that PUSH changes Y.
	ldd r24,Y+4	 ;  63	[c=4 l=1]  movqi_insn/3
	push r24	 ;  9	[c=4 l=1]  pushqi1/0
	ldd r24,Y+4	 ;  64	[c=4 l=1]  movqi_insn/3
	push r24	 ;  11	[c=4 l=1]  pushqi1/0
	ldd r24,Y+4	 ;  65	[c=4 l=1]  movqi_insn/3
	push r24	 ;  13	[c=4 l=1]  pushqi1/0
	ldd r24,Y+4	 ;  66	[c=4 l=1]  movqi_insn/3
	push r24	 ;  15	[c=4 l=1]  pushqi1/0
# Prepare long long args a1 and a2.
	...
	rcall func2	 ;  32	[c=0 l=1]  call_insn/1
# Following access is wrong.  As it seems, regalloc
# thinks that the PUSHes above changed the frame pointer?
	ldd r24,Y+5	 ;  102	[c=4 l=1]  movqi_insn/3
	ldd r25,Y+6	 ;  103	[c=4 l=1]  movqi_insn/3
	ldd r26,Y+7	 ;  104	[c=4 l=1]  movqi_insn/3
	ldd r27,Y+8	 ;  105	[c=4 l=1]  movqi_insn/3
	adiw r24,1	 ;  84	[c=16 l=3]  *addsi3/1
	adc r26,__zero_reg__
	adc r27,__zero_reg__
	std Y+5,r24	 ;  106	[c=4 l=1]  movqi_insn/2
	std Y+6,r25	 ;  107	[c=4 l=1]  movqi_insn/2
	std Y+7,r26	 ;  108	[c=4 l=1]  movqi_insn/2
	std Y+8,r27	 ;  109	[c=4 l=1]  movqi_insn/2
	 ; SP += 4	 ;  35	[c=8 l=4]  *addhi3_sp
	pop __tmp_reg__
	pop __tmp_reg__
	pop __tmp_reg__
	pop __tmp_reg__
# After popping b, loading b is from the correct offset:
	ldd r24,Y+1	 ;  110	[c=4 l=1]  movqi_insn/3
	ldd r25,Y+2	 ;  111	[c=4 l=1]  movqi_insn/3
	ldd r26,Y+3	 ;  112	[c=4 l=1]  movqi_insn/3
	ldd r27,Y+4	 ;  113	[c=4 l=1]  movqi_insn/3
	sbiw r26,0	 ;  87	[c=28 l=3]  *cmpsi/2
	sbci r25,hi8(5)
	sbci r24,lo8(5)
	brne .L4	 ;  88	[c=4 l=1]  branch
...
Comment 4 GCC Commits 2025-01-22 11:05:14 UTC
The master branch has been updated by Georg-Johann Lay <gjl@gcc.gnu.org>:

https://gcc.gnu.org/g:6f4592ae95eed53dc3a370f98c04a8f25f007811

commit r15-7123-g6f4592ae95eed53dc3a370f98c04a8f25f007811
Author: Georg-Johann Lay <avr@gjlay.de>
Date:   Wed Jan 22 12:02:16 2025 +0100

    AVR: Add test cases for PR118591.
    
    gcc/testsuite/
            PR rtl-optimization/118591
            * gcc.target/avr/torture/pr118591-1.c: New test.
            * gcc.target/avr/torture/pr118591-2.c: New test.
Comment 5 Denis Chertykov 2025-04-19 17:06:49 UTC
The patch is simple.

diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index 7dbc7fe1e00..6c86e4f8f6c 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -5883,8 +5883,8 @@ static bool
 inherit_reload_reg (bool def_p, int original_regno,
 		    enum reg_class cl, rtx_insn *insn, rtx next_usage_insns)
 {
-  if (optimize_function_for_size_p (cfun))
-    return false;
+  //  if (optimize_function_for_size_p (cfun))
+  //    return false;
 
   enum reg_class rclass = lra_get_allocno_class (original_regno);
   rtx original_reg = regno_reg_rtx[original_regno];
Comment 6 Denis Chertykov 2025-04-19 19:44:54 UTC
Proposed patch

https://gcc.gnu.org/pipermail/gcc-patches/2025-April/681478.html
Comment 7 Georg-Johann Lay 2025-06-27 13:02:02 UTC
The test case works now after the patches for PR120424 are upstream.