Bug 40361 - Conditional return not always profitable with -Os
Summary: Conditional return not always profitable with -Os
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: rtl-optimization (show other bugs)
Version: 4.5.0
Importance: P3 enhancement
Target Milestone: 4.5.4
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization, patch
Depends on:
Blocks: 16996
  Show dependency treegraph
 
Reported: 2009-06-06 14:43 UTC by Steven Bosscher
Modified: 2022-04-26 19:19 UTC (History)
3 users (show)

See Also:
Host:
Target: arm-elf
Build:
Known to work:
Known to fail:
Last reconfirmed: 2010-01-02 00:18:21


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Steven Bosscher 2009-06-06 14:43:36 UTC
Consider this test case:

extern void bar1 (void);
extern void bar2 (void);

int a;
int b;

int foo (void)
{
  if (a < 0)
    {
      bar1 ();
      if (b < 0)
	b = 0;
    }
  else
    {
      bar2 ();
      if (b < 0)
	b = 0;
    }
}


Compiling with the x86_64-unknown-linux X arm-elf cross compiler "xgcc (GCC) 4.5.0 20090606 (experimental) [trunk revision 148236]", with options "-march=armv7-r -Os -dAP", without patches, results in this code:


	.file	"t.c"
	.text
	.align	2
	.global	foo
	.type	foo, %function
foo:
	@ args = 0, pretend = 0, frame = 0
	@ frame_needed = 0, uses_anonymous_args = 0
	@ basic block 2
	ldr	r3, .L6	@ 67	*arm_movsi_insn/5	[length = 4]
	stmfd	sp!, {r4, lr}	@ 69	*push_multi	[length = 4]
	ldr	r3, [r3, #0]	@ 6	*arm_movsi_insn/5	[length = 4]
	ldr	r4, .L6+4	@ 62	*arm_movsi_insn/5	[length = 4]
	cmp	r3, #0	@ 7	*arm_cmpsi_insn/1	[length = 4]
	bge	.L2	@ 8	*arm_cond_branch	[length = 4]
	@ basic block 3
	bl	bar1	@ 10	*call_symbol	[length = 4]
	ldr	r3, [r4, #0]	@ 12	*arm_movsi_insn/5	[length = 4]
	cmp	r3, #0	@ 13	*arm_cmpsi_insn/1	[length = 4]
	movlt	r3, #0	@ 17	neon_vornv2di+78/2	[length = 4]
	strlt	r3, [r4, #0]	@ 18	neon_vornv2di+78/6	[length = 4]
	ldmfd	sp!, {r4, pc}	@ 71	return	[length = 12]
.L2:
	@ basic block 4
	bl	bar2	@ 23	*call_symbol	[length = 4]
	ldr	r3, [r4, #0]	@ 25	*arm_movsi_insn/5	[length = 4]
	cmp	r3, #0	@ 26	*arm_cmpsi_insn/1	[length = 4]
	ldmgefd	sp!, {r4, pc}	@ 27	*cond_return	[length = 12]
	@ basic block 5
	mov	r3, #0	@ 30	*arm_movsi_insn/2	[length = 4]
	str	r3, [r4, #0]	@ 31	*arm_movsi_insn/6	[length = 4]
	ldmfd	sp!, {r4, pc}	@ 73	return	[length = 12]
.L7:
	.align	2
.L6:
	.word	a
	.word	b
	.size	foo, .-foo
	.comm	a,4,4
	.comm	b,4,4
	.ident	"GCC: (GNU) 4.5.0 20090606 (experimental) [trunk revision 148236]"



Note the conditional return at the end of basic block 3.  It voids a crossjumping opportunity and results in bigger code compared with a direct branch to a shared return instruction.  This comes from threading the return instruction in prologue/epilogue threading.  With a patch to disable this transformation when optimizing for size, I get this code:

	.file	"t.c"
	.text
	.align	2
	.global	foo
	.type	foo, %function
foo:
	@ args = 0, pretend = 0, frame = 0
	@ frame_needed = 0, uses_anonymous_args = 0
	@ basic block 2
	ldr	r3, .L9	@ 67	*arm_movsi_insn/5	[length = 4]
	stmfd	sp!, {r4, lr}	@ 69	*push_multi	[length = 4]
	ldr	r3, [r3, #0]	@ 6	*arm_movsi_insn/5	[length = 4]
	ldr	r4, .L9+4	@ 62	*arm_movsi_insn/5	[length = 4]
	cmp	r3, #0	@ 7	*arm_cmpsi_insn/1	[length = 4]
	bge	.L2	@ 8	*arm_cond_branch	[length = 4]
	@ basic block 3
	bl	bar1	@ 10	*call_symbol	[length = 4]
	b	.L7	@ 87	*arm_jump	[length = 4]
.L2:
	@ basic block 4
	bl	bar2	@ 23	*call_symbol	[length = 4]
.L7:
	@ basic block 5
	ldr	r3, [r4, #0]	@ 25	*arm_movsi_insn/5	[length = 4]
	cmp	r3, #0	@ 26	*arm_cmpsi_insn/1	[length = 4]
	movlt	r3, #0	@ 30	neon_vornv2di+78/2	[length = 4]
	strlt	r3, [r4, #0]	@ 31	neon_vornv2di+78/6	[length = 4]
	ldmfd	sp!, {r4, pc}	@ 72	return	[length = 12]
.L10:
	.align	2
.L9:
	.word	a
	.word	b
	.size	foo, .-foo
	.comm	a,4,4
	.comm	b,4,4
	.ident	"GCC: (GNU) 4.5.0 20090606 (experimental) [trunk revision 148236]"


Basic block 3 now ends in a direct jump and the similar code is cross-jumped.  

The size of the code (sum of the reported insn lengths) is 100 bytes for the unpatched code, and 64 bytes for the patched code.

This is the patch:

Index: ../../trunk/gcc/function.c
===================================================================
--- ../../trunk/gcc/function.c	(revision 148236)
+++ ../../trunk/gcc/function.c	(working copy)
@@ -5022,13 +5022,18 @@
 
   rtl_profile_for_bb (EXIT_BLOCK_PTR);
 #ifdef HAVE_return
-  if (optimize && HAVE_return)
+  if (HAVE_return && optimize_function_for_speed_p (cfun))
     {
       /* If we're allowed to generate a simple return instruction,
 	 then by definition we don't need a full epilogue.  Examine
 	 the block that falls through to EXIT.   If it does not
-	 contain any code, examine its predecessors and try to
-	 emit (conditional) return instructions.  */
+	 contain any code, and we are optimizing for speed, then
+	 examine its predecessors and try to emit (conditional)
+	 return instructions.  If optimizing for size, conditional
+	 returns are usually not a win, because they can interfere
+	 with crossjumping opportunities, and because there may be
+	 registers to restore from the stack in the return insns
+	 itself (such as the ARM return instruction).  */
 
       basic_block last;
       rtx label;
Comment 1 Steven Bosscher 2009-06-06 17:56:28 UTC
I tested the patch on arm-elf ("-march=armv7-r" and "-march=armv7 -mthumb"), trunk revision 148235, -Os, unpatched and patched, with CSiBE:

armv7-r:
unpatched : 3557127 bytes
patched   : 3554655 bytes
win: 2472 bytes (0.0006%)

armv7 thumb2:
unpatched : 2663575 bytes
patched   : 2663575 bytes
(identical)


For ARM some functions win, a few function lose, but the patch is an overall win.
Comment 2 Andrew Pinski 2021-07-26 06:42:01 UTC
So the cross-jumping opportunity has been taken since at least GCC 5.4, even with a conditional return.

        ldr     r3, .L8
        stmfd   sp!, {r4, lr}
        ldr     r3, [r3]
        ldr     r4, .L8+4
        cmp     r3, #0
        bge     .L2
        bl      bar1
        ldr     r3, [r4]
        cmp     r3, #0
        ldmgefd sp!, {r4, pc}
.L3:
        mov     r3, #0
        str     r3, [r4]
        ldmfd   sp!, {r4, pc}
.L2:
        bl      bar2
        ldr     r3, [r4]
        cmp     r3, #0
        blt     .L3
        ldmfd   sp!, {r4, pc}

The trunk produces this:
        push    {r4, lr}
        ldr     r4, .L9
        ldr     r3, [r4]
        cmp     r3, #0
        bge     .L2
        bl      bar1
.L8:
        ldr     r3, [r4, #4]
        cmp     r3, #0
        movlt   r3, #0
        strlt   r3, [r4, #4]
        pop     {r4, pc}
.L2:
        bl      bar2
        b       .L8

Which is even more cross jumped and note push and stmfd are the same here just written differently.
Comment 3 Roger Sayle 2022-04-26 19:19:05 UTC
I agree with Andrew Pinski that the issue with this test case (conditional return vs. cross-jumping on ARM code size) appears to have been fixed years ago — prior to GCC 4.5.4, according to godbolt.org.