Bug 11873

Summary: inefficient use of registers induces size and time overhead
Product: gcc Reporter: willy tarreau <willy>
Component: rtl-optimization Assignee: Not yet assigned to anyone <unassigned>
Status: NEW ---    
Severity: enhancement CC: gcc-bugs, ian
Priority: P2 Keywords: missed-optimization, ra
Version: 3.3.1   
Target Milestone: ---   
Host: Target: i586-linux-gnu
Build: Known to work:
Known to fail: Last reconfirmed: 2011-05-22 16:27:10
Bug Depends on: 15792    
Bug Blocks:    

Description willy tarreau 2003-08-10 08:38:58 UTC
I wrote a simple test function which takes an unsigned long long argument and
returns its low 32 bits (cast to int) plus 1 if the argument is non-zero,
otherwise 0. GCC 3.3.1 passes some intermediate values through several
registers although this is unnecessary. My full analysis follows.
I hope this can help improve the optimizer.
The following code :

int test(unsigned long long x) {
        if (x) {
                return (int)x + 1;
        }
        else {
                return (int)x;
        }
}

processed this way :

# 1 "bool.c"
# 1 "<built-in>"
# 1 "<command line>"
# 1 "bool.c"
int test(unsigned long long x) {
        if (x) {
                return (int)x + 1;
        }
        else {
                return (int)x;
        }
}



produces the following code when compiled with gcc 2.95.3 :
(gcc -c -O2 -fomit-frame-pointer bool.c)

00000000 <test>:
   0:   8b 54 24 04             mov    0x4(%esp,1),%edx
   4:   8b 4c 24 08             mov    0x8(%esp,1),%ecx
   8:   89 d0                   mov    %edx,%eax
   a:   09 c8                   or     %ecx,%eax
   c:   75 03                   jne    11 <test+0x11>
   e:   89 d0                   mov    %edx,%eax
  10:   c3                      ret
  11:   8d 42 01                lea    0x1(%edx),%eax
  14:   c3                      ret
  15:   8d 76 00                lea    0x0(%esi),%esi
  18:


and this one with gcc-3.3.1 :
(gcc-3.3.1 -c -O2 -fomit-frame-pointer bool.c)

00000000 <test>:
   0:   53                      push   %ebx
   1:   8b 4c 24 0c             mov    0xc(%esp,1),%ecx
   5:   8b 54 24 08             mov    0x8(%esp,1),%edx
   9:   89 cb                   mov    %ecx,%ebx
   b:   89 d0                   mov    %edx,%eax
   d:   09 d3                   or     %edx,%ebx
   f:   74 03                   je     14 <test+0x14>
  11:   8d 42 01                lea    0x1(%edx),%eax
  14:   5b                      pop    %ebx
  15:   c3                      ret
  16:   8d 76 00                lea    0x0(%esi),%esi
  19:   8d bc 27 00 00 00 00    lea    0x0(%edi,1),%edi
  20:


The EBX register is used and clobbered for nothing here. The
same code could be written this way, which is fully equivalent
and saves some cycles and bytes:

00000000 <test>:
   0:   8b 4c 24 0c             mov    0xc(%esp,1),%ecx
   4:   8b 54 24 08             mov    0x8(%esp,1),%edx
   8:   89 d0                   mov    %edx,%eax
   a:   09 d1                   or     %edx,%ecx
   c:   74 03                   je     11 <test+0x11>
   e:   8d 42 01                lea    0x1(%edx),%eax
  11:   c3                      ret
  12:

Now we can also save EDX and some more bytes :

00000000 <test>:
   0:   8b 4c 24 0c             mov    0xc(%esp,1),%ecx
   4:   8b 44 24 08             mov    0x8(%esp,1),%eax
   8:   09 c1                   or     %eax,%ecx
   a:   74 01                   je     d <test+0xd>
   c:   40                      inc    %eax
   d:   c3                      ret
   e:

Here are the compiler versions:

wtap:~/dev$ gcc -v
Reading specs from /usr/lib/gcc-lib/i586-pc-linux-gnu/2.95.3/specs
gcc version 2.95.3 20010315 (release)

wtap:~/dev$ gcc-3.3.1 -v
Reading specs from /usr/lib/gcc-lib/i586-pc-linux-gnu/3.3.1/specs
Configured with: ./configure --prefix=/usr --with-cpu=i386 --host=i586-pc-linux-gnu --enable-languages=c,c++ --disable-nls --disable-locale --enable-shared --enable-target-optspace --enable-version-specific-runtime-libs --program-suffix=-3.3.1 --enable-threads
Thread model: posix
gcc version 3.3.1
Comment 1 Andrew Pinski 2003-08-10 13:52:00 UTC
I can confirm this on the mainline (20030809).  GCC is not really good at optimizing long longs; I 
have some improvements, but they do not seem to help in this case.
Comment 2 Andrew Pinski 2004-06-03 04:29:02 UTC
I filed 15792 to track part of this bug.
Comment 3 Andrew Pinski 2004-11-13 06:01:18 UTC
I don't know why this was put in waiting but it should not have been.
Comment 4 Rask Ingemann Lambertsen 2007-11-09 23:51:41 UTC
This has improved (-O2 -fomit-frame-pointer):

test:
	movl	4(%esp), %eax	# 32	*movsi_1/1	[length = 4]
	movl	8(%esp), %edx	# 44	*movsi_1/1	[length = 4]
	orl	%eax, %edx	# 6	*iorsi_1/1	[length = 2]
	addl	$1, %eax	# 35	*addsi_1/1	[length = 3]
	cmpl	$1, %edx	# 38	*cmpsi_1_insn/1	[length = 3]
	sbbl	%edx, %edx	# 39	x86_movsicc_0_m1	[length = 2]
	notl	%edx		# 40	*one_cmplsi2_1	[length = 2]
	andl	%edx, %eax	# 41	*andsi_1/1	[length = 2]
	ret			# 47	return_internal	[length = 1]
	.ident	"GCC: (GNU) 4.3.0 20071102 (experimental)"

With -Os -fomit-frame-pointer we get:

test:
	movl	4(%esp), %edx	# 32	*movsi_1/1	[length = 4]
	xorl	%eax, %eax	# 48	*movsi_xor	[length = 2]
	movl	8(%esp), %ecx	# 43	*movsi_1/1	[length = 4]
	orl	%edx, %ecx	# 7	*iorsi_3	[length = 2]
	je	.L3		# 8	*jcc_1		[length = 2]
	leal	1(%edx), %eax	# 44	*lea_1		[length = 3]
.L3:
	ret			# 47	return_internal	[length = 1]

With -O2/-Os -fomit-frame-pointer -march=pentiumpro:

test:
	movl	4(%esp), %edx	# 32	*movsi_1/1	[length = 4]
	xorl	%eax, %eax	# 46	*movsi_xor	[length = 2]
	leal	1(%edx), %ecx	# 41	*lea_1		[length = 3]
	orl	8(%esp), %edx	# 36	*iorsi_3	[length = 4]
	cmovne	%ecx, %eax	# 38	*movsicc_noc/1	[length = 3]
	ret			# 44	return_internal	[length = 1]

I would probably code it like so:

	movl	4(%esp), %eax		; 4
	movl	8(%esp), %edx		; 4
	orl	%eax,	%edx		; 2
	addl	$-1,	%edx		; 3
	adcl	$0,	%eax		; 3
	ret				; 1