Bug 21150 - Suboptimal byte extraction from 64bits
Summary: Suboptimal byte extraction from 64bits
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: rtl-optimization (show other bugs)
Version: 3.4.3
Importance: P2 normal
Target Milestone: 4.6.3
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization, ra
Depends on:
Blocks:
 
Reported: 2005-04-21 13:10 UTC by vda
Modified: 2021-07-25 00:51 UTC (History)
2 users (show)

See Also:
Host:
Target: i386-*-*
Build:
Known to work:
Known to fail: 4.3.0
Last reconfirmed: 2006-01-15 20:37:27


Attachments
generate assembly with -S and compare results (581 bytes, text/plain)
2005-04-21 13:12 UTC, vda
Details

Note You need to log in before you can comment on or make changes to this bug.
Description vda 2005-04-21 13:10:11 UTC
Bytes are typically extracted from e.g. u64's by something like

#define D5(v) (((v) >> 40) & 0xff)

The testcase shows that gcc does not optimize this well enough.
Comment 1 vda 2005-04-21 13:12:51 UTC
Created attachment 8701 [details]
generate assembly with -S and compare results
Comment 2 vda 2005-04-24 13:26:21 UTC
I don't think the bug description is fully accurate: the problem is not
specific to 64-bit values. I believe a similar observation will hold for
byte extraction from u32 and u16, for u16-from-u32 extraction, etc.

Update for latest gcc.
This is what 4.0.0 produces from the testcase:

# gcc -O2 -fomit-frame-pointer -S helper.c
# cat helper.s
 [I removed non-essential stuff]
a:
        movl    v+8, %eax
        shrl    $8, %eax
        xorb    v, %al
        xorb    v+18, %al
        xorb    v+27, %al
        xorb    v+36, %al
        movl    v+40, %edx
        movl    v+44, %ecx
        movl    %ecx, %edx
        xorl    %ecx, %ecx
        shrl    $8, %edx
        xorl    %edx, %eax
        xorb    v+54, %al
        xorb    v+63, %al
        movzbl  %al, %eax
        ret
b:
        movl    v+8, %eax
        movl    v+12, %edx
        shrdl   $8, %edx, %eax
        shrl    $8, %edx
        xorb    v, %al
        movl    v+16, %edx
        movl    v+20, %ecx
        shrdl   $16, %ecx, %edx
        shrl    $16, %ecx
        xorl    %edx, %eax
        movl    v+24, %edx
        movl    v+28, %ecx
        shrdl   $24, %ecx, %edx
        shrl    $24, %ecx
        xorl    %edx, %eax
        xorb    v+36, %al
        movl    v+40, %edx
        movl    v+44, %ecx
        movl    %ecx, %edx
        xorl    %ecx, %ecx
        shrl    $8, %edx
        xorl    %edx, %eax
        xorb    v+54, %al
        xorb    v+63, %al
        movzbl  %al, %eax
        ret
c:
        movb    v+9, %al
        xorb    v, %al
        xorb    v+18, %al
        xorb    v+27, %al
        xorb    v+36, %al
        xorb    v+45, %al
        xorb    v+54, %al
        xorb    v+63, %al
        movzbl  %al, %eax
        ret
d:
        movl    v+8, %eax
        movl    v+12, %edx
        shrdl   $8, %edx, %eax
        shrl    $8, %edx
        xorb    v, %al
        movl    v+16, %edx
        movl    v+20, %ecx
        shrdl   $16, %ecx, %edx
        shrl    $16, %ecx
        xorl    %edx, %eax
        movl    v+24, %edx
        movl    v+28, %ecx
        shrdl   $24, %ecx, %edx
        shrl    $24, %ecx
        xorl    %edx, %eax
        xorb    v+36, %al
        movl    v+40, %edx
        movl    v+44, %ecx
        movl    %ecx, %edx
        xorl    %ecx, %ecx
        shrl    $8, %edx
        xorl    %edx, %eax
        xorb    v+54, %al
        xorb    v+63, %al
        movzbl  %al, %eax
        ret

As you can see, a,b and d results are far from optimal,
while c is almost perfect.

Note that people typically use d, i.e. this:
#define D7(v) (((v) >> 56))
#define D6(v) (((v) >> 48) & 0xff)
#define D5(v) (((v) >> 40) & 0xff)
#define D4(v) (((v) >> 32) & 0xff)
#define D3(v) (((v) >> 24) & 0xff)
#define D2(v) (((v) >> 16) & 0xff)
#define D1(v) (((v) >>  8) & 0xff)
#define D0(v) ((v) & 0xff)
Comment 3 Andrew Pinski 2005-07-04 22:21:33 UTC
Confirmed, this is most likely a sub register problem.
Comment 4 Rask Ingemann Lambertsen 2007-11-09 19:48:12 UTC
I think this might be a middle-end issue related to PR 7061 or PR 15184. We're doing slightly better with GCC 4.3.0 (because of subreg lowering, I guess), but not much (asm output with -dp for readability):

a:
	movl	v+44, %eax	# 53	*movsi_1/1	[length = 5]
	movl	v+8, %edx	# 23	*movsi_1/1	[length = 6]
	shrl	$8, %eax	# 54	*lshrsi3_1/1	[length = 3]
	xorb	v+36, %al	# 11	*xorqi_1/1	[length = 6]
	xorb	v, %al		# 13	*xorqi_1/1	[length = 6]
	xorb	v+54, %al	# 17	*xorqi_1/1	[length = 6]
	xorb	v+63, %al	# 21	*xorqi_1/1	[length = 6]
	shrl	$8, %edx	# 24	*lshrsi3_1/1	[length = 3]
	xorl	%edx, %eax	# 66	*xorsi_1/1	[length = 2]
	xorb	v+18, %al	# 29	*xorqi_1/1	[length = 6]
	xorb	v+27, %al	# 33	*xorqi_1/1	[length = 6]
	ret			# 69	return_internal	[length = 1]
b:
	pushl	%ebx		# 75	*pushsi2	[length = 1]
	movl	v+20, %edx	# 69	*movsi_1/1	[length = 6]
	movl	v+12, %ebx	# 67	*movsi_1/1	[length = 6]
	movl	v+8, %ecx	# 66	*movsi_1/1	[length = 6]
	movl	v+16, %eax	# 68	*movsi_1/1	[length = 5]
	shrdl	$8, %ebx, %ecx	# 81	x86_shrd_1/1	[length = 4]
	shrdl	$16, %edx, %eax	# 83	x86_shrd_1/1	[length = 4]
	movl	v+24, %edx	# 71	*movsi_1/1	[length = 6]
	xorl	%ecx, %eax	# 70	*xorsi_1/1	[length = 2]
	movl	v+28, %ecx	# 72	*movsi_1/1	[length = 6]
	xorb	v, %al		# 13	*xorqi_1/1	[length = 6]
	popl	%ebx		# 78	popsi1		[length = 1]
	shrdl	$24, %ecx, %edx	# 85	x86_shrd_1/1	[length = 4]
	xorl	%edx, %eax	# 73	*xorsi_1/1	[length = 2]
	movl	v+44, %edx	# 53	*movsi_1/1	[length = 6]
	xorb	v+36, %al	# 21	*xorqi_1/1	[length = 6]
	shrl	$8, %edx	# 54	*lshrsi3_1/1	[length = 3]
	xorl	%edx, %eax	# 74	*xorsi_1/1	[length = 2]
	xorb	v+54, %al	# 29	*xorqi_1/1	[length = 6]
	xorb	v+63, %al	# 33	*xorqi_1/1	[length = 6]
	ret			# 79	return_internal	[length = 1]
c:
	movzbl	v+9, %eax	# 7	*movqi_1/3	[length = 7]
	xorb	v+18, %al	# 8	*xorqi_1/1	[length = 6]
	xorb	v, %al		# 9	*xorqi_1/1	[length = 6]
	xorb	v+27, %al	# 10	*xorqi_1/1	[length = 6]
	xorb	v+36, %al	# 11	*xorqi_1/1	[length = 6]
	xorb	v+45, %al	# 12	*xorqi_1/1	[length = 6]
	xorb	v+54, %al	# 13	*xorqi_1/1	[length = 6]
	xorb	v+63, %al	# 14	*xorqi_1/1	[length = 6]
	ret			# 33	return_internal	[length = 1]
d:
	pushl	%ebx		# 75	*pushsi2	[length = 1]
	movl	v+20, %edx	# 69	*movsi_1/1	[length = 6]
	movl	v+12, %ebx	# 67	*movsi_1/1	[length = 6]
	movl	v+8, %ecx	# 66	*movsi_1/1	[length = 6]
	movl	v+16, %eax	# 68	*movsi_1/1	[length = 5]
	shrdl	$8, %ebx, %ecx	# 81	x86_shrd_1/1	[length = 4]
	shrdl	$16, %edx, %eax	# 83	x86_shrd_1/1	[length = 4]
	movl	v+24, %edx	# 71	*movsi_1/1	[length = 6]
	xorl	%ecx, %eax	# 70	*xorsi_1/1	[length = 2]
	movl	v+28, %ecx	# 72	*movsi_1/1	[length = 6]
	xorb	v, %al		# 13	*xorqi_1/1	[length = 6]
	popl	%ebx		# 78	popsi1		[length = 1]
	shrdl	$24, %ecx, %edx	# 85	x86_shrd_1/1	[length = 4]
	xorl	%edx, %eax	# 73	*xorsi_1/1	[length = 2]
	movl	v+44, %edx	# 53	*movsi_1/1	[length = 6]
	xorb	v+36, %al	# 21	*xorqi_1/1	[length = 6]
	shrl	$8, %edx	# 54	*lshrsi3_1/1	[length = 3]
	xorl	%edx, %eax	# 74	*xorsi_1/1	[length = 2]
	xorb	v+54, %al	# 29	*xorqi_1/1	[length = 6]
	xorb	v+63, %al	# 33	*xorqi_1/1	[length = 6]
	ret			# 79	return_internal	[length = 1]
	.ident	"GCC: (GNU) 4.3.0 20071102 (experimental)"
Comment 5 Andrew Pinski 2008-09-14 03:53:31 UTC
IRA is slightly worse, it uses one extra register ....
Comment 6 Denis Vlasenko 2013-01-18 11:12:18 UTC
Guess this can be closed now. All four cases look good:

$ cat helper-4.6.3-O2.asm 
helper-4.6.3-O2.o:     file format elf32-i386
...
00000000 <a>:
   0:	0f b6 05 2d 00 00 00 	movzbl 0x2d,%eax
   7:	32 05 24 00 00 00    	xor    0x24,%al
   d:	32 05 00 00 00 00    	xor    0x0,%al
  13:	32 05 36 00 00 00    	xor    0x36,%al
  19:	32 05 3f 00 00 00    	xor    0x3f,%al
  1f:	32 05 09 00 00 00    	xor    0x9,%al
  25:	32 05 12 00 00 00    	xor    0x12,%al
  2b:	32 05 1b 00 00 00    	xor    0x1b,%al
  31:	c3                   	ret    
Disassembly of section .text.b:
00000000 <b>:
   0:	0f b6 05 12 00 00 00 	movzbl 0x12,%eax
   7:	32 05 09 00 00 00    	xor    0x9,%al
   d:	32 05 00 00 00 00    	xor    0x0,%al
  13:	32 05 1b 00 00 00    	xor    0x1b,%al
  19:	32 05 24 00 00 00    	xor    0x24,%al
  1f:	32 05 2d 00 00 00    	xor    0x2d,%al
  25:	32 05 36 00 00 00    	xor    0x36,%al
  2b:	32 05 3f 00 00 00    	xor    0x3f,%al
  31:	c3                   	ret    
Disassembly of section .text.c:
00000000 <c>:
   0:	0f b6 05 09 00 00 00 	movzbl 0x9,%eax
   7:	32 05 00 00 00 00    	xor    0x0,%al
   d:	32 05 12 00 00 00    	xor    0x12,%al
  13:	32 05 1b 00 00 00    	xor    0x1b,%al
  19:	32 05 24 00 00 00    	xor    0x24,%al
  1f:	32 05 2d 00 00 00    	xor    0x2d,%al
  25:	32 05 36 00 00 00    	xor    0x36,%al
  2b:	32 05 3f 00 00 00    	xor    0x3f,%al
  31:	c3                   	ret    
Disassembly of section .text.d:
00000000 <d>:
   0:	0f b6 05 12 00 00 00 	movzbl 0x12,%eax
   7:	32 05 09 00 00 00    	xor    0x9,%al
   d:	32 05 00 00 00 00    	xor    0x0,%al
  13:	32 05 1b 00 00 00    	xor    0x1b,%al
  19:	32 05 24 00 00 00    	xor    0x24,%al
  1f:	32 05 2d 00 00 00    	xor    0x2d,%al
  25:	32 05 36 00 00 00    	xor    0x36,%al
  2b:	32 05 3f 00 00 00    	xor    0x3f,%al
  31:	c3                   	ret    

Curiously, -Os manages to squeeze two more bytes out of it.

helper-4.6.3-Os.o:     file format elf32-i386
00000000 <a>:
   0:   a0 2d 00 00 00          mov    0x2d,%al
        ^^^^^^^^^^^^^^          ^^^^^^^^^^^^^^^ better than movzbl
   5:   33 05 24 00 00 00       xor    0x24,%eax  << why %eax? oh well...
   b:   33 05 00 00 00 00       xor    0x0,%eax
  11:   32 05 36 00 00 00       xor    0x36,%al
  17:   32 05 3f 00 00 00       xor    0x3f,%al
  1d:   32 05 09 00 00 00       xor    0x9,%al
  23:   32 05 12 00 00 00       xor    0x12,%al
  29:   32 05 1b 00 00 00       xor    0x1b,%al
  2f:   c3                      ret
Comment 7 Denis Vlasenko 2016-04-15 14:07:19 UTC
Fixed at least in 4.7.2, maybe earlier. With -m32 -fomit-frame-pointer -O2:

a:      movzbl  v+45, %eax
        xorb    v+36, %al
        xorb    v, %al
        xorb    v+54, %al
        xorb    v+63, %al
        xorb    v+9, %al
        xorb    v+18, %al
        xorb    v+27, %al
        ret
b:      movzbl  v+18, %eax
        xorb    v+9, %al
        xorb    v, %al
        xorb    v+27, %al
        xorb    v+36, %al
        xorb    v+45, %al
        xorb    v+54, %al
        xorb    v+63, %al
        ret
c:      movzbl  v+9, %eax
        xorb    v, %al
        xorb    v+18, %al
        xorb    v+27, %al
        xorb    v+36, %al
        xorb    v+45, %al
        xorb    v+54, %al
        xorb    v+63, %al
        ret
d:      movzbl  v+18, %eax
        xorb    v+9, %al
        xorb    v, %al
        xorb    v+27, %al
        xorb    v+36, %al
        xorb    v+45, %al
        xorb    v+54, %al
        xorb    v+63, %al
        ret

With same but -Os, my only complaint is that word-sized XORs are needlessly adding partial register update stalls:

d:      movb    v+18, %al
        xorb    v+9, %al
        xorl    v, %eax
        xorb    v+27, %al
        xorl    v+36, %eax
        xorb    v+45, %al
        xorb    v+54, %al
        xorb    v+63, %al
        ret

but overall it looks much better. Feel free to close this BZ.
Comment 8 Andrew Pinski 2021-07-25 00:51:24 UTC
Fixed in GCC 4.6.3 and above.