Bytes are typically extracted from e.g. u64's by something like #define D5(v) (((v) >> 40) & 0xff). The attached testcase shows that gcc does not optimize this pattern well enough.
Created attachment 8701 [details] generate assembly with -S and compare results
I don't think that bug description is correct. I believe similar observation will be valid for byte extraction from u32 and u16, and for u16-from-u32, etc. Update for latest gcc. This is what 4.0.0 produces from the testcase: # gcc -O2 -fomit-frame-pointer -S helper.c # cat helper.s [I removed non-essential stuff] a: movl v+8, %eax shrl $8, %eax xorb v, %al xorb v+18, %al xorb v+27, %al xorb v+36, %al movl v+40, %edx movl v+44, %ecx movl %ecx, %edx xorl %ecx, %ecx shrl $8, %edx xorl %edx, %eax xorb v+54, %al xorb v+63, %al movzbl %al, %eax ret b: movl v+8, %eax movl v+12, %edx shrdl $8, %edx, %eax shrl $8, %edx xorb v, %al movl v+16, %edx movl v+20, %ecx shrdl $16, %ecx, %edx shrl $16, %ecx xorl %edx, %eax movl v+24, %edx movl v+28, %ecx shrdl $24, %ecx, %edx shrl $24, %ecx xorl %edx, %eax xorb v+36, %al movl v+40, %edx movl v+44, %ecx movl %ecx, %edx xorl %ecx, %ecx shrl $8, %edx xorl %edx, %eax xorb v+54, %al xorb v+63, %al movzbl %al, %eax ret c: movb v+9, %al xorb v, %al xorb v+18, %al xorb v+27, %al xorb v+36, %al xorb v+45, %al xorb v+54, %al xorb v+63, %al movzbl %al, %eax ret d: movl v+8, %eax movl v+12, %edx shrdl $8, %edx, %eax shrl $8, %edx xorb v, %al movl v+16, %edx movl v+20, %ecx shrdl $16, %ecx, %edx shrl $16, %ecx xorl %edx, %eax movl v+24, %edx movl v+28, %ecx shrdl $24, %ecx, %edx shrl $24, %ecx xorl %edx, %eax xorb v+36, %al movl v+40, %edx movl v+44, %ecx movl %ecx, %edx xorl %ecx, %ecx shrl $8, %edx xorl %edx, %eax xorb v+54, %al xorb v+63, %al movzbl %al, %eax ret As you can see, a,b and d results are far from optimal, while c is almost perfect. Note that people typically use d, i.e. this: #define D7(v) (((v) >> 56)) #define D6(v) (((v) >> 48) & 0xff) #define D5(v) (((v) >> 40) & 0xff) #define D4(v) (((v) >> 32) & 0xff) #define D3(v) (((v) >> 24) & 0xff) #define D2(v) (((v) >> 16) & 0xff) #define D1(v) (((v) >> 8) & 0xff) #define D0(v) ((v) & 0xff)
Confirmed; this is most likely a subregister problem.
I think this might be a middle-end issue related to PR 7061 or PR 15184. We're doing slightly better with GCC 4.3.0 (because of subreg lowering, I guess), but not much (asm output with -dp for readability): a: movl v+44, %eax # 53 *movsi_1/1 [length = 5] movl v+8, %edx # 23 *movsi_1/1 [length = 6] shrl $8, %eax # 54 *lshrsi3_1/1 [length = 3] xorb v+36, %al # 11 *xorqi_1/1 [length = 6] xorb v, %al # 13 *xorqi_1/1 [length = 6] xorb v+54, %al # 17 *xorqi_1/1 [length = 6] xorb v+63, %al # 21 *xorqi_1/1 [length = 6] shrl $8, %edx # 24 *lshrsi3_1/1 [length = 3] xorl %edx, %eax # 66 *xorsi_1/1 [length = 2] xorb v+18, %al # 29 *xorqi_1/1 [length = 6] xorb v+27, %al # 33 *xorqi_1/1 [length = 6] ret # 69 return_internal [length = 1] b: pushl %ebx # 75 *pushsi2 [length = 1] movl v+20, %edx # 69 *movsi_1/1 [length = 6] movl v+12, %ebx # 67 *movsi_1/1 [length = 6] movl v+8, %ecx # 66 *movsi_1/1 [length = 6] movl v+16, %eax # 68 *movsi_1/1 [length = 5] shrdl $8, %ebx, %ecx # 81 x86_shrd_1/1 [length = 4] shrdl $16, %edx, %eax # 83 x86_shrd_1/1 [length = 4] movl v+24, %edx # 71 *movsi_1/1 [length = 6] xorl %ecx, %eax # 70 *xorsi_1/1 [length = 2] movl v+28, %ecx # 72 *movsi_1/1 [length = 6] xorb v, %al # 13 *xorqi_1/1 [length = 6] popl %ebx # 78 popsi1 [length = 1] shrdl $24, %ecx, %edx # 85 x86_shrd_1/1 [length = 4] xorl %edx, %eax # 73 *xorsi_1/1 [length = 2] movl v+44, %edx # 53 *movsi_1/1 [length = 6] xorb v+36, %al # 21 *xorqi_1/1 [length = 6] shrl $8, %edx # 54 *lshrsi3_1/1 [length = 3] xorl %edx, %eax # 74 *xorsi_1/1 [length = 2] xorb v+54, %al # 29 *xorqi_1/1 [length = 6] xorb v+63, %al # 33 *xorqi_1/1 [length = 6] ret # 79 return_internal [length = 1] c: movzbl v+9, %eax # 7 *movqi_1/3 [length = 7] xorb v+18, %al # 8 *xorqi_1/1 [length = 6] xorb v, %al # 9 *xorqi_1/1 [length = 6] xorb v+27, %al # 10 *xorqi_1/1 [length = 6] xorb v+36, %al # 11 *xorqi_1/1 [length = 6] xorb v+45, %al # 12 *xorqi_1/1 [length = 6] xorb v+54, %al # 13 *xorqi_1/1 [length = 6] xorb v+63, %al # 14 
*xorqi_1/1 [length = 6] ret # 33 return_internal [length = 1] d: pushl %ebx # 75 *pushsi2 [length = 1] movl v+20, %edx # 69 *movsi_1/1 [length = 6] movl v+12, %ebx # 67 *movsi_1/1 [length = 6] movl v+8, %ecx # 66 *movsi_1/1 [length = 6] movl v+16, %eax # 68 *movsi_1/1 [length = 5] shrdl $8, %ebx, %ecx # 81 x86_shrd_1/1 [length = 4] shrdl $16, %edx, %eax # 83 x86_shrd_1/1 [length = 4] movl v+24, %edx # 71 *movsi_1/1 [length = 6] xorl %ecx, %eax # 70 *xorsi_1/1 [length = 2] movl v+28, %ecx # 72 *movsi_1/1 [length = 6] xorb v, %al # 13 *xorqi_1/1 [length = 6] popl %ebx # 78 popsi1 [length = 1] shrdl $24, %ecx, %edx # 85 x86_shrd_1/1 [length = 4] xorl %edx, %eax # 73 *xorsi_1/1 [length = 2] movl v+44, %edx # 53 *movsi_1/1 [length = 6] xorb v+36, %al # 21 *xorqi_1/1 [length = 6] shrl $8, %edx # 54 *lshrsi3_1/1 [length = 3] xorl %edx, %eax # 74 *xorsi_1/1 [length = 2] xorb v+54, %al # 29 *xorqi_1/1 [length = 6] xorb v+63, %al # 33 *xorqi_1/1 [length = 6] ret # 79 return_internal [length = 1] .ident "GCC: (GNU) 4.3.0 20071102 (experimental)"
IRA is slightly worse; it uses one extra register.
Guess this can be closed now. All four cases look good: $ cat helper-4.6.3-O2.asm helper-4.6.3-O2.o: file format elf32-i386 ... 00000000 <a>: 0: 0f b6 05 2d 00 00 00 movzbl 0x2d,%eax 7: 32 05 24 00 00 00 xor 0x24,%al d: 32 05 00 00 00 00 xor 0x0,%al 13: 32 05 36 00 00 00 xor 0x36,%al 19: 32 05 3f 00 00 00 xor 0x3f,%al 1f: 32 05 09 00 00 00 xor 0x9,%al 25: 32 05 12 00 00 00 xor 0x12,%al 2b: 32 05 1b 00 00 00 xor 0x1b,%al 31: c3 ret Disassembly of section .text.b: 00000000 <b>: 0: 0f b6 05 12 00 00 00 movzbl 0x12,%eax 7: 32 05 09 00 00 00 xor 0x9,%al d: 32 05 00 00 00 00 xor 0x0,%al 13: 32 05 1b 00 00 00 xor 0x1b,%al 19: 32 05 24 00 00 00 xor 0x24,%al 1f: 32 05 2d 00 00 00 xor 0x2d,%al 25: 32 05 36 00 00 00 xor 0x36,%al 2b: 32 05 3f 00 00 00 xor 0x3f,%al 31: c3 ret Disassembly of section .text.c: 00000000 <c>: 0: 0f b6 05 09 00 00 00 movzbl 0x9,%eax 7: 32 05 00 00 00 00 xor 0x0,%al d: 32 05 12 00 00 00 xor 0x12,%al 13: 32 05 1b 00 00 00 xor 0x1b,%al 19: 32 05 24 00 00 00 xor 0x24,%al 1f: 32 05 2d 00 00 00 xor 0x2d,%al 25: 32 05 36 00 00 00 xor 0x36,%al 2b: 32 05 3f 00 00 00 xor 0x3f,%al 31: c3 ret Disassembly of section .text.d: 00000000 <d>: 0: 0f b6 05 12 00 00 00 movzbl 0x12,%eax 7: 32 05 09 00 00 00 xor 0x9,%al d: 32 05 00 00 00 00 xor 0x0,%al 13: 32 05 1b 00 00 00 xor 0x1b,%al 19: 32 05 24 00 00 00 xor 0x24,%al 1f: 32 05 2d 00 00 00 xor 0x2d,%al 25: 32 05 36 00 00 00 xor 0x36,%al 2b: 32 05 3f 00 00 00 xor 0x3f,%al 31: c3 ret Curiously, -Os manages to squeeze two more bytes out of it. helper-4.6.3-Os.o: file format elf32-i386 00000000 <a>: 0: a0 2d 00 00 00 mov 0x2d,%al ^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^ better than movzbl 5: 33 05 24 00 00 00 xor 0x24,%eax << why %eax? oh well... b: 33 05 00 00 00 00 xor 0x0,%eax 11: 32 05 36 00 00 00 xor 0x36,%al 17: 32 05 3f 00 00 00 xor 0x3f,%al 1d: 32 05 09 00 00 00 xor 0x9,%al 23: 32 05 12 00 00 00 xor 0x12,%al 29: 32 05 1b 00 00 00 xor 0x1b,%al 2f: c3 ret
Fixed at least in 4.7.2, maybe earlier. With -m32 -fomit-frame-pointer -O2: a: movzbl v+45, %eax xorb v+36, %al xorb v, %al xorb v+54, %al xorb v+63, %al xorb v+9, %al xorb v+18, %al xorb v+27, %al ret b: movzbl v+18, %eax xorb v+9, %al xorb v, %al xorb v+27, %al xorb v+36, %al xorb v+45, %al xorb v+54, %al xorb v+63, %al ret c: movzbl v+9, %eax xorb v, %al xorb v+18, %al xorb v+27, %al xorb v+36, %al xorb v+45, %al xorb v+54, %al xorb v+63, %al ret d: movzbl v+18, %eax xorb v+9, %al xorb v, %al xorb v+27, %al xorb v+36, %al xorb v+45, %al xorb v+54, %al xorb v+63, %al ret With same but -Os, my only complaint is that word-sized XORs are needlessly adding partial register update stalls: d: movb v+18, %al xorb v+9, %al xorl v, %eax xorb v+27, %al xorl v+36, %eax xorb v+45, %al xorb v+54, %al xorb v+63, %al ret but overall it looks much better. Feel free to close this BZ.
Fixed in GCC 4.6.3 and above.