The function below is from crafty, from the current version of crafty, not the one in SPEC2000, but they are functionally identical. extern unsigned char first_one[65536]; int FirstOne(unsigned long long arg1) { if (arg1 >> 48) return (first_one[arg1 >> 48]); if ((arg1 >> 32) & 65535) return (first_one[(arg1 >> 32) & 65535] + 16); if ((arg1 >> 16) & 65535) return (first_one[(arg1 >> 16) & 65535] + 32); return (first_one[arg1 & 65535] + 48); } The code mainline generates for this with -O2 on x86 is: pushl %ebp xorl %edx, %edx movl %esp, %ebp subl $12, %esp movl %ebx, (%esp) movl %edx, %ecx movl %esi, 4(%esp) movl %edi, 8(%esp) movl 12(%ebp), %edi movl 8(%ebp), %esi movl %edi, %eax shrl $16, %eax orl %eax, %ecx je .L2 movzbl first_one(%eax), %eax movl (%esp), %ebx movl 4(%esp), %esi movl 8(%esp), %edi movl %ebp, %esp popl %ebp ret .p2align 4,,7 .L2: movl %edi, %eax xorl %ebx, %ebx movzwl %ax,%ecx movl %ebx, %eax orl %ecx, %eax je .L5 movzbl first_one(%ecx), %eax movl (%esp), %ebx movl 4(%esp), %esi movl 8(%esp), %edi movl %ebp, %esp popl %ebp addl $16, %eax ret .p2align 4,,7 .L5: movl %esi, %eax xorl %ebx, %ebx shrdl $16, %edi, %eax movzwl %ax,%ecx movl %ebx, %eax orl %ecx, %eax je .L7 movzbl first_one(%ecx), %eax movl (%esp), %ebx movl 4(%esp), %esi movl 8(%esp), %edi movl %ebp, %esp popl %ebp addl $32, %eax ret .p2align 4,,7 .L7: movl %esi, %eax movl (%esp), %ebx andl $65535, %eax movl 4(%esp), %esi movzbl first_one(%eax), %eax movl 8(%esp), %edi movl %ebp, %esp popl %ebp addl $48, %eax ret The code generated on SPARC for this function can be seen in PR16532 Intel's compiler generates much nicer code: # parameter 1: 4 + %esp ..B1.1: # Preds ..B1.0 movl 8(%esp), %edx #3.5 movl %edx, %ecx #5.15 shrl $16, %ecx #5.15 xorl %eax, %eax #5.3 orl %ecx, %eax #5.3 jne ..B1.7 # Prob 28% #5.3 # LOE edx ecx ebx ebp esi edi ..B1.2: # Preds ..B1.1 movzwl %dx, %edx #7.22 xorl %eax, %eax #7.3 orl %edx, %eax #7.3 jne ..B1.6 # Prob 28% #7.3 # LOE edx ebx ebp esi edi ..B1.3: # Preds ..B1.2 movl 4(%esp), %edx #9.16 shrl $16, %edx #9.16 xorl %eax, %eax #9.3 orl %edx, %eax #9.3 je ..B1.5 # Prob 50% #9.3 # LOE edx ebx ebp esi edi ..B1.4: # Preds ..B1.3 movzbl first_one(%edx), %eax #10.13 addl $32, %eax #10.47 ret #10.47 # LOE ..B1.5: # Preds ..B1.3 movl 4(%esp), %edx #11.28 movzwl %dx, %ecx #11.28 movzbl first_one(%ecx), %eax #11.11 addl $48, %eax #11.37 ret #11.37 # LOE ..B1.6: # Preds ..B1.2 movzbl first_one(%edx), %eax #8.13 addl $16, %eax #8.47 ret #8.47 # LOE ..B1.7: # Preds ..B1.1 movzbl first_one(%ecx), %eax #6.13 ret #6.13
Confirmed, basically 64bit and subregisters are not optimized that well on x86, see PR 15792 for another example.
Here is a simplified test case that shows a code quality regression: extern unsigned char first_one[65536]; int FirstOnet(unsigned long long arg1) { if (arg1 >> 48) return (first_one[arg1 >> 48]); return 0; } the code generated by gcc-3.0 -O2 -fomit-frame-pointer is: movl 8(%esp), %edx movl %edx, %eax shrl $16, %eax xorl %edx, %edx movl %eax, %ecx orl %edx, %ecx je .L3 movzbl first_one(%eax), %eax ret .p2align 2 .L3: xorl %eax, %eax ret and by mainline (a bit worse): pushl %ebx <- using a callee saved register xorl %ecx, %ecx movl 12(%esp), %edx movl %edx, %eax <- why not load directly to eax? xorl %edx, %edx shrl $16, %eax movl %edx, %ebx orl %eax, %ebx je .L4 movzbl first_one(%eax), %ecx .L4: popl %ebx movl %ecx, %eax ret Here is what Intel's compiler generates: movzwl 10(%esp), %edx #28.5 xorl %eax, %eax #30.3 orl %edx, %eax #30.3 je ..B1.3 # Prob 50% #30.3 # LOE edx ebx ebp esi edi ..B1.2: # Preds ..B1.1 movzbl first_one(%edx), %eax #31.13 ret #31.13 # LOE ..B1.3: # Preds ..B1.1 xorl %eax, %eax #32.10 ret #32.10
Using -fno-if-conversion when compiling to the code in comment #2 improves the generated assembly a bit: movl 8(%esp), %edx movl %edx, %eax xorl %edx, %edx shrl $16, %eax movl %edx, %ecx orl %eax, %ecx jne .L2 xorl %eax, %eax ret .p2align 4,,7 .L2: movzbl first_one(%eax), %eax ret
Looks like a register allocation issue.
IRA produces even worse code ...
(In reply to comment #2) > Here is a simplified test case that shows a code quality regression: > > extern unsigned char first_one[65536]; > int FirstOnet(unsigned long long arg1) > { > if (arg1 >> 48) > return (first_one[arg1 >> 48]); > return 0; > } > > the code generated by gcc-3.0 -O2 -fomit-frame-pointer is: > > movl 8(%esp), %edx > movl %edx, %eax > shrl $16, %eax > xorl %edx, %edx > movl %eax, %ecx > orl %edx, %ecx > je .L3 > movzbl first_one(%eax), %eax > ret > .p2align 2 > .L3: > xorl %eax, %eax > ret > > and by mainline (a bit worse): > > pushl %ebx <- using a callee saved register > xorl %ecx, %ecx > movl 12(%esp), %edx > movl %edx, %eax <- why not load directly to eax? > xorl %edx, %edx > shrl $16, %eax > movl %edx, %ebx > orl %eax, %ebx > je .L4 > movzbl first_one(%eax), %ecx > .L4: > popl %ebx > movl %ecx, %eax > ret > > > Here is what Intel's compiler generates: > > movzwl 10(%esp), %edx #28.5 > xorl %eax, %eax #30.3 > orl %edx, %eax #30.3 > je ..B1.3 # Prob 50% #30.3 > # LOE edx ebx ebp esi edi > ..B1.2: # Preds ..B1.1 > movzbl first_one(%edx), %eax #31.13 > ret #31.13 > # LOE > ..B1.3: # Preds ..B1.1 > xorl %eax, %eax #32.10 > ret #32.10 > > > $ cc1 -quiet -m32 -O2 t.c -fdump-tree-optimized $ cat t.s .file "t.c" .text .p2align 4,,15 .globl FirstOnet .type FirstOnet, @function FirstOnet: .LFB0: .cfi_startproc movzwl 10(%esp), %edx xorl %eax, %eax testl %edx, %edx je .L2 movzbl first_one(%edx), %eax .L2: rep ret .cfi_endproc .LFE0: .size FirstOnet, .-FirstOnet .ident "GCC: (GNU) 4.6.0 20110312 (experimental) [trunk revision 170907]" .section .note.GNU-stack,"",@progbits