Bug 16541 - code quality issue for bit manipulations with 64bit
Summary: code quality issue for bit manipulations with 64bit
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: rtl-optimization (show other bugs)
Version: 4.0.0
: P2 enhancement
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization, ra
Depends on: 15792 18427
Blocks:
  Show dependency treegraph
 
Reported: 2004-07-14 16:17 UTC by Dan Nicolaescu
Modified: 2011-05-22 15:21 UTC (History)
2 users (show)

See Also:
Host:
Target: i686-*-*
Build:
Known to work:
Known to fail:
Last reconfirmed: 2006-01-15 20:28:51


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Dan Nicolaescu 2004-07-14 16:17:13 UTC
The function below is from crafty, from the current version of crafty, not the
one in SPEC2000, but they are functionally identical. 

extern unsigned char first_one[65536];

int FirstOne(unsigned long long arg1)
{
  if (arg1 >> 48)
    return (first_one[arg1 >> 48]);
  if ((arg1 >> 32) & 65535)
    return (first_one[(arg1 >> 32) & 65535] + 16);
  if ((arg1 >> 16) & 65535)
    return (first_one[(arg1 >> 16) & 65535] + 32);
  return (first_one[arg1 & 65535] + 48);
}

The code mainline generates for this with -O2 on x86 is: 

        pushl   %ebp
        xorl    %edx, %edx
        movl    %esp, %ebp
        subl    $12, %esp
        movl    %ebx, (%esp)
        movl    %edx, %ecx
        movl    %esi, 4(%esp)
        movl    %edi, 8(%esp)
        movl    12(%ebp), %edi
        movl    8(%ebp), %esi
        movl    %edi, %eax
        shrl    $16, %eax
        orl     %eax, %ecx
        je      .L2
        movzbl  first_one(%eax), %eax
        movl    (%esp), %ebx
        movl    4(%esp), %esi
        movl    8(%esp), %edi
        movl    %ebp, %esp
        popl    %ebp
        ret
        .p2align 4,,7
.L2:
        movl    %edi, %eax
        xorl    %ebx, %ebx
        movzwl  %ax,%ecx
        movl    %ebx, %eax
        orl     %ecx, %eax
        je      .L5
        movzbl  first_one(%ecx), %eax
        movl    (%esp), %ebx
        movl    4(%esp), %esi
        movl    8(%esp), %edi
        movl    %ebp, %esp
        popl    %ebp
        addl    $16, %eax
        ret
        .p2align 4,,7
.L5:
        movl    %esi, %eax
        xorl    %ebx, %ebx
        shrdl   $16, %edi, %eax
        movzwl  %ax,%ecx
        movl    %ebx, %eax
        orl     %ecx, %eax
        je      .L7
        movzbl  first_one(%ecx), %eax
        movl    (%esp), %ebx
        movl    4(%esp), %esi
        movl    8(%esp), %edi
        movl    %ebp, %esp
        popl    %ebp
        addl    $32, %eax
        ret
        .p2align 4,,7
.L7:
        movl    %esi, %eax
        movl    (%esp), %ebx
        andl    $65535, %eax
        movl    4(%esp), %esi
        movzbl  first_one(%eax), %eax
        movl    8(%esp), %edi
        movl    %ebp, %esp
        popl    %ebp
        addl    $48, %eax
        ret

The code generated on SPARC for this function can be seen in PR16532

Intel's compiler  generates much nicer code: 

# parameter 1: 4 + %esp
..B1.1:                         # Preds ..B1.0
        movl      8(%esp), %edx                                 #3.5
        movl      %edx, %ecx                                    #5.15
        shrl      $16, %ecx                                     #5.15
        xorl      %eax, %eax                                    #5.3
        orl       %ecx, %eax                                    #5.3
        jne       ..B1.7        # Prob 28%                      #5.3
                                # LOE edx ecx ebx ebp esi edi
..B1.2:                         # Preds ..B1.1
        movzwl    %dx, %edx                                     #7.22
        xorl      %eax, %eax                                    #7.3
        orl       %edx, %eax                                    #7.3
        jne       ..B1.6        # Prob 28%                      #7.3
                                # LOE edx ebx ebp esi edi
..B1.3:                         # Preds ..B1.2
        movl      4(%esp), %edx                                 #9.16
        shrl      $16, %edx                                     #9.16
        xorl      %eax, %eax                                    #9.3
        orl       %edx, %eax                                    #9.3
        je        ..B1.5        # Prob 50%                      #9.3
                                # LOE edx ebx ebp esi edi
..B1.4:                         # Preds ..B1.3
        movzbl    first_one(%edx), %eax                         #10.13
        addl      $32, %eax                                     #10.47
        ret                                                     #10.47
                                # LOE
..B1.5:                         # Preds ..B1.3
        movl      4(%esp), %edx                                 #11.28
        movzwl    %dx, %ecx                                     #11.28
        movzbl    first_one(%ecx), %eax                         #11.11
        addl      $48, %eax                                     #11.37
        ret                                                     #11.37
                                # LOE
..B1.6:                         # Preds ..B1.2
        movzbl    first_one(%edx), %eax                         #8.13
        addl      $16, %eax                                     #8.47
        ret                                                     #8.47
                                # LOE
..B1.7:                         # Preds ..B1.1
        movzbl    first_one(%ecx), %eax                         #6.13
        ret                                                     #6.13
Comment 1 Andrew Pinski 2004-07-14 16:28:16 UTC
Confirmed, basically 64bit and subregisters are not optimized that well on x86, see PR 15792 for 
another example.
Comment 2 Dan Nicolaescu 2004-07-14 21:58:49 UTC
Here is a simplified test case that shows a code quality regression:

extern unsigned char first_one[65536];
int FirstOnet(unsigned long long arg1)
{
  if (arg1 >> 48)
    return (first_one[arg1 >> 48]);
  return 0;
}

the code generated by gcc-3.0 -O2 -fomit-frame-pointer is:

        movl    8(%esp), %edx
        movl    %edx, %eax
        shrl    $16, %eax
        xorl    %edx, %edx
        movl    %eax, %ecx
        orl     %edx, %ecx
        je      .L3
        movzbl  first_one(%eax), %eax
        ret
        .p2align 2
.L3:
        xorl    %eax, %eax
        ret

and by mainline (a bit worse): 

        pushl   %ebx               <- using a callee saved register
        xorl    %ecx, %ecx
        movl    12(%esp), %edx
        movl    %edx, %eax         <- why not load directly to eax?
        xorl    %edx, %edx
        shrl    $16, %eax
        movl    %edx, %ebx
        orl     %eax, %ebx
        je      .L4
        movzbl  first_one(%eax), %ecx
.L4:
        popl    %ebx
        movl    %ecx, %eax
        ret


Here is what Intel's compiler generates:

        movzwl    10(%esp), %edx                                #28.5
        xorl      %eax, %eax                                    #30.3
        orl       %edx, %eax                                    #30.3
        je        ..B1.3        # Prob 50%                      #30.3
                                # LOE edx ebx ebp esi edi
..B1.2:                         # Preds ..B1.1
        movzbl    first_one(%edx), %eax                         #31.13
        ret                                                     #31.13
                                # LOE
..B1.3:                         # Preds ..B1.1
        xorl      %eax, %eax                                    #32.10
        ret                                                     #32.10


Comment 3 Dan Nicolaescu 2004-08-31 17:22:50 UTC
Using -fno-if-conversion when compiling to the code in comment #2 improves the
generated assembly a bit:

        movl    8(%esp), %edx
        movl    %edx, %eax
        xorl    %edx, %edx
        shrl    $16, %eax
        movl    %edx, %ecx
        orl     %eax, %ecx
        jne     .L2
        xorl    %eax, %eax
        ret
        .p2align 4,,7
.L2:
        movzbl  first_one(%eax), %eax
        ret
Comment 4 Andrew Pinski 2005-05-12 17:40:42 UTC
Looks like a register allocation issue.
Comment 5 Andrew Pinski 2008-09-14 04:20:26 UTC
IRA produces even worse code ...
Comment 6 Steven Bosscher 2011-05-22 15:21:05 UTC
(In reply to comment #2)
> Here is a simplified test case that shows a code quality regression:
> 
> extern unsigned char first_one[65536];
> int FirstOnet(unsigned long long arg1)
> {
>   if (arg1 >> 48)
>     return (first_one[arg1 >> 48]);
>   return 0;
> }
> 
> the code generated by gcc-3.0 -O2 -fomit-frame-pointer is:
> 
>         movl    8(%esp), %edx
>         movl    %edx, %eax
>         shrl    $16, %eax
>         xorl    %edx, %edx
>         movl    %eax, %ecx
>         orl     %edx, %ecx
>         je      .L3
>         movzbl  first_one(%eax), %eax
>         ret
>         .p2align 2
> .L3:
>         xorl    %eax, %eax
>         ret
> 
> and by mainline (a bit worse): 
> 
>         pushl   %ebx               <- using a callee saved register
>         xorl    %ecx, %ecx
>         movl    12(%esp), %edx
>         movl    %edx, %eax         <- why not load directly to eax?
>         xorl    %edx, %edx
>         shrl    $16, %eax
>         movl    %edx, %ebx
>         orl     %eax, %ebx
>         je      .L4
>         movzbl  first_one(%eax), %ecx
> .L4:
>         popl    %ebx
>         movl    %ecx, %eax
>         ret
> 
> 
> Here is what Intel's compiler generates:
> 
>         movzwl    10(%esp), %edx                                #28.5
>         xorl      %eax, %eax                                    #30.3
>         orl       %edx, %eax                                    #30.3
>         je        ..B1.3        # Prob 50%                      #30.3
>                                 # LOE edx ebx ebp esi edi
> ..B1.2:                         # Preds ..B1.1
>         movzbl    first_one(%edx), %eax                         #31.13
>         ret                                                     #31.13
>                                 # LOE
> ..B1.3:                         # Preds ..B1.1
>         xorl      %eax, %eax                                    #32.10
>         ret                                                     #32.10
> 
> 
> 

$ cc1 -quiet -m32 -O2 t.c -fdump-tree-optimized
$ cat t.s 
	.file	"t.c"
	.text
	.p2align 4,,15
	.globl	FirstOnet
	.type	FirstOnet, @function
FirstOnet:
.LFB0:
	.cfi_startproc
	movzwl	10(%esp), %edx
	xorl	%eax, %eax
	testl	%edx, %edx
	je	.L2
	movzbl	first_one(%edx), %eax
.L2:
	rep
	ret
	.cfi_endproc
.LFE0:
	.size	FirstOnet, .-FirstOnet
	.ident	"GCC: (GNU) 4.6.0 20110312 (experimental) [trunk revision 170907]"
	.section	.note.GNU-stack,"",@progbits