Bug 24929 - long long shift/mask operations should be better optimized
Summary: long long shift/mask operations should be better optimized
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: middle-end (show other bugs)
Version: 4.1.0
Importance: P3 enhancement
Target Milestone: 4.3.0
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Duplicates: 28405 (view as bug list)
Depends on:
Blocks:
 
Reported: 2005-11-18 02:30 UTC by Thomas Kho
Modified: 2021-08-29 22:39 UTC (History)
5 users (show)

See Also:
Host:
Target: i686-linux
Build:
Known to work:
Known to fail:
Last reconfirmed: 2006-02-02 18:16:13


Attachments
shift/mask long long testcase (195 bytes, text/plain)
2005-11-18 02:35 UTC, Thomas Kho
Details

Note You need to log in before you can comment on or make changes to this bug.
Description Thomas Kho 2005-11-18 02:30:07 UTC
Shift/mask operations on long long, such as (x << 8) | ((y >> 48) & 0xffull), could be further optimized for x86. Please see the comments in the attached test case (posted earlier in PR 17886).
Comment 1 Thomas Kho 2005-11-18 02:35:52 UTC
Created attachment 10273 [details]
shift/mask long long testcase

Here is a rough instruction-count comparison for f() compiled at -O2 -march=pentiumpro,
between icc9 and gcc head 20051108 with the patch from PR 17886, comment #16:

icc: 11
gcc: 23

`icc -O2 -march=pentiumpro -S test3.c` gives:
        movl      4(%esp), %eax
        movl      8(%esp), %ecx
        movl      %eax, %edx
        shrl      $24, %edx
        shll      $8, %eax
        shll      $8, %ecx
        orl       %ecx, %edx
        movzwl    18(%esp), %ecx
        movzbl    %cl, %ecx
        orl       %ecx, %eax
        ret

`gcc -c test3.c -save-temps -O2 -march=pentiumpro -momit-leaf-frame-pointer`
gives:
        subl    $12, %esp
        movl    %edi, 8(%esp)
        movl    28(%esp), %edi
        movl    16(%esp), %eax
        movl    20(%esp), %edx
        movl    %esi, 4(%esp)
        movl    24(%esp), %esi
        movl    %edi, %esi
        xorl    %edi, %edi
        movl    8(%esp), %edi
        movl    %ebx, (%esp)
        shrl    $16, %esi
        xorl    %ebx, %ebx
        shldl   $8, %eax, %edx
        movl    %esi, %ecx
        movl    4(%esp), %esi
        orl     %ebx, %edx
        movl    (%esp), %ebx
        andl    $255, %ecx
        sall    $8, %eax
        addl    $12, %esp
        orl     %ecx, %eax
        ret

For comparison, here's the code from gcc 2.95.3. It generates the same 18
instructions for both -march=i386 and -march=pentiumpro.
`gcc -c test3.c -save-temps -O2 -momit-leaf-frame-pointer -march=pentiumpro`:
        pushl %ebx
        movl 8(%esp),%ecx
        movl 12(%esp),%ebx
        movl 16(%esp),%eax
        movl 20(%esp),%edx
        shldl $8,%ecx,%ebx
        sall $8,%ecx
        movl %edx,%eax
        xorl %edx,%edx
        shrl $16,%eax
        andl $255,%eax
        andl $0,%edx
        orl %eax,%ecx
        orl %edx,%ebx
        movl %ecx,%eax
        movl %ebx,%edx
        popl %ebx
        ret
Comment 2 Ian Lance Taylor 2006-02-02 18:14:29 UTC
With an updated version of RTH's subreg lowering pass, I get this instruction sequence:

f:
	movl	16(%esp), %eax
	movl	4(%esp), %edx
	movl	8(%esp), %ecx
	shrl	$16, %eax
	andl	$255, %eax
	shldl	$8, %edx, %ecx
	sall	$8, %edx
	orl	%edx, %eax
	movl	%ecx, %edx
	ret

This is one instruction shorter than the icc sequence, due to the use of shldl.  It could be improved by switching the roles of %ecx and %edx to avoid the final move, although that is complex to implement given the way the register allocator currently handles pseudo-registers larger than word mode.
Comment 3 Andrew Pinski 2006-02-02 18:16:13 UTC
Confirmed.
Comment 4 Ian Lance Taylor 2006-06-27 06:05:37 UTC
With my current version of the lower-subreg patch, I get this with -O2 -momit-leaf-frame-pointer:

f:
	movl	16(%esp), %eax
	movl	4(%esp), %ecx
	movl	8(%esp), %edx
	shrl	$16, %eax
	andl	$255, %eax
	shldl	$8, %ecx, %edx
	sall	$8, %ecx
	orl	%ecx, %eax
	ret

which may be optimal.
Comment 5 Uroš Bizjak 2006-06-27 10:12:57 UTC
(In reply to comment #4)

> which may be optimal.

        movzbl  18(%esp), %eax

could be used in this particular case.
Comment 6 Steven Bosscher 2006-09-20 22:19:50 UTC
*** Bug 28405 has been marked as a duplicate of this bug. ***
Comment 7 Bernhard Reutner-Fischer 2009-06-04 18:18:11 UTC
(In reply to comment #5)

>         movzbl  18(%esp), %eax
> 
> could be used in this particular case.

GCC 4.3.3 and later seem to do that. Fixed?

$ for i in 4.2 4.3 4.4 4.5.orig-HEAD;do printf "### %s\n" $(gcc-$i -dumpversion) ; gcc-$i -march=i386 -O2 -S -o- pr24929.c -fomit-frame-pointer | awk 'BEGIN{yep=0;}/^f:/{yep=1;}/^\./{yep=0;}{if (yep){print $0}}';done
### 4.2.4
f:
	pushl	%edi
	pushl	%esi
	pushl	%ebx
	movl	16(%esp), %esi
	movl	20(%esp), %edi
	movl	24(%esp), %ecx
	movl	28(%esp), %ebx
	movl	%ebx, %ecx
	xorl	%ebx, %ebx
	shrl	$16, %ecx
	movzbl	%cl,%eax
	xorl	%edx, %edx
	shldl	$8, %esi, %edi
	sall	$8, %esi
	orl	%esi, %eax
	orl	%edi, %edx
	popl	%ebx
	popl	%esi
	popl	%edi
	ret
	.size	f, .-f
	.p2align 2,,3
### 4.3.3
f:
	movl	4(%esp), %edx
	movl	8(%esp), %ecx
	shldl	$8, %edx, %ecx
	sall	$8, %edx
	movzbl	18(%esp), %eax
	orl	%edx, %eax
	movl	%ecx, %edx
	ret
	.size	f, .-f
	.p2align 2,,3
### 4.4.0
f:
	movl	4(%esp), %edx
	movl	8(%esp), %ecx
	shldl	$8, %edx, %ecx
	sall	$8, %edx
	movzbl	18(%esp), %eax
	orl	%edx, %eax
	movl	%ecx, %edx
	ret
	.size	f, .-f
	.p2align 2,,3
### 4.5.0
f:
	movl	4(%esp), %edx
	movl	8(%esp), %ecx
	shldl	$8, %edx, %ecx
	sall	$8, %edx
	movzbl	18(%esp), %eax
	orl	%edx, %eax
	movl	%ecx, %edx
	ret
	.size	f, .-f
	.p2align 2,,3

Comment 8 Andrew Pinski 2021-08-29 22:39:06 UTC
Fixed a long time ago.