24929 – long long shift/mask operations should be better optimized

Bug 24929 - long long shift/mask operations should be better optimized

Summary: long long shift/mask operations should be better optimized

Status:	RESOLVED FIXED

Alias:	None

Product:	gcc
Classification:	Unclassified
Component:	middle-end (show other bugs)
Version:	4.1.0

Importance:	P3 enhancement
Target Milestone:	4.3.0
Assignee:	Not yet assigned to anyone

URL:
Keywords:	missed-optimization

Duplicates (1):	28405 (view as bug list)
Depends on:
Blocks:

Reported:	2005-11-18 02:30 UTC by Thomas Kho
Modified:	2021-08-29 22:39 UTC (History)
CC List:	5 users (show)

See Also:
Host:
Target:	i686-linux
Build:
Known to work:
Known to fail:
Last reconfirmed:	2006-02-02 18:16:13

Attachments
shift/mask long long testcase (195 bytes, text/plain) 2005-11-18 02:35 UTC, Thomas Kho	Details
View All Add an attachment (proposed patch, testcase, etc.)

Note You need to log in before you can comment on or make changes to this bug.

Description Thomas Kho 2005-11-18 02:30:07 UTC

shift/mask operations on long long like (x << 8) | ((y >> 48) & 0xffull) could be further optimized for x86. Please see comments in the attached test case (posted earlier in PR 17886).

Comment 1 Thomas Kho 2005-11-18 02:35:52 UTC

Created attachment 10273 [details]
shift/mask long long testcase

Here is a rough instruction-count comparison for f() compiled at -O2
-march=pentiumpro between icc9 and gcc head 20051108 with the patch in PR 17886, comment #16:

icc: 11
gcc: 23

`icc -O2 -march=pentiumpro -S test3.c` gives:
        movl      4(%esp), %eax
        movl      8(%esp), %ecx
        movl      %eax, %edx
        shrl      $24, %edx
        shll      $8, %eax
        shll      $8, %ecx
        orl       %ecx, %edx
        movzwl    18(%esp), %ecx
        movzbl    %cl, %ecx
        orl       %ecx, %eax
        ret

`gcc -c test3.c -save-temps -O2 -march=pentiumpro -momit-leaf-frame-pointer`
gives:
        subl    $12, %esp
        movl    %edi, 8(%esp)
        movl    28(%esp), %edi
        movl    16(%esp), %eax
        movl    20(%esp), %edx
        movl    %esi, 4(%esp)
        movl    24(%esp), %esi
        movl    %edi, %esi
        xorl    %edi, %edi
        movl    8(%esp), %edi
        movl    %ebx, (%esp)
        shrl    $16, %esi
        xorl    %ebx, %ebx
        shldl   $8, %eax, %edx
        movl    %esi, %ecx
        movl    4(%esp), %esi
        orl     %ebx, %edx
        movl    (%esp), %ebx
        andl    $255, %ecx
        sall    $8, %eax
        addl    $12, %esp
        orl     %ecx, %eax
        ret

For comparison, here's the code from gcc 2.95.3. It generates the same 18
instructions for both -march=i386 and -march=pentiumpro.
`gcc -c test3.c -save-temps -O2 -momit-leaf-frame-pointer -march=pentiumpro`:
        pushl %ebx
        movl 8(%esp),%ecx
        movl 12(%esp),%ebx
        movl 16(%esp),%eax
        movl 20(%esp),%edx
        shldl $8,%ecx,%ebx
        sall $8,%ecx
        movl %edx,%eax
        xorl %edx,%edx
        shrl $16,%eax
        andl $255,%eax
        andl $0,%edx
        orl %eax,%ecx
        orl %edx,%ebx
        movl %ecx,%eax
        movl %ebx,%edx
        popl %ebx
        ret

Comment 2 Ian Lance Taylor 2006-02-02 18:14:29 UTC

With an updated version of RTH's subreg lowering pass, I get this instruction sequence:

f:
	movl	16(%esp), %eax
	movl	4(%esp), %edx
	movl	8(%esp), %ecx
	shrl	$16, %eax
	andl	$255, %eax
	shldl	$8, %edx, %ecx
	sall	$8, %edx
	orl	%edx, %eax
	movl	%ecx, %edx
	ret

This is one instruction shorter than the icc sequence, due to the use of shldl.  It could be improved by switching the roles of %ecx and %edx to avoid the final move, although that is complex to implement given the way the register allocator currently handles pseudo-registers larger than word mode.

Comment 3 Andrew Pinski 2006-02-02 18:16:13 UTC

Confirmed.

Comment 4 Ian Lance Taylor 2006-06-27 06:05:37 UTC

With my current version of the lower-subreg patch, I get this with -O2 -momit-leaf-frame-pointer:

f:
	movl	16(%esp), %eax
	movl	4(%esp), %ecx
	movl	8(%esp), %edx
	shrl	$16, %eax
	andl	$255, %eax
	shldl	$8, %ecx, %edx
	sall	$8, %ecx
	orl	%ecx, %eax
	ret

which may be optimal.

Comment 5 Uroš Bizjak 2006-06-27 10:12:57 UTC

(In reply to comment #4)

> which may be optimal.

        movzbl  18(%esp), %eax

could be used in this particular case.

Comment 6 Steven Bosscher 2006-09-20 22:19:50 UTC

*** Bug 28405 has been marked as a duplicate of this bug. ***

Comment 7 Bernhard Reutner-Fischer 2009-06-04 18:18:11 UTC

(In reply to comment #5)

>         movzbl  18(%esp), %eax
> 
> could be used in this particular case.

4.3.3 onward seem to do that. Fixed?

$ for i in 4.2 4.3 4.4 4.5.orig-HEAD;do printf "### %s\n" $(gcc-$i -dumpversion) ; gcc-$i -march=i386 -O2 -S -o- pr24929.c -fomit-frame-pointer | awk 'BEGIN{yep=0;}/^f:/{yep=1;}/^\./{yep=0;}{if (yep){print $0}}';done
### 4.2.4
f:
	pushl	%edi
	pushl	%esi
	pushl	%ebx
	movl	16(%esp), %esi
	movl	20(%esp), %edi
	movl	24(%esp), %ecx
	movl	28(%esp), %ebx
	movl	%ebx, %ecx
	xorl	%ebx, %ebx
	shrl	$16, %ecx
	movzbl	%cl,%eax
	xorl	%edx, %edx
	shldl	$8, %esi, %edi
	sall	$8, %esi
	orl	%esi, %eax
	orl	%edi, %edx
	popl	%ebx
	popl	%esi
	popl	%edi
	ret
	.size	f, .-f
	.p2align 2,,3
### 4.3.3
f:
	movl	4(%esp), %edx
	movl	8(%esp), %ecx
	shldl	$8, %edx, %ecx
	sall	$8, %edx
	movzbl	18(%esp), %eax
	orl	%edx, %eax
	movl	%ecx, %edx
	ret
	.size	f, .-f
	.p2align 2,,3
### 4.4.0
f:
	movl	4(%esp), %edx
	movl	8(%esp), %ecx
	shldl	$8, %edx, %ecx
	sall	$8, %edx
	movzbl	18(%esp), %eax
	orl	%edx, %eax
	movl	%ecx, %edx
	ret
	.size	f, .-f
	.p2align 2,,3
### 4.5.0
f:
	movl	4(%esp), %edx
	movl	8(%esp), %ecx
	shldl	$8, %edx, %ecx
	sall	$8, %edx
	movzbl	18(%esp), %eax
	orl	%edx, %eax
	movl	%ecx, %edx
	ret
	.size	f, .-f
	.p2align 2,,3

Comment 8 Andrew Pinski 2021-08-29 22:39:06 UTC

Fixed a long time ago.