Bug 14504 - Missed Bit Twiddling Optimization
Summary: Missed Bit Twiddling Optimization
Status: NEW
Alias: None
Product: gcc
Classification: Unclassified
Component: rtl-optimization
Version: 3.3.2
Importance: P2 enhancement
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks:
 
Reported: 2004-03-09 17:16 UTC by Stephan T. Lavavej
Modified: 2024-10-31 20:02 UTC
CC: 4 users

See Also:
Host:
Target: i686-pc-linux-gnu
Build:
Known to work:
Known to fail:
Last reconfirmed: 2019-03-04 00:00:00


Description Stephan T. Lavavej 2004-03-09 17:16:59 UTC
g++ 3.3.2 on i686-pc-linux-gnu with -S -O3 -fomit-frame-pointer compiles:

unsigned long cond_mask(bool flag, unsigned long mask, unsigned long target) {
    return flag ? target | mask : target & ~mask;
}

into:

cmpb    $0, 4(%esp)
movl    8(%esp), %eax
movl    12(%esp), %edx
je      .L2
orl     %edx, %eax
ret
.L2:
notl    %eax
andl    %edx, %eax
ret

This appears to be a straightforward translation of the code into assembly. 
With the same options, this:

unsigned long cond_mask(bool flag, unsigned long mask, unsigned long target) {
    return (mask | target ^ 0xFFFFFFFFUL + flag) ^ 0xFFFFFFFFUL + flag;
}

is compiled into:

movzbl  4(%esp), %edx
movl    12(%esp), %eax
movl    8(%esp), %ecx
decl    %edx
xorl    %edx, %eax
orl     %ecx, %eax
xorl    %edx, %eax
ret

Again this appears to be a straightforward translation of the code into assembly
(instead of flag + 0xFFFFFFFFUL, it uses static_cast<unsigned long>(flag) - 1,
which is the same thing).

However, the rewritten code lacks a branch yet does the exact same thing.

g++ ought to be able to perform this reasonably simple transformation on its 
own - if, indeed, the transformation is desirable (which I think it is).

NB:
I do not know assembly, so I may have deleted important information from g++'s 
output. I don't think I have, however.

This case was suggested by ryanm@microsoft.com, who said "this is something I 
will never expect a compiler to be able to optimize for me.  ^_^". I wrote the 
transformed code; perhaps there is a cleverer way to transform it which I am 
not aware of.
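
For reference, here is a minimal standalone equivalence check (not part of the original report; the name cond_mask_branchless and the test values are illustrative). It writes the branchless expression with full parentheses (the original relies on ^ binding tighter than | and + binding tighter than ^), and it spells the constant as flag - 1 so the check also holds where unsigned long is 64 bits; on the 32-bit i686 target this is the same value as 0xFFFFFFFFUL + flag.

#include <cassert>

unsigned long cond_mask(bool flag, unsigned long mask, unsigned long target) {
    return flag ? target | mask : target & ~mask;
}

unsigned long cond_mask_branchless(bool flag, unsigned long mask, unsigned long target) {
    // k == 0 when flag is true, k == ~0UL when flag is false.
    unsigned long k = static_cast<unsigned long>(flag) - 1;
    return (mask | (target ^ k)) ^ k;
}

int main() {
    const unsigned long samples[] = { 0UL, 1UL, 0x12345678UL, 0xF0F0F0F0UL, 0xFFFFFFFFUL };
    for (int f = 0; f < 2; ++f)
        for (int m = 0; m < 5; ++m)
            for (int t = 0; t < 5; ++t)
                assert(cond_mask(f != 0, samples[m], samples[t]) ==
                       cond_mask_branchless(f != 0, samples[m], samples[t]));
    return 0;
}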
Comment 1 Andrew Pinski 2004-03-09 17:35:06 UTC
Confirmed.
Comment 2 Kazu Hirata 2004-03-11 00:09:18 UTC
I've got a patch.

/* There are four cases to handle, modulo whether FLAG appears
   positively and the ordering of the operands TARGET and MASK in the
   expression.

   With my patch, gcc -O2 -fomit-frame-pointer -mregparm=3 generates
   the following assembly code.  */

typedef _Bool bool;

unsigned long
cond_mask_0 (bool flag, unsigned long mask, unsigned long target)
{
  return flag ? target | mask : target & ~mask;
#if 0
	movzbl	%al, %eax
	decl	%eax
	xorl	%eax, %ecx
	orl	%edx, %ecx
	xorl	%ecx, %eax
#endif
}

unsigned long
cond_mask_1 (bool flag, unsigned long mask, unsigned long target)
{
  return flag ? target | ~mask : target & mask;
#if 0
	movzbl	%al, %eax
	negl	%eax
	xorl	%eax, %ecx
	andl	%edx, %ecx
	xorl	%ecx, %eax
#endif
}

unsigned long
cond_mask_2 (bool flag, unsigned long mask, unsigned long target)
{
  return flag ? target & mask : target | ~mask;
#if 0
	movzbl	%al, %eax
	decl	%eax
	xorl	%eax, %ecx
	andl	%edx, %ecx
	xorl	%ecx, %eax
#endif
}

unsigned long
cond_mask_3 (bool flag, unsigned long mask, unsigned long target)
{
  return flag ? target & ~mask : target | mask;
#if 0
	movzbl	%al, %eax
	negl	%eax
	xorl	%eax, %ecx
	orl	%edx, %ecx
	xorl	%ecx, %eax
#endif
}
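
Read back into C++ (my paraphrase of the assembly above, not part of the patch; the *_branchless names are illustrative), all four variants reduce to ((target ^ k) OP mask) ^ k, where k is flag - 1 for the decl cases and -flag for the negl cases:

unsigned long
cond_mask_0_branchless (bool flag, unsigned long mask, unsigned long target)
{
  unsigned long k = static_cast<unsigned long>(flag) - 1;  /* 0 if flag, ~0UL if not (decl) */
  return ((target ^ k) | mask) ^ k;   /* == flag ? target | mask : target & ~mask */
}

unsigned long
cond_mask_1_branchless (bool flag, unsigned long mask, unsigned long target)
{
  unsigned long k = -static_cast<unsigned long>(flag);     /* ~0UL if flag, 0 if not (negl) */
  return ((target ^ k) & mask) ^ k;   /* == flag ? target | ~mask : target & mask */
}

unsigned long
cond_mask_2_branchless (bool flag, unsigned long mask, unsigned long target)
{
  unsigned long k = static_cast<unsigned long>(flag) - 1;
  return ((target ^ k) & mask) ^ k;   /* == flag ? target & mask : target | ~mask */
}

unsigned long
cond_mask_3_branchless (bool flag, unsigned long mask, unsigned long target)
{
  unsigned long k = -static_cast<unsigned long>(flag);
  return ((target ^ k) | mask) ^ k;   /* == flag ? target & ~mask : target | mask */
}

When k is all ones, the two outer xors complement both target and the result, so by De Morgan the OR form yields target & ~mask and the AND form yields target | ~mask; when k is zero, the xors are no-ops and the middle operation is applied directly.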
Comment 3 Stephan T. Lavavej 2004-03-11 21:33:21 UTC
Neat. Is this patch for 3.3.x, 3.4.0, mainline, or some combination?
Comment 4 Kazu Hirata 2004-03-11 21:35:15 UTC
Mainline, as this is not a regression or a bug fix.
Comment 5 Stephan T. Lavavej 2004-03-11 21:41:04 UTC
Ok. (I thought that 3.3.4 was open for non-invasive enhancements; now I see 
that it's open for non-invasive non-regression bugfixes.)
Comment 6 Falk Hueffner 2004-03-11 22:10:38 UTC
My comment posted to gcc-bugs didn't make it into bugzilla, so I'll repeat it.

This is not universally a win. For example, on an Alpha EV5, the first:

andnot  a2,a1,t1
or      a2,a1,v0
cmoveq  a0,t1,v0

takes 2 cycles, and the second:

lda     t1,-1(a0)
xor     a2,t1,v0
or      a1,v0,t0
xor     t0,t1,v0

takes 4 cycles and 1 more insn. I suspect on i386 it'd be similar if
you enable conditional moves.
Comment 7 Stephan T. Lavavej 2004-03-11 22:21:56 UTC
Well, the Right Thing to do is to perform the transformation when it's a win, 
and also perform the /reverse transformation/ when it's a win, and not 
transform anything otherwise.
Comment 8 Kazu Hirata 2004-03-11 22:29:45 UTC
Err, i386 is a two-address machine with only a few registers,
so cmove doesn't work as well in this case.

typedef _Bool bool;

unsigned long
cond_mask (bool flag, unsigned long mask, unsigned long target)
{
  unsigned long l = target | mask;
  unsigned long r = target & ~mask;
  return flag ? l : r;
#if 0
	pushl	%ebx
	movl	%eax, %ebx ; %bl = flag
	movl	%ecx, %eax
	orl	%edx, %eax ; %eax = target | mask
	notl	%edx       ; ~mask
	andl	%ecx, %edx ; %edx = target & ~mask
	testb	%bl, %bl
	popl	%ebx
	cmove	%edx, %eax
#endif
}
Comment 9 Steven Bosscher 2019-03-04 11:39:09 UTC
// -O3 -m32 -fomit-frame-pointer
unsigned long cond_mask_1(bool flag, unsigned long mask, unsigned long target) {
    return flag ? target | mask : target & ~mask;
}

unsigned long cond_mask_2(bool flag, unsigned long mask, unsigned long target) {
    return (mask | target ^ 0xFFFFFFFFUL + flag) ^ 0xFFFFFFFFUL + flag;
}


GCC trunk:
cond_mask_1(bool, unsigned long, unsigned long):
        movl    12(%esp), %eax
        movl    8(%esp), %edx
        movl    %eax, %ecx
        orl     %edx, %ecx
        notl    %edx
        andl    %eax, %edx
        cmpb    $0, 4(%esp)
        movl    %ecx, %eax
        cmove   %edx, %eax
        ret
cond_mask_2(bool, unsigned long, unsigned long):
        movzbl  4(%esp), %eax
        leal    -1(%eax), %edx
        movl    12(%esp), %eax
        xorl    %edx, %eax
        orl     8(%esp), %eax
        xorl    %edx, %eax
        ret
Comment 10 Andrew Pinski 2024-10-31 20:02:31 UTC
GCC 4.6 started to use cmov.

I am not 100% sure, but cmov might be better, especially for other targets (e.g. aarch64, which has andn too).