Bug 38126 - suboptimal code for (a && b || !a && !b)
Summary: suboptimal code for (a && b || !a && !b)
Status: NEW
Alias: None
Product: gcc
Classification: Unclassified
Component: target (show other bugs)
Version: 4.3.0
: P3 enhancement
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks:
 
Reported: 2008-11-15 00:06 UTC by Martin Sebor
Modified: 2022-12-19 20:57 UTC (History)
2 users (show)

See Also:
Host:
Target: x86_64
Build:
Known to work:
Known to fail:
Last reconfirmed: 2012-02-01 00:00:00


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Martin Sebor 2008-11-15 00:06:31 UTC
I would expect gcc to generate comparable code for both functions below, or perhaps even better code for foo() than for bar() since the code in foo() is likely to be more common than the equivalent code in bar(). However, the code produced for foo() is suboptimal in comparison to the code for bar(). In my timings on x86 with gcc 4.3.0 at -O2, foo() appears to run about 5% slower than bar().

$ cat t.c && gcc -S -O2 t.c && cat t.s
int foo (int *a, int *b) { return a && b || !a && !b; }
int bar (int *a, int *b) { return !!a == !!b; }
        .file   "t.c"
        .text
        .p2align 4,,15
.globl foo
        .type   foo, @function
foo:
.LFB2:
        testq   %rdi, %rdi
        je      .L2
        testq   %rsi, %rsi
        movl    $1, %eax
        je      .L2
        rep
        ret
        .p2align 4,,10
        .p2align 3
.L2:
        testq   %rdi, %rdi
        sete    %al
        testq   %rsi, %rsi
        sete    %dl
        andl    %edx, %eax
        movzbl  %al, %eax
        ret
.LFE2:
        .size   foo, .-foo
        .p2align 4,,15
.globl bar
        .type   bar, @function
bar:
.LFB3:
        testq   %rdi, %rdi
        sete    %al
        testq   %rsi, %rsi
        setne   %dl
        xorl    %edx, %eax
        movzbl  %al, %eax
        ret
.LFE3:
        .size   bar, .-bar
        .section        .eh_frame,"a",@progbits
.Lframe1:
        .long   .LECIE1-.LSCIE1
.LSCIE1:
        .long   0x0
        .byte   0x1
        .string "zR"
        .uleb128 0x1
        .sleb128 -8
        .byte   0x10
        .uleb128 0x1
        .byte   0x3
        .byte   0xc
        .uleb128 0x7
        .uleb128 0x8
        .byte   0x90
        .uleb128 0x1
        .align 8
.LECIE1:
.LSFDE1:
        .long   .LEFDE1-.LASFDE1
.LASFDE1:
        .long   .LASFDE1-.Lframe1
        .long   .LFB2
        .long   .LFE2-.LFB2
        .uleb128 0x0
        .align 8
.LEFDE1:
.LSFDE3:
        .long   .LEFDE3-.LASFDE3
.LASFDE3:
        .long   .LASFDE3-.Lframe1
        .long   .LFB3
        .long   .LFE3-.LFB3
        .uleb128 0x0
        .align 8
.LEFDE3:
        .ident  "GCC: (GNU) 4.3.0 20080428 (Red Hat 4.3.0-8)"
        .section        .note.GNU-stack,"",@progbits
Comment 1 Martin Sebor 2009-09-12 23:33:46 UTC
Code involving bool variables is similarly suboptimal:

$ cat t.cpp && gcc -O2 -S t.cpp && cat t.s
bool foo (bool a, bool b) {
    return a && b || !a && !b;
}

bool bar (bool a, bool b) {
    return a == b;
}
        .file   "t.cpp"
        .text
        .p2align 4,,15
.globl _Z3foobb
        .type   _Z3foobb, @function
_Z3foobb:
.LFB0:
        .cfi_startproc
        .cfi_personality 0x3,__gxx_personality_v0
        movl    %esi, %edx
        movl    %esi, %eax
        xorl    $1, %edx
        testb   %dil, %dil
        cmove   %edx, %eax
        ret
        .cfi_endproc
.LFE0:
        .size   _Z3foobb, .-_Z3foobb
        .p2align 4,,15
.globl _Z3barbb
        .type   _Z3barbb, @function
_Z3barbb:
.LFB1:
        .cfi_startproc
        .cfi_personality 0x3,__gxx_personality_v0
        cmpb    %dil, %sil
        sete    %al
        ret
        .cfi_endproc
.LFE1:
        .size   _Z3barbb, .-_Z3barbb
        .ident  "GCC: (GNU) 4.4.1 20090725 (Red Hat 4.4.1-2)"
        .section        .note.GNU-stack,"",@progbits
Comment 2 Andrew Pinski 2012-02-01 22:37:56 UTC
Confirmed.
Comment 3 Steven Fuerst 2012-02-02 05:25:34 UTC
For the first pair of functions, this is even faster:

	neg %rdi
	sbb %eax, %eax
	neg %rsi
	adc $1, %eax
	and $1, %eax
	retq
Comment 4 Steven Fuerst 2012-02-02 06:11:27 UTC
Two more cases for simple boolean logic optimizations.

gcc-4.7 produces with -O3 for

int test_and(long long x, long long y)
{
	return x && y;
}

	test   %rsi, %rsi
	setne  %dl
	xor    %eax, %eax
	test   %rdi, %rdi
	setne  %al
	and    %edx, %eax
	retq

Whereas this is faster:

	neg %rdi
	sbb %rdi, %rdi
	xor %eax, %eax
	and %rsi, %rdi
	setne %al
	retq

Also

int test_other(long long x, long long y)
{
	return !x && y; /* or !(x || !y) */
}

gives

	test   %rsi,%rsi
	setne  %dl
	xor    %eax,%eax
	test   %rdi,%rdi
	sete   %al
	and    %edx,%eax
	retq

when
	sub $1, %rsi
	sbb %rsi, %rsi
	xor %eax, %eax
	or %rdi, %rsi
	sete %al
	retq
is faster.
Comment 5 Andrew Pinski 2021-06-03 02:48:07 UTC
For the original testcase with GCC 7, we get the same(similar enough) code gen for both functions now.
foo:
.LFB0:
        .cfi_startproc
        testq   %rdi, %rdi
        setne   %al
        testq   %rsi, %rsi
        sete    %dl
        xorl    %edx, %eax
        movzbl  %al, %eax
        ret
bar:
.LFB1:
        .cfi_startproc
        testq   %rdi, %rdi
        sete    %al
        testq   %rsi, %rsi
        setne   %dl
        xorl    %edx, %eax
        movzbl  %al, %eax
        ret
Comment 6 Andrew Pinski 2022-11-26 20:40:34 UTC
(In reply to Martin Sebor from comment #1)
> Code involving bool variables is similarly suboptimal:

The bool part was fixed for GCC 13 by r13-1779-g375668e0508fbe .

The rest is a target issue dealing with using sbb more.