This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: improved RTL-level if conversion using scratchpads [half-hammock edition]
- From: Abe <abe_skolnik at yahoo dot com>
- To: Bernd Schmidt <bschmidt at redhat dot com>
- Cc: Sebastian Pop <sebpop at gmail dot com>, Kyrill Tkachov <kyrylo dot tkachov at arm dot com>, "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>
- Date: Wed, 11 Nov 2015 11:01:27 -0600
- Subject: Re: improved RTL-level if conversion using scratchpads [half-hammock edition]
- Authentication-results: sourceware.org; auth=none
- References: <563BE9A7 dot 30803 at yahoo dot com> <563C8748 dot 3040901 at redhat dot com> <563CDCAF dot 20703 at yahoo dot com> <563CE5C2 dot 5060409 at redhat dot com> <56426336 dot 1020409 at yahoo dot com> <56435830 dot 9070808 at redhat dot com>
I don't see how a three-state property for a single MEM is necessary or helpful
I guess I could coalesce those two callees into one callee that still returns only
a bool, but I was trying not to make gratuitous changes to the existing code.
> I think performance numbers are a fairly important part of a submission like this
Understood and agreed.
That having been said, I have already analyzed the assembly code that results from
my new if conversion, and it is clear that sometimes doing the conversion allows
other GCC passes to do a better job because the code in question is now one big
basic block; before that change the other passes in question were "nervous",
and therefore did not do the optimization that they otherwise would have done,
because they were unable to prove the correctness of the transformation.
> where the transformation isn't an obvious improvement
> (as opposed to removing an instruction or suchlike).
If_conversion can indirectly lead to the removal of _several_ instructions
due to the unification of basic blocks and the removal of labels,
such that other passes can see that there is no way [barring a malfunction
or human tampering e.g. via a debugger or a security exploit] for control
flow to enter in the middle and invalidate liveness assumptions.
I will paste in my source code for a simple torture test I wrote in order to check
the operation of the new scratchpad allocation algorithm, as well as the AArch64
[64-bit ARM] assembly code with and without my work. Without adding any other
optimizations myself, GCC [at "-O3" in both] did much better with the conversion
than without it at compiling code with a repeated constant. The scheduler was
also much more free to hoist loads and sink stores, thereby filling in
otherwise-empty "bubbles" in the CPU pipeline, thereby using machine cycles for
beneficial work instead of wasting those same cycles sitting around doing nothing
while waiting for data to be fetched from main RAM because it is not in cache.
Of note, for the test-case source code shown below, with my new if conversion
GCC is doing a _great_ job of re-using the value 127 across integer-size boundaries,
i.e. using the fact that the 64-bit value 127 has the 32-bit value 127 as its lower
32 bits, etc. In the original test case, which has the assignments in opposite order,
GCC fails to do so across integer-size boundaries even with my new if conversion,
which probably indicates room for improvement in other optimization passes.
In fact, it even redundantly loads the 32-bit value 127 three times.
A coworker of mine said this last part is probably a sign of a bug in GCC.
Of course, if/when the conditional branches in question are not very predictable
and the data upon which they depend is frequently not in cache, then the
if conversion is an even bigger win than it is just by eliminating instructions.
The code I mentioned above follows my sign-off.
Sincerely,
Abe
/* Global test arrays: nine elements each, one array per integer type
   (char, short, int, long, long long).  The function below only touches
   elements [1] and [2] of each array. */
char C[9];
short S[9];
int I[9];
long L[9];
long long LL[9];
/* Test case: five independent conditional stores, each an `if` with no
   `else`.  For every array, element [2] is set to 127 when element [1]
   is nonzero.  The same constant (127) is stored at five different
   integer widths, widest type first and narrowest last.
   Takes no arguments and returns nothing; its only effect is on the
   global arrays. */
void half_hammock_torture() {
if (LL[1]) LL[2] = 127;
if ( L[1]) L[2] = 127;
if ( I[1]) I[2] = 127;
if ( S[1]) S[2] = 127;
if ( C[1]) C[2] = 127;
}
// AArch64 (GNU as syntax) listing of half_hammock_torture: branch-based form.
// Each conditional store is guarded by a cbz that jumps over the store
// when the tested element is zero, so the function has five branches.
// NOTE(review): presumably this is the "without my work" output referred
// to in the email text -- confirm against the original message.
//
// Register use: x0 = base address of the current array,
//               x1/w1 = tested element, then the constant 127.
.file "spad-allocation-algorithm_torture_test___reversed_order.c"
.text
.align 2
.align 3
.global half_hammock_torture
.arch armv8-a+fp+simd
//.tune generic
.type half_hammock_torture, %function
half_hammock_torture:
// if (LL[1]) LL[2] = 127;   (8-byte elements: offsets 8 and 16)
adrp x0, LL
add x0, x0, :lo12:LL
ldr x1, [x0, 8]
cbz x1, .L2
mov x1, 127
str x1, [x0, 16]
.L2:
// if (L[1]) L[2] = 127;     (8-byte elements: offsets 8 and 16)
adrp x0, L
add x0, x0, :lo12:L
ldr x1, [x0, 8]
cbz x1, .L3
mov x1, 127
str x1, [x0, 16]
.L3:
// if (I[1]) I[2] = 127;     (4-byte elements: offsets 4 and 8)
adrp x0, I
add x0, x0, :lo12:I
ldr w1, [x0, 4]
cbz w1, .L4
mov w1, 127
str w1, [x0, 8]
.L4:
// if (S[1]) S[2] = 127;     (2-byte elements: offsets 2 and 4; sign-extending load)
adrp x0, S
add x0, x0, :lo12:S
ldrsh w1, [x0, 2]
cbz w1, .L5
mov w1, 127
strh w1, [x0, 4]
.L5:
// if (C[1]) C[2] = 127;     (1-byte elements: offsets 1 and 2)
adrp x0, C
add x0, x0, :lo12:C
ldrb w1, [x0, 1]
cbz w1, .L1
mov w1, 127
strb w1, [x0, 2]
.L1:
ret
.size half_hammock_torture, .-half_hammock_torture
// Common symbols for the five arrays: name, size in bytes, alignment.
// Sizes are 9 elements x element size (8, 8, 4, 2, 1 bytes).
.comm LL,72,8
.comm L,72,8
.comm I,36,8
.comm S,18,8
.comm C,9,8
.ident "GCC: (GNU) 6.0.0 20151001 (experimental)"
.section .note.GNU-stack,"",%progbits
// AArch64 (GNU as syntax) listing of half_hammock_torture: branch-free form.
// No conditional branches: every store is executed unconditionally, and a
// csel picks its destination -- the real array element when the tested
// element is nonzero, or a 16-byte scratch area on the stack when it is zero.
// NOTE(review): presumably this is the "with my work" (scratchpad
// if-conversion) output referred to in the email text -- confirm.
//
// Register use:
//   x3, x2, x1, x0 = addresses in LL, L, I, S (base, later element [2]
//                    or the scratch address after csel)
//   x4             = address of the stack scratch area
//   x5/w5          = loaded L[1] first, then the constant 127, which is
//                    materialized once and reused for all five stores
//   x6, w7, w6, w1 = loaded LL[1], I[1], S[1], C[1]
.file "spad-allocation-algorithm_torture_test___reversed_order.c"
.text
.align 2
.align 3
.global half_hammock_torture
.arch armv8-a+fp+simd
//.tune generic
.type half_hammock_torture, %function
half_hammock_torture:
// Form the base addresses of LL, L, I and S up front.
adrp x3, LL
adrp x2, L
add x3, x3, :lo12:LL
add x2, x2, :lo12:L
adrp x1, I
adrp x0, S
add x1, x1, :lo12:I
add x0, x0, :lo12:S
ldr x6, [x3, 8]         // x6 = LL[1]
sub sp, sp, #16         // reserve 16 bytes of stack for the scratch area
ldr x5, [x2, 8]         // x5 = L[1]
mov x4, sp              // x4 = scratch address
ldr w7, [x1, 4]         // w7 = I[1]
cmp x6, xzr             // LL[1] == 0 ?
add x3, x3, 16          // x3 = &LL[2]  (add does not alter the flags)
ldrsh w6, [x0, 2]       // w6 = S[1]; x6 is reused after the compare above
csel x3, x4, x3, eq     // x3 = LL[1] == 0 ? scratch : &LL[2]
add x2, x2, 16          // x2 = &L[2]
cmp x5, xzr             // L[1] == 0 ?
add x1, x1, 8           // x1 = &I[2]
mov x5, 127             // x5 now holds the constant; L[1] was already compared
csel x2, x4, x2, eq     // x2 = L[1] == 0 ? scratch : &L[2]
cmp w7, wzr             // I[1] == 0 ?
add x0, x0, 4           // x0 = &S[2]
csel x1, x4, x1, eq     // x1 = I[1] == 0 ? scratch : &I[2]
cmp w6, wzr             // S[1] == 0 ?
str x5, [x3]            // 64-bit store of 127 to LL[2] or scratch
csel x0, x4, x0, eq     // x0 = S[1] == 0 ? scratch : &S[2]
str x5, [x2]            // 64-bit store of 127 to L[2] or scratch
adrp x2, C
str w5, [x1]            // 32-bit store (low half of x5) to I[2] or scratch
add x1, x2, :lo12:C     // x1 = &C
strh w5, [x0]           // 16-bit store to S[2] or scratch
add x0, x1, 2           // x0 = &C[2]
ldrb w1, [x1, 1]        // w1 = C[1]
cmp w1, wzr             // C[1] == 0 ?
csel x4, x4, x0, eq     // x4 = C[1] == 0 ? scratch : &C[2]
strb w5, [x4]           // 8-bit store to C[2] or scratch
add sp, sp, 16          // release the scratch area
ret
.size half_hammock_torture, .-half_hammock_torture
// Common symbols for the five arrays: name, size in bytes, alignment.
// Sizes are 9 elements x element size (8, 8, 4, 2, 1 bytes).
.comm LL,72,8
.comm L,72,8
.comm I,36,8
.comm S,18,8
.comm C,9,8
.ident "GCC: (GNU) 6.0.0 20151001 (experimental)"
.section .note.GNU-stack,"",%progbits