improved RTL-level if conversion using scratchpads [half-hammock edition]

Abe abe_skolnik@yahoo.com
Wed Nov 11 17:01:00 GMT 2015


> I don't see how a three-state property for a single MEM is necessary or helpful

I guess I could coalesce those two callees into one callee that still returns only
a bool, but I was trying not to make gratuitous changes to the existing code.


> I think performance numbers are a fairly important part of a submission like this

Understood and agreed.

That having been said, I have already analyzed the assembly code that results from
my new if conversion, and it is clear that the conversion sometimes allows other
GCC passes to do a better job because the code in question becomes one big basic
block; before the conversion, those passes were "nervous" and therefore skipped
optimizations they would otherwise have performed, because they could not prove
the correctness of the transformation across the branch.


> where the transformation isn't an obvious improvement
> (as opposed to removing an instruction or suchlike).

If conversion can indirectly lead to the removal of _several_ instructions
due to the unification of basic blocks and the removal of labels: once the
labels are gone, other passes can see that there is no way [barring a
malfunction or human tampering, e.g. via a debugger or a security exploit]
for control flow to enter in the middle of the region and invalidate their
liveness assumptions.
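
For concreteness, here is a rough source-level sketch of what the half-hammock
conversion does to a single guarded store.  The real pass works on RTL and
allocates the scratchpad on the stack, so the names below are purely
illustrative:

static long long scratchpad;   /* absorbs the store when the condition is false */

static void half_hammock_converted(long long *dest, long long cond,
                                   long long value) {

   /* "if (cond)  *dest = value;" becomes an unconditional store whose
      target address is chosen branchlessly [a "csel" on AArch64].  */
   long long *target = cond ? dest : &scratchpad;
   *target = value;

}

Because the store now always executes, there is no branch left for the
scheduler and the other passes to have to reason across.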

I will paste in the source code for a simple torture test I wrote in order to check
the operation of the new scratchpad-allocation algorithm, as well as the AArch64
[64-bit ARM] assembly code generated with and without my work.  Without my adding
any other optimizations, GCC [at "-O3" in both cases] did much better with the
conversion than without it when compiling code with a repeated constant.  The
scheduler was also much freer to hoist loads and sink stores, filling in
otherwise-empty "bubbles" in the CPU pipeline and thereby spending machine cycles
on beneficial work instead of idling while data is fetched from main RAM after a
cache miss.

Of note: for the test-case source code shown below, with my new if conversion
GCC does a _great_ job of reusing the value 127 across integer-size boundaries,
i.e. it exploits the fact that the 64-bit value 127 has the 32-bit value 127 as its
lower 32 bits, and so on.  In the original test case, which has the assignments in
the opposite order, GCC fails to reuse the constant across integer-size boundaries
even with my new if conversion, which probably indicates room for improvement in
other optimization passes; in fact, it redundantly loads the 32-bit value 127 three
times.  A coworker of mine said this last part is probably a sign of a bug in GCC.
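
For clarity, the original-order test case mentioned above is simply the same five
guarded stores with the assignments going from the smallest type to the largest,
i.e. roughly the following [the function name here is just for illustration;
the reversed-order variant that produced the listings is the one pasted after
my sign-off]:

char       C[9];  short S[9];  int I[9];  long L[9];  long long LL[9];

void half_hammock_torture_original_order() {

   if ( C[1])   C[2] = 127;
   if ( S[1])   S[2] = 127;
   if ( I[1])   I[2] = 127;
   if ( L[1])   L[2] = 127;
   if (LL[1])  LL[2] = 127;

}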

Of course, when the conditional branches in question are not very predictable
and the data upon which they depend is frequently not in cache, the if conversion
is an even bigger win than it already is from eliminating instructions alone.

The code I mentioned above follows my sign-off.

Sincerely,

Abe










char       C[9];
short      S[9];
int        I[9];
long       L[9];
long long LL[9];


/* Each guarded store below is a "half hammock": an "if" whose then-arm is
   a single store and which has no else-arm.  */
void half_hammock_torture() {

   if (LL[1])  LL[2] = 127;
   if ( L[1])   L[2] = 127;
   if ( I[1])   I[2] = 127;
   if ( S[1])   S[2] = 127;
   if ( C[1])   C[2] = 127;

}


First, the assembly _without_ the new if conversion [note the conditional branches]:


	.file	"spad-allocation-algorithm_torture_test___reversed_order.c"
	.text
	.align	2
	.align	3
	.global	half_hammock_torture
	.arch armv8-a+fp+simd
	//.tune generic
	.type	half_hammock_torture, %function
half_hammock_torture:
	adrp	x0, LL
	add	x0, x0, :lo12:LL
	ldr	x1, [x0, 8]
	cbz	x1, .L2
	mov	x1, 127
	str	x1, [x0, 16]
.L2:
	adrp	x0, L
	add	x0, x0, :lo12:L
	ldr	x1, [x0, 8]
	cbz	x1, .L3
	mov	x1, 127
	str	x1, [x0, 16]
.L3:
	adrp	x0, I
	add	x0, x0, :lo12:I
	ldr	w1, [x0, 4]
	cbz	w1, .L4
	mov	w1, 127
	str	w1, [x0, 8]
.L4:
	adrp	x0, S
	add	x0, x0, :lo12:S
	ldrsh	w1, [x0, 2]
	cbz	w1, .L5
	mov	w1, 127
	strh	w1, [x0, 4]
.L5:
	adrp	x0, C
	add	x0, x0, :lo12:C
	ldrb	w1, [x0, 1]
	cbz	w1, .L1
	mov	w1, 127
	strb	w1, [x0, 2]
.L1:
	ret
	.size	half_hammock_torture, .-half_hammock_torture
	.comm	LL,72,8
	.comm	L,72,8
	.comm	I,36,8
	.comm	S,18,8
	.comm	C,9,8
	.ident	"GCC: (GNU) 6.0.0 20151001 (experimental)"
	.section	.note.GNU-stack,"",%progbits


Second, the assembly _with_ the new if conversion [note the "csel"s and the
stack scratchpad addressed via "sp"]:


	.file	"spad-allocation-algorithm_torture_test___reversed_order.c"
	.text
	.align	2
	.align	3
	.global	half_hammock_torture
	.arch armv8-a+fp+simd
	//.tune generic
	.type	half_hammock_torture, %function
half_hammock_torture:
	adrp	x3, LL
	adrp	x2, L
	add	x3, x3, :lo12:LL
	add	x2, x2, :lo12:L
	adrp	x1, I
	adrp	x0, S
	add	x1, x1, :lo12:I
	add	x0, x0, :lo12:S
	ldr	x6, [x3, 8]
	sub	sp, sp, #16		// allocate the 16-byte stack scratchpad
	ldr	x5, [x2, 8]
	mov	x4, sp			// x4 = address of the scratchpad
	ldr	w7, [x1, 4]
	cmp	x6, xzr
	add	x3, x3, 16
	ldrsh	w6, [x0, 2]
	csel	x3, x4, x3, eq		// if LL[1] == 0, store to the scratchpad instead of &LL[2]
	add	x2, x2, 16
	cmp	x5, xzr
	add	x1, x1, 8
	mov	x5, 127
	csel	x2, x4, x2, eq
	cmp	w7, wzr
	add	x0, x0, 4
	csel	x1, x4, x1, eq
	cmp	w6, wzr
	str	x5, [x3]
	csel	x0, x4, x0, eq
	str	x5, [x2]
	adrp	x2, C
	str	w5, [x1]
	add	x1, x2, :lo12:C
	strh	w5, [x0]
	add	x0, x1, 2
	ldrb	w1, [x1, 1]
	cmp	w1, wzr
	csel	x4, x4, x0, eq
	strb	w5, [x4]
	add	sp, sp, 16
	ret
	.size	half_hammock_torture, .-half_hammock_torture
	.comm	LL,72,8
	.comm	L,72,8
	.comm	I,36,8
	.comm	S,18,8
	.comm	C,9,8
	.ident	"GCC: (GNU) 6.0.0 20151001 (experimental)"
	.section	.note.GNU-stack,"",%progbits


