[PATCH 00/10] vect: Reuse reduction accumulators between loops
Richard Sandiford
richard.sandiford@arm.com
Thu Jul 8 12:38:20 GMT 2021
Quoting from the final patch in the series:
------------------------------------------------------------------------
This patch adds support for reusing a main loop's reduction accumulator
in an epilogue loop. This in turn lets the loops share a single piece
of vector->scalar reduction code.
The patch has the following restrictions:
(1) The epilogue reduction can only operate on a single vector
(e.g. ncopies must be 1 for non-SLP reductions, and the group size
must be <= the element count for SLP reductions).
(2) Both loops must use the same vector mode for their accumulators.
This means that the patch is restricted to targets that support
--param vect-partial-vector-usage=1.
(3) The reduction must be a standard “tree code” reduction.
However, these restrictions could be lifted in future. For example,
if the main loop operates on 128-bit vectors and the epilogue loop
operates on 64-bit vectors, we could in future reduce the 128-bit
vector by one stage and use the 64-bit result as the starting point
for the epilogue result.
The patch tries to handle chained SLP reductions, unchained SLP
reductions and non-SLP reductions. It also handles cases in which
the epilogue loop is entered directly (rather than via the main loop)
and cases in which the epilogue loop can be skipped.
------------------------------------------------------------------------
However, it ended up being difficult to implement the accumulator reuse
without some preparatory clean-ups. Some of them could probably stand on
their own, but others are a bit “meh” without the final patch to justify them.
The diff below shows the effect of the patch when compiling:
unsigned short __attribute__((noipa))
add_loop (unsigned short *x, int n)
{
unsigned short res = 0;
for (int i = 0; i < n; ++i)
res += x[i];
return res;
}
with -O3 --param vect-partial-vector-usage=1 on an SVE target:
add_loop: add_loop:
.LFB0: .LFB0:
.cfi_startproc .cfi_startproc
mov x4, x0 <
cmp w1, 0 cmp w1, 0
ble .L7 ble .L7
cnth x0 | cnth x4
sub w2, w1, #1 sub w2, w1, #1
sub w3, w0, #1 | sub w3, w4, #1
cmp w2, w3 cmp w2, w3
bcc .L8 bcc .L8
sub w0, w1, w0 | sub w4, w1, w4
mov x3, 0 mov x3, 0
cnth x5 cnth x5
mov z0.b, #0 mov z0.b, #0
ptrue p0.b, all ptrue p0.b, all
.p2align 3,,7 .p2align 3,,7
.L4: .L4:
ld1h z1.h, p0/z, [x4, x3, | ld1h z1.h, p0/z, [x0, x3,
mov x2, x3 mov x2, x3
add x3, x3, x5 add x3, x3, x5
add z0.h, z0.h, z1.h add z0.h, z0.h, z1.h
cmp w0, w3 | cmp w4, w3
bcs .L4 bcs .L4
uaddv d0, p0, z0.h <
umov w0, v0.h[0] <
inch x2 inch x2
and w0, w0, 65535 <
cmp w1, w2 cmp w1, w2
beq .L2 | beq .L6
.L3: .L3:
sub w1, w1, w2 sub w1, w1, w2
mov z1.b, #0 | add x2, x0, w2, uxtw 1
whilelo p0.h, wzr, w1 whilelo p0.h, wzr, w1
add x2, x4, w2, uxtw 1 | ld1h z1.h, p0/z, [x2]
ptrue p1.b, all | add z0.h, p0/m, z0.h, z1.
ld1h z0.h, p0/z, [x2] | .L6:
sel z0.h, p0, z0.h, z1.h | ptrue p0.b, all
uaddv d0, p1, z0.h | uaddv d0, p0, z0.h
fmov x1, d0 | umov w0, v0.h[0]
add w0, w0, w1, uxth <
and w0, w0, 65535 and w0, w0, 65535
.L2: <
ret ret
.p2align 2,,3 .p2align 2,,3
.L7: .L7:
mov w0, 0 mov w0, 0
ret ret
.L8: .L8:
mov w2, 0 mov w2, 0
mov w0, 0 | mov z0.b, #0
b .L3 b .L3
.cfi_endproc .cfi_endproc
Kewen, could you give this a spin on Power 10 to see whether it
works/helps there? I've attached a combined diff.
Series tested on aarch64-linux-gnu and x86_64-linux-gnu.
Richard
-------------- next part --------------
A non-text attachment was scrubbed...
Name: combined.diff
Type: text/x-diff
Size: 87151 bytes
Desc: not available
URL: <https://gcc.gnu.org/pipermail/gcc-patches/attachments/20210708/9bdc5921/attachment-0001.bin>
More information about the Gcc-patches
mailing list