[PATCH 00/10] vect: Reuse reduction accumulators between loops

Richard Sandiford <richard.sandiford@arm.com>
Thu Jul 8 12:38:20 GMT 2021


Quoting from the final patch in the series:

------------------------------------------------------------------------
This patch adds support for reusing a main loop's reduction accumulator
in an epilogue loop.  This in turn lets the loops share a single piece
of vector->scalar reduction code.

The patch has the following restrictions:

(1) The epilogue reduction can only operate on a single vector
    (e.g. ncopies must be 1 for non-SLP reductions, and the group size
    must be <= the element count for SLP reductions).

(2) Both loops must use the same vector mode for their accumulators.
    This means that the patch is restricted to targets that support
    --param vect-partial-vector-usage=1.

(3) The reduction must be a standard “tree code” reduction.

However, these restrictions could be lifted in future.  For example,
if the main loop operates on 128-bit vectors and the epilogue loop
operates on 64-bit vectors, we could in future reduce the 128-bit
vector by one stage and use the 64-bit result as the starting point
for the epilogue result.

The patch tries to handle chained SLP reductions, unchained SLP
reductions and non-SLP reductions.  It also handles cases in which
the epilogue loop is entered directly (rather than via the main loop)
and cases in which the epilogue loop can be skipped.
------------------------------------------------------------------------
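
To make the kinds of reduction mentioned above concrete, here is a
small C sketch of the different shapes; the function names are purely
illustrative and are not taken from the patch:

  /* Non-SLP reduction: one scalar accumulator, one statement per
     iteration.  The operation is a plain arithmetic tree code
     (PLUS_EXPR), so restriction (3) is satisfied.  */
  unsigned short
  sum (unsigned short *x, int n)
  {
    unsigned short res = 0;
    for (int i = 0; i < n; ++i)
      res += x[i];
    return res;
  }

  /* Chained SLP reduction: several statements feed the same
     accumulator, forming a reduction chain.  */
  unsigned short
  sum_chained (unsigned short *x, int n)
  {
    unsigned short res = 0;
    for (int i = 0; i < n; i += 2)   /* n assumed even for brevity.  */
      {
	res += x[i];
	res += x[i + 1];
      }
    return res;
  }

  /* Unchained SLP reduction: independent accumulators that the
     vectorizer can keep in separate lanes of a single vector.  */
  unsigned short
  sum_unchained (unsigned short *x, int n)
  {
    unsigned short res0 = 0, res1 = 0;
    for (int i = 0; i < n; i += 2)   /* n assumed even for brevity.  */
      {
	res0 += x[i];
	res1 += x[i + 1];
      }
    return res0 + res1;
  }

  /* A conditional reduction like this one is not a "tree code"
     reduction, so restriction (3) excludes it.  */
  int
  last_pos (unsigned short *x, int n)
  {
    int last = -1;
    for (int i = 0; i < n; ++i)
      if (x[i] != 0)
	last = i;
    return last;
  }

The single-stage 128-bit to 64-bit reduction mentioned above could
look roughly like the following Advanced SIMD sketch (again purely
illustrative, since the patch does not implement this):

  #include <arm_neon.h>

  /* Reduce a 128-bit accumulator by one stage, giving a 64-bit
     vector that a 64-bit epilogue loop could continue to
     accumulate into.  */
  static inline uint16x4_t
  halve_accumulator (uint16x8_t acc128)
  {
    return vadd_u16 (vget_low_u16 (acc128), vget_high_u16 (acc128));
  }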

However, it ended up being difficult to implement that final patch
without some preparatory clean-ups.  Some of those clean-ups could
probably stand on their own, but others are a bit “meh” without the
final patch to justify them.

The diff below shows the effect of the patch when compiling:

  unsigned short __attribute__((noipa))
  add_loop (unsigned short *x, int n)
  {
    unsigned short res = 0;
    for (int i = 0; i < n; ++i)
      res += x[i];
    return res;
  }

with -O3 --param vect-partial-vector-usage=1 on an SVE target:

add_loop:				add_loop:
.LFB0:					.LFB0:
	.cfi_startproc				.cfi_startproc
	mov	x4, x0		      <
	cmp	w1, 0				cmp	w1, 0
	ble	.L7				ble	.L7
	cnth	x0		      |		cnth	x4
	sub	w2, w1, #1			sub	w2, w1, #1
	sub	w3, w0, #1	      |		sub	w3, w4, #1
	cmp	w2, w3				cmp	w2, w3
	bcc	.L8				bcc	.L8
	sub	w0, w1, w0	      |		sub	w4, w1, w4
	mov	x3, 0				mov	x3, 0
	cnth	x5				cnth	x5
	mov	z0.b, #0			mov	z0.b, #0
	ptrue	p0.b, all			ptrue	p0.b, all
	.p2align 3,,7				.p2align 3,,7
.L4:					.L4:
	ld1h	z1.h, p0/z, [x4, x3,  |		ld1h	z1.h, p0/z, [x0, x3, 
	mov	x2, x3				mov	x2, x3
	add	x3, x3, x5			add	x3, x3, x5
	add	z0.h, z0.h, z1.h		add	z0.h, z0.h, z1.h
	cmp	w0, w3		      |		cmp	w4, w3
	bcs	.L4				bcs	.L4
	uaddv	d0, p0, z0.h	      <
	umov	w0, v0.h[0]	      <
	inch	x2				inch	x2
	and	w0, w0, 65535	      <
	cmp	w1, w2				cmp	w1, w2
	beq	.L2		      |		beq	.L6
.L3:					.L3:
	sub	w1, w1, w2			sub	w1, w1, w2
	mov	z1.b, #0	      |		add	x2, x0, w2, uxtw 1
	whilelo	p0.h, wzr, w1			whilelo	p0.h, wzr, w1
	add	x2, x4, w2, uxtw 1    |		ld1h	z1.h, p0/z, [x2]
	ptrue	p1.b, all	      |		add	z0.h, p0/m, z0.h, z1.
	ld1h	z0.h, p0/z, [x2]      |	.L6:
	sel	z0.h, p0, z0.h, z1.h  |		ptrue	p0.b, all
	uaddv	d0, p1, z0.h	      |		uaddv	d0, p0, z0.h
	fmov	x1, d0		      |		umov	w0, v0.h[0]
	add	w0, w0, w1, uxth      <
	and	w0, w0, 65535			and	w0, w0, 65535
.L2:				      <
	ret					ret
	.p2align 2,,3				.p2align 2,,3
.L7:					.L7:
	mov	w0, 0				mov	w0, 0
	ret					ret
.L8:					.L8:
	mov	w2, 0				mov	w2, 0
	mov	w0, 0		      |		mov	z0.b, #0
	b	.L3				b	.L3
	.cfi_endproc				.cfi_endproc
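
In C terms, the right-hand (patched) column behaves roughly like the
SVE ACLE sketch below.  This is only a model of the generated code:
the vectorizer works on gimple rather than on intrinsics, and the
function name is illustrative.

  #include <arm_sve.h>

  unsigned short
  add_loop_model (unsigned short *x, int n)
  {
    svuint16_t acc = svdup_n_u16 (0);   /* mov z0.b, #0 */
    svbool_t all = svptrue_b16 ();      /* ptrue p0.b, all */
    int step = (int) svcnth ();
    int i = 0;

    /* Main loop: full, unpredicated vectors.  */
    for (; n - i >= step; i += step)
      acc = svadd_u16_m (all, acc, svld1_u16 (all, x + i));

    /* Epilogue: a single partial vector merged into the *same*
       accumulator under a predicate; this is the new
       "add z0.h, p0/m, z0.h, z1.h" in the right-hand column.  */
    if (i < n)
      {
	svbool_t pg = svwhilelt_b16_u32 ((uint32_t) i, (uint32_t) n);
	acc = svadd_u16_m (pg, acc, svld1_u16 (pg, x + i));
      }

    /* One shared vector->scalar reduction (the single uaddv).  */
    return (unsigned short) svaddv_u16 (all, acc);
  }

The left-hand (unpatched) column instead zeroes a separate epilogue
accumulator (the mov z1.b, #0 and sel), so it needs a second uaddv
and a scalar add to combine the two partial results.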

Kewen, could you give this a spin on Power 10 to see whether it
works/helps there?  I've attached a combined diff.

Series tested on aarch64-linux-gnu and x86_64-linux-gnu.

Richard


-------------- next part --------------
A non-text attachment was scrubbed...
Name: combined.diff
Type: text/x-diff
Size: 87151 bytes
Desc: not available
URL: <https://gcc.gnu.org/pipermail/gcc-patches/attachments/20210708/9bdc5921/attachment-0001.bin>
