[PATCH V2] Fix inaccuracy in cunroll/cunrolli when considering what's innermost loop.
liuhongt
hongtao.liu@intel.com
Tue Dec 10 02:58:02 GMT 2024
> Please pass 'sbitmap' instead of auto_sbitmap&, it should properly
> decay to that. Applies everywhere I think.
>
Changed.
> In fact I wonder whether we should simply populate the bitmap
> from a
>
> for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST))
> bitmap_set_bit (original_innermost, loop->num);
>
> loop instead and pass pass-is-cunrolli && bitmap_bit_p (..) as
> cunrolli down to try_unroll_loop_completely?
Changed, but still need to pass sbitmap from tree_unroll_loops_completely
down to canonicalize_induction_variables and tree_unroll_loops_completely_1
since "outer loop" could already be changed in later 2 callers.
BTW arm ci reported 2 regressed testcase so I added
* gcc.dg/tree-ssa/pr83403-1.c: Add --param max-completely-peeled-insns=300 for arm*-*-*.
* gcc.dg/tree-ssa/pr83403-2.c: Ditto.
For 32-bit arm, there're more stmts in the innermost loop,
and removal of the reduction prevents completely unrolling for them.
For aarch64, it looks fine.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?
r15-919-gef27b91b62c3aa removed 1 / 3 size reduction for innermost
loop, but it doesn't accurately remember what's "innermost" for 2
testcases in PR117888.
1) For pass_cunroll, the "innermost" loop could be an originally outer
loop with inner loop completely unrolled by cunrolli. The patch moves
local variable cunrolli to parameter of tree_unroll_loops_completely
and passes it directly from execute of the pass.
2) For pass_cunrolli, cunrolli is set to false when the sibling loop
of a innermost loop is completely unrolled, and it inaccurately
takes the innermost loop as an "outer" loop. The patch add another
paramter innermost to helps recognizing the "original" innermost loop.
gcc/ChangeLog:
PR tree-optimization/117888
* tree-ssa-loop-ivcanon.cc (try_unroll_loop_completely): Use
cunrolli instead of cunrolli && !loop->inner to check if it's
innermost loop.
(canonicalize_loop_induction_variables): Add new parameter
const_sbitmap innermost, and pass
cunrolli
&& (unsigned) loop->num < SBITMAP_SIZE (innermost)
&& bitmap_bit_p (innermost, loop->num) as "cunrolli" to
try_unroll_loop_completely
(canonicalize_induction_variables): Pass innermost to
canonicalize_loop_induction_variables.
(tree_unroll_loops_completely_1): Add new parameter
const_sbitmap innermost.
(tree_unroll_loops_completely): Move local variable cunrolli
to parameter to indicate it's from pass cunrolli, also track
all "original" innermost loop at the beginning.
gcc/testsuite/ChangeLog:
* gcc.dg/pr117888-2.c: New test.
* gcc.dg/vect/pr117888-1.c: Ditto.
* gcc.dg/tree-ssa/pr83403-1.c: Add
--param max-completely-peeled-insns=300 for arm*-*-*.
* gcc.dg/tree-ssa/pr83403-2.c: Ditto.
---
gcc/testsuite/gcc.dg/pr117888-2.c | 37 ++++++++++++
gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c | 1 +
gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c | 1 +
gcc/testsuite/gcc.dg/vect/pr117888-1.c | 71 +++++++++++++++++++++++
gcc/tree-ssa-loop-ivcanon.cc | 60 +++++++++++++------
5 files changed, 151 insertions(+), 19 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/pr117888-2.c
create mode 100644 gcc/testsuite/gcc.dg/vect/pr117888-1.c
diff --git a/gcc/testsuite/gcc.dg/pr117888-2.c b/gcc/testsuite/gcc.dg/pr117888-2.c
new file mode 100644
index 00000000000..97aa93d8ace
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr117888-2.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -funroll-loops -fno-tree-vectorize -fdump-tree-cunroll-details" } */
+
+typedef struct {
+ double real;
+ double imag;
+} complex;
+
+typedef struct { complex e[3][3]; } su3_matrix;
+
+void mult_su3_nn( su3_matrix *a, su3_matrix *b, su3_matrix *c )
+{
+ int i,j;
+ double t,ar,ai,br,bi,cr,ci;
+ for(i=0;i<3;i++)
+ for(j=0;j<3;j++){
+
+ ar=a->e[i][0].real; ai=a->e[i][0].imag;
+ br=b->e[0][j].real; bi=b->e[0][j].imag;
+ cr=ar*br; t=ai*bi; cr -= t;
+ ci=ar*bi; t=ai*br; ci += t;
+
+ ar=a->e[i][1].real; ai=a->e[i][1].imag;
+ br=b->e[1][j].real; bi=b->e[1][j].imag;
+ t=ar*br; cr += t; t=ai*bi; cr -= t;
+ t=ar*bi; ci += t; t=ai*br; ci += t;
+
+ ar=a->e[i][2].real; ai=a->e[i][2].imag;
+ br=b->e[2][j].real; bi=b->e[2][j].imag;
+ t=ar*br; cr += t; t=ai*bi; cr -= t;
+ t=ar*bi; ci += t; t=ai*br; ci += t;
+
+ c->e[i][j].real=cr;
+ c->e[i][j].imag=ci;
+ }
+}
+/* { dg-final { scan-tree-dump-times "optimized: loop with 2 iterations completely unrolled" 1 "cunroll" { target i?86-*-* x86_64-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c
index bfc703d1aa6..293fd2dbd97 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c
@@ -1,6 +1,7 @@
/* { dg-do compile } */
/* { dg-options "-O3 -funroll-loops -fdump-tree-lim2-details" } */
/* { dg-additional-options "--param max-completely-peeled-insns=200" { target { s390*-*-* } } } */
+/* { dg-additional-options "--param max-completely-peeled-insns=300" { target { arm*-*-* } } } */
#define TYPE unsigned int
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c
index 9130d9bd583..b421b387bca 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c
@@ -1,6 +1,7 @@
/* { dg-do compile } */
/* { dg-options "-O3 -funroll-loops -fdump-tree-lim2-details" } */
/* { dg-additional-options "--param max-completely-peeled-insns=200" { target { s390*-*-* } } } */
+/* { dg-additional-options "--param max-completely-peeled-insns=300" { target { arm*-*-* } } } */
#define TYPE int
diff --git a/gcc/testsuite/gcc.dg/vect/pr117888-1.c b/gcc/testsuite/gcc.dg/vect/pr117888-1.c
new file mode 100644
index 00000000000..4796a7c83c1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr117888-1.c
@@ -0,0 +1,71 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -funroll-loops -fdump-tree-vect-details" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_shift } */
+/* { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } } */
+/* { dg-additional-options "--param max-completely-peeled-insns=200" { target powerpc64*-*-* } } */
+
+typedef unsigned short ggml_fp16_t;
+static float table_f32_f16[1 << 16];
+
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+ unsigned short s;
+ __builtin_memcpy(&s, &f, sizeof(unsigned short));
+ return table_f32_f16[s];
+}
+
+typedef struct {
+ ggml_fp16_t d;
+ ggml_fp16_t m;
+ unsigned char qh[4];
+ unsigned char qs[32 / 2];
+} block_q5_1;
+
+typedef struct {
+ float d;
+ float s;
+ char qs[32];
+} block_q8_1;
+
+void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ const int qk = 32;
+ const int nb = n / qk;
+
+ const block_q5_1 * restrict x = vx;
+ const block_q8_1 * restrict y = vy;
+
+ float sumf = 0.0;
+
+ for (int i = 0; i < nb; i++) {
+ unsigned qh;
+ __builtin_memcpy(&qh, x[i].qh, sizeof(qh));
+
+ int sumi = 0;
+
+ if (qh) {
+ for (int j = 0; j < qk/2; ++j) {
+ const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
+ const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
+
+ const int x0 = (x[i].qs[j] & 0xF) | xh_0;
+ const int x1 = (x[i].qs[j] >> 4) | xh_1;
+
+ sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
+ }
+ }
+ else {
+ for (int j = 0; j < qk/2; ++j) {
+ const int x0 = (x[i].qs[j] & 0xF);
+ const int x1 = (x[i].qs[j] >> 4);
+
+ sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
+ }
+ }
+
+ sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
+ }
+
+ *s = sumf;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
index 9a94d82fc4e..451b09adc26 100644
--- a/gcc/tree-ssa-loop-ivcanon.cc
+++ b/gcc/tree-ssa-loop-ivcanon.cc
@@ -939,8 +939,7 @@ try_unroll_loop_completely (class loop *loop,
1) It could increase register pressure.
2) Big loop after completely unroll may not be vectorized
by BB vectorizer. */
- else if ((cunrolli && !loop->inner
- ? unr_insns : unr_insns - est_eliminated)
+ else if ((cunrolli ? unr_insns : unr_insns - est_eliminated)
> (unsigned) param_max_completely_peeled_insns)
{
if (dump_file && (dump_flags & TDF_DETAILS))
@@ -1248,7 +1247,9 @@ try_peel_loop (class loop *loop,
static bool
canonicalize_loop_induction_variables (class loop *loop,
bool create_iv, enum unroll_level ul,
- bool try_eval, bool allow_peel, bool cunrolli)
+ bool try_eval, bool allow_peel,
+ const_sbitmap innermost,
+ bool cunrolli)
{
edge exit = NULL;
tree niter;
@@ -1334,8 +1335,15 @@ canonicalize_loop_induction_variables (class loop *loop,
modified |= remove_redundant_iv_tests (loop);
dump_user_location_t locus = find_loop_location (loop);
+
+ bool innermost_cunrolli_p
+ = cunrolli
+ && (unsigned) loop->num < SBITMAP_SIZE (innermost)
+ && bitmap_bit_p (innermost, loop->num);
+
if (try_unroll_loop_completely (loop, exit, niter, may_be_zero, ul,
- maxiter, locus, allow_peel, cunrolli))
+ maxiter, locus, allow_peel,
+ innermost_cunrolli_p))
return true;
if (create_iv
@@ -1372,14 +1380,19 @@ canonicalize_induction_variables (void)
bool changed = false;
bool irred_invalidated = false;
bitmap loop_closed_ssa_invalidated = BITMAP_ALLOC (NULL);
+ auto_sbitmap innermost (number_of_loops (cfun));
+ bitmap_clear (innermost);
estimate_numbers_of_iterations (cfun);
for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
{
- changed |= canonicalize_loop_induction_variables (loop,
- true, UL_SINGLE_ITER,
- true, false, false);
+ changed
+ |= canonicalize_loop_induction_variables (loop,
+ true, UL_SINGLE_ITER,
+ true, false,
+ (const_sbitmap) innermost,
+ false);
}
gcc_assert (!need_ssa_update_p (cfun));
@@ -1413,7 +1426,8 @@ canonicalize_induction_variables (void)
static bool
tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
- bitmap father_bbs, class loop *loop, bool cunrolli)
+ bitmap father_bbs, class loop *loop,
+ const_sbitmap innermost, bool cunrolli)
{
class loop *loop_father;
bool changed = false;
@@ -1431,7 +1445,8 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
if (!child_father_bbs)
child_father_bbs = BITMAP_ALLOC (NULL);
if (tree_unroll_loops_completely_1 (may_increase_size, unroll_outer,
- child_father_bbs, inner, cunrolli))
+ child_father_bbs, inner,
+ innermost, cunrolli))
{
bitmap_ior_into (father_bbs, child_father_bbs);
bitmap_clear (child_father_bbs);
@@ -1477,7 +1492,8 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
ul = UL_NO_GROWTH;
if (canonicalize_loop_induction_variables
- (loop, false, ul, !flag_tree_loop_ivcanon, unroll_outer, cunrolli))
+ (loop, false, ul, !flag_tree_loop_ivcanon, unroll_outer,
+ innermost, cunrolli))
{
/* If we'll continue unrolling, we need to propagate constants
within the new basic blocks to fold away induction variable
@@ -1503,19 +1519,28 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
/* Unroll LOOPS completely if they iterate just few times. Unless
MAY_INCREASE_SIZE is true, perform the unrolling only if the
- size of the code does not increase. */
+ size of the code does not increase.
+ cunrolli is true when passs is cunrolli. */
static unsigned int
-tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
+tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer, bool cunrolli)
{
bitmap father_bbs = BITMAP_ALLOC (NULL);
bool changed;
int iteration = 0;
bool irred_invalidated = false;
- bool cunrolli = true;
+ auto_sbitmap innermost (number_of_loops (cfun));
+ bitmap_clear (innermost);
estimate_numbers_of_iterations (cfun);
+ /* Mark all innermost loop at the begining. */
+ for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
+ {
+ if (!loop->inner)
+ bitmap_set_bit (innermost, loop->num);
+ }
+
do
{
changed = false;
@@ -1530,14 +1555,11 @@ tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
changed = tree_unroll_loops_completely_1 (may_increase_size,
unroll_outer, father_bbs,
current_loops->tree_root,
+ (const_sbitmap) innermost,
cunrolli);
if (changed)
{
unsigned i;
- /* For the outer loop, considering that the inner loop is completely
- unrolled, it would expose more optimization opportunities, so it's
- better to keep 2/3 reduction of estimated unrolled size. */
- cunrolli = false;
unloop_loops (loops_to_unloop, loops_to_unloop_nunroll,
edges_to_remove, loop_closed_ssa_invalidated,
@@ -1697,7 +1719,7 @@ pass_complete_unroll::execute (function *fun)
re-peeling the same loop multiple times. */
if (flag_peel_loops)
peeled_loops = BITMAP_ALLOC (NULL);
- unsigned int val = tree_unroll_loops_completely (flag_cunroll_grow_size, true);
+ unsigned int val = tree_unroll_loops_completely (flag_cunroll_grow_size, true, false);
if (peeled_loops)
{
BITMAP_FREE (peeled_loops);
@@ -1753,7 +1775,7 @@ pass_complete_unrolli::execute (function *fun)
if (number_of_loops (fun) > 1)
{
scev_initialize ();
- ret = tree_unroll_loops_completely (optimize >= 3, false);
+ ret = tree_unroll_loops_completely (optimize >= 3, false, true);
scev_finalize ();
}
loop_optimizer_finalize ();
--
2.34.1
More information about the Gcc-patches
mailing list