This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [SVE] PR86753
- From: Prathamesh Kulkarni <prathamesh dot kulkarni at linaro dot org>
- To: Richard Sandiford <richard dot sandiford at arm dot com>
- Cc: Richard Biener <richard dot guenther at gmail dot com>, gcc Patches <gcc-patches at gcc dot gnu dot org>
- Date: Mon, 9 Sep 2019 22:06:46 +0530
- Subject: Re: [SVE] PR86753
- References: <CAAgBjMnTn_3hiTxFPW-=QBp3=Pq0oCx1OhUbdRKwN9bWkGQ_UQ@mail.gmail.com> <CAAgBjM=usye_qrYR58sKxjFdSJwD4hEiZ_2FbF48NPMHQKyupg@mail.gmail.com> <CAFiYyc3ksU2GMcZ9vytPx-4DUhxxhva7mCci4ZKog+KdcAh8Bg@mail.gmail.com> <CAAgBjMm=_L9VoE3mDhFAtemz7_2MDbRiY-10=9yb=7GX9=ZOuA@mail.gmail.com> <mptsgpsxd6j.fsf@arm.com> <CAAgBjMkX786wQ2CgtBOTTY_ejD1Zp=KfmAnT08NFkDtNe3ZLJA@mail.gmail.com> <mptwof4vujx.fsf@arm.com> <CAAgBjMntcR5nBs96aD6_FJPGSjiCJVGsbGEk_WBxhrhgPAOQBA@mail.gmail.com> <CAFiYyc10BkstnK2f6X91S3JGZUX5QDz5zF_WDEiJdLxRRFEvwQ@mail.gmail.com> <CAAgBjM=0MN8j_uC2pp7+S6C2BfQArvtW6u2WDug2pEA4hUAzmw@mail.gmail.com> <mptblwbszdr.fsf@arm.com> <CAFiYyc2YRrjG3onU9BuEFvYCy6Mo=o=9XPdSNFEaswebV0xe_A@mail.gmail.com> <mpt8srests5.fsf@arm.com> <CAAgBjM=oZjX18-DkRXxPpU0U2Yx3iJwL0NkPN91L52yg4b7PSg@mail.gmail.com> <mpt1rx6r4tz.fsf@arm.com> <CAAgBjMkL7LKekotcEFzDPej-wvoOAZ+e4EF1J_1qyYWs_B1RsA@mail.gmail.com> <mpt1rx5psqw.fsf@arm.com> <CAFiYyc3FJMy67yFfkBaJuKXuKFrsBOAM3EVn+GSYARYbarHuQg@mail.gmail.com> <CAAgBjMknbQBT-CU-We_mZak5Yn+GLpQqbYs4+pkxLNO57kMh4A@mail.gmail.com> <mpt7e6nce43.fsf@arm.com> <CAAgBjM=BNtAkcj__EhBWqd5FYZgQJiMO_gR4aq49ixdE=_aAJQ@mail.gmail.com> <mpth85laffc.fsf@arm.com>
On Mon, 9 Sep 2019 at 16:45, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> > With patch, the only following FAIL remains for aarch64-sve.exp:
> > FAIL: gcc.target/aarch64/sve/cond_unary_2.c -march=armv8.2-a+sve
> > scan-assembler-times \\tmovprfx\\t 6
> > which now contains 14.
> > Should I adjust the test, assuming the change isn't a regression ?
>
> Well, it is kind-of a regression, but it really just means that the
> integer code is now consistent with the floating-point code in having
> an unnecessary MOVPRFX. So I think adjusting the count is fine.
> Presumably any future fix for the existing redundant MOVPRFXs will
> apply to the new ones as well.
>
> The patch looks good to me, just some very minor nits:
>
> > @@ -8309,11 +8309,12 @@ vect_double_mask_nunits (tree type)
> >
> > /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
> > contain a sequence of NVECTORS masks that each control a vector of type
> > - VECTYPE. */
> > + VECTYPE. SCALAR_MASK if non-null, represents the mask used for corresponding
> > + load/store stmt. */
>
> Should be two spaces between sentences. Maybe:
>
> VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
> these vector masks with the vector version of SCALAR_MASK. */
>
> since the mask isn't necessarily for a load or store statement.
>
> > [...]
> > @@ -1879,7 +1879,8 @@ static tree permute_vec_elements (tree, tree, tree, stmt_vec_info,
> > says how the load or store is going to be implemented and GROUP_SIZE
> > is the number of load or store statements in the containing group.
> > If the access is a gather load or scatter store, GS_INFO describes
> > - its arguments.
> > + its arguments. SCALAR_MASK is the scalar mask used for corresponding
> > + load or store stmt.
>
> Maybe:
>
> its arguments. If the load or store is conditional, SCALAR_MASK is the
> condition under which it occurs.
>
> since SCALAR_MASK can be null here too.
>
> > [...]
> > @@ -9975,6 +9978,31 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
> > /* Handle cond expr. */
> > for (j = 0; j < ncopies; j++)
> > {
> > + tree loop_mask = NULL_TREE;
> > + bool swap_cond_operands = false;
> > +
> > + if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> > + {
> > + scalar_cond_masked_key cond (cond_expr, ncopies);
> > + if (loop_vinfo->scalar_cond_masked_set.contains (cond))
> > + {
> > + vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
> > + loop_mask = vect_get_loop_mask (gsi, masks, ncopies, vectype, j);
> > + }
> > + else
> > + {
> > + cond.code = invert_tree_comparison (cond.code,
> > + HONOR_NANS (TREE_TYPE (cond.op0)));
>
> Long line. Maybe just split it out into a separate assignment:
>
> bool honor_nans = HONOR_NANS (TREE_TYPE (cond.op0));
> cond.code = invert_tree_comparison (cond.code, honor_nans);
>
> > + if (loop_vinfo->scalar_cond_masked_set.contains (cond))
> > + {
> > + vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
> > + loop_mask = vect_get_loop_mask (gsi, masks, ncopies, vectype, j);
>
> Long line here too.
>
> > [...]
> > @@ -10090,6 +10121,26 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
> > }
> > }
> > }
> > +
> > + if (loop_mask)
> > + {
> > + if (COMPARISON_CLASS_P (vec_compare))
> > + {
> > + tree tmp = make_ssa_name (vec_cmp_type);
> > + gassign *g = gimple_build_assign (tmp,
> > + TREE_CODE (vec_compare),
> > + TREE_OPERAND (vec_compare, 0),
> > +						    TREE_OPERAND (vec_compare, 1));
>
> Two long lines.
>
> > + vect_finish_stmt_generation (stmt_info, g, gsi);
> > + vec_compare = tmp;
> > + }
> > +
> > + tree tmp2 = make_ssa_name (vec_cmp_type);
> > + gassign *g = gimple_build_assign (tmp2, BIT_AND_EXPR, vec_compare, loop_mask);
>
> Long line here too.
>
> > [...]
> > diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
> > index dc181524744..c4b2d8e8647 100644
> > --- a/gcc/tree-vectorizer.c
> > +++ b/gcc/tree-vectorizer.c
> > @@ -1513,3 +1513,39 @@ make_pass_ipa_increase_alignment (gcc::context *ctxt)
> > {
> > return new pass_ipa_increase_alignment (ctxt);
> > }
> > +
> > +/* If code(T) is comparison op or def of comparison stmt,
> > + extract it's operands.
> > + Else return <NE_EXPR, T, 0>. */
> > +
> > +void
> > +scalar_cond_masked_key::get_cond_ops_from_tree (tree t)
> > +{
> > + if (TREE_CODE_CLASS (TREE_CODE (t)) == tcc_comparison)
> > + {
> > + this->code = TREE_CODE (t);
> > + this->op0 = TREE_OPERAND (t, 0);
> > + this->op1 = TREE_OPERAND (t, 1);
> > + return;
> > + }
> > +
> > + if (TREE_CODE (t) == SSA_NAME)
> > + {
> > + gassign *stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (t));
> > + if (stmt)
> > + {
>
> Might as well do this as:
>
> if (TREE_CODE (t) == SSA_NAME)
> if (gassign *stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (t)))
> {
>
> The patch (as hoped) introduces some XPASSes:
>
> XPASS: gcc.target/aarch64/sve/cond_cnot_2.c scan-assembler-not \\tsel\\t
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmge\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, #0\\.0\\n 21
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmge\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, z[0-9]+\\.d\\n 42
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmge\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, #0\\.0\\n 15
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmge\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, z[0-9]+\\.s\\n 30
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmgt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, #0\\.0\\n 21
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmgt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, z[0-9]+\\.d\\n 42
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmgt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, #0\\.0\\n 15
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmgt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, z[0-9]+\\.s\\n 30
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmle\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, #0\\.0\\n 21
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmle\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, z[0-9]+\\.d\\n 42
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmle\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, #0\\.0\\n 15
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmle\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, z[0-9]+\\.s\\n 30
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmlt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, #0\\.0\\n 21
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmlt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, z[0-9]+\\.d\\n 42
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmlt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, #0\\.0\\n 15
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmlt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, z[0-9]+\\.s\\n 30
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmuo\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, z[0-9]+\\.d\\n 252
> XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times \\tfcmuo\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, z[0-9]+\\.s\\n 180
> XPASS: gcc.target/aarch64/sve/vcond_5.c scan-assembler-times \\tfcmge\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, #0\\.0 21
> XPASS: gcc.target/aarch64/sve/vcond_5.c scan-assembler-times \\tfcmge\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, z[0-9]+\\.d 42
> XPASS: gcc.target/aarch64/sve/vcond_5.c scan-assembler-times \\tfcmge\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, #0\\.0 15
> XPASS: gcc.target/aarch64/sve/vcond_5.c scan-assembler-times \\tfcmge\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, z[0-9]+\\.s 30
> XPASS: gcc.target/aarch64/sve/vcond_5.c scan-assembler-times \\tfcmle\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, #0\\.0 21
> XPASS: gcc.target/aarch64/sve/vcond_5.c scan-assembler-times \\tfcmle\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, z[0-9]+\\.d 42
> XPASS: gcc.target/aarch64/sve/vcond_5.c scan-assembler-times \\tfcmle\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, #0\\.0 15
> XPASS: gcc.target/aarch64/sve/vcond_5.c scan-assembler-times \\tfcmle\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, z[0-9]+\\.s 30
>
> Could you remove the associated xfails (and comments above them where
> appropriate)?
>
> OK with those changes from my POV, but please give Richi a day or so
> to object.
>
> Thanks for doing this.
Thanks for the suggestions, I have updated the patch accordingly.
Bootstrap+test in progress on x86_64-unknown-linux-gnu and aarch64-linux-gnu.
Richi, does the patch look OK to you ?
Thanks,
Prathamesh
>
> Richard
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c
index d689e21dc11..3df2431be38 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c
@@ -32,4 +32,4 @@ TEST_ALL (DEF_LOOP)
/* { dg-final { scan-assembler-not {\tmov\tz} } } */
/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
/* Currently we canonicalize the ?: so that !b[i] is the "false" value. */
-/* { dg-final { scan-assembler-not {\tsel\t} { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c
index 69468eb69be..d2ffcc758f3 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c
@@ -11,7 +11,10 @@
INT_TYPE *__restrict pred, int n) \
{ \
for (int i = 0; i < n; ++i) \
- r[i] = pred[i] ? (FLOAT_TYPE) a[i] : b[i]; \
+ { \
+ FLOAT_TYPE bi = b[i]; \
+ r[i] = pred[i] ? (FLOAT_TYPE) a[i] : bi; \
+ } \
}
#define TEST_ALL(T) \
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c
index 55b535fa0cf..d55aef0bb9a 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c
@@ -11,7 +11,10 @@
INT_TYPE *__restrict pred, int n) \
{ \
for (int i = 0; i < n; ++i) \
- r[i] = pred[i] ? (INT_TYPE) a[i] : b[i]; \
+ { \
+ INT_TYPE bi = b[i]; \
+ r[i] = pred[i] ? (INT_TYPE) a[i] : bi; \
+ } \
}
#define TEST_ALL(T) \
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c
index adf828398bb..68a9d2c3b6c 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c
@@ -13,7 +13,10 @@
TYPE *__restrict pred, int n) \
{ \
for (int i = 0; i < n; ++i) \
- r[i] = pred[i] ? OP (a[i]) : b[i]; \
+ { \
+ TYPE bi = b[i]; \
+ r[i] = pred[i] ? OP (a[i]) : bi; \
+ } \
}
#define TEST_INT_TYPE(T, TYPE) \
@@ -57,5 +60,5 @@ TEST_ALL (DEF_LOOP)
/* At the moment we don't manage to avoid using MOVPRFX for the
floating-point functions. */
/* { dg-final { scan-assembler-not {\tmovprfx\t} { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tmovprfx\t} 6 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\t} 14 } } */
/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c
index 5c04bcdb3f5..a1b0667dab5 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c
@@ -15,5 +15,9 @@ f (double *restrict a, double *restrict b, double *restrict c,
}
}
-/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
+/* See https://gcc.gnu.org/ml/gcc-patches/2019-08/msg01644.html
+ for XFAILing the below test. */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
/* { dg-final { scan-assembler-not {\tfmad\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c
index 00d84760a19..b38f23e87ba 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c
@@ -98,24 +98,24 @@ TEST_CMP (nugt)
/* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
/* 5 for lt, 5 for ult and 5 for nult. */
-/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */
/* 5 for le, 5 for ule and 5 for nule. */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */
/* 5 for gt, 5 for ugt and 5 for nugt. */
-/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */
/* 5 for ge, 5 for uge and 5 for nuge. */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */
/* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} } } */
/* 3 loops * 5 invocations for all 12 unordered comparisons. */
-/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 180 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 180 } } */
/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 7 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 14 { xfail *-*-* } } } */
@@ -123,19 +123,19 @@ TEST_CMP (nugt)
/* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */
-/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */
/* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} } } */
/* 3 loops * 5 invocations, with 2 invocations having ncopies == 2,
for all 12 unordered comparisons. */
-/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 252 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 252 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c
index 23bfb7b2649..2f16fbff522 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c
@@ -19,16 +19,16 @@
/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 40 { xfail *-*-* } } } */
/* 5 for le, 5 for ule and 5 for nule. */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 } } */
/* 5 for gt, 5 for ugt, 5 for nueq and 5 for nugt. */
/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 20 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 40 { xfail *-*-* } } } */
/* 5 for ge, 5 for uge and 5 for nuge. */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 } } */
/* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} } } */
/* 3 loops * 5 invocations for ordered, unordered amd ueq. */
@@ -43,14 +43,14 @@
/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 28 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 56 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 } } */
/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 28 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 56 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 } } */
/* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} } } */
/* 3 loops * 5 invocations, with 2 invocations having ncopies == 2,
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index b0cbbac0cb5..acd8d67d2a1 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -7197,7 +7197,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
}
else
vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
- vectype_in);
+ vectype_in, NULL);
}
if (dump_enabled_p ()
&& reduction_type == FOLD_LEFT_REDUCTION)
@@ -8110,7 +8110,7 @@ vectorizable_live_operation (stmt_vec_info stmt_info,
gcc_assert (ncopies == 1 && !slp_node);
vect_record_loop_mask (loop_vinfo,
&LOOP_VINFO_MASKS (loop_vinfo),
- 1, vectype);
+ 1, vectype, NULL);
}
}
return true;
@@ -8309,11 +8309,12 @@ vect_double_mask_nunits (tree type)
/* Record that a fully-masked version of LOOP_VINFO would need MASKS to
contain a sequence of NVECTORS masks that each control a vector of type
- VECTYPE. */
+ VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
+ these vector masks with the vector version of SCALAR_MASK. */
void
vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
- unsigned int nvectors, tree vectype)
+ unsigned int nvectors, tree vectype, tree scalar_mask)
{
gcc_assert (nvectors != 0);
if (masks->length () < nvectors)
@@ -8329,6 +8330,12 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
rgm->max_nscalars_per_iter = nscalars_per_iter;
rgm->mask_type = build_same_sized_truth_vector_type (vectype);
}
+
+ if (scalar_mask)
+ {
+ scalar_cond_masked_key cond (scalar_mask, nvectors);
+ loop_vinfo->scalar_cond_masked_set.add (cond);
+ }
}
/* Given a complete set of masks MASKS, extract mask number INDEX
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index dd9d45a9547..a5d4902e140 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1879,7 +1879,8 @@ static tree permute_vec_elements (tree, tree, tree, stmt_vec_info,
says how the load or store is going to be implemented and GROUP_SIZE
is the number of load or store statements in the containing group.
If the access is a gather load or scatter store, GS_INFO describes
- its arguments.
+ its arguments. If the load or store is conditional, SCALAR_MASK is the
+ condition under which it occurs.
Clear LOOP_VINFO_CAN_FULLY_MASK_P if a fully-masked loop is not
supported, otherwise record the required mask types. */
@@ -1888,7 +1889,7 @@ static void
check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
vec_load_store_type vls_type, int group_size,
vect_memory_access_type memory_access_type,
- gather_scatter_info *gs_info)
+ gather_scatter_info *gs_info, tree scalar_mask)
{
/* Invariant loads need no special support. */
if (memory_access_type == VMAT_INVARIANT)
@@ -1912,7 +1913,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
return;
}
unsigned int ncopies = vect_get_num_copies (loop_vinfo, vectype);
- vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype);
+ vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask);
return;
}
@@ -1936,7 +1937,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
return;
}
unsigned int ncopies = vect_get_num_copies (loop_vinfo, vectype);
- vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype);
+ vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask);
return;
}
@@ -1974,7 +1975,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
unsigned int nvectors;
if (can_div_away_from_zero_p (group_size * vf, nunits, &nvectors))
- vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype);
+ vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask);
else
gcc_unreachable ();
}
@@ -3436,7 +3437,9 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
unsigned int nvectors = (slp_node
? SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)
: ncopies);
- vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_out);
+ tree scalar_mask = gimple_call_arg (stmt_info->stmt, mask_opno);
+ vect_record_loop_mask (loop_vinfo, masks, nvectors,
+ vectype_out, scalar_mask);
}
return true;
}
@@ -7390,7 +7393,7 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
if (loop_vinfo
&& LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
check_load_store_masking (loop_vinfo, vectype, vls_type, group_size,
- memory_access_type, &gs_info);
+ memory_access_type, &gs_info, mask);
STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
vect_model_store_cost (stmt_info, ncopies, rhs_dt, memory_access_type,
@@ -8637,7 +8640,7 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
if (loop_vinfo
&& LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
check_load_store_masking (loop_vinfo, vectype, VLS_LOAD, group_size,
- memory_access_type, &gs_info);
+ memory_access_type, &gs_info, mask);
STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
vect_model_load_cost (stmt_info, ncopies, memory_access_type,
@@ -9975,6 +9978,32 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
/* Handle cond expr. */
for (j = 0; j < ncopies; j++)
{
+ tree loop_mask = NULL_TREE;
+ bool swap_cond_operands = false;
+
+ if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ {
+ scalar_cond_masked_key cond (cond_expr, ncopies);
+ if (loop_vinfo->scalar_cond_masked_set.contains (cond))
+ {
+ vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+ loop_mask = vect_get_loop_mask (gsi, masks, ncopies, vectype, j);
+ }
+ else
+ {
+ bool honor_nans = HONOR_NANS (TREE_TYPE (cond.op0));
+ cond.code = invert_tree_comparison (cond.code, honor_nans);
+ if (loop_vinfo->scalar_cond_masked_set.contains (cond))
+ {
+ vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+ loop_mask = vect_get_loop_mask (gsi, masks, ncopies,
+ vectype, j);
+ cond_code = cond.code;
+ swap_cond_operands = true;
+ }
+ }
+ }
+
stmt_vec_info new_stmt_info = NULL;
if (j == 0)
{
@@ -10052,6 +10081,9 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
vec_then_clause = vec_oprnds2[i];
vec_else_clause = vec_oprnds3[i];
+ if (swap_cond_operands)
+ std::swap (vec_then_clause, vec_else_clause);
+
if (masked)
vec_compare = vec_cond_lhs;
else
@@ -10090,6 +10122,28 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
}
}
}
+
+ if (loop_mask)
+ {
+ if (COMPARISON_CLASS_P (vec_compare))
+ {
+ tree tmp = make_ssa_name (vec_cmp_type);
+ tree op0 = TREE_OPERAND (vec_compare, 0);
+ tree op1 = TREE_OPERAND (vec_compare, 1);
+ gassign *g = gimple_build_assign (tmp,
+ TREE_CODE (vec_compare),
+ op0, op1);
+ vect_finish_stmt_generation (stmt_info, g, gsi);
+ vec_compare = tmp;
+ }
+
+ tree tmp2 = make_ssa_name (vec_cmp_type);
+ gassign *g = gimple_build_assign (tmp2, BIT_AND_EXPR,
+ vec_compare, loop_mask);
+ vect_finish_stmt_generation (stmt_info, g, gsi);
+ vec_compare = tmp2;
+ }
+
if (reduction_type == EXTRACT_LAST_REDUCTION)
{
if (!is_gimple_val (vec_compare))
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index dc181524744..e44eb3dda07 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -1513,3 +1513,36 @@ make_pass_ipa_increase_alignment (gcc::context *ctxt)
{
return new pass_ipa_increase_alignment (ctxt);
}
+
+/* If code(T) is comparison op or def of comparison stmt,
+   extract its operands.
+ Else return <NE_EXPR, T, 0>. */
+
+void
+scalar_cond_masked_key::get_cond_ops_from_tree (tree t)
+{
+ if (TREE_CODE_CLASS (TREE_CODE (t)) == tcc_comparison)
+ {
+ this->code = TREE_CODE (t);
+ this->op0 = TREE_OPERAND (t, 0);
+ this->op1 = TREE_OPERAND (t, 1);
+ return;
+ }
+
+ if (TREE_CODE (t) == SSA_NAME)
+ if (gassign *stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (t)))
+ {
+ tree_code code = gimple_assign_rhs_code (stmt);
+ if (TREE_CODE_CLASS (code) == tcc_comparison)
+ {
+ this->code = code;
+ this->op0 = gimple_assign_rhs1 (stmt);
+ this->op1 = gimple_assign_rhs2 (stmt);
+ return;
+ }
+ }
+
+ this->code = NE_EXPR;
+ this->op0 = t;
+ this->op1 = build_zero_cst (TREE_TYPE (t));
+}
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 1456cde4c2c..e20a61ee33f 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -26,6 +26,7 @@ typedef class _stmt_vec_info *stmt_vec_info;
#include "tree-data-ref.h"
#include "tree-hash-traits.h"
#include "target.h"
+#include "hash-set.h"
/* Used for naming of new temporaries. */
enum vect_var_kind {
@@ -174,7 +175,71 @@ public:
#define SLP_TREE_TWO_OPERATORS(S) (S)->two_operators
#define SLP_TREE_DEF_TYPE(S) (S)->def_type
+struct scalar_cond_masked_key
+{
+ scalar_cond_masked_key (tree t, unsigned ncopies_)
+ : ncopies (ncopies_)
+ {
+ get_cond_ops_from_tree (t);
+ }
+
+ void get_cond_ops_from_tree (tree);
+
+ unsigned ncopies;
+ tree_code code;
+ tree op0;
+ tree op1;
+};
+template<>
+struct default_hash_traits<scalar_cond_masked_key>
+{
+ typedef scalar_cond_masked_key compare_type;
+ typedef scalar_cond_masked_key value_type;
+
+ static inline hashval_t
+ hash (value_type v)
+ {
+ inchash::hash h;
+ h.add_int (v.code);
+ inchash::add_expr (v.op0, h, 0);
+ inchash::add_expr (v.op1, h, 0);
+ h.add_int (v.ncopies);
+ return h.end ();
+ }
+
+ static inline bool
+ equal (value_type existing, value_type candidate)
+ {
+ return (existing.ncopies == candidate.ncopies
+ && existing.code == candidate.code
+ && operand_equal_p (existing.op0, candidate.op0, 0)
+ && operand_equal_p (existing.op1, candidate.op1, 0));
+ }
+
+ static inline void
+ mark_empty (value_type &v)
+ {
+ v.ncopies = 0;
+ }
+
+ static inline bool
+ is_empty (value_type v)
+ {
+ return v.ncopies == 0;
+ }
+
+ static inline void mark_deleted (value_type &) {}
+
+ static inline bool is_deleted (const value_type &)
+ {
+ return false;
+ }
+
+ static inline void remove (value_type &) {}
+};
+
+typedef hash_set<scalar_cond_masked_key> scalar_cond_masked_set_type;
/* Describes two objects whose addresses must be unequal for the vectorized
loop to be valid. */
@@ -255,6 +320,9 @@ public:
/* Cost data used by the target cost model. */
void *target_cost_data;
+ /* Set of scalar conditions that have loop mask applied. */
+ scalar_cond_masked_set_type scalar_cond_masked_set;
+
private:
stmt_vec_info new_stmt_vec_info (gimple *stmt);
void set_vinfo_for_stmt (gimple *, stmt_vec_info);
@@ -1617,7 +1685,7 @@ extern void vect_gen_vector_loop_niters (loop_vec_info, tree, tree *,
extern tree vect_halve_mask_nunits (tree);
extern tree vect_double_mask_nunits (tree);
extern void vect_record_loop_mask (loop_vec_info, vec_loop_masks *,
- unsigned int, tree);
+ unsigned int, tree, tree);
extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
unsigned int, tree, unsigned int);