This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCH] 2015-07-29 Benedikt Huber <benedikt dot huber at theobroma-systems dot com> Philipp Tomsich <philipp dot tomsich at theobroma-systems dot com>
- From: Kyrill Tkachov <kyrylo dot tkachov at arm dot com>
- To: Benedikt Huber <benedikt dot huber at theobroma-systems dot com>, "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>
- Cc: "philipp dot tomsich at theobroma-systems dot com" <philipp dot tomsich at theobroma-systems dot com>, "Venkataramanan dot Kumar at amd dot com" <Venkataramanan dot Kumar at amd dot com>, "pinskia at gmail dot com" <pinskia at gmail dot com>, "e dot menezes at samsung dot com" <e dot menezes at samsung dot com>
- Date: Thu, 30 Jul 2015 13:31:24 +0100
- Subject: Re: [PATCH] 2015-07-29 Benedikt Huber <benedikt dot huber at theobroma-systems dot com> Philipp Tomsich <philipp dot tomsich at theobroma-systems dot com>
- Authentication-results: sourceware.org; auth=none
- References: <1438192065-29602-1-git-send-email-benedikt dot huber at theobroma-systems dot com> <1438192065-29602-2-git-send-email-benedikt dot huber at theobroma-systems dot com>
Hi Benedikt,
On 29/07/15 18:47, Benedikt Huber wrote:
* config/aarch64/aarch64-builtins.c: Builtins for rsqrt and
rsqrtf.
* config/aarch64/aarch64-opts.h: -mrecip has a default value
depending on the core.
* config/aarch64/aarch64-protos.h: Declare.
* config/aarch64/aarch64-simd.md: Matching expressions for
frsqrte and frsqrts.
* config/aarch64/aarch64.c: New functions. Emit rsqrt
estimation code in fast math mode.
* config/aarch64/aarch64.md: Added enum entries.
* config/aarch64/aarch64.opt: Added options -mrecip and
-mlow-precision-recip-sqrt.
* testsuite/gcc.target/aarch64/rsqrt-asm-check.c: Assembly scans
for frsqrte and frsqrts
* testsuite/gcc.target/aarch64/rsqrt.c: Functional tests for rsqrt.
<snip>
+
/* Expand an expression EXP that calls a built-in function,
with result going to TARGET if that's convenient. */
rtx
@@ -1139,6 +1213,13 @@ aarch64_expand_builtin (tree exp,
else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= AARCH64_CRC32_BUILTIN_MAX)
return aarch64_crc32_expand_builtin (fcode, exp, target);
+ if (fcode == AARCH64_BUILTIN_RSQRT_DF ||
+ fcode == AARCH64_BUILTIN_RSQRT_SF ||
+ fcode == AARCH64_BUILTIN_RSQRT_V2DF ||
+ fcode == AARCH64_BUILTIN_RSQRT_V2SF ||
+ fcode == AARCH64_BUILTIN_RSQRT_V4SF)
+ return aarch64_expand_builtin_rsqrt (fcode, exp, target);
+
The style guidelines are to put the '||' on a new line like so:
(fcode == AARCH64_BUILTIN_RSQRT_DF
|| fcode == AARCH64_BUILTIN_RSQRT_SF
|| ...
gcc_unreachable ();
}
@@ -1296,6 +1377,28 @@ aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in)
return NULL_TREE;
}
+tree
+aarch64_builtin_rsqrt (unsigned int fn, bool md_fn)
+{
+ if (md_fn)
+ {
+ if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2df)
+ return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF];
+ if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2sf)
+ return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF];
+ if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv4sf)
+ return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF];
+ }
+ else
+ {
+ if (fn == BUILT_IN_SQRT)
+ return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF];
+ if (fn == BUILT_IN_SQRTF)
+ return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF];
+ }
+ return NULL_TREE;
+}
+
#undef VAR1
#define VAR1(T, N, MAP, A) \
case AARCH64_SIMD_BUILTIN_##T##_##N##A:
diff --git a/gcc/config/aarch64/aarch64-opts.h b/gcc/config/aarch64/aarch64-opts.h
index 24bfd9f..75e9c67 100644
--- a/gcc/config/aarch64/aarch64-opts.h
+++ b/gcc/config/aarch64/aarch64-opts.h
@@ -64,4 +64,11 @@ enum aarch64_code_model {
AARCH64_CMODEL_LARGE
};
+/* Each core can have -mrecip enabled or disabled by default. */
+enum aarch64_mrecip {
+ AARCH64_MRECIP_OFF = 0,
+ AARCH64_MRECIP_ON,
+ AARCH64_MRECIP_DEFAULT,
+};
+
#endif
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 4062c27..dabe345 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -197,6 +197,7 @@ struct tune_params
int min_div_recip_mul_sf;
int min_div_recip_mul_df;
unsigned int extra_tuning_flags;
+ bool mrecip_default_enabled;
};
Please create a new field to put into the extra_tuning_flags.
This involves adding an entry aarch64-tuning-flags.def.
This way we avoid overpopulating the tuning structure with a multitude of
on/off boolean options.
#define AARCH64_FUSION_PAIR(x, name, index) \
@@ -321,6 +322,8 @@ void aarch64_print_operand (FILE *, rtx, char);
void aarch64_print_operand_address (FILE *, rtx);
void aarch64_emit_call_insn (rtx);
+void aarch64_emit_swrsqrt (rtx, rtx);
+
/* Initialize builtins for SIMD intrinsics. */
void init_aarch64_simd_builtins (void);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index b90f938..ac8e12d 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -353,6 +353,33 @@
[(set_attr "type" "neon_fp_mul_d_scalar_q")]
)
+(define_insn "*rsqrte_<mode>"
+ [(set (match_operand:VALLF 0 "register_operand" "=w")
+ (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+ UNSPEC_RSQRTE))]
+ "TARGET_SIMD"
+ "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
+ [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
+
+(define_insn "*rsqrts_<mode>"
+ [(set (match_operand:VALLF 0 "register_operand" "=w")
+ (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
+ (match_operand:VALLF 2 "register_operand" "w")]
+ UNSPEC_RSQRTS))]
+ "TARGET_SIMD"
+ "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
+ [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")])
+
+(define_expand "rsqrt_<mode>"
+ [(set (match_operand:VALLF 0 "register_operand" "=w")
+ (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+ UNSPEC_RSQRT))]
+ "TARGET_SIMD"
+{
+ aarch64_emit_swrsqrt (operands[0], operands[1]);
+ DONE;
+})
Please add the number of operands to the pattern names, for example:
"rsqrte_<mode>2"
+
(define_insn "*aarch64_mul3_elt_to_64v2df"
[(set (match_operand:DF 0 "register_operand" "=w")
(mult:DF
<snip>
/* Support for fine-grained override of the tuning structures. */
@@ -6961,6 +6967,66 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
return aarch64_tune_params.memmov_cost;
}
+extern tree aarch64_builtin_rsqrt (unsigned int fn, bool md_fn);
+
+static tree
+aarch64_builtin_reciprocal (unsigned int fn,
+ bool md_fn,
+ bool)
+{
+ if (!flag_finite_math_only ||
+ flag_trapping_math ||
+ !flag_unsafe_math_optimizations ||
+ optimize_size ||
+ flag_mrecip == AARCH64_MRECIP_OFF ||
+ (flag_mrecip == AARCH64_MRECIP_DEFAULT &&
+ !aarch64_tune_params.mrecip_default_enabled))
Similar comment on the placement of the '||'.
+ {
+ return NULL_TREE;
+ }
+
+ return aarch64_builtin_rsqrt (fn, md_fn);
+}
+
+void
+aarch64_emit_swrsqrt (rtx dst, rtx src)
+{
+ enum machine_mode mode = GET_MODE (src);
+ gcc_assert (
+ mode == SFmode || mode == V2SFmode || mode == V4SFmode ||
+ mode == DFmode || mode == V2DFmode);
+
+ rtx xsrc = gen_reg_rtx (mode);
+ emit_move_insn (xsrc, src);
+ rtx x0 = gen_reg_rtx (mode);
+ emit_set_insn (x0,
+ gen_rtx_UNSPEC (mode, gen_rtvec (1, xsrc), UNSPEC_RSQRTE));
+
+ bool double_mode = (mode == DFmode || mode == V2DFmode);
+
+ int iterations = 2;
+ if (double_mode)
+ iterations = 3;
+
+ if (flag_mrecip_low_precision_sqrt)
+ iterations--;
+
+ for (int i = 0; i < iterations; ++i)
+ {
+ rtx x1 = gen_reg_rtx (mode);
+ rtx x2 = gen_reg_rtx (mode);
+ rtx x3 = gen_reg_rtx (mode);
+ emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
+ emit_set_insn (x3, gen_rtx_UNSPEC (
+ mode, gen_rtvec (2, xsrc, x2), UNSPEC_RSQRTS));
+ emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
+ x0 = x1;
You can probably avoid manually creating these RTXes and risking
mis-constructing them (or someone modifying them by mistake in the future)
by just calling the gen_* functions for the patterns you want to generate.
For example, instead of doing:
+ emit_set_insn (x3, gen_rtx_UNSPEC (
+ mode, gen_rtvec (2, xsrc, x2), UNSPEC_RSQRTS));
You can probably just do:
emit_insn (gen_rsqrts_{sf,df}2 (x3, x2));
This will require you to rename the patterns in aarch64.md to not have a '*'
in their name.
+
+ emit_move_insn (dst, x0);
+ return;
+}
+
/* Return the number of instructions that can be issued per cycle. */
static int
aarch64_sched_issue_rate (void)
@@ -12099,6 +12165,9 @@ aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
+#undef TARGET_BUILTIN_RECIPROCAL
+#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
+
#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 1e343fa..d7944b2 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -122,6 +122,9 @@
UNSPEC_VSTRUCTDUMMY
UNSPEC_SP_SET
UNSPEC_SP_TEST
+ UNSPEC_RSQRT
+ UNSPEC_RSQRTE
+ UNSPEC_RSQRTS
])
(define_c_enum "unspecv" [
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index 98ef9f6..581dc21 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -124,3 +124,11 @@ Enum(aarch64_abi) String(ilp32) Value(AARCH64_ABI_ILP32)
EnumValue
Enum(aarch64_abi) String(lp64) Value(AARCH64_ABI_LP64)
+
+mrecip
+Common Report Var(flag_mrecip) Optimization Init(AARCH64_MRECIP_DEFAULT)
+Generate software reciprocal square root for better throughput. Default with fast-math.
s/fast-math/-ffast-math/.
Also, this is not the default with -ffast-math. The default now
depends on the tuning core, right?
Thanks,
Kyrill