This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH][AArch64][1/2] Add fmul-by-power-of-2+fcvt optimisation
- From: Kyrill Tkachov <kyrylo dot tkachov at arm dot com>
- To: GCC Patches <gcc-patches at gcc dot gnu dot org>
- Cc: Marcus Shawcroft <marcus dot shawcroft at arm dot com>, Richard Earnshaw <Richard dot Earnshaw at arm dot com>, James Greenhalgh <james dot greenhalgh at arm dot com>
- Date: Mon, 19 Oct 2015 14:57:07 +0100
- Subject: [PATCH][AArch64][1/2] Add fmul-by-power-of-2+fcvt optimisation
- Authentication-results: sourceware.org; auth=none
Hi all,
The fcvtzs and fcvtzu instructions have a form where they convert to a fixed-point form with a specified number of
fractional bits. In practice this has the effect of multiplying the floating point argument by 2^<number of fractional bits>
and then converting the result to integer. We can exploit that behaviour during combine to eliminate a floating-point multiplication
by an FP immediate that is a power of 2, i.e. 4.0, 8.0, 16.0 etc.
For example, for the code:
int
sffoo1 (float a)
{
return a * 4.0f;
}
we currently generate:
sffoo1:
fmov s1, 4.0e+0
fmul s0, s0, s1
fcvtzs w0, s0
ret
with this patch we can generate:
sffoo1:
fcvtzs w0, s0, #2
ret
We already perform the analogous combination for the arm target (see the *combine_vcvtf2i pattern in config/arm/vfp.md).
In addition, this patch implements the fcvtzu form, i.e. the unsigned_fix form, as well as the vector forms.
However, not everything is rosy. The code:
int
foo (float a)
{
return a * 32.0f;
}
will not trigger the optimisation because 32.0f is stored in the constant pool and due to a deficiency in
simplify-rtx.c the simplification doesn't get through. I have a patch to fix that as part 2/2.
Also, for code:
int
foo (float a)
{
return a * 2.0f;
}
This gets folded early on as a + a and thus emits an fadd instruction followed by an fcvtzs.
There is nothing we can do about that (in this patch at least).
I've seen this trigger once in 453.povray in SPEC2006 and one other time in 435.gromacs after
patch 2/2 is applied. I've heard this can also trigger in codec-like codebases and I did see it
trigger a few times in ffmpeg.
Bootstrapped and tested on aarch64.
Ok for trunk?
Thanks,
Kyrill
2015-10-19 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* config/aarch64/aarch64.md
(*aarch64_fcvt<su_optab><GPF:mode><GPI:mode>2_mult): New pattern.
* config/aarch64/aarch64-simd.md
(*aarch64_fcvt<su_optab><VDQF:mode><fcvt_target>2_mult): Likewise.
* config/aarch64/aarch64.c (aarch64_rtx_costs): Handle above patterns.
(aarch64_fpconst_pow_of_2): New function.
(aarch64_vec_fpconst_pow_of_2): Likewise.
* config/aarch64/aarch64-protos.h (aarch64_fpconst_pow_of_2): Declare
prototype.
(aarch64_vec_fpconst_pow_of_2): Likewise.
* config/aarch64/predicates.md (aarch64_fp_pow2): New predicate.
(aarch64_fp_vec_pow2): Likewise.
2015-10-19 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* gcc.target/aarch64/fmul_fcvt_1.c: New test.
* gcc.target/aarch64/fmul_fcvt_2.c: Likewise.
commit a13a5967a1f94744776d616ca84d5512b24bf546
Author: Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Date: Thu Oct 8 15:17:47 2015 +0100
[AArch64] Add fmul+fcvt optimisation
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index a8ac8d3..309dcfb 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -294,12 +294,14 @@ enum aarch64_symbol_type aarch64_classify_symbol (rtx, rtx);
enum aarch64_symbol_type aarch64_classify_tls_symbol (rtx);
enum reg_class aarch64_regno_regclass (unsigned);
int aarch64_asm_preferred_eh_data_format (int, int);
+int aarch64_fpconst_pow_of_2 (rtx);
machine_mode aarch64_hard_regno_caller_save_mode (unsigned, unsigned,
machine_mode);
int aarch64_hard_regno_mode_ok (unsigned, machine_mode);
int aarch64_hard_regno_nregs (unsigned, machine_mode);
int aarch64_simd_attr_length_move (rtx_insn *);
int aarch64_uxt_size (int, HOST_WIDE_INT);
+int aarch64_vec_fpconst_pow_of_2 (rtx);
rtx aarch64_final_eh_return_addr (void);
rtx aarch64_legitimize_reload_address (rtx *, machine_mode, int, int, int);
const char *aarch64_output_move_struct (rtx *operands);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 6a2ab61..3d2c496 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1654,6 +1654,27 @@ (define_insn "l<fcvt_pattern><su_optab><VDQF:mode><fcvt_target>2"
[(set_attr "type" "neon_fp_to_int_<Vetype><q>")]
)
+(define_insn "*aarch64_fcvt<su_optab><VDQF:mode><fcvt_target>2_mult"
+ [(set (match_operand:<FCVT_TARGET> 0 "register_operand" "=w")
+ (FIXUORS:<FCVT_TARGET> (unspec:<FCVT_TARGET>
+ [(mult:VDQF
+ (match_operand:VDQF 1 "register_operand" "w")
+ (match_operand:VDQF 2 "aarch64_fp_vec_pow2" ""))]
+ UNSPEC_FRINTZ)))]
+ "TARGET_SIMD
+ && IN_RANGE (aarch64_vec_fpconst_pow_of_2 (operands[2]), 1,
+ GET_MODE_BITSIZE (GET_MODE_INNER (<VDQF:MODE>mode)))"
+ {
+ int fbits = aarch64_vec_fpconst_pow_of_2 (operands[2]);
+ char buf[64];
+ sprintf (buf, "fcvtz<su>\\t%%0.<Vtype>, %%1.<Vtype>, #%d", fbits);
+ output_asm_insn (buf, operands);
+ return "";
+ }
+ [(set_attr "type" "neon_fp_to_int_<Vetype><q>")]
+)
+
+
(define_expand "<optab><VDQF:mode><fcvt_target>2"
[(set (match_operand:<FCVT_TARGET> 0 "register_operand")
(FIXUORS:<FCVT_TARGET> (unspec:<FCVT_TARGET>
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 2ec76a5..9b76746 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -6808,6 +6808,19 @@ cost_plus:
else
*cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
}
+
+ /* We can combine fmul by a power of 2 followed by a fcvt into a single
+ fixed-point fcvt. */
+ if (GET_CODE (x) == MULT
+ && ((VECTOR_MODE_P (mode)
+ && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
+ || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
+ {
+ *cost += rtx_cost (XEXP (x, 0), VOIDmode,
+ (enum rtx_code) code, 0, speed);
+ return true;
+ }
+
*cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
return true;
@@ -13386,6 +13399,54 @@ aarch64_reorg (void)
#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG aarch64_reorg
+
+/* If X is a positive CONST_DOUBLE with a value that is a power of 2
+ return the log2 of that value. Otherwise return -1. */
+
+int
+aarch64_fpconst_pow_of_2 (rtx x)
+{
+ const REAL_VALUE_TYPE *r;
+
+ if (!CONST_DOUBLE_P (x))
+ return -1;
+
+ r = CONST_DOUBLE_REAL_VALUE (x);
+
+ if (REAL_VALUE_NEGATIVE (*r)
+ || REAL_VALUE_ISNAN (*r)
+ || REAL_VALUE_ISINF (*r)
+ || !real_isinteger (r, DFmode))
+ return -1;
+
+ return exact_log2 (real_to_integer (r));
+}
+
+/* If X is a float CONST_VECTOR whose elements are all the same CONST_DOUBLE
+   Y and aarch64_fpconst_pow_of_2 (Y) is positive, return it.  Otherwise return -1.  */
+
+int
+aarch64_vec_fpconst_pow_of_2 (rtx x)
+{
+ if (GET_CODE (x) != CONST_VECTOR)
+ return -1;
+
+ if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
+ return -1;
+
+ int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
+ if (firstval <= 0)
+ return -1;
+
+ int count = CONST_VECTOR_NUNITS (x);
+ int i;
+ for (i = 1; i < count; i++)
+ if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
+ return -1;
+
+ return firstval;
+}
+
/* Implement TARGET_PROMOTED_TYPE to promote __fp16 to float. */
static tree
aarch64_promoted_type (const_tree t)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index bb821a3..6a9d52c 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4287,6 +4287,25 @@ (define_insn "l<fcvt_pattern><su_optab><GPF:mode><GPI:mode>2"
[(set_attr "type" "f_cvtf2i")]
)
+(define_insn "*aarch64_fcvt<su_optab><GPF:mode><GPI:mode>2_mult"
+ [(set (match_operand:GPI 0 "register_operand" "=r")
+ (FIXUORS:GPI
+ (mult:GPF
+ (match_operand:GPF 1 "register_operand" "w")
+ (match_operand:GPF 2 "aarch64_fp_pow2" "F"))))]
+ "TARGET_FLOAT
+ && IN_RANGE (aarch64_fpconst_pow_of_2 (operands[2]), 1,
+ GET_MODE_BITSIZE (<GPI:MODE>mode))"
+ {
+ int fbits = aarch64_fpconst_pow_of_2 (operands[2]);
+ char buf[64];
+ sprintf (buf, "fcvtz<su>\\t%%<GPI:w>0, %%<GPF:s>1, #%d", fbits);
+ output_asm_insn (buf, operands);
+ return "";
+ }
+ [(set_attr "type" "f_cvtf2i")]
+)
+
;; fma - no throw
(define_insn "fma<mode>4"
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 8af4b81..1bcbf62 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -87,6 +87,13 @@ (define_predicate "aarch64_fp_compare_operand"
(and (match_code "const_double")
(match_test "aarch64_float_const_zero_rtx_p (op)"))))
+(define_predicate "aarch64_fp_pow2"
+ (and (match_code "const_double")
+ (match_test "aarch64_fpconst_pow_of_2 (op) > 0")))
+
+(define_predicate "aarch64_fp_vec_pow2"
+ (match_test "aarch64_vec_fpconst_pow_of_2 (op) > 0"))
+
(define_predicate "aarch64_plus_immediate"
(and (match_code "const_int")
(ior (match_test "aarch64_uimm12_shift (INTVAL (op))")
diff --git a/gcc/testsuite/gcc.target/aarch64/fmul_fcvt_1.c b/gcc/testsuite/gcc.target/aarch64/fmul_fcvt_1.c
new file mode 100644
index 0000000..5af8290
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fmul_fcvt_1.c
@@ -0,0 +1,129 @@
+/* { dg-do run } */
+/* { dg-options "-save-temps -O2 -fno-inline" } */
+
+#define FUNC_DEFS(__a) \
+int \
+sffoo##__a (float x) \
+{ \
+ return x * __a##.0f; \
+} \
+ \
+unsigned int \
+usffoo##__a (float x) \
+{ \
+ return x * __a##.0f; \
+} \
+ \
+long \
+lsffoo##__a (float x) \
+{ \
+ return x * __a##.0f; \
+} \
+ \
+unsigned long \
+ulsffoo##__a (float x) \
+{ \
+ return x * __a##.0f; \
+}
+
+#define FUNC_DEFD(__a) \
+long \
+dffoo##__a (double x) \
+{ \
+ return x * __a##.0; \
+} \
+ \
+unsigned long \
+udffoo##__a (double x) \
+{ \
+ return x * __a##.0; \
+} \
+int \
+sdffoo##__a (double x) \
+{ \
+ return x * __a##.0; \
+} \
+ \
+unsigned int \
+usdffoo##__a (double x) \
+{ \
+ return x * __a##.0; \
+}
+
+FUNC_DEFS (4)
+FUNC_DEFD (4)
+/* { dg-final { scan-assembler "fcvtzs\tw\[0-9\], s\[0-9\]*.*#2" } } */
+/* { dg-final { scan-assembler "fcvtzs\tx\[0-9\], s\[0-9\]*.*#2" } } */
+/* { dg-final { scan-assembler "fcvtzs\tx\[0-9\], d\[0-9\]*.*#2" } } */
+/* { dg-final { scan-assembler "fcvtzs\tw\[0-9\], d\[0-9\]*.*#2" } } */
+/* { dg-final { scan-assembler "fcvtzu\tw\[0-9\], s\[0-9\]*.*#2" } } */
+/* { dg-final { scan-assembler "fcvtzu\tx\[0-9\], s\[0-9\]*.*#2" } } */
+/* { dg-final { scan-assembler "fcvtzu\tx\[0-9\], d\[0-9\]*.*#2" } } */
+/* { dg-final { scan-assembler "fcvtzu\tw\[0-9\], d\[0-9\]*.*#2" } } */
+
+FUNC_DEFS (8)
+FUNC_DEFD (8)
+/* { dg-final { scan-assembler "fcvtzs\tw\[0-9\], s\[0-9\]*.*#3" } } */
+/* { dg-final { scan-assembler "fcvtzs\tx\[0-9\], s\[0-9\]*.*#3" } } */
+/* { dg-final { scan-assembler "fcvtzs\tx\[0-9\], d\[0-9\]*.*#3" } } */
+/* { dg-final { scan-assembler "fcvtzs\tw\[0-9\], d\[0-9\]*.*#3" } } */
+/* { dg-final { scan-assembler "fcvtzu\tw\[0-9\], s\[0-9\]*.*#3" } } */
+/* { dg-final { scan-assembler "fcvtzu\tx\[0-9\], s\[0-9\]*.*#3" } } */
+/* { dg-final { scan-assembler "fcvtzu\tx\[0-9\], d\[0-9\]*.*#3" } } */
+/* { dg-final { scan-assembler "fcvtzu\tw\[0-9\], d\[0-9\]*.*#3" } } */
+
+FUNC_DEFS (16)
+FUNC_DEFD (16)
+/* { dg-final { scan-assembler "fcvtzs\tw\[0-9\], s\[0-9\]*.*#4" } } */
+/* { dg-final { scan-assembler "fcvtzs\tx\[0-9\], s\[0-9\]*.*#4" } } */
+/* { dg-final { scan-assembler "fcvtzs\tx\[0-9\], d\[0-9\]*.*#4" } } */
+/* { dg-final { scan-assembler "fcvtzs\tw\[0-9\], d\[0-9\]*.*#4" } } */
+/* { dg-final { scan-assembler "fcvtzu\tw\[0-9\], s\[0-9\]*.*#4" } } */
+/* { dg-final { scan-assembler "fcvtzu\tx\[0-9\], s\[0-9\]*.*#4" } } */
+/* { dg-final { scan-assembler "fcvtzu\tx\[0-9\], d\[0-9\]*.*#4" } } */
+/* { dg-final { scan-assembler "fcvtzu\tw\[0-9\], d\[0-9\]*.*#4" } } */
+
+
+#define FUNC_TESTS(__a, __b) \
+do \
+ { \
+ if (sffoo##__a (__b) != (int)(__b * __a)) \
+ __builtin_abort (); \
+ if (usffoo##__a (__b) != (unsigned int)(__b * __a)) \
+ __builtin_abort (); \
+ if (lsffoo##__a (__b) != (long)(__b * __a)) \
+ __builtin_abort (); \
+ if (ulsffoo##__a (__b) != (unsigned long)(__b * __a)) \
+ __builtin_abort (); \
+ } while (0)
+
+#define FUNC_TESTD(__a, __b) \
+do \
+ { \
+ if (dffoo##__a (__b) != (long)(__b * __a)) \
+ __builtin_abort (); \
+ if (udffoo##__a (__b) != (unsigned long)(__b * __a)) \
+ __builtin_abort (); \
+ if (sdffoo##__a (__b) != (int)(__b * __a)) \
+ __builtin_abort (); \
+ if (usdffoo##__a (__b) != (unsigned int)(__b * __a)) \
+ __builtin_abort (); \
+ } while (0)
+
+int
+main (void)
+{
+ float i;
+
+ for (i = -0.001; i < 32.0; i += 1.0f)
+ {
+ FUNC_TESTS (4, i);
+ FUNC_TESTS (8, i);
+ FUNC_TESTS (16, i);
+
+ FUNC_TESTD (4, i);
+ FUNC_TESTD (8, i);
+ FUNC_TESTD (16, i);
+ }
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fmul_fcvt_2.c b/gcc/testsuite/gcc.target/aarch64/fmul_fcvt_2.c
new file mode 100644
index 0000000..39cf890
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fmul_fcvt_2.c
@@ -0,0 +1,67 @@
+/* { dg-do run } */
+/* { dg-options "-save-temps -O2 -ftree-vectorize -fno-inline" } */
+
+#define N 1024
+
+#define FUNC_DEF(__a) \
+void \
+foo##__a (float *a, int *b) \
+{ \
+ int i; \
+ for (i = 0; i < N; i++) \
+ b[i] = a[i] * __a##.0f; \
+}
+
+FUNC_DEF (4)
+FUNC_DEF (8)
+FUNC_DEF (16)
+
+int ints[N];
+float floats[N];
+
+void
+reset_ints (int *arr)
+{
+ int i;
+
+ for (i = 0; i < N; i++)
+ arr[i] = 0;
+}
+
+void
+check_result (int *is, int n)
+{
+ int i;
+
+ for (i = 0; i < N; i++)
+ if (is[i] != i * n)
+ __builtin_abort ();
+}
+
+#define FUNC_CHECK(__a) \
+do \
+ { \
+ reset_ints (ints); \
+ foo##__a (floats, ints); \
+ check_result (ints, __a); \
+ } while (0)
+
+
+int
+main (void)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ floats[i] = (float) i;
+
+ FUNC_CHECK (4);
+ FUNC_CHECK (8);
+ FUNC_CHECK (16);
+
+ return 0;
+}
+
+/* { dg-final { scan-assembler-not "fmul\tv\[0-9\]*.*" } } */
+/* { dg-final { scan-assembler "fcvtzs\tv\[0-9\].4s, v\[0-9\].4s*.*#2" } } */
+/* { dg-final { scan-assembler "fcvtzs\tv\[0-9\].4s, v\[0-9\].4s*.*#3" } } */
+/* { dg-final { scan-assembler "fcvtzs\tv\[0-9\].4s, v\[0-9\].4s*.*#4" } } */