[PATCH][AArch64] Improve code generation for float16 vector code
Alan Lawrence
alan.lawrence@arm.com
Mon Sep 7 13:10:00 GMT 2015
On 04/09/15 13:32, James Greenhalgh wrote:
> In that case, these should be implemented as inline assembly blocks. As it
> stands, the code generation for these intrinsics will be very poor with this
> patch applied.
>
> I'm going to hold off OKing this until I see a follow-up to fix the code
> generation, either replacing those particular intrinsics with inline asm,
> or doing the more comprehensive fix in the back-end.
>
> Thanks,
> James
In that case, here is the follow-up now ;). This fixes each of the following
functions to generate a single instruction followed by ret:
* vld1_dup_f16, vld1q_dup_f16
* vset_lane_f16, vsetq_lane_f16
* vget_lane_f16, vgetq_lane_f16
* For IN of type either float16x4_t or float16x8_t, and constant C:
return (float16x4_t) {in[C], in[C], in[C], in[C]};
* Similarly,
return (float16x8_t) {in[C], in[C], in[C], in[C], in[C], in[C], in[C], in[C]};
(These correspond intuitively to what one might expect for "vdup_lane_f16",
"vdup_laneq_f16", "vdupq_lane_f16" and "vdupq_laneq_f16" intrinsics,
although such intrinsics do not actually exist.)
This patch does not deal with equivalents to vdup_n_s16 and other intrinsics
that load immediates, rather than using elements of pre-existing vectors.
I'd welcome thoughts/opinions on what testcase would be appropriate. Correctness
of all the intrinsics is already tested by the advsimd-intrinsics testsuite, and
the only way I can see to verify code generation is to scan-assembler for
particular instructions; do we wish to see more scan-assembler tests?
Bootstrapped + check-gcc on aarch64-none-linux-gnu.
Thanks,
Alan
gcc/ChangeLog:
* config/aarch64/aarch64-simd.md (aarch64_simd_dup<mode>,
aarch64_dup_lane<mode>, aarch64_dup_lane_<vswap_width_name><mode>,
aarch64_simd_vec_set<mode>, vec_set<mode>, vec_perm_const<mode>,
vec_init<mode>, *aarch64_simd_ld1r<mode>, vec_extract<mode>): Add
V4HF and V8HF variants to iterator.
* config/aarch64/aarch64.c (aarch64_evpc_dup): Add V4HF and V8HF cases.
* config/aarch64/iterators.md (VDQF_F16): New.
(VSWAP_WIDTH, vswap_width_name): Add V4HF and V8HF cases.
---
gcc/config/aarch64/aarch64-simd.md | 39 +++++++++++++++++++-------------------
gcc/config/aarch64/aarch64.c | 2 ++
gcc/config/aarch64/iterators.md | 7 ++++++-
3 files changed, 28 insertions(+), 20 deletions(-)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 160acf9..b303d58 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -53,18 +53,19 @@
)
(define_insn "aarch64_simd_dup<mode>"
- [(set (match_operand:VDQF 0 "register_operand" "=w")
- (vec_duplicate:VDQF (match_operand:<VEL> 1 "register_operand" "w")))]
+ [(set (match_operand:VDQF_F16 0 "register_operand" "=w")
+ (vec_duplicate:VDQF_F16
+ (match_operand:<VEL> 1 "register_operand" "w")))]
"TARGET_SIMD"
"dup\\t%0.<Vtype>, %1.<Vetype>[0]"
[(set_attr "type" "neon_dup<q>")]
)
(define_insn "aarch64_dup_lane<mode>"
- [(set (match_operand:VALL 0 "register_operand" "=w")
- (vec_duplicate:VALL
+ [(set (match_operand:VALL_F16 0 "register_operand" "=w")
+ (vec_duplicate:VALL_F16
(vec_select:<VEL>
- (match_operand:VALL 1 "register_operand" "w")
+ (match_operand:VALL_F16 1 "register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand" "i")])
)))]
"TARGET_SIMD"
@@ -76,8 +77,8 @@
)
(define_insn "aarch64_dup_lane_<vswap_width_name><mode>"
- [(set (match_operand:VALL 0 "register_operand" "=w")
- (vec_duplicate:VALL
+ [(set (match_operand:VALL_F16 0 "register_operand" "=w")
+ (vec_duplicate:VALL_F16
(vec_select:<VEL>
(match_operand:<VSWAP_WIDTH> 1 "register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand" "i")])
@@ -834,11 +835,11 @@
)
(define_insn "aarch64_simd_vec_set<mode>"
- [(set (match_operand:VDQF 0 "register_operand" "=w")
- (vec_merge:VDQF
- (vec_duplicate:VDQF
+ [(set (match_operand:VDQF_F16 0 "register_operand" "=w")
+ (vec_merge:VDQF_F16
+ (vec_duplicate:VDQF_F16
(match_operand:<VEL> 1 "register_operand" "w"))
- (match_operand:VDQF 3 "register_operand" "0")
+ (match_operand:VDQF_F16 3 "register_operand" "0")
(match_operand:SI 2 "immediate_operand" "i")))]
"TARGET_SIMD"
{
@@ -851,7 +852,7 @@
)
(define_expand "vec_set<mode>"
- [(match_operand:VDQF 0 "register_operand" "+w")
+ [(match_operand:VDQF_F16 0 "register_operand" "+w")
(match_operand:<VEL> 1 "register_operand" "w")
(match_operand:SI 2 "immediate_operand" "")]
"TARGET_SIMD"
@@ -4691,9 +4692,9 @@
;; vec_perm support
(define_expand "vec_perm_const<mode>"
- [(match_operand:VALL 0 "register_operand")
- (match_operand:VALL 1 "register_operand")
- (match_operand:VALL 2 "register_operand")
+ [(match_operand:VALL_F16 0 "register_operand")
+ (match_operand:VALL_F16 1 "register_operand")
+ (match_operand:VALL_F16 2 "register_operand")
(match_operand:<V_cmp_result> 3)]
"TARGET_SIMD"
{
@@ -4957,7 +4958,7 @@
;; Standard pattern name vec_init<mode>.
(define_expand "vec_init<mode>"
- [(match_operand:VALL 0 "register_operand" "")
+ [(match_operand:VALL_F16 0 "register_operand" "")
(match_operand 1 "" "")]
"TARGET_SIMD"
{
@@ -4966,8 +4967,8 @@
})
(define_insn "*aarch64_simd_ld1r<mode>"
- [(set (match_operand:VALL 0 "register_operand" "=w")
- (vec_duplicate:VALL
+ [(set (match_operand:VALL_F16 0 "register_operand" "=w")
+ (vec_duplicate:VALL_F16
(match_operand:<VEL> 1 "aarch64_simd_struct_operand" "Utv")))]
"TARGET_SIMD"
"ld1r\\t{%0.<Vtype>}, %1"
@@ -5014,7 +5015,7 @@
(define_expand "vec_extract<mode>"
[(match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand" "")
- (match_operand:VALL 1 "register_operand" "")
+ (match_operand:VALL_F16 1 "register_operand" "")
(match_operand:SI 2 "immediate_operand" "")]
"TARGET_SIMD"
{
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index f589ca3..362e0a0 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -11749,6 +11749,8 @@ aarch64_evpc_dup (struct expand_vec_perm_d *d)
case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
+ case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
+ case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index c2af1de..b1f483c 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -82,7 +82,10 @@
;; pointer-sized quantities. Exactly one of the two alternatives will match.
(define_mode_iterator PTR [(SI "ptr_mode == SImode") (DI "ptr_mode == DImode")])
-;; Vector Float modes.
+;; Vector Float modes suitable for moving, loading and storing.
+(define_mode_iterator VDQF_F16 [V4HF V8HF V2SF V4SF V2DF])
+
+;; Vector Float modes, barring HF modes.
(define_mode_iterator VDQF [V2SF V4SF V2DF])
;; Vector Float modes, and DF.
@@ -638,12 +641,14 @@
(V2SI "V4SI") (V4SI "V2SI")
(DI "V2DI") (V2DI "DI")
(V2SF "V4SF") (V4SF "V2SF")
+ (V4HF "V8HF") (V8HF "V4HF")
(DF "V2DF") (V2DF "DF")])
(define_mode_attr vswap_width_name [(V8QI "to_128") (V16QI "to_64")
(V4HI "to_128") (V8HI "to_64")
(V2SI "to_128") (V4SI "to_64")
(DI "to_128") (V2DI "to_64")
+ (V4HF "to_128") (V8HF "to_64")
(V2SF "to_128") (V4SF "to_64")
(DF "to_128") (V2DF "to_64")])
--
1.9.1
More information about the Gcc-patches
mailing list