[PATCH] ARM PR68620 (ICE with FP16 on armeb)
Christophe Lyon
christophe.lyon@linaro.org
Fri Jan 15 10:39:00 GMT 2016
Hi,
The attached patch fixes PR68620.
Adding the movv4hf pattern alone was not sufficient: it also enabled
other transformations, so I had to update existing support so that the
tests continue to pass when the new code paths are used.
I added V4HF/V8HF to the VQXMOV and VDQ iterators to enable the use of
these modes in the relevant patterns.
For the vec_set<mode>_internal and neon_vld1_dup<mode> patterns, I
switched to existing iterators which already include the needed
V4HF/V8HF modes (VD_LANE in place of VD, and VQ2 in place of VQ).
For neon_vdupn, I chose to implement neon_vdup_nv4hf and
neon_vdup_nv8hf instead of updating the VX iterator because I thought
it was not desirable to impact neon_vrev32<mode>.
I had to update neon_valid_immediate to return -1 when handling FP16
immediates (they cannot be represented in neon mov instructions).
Finally, I had to adjust the vget_lane_f16/vset_lane_f16
implementations in arm_neon.h to account for the different lane
numbering in big-endian. This has the benefit of making
vldX_lane_f16_indices_1.c
vstX_lane_f16_indices_1.c
vcvt_f16.c
vcvtf16_f32.c
now pass on armeb.
Regarding the testsuite, I've added the testcase that would otherwise
ICE, together with the arm_fp effective target that I also proposed in
my other testsuite patch related to target attributes.
I've tested this patch using QEMU on arm-linux and armeb-linux targets.
OK?
Christophe.
-------------- next part --------------
gcc/ChangeLog:
2016-01-15 Christophe Lyon <christophe.lyon@linaro.org>
PR target/68620
* config/arm/arm.c (neon_valid_immediate): Handle FP16 vectors.
* config/arm/arm_neon.h (__arm_lane): New helper macro.
(vget_lane_f16): Handle big-endian.
(vgetq_lane_f16): Likewise.
(vset_lane_f16): Likewise.
(vsetq_lane_f16): Likewise.
* config/arm/iterators.md (VQXMOV): Add V8HF.
(VDQ): Add V4HF and V8HF.
(V_reg): Handle V4HF and V8HF.
(Is_float_mode): Likewise.
* config/arm/neon.md (movv4hf, movv8hf, neon_vdup_nv4hf,
neon_vdup_nv8hf): New patterns.
(vec_set<mode>_internal, neon_vld1_dup<mode>): Use VD_LANE iterator.
(neon_vld1_dup<mode>): Use VQ2 iterator.
* doc/sourcebuild.texi (arm_fp_ok, arm_fp): Add documentation.
gcc/testsuite/ChangeLog:
2016-01-15 Christophe Lyon <christophe.lyon@linaro.org>
PR target/68620
* gcc.target/arm/pr68620.c: New test.
* lib/target-supports.exp
(check_effective_target_arm_fp_ok_nocache)
(check_effective_target_arm_fp_ok, add_options_for_arm_fp): New.
-------------- next part --------------
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 3588b83..b1f408c 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -12370,6 +12370,10 @@ neon_valid_immediate (rtx op, machine_mode mode, int inverse,
if (!vfp3_const_double_rtx (el0) && el0 != CONST0_RTX (GET_MODE (el0)))
return -1;
+ /* FP16 vectors cannot be represented. */
+ if (innersize == 2)
+ return -1;
+
r0 = CONST_DOUBLE_REAL_VALUE (el0);
for (i = 1; i < n_elts; i++)
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 0a33d21..b4aabd9 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -5252,12 +5252,22 @@ vget_lane_s32 (int32x2_t __a, const int __b)
were marked always-inline so there were no call sites, the declaration
would nonetheless raise an error. Hence, we must use a macro instead. */
+ /* For big-endian, GCC's vector indices are the opposite way around
+ to the architectural lane indices used by Neon intrinsics. */
+#ifdef __ARM_BIG_ENDIAN
+ /* Here, 3 is (4-1) where 4 is the number of lanes. This is also the
+ right value for vectors with 8 lanes. */
+#define __arm_lane(__vec, __idx) (__idx ^ 3)
+#else
+#define __arm_lane(__vec, __idx) __idx
+#endif
+
#define vget_lane_f16(__v, __idx) \
__extension__ \
({ \
float16x4_t __vec = (__v); \
__builtin_arm_lane_check (4, __idx); \
- float16_t __res = __vec[__idx]; \
+ float16_t __res = __vec[__arm_lane(__vec, __idx)]; \
__res; \
})
#endif
@@ -5334,7 +5344,7 @@ vgetq_lane_s32 (int32x4_t __a, const int __b)
({ \
float16x8_t __vec = (__v); \
__builtin_arm_lane_check (8, __idx); \
- float16_t __res = __vec[__idx]; \
+ float16_t __res = __vec[__arm_lane(__vec, __idx)]; \
__res; \
})
#endif
@@ -5412,7 +5422,7 @@ vset_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
float16_t __elem = (__e); \
float16x4_t __vec = (__v); \
__builtin_arm_lane_check (4, __idx); \
- __vec[__idx] = __elem; \
+ __vec[__arm_lane (__vec, __idx)] = __elem; \
__vec; \
})
#endif
@@ -5490,7 +5500,7 @@ vsetq_lane_s32 (int32_t __a, int32x4_t __b, const int __c)
float16_t __elem = (__e); \
float16x8_t __vec = (__v); \
__builtin_arm_lane_check (8, __idx); \
- __vec[__idx] = __elem; \
+ __vec[__arm_lane (__vec, __idx)] = __elem; \
__vec; \
})
#endif
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 6a54125..88e1c3d 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -99,7 +99,7 @@
(define_mode_iterator VQI [V16QI V8HI V4SI])
;; Quad-width vector modes, with TImode added, for moves.
-(define_mode_iterator VQXMOV [V16QI V8HI V4SI V4SF V2DI TI])
+(define_mode_iterator VQXMOV [V16QI V8HI V8HF V4SI V4SF V2DI TI])
;; Opaque structure types wider than TImode.
(define_mode_iterator VSTRUCT [EI OI CI XI])
@@ -114,7 +114,7 @@
(define_mode_iterator VN [V8HI V4SI V2DI])
;; All supported vector modes (except singleton DImode).
-(define_mode_iterator VDQ [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF V2DI])
+(define_mode_iterator VDQ [V8QI V16QI V4HI V8HI V2SI V4SI V4HF V8HF V2SF V4SF V2DI])
;; All supported vector modes (except those with 64-bit integer elements).
(define_mode_iterator VDQW [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF])
@@ -424,6 +424,7 @@
;; Register width from element mode
(define_mode_attr V_reg [(V8QI "P") (V16QI "q")
(V4HI "P") (V8HI "q")
+ (V4HF "P") (V8HF "q")
(V2SI "P") (V4SI "q")
(V2SF "P") (V4SF "q")
(DI "P") (V2DI "q")
@@ -572,6 +573,7 @@
(define_mode_attr Is_float_mode [(V8QI "false") (V16QI "false")
(V4HI "false") (V8HI "false")
(V2SI "false") (V4SI "false")
+ (V4HF "true") (V8HF "true")
(V2SF "true") (V4SF "true")
(DI "false") (V2DI "false")])
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 62fb6da..9e04e5c 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -137,6 +137,30 @@
}
})
+(define_expand "movv4hf"
+ [(set (match_operand:V4HF 0 "s_register_operand")
+ (match_operand:V4HF 1 "s_register_operand"))]
+ "TARGET_NEON && TARGET_FP16"
+{
+ if (can_create_pseudo_p ())
+ {
+ if (!REG_P (operands[0]))
+ operands[1] = force_reg (V4HFmode, operands[1]);
+ }
+})
+
+(define_expand "movv8hf"
+ [(set (match_operand:V8HF 0 "")
+ (match_operand:V8HF 1 ""))]
+ "TARGET_NEON && TARGET_FP16"
+{
+ if (can_create_pseudo_p ())
+ {
+ if (!REG_P (operands[0]))
+ operands[1] = force_reg (V8HFmode, operands[1]);
+ }
+})
+
(define_insn "*neon_mov<mode>"
[(set (match_operand:VSTRUCT 0 "nonimmediate_operand" "=w,Ut,w")
(match_operand:VSTRUCT 1 "general_operand" " w,w, Ut"))]
@@ -299,11 +323,11 @@
[(set_attr "type" "neon_load1_1reg<q>")])
(define_insn "vec_set<mode>_internal"
- [(set (match_operand:VD 0 "s_register_operand" "=w,w")
- (vec_merge:VD
- (vec_duplicate:VD
+ [(set (match_operand:VD_LANE 0 "s_register_operand" "=w,w")
+ (vec_merge:VD_LANE
+ (vec_duplicate:VD_LANE
(match_operand:<V_elem> 1 "nonimmediate_operand" "Um,r"))
- (match_operand:VD 3 "s_register_operand" "0,0")
+ (match_operand:VD_LANE 3 "s_register_operand" "0,0")
(match_operand:SI 2 "immediate_operand" "i,i")))]
"TARGET_NEON"
{
@@ -2806,6 +2830,22 @@ if (BYTES_BIG_ENDIAN)
[(set_attr "type" "neon_from_gp<q>")]
)
+(define_insn "neon_vdup_nv4hf"
+ [(set (match_operand:V4HF 0 "s_register_operand" "=w")
+ (vec_duplicate:V4HF (match_operand:HF 1 "s_register_operand" "r")))]
+ "TARGET_NEON"
+ "vdup.16\t%P0, %1"
+ [(set_attr "type" "neon_from_gp")]
+)
+
+(define_insn "neon_vdup_nv8hf"
+ [(set (match_operand:V8HF 0 "s_register_operand" "=w")
+ (vec_duplicate:V8HF (match_operand:HF 1 "s_register_operand" "r")))]
+ "TARGET_NEON"
+ "vdup.16\t%q0, %1"
+ [(set_attr "type" "neon_from_gp_q")]
+)
+
(define_insn "neon_vdup_n<mode>"
[(set (match_operand:V32 0 "s_register_operand" "=w,w")
(vec_duplicate:V32 (match_operand:<V_elem> 1 "s_register_operand" "r,t")))]
@@ -4305,8 +4345,8 @@ if (BYTES_BIG_ENDIAN)
)
(define_insn "neon_vld1_dup<mode>"
- [(set (match_operand:VD 0 "s_register_operand" "=w")
- (vec_duplicate:VD (match_operand:<V_elem> 1 "neon_struct_operand" "Um")))]
+ [(set (match_operand:VD_LANE 0 "s_register_operand" "=w")
+ (vec_duplicate:VD_LANE (match_operand:<V_elem> 1 "neon_struct_operand" "Um")))]
"TARGET_NEON"
"vld1.<V_sz_elem>\t{%P0[]}, %A1"
[(set_attr "type" "neon_load1_all_lanes<q>")]
@@ -4322,8 +4362,8 @@ if (BYTES_BIG_ENDIAN)
)
(define_insn "neon_vld1_dup<mode>"
- [(set (match_operand:VQ 0 "s_register_operand" "=w")
- (vec_duplicate:VQ (match_operand:<V_elem> 1 "neon_struct_operand" "Um")))]
+ [(set (match_operand:VQ2 0 "s_register_operand" "=w")
+ (vec_duplicate:VQ2 (match_operand:<V_elem> 1 "neon_struct_operand" "Um")))]
"TARGET_NEON"
{
return "vld1.<V_sz_elem>\t{%e0[], %f0[]}, %A1";
diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi
index 61de4a5..3f2e0e3 100644
--- a/gcc/doc/sourcebuild.texi
+++ b/gcc/doc/sourcebuild.texi
@@ -1514,6 +1514,12 @@ ARM target generates 32-bit code.
@item arm_eabi
ARM target adheres to the ABI for the ARM Architecture.
+@item arm_fp_ok
+@anchor{arm_fp_ok}
+ARM target defines @code{__ARM_FP} using @code{-mfloat-abi=softfp} or
+equivalent options. Some multilibs may be incompatible with these
+options.
+
@item arm_hf_eabi
ARM target adheres to the VFP and Advanced SIMD Register Arguments
variant of the ABI for the ARM Architecture (as selected with
@@ -1527,6 +1533,11 @@ Some multilibs may be incompatible with these options.
ARM target supports @code{-mcpu=iwmmxt}.
Some multilibs may be incompatible with this option.
+@item arm_fp
+@code{__ARM_FP} definition. Only ARM targets support this feature, and only then
+in certain modes; see the @ref{arm_fp_ok,,arm_fp_ok effective target
+keyword}.
+
@item arm_neon
ARM target supports generating NEON instructions.
diff --git a/gcc/testsuite/gcc.target/arm/pr68620.c b/gcc/testsuite/gcc.target/arm/pr68620.c
new file mode 100644
index 0000000..984992f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/pr68620.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_fp_ok } */
+/* { dg-options "-mfp16-format=ieee" } */
+/* { dg-add-options arm_fp } */
+
+#include "arm_neon.h"
+
+float16x4_t __attribute__((target("fpu=neon-fp16")))
+foo (float32x4_t arg)
+{
+ return vcvt_f16_f32 (arg);
+}
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 4e349e9..228e68d 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -2721,6 +2721,46 @@ proc check_effective_target_arm_hard_vfp_ok { } {
}
}
+# Return 1 if this is an ARM target defining __ARM_FP. We may need
+# -mfloat-abi=softfp or equivalent options. Some multilibs may be
+# incompatible with these options. Also set et_arm_fp_flags to the
+# best options to add.
+
+proc check_effective_target_arm_fp_ok_nocache { } {
+ global et_arm_fp_flags
+ set et_arm_fp_flags ""
+ if { [check_effective_target_arm32] } {
+ foreach flags {"" "-mfloat-abi=softfp" "-mfloat-abi=hard"} {
+ if { [check_no_compiler_messages_nocache arm_fp_ok object {
+ #ifndef __ARM_FP
+ #error __ARM_FP not defined
+ #endif
+ } "$flags"] } {
+ set et_arm_fp_flags $flags
+ return 1
+ }
+ }
+ }
+ return 0
+}
+
+proc check_effective_target_arm_fp_ok { } {
+ return [check_cached_effective_target arm_fp_ok \
+ check_effective_target_arm_fp_ok_nocache]
+}
+
+# Add the options needed to define __ARM_FP. We need either
+# -mfloat-abi=softfp or -mfloat-abi=hard, but if one is already
+# specified by the multilib, use it.
+
+proc add_options_for_arm_fp { flags } {
+ if { ! [check_effective_target_arm_fp_ok] } {
+ return "$flags"
+ }
+ global et_arm_fp_flags
+ return "$flags $et_arm_fp_flags"
+}
+
# Return 1 if this is an ARM target that supports DSP multiply with
# current multilib flags.
More information about the Gcc-patches
mailing list