[Bug target/93613] Missed optimization with _mm256_permute2x128_si256 intrinsic
jakub at gcc dot gnu.org
gcc-bugzilla@gcc.gnu.org
Thu Feb 6 14:42:00 GMT 2020
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93613
--- Comment #1 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
I've tried:
--- gcc/config/i386/sse.md.jj 2020-02-06 13:40:27.485007762 +0100
+++ gcc/config/i386/sse.md 2020-02-06 15:24:35.097743017 +0100
@@ -81,7 +81,6 @@ (define_c_enum "unspec" [
;; For AVX2 support
UNSPEC_VPERMVAR
- UNSPEC_VPERMTI
UNSPEC_GATHER
UNSPEC_VSIBADDR
@@ -20224,15 +20223,55 @@ (define_insn "avx512f_perm<mode>_1<mask_
(set_attr "prefix" "<mask_prefix2>")
(set_attr "mode" "<sseinsnmode>")])
-(define_insn "avx2_permv2ti"
- [(set (match_operand:V4DI 0 "register_operand" "=x")
- (unspec:V4DI
- [(match_operand:V4DI 1 "register_operand" "x")
- (match_operand:V4DI 2 "nonimmediate_operand" "xm")
- (match_operand:SI 3 "const_0_to_255_operand" "n")]
- UNSPEC_VPERMTI))]
+(define_expand "avx2_permv2ti"
+ [(match_operand:V4DI 0 "register_operand")
+ (match_operand:V4DI 1 "register_operand")
+ (match_operand:V4DI 2 "nonimmediate_operand")
+ (match_operand:SI 3 "const_0_to_255_operand")]
"TARGET_AVX2"
- "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+{ /* Turn the immediate (operand 3) into four element indices for avx2_permv2ti_1.  */
+ int mask = INTVAL (operands[3]);
+ int first = (mask & 0x08) ? 8 : (mask & 0x03) * 2; /* bit 3 set -> index 8; else bits 0-1 scaled to 0, 2, 4 or 6 */
+ int second = (mask & 0x80) ? 8 : (mask & 0x30) / 8; /* bit 7 set -> index 8; else bits 4-5 scaled to 0, 2, 4 or 6 */
+ emit_insn (gen_avx2_permv2ti_1 (operands[0], operands[1],
+ operands[2], CONST0_RTX (V8DImode),
+ GEN_INT (first),
+ GEN_INT (first + 1),
+ GEN_INT (second),
+ GEN_INT (second + 1)));
+ DONE;
+})
+
+(define_insn "avx2_permv2ti_1"
+ [(set (match_operand:V4DI 0 "register_operand" "=x")
+ (vec_select:V4DI
+ (vec_concat:V16DI
+ (vec_concat:V8DI
+ (match_operand:V4DI 1 "register_operand" "x")
+ (match_operand:V4DI 2 "nonimmediate_operand" "xm"))
+ (match_operand:V8DI 3 "const0_operand" "C"))
+ (parallel [(match_operand 4 "const_0_to_15_operand")
+ (match_operand 5 "const_0_to_15_operand")
+ (match_operand 6 "const_0_to_15_operand")
+ (match_operand 7 "const_0_to_15_operand")])))]
+ "TARGET_AVX2
+ && (INTVAL (operands[4]) & 1) == 0
+ && INTVAL (operands[5]) == INTVAL (operands[4]) + 1
+ && (INTVAL (operands[6]) & 1) == 0
+ && INTVAL (operands[7]) == INTVAL (operands[6]) + 1"
+{ /* Each result half must be an even-aligned pair of consecutive elements.  */
+ int mask = 0; /* vperm2i128 immediate, rebuilt from the two start indices */
+ if (INTVAL (operands[4]) >= 8) /* first pair is taken from the zero operand 3 */
+ mask |= 0x08;
+ else
+ mask |= INTVAL (operands[4]) / 2; /* start index 0, 2, 4, 6 -> 0..3 in bits 0-1 */
+ if (INTVAL (operands[6]) >= 8) /* second pair is taken from the zero operand 3 */
+ mask |= 0x80;
+ else
+ mask |= INTVAL (operands[6]) * 8; /* start index 0, 2, 4, 6 -> 0..3 in bits 4-5 */
+ operands[4] = GEN_INT (mask); /* operand 4 is reused to print the immediate */
+ return "vperm2i128\t{%4, %2, %1, %0|%0, %1, %2, %4}";
+}
[(set_attr "type" "sselog")
(set_attr "prefix" "vex")
(set_attr "mode" "OI")])
but unfortunately it doesn't help; I guess we'll need to improve simplify-rtx.c
to deal with that (and for the last 3 functions it even makes things worse, as
combine then simplifies those patterns to vector constants but we don't have an
instruction that would force the const_vector into memory that combine could
match and could be split before reload). For those I guess we want gimple
folding of the builtin. Of course, people really should use __builtin_shuffle
instead of this mess... ;)
More information about the Gcc-bugs
mailing list