This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH] Improve V?TImode shifts (PR target/82370)
- From: Jakub Jelinek <jakub at redhat dot com>
- To: Uros Bizjak <ubizjak at gmail dot com>, Kirill Yukhin <kirill dot yukhin at gmail dot com>, "H.J. Lu" <hjl dot tools at gmail dot com>
- Cc: gcc-patches at gcc dot gnu dot org
- Date: Wed, 4 Oct 2017 21:35:50 +0200
- Subject: [PATCH] Improve V?TImode shifts (PR target/82370)
- Authentication-results: sourceware.org; auth=none
- Authentication-results: ext-mx10.extmail.prod.ext.phx2.redhat.com; dmarc=none (p=none dis=none) header.from=redhat.com
- Authentication-results: ext-mx10.extmail.prod.ext.phx2.redhat.com; spf=fail smtp.mailfrom=jakub at redhat dot com
- Dmarc-filter: OpenDMARC Filter v1.3.2 mx1.redhat.com 86FD75F7BC
- Reply-to: Jakub Jelinek <jakub at redhat dot com>
Hi!
The following patch tweaks the TImode vector shifts similarly
to the earlier vector shift patch, so that for shifts by immediate
we can accept a memory input. Additionally, it removes the vec_shl_*
expander, because the middle-end has dropped that a few years ago,
and merges the left and right shift patterns using code iterators.
Appart from the code/names that can be handled by mode attributes,
the only difference was that one of the insns had
(set_attr "atom_unit" "sishuf")
and the other didn't. I hope that is just an error, I'd really expect
both vpslldq and vpsrldq to use the same atom unit, isn't that the case?
CCing H.J. for that. If it is intentional difference, I can of course
adjust the patch and undo the merging of the 4 define_insns into 2.
Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
2017-10-04 Jakub Jelinek <jakub@redhat.com>
PR target/82370
* config/i386/sse.md (VIMAX_AVX2): Remove V4TImode.
(VIMAX_AVX2_AVX512BW, VIMAX_AVX512VL): New mode iterators.
(vec_shl_<mode>): Remove unused expander.
(avx512bw_<shift_insn><mode>3): New define_insn.
(<sse2_avx2>_ashl<mode>3, <sse2_avx2>_lshr<mode>3): Replaced by ...
(<sse2_avx2>_<shift_insn><mode>3): ... this. New define_insn.
* gcc.target/i386/pr82370.c: New test.
--- gcc/config/i386/sse.md.jj 2017-10-04 12:18:19.000000000 +0200
+++ gcc/config/i386/sse.md 2017-10-04 15:34:00.541860351 +0200
@@ -371,10 +371,17 @@ (define_mode_iterator V16FI
[V16SF V16SI])
;; ??? We should probably use TImode instead.
-(define_mode_iterator VIMAX_AVX2
+(define_mode_iterator VIMAX_AVX2_AVX512BW
[(V4TI "TARGET_AVX512BW") (V2TI "TARGET_AVX2") V1TI])
-;; ??? This should probably be dropped in favor of VIMAX_AVX2.
+;; Suppose TARGET_AVX512BW as baseline
+(define_mode_iterator VIMAX_AVX512VL
+ [V4TI (V2TI "TARGET_AVX512VL") (V1TI "TARGET_AVX512VL")])
+
+(define_mode_iterator VIMAX_AVX2
+ [(V2TI "TARGET_AVX2") V1TI])
+
+;; ??? This should probably be dropped in favor of VIMAX_AVX2_AVX512BW.
(define_mode_iterator SSESCALARMODE
[(V4TI "TARGET_AVX512BW") (V2TI "TARGET_AVX2") TI])
@@ -10792,9 +10799,9 @@ (define_insn "<shift_insn><mode>3<mask_n
(set_attr "mode" "<sseinsnmode>")])
-(define_expand "vec_shl_<mode>"
+(define_expand "vec_shr_<mode>"
[(set (match_dup 3)
- (ashift:V1TI
+ (lshiftrt:V1TI
(match_operand:VI_128 1 "register_operand")
(match_operand:SI 2 "const_0_to_255_mul_8_operand")))
(set (match_operand:VI_128 0 "register_operand") (match_dup 4))]
@@ -10805,48 +10812,24 @@ (define_expand "vec_shl_<mode>"
operands[4] = gen_lowpart (<MODE>mode, operands[3]);
})
-(define_insn "<sse2_avx2>_ashl<mode>3"
- [(set (match_operand:VIMAX_AVX2 0 "register_operand" "=x,v")
- (ashift:VIMAX_AVX2
- (match_operand:VIMAX_AVX2 1 "register_operand" "0,v")
- (match_operand:SI 2 "const_0_to_255_mul_8_operand" "n,n")))]
- "TARGET_SSE2"
+(define_insn "avx512bw_<shift_insn><mode>3"
+ [(set (match_operand:VIMAX_AVX512VL 0 "register_operand" "=v")
+ (any_lshift:VIMAX_AVX512VL
+ (match_operand:VIMAX_AVX512VL 1 "nonimmediate_operand" "vm")
+ (match_operand:SI 2 "const_0_to_255_mul_8_operand" "n")))]
+ "TARGET_AVX512BW"
{
operands[2] = GEN_INT (INTVAL (operands[2]) / 8);
-
- switch (which_alternative)
- {
- case 0:
- return "pslldq\t{%2, %0|%0, %2}";
- case 1:
- return "vpslldq\t{%2, %1, %0|%0, %1, %2}";
- default:
- gcc_unreachable ();
- }
+ return "vp<vshift>dq\t{%2, %1, %0|%0, %1, %2}";
}
- [(set_attr "isa" "noavx,avx")
- (set_attr "type" "sseishft")
+ [(set_attr "type" "sseishft")
(set_attr "length_immediate" "1")
- (set_attr "prefix_data16" "1,*")
- (set_attr "prefix" "orig,vex")
+ (set_attr "prefix" "maybe_evex")
(set_attr "mode" "<sseinsnmode>")])
-(define_expand "vec_shr_<mode>"
- [(set (match_dup 3)
- (lshiftrt:V1TI
- (match_operand:VI_128 1 "register_operand")
- (match_operand:SI 2 "const_0_to_255_mul_8_operand")))
- (set (match_operand:VI_128 0 "register_operand") (match_dup 4))]
- "TARGET_SSE2"
-{
- operands[1] = gen_lowpart (V1TImode, operands[1]);
- operands[3] = gen_reg_rtx (V1TImode);
- operands[4] = gen_lowpart (<MODE>mode, operands[3]);
-})
-
-(define_insn "<sse2_avx2>_lshr<mode>3"
+(define_insn "<sse2_avx2>_<shift_insn><mode>3"
[(set (match_operand:VIMAX_AVX2 0 "register_operand" "=x,v")
- (lshiftrt:VIMAX_AVX2
+ (any_lshift:VIMAX_AVX2
(match_operand:VIMAX_AVX2 1 "register_operand" "0,v")
(match_operand:SI 2 "const_0_to_255_mul_8_operand" "n,n")))]
"TARGET_SSE2"
@@ -10856,9 +10839,9 @@ (define_insn "<sse2_avx2>_lshr<mode>3"
switch (which_alternative)
{
case 0:
- return "psrldq\t{%2, %0|%0, %2}";
+ return "p<vshift>dq\t{%2, %0|%0, %2}";
case 1:
- return "vpsrldq\t{%2, %1, %0|%0, %1, %2}";
+ return "vp<vshift>dq\t{%2, %1, %0|%0, %1, %2}";
default:
gcc_unreachable ();
}
--- gcc/testsuite/gcc.target/i386/pr82370.c.jj 2017-10-04 16:01:16.350247297 +0200
+++ gcc/testsuite/gcc.target/i386/pr82370.c 2017-10-04 16:03:06.704922288 +0200
@@ -0,0 +1,18 @@
+/* PR target/82370 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512vl -mavx512bw -masm=att" } */
+/* { dg-final { scan-assembler-times "vpslldq\[ \t]\+\\\$5, \\(%\[a-z0-9,]*\\), %xmm\[0-9]\+" 1 } } */
+/* { dg-final { scan-assembler-times "vpsrldq\[ \t]\+\\\$5, \\(%\[a-z0-9,]*\\), %xmm\[0-9]\+" 1 } } */
+/* { dg-final { scan-assembler-times "vpslldq\[ \t]\+\\\$5, \\(%\[a-z0-9,]*\\), %ymm\[0-9]\+" 1 } } */
+/* { dg-final { scan-assembler-times "vpsrldq\[ \t]\+\\\$5, \\(%\[a-z0-9,]*\\), %ymm\[0-9]\+" 1 } } */
+/* { dg-final { scan-assembler-times "vpslldq\[ \t]\+\\\$5, \\(%\[a-z0-9,]*\\), %zmm\[0-9]\+" 1 } } */
+/* { dg-final { scan-assembler-times "vpsrldq\[ \t]\+\\\$5, \\(%\[a-z0-9,]*\\), %zmm\[0-9]\+" 1 } } */
+
+#include <x86intrin.h>
+
+__m512i f1 (__m512i *x) { return _mm512_bslli_epi128 (*x, 5); }
+__m512i f2 (__m512i *x) { return _mm512_bsrli_epi128 (*x, 5); }
+__m256i f3 (__m256i *x) { return _mm256_bslli_epi128 (*x, 5); }
+__m256i f4 (__m256i *x) { return _mm256_bsrli_epi128 (*x, 5); }
+__m128i f5 (__m128i *x) { return _mm_bslli_si128 (*x, 5); }
+__m128i f6 (__m128i *x) { return _mm_bsrli_si128 (*x, 5); }
Jakub