vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128 ymm are 3-cycle (3 clk) instructions.
We can optimize them to vblendps/vmovaps when the immediate selects no cross-lane shuffle.
gcc/ChangeLog:
* config/i386/sse.md: Emit vblendps/vmovaps instead of
vperm{i,f}128 and vshuf{i,f}{32x4,64x2} for non-lane-crossing
immediates.
gcc/testsuite/ChangeLog:
* gcc.target/i386/avx512vl-vshuff32x4-1.c: Modify test.
* gcc.target/i386/avx512vl-vshuff64x2-1.c: Ditto.
* gcc.target/i386/avx512vl-vshufi32x4-1.c: Ditto.
* gcc.target/i386/avx512vl-vshufi64x2-1.c: Ditto.
* gcc.target/i386/opt-vperm-vshuf-1.c: New test.
* gcc.target/i386/opt-vperm-vshuf-2.c: Ditto.
* gcc.target/i386/opt-vperm-vshuf-3.c: Ditto.
mask = INTVAL (operands[3]) / 2;
mask |= (INTVAL (operands[5]) - 4) / 2 << 1;
operands[3] = GEN_INT (mask);
+ if (INTVAL (operands[3]) == 2 && !<mask_applied>)
+ return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
return "vshuf<shuffletype>64x2\t{%3, %2, %1, %0<mask_operand7>|%0<mask_operand7>, %1, %2, %3}";
}
[(set_attr "type" "sselog")
mask |= (INTVAL (operands[7]) - 8) / 4 << 1;
operands[3] = GEN_INT (mask);
+ if (INTVAL (operands[3]) == 2 && !<mask_applied>)
+ return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
+
return "vshuf<shuffletype>32x4\t{%3, %2, %1, %0<mask_operand11>|%0<mask_operand11>, %1, %2, %3}";
}
[(set_attr "type" "sselog")
(match_operand:SI 3 "const_0_to_255_operand")]
UNSPEC_VPERMTI))]
"TARGET_AVX2"
- "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ {
+ int mask = INTVAL (operands[3]);
+ if ((mask & 0xbb) == 16)
+ {
+ if (rtx_equal_p (operands[0], operands[1]))
+ return "";
+ else
+ return "vmovaps\t{%1, %0|%0, %1}";
+ }
+ if ((mask & 0xbb) == 50)
+ {
+ if (rtx_equal_p (operands[0], operands[2]))
+ return "";
+ else
+ return "vmovaps\t{%2, %0|%0, %2}";
+ }
+ if ((mask & 0xbb) == 18)
+ return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
+ if ((mask & 0xbb) == 48)
+ return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
+ return "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+ }
[(set_attr "type" "sselog")
(set_attr "prefix" "vex")
(set_attr "mode" "OI")])
&& avx_vperm2f128_parallel (operands[3], <MODE>mode)"
{
int mask = avx_vperm2f128_parallel (operands[3], <MODE>mode) - 1;
- if (mask == 0x12)
- return "vinsert<i128>\t{$0, %x2, %1, %0|%0, %1, %x2, 0}";
- if (mask == 0x20)
+ if ((mask & 0xbb) == 0x12)
+ return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
+ if ((mask & 0xbb) == 0x30)
+ return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
+ if ((mask & 0xbb) == 0x20)
return "vinsert<i128>\t{$1, %x2, %1, %0|%0, %1, %x2, 1}";
operands[3] = GEN_INT (mask);
return "vperm2<i128>\t{%3, %2, %1, %0|%0, %1, %2, %3}";
void extern
avx512vl_test (void)
{
- x = _mm256_shuffle_f32x4 (x, x, 2);
+ /* Selector 2 is now folded to vblendps; use the lane-crossing
+    selector 3 so the test still exercises vshuff32x4.  The masked
+    forms keep 2 because the fold is skipped when a mask is applied.  */
+ x = _mm256_shuffle_f32x4 (x, x, 3);
x = _mm256_mask_shuffle_f32x4 (x, m, x, x, 2);
x = _mm256_maskz_shuffle_f32x4 (m, x, x, 2);
}
void extern
avx512vl_test (void)
{
- x = _mm256_shuffle_f64x2 (x, x, 2);
+ /* Selector 2 is now folded to vblendps; use the lane-crossing
+    selector 3 so the test still exercises vshuff64x2.  The masked
+    forms keep 2 because the fold is skipped when a mask is applied.  */
+ x = _mm256_shuffle_f64x2 (x, x, 3);
x = _mm256_mask_shuffle_f64x2 (x, m, x, x, 2);
x = _mm256_maskz_shuffle_f64x2 (m, x, x, 2);
}
void extern
avx512vl_test (void)
{
- x = _mm256_shuffle_i32x4 (x, x, 2);
+ /* Selector 2 is now folded to vblendps; use the lane-crossing
+    selector 3 so the test still exercises vshufi32x4.  The masked
+    forms keep 2 because the fold is skipped when a mask is applied.  */
+ x = _mm256_shuffle_i32x4 (x, x, 3);
x = _mm256_mask_shuffle_i32x4 (x, m, x, x, 2);
x = _mm256_maskz_shuffle_i32x4 (m, x, x, 2);
}
void extern
avx512vl_test (void)
{
- x = _mm256_shuffle_i64x2 (x, x, 2);
+ /* Selector 2 is now folded to vblendps; use the lane-crossing
+    selector 3 so the test still exercises vshufi64x2.  The masked
+    forms keep 2 because the fold is skipped when a mask is applied.  */
+ x = _mm256_shuffle_i64x2 (x, x, 3);
x = _mm256_mask_shuffle_i64x2 (x, m, x, x, 2);
x = _mm256_maskz_shuffle_i64x2 (m, x, x, 2);
}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=sapphirerapids" } */
+/* { dg-final { scan-assembler-times "vmovaps" 1 } } */
+/* { dg-final { scan-assembler-times "vblendps\t\\\$15" 1 } } */
+/* { dg-final { scan-assembler-times "vblendps\t\\\$240" 5 } } */
+
+#include<x86intrin.h>
+
+/* Vpermi128/Vpermf128 */
+
+/* Selector 50 (0x32) takes both lanes from b: expect a single
+   vmovaps (the one occurrence counted above).  */
+__m256i
+perm0 (__m256i a, __m256i b)
+{
+  return _mm256_permute2x128_si256 (a, b, 50);
+}
+
+/* Selector 18 (0x12): low lane from b, high lane from a
+   -> vblendps $15.  */
+__m256i
+perm1 (__m256i a, __m256i b)
+{
+  return _mm256_permute2x128_si256 (a, b, 18);
+}
+
+/* Selector 48 (0x30): low lane from a, high lane from b
+   -> vblendps $240.  */
+__m256i
+perm2 (__m256i a, __m256i b)
+{
+  return _mm256_permute2x128_si256 (a, b, 48);
+}
+
+/* vshuf{i,f}{32x4,64x2} ymm .*/
+/* Selector 2 keeps the low lane of a and the high lane of b with no
+   lane crossing, so each shuffle below becomes vblendps $240 (the
+   remaining 4 of the 5 occurrences counted above).  */
+__m256i
+shuff0 (__m256i a, __m256i b)
+{
+  return _mm256_shuffle_i32x4(a, b, 2);
+}
+
+__m256
+shuff1 (__m256 a, __m256 b)
+{
+  return _mm256_shuffle_f32x4(a, b, 2);
+}
+
+__m256i
+shuff2 (__m256i a, __m256i b)
+{
+  return _mm256_shuffle_i64x2(a, b, 2);
+}
+
+__m256d
+shuff3 (__m256d a, __m256d b)
+{
+  return _mm256_shuffle_f64x2(a, b, 2);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=sapphirerapids" } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vblendps" } } */
+/* { dg-final { scan-assembler-not "vperm2i128" } } */
+/* { dg-final { scan-assembler-not "vperm2f128" } } */
+
+#include<x86intrin.h>
+
+/* All selectors below (16, 20, 80, 84) differ only in the imm bits 2
+   and 6, which the instruction ignores (masked off with 0xbb in the
+   insn output code).  Each therefore reduces to 16 = identity copy of
+   a, and since a and the return value share a register, no
+   instruction at all should be emitted (hence all the
+   scan-assembler-not directives).  */
+
+__m256i
+perm0 (__m256i a, __m256i b)
+{
+  return _mm256_permute2x128_si256 (a, b, 16);
+}
+
+__m256d
+perm1 (__m256d a, __m256d b)
+{
+  return _mm256_permute2f128_pd (a, b, 16);
+}
+
+__m256
+perm2 (__m256 a, __m256 b)
+{
+  return _mm256_permute2f128_ps (a, b, 16);
+}
+
+__m256i
+perm3 (__m256i a, __m256i b)
+{
+  return _mm256_permute2f128_si256 (a, b, 16);
+}
+
+__m256i
+perm4 (__m256i a, __m256i b)
+{
+  return _mm256_permute2x128_si256 (a, b, 20);
+}
+
+__m256d
+perm5 (__m256d a, __m256d b)
+{
+  return _mm256_permute2f128_pd (a, b, 20);
+}
+
+__m256i
+perm6 (__m256i a, __m256i b)
+{
+  return _mm256_permute2x128_si256 (a, b, 80);
+}
+
+__m256d
+perm7 (__m256d a, __m256d b)
+{
+  return _mm256_permute2f128_pd (a, b, 80);
+}
+
+__m256i
+perm8 (__m256i a, __m256i b)
+{
+  return _mm256_permute2x128_si256 (a, b, 84);
+}
+
+__m256d
+perm9 (__m256d a, __m256d b)
+{
+  return _mm256_permute2f128_pd (a, b, 84);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=sapphirerapids" } */
+/* { dg-final { scan-assembler-times "vmov..." 3 } } */
+/* { dg-final { scan-assembler-times "vblendps\t\\\$15" 3 } } */
+/* { dg-final { scan-assembler-times "vblendps\t\\\$240" 3 } } */
+/* { dg-final { scan-assembler-not "vperm2f128" } } */
+
+#include<x86intrin.h>
+
+/* Vpermf128 */
+/* For each element width (ps/si256/pd):
+   selector 50 takes both lanes from b -> a plain vmov (3 total),
+   selector 18 takes the low lane from b -> vblendps $15 (3 total),
+   selector 48 takes the high lane from b -> vblendps $240 (3 total);
+   vperm2f128 itself must never appear.  */
+__m256
+perm0 (__m256 a, __m256 b)
+{
+  return _mm256_permute2f128_ps (a, b, 50);
+}
+
+__m256
+perm1 (__m256 a, __m256 b)
+{
+  return _mm256_permute2f128_ps (a, b, 18);
+}
+
+__m256
+perm2 (__m256 a, __m256 b)
+{
+  return _mm256_permute2f128_ps (a, b, 48);
+}
+
+__m256i
+perm3 (__m256i a, __m256i b)
+{
+  return _mm256_permute2f128_si256 (a, b, 50);
+}
+
+__m256i
+perm4 (__m256i a, __m256i b)
+{
+  return _mm256_permute2f128_si256 (a, b, 18);
+}
+
+__m256i
+perm5 (__m256i a, __m256i b)
+{
+  return _mm256_permute2f128_si256 (a, b, 48);
+}
+
+__m256d
+perm6 (__m256d a, __m256d b)
+{
+  return _mm256_permute2f128_pd (a, b, 50);
+}
+
+__m256d
+perm7 (__m256d a, __m256d b)
+{
+  return _mm256_permute2f128_pd (a, b, 18);
+}
+
+__m256d
+perm8 (__m256d a, __m256d b)
+{
+  return _mm256_permute2f128_pd (a, b, 48);
+}