[gcc r13-1762] Lower complex type move to enable vectorization for complex type load&store.

hongtao Liu liuhongt@gcc.gnu.org
Wed Jul 20 08:06:58 GMT 2022


https://gcc.gnu.org/g:f9d4c3b45c5ed5f45c8089c990dbd4e181929c3d

commit r13-1762-gf9d4c3b45c5ed5f45c8089c990dbd4e181929c3d
Author: liuhongt <hongtao.liu@intel.com>
Date:   Tue Jul 19 17:24:52 2022 +0800

    Lower complex type move to enable vectorization for complex type load&store.
    
    2022-07-20  Richard Biener  <richard.guenther@gmail.com>
                Hongtao Liu  <hongtao.liu@intel.com>
    
    gcc/ChangeLog:
    
            PR tree-optimization/106010
            * tree-complex.cc (init_dont_simulate_again): Lower complex
            type move.
            (expand_complex_move): Also expand COMPLEX_CST for rhs.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/i386/pr106010-1a.c: New test.
            * gcc.target/i386/pr106010-1b.c: New test.
            * gcc.target/i386/pr106010-1c.c: New test.
            * gcc.target/i386/pr106010-2a.c: New test.
            * gcc.target/i386/pr106010-2b.c: New test.
            * gcc.target/i386/pr106010-2c.c: New test.
            * gcc.target/i386/pr106010-3a.c: New test.
            * gcc.target/i386/pr106010-3b.c: New test.
            * gcc.target/i386/pr106010-3c.c: New test.
            * gcc.target/i386/pr106010-4a.c: New test.
            * gcc.target/i386/pr106010-4b.c: New test.
            * gcc.target/i386/pr106010-4c.c: New test.
            * gcc.target/i386/pr106010-5a.c: New test.
            * gcc.target/i386/pr106010-5b.c: New test.
            * gcc.target/i386/pr106010-5c.c: New test.
            * gcc.target/i386/pr106010-6a.c: New test.
            * gcc.target/i386/pr106010-6b.c: New test.
            * gcc.target/i386/pr106010-6c.c: New test.
            * gcc.target/i386/pr106010-7a.c: New test.
            * gcc.target/i386/pr106010-7b.c: New test.
            * gcc.target/i386/pr106010-7c.c: New test.
            * gcc.target/i386/pr106010-8a.c: New test.
            * gcc.target/i386/pr106010-8b.c: New test.
            * gcc.target/i386/pr106010-8c.c: New test.
            * gcc.target/i386/pr106010-9a.c: New test.
            * gcc.target/i386/pr106010-9b.c: New test.
            * gcc.target/i386/pr106010-9c.c: New test.
            * gcc.target/i386/pr106010-9d.c: New test.

Diff:
---
 gcc/testsuite/gcc.target/i386/pr106010-1a.c |  58 ++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-1b.c |  63 +++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-1c.c |  41 ++++++++
 gcc/testsuite/gcc.target/i386/pr106010-2a.c |  82 +++++++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-2b.c |  62 +++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-2c.c |  47 +++++++++
 gcc/testsuite/gcc.target/i386/pr106010-3a.c |  80 ++++++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-3b.c | 126 ++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-3c.c |  69 ++++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-4a.c | 101 ++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-4b.c |  67 ++++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-4c.c |  54 ++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-5a.c | 117 +++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-5b.c |  80 ++++++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-5c.c |  62 +++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-6a.c | 115 ++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-6b.c | 157 ++++++++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-6c.c |  80 ++++++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-7a.c |  58 ++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-7b.c |  63 +++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-7c.c |  41 ++++++++
 gcc/testsuite/gcc.target/i386/pr106010-8a.c |  58 ++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-8b.c |  53 ++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-8c.c |  38 +++++++
 gcc/testsuite/gcc.target/i386/pr106010-9a.c |  89 ++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-9b.c |  90 ++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-9c.c |  90 ++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-9d.c |  92 ++++++++++++++++
 gcc/tree-complex.cc                         |   9 +-
 29 files changed, 2141 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1a.c b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
new file mode 100644
index 00000000000..b608f484934
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "vect" } } */
+
+#define N 10000
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a, _Complex double* b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b[i];
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a, _Complex float* b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b[i];
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a, _Complex long long* b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b[i];
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a, _Complex int* b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b[i];
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a, _Complex short* b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b[i];
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a, _Complex char* b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b[i];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1b.c b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
new file mode 100644
index 00000000000..0f377c3a548
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
@@ -0,0 +1,63 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+#include <string.h>
+#include "pr106010-1a.c"
+
+void
+avx_test (void)
+{
+  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
+  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
+  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
+  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
+  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
+  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
+  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
+  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
+  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
+  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
+  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
+  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
+  char* p_init = (char*) malloc (2 * N * sizeof (double));
+
+  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
+  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
+  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
+  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
+  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
+  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
+
+  for (int i = 0; i != 2 * N * sizeof (double); i++)
+    p_init[i] = i;
+
+  memcpy (pd_src, p_init, 2 * N * sizeof (double));
+  memcpy (ps_src, p_init, 2 * N * sizeof (float));
+  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
+  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
+  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
+  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
+
+  foo_pd (pd_dst, pd_src);
+  foo_ps (ps_dst, ps_src);
+  foo_epi64 (epi64_dst, epi64_src);
+  foo_epi32 (epi32_dst, epi32_src);
+  foo_epi16 (epi16_dst, epi16_src);
+  foo_epi8 (epi8_dst, epi8_src);
+  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1c.c b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
new file mode 100644
index 00000000000..f07e9fb2d3d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
@@ -0,0 +1,41 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "vect" } } */
+/* { dg-require-effective-target avx512fp16 } */
+
+#include <string.h>
+
+static void do_test (void);
+
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+#define N 10000
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a, _Complex _Float16* b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b[i];
+}
+
+static void
+do_test (void)
+{
+  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
+  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
+  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
+
+  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
+
+  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
+    p_init[i] = i;
+
+  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
+
+  foo_ph (ph_dst, ph_src);
+  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
+    __builtin_abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2a.c b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
new file mode 100644
index 00000000000..d2e2f8d4f43
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
@@ -0,0 +1,82 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "slp2" } } */
+
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a, _Complex double* __restrict b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a, _Complex float* __restrict b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a, _Complex int* __restrict b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a, _Complex short* __restrict b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+  a[4] = b[4];
+  a[5] = b[5];
+  a[6] = b[6];
+  a[7] = b[7];
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a, _Complex char* __restrict b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+  a[4] = b[4];
+  a[5] = b[5];
+  a[6] = b[6];
+  a[7] = b[7];
+  a[8] = b[8];
+  a[9] = b[9];
+  a[10] = b[10];
+  a[11] = b[11];
+  a[12] = b[12];
+  a[13] = b[13];
+  a[14] = b[14];
+  a[15] = b[15];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2b.c b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
new file mode 100644
index 00000000000..ac360752693
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
@@ -0,0 +1,62 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+#include <string.h>
+#include "pr106010-2a.c"
+
+void
+avx_test (void)  /* Run-time check that each foo_* kernel copies 32 bytes.  */
+{
+  _Complex double* pd_src = (_Complex double*) malloc (32);
+  _Complex double* pd_dst = (_Complex double*) malloc (32);
+  _Complex float* ps_src = (_Complex float*) malloc (32);
+  _Complex float* ps_dst = (_Complex float*) malloc (32);
+  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
+  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
+  _Complex int* epi32_src = (_Complex int*) malloc (32);
+  _Complex int* epi32_dst = (_Complex int*) malloc (32);
+  _Complex short* epi16_src = (_Complex short*) malloc (32);
+  _Complex short* epi16_dst = (_Complex short*) malloc (32);
+  _Complex char* epi8_src = (_Complex char*) malloc (32);
+  _Complex char* epi8_dst = (_Complex char*) malloc (32);
+  char* p = (char* ) malloc (32);
+
+  __builtin_memset (pd_dst, 0, 32);  /* Zero every destination first.  */
+  __builtin_memset (ps_dst, 0, 32);
+  __builtin_memset (epi64_dst, 0, 32);
+  __builtin_memset (epi32_dst, 0, 32);
+  __builtin_memset (epi16_dst, 0, 32);
+  __builtin_memset (epi8_dst, 0, 32);
+
+  for (int i = 0; i != 32; i++)  /* Byte pattern 0..31 shared by all sources.  */
+    p[i] = i;
+  __builtin_memcpy (pd_src, p, 32);
+  __builtin_memcpy (ps_src, p, 32);
+  __builtin_memcpy (epi64_src, p, 32);
+  __builtin_memcpy (epi32_src, p, 32);
+  __builtin_memcpy (epi16_src, p, 32);
+  __builtin_memcpy (epi8_src, p, 32);
+
+  foo_pd (pd_dst, pd_src);
+  foo_ps (ps_dst, ps_src);
+  foo_epi64 (epi64_dst, epi64_src);
+  foo_epi32 (epi32_dst, epi32_src);
+  foo_epi16 (epi16_dst, epi16_src);
+  foo_epi8 (epi8_dst, epi8_src);
+  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)  /* dst must equal src.  */
+    __builtin_abort ();
+  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi8_dst, epi8_src, 32) != 0)  /* Verify the foo_epi8 result.  */
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2c.c b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
new file mode 100644
index 00000000000..a002f209ec9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
@@ -0,0 +1,47 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
+/* { dg-require-effective-target avx512fp16 } */
+
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
+
+#include <string.h>
+
+static void do_test (void);
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+  a[4] = b[4];
+  a[5] = b[5];
+  a[6] = b[6];
+  a[7] = b[7];
+}
+
+void
+do_test (void)
+{
+  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
+  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
+  char* p = (char* ) malloc (32);
+
+   __builtin_memset (ph_dst, 0, 32);
+ 
+  for (int i = 0; i != 32; i++)
+    p[i] = i;
+  __builtin_memcpy (ph_src, p, 32);
+ 
+  foo_ph (ph_dst, ph_src);
+  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3a.c b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
new file mode 100644
index 00000000000..c1b64b56b1c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
@@ -0,0 +1,80 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 2 "slp2" } }  */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 6, 7, 4, 5 \}} 1 "slp2" } }  */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17 \}} 1 "slp2" } }  */
+
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a, _Complex double* __restrict b)
+{
+  a[0] = b[1];
+  a[1] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a, _Complex float* __restrict b)
+{
+  a[0] = b[1];
+  a[1] = b[0];
+  a[2] = b[3];
+  a[3] = b[2];
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
+{
+  a[0] = b[1];
+  a[1] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a, _Complex int* __restrict b)
+{
+  a[0] = b[3];
+  a[1] = b[2];
+  a[2] = b[1];
+  a[3] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a, _Complex short* __restrict b)
+{
+  a[0] = b[7];
+  a[1] = b[6];
+  a[2] = b[5];
+  a[3] = b[4];
+  a[4] = b[3];
+  a[5] = b[2];
+  a[6] = b[1];
+  a[7] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a, _Complex char* __restrict b)
+{
+  a[0] = b[7];
+  a[1] = b[6];
+  a[2] = b[5];
+  a[3] = b[4];
+  a[4] = b[3];
+  a[5] = b[2];
+  a[6] = b[1];
+  a[7] = b[0];
+  a[8] = b[15];
+  a[9] = b[14];
+  a[10] = b[13];
+  a[11] = b[12];
+  a[12] = b[11];
+  a[13] = b[10];
+  a[14] = b[9];
+  a[15] = b[8];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3b.c b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
new file mode 100644
index 00000000000..e4fa3f3a541
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
@@ -0,0 +1,126 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx2 } */
+
+#include "avx2-check.h"
+#include <string.h>
+#include "pr106010-3a.c"
+
+void
+avx2_test (void)
+{
+  _Complex double* pd_src = (_Complex double*) malloc (32);
+  _Complex double* pd_dst = (_Complex double*) malloc (32);
+  _Complex double* pd_exp = (_Complex double*) malloc (32);
+  _Complex float* ps_src = (_Complex float*) malloc (32);
+  _Complex float* ps_dst = (_Complex float*) malloc (32);
+  _Complex float* ps_exp = (_Complex float*) malloc (32);
+  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
+  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
+  _Complex long long* epi64_exp = (_Complex long long*) malloc (32);
+  _Complex int* epi32_src = (_Complex int*) malloc (32);
+  _Complex int* epi32_dst = (_Complex int*) malloc (32);
+  _Complex int* epi32_exp = (_Complex int*) malloc (32);
+  _Complex short* epi16_src = (_Complex short*) malloc (32);
+  _Complex short* epi16_dst = (_Complex short*) malloc (32);
+  _Complex short* epi16_exp = (_Complex short*) malloc (32);
+  _Complex char* epi8_src = (_Complex char*) malloc (32);
+  _Complex char* epi8_dst = (_Complex char*) malloc (32);
+  _Complex char* epi8_exp = (_Complex char*) malloc (32);
+  char* p = (char* ) malloc (32);
+  char* q = (char* ) malloc (32);
+
+  __builtin_memset (pd_dst, 0, 32);
+  __builtin_memset (ps_dst, 0, 32);
+  __builtin_memset (epi64_dst, 0, 32);
+  __builtin_memset (epi32_dst, 0, 32);
+  __builtin_memset (epi16_dst, 0, 32);
+  __builtin_memset (epi8_dst, 0, 32);
+
+  for (int i = 0; i != 32; i++)
+    p[i] = i;
+  __builtin_memcpy (pd_src, p, 32);
+  __builtin_memcpy (ps_src, p, 32);
+  __builtin_memcpy (epi64_src, p, 32);
+  __builtin_memcpy (epi32_src, p, 32);
+  __builtin_memcpy (epi16_src, p, 32);
+  __builtin_memcpy (epi8_src, p, 32);
+
+  for (int i = 0; i != 16; i++)
+    {
+      p[i] = i + 16;
+      p[i + 16] = i;
+    }
+  __builtin_memcpy (pd_exp, p, 32);
+  __builtin_memcpy (epi64_exp, p, 32);
+
+  for (int i = 0; i != 8; i++)
+    {
+      p[i] = i + 8;
+      p[i + 8] = i;
+      p[i + 16] = i + 24;
+      p[i + 24] = i + 16;
+      q[i] = i + 24;
+      q[i + 8] = i + 16;
+      q[i + 16] = i + 8;
+      q[i + 24] = i;
+    }
+  __builtin_memcpy (ps_exp, p, 32);
+  __builtin_memcpy (epi32_exp, q, 32);
+
+
+  for (int i = 0; i != 4; i++)
+    {
+      q[i] = i + 28;
+      q[i + 4] = i + 24;
+      q[i + 8] = i + 20;
+      q[i + 12] = i + 16;
+      q[i + 16] = i + 12;
+      q[i + 20] = i + 8;
+      q[i + 24] = i + 4;
+      q[i + 28] = i;
+    }
+  __builtin_memcpy (epi16_exp, q, 32);
+
+  for (int i = 0; i != 2; i++)
+    {
+      q[i] = i + 14;
+      q[i + 2] = i + 12;
+      q[i + 4] = i + 10;
+      q[i + 6] = i + 8;
+      q[i + 8] = i + 6;
+      q[i + 10] = i + 4;
+      q[i + 12] = i + 2;
+      q[i + 14] = i;
+      q[i + 16] = i + 30;
+      q[i + 18] = i + 28;
+      q[i + 20] = i + 26;
+      q[i + 22] = i + 24;
+      q[i + 24] = i + 22;
+      q[i + 26] = i + 20;
+      q[i + 28] = i + 18;
+      q[i + 30] = i + 16;
+    }
+  __builtin_memcpy (epi8_exp, q, 32);
+
+  foo_pd (pd_dst, pd_src);
+  foo_ps (ps_dst, ps_src);
+  foo_epi64 (epi64_dst, epi64_src);
+  foo_epi32 (epi32_dst, epi32_src);
+  foo_epi16 (epi16_dst, epi16_src);
+  foo_epi8 (epi8_dst, epi8_src);
+  if (__builtin_memcmp (pd_dst, pd_exp, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (ps_dst, ps_exp, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi64_dst, epi64_exp, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi32_dst, epi32_exp, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi16_dst, epi16_exp, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi8_dst, epi8_exp, 32) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3c.c b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
new file mode 100644
index 00000000000..5a5a3d4b992
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
@@ -0,0 +1,69 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 8, 9, 6, 7, 14, 15, 12, 13, 4, 5, 10, 11 \}} 1 "slp2" } }  */
+
+#include <string.h>
+
+static void do_test (void);
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
+{
+  a[0] = b[1];
+  a[1] = b[0];
+  a[2] = b[4];
+  a[3] = b[3];
+  a[4] = b[7];
+  a[5] = b[6];
+  a[6] = b[2];
+  a[7] = b[5];
+}
+
+void
+do_test (void)
+{
+  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
+  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
+  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (32);
+  char* p = (char* ) malloc (32);
+  char* q = (char* ) malloc (32);
+
+  __builtin_memset (ph_dst, 0, 32);
+
+  for (int i = 0; i != 32; i++)
+    p[i] = i;
+  __builtin_memcpy (ph_src, p, 32);
+
+  for (int i = 0; i != 4; i++)
+    {
+      p[i] = i + 4;
+      p[i + 4] = i;
+      p[i + 8] = i + 16;
+      p[i + 12] = i + 12;
+      p[i + 16] = i + 28;
+      p[i + 20] = i + 24;
+      p[i + 24] = i + 8;
+      p[i + 28] = i + 20;
+      q[i] = i + 28;
+      q[i + 4] = i + 24;
+      q[i + 8] = i + 20;
+      q[i + 12] = i + 16;
+      q[i + 16] = i + 12;
+      q[i + 20] = i + 8;
+      q[i + 24] = i + 4;
+      q[i + 28] = i;
+    }
+  __builtin_memcpy (ph_exp, p, 32);
+
+  foo_ph (ph_dst, ph_src);
+  if (__builtin_memcmp (ph_dst, ph_exp, 32) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4a.c b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
new file mode 100644
index 00000000000..b7b0b532bb1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
@@ -0,0 +1,101 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "slp2" } } */
+
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a,
+	_Complex double b1,
+	_Complex double b2)
+{
+  a[0] = b1;
+  a[1] = b2;
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a,
+	_Complex float b1, _Complex float b2,
+	_Complex float b3, _Complex float b4)
+{
+  a[0] = b1;
+  a[1] = b2;
+  a[2] = b3;
+  a[3] = b4;
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a,
+	   _Complex long long b1,
+	   _Complex long long b2)
+{
+  a[0] = b1;
+  a[1] = b2;
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a,
+	   _Complex int b1, _Complex int b2,
+	   _Complex int b3, _Complex int b4)
+{
+  a[0] = b1;
+  a[1] = b2;
+  a[2] = b3;
+  a[3] = b4;
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a,
+	   _Complex short b1, _Complex short b2,
+	   _Complex short b3, _Complex short b4,
+	   _Complex short b5, _Complex short b6,
+	   _Complex short b7,_Complex short b8)
+{
+  a[0] = b1;
+  a[1] = b2;
+  a[2] = b3;
+  a[3] = b4;
+  a[4] = b5;
+  a[5] = b6;
+  a[6] = b7;
+  a[7] = b8;
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a,
+	  _Complex char b1, _Complex char b2,
+	  _Complex char b3, _Complex char b4,
+	  _Complex char b5, _Complex char b6,
+	  _Complex char b7,_Complex char b8,
+	  _Complex char b9, _Complex char b10,
+	  _Complex char b11, _Complex char b12,
+	  _Complex char b13, _Complex char b14,
+	  _Complex char b15,_Complex char b16)
+{
+  a[0] = b1;
+  a[1] = b2;
+  a[2] = b3;
+  a[3] = b4;
+  a[4] = b5;
+  a[5] = b6;
+  a[6] = b7;
+  a[7] = b8;
+  a[8] = b9;
+  a[9] = b10;
+  a[10] = b11;
+  a[11] = b12;
+  a[12] = b13;
+  a[13] = b14;
+  a[14] = b15;
+  a[15] = b16;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4b.c b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
new file mode 100644
index 00000000000..e2e79508c4b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
@@ -0,0 +1,67 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+#include <string.h>
+#include "pr106010-4a.c"
+
+void
+avx_test (void)
+{
+  _Complex double* pd_src = (_Complex double*) malloc (32);
+  _Complex double* pd_dst = (_Complex double*) malloc (32);
+  _Complex float* ps_src = (_Complex float*) malloc (32);
+  _Complex float* ps_dst = (_Complex float*) malloc (32);
+  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
+  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
+  _Complex int* epi32_src = (_Complex int*) malloc (32);
+  _Complex int* epi32_dst = (_Complex int*) malloc (32);
+  _Complex short* epi16_src = (_Complex short*) malloc (32);
+  _Complex short* epi16_dst = (_Complex short*) malloc (32);
+  _Complex char* epi8_src = (_Complex char*) malloc (32);
+  _Complex char* epi8_dst = (_Complex char*) malloc (32);
+  char* p = (char* ) malloc (32);
+
+  __builtin_memset (pd_dst, 0, 32);
+  __builtin_memset (ps_dst, 0, 32);
+  __builtin_memset (epi64_dst, 0, 32);
+  __builtin_memset (epi32_dst, 0, 32);
+  __builtin_memset (epi16_dst, 0, 32);
+  __builtin_memset (epi8_dst, 0, 32);
+
+  for (int i = 0; i != 32; i++)
+    p[i] = i;
+  __builtin_memcpy (pd_src, p, 32);
+  __builtin_memcpy (ps_src, p, 32);
+  __builtin_memcpy (epi64_src, p, 32);
+  __builtin_memcpy (epi32_src, p, 32);
+  __builtin_memcpy (epi16_src, p, 32);
+  __builtin_memcpy (epi8_src, p, 32);
+
+  foo_pd (pd_dst, pd_src[0], pd_src[1]);
+  foo_ps (ps_dst, ps_src[0], ps_src[1], ps_src[2], ps_src[3]);
+  foo_epi64 (epi64_dst, epi64_src[0], epi64_src[1]);
+  foo_epi32 (epi32_dst, epi32_src[0], epi32_src[1], epi32_src[2], epi32_src[3]);
+  foo_epi16 (epi16_dst, epi16_src[0], epi16_src[1], epi16_src[2], epi16_src[3],
+	     epi16_src[4], epi16_src[5], epi16_src[6], epi16_src[7]);
+  foo_epi8 (epi8_dst, epi8_src[0], epi8_src[1], epi8_src[2], epi8_src[3],
+	    epi8_src[4], epi8_src[5], epi8_src[6], epi8_src[7],
+	    epi8_src[8], epi8_src[9], epi8_src[10], epi8_src[11],
+	    epi8_src[12], epi8_src[13], epi8_src[14], epi8_src[15]);
+
+  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi8_dst, epi8_src, 32) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4c.c b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
new file mode 100644
index 00000000000..8e02aefe3b5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
@@ -0,0 +1,54 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -fdump-tree-slp-details -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "slp2" } } */
+
+#include <string.h>
+
+static void do_test (void);
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+/* Store the eight complex arguments b1..b8 to a[0]..a[7], in order.  */
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a,
+	_Complex _Float16 b1, _Complex _Float16 b2,
+	_Complex _Float16 b3, _Complex _Float16 b4,
+	_Complex _Float16 b5, _Complex _Float16 b6,
+	_Complex _Float16 b7,_Complex _Float16 b8)
+{
+  a[0] = b1;	/* Presumably the store group the slp2 scans target.  */
+  a[1] = b2;
+  a[2] = b3;
+  a[3] = b4;
+  a[4] = b5;
+  a[5] = b6;
+  a[6] = b7;
+  a[7] = b8;
+}
+/* Run foo_ph on a 32-byte ramp and abort unless ph_dst matches ph_src.  */
+void
+do_test (void)
+{
+  /* Both buffers are 32 bytes and are indexed as elements 0..7 below.  */
+  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
+  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
+  /* Scratch bytes that give ph_src a known, non-uniform bit pattern.  */
+  char* p = (char* ) malloc (32);
+  /* Start from an all-zero destination.  */
+  __builtin_memset (ph_dst, 0, 32);
+
+  for (int i = 0; i != 32; i++)
+    p[i] = i;
+
+  __builtin_memcpy (ph_src, p, 32);
+  /* The elements are passed by value; foo_ph writes them back in order.  */
+  foo_ph (ph_dst, ph_src[0], ph_src[1], ph_src[2], ph_src[3],
+	  ph_src[4], ph_src[5], ph_src[6], ph_src[7]);
+
+  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
+    __builtin_abort ();
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5a.c b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
new file mode 100644
index 00000000000..9d4a6f9846b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
@@ -0,0 +1,117 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 4 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 4 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 4 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 4 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 4 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 4 "slp2" } } */
+/* Copy four elements of b to a with the two halves exchanged.  */
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a, _Complex double* __restrict b)
+{
+  a[0] = b[2];	/* a[0..1] <- b[2..3].  */
+  a[1] = b[3];
+  a[2] = b[0];	/* a[2..3] <- b[0..1].  */
+  a[3] = b[1];
+}
+/* Copy eight elements of b to a with the two halves exchanged.  */
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a, _Complex float* __restrict b)
+{
+  a[0] = b[4];	/* a[0..3] <- b[4..7].  */
+  a[1] = b[5];
+  a[2] = b[6];
+  a[3] = b[7];
+  a[4] = b[0];	/* a[4..7] <- b[0..3].  */
+  a[5] = b[1];
+  a[6] = b[2];
+  a[7] = b[3];
+}
+/* Copy four elements of b to a with the two halves exchanged.  */
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
+{
+  a[0] = b[2];	/* a[0..1] <- b[2..3].  */
+  a[1] = b[3];
+  a[2] = b[0];	/* a[2..3] <- b[0..1].  */
+  a[3] = b[1];
+}
+/* Copy eight elements of b to a with the two halves exchanged.  */
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a, _Complex int* __restrict b)
+{
+  a[0] = b[4];	/* a[0..3] <- b[4..7].  */
+  a[1] = b[5];
+  a[2] = b[6];
+  a[3] = b[7];
+  a[4] = b[0];	/* a[4..7] <- b[0..3].  */
+  a[5] = b[1];
+  a[6] = b[2];
+  a[7] = b[3];
+}
+/* Copy sixteen elements of b to a with the two halves exchanged.  */
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a, _Complex short* __restrict b)
+{
+  a[0] = b[8];	/* a[0..7] <- b[8..15].  */
+  a[1] = b[9];
+  a[2] = b[10];
+  a[3] = b[11];
+  a[4] = b[12];
+  a[5] = b[13];
+  a[6] = b[14];
+  a[7] = b[15];
+  a[8] = b[0];	/* a[8..15] <- b[0..7].  */
+  a[9] = b[1];
+  a[10] = b[2];
+  a[11] = b[3];
+  a[12] = b[4];
+  a[13] = b[5];
+  a[14] = b[6];
+  a[15] = b[7];
+}
+/* Copy thirty-two elements of b to a with the two halves exchanged.  */
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a, _Complex char* __restrict b)
+{
+  a[0] = b[16];	/* a[0..15] <- b[16..31].  */
+  a[1] = b[17];
+  a[2] = b[18];
+  a[3] = b[19];
+  a[4] = b[20];
+  a[5] = b[21];
+  a[6] = b[22];
+  a[7] = b[23];
+  a[8] = b[24];
+  a[9] = b[25];
+  a[10] = b[26];
+  a[11] = b[27];
+  a[12] = b[28];
+  a[13] = b[29];
+  a[14] = b[30];
+  a[15] = b[31];
+  a[16] = b[0];	/* a[16..31] <- b[0..15].  */
+  a[17] = b[1];
+  a[18] = b[2];
+  a[19] = b[3];
+  a[20] = b[4];
+  a[21] = b[5];
+  a[22] = b[6];
+  a[23] = b[7];
+  a[24] = b[8];
+  a[25] = b[9];
+  a[26] = b[10];
+  a[27] = b[11];
+  a[28] = b[12];
+  a[29] = b[13];
+  a[30] = b[14];
+  a[31] = b[15];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5b.c b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
new file mode 100644
index 00000000000..d5c6ebeb5cf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
@@ -0,0 +1,80 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+#include <string.h>
+#include "pr106010-5a.c"
+/* Check the pr106010-5a.c kernels: each must swap the 32-byte halves.  */
+void
+avx_test (void)
+{
+  char *ramp = malloc (64);
+  char *swapped = malloc (64);
+  _Complex double *pd_in = malloc (64);
+  _Complex double *pd_out = malloc (64);
+  _Complex double *pd_want = malloc (64);
+  _Complex float *ps_in = malloc (64);
+  _Complex float *ps_out = malloc (64);
+  _Complex float *ps_want = malloc (64);
+  _Complex long long *epi64_in = malloc (64);
+  _Complex long long *epi64_out = malloc (64);
+  _Complex long long *epi64_want = malloc (64);
+  _Complex int *epi32_in = malloc (64);
+  _Complex int *epi32_out = malloc (64);
+  _Complex int *epi32_want = malloc (64);
+  _Complex short *epi16_in = malloc (64);
+  _Complex short *epi16_out = malloc (64);
+  _Complex short *epi16_want = malloc (64);
+  _Complex char *epi8_in = malloc (64);
+  _Complex char *epi8_out = malloc (64);
+  _Complex char *epi8_want = malloc (64);
+  /* Zero every output buffer before the kernels run.  */
+  memset (pd_out, 0, 64);
+  memset (ps_out, 0, 64);
+  memset (epi64_out, 0, 64);
+  memset (epi32_out, 0, 64);
+  memset (epi16_out, 0, 64);
+  memset (epi8_out, 0, 64);
+  /* ramp holds the bytes 0..63; swapped holds them with the halves exchanged.  */
+  for (int k = 0; k < 64; k++)
+    {
+      ramp[k] = k;
+      swapped[k] = k < 32 ? k + 32 : k - 32;
+    }
+  memcpy (pd_in, ramp, 64);
+  memcpy (ps_in, ramp, 64);
+  memcpy (epi64_in, ramp, 64);
+  memcpy (epi32_in, ramp, 64);
+  memcpy (epi16_in, ramp, 64);
+  memcpy (epi8_in, ramp, 64);
+  /* The expected image is the same for every element type.  */
+  memcpy (pd_want, swapped, 64);
+  memcpy (ps_want, swapped, 64);
+  memcpy (epi64_want, swapped, 64);
+  memcpy (epi32_want, swapped, 64);
+  memcpy (epi16_want, swapped, 64);
+  memcpy (epi8_want, swapped, 64);
+  /* Run the kernels under test.  */
+  foo_pd (pd_out, pd_in);
+  foo_ps (ps_out, ps_in);
+  foo_epi64 (epi64_out, epi64_in);
+  foo_epi32 (epi32_out, epi32_in);
+  foo_epi16 (epi16_out, epi16_in);
+  foo_epi8 (epi8_out, epi8_in);
+  /* Any byte difference from the expected image fails the test.  */
+  if (memcmp (pd_out, pd_want, 64))
+    __builtin_abort ();
+  if (memcmp (ps_out, ps_want, 64))
+    __builtin_abort ();
+  if (memcmp (epi64_out, epi64_want, 64))
+    __builtin_abort ();
+  if (memcmp (epi32_out, epi32_want, 64))
+    __builtin_abort ();
+  if (memcmp (epi16_out, epi16_want, 64))
+    __builtin_abort ();
+  if (memcmp (epi8_out, epi8_want, 64))
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5c.c b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
new file mode 100644
index 00000000000..9ce4e6dd5c0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
@@ -0,0 +1,62 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 4 "slp2" } } */
+
+#include <string.h>
+
+static void do_test (void);
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+/* Copy sixteen elements of b to a with the two halves exchanged.  */
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
+{
+  a[0] = b[8];	/* a[0..7] <- b[8..15].  */
+  a[1] = b[9];
+  a[2] = b[10];
+  a[3] = b[11];
+  a[4] = b[12];
+  a[5] = b[13];
+  a[6] = b[14];
+  a[7] = b[15];
+  a[8] = b[0];	/* a[8..15] <- b[0..7].  */
+  a[9] = b[1];
+  a[10] = b[2];
+  a[11] = b[3];
+  a[12] = b[4];
+  a[13] = b[5];
+  a[14] = b[6];
+  a[15] = b[7];
+}
+/* Run foo_ph on a 64-byte ramp and abort unless the halves come back swapped.  */
+void
+do_test (void)
+{
+  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
+  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
+  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
+  char* p = (char* ) malloc (64);
+  char* q = (char* ) malloc (64);
+  /* Start from an all-zero destination.  */
+  __builtin_memset (ph_dst, 0, 64);
+  /* p is the input image 0..63; q is p with its 32-byte halves exchanged.  */
+  for (int i = 0; i != 64; i++)
+    {
+      p[i] = i;
+      q[i] = (i + 32) % 64;
+    }
+  __builtin_memcpy (ph_src, p, 64);
+
+  __builtin_memcpy (ph_exp, q, 64);
+
+  foo_ph (ph_dst, ph_src);
+  /* Any byte difference from the expected image fails the test.  */
+  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6a.c b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
new file mode 100644
index 00000000000..65a90d03684
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
@@ -0,0 +1,115 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 4 "slp2" } }  */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 4 "slp2" } }  */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
+/* Copy four elements of b to a in reverse order: a[i] = b[3 - i].  */
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a, _Complex double* __restrict b)
+{
+  a[0] = b[3];
+  a[1] = b[2];
+  a[2] = b[1];
+  a[3] = b[0];
+}
+/* Copy eight elements of b to a in reverse order: a[i] = b[7 - i].  */
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a, _Complex float* __restrict b)
+{
+  a[0] = b[7];
+  a[1] = b[6];
+  a[2] = b[5];
+  a[3] = b[4];
+  a[4] = b[3];
+  a[5] = b[2];
+  a[6] = b[1];
+  a[7] = b[0];
+}
+/* Copy four elements of b to a in reverse order: a[i] = b[3 - i].  */
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
+{
+  a[0] = b[3];
+  a[1] = b[2];
+  a[2] = b[1];
+  a[3] = b[0];
+}
+/* Copy eight elements of b to a in reverse order: a[i] = b[7 - i].  */
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a, _Complex int* __restrict b)
+{
+  a[0] = b[7];
+  a[1] = b[6];
+  a[2] = b[5];
+  a[3] = b[4];
+  a[4] = b[3];
+  a[5] = b[2];
+  a[6] = b[1];
+  a[7] = b[0];
+}
+/* Copy sixteen elements of b to a in reverse order: a[i] = b[15 - i].  */
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a, _Complex short* __restrict b)
+{
+  a[0] = b[15];
+  a[1] = b[14];
+  a[2] = b[13];
+  a[3] = b[12];
+  a[4] = b[11];
+  a[5] = b[10];
+  a[6] = b[9];
+  a[7] = b[8];
+  a[8] = b[7];
+  a[9] = b[6];
+  a[10] = b[5];
+  a[11] = b[4];
+  a[12] = b[3];
+  a[13] = b[2];
+  a[14] = b[1];
+  a[15] = b[0];
+}
+/* Copy thirty-two elements of b to a in reverse order: a[i] = b[31 - i].  */
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a, _Complex char* __restrict b)
+{
+  a[0] = b[31];
+  a[1] = b[30];
+  a[2] = b[29];
+  a[3] = b[28];
+  a[4] = b[27];
+  a[5] = b[26];
+  a[6] = b[25];
+  a[7] = b[24];
+  a[8] = b[23];
+  a[9] = b[22];
+  a[10] = b[21];
+  a[11] = b[20];
+  a[12] = b[19];
+  a[13] = b[18];
+  a[14] = b[17];
+  a[15] = b[16];
+  a[16] = b[15];
+  a[17] = b[14];
+  a[18] = b[13];
+  a[19] = b[12];
+  a[20] = b[11];
+  a[21] = b[10];
+  a[22] = b[9];
+  a[23] = b[8];
+  a[24] = b[7];
+  a[25] = b[6];
+  a[26] = b[5];
+  a[27] = b[4];
+  a[28] = b[3];
+  a[29] = b[2];
+  a[30] = b[1];
+  a[31] = b[0];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6b.c b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
new file mode 100644
index 00000000000..1c5bb020939
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
@@ -0,0 +1,157 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx2 } */
+
+#include "avx2-check.h"
+#include <string.h>
+#include "pr106010-6a.c"
+/* Check the pr106010-6a.c kernels: each must reverse its element order.  */
+void
+avx2_test (void)
+{
+  char *ramp = malloc (64);
+  char *rev = malloc (64);
+  _Complex double *pd_in = malloc (64);
+  _Complex double *pd_out = malloc (64);
+  _Complex double *pd_want = malloc (64);
+  _Complex float *ps_in = malloc (64);
+  _Complex float *ps_out = malloc (64);
+  _Complex float *ps_want = malloc (64);
+  _Complex long long *epi64_in = malloc (64);
+  _Complex long long *epi64_out = malloc (64);
+  _Complex long long *epi64_want = malloc (64);
+  _Complex int *epi32_in = malloc (64);
+  _Complex int *epi32_out = malloc (64);
+  _Complex int *epi32_want = malloc (64);
+  _Complex short *epi16_in = malloc (64);
+  _Complex short *epi16_out = malloc (64);
+  _Complex short *epi16_want = malloc (64);
+  _Complex char *epi8_in = malloc (64);
+  _Complex char *epi8_out = malloc (64);
+  _Complex char *epi8_want = malloc (64);
+  /* Zero every output buffer before the kernels run.  */
+  memset (pd_out, 0, 64);
+  memset (ps_out, 0, 64);
+  memset (epi64_out, 0, 64);
+  memset (epi32_out, 0, 64);
+  memset (epi16_out, 0, 64);
+  memset (epi8_out, 0, 64);
+  /* Input image shared by all element types: the bytes 0..63.  */
+  for (int k = 0; k < 64; k++)
+    ramp[k] = k;
+
+  memcpy (pd_in, ramp, 64);
+  memcpy (ps_in, ramp, 64);
+  memcpy (epi64_in, ramp, 64);
+  memcpy (epi32_in, ramp, 64);
+  memcpy (epi16_in, ramp, 64);
+  memcpy (epi8_in, ramp, 64);
+
+  /* Expected image for pd/epi64: the four 16-byte units reversed.  */
+  for (int k = 0; k < 16; k++)
+    {
+      rev[k] = 48 + k;
+      rev[16 + k] = 32 + k;
+      rev[32 + k] = 16 + k;
+      rev[48 + k] = k;
+    }
+
+  memcpy (pd_want, rev, 64);
+  memcpy (epi64_want, rev, 64);
+  /* Expected image for ps/epi32: the eight 8-byte units reversed.  */
+  for (int k = 0; k < 8; k++)
+    {
+      rev[k] = 56 + k;
+      rev[8 + k] = 48 + k;
+      rev[16 + k] = 40 + k;
+      rev[24 + k] = 32 + k;
+      rev[32 + k] = 24 + k;
+      rev[40 + k] = 16 + k;
+      rev[48 + k] = 8 + k;
+      rev[56 + k] = k;
+    }
+
+  memcpy (ps_want, rev, 64);
+  memcpy (epi32_want, rev, 64);
+  /* Expected image for epi16: the sixteen 4-byte units reversed.  */
+  for (int k = 0; k < 4; k++)
+    {
+      rev[k] = 60 + k;
+      rev[4 + k] = 56 + k;
+      rev[8 + k] = 52 + k;
+      rev[12 + k] = 48 + k;
+      rev[16 + k] = 44 + k;
+      rev[20 + k] = 40 + k;
+      rev[24 + k] = 36 + k;
+      rev[28 + k] = 32 + k;
+      rev[32 + k] = 28 + k;
+      rev[36 + k] = 24 + k;
+      rev[40 + k] = 20 + k;
+      rev[44 + k] = 16 + k;
+      rev[48 + k] = 12 + k;
+      rev[52 + k] = 8 + k;
+      rev[56 + k] = 4 + k;
+      rev[60 + k] = k;
+    }
+
+  memcpy (epi16_want, rev, 64);
+  /* Expected image for epi8: the thirty-two 2-byte units reversed.  */
+  for (int k = 0; k < 2; k++)
+    {
+      rev[k] = 62 + k;
+      rev[2 + k] = 60 + k;
+      rev[4 + k] = 58 + k;
+      rev[6 + k] = 56 + k;
+      rev[8 + k] = 54 + k;
+      rev[10 + k] = 52 + k;
+      rev[12 + k] = 50 + k;
+      rev[14 + k] = 48 + k;
+      rev[16 + k] = 46 + k;
+      rev[18 + k] = 44 + k;
+      rev[20 + k] = 42 + k;
+      rev[22 + k] = 40 + k;
+      rev[24 + k] = 38 + k;
+      rev[26 + k] = 36 + k;
+      rev[28 + k] = 34 + k;
+      rev[30 + k] = 32 + k;
+      rev[32 + k] = 30 + k;
+      rev[34 + k] = 28 + k;
+      rev[36 + k] = 26 + k;
+      rev[38 + k] = 24 + k;
+      rev[40 + k] = 22 + k;
+      rev[42 + k] = 20 + k;
+      rev[44 + k] = 18 + k;
+      rev[46 + k] = 16 + k;
+      rev[48 + k] = 14 + k;
+      rev[50 + k] = 12 + k;
+      rev[52 + k] = 10 + k;
+      rev[54 + k] = 8 + k;
+      rev[56 + k] = 6 + k;
+      rev[58 + k] = 4 + k;
+      rev[60 + k] = 2 + k;
+      rev[62 + k] = k;
+    }
+  memcpy (epi8_want, rev, 64);
+  /* Run the kernels under test.  */
+  foo_pd (pd_out, pd_in);
+  foo_ps (ps_out, ps_in);
+  foo_epi64 (epi64_out, epi64_in);
+  foo_epi32 (epi32_out, epi32_in);
+  foo_epi16 (epi16_out, epi16_in);
+  foo_epi8 (epi8_out, epi8_in);
+  /* Any byte difference from the expected image fails the test.  */
+  if (memcmp (pd_out, pd_want, 64))
+    __builtin_abort ();
+  if (memcmp (ps_out, ps_want, 64))
+    __builtin_abort ();
+  if (memcmp (epi64_out, epi64_want, 64))
+    __builtin_abort ();
+  if (memcmp (epi32_out, epi32_want, 64))
+    __builtin_abort ();
+  if (memcmp (epi16_out, epi16_want, 64))
+    __builtin_abort ();
+  if (memcmp (epi8_out, epi8_want, 64))
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6c.c b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
new file mode 100644
index 00000000000..b859d884a7f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
@@ -0,0 +1,80 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } } */
+
+#include <string.h>
+
+static void do_test (void);
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+/* Copy sixteen elements of b to a in reverse order: a[i] = b[15 - i].  */
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
+{
+  a[0] = b[15];
+  a[1] = b[14];
+  a[2] = b[13];
+  a[3] = b[12];
+  a[4] = b[11];
+  a[5] = b[10];
+  a[6] = b[9];
+  a[7] = b[8];
+  a[8] = b[7];
+  a[9] = b[6];
+  a[10] = b[5];
+  a[11] = b[4];
+  a[12] = b[3];
+  a[13] = b[2];
+  a[14] = b[1];
+  a[15] = b[0];
+}
+/* Run foo_ph on a 64-byte ramp and abort unless the units come back reversed.  */
+void
+do_test (void)
+{
+  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
+  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
+  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
+  char* p = (char* ) malloc (64);
+  char* q = (char* ) malloc (64);
+  /* Start from an all-zero destination.  */
+  __builtin_memset (ph_dst, 0, 64);
+  /* Input image: the bytes 0..63.  */
+  for (int i = 0; i != 64; i++)
+    p[i] = i;
+
+  __builtin_memcpy (ph_src, p, 64);
+  /* Expected image: the sixteen 4-byte units of the ramp in reverse order.  */
+  for (int i = 0; i != 4; i++)
+    {
+      q[i] = i + 60;
+      q[i + 4] = i + 56;
+      q[i + 8] = i + 52;
+      q[i + 12] = i + 48;
+      q[i + 16] = i + 44;
+      q[i + 20] = i + 40;
+      q[i + 24] = i + 36;
+      q[i + 28] = i + 32;
+      q[i + 32] = i + 28;
+      q[i + 36] = i + 24;
+      q[i + 40] = i + 20;
+      q[i + 44] = i + 16;
+      q[i + 48] = i + 12;
+      q[i + 52] = i + 8;
+      q[i + 56] = i + 4;
+      q[i + 60] = i;
+    }
+
+  __builtin_memcpy (ph_exp, q, 64);
+
+  foo_ph (ph_dst, ph_src);
+  /* Any byte difference from the expected image fails the test.  */
+  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7a.c b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
new file mode 100644
index 00000000000..2ea01fac927
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
+
+#define N 10000
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a, _Complex double b)
+{ /* Broadcast: store the scalar b into each of the N elements of a.  */
+  for (int i = 0; i != N; i++)
+    a[i] = b;
+}
+/* Broadcast: store the scalar b into each of the N elements of a.  */
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a, _Complex float b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b;
+}
+/* Broadcast: store the scalar b into each of the N elements of a.  */
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a, _Complex long long b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b;
+}
+/* Broadcast: store the scalar b into each of the N elements of a.  */
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a, _Complex int b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b;
+}
+/* Broadcast: store the scalar b into each of the N elements of a.  */
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a, _Complex short b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b;
+}
+/* Broadcast: store the scalar b into each of the N elements of a.  */
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a, _Complex char b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7b.c b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
new file mode 100644
index 00000000000..26482cc10f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
@@ -0,0 +1,63 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+#include <string.h>
+#include "pr106010-7a.c"
+/* Check the pr106010-7a.c loops: each must broadcast b to all N elements.  */
+void
+avx_test (void)
+{
+  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
+  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
+  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
+  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
+  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
+  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
+  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
+  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
+  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
+  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
+  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
+  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
+  char* p_init = (char*) malloc (2 * N * sizeof (double));
+  /* Clear the destinations so only the foo_* stores can make them match.  */
+  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
+  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
+  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
+  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
+  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
+  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
+  /* Period-2 byte pattern: all elements of a buffer are bitwise identical.  */
+  for (int i = 0; i != 2 * N * sizeof (double); i++)
+    p_init[i] = i % 2 + 3;
+  /* Seed the sources: they supply both the broadcast value and the reference.  */
+  memcpy (pd_src, p_init, 2 * N * sizeof (double));
+  memcpy (ps_src, p_init, 2 * N * sizeof (float));
+  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
+  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
+  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
+  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
+
+  foo_pd (pd_dst, pd_src[0]);
+  foo_ps (ps_dst, ps_src[0]);
+  foo_epi64 (epi64_dst, epi64_src[0]);
+  foo_epi32 (epi32_dst, epi32_src[0]);
+  foo_epi16 (epi16_dst, epi16_src[0]);
+  foo_epi8 (epi8_dst, epi8_src[0]);
+  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7c.c b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
new file mode 100644
index 00000000000..7f4056a5ecc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
@@ -0,0 +1,41 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
+/* { dg-require-effective-target avx512fp16 } */
+
+#include <string.h>
+
+static void do_test (void);
+
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+#define N 10000
+/* Broadcast: store the scalar b into each of the N elements of a.  */
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a, _Complex _Float16 b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b;
+}
+/* Broadcast ph_src[0] with foo_ph and abort unless ph_dst equals ph_src.  */
+static void
+do_test (void)
+{
+  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
+  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
+  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
+  /* Start from an all-zero destination.  */
+  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
+  /* Period-2 byte pattern, so the bytes repeat every two bytes.  */
+  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
+    p_init[i] = i % 2 + 3;
+  /* The source provides both the broadcast value and the reference image.  */
+  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
+
+  foo_ph (ph_dst, ph_src[0]);
+  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
+    __builtin_abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8a.c b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
new file mode 100644
index 00000000000..11054b60d30
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
+
+#define N 10000
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a)
+{ /* Store the complex constant 1.0 + 2.0i into each of the N elements.  */
+  for (int i = 0; i != N; i++)
+    a[i] = 1.0 + 2.0i;
+}
+/* Store the complex constant 1.0f + 2.0fi into each of the N elements of a.  */
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = 1.0f + 2.0fi;
+}
+/* Store the complex constant 1 + 2i into each of the N elements of a.  */
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = 1 + 2i;
+}
+/* Store the complex constant 1 + 2i into each of the N elements of a.  */
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = 1 + 2i;
+}
+/* Store the complex constant 1 + 2i into each of the N elements of a.  */
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = 1 + 2i;
+}
+/* Store the complex constant 1 + 2i into each of the N elements of a.  */
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = 1 + 2i;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8b.c b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
new file mode 100644
index 00000000000..6bb0073b691
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
@@ -0,0 +1,53 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+#include <string.h>
+#include "pr106010-8a.c"
+/* Check the pr106010-8a.c loops: every element must equal 1 + 2i.  */
+void
+avx_test (void)
+{
+  _Complex double pd_src = 1.0 + 2.0i;
+  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
+  _Complex float ps_src = 1.0 + 2.0i;
+  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
+  _Complex long long epi64_src = 1 + 2i;
+  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
+  _Complex int epi32_src = 1 + 2i;
+  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
+  _Complex short epi16_src = 1 + 2i;
+  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
+  _Complex char epi8_src = 1 + 2i;
+  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
+  /* Clear the destinations so only the foo_* stores can make them match.  */
+  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
+  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
+  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
+  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
+  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
+  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
+  /* Each kernel fills its whole N-element buffer with the constant.  */
+  foo_pd (pd_dst);
+  foo_ps (ps_dst);
+  foo_epi64 (epi64_dst);
+  foo_epi32 (epi32_dst);
+  foo_epi16 (epi16_dst);
+  foo_epi8 (epi8_dst);
+  for (int i = 0 ; i != N; i++)
+    {
+      if (pd_dst[i] != pd_src)
+	__builtin_abort ();
+      if (ps_dst[i] != ps_src)
+	__builtin_abort ();
+      if (epi64_dst[i] != epi64_src)
+	__builtin_abort ();
+      if (epi32_dst[i] != epi32_src)
+	__builtin_abort ();
+      if (epi16_dst[i] != epi16_src)
+	__builtin_abort ();
+      if (epi8_dst[i] != epi8_src)
+	__builtin_abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8c.c b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
new file mode 100644
index 00000000000..61ae131829d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
@@ -0,0 +1,38 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
+/* { dg-require-effective-target avx512fp16 } */
+
+#include <string.h>
+
+static void do_test (void);
+
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+#define N 10000
+/* Store the constant 1.0f16 + 2.0f16i into each of the N elements of a.  */
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = 1.0f16 + 2.0f16i;
+}
+/* Run foo_ph and abort unless every element equals 1.0f16 + 2.0f16i.  */
+static void
+do_test (void)
+{
+  _Complex _Float16 ph_src = 1.0f16 + 2.0f16i;
+  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
+  /* Start from an all-zero destination.  */
+  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
+  /* foo_ph writes all N elements; each one is compared by value below.  */
+  foo_ph (ph_dst);
+  for (int i = 0; i != N; i++)
+    {
+      if (ph_dst[i] != ph_src)
+	__builtin_abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-9a.c b/gcc/testsuite/gcc.target/i386/pr106010-9a.c
new file mode 100644
index 00000000000..e922f7b5400
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-9a.c
@@ -0,0 +1,89 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2 -fvect-cost-model=unlimited -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
+/* Aggregates pairing _Complex double members with trailing plain doubles.  */
+typedef struct { _Complex double c; double a1; double a2;}
+  cdf;
+typedef struct { _Complex double c; double a1; double a2; double a3; double a4;}
+  cdf2;
+typedef struct { _Complex double c1; _Complex double c2; double a1; double a2; double a3; double a4;}
+  cdf3;
+typedef struct { _Complex double c1; _Complex double c2; double a1; double a2;}
+  cdf4;
+
+#define N 100
+/* VMAT_ELEMENTWISE: copy N cdf structs from b to a, member by member.  */
+void
+__attribute__((noipa))
+foo (cdf* a, cdf* __restrict b)
+{
+   for (int i = 0; i < N; ++i)
+    {
+      a[i].c = b[i].c;
+      a[i].a1 = b[i].a1;
+      a[i].a2 = b[i].a2;
+    }
+}
+
+/* VMAT_CONTIGUOUS_PERMUTE: copy N cdf2 structs from b to a, member by member.  */
+void
+__attribute__((noipa))
+foo1 (cdf2* a, cdf2* __restrict b)
+{
+   for (int i = 0; i < N; ++i)
+    {
+      a[i].c = b[i].c;
+      a[i].a1 = b[i].a1;
+      a[i].a2 = b[i].a2;
+      a[i].a3 = b[i].a3;
+      a[i].a4 = b[i].a4;
+    }
+}
+
+/* VMAT_CONTIGUOUS: copy N cdf3 structs from b to a, member by member.  */
+void
+__attribute__((noipa))
+foo2 (cdf3* a, cdf3* __restrict b)
+{
+   for (int i = 0; i < N; ++i)
+    {
+      a[i].c1 = b[i].c1;
+      a[i].c2 = b[i].c2;
+      a[i].a1 = b[i].a1;
+      a[i].a2 = b[i].a2;
+      a[i].a3 = b[i].a3;
+      a[i].a4 = b[i].a4;
+    }
+}
+
+/* VMAT_STRIDED_SLP: copy N cdf4 structs from b to a, member by member.  */
+void
+__attribute__((noipa))
+foo3 (cdf4* a, cdf4* __restrict b)
+{
+   for (int i = 0; i < N; ++i)
+    {
+      a[i].c1 = b[i].c1;
+      a[i].c2 = b[i].c2;
+      a[i].a1 = b[i].a1;
+      a[i].a2 = b[i].a2;
+    }
+}
+
+/* VMAT_CONTIGUOUS_REVERSE: a receives the N elements of b in reverse order.  */
+void
+__attribute__((noipa))
+foo4 (_Complex double* a, _Complex double* __restrict b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b[N-i-1];
+}
+
+/* VMAT_CONTIGUOUS_DOWN: store b[0] into a[N-1] down to a[0].  */
+void
+__attribute__((noipa))
+foo5 (_Complex double* a, _Complex double* __restrict b)
+{
+  for (int i = 0; i != N; i++)
+    a[N-i-1] = b[0];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-9b.c b/gcc/testsuite/gcc.target/i386/pr106010-9b.c
new file mode 100644
index 00000000000..e220445e6e3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-9b.c
@@ -0,0 +1,90 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -msse2 -fvect-cost-model=unlimited" } */
+/* { dg-require-effective-target sse2 } */
+
+#include <string.h>
+#include "sse2-check.h"
+#include "pr106010-9a.c"
+/* Check foo .. foo5 from pr106010-9a.c; abort if any destination is wrong.  */
+static void
+sse2_test (void)
+{
+  _Complex double* pd_src = (_Complex double*) malloc (N * sizeof (_Complex double));
+  _Complex double* pd_dst = (_Complex double*) malloc (N * sizeof (_Complex double));
+  _Complex double* pd_src2 = (_Complex double*) malloc (N * sizeof (_Complex double));
+  _Complex double* pd_dst2 = (_Complex double*) malloc (N * sizeof (_Complex double));
+  cdf* cdf_src = (cdf*) malloc (N * sizeof (cdf));
+  cdf* cdf_dst = (cdf*) malloc (N * sizeof (cdf));
+  cdf2* cdf2_src = (cdf2*) malloc (N * sizeof (cdf2));
+  cdf2* cdf2_dst = (cdf2*) malloc (N * sizeof (cdf2));
+  cdf3* cdf3_src = (cdf3*) malloc (N * sizeof (cdf3));
+  cdf3* cdf3_dst = (cdf3*) malloc (N * sizeof (cdf3));
+  cdf4* cdf4_src = (cdf4*) malloc (N * sizeof (cdf4));
+  cdf4* cdf4_dst = (cdf4*) malloc (N * sizeof (cdf4));
+  /* Scratch byte buffer, sized for cdf3 (the largest element type).  */
+  char* p_init = (char*) malloc (N * sizeof (cdf3));
+  /* Clear every destination before the kernels run.  */
+  __builtin_memset (cdf_dst, 0, N * sizeof (cdf));
+  __builtin_memset (cdf2_dst, 0, N * sizeof (cdf2));
+  __builtin_memset (cdf3_dst, 0, N * sizeof (cdf3));
+  __builtin_memset (cdf4_dst, 0, N * sizeof (cdf4));
+  __builtin_memset (pd_dst, 0, N * sizeof (_Complex double));
+  __builtin_memset (pd_dst2, 0, N * sizeof (_Complex double));
+  /* First pattern: p_init[i] is i converted to char.  */
+  for (int i = 0; i != N * sizeof (cdf3); i++)
+    p_init[i] = i;
+  /* Every source below starts as a prefix of that pattern.  */
+  memcpy (cdf_src, p_init, N * sizeof (cdf));
+  memcpy (cdf2_src, p_init, N * sizeof (cdf2));
+  memcpy (cdf3_src, p_init, N * sizeof (cdf3));
+  memcpy (cdf4_src, p_init, N * sizeof (cdf4));
+  memcpy (pd_src, p_init, N * sizeof (_Complex double));
+  for (int i = 0; i != 2 * N * sizeof (double); i++)
+    p_init[i] = i % 16; /* Pattern repeats every 16 bytes.  */
+  memcpy (pd_src2, p_init, N * sizeof (_Complex double));
+  /* Run the six copy kernels.  */
+  foo (cdf_dst, cdf_src);
+  foo1 (cdf2_dst, cdf2_src);
+  foo2 (cdf3_dst, cdf3_src);
+  foo3 (cdf4_dst, cdf4_src);
+  foo4 (pd_dst, pd_src);
+  foo5 (pd_dst2, pd_src2);
+  for (int i = 0; i != N; i++) /* Chunk N-i-1 gets bytes i*16 .. i*16+15.  */
+    {
+      p_init[(N - i - 1) * 16] = i * 16;
+      p_init[(N - i - 1) * 16 + 1] = i * 16 + 1;
+      p_init[(N - i - 1) * 16 + 2] = i * 16 + 2;
+      p_init[(N - i - 1) * 16 + 3] = i * 16 + 3;
+      p_init[(N - i - 1) * 16 + 4] = i * 16 + 4;
+      p_init[(N - i - 1) * 16 + 5] = i * 16 + 5;
+      p_init[(N - i - 1) * 16 + 6] = i * 16 + 6;
+      p_init[(N - i - 1) * 16 + 7] = i * 16 + 7;
+      p_init[(N - i - 1) * 16 + 8] = i * 16 + 8;
+      p_init[(N - i - 1) * 16 + 9] = i * 16 + 9;
+      p_init[(N - i - 1) * 16 + 10] = i * 16 + 10;
+      p_init[(N - i - 1) * 16 + 11] = i * 16 + 11;
+      p_init[(N - i - 1) * 16 + 12] = i * 16 + 12;
+      p_init[(N - i - 1) * 16 + 13] = i * 16 + 13;
+      p_init[(N - i - 1) * 16 + 14] = i * 16 + 14;
+      p_init[(N - i - 1) * 16 + 15] = i * 16 + 15;
+    }
+  memcpy (pd_src, p_init, N * 16);
+  /* foo4 reverses element order, so pd_dst must equal the rebuilt pd_src.  */
+  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
+    __builtin_abort ();
+  /* foo5 stored pd_src2[0] into every element of pd_dst2.  */
+  if (__builtin_memcmp (pd_dst2, pd_src2, N * 2 * sizeof (double)) != 0)
+    __builtin_abort ();
+  /* foo .. foo3 are plain copies: each destination must equal its source.  */
+  if (__builtin_memcmp (cdf_dst, cdf_src, N * sizeof (cdf)) != 0)
+    __builtin_abort ();
+
+  if (__builtin_memcmp (cdf2_dst, cdf2_src, N * sizeof (cdf2)) != 0)
+    __builtin_abort ();
+
+  if (__builtin_memcmp (cdf3_dst, cdf3_src, N * sizeof (cdf3)) != 0)
+    __builtin_abort ();
+
+  if (__builtin_memcmp (cdf4_dst, cdf4_src, N * sizeof (cdf4)) != 0)
+    __builtin_abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-9c.c b/gcc/testsuite/gcc.target/i386/pr106010-9c.c
new file mode 100644
index 00000000000..ff51f6195b7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-9c.c
@@ -0,0 +1,90 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -mavx2 -fvect-cost-model=unlimited" } */
+/* { dg-require-effective-target avx2 } */
+
+#include <string.h>
+#include "avx2-check.h"
+#include "pr106010-9a.c"
+/* Check foo .. foo5 from pr106010-9a.c; abort if any destination is wrong.  */
+static void
+avx2_test (void)
+{
+  _Complex double* pd_src = (_Complex double*) malloc (N * sizeof (_Complex double));
+  _Complex double* pd_dst = (_Complex double*) malloc (N * sizeof (_Complex double));
+  _Complex double* pd_src2 = (_Complex double*) malloc (N * sizeof (_Complex double));
+  _Complex double* pd_dst2 = (_Complex double*) malloc (N * sizeof (_Complex double));
+  cdf* cdf_src = (cdf*) malloc (N * sizeof (cdf));
+  cdf* cdf_dst = (cdf*) malloc (N * sizeof (cdf));
+  cdf2* cdf2_src = (cdf2*) malloc (N * sizeof (cdf2));
+  cdf2* cdf2_dst = (cdf2*) malloc (N * sizeof (cdf2));
+  cdf3* cdf3_src = (cdf3*) malloc (N * sizeof (cdf3));
+  cdf3* cdf3_dst = (cdf3*) malloc (N * sizeof (cdf3));
+  cdf4* cdf4_src = (cdf4*) malloc (N * sizeof (cdf4));
+  cdf4* cdf4_dst = (cdf4*) malloc (N * sizeof (cdf4));
+  /* Scratch byte buffer, sized for cdf3 (the largest element type).  */
+  char* p_init = (char*) malloc (N * sizeof (cdf3));
+  /* Clear every destination before the kernels run.  */
+  __builtin_memset (cdf_dst, 0, N * sizeof (cdf));
+  __builtin_memset (cdf2_dst, 0, N * sizeof (cdf2));
+  __builtin_memset (cdf3_dst, 0, N * sizeof (cdf3));
+  __builtin_memset (cdf4_dst, 0, N * sizeof (cdf4));
+  __builtin_memset (pd_dst, 0, N * sizeof (_Complex double));
+  __builtin_memset (pd_dst2, 0, N * sizeof (_Complex double));
+  /* First pattern: p_init[i] is i converted to char.  */
+  for (int i = 0; i != N * sizeof (cdf3); i++)
+    p_init[i] = i;
+  /* Every source below starts as a prefix of that pattern.  */
+  memcpy (cdf_src, p_init, N * sizeof (cdf));
+  memcpy (cdf2_src, p_init, N * sizeof (cdf2));
+  memcpy (cdf3_src, p_init, N * sizeof (cdf3));
+  memcpy (cdf4_src, p_init, N * sizeof (cdf4));
+  memcpy (pd_src, p_init, N * sizeof (_Complex double));
+  for (int i = 0; i != 2 * N * sizeof (double); i++)
+    p_init[i] = i % 16; /* Pattern repeats every 16 bytes.  */
+  memcpy (pd_src2, p_init, N * sizeof (_Complex double));
+  /* Run the six copy kernels.  */
+  foo (cdf_dst, cdf_src);
+  foo1 (cdf2_dst, cdf2_src);
+  foo2 (cdf3_dst, cdf3_src);
+  foo3 (cdf4_dst, cdf4_src);
+  foo4 (pd_dst, pd_src);
+  foo5 (pd_dst2, pd_src2);
+  for (int i = 0; i != N; i++) /* Chunk N-i-1 gets bytes i*16 .. i*16+15.  */
+    {
+      p_init[(N - i - 1) * 16] = i * 16;
+      p_init[(N - i - 1) * 16 + 1] = i * 16 + 1;
+      p_init[(N - i - 1) * 16 + 2] = i * 16 + 2;
+      p_init[(N - i - 1) * 16 + 3] = i * 16 + 3;
+      p_init[(N - i - 1) * 16 + 4] = i * 16 + 4;
+      p_init[(N - i - 1) * 16 + 5] = i * 16 + 5;
+      p_init[(N - i - 1) * 16 + 6] = i * 16 + 6;
+      p_init[(N - i - 1) * 16 + 7] = i * 16 + 7;
+      p_init[(N - i - 1) * 16 + 8] = i * 16 + 8;
+      p_init[(N - i - 1) * 16 + 9] = i * 16 + 9;
+      p_init[(N - i - 1) * 16 + 10] = i * 16 + 10;
+      p_init[(N - i - 1) * 16 + 11] = i * 16 + 11;
+      p_init[(N - i - 1) * 16 + 12] = i * 16 + 12;
+      p_init[(N - i - 1) * 16 + 13] = i * 16 + 13;
+      p_init[(N - i - 1) * 16 + 14] = i * 16 + 14;
+      p_init[(N - i - 1) * 16 + 15] = i * 16 + 15;
+    }
+  memcpy (pd_src, p_init, N * 16);
+  /* foo4 reverses element order, so pd_dst must equal the rebuilt pd_src.  */
+  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
+    __builtin_abort ();
+  /* foo5 stored pd_src2[0] into every element of pd_dst2.  */
+  if (__builtin_memcmp (pd_dst2, pd_src2, N * 2 * sizeof (double)) != 0)
+    __builtin_abort ();
+  /* foo .. foo3 are plain copies: each destination must equal its source.  */
+  if (__builtin_memcmp (cdf_dst, cdf_src, N * sizeof (cdf)) != 0)
+    __builtin_abort ();
+
+  if (__builtin_memcmp (cdf2_dst, cdf2_src, N * sizeof (cdf2)) != 0)
+    __builtin_abort ();
+
+  if (__builtin_memcmp (cdf3_dst, cdf3_src, N * sizeof (cdf3)) != 0)
+    __builtin_abort ();
+
+  if (__builtin_memcmp (cdf4_dst, cdf4_src, N * sizeof (cdf4)) != 0)
+    __builtin_abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-9d.c b/gcc/testsuite/gcc.target/i386/pr106010-9d.c
new file mode 100644
index 00000000000..d4d8f1dd722
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-9d.c
@@ -0,0 +1,92 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -mavx512f -mavx512vl -fvect-cost-model=unlimited -mprefer-vector-width=512" } */
+/* { dg-require-effective-target avx512f } */
+
+#include <string.h>
+#include <stdlib.h>
+#define AVX512F
+#include "avx512-check.h"
+#include "pr106010-9a.c"
+/* Check foo .. foo5 from pr106010-9a.c; abort if any destination is wrong.  */
+static void
+test_512 (void)
+{
+  _Complex double* pd_src = (_Complex double*) malloc (N * sizeof (_Complex double));
+  _Complex double* pd_dst = (_Complex double*) malloc (N * sizeof (_Complex double));
+  _Complex double* pd_src2 = (_Complex double*) malloc (N * sizeof (_Complex double));
+  _Complex double* pd_dst2 = (_Complex double*) malloc (N * sizeof (_Complex double));
+  cdf* cdf_src = (cdf*) malloc (N * sizeof (cdf));
+  cdf* cdf_dst = (cdf*) malloc (N * sizeof (cdf));
+  cdf2* cdf2_src = (cdf2*) malloc (N * sizeof (cdf2));
+  cdf2* cdf2_dst = (cdf2*) malloc (N * sizeof (cdf2));
+  cdf3* cdf3_src = (cdf3*) malloc (N * sizeof (cdf3));
+  cdf3* cdf3_dst = (cdf3*) malloc (N * sizeof (cdf3));
+  cdf4* cdf4_src = (cdf4*) malloc (N * sizeof (cdf4));
+  cdf4* cdf4_dst = (cdf4*) malloc (N * sizeof (cdf4));
+  /* Scratch byte buffer, sized for cdf3 (the largest element type).  */
+  char* p_init = (char*) malloc (N * sizeof (cdf3));
+  /* Clear every destination before the kernels run.  */
+  __builtin_memset (cdf_dst, 0, N * sizeof (cdf));
+  __builtin_memset (cdf2_dst, 0, N * sizeof (cdf2));
+  __builtin_memset (cdf3_dst, 0, N * sizeof (cdf3));
+  __builtin_memset (cdf4_dst, 0, N * sizeof (cdf4));
+  __builtin_memset (pd_dst, 0, N * sizeof (_Complex double));
+  __builtin_memset (pd_dst2, 0, N * sizeof (_Complex double));
+  /* First pattern: p_init[i] is i converted to char.  */
+  for (int i = 0; i != N * sizeof (cdf3); i++)
+    p_init[i] = i;
+  /* Every source below starts as a prefix of that pattern.  */
+  memcpy (cdf_src, p_init, N * sizeof (cdf));
+  memcpy (cdf2_src, p_init, N * sizeof (cdf2));
+  memcpy (cdf3_src, p_init, N * sizeof (cdf3));
+  memcpy (cdf4_src, p_init, N * sizeof (cdf4));
+  memcpy (pd_src, p_init, N * sizeof (_Complex double));
+  for (int i = 0; i != 2 * N * sizeof (double); i++)
+    p_init[i] = i % 16; /* Pattern repeats every 16 bytes.  */
+  memcpy (pd_src2, p_init, N * sizeof (_Complex double));
+  /* Run the six copy kernels.  */
+  foo (cdf_dst, cdf_src);
+  foo1 (cdf2_dst, cdf2_src);
+  foo2 (cdf3_dst, cdf3_src);
+  foo3 (cdf4_dst, cdf4_src);
+  foo4 (pd_dst, pd_src);
+  foo5 (pd_dst2, pd_src2);
+  for (int i = 0; i != N; i++) /* Chunk N-i-1 gets bytes i*16 .. i*16+15.  */
+    {
+      p_init[(N - i - 1) * 16] = i * 16;
+      p_init[(N - i - 1) * 16 + 1] = i * 16 + 1;
+      p_init[(N - i - 1) * 16 + 2] = i * 16 + 2;
+      p_init[(N - i - 1) * 16 + 3] = i * 16 + 3;
+      p_init[(N - i - 1) * 16 + 4] = i * 16 + 4;
+      p_init[(N - i - 1) * 16 + 5] = i * 16 + 5;
+      p_init[(N - i - 1) * 16 + 6] = i * 16 + 6;
+      p_init[(N - i - 1) * 16 + 7] = i * 16 + 7;
+      p_init[(N - i - 1) * 16 + 8] = i * 16 + 8;
+      p_init[(N - i - 1) * 16 + 9] = i * 16 + 9;
+      p_init[(N - i - 1) * 16 + 10] = i * 16 + 10;
+      p_init[(N - i - 1) * 16 + 11] = i * 16 + 11;
+      p_init[(N - i - 1) * 16 + 12] = i * 16 + 12;
+      p_init[(N - i - 1) * 16 + 13] = i * 16 + 13;
+      p_init[(N - i - 1) * 16 + 14] = i * 16 + 14;
+      p_init[(N - i - 1) * 16 + 15] = i * 16 + 15;
+    }
+  memcpy (pd_src, p_init, N * 16);
+  /* foo4 reverses element order, so pd_dst must equal the rebuilt pd_src.  */
+  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
+    __builtin_abort ();
+  /* foo5 stored pd_src2[0] into every element of pd_dst2.  */
+  if (__builtin_memcmp (pd_dst2, pd_src2, N * 2 * sizeof (double)) != 0)
+    __builtin_abort ();
+  /* foo .. foo3 are plain copies: each destination must equal its source.  */
+  if (__builtin_memcmp (cdf_dst, cdf_src, N * sizeof (cdf)) != 0)
+    __builtin_abort ();
+
+  if (__builtin_memcmp (cdf2_dst, cdf2_src, N * sizeof (cdf2)) != 0)
+    __builtin_abort ();
+
+  if (__builtin_memcmp (cdf3_dst, cdf3_src, N * sizeof (cdf3)) != 0)
+    __builtin_abort ();
+
+  if (__builtin_memcmp (cdf4_dst, cdf4_src, N * sizeof (cdf4)) != 0)
+    __builtin_abort ();
+}
diff --git a/gcc/tree-complex.cc b/gcc/tree-complex.cc
index 61950a0f099..ea9df6114a1 100644
--- a/gcc/tree-complex.cc
+++ b/gcc/tree-complex.cc
@@ -297,6 +297,11 @@ init_dont_simulate_again (void)
 		break;
 
 	      default:
+		/* When expand_complex_move would trigger, make sure we
+		   perform lowering even when there is no actual complex
+		   operation.  This helps consistency and vectorization.  */
+		if (TREE_CODE (TREE_TYPE (gimple_op (stmt, 0))) == COMPLEX_TYPE)
+		  saw_a_complex_op = true;
 		break;
 	      }
 
@@ -869,7 +874,9 @@ expand_complex_move (gimple_stmt_iterator *gsi, tree type)
 	  update_complex_assignment (gsi, r, i);
 	}
     }
-  else if (rhs && TREE_CODE (rhs) == SSA_NAME && !TREE_SIDE_EFFECTS (lhs))
+  else if (rhs
+	   && (TREE_CODE (rhs) == SSA_NAME || TREE_CODE (rhs) == COMPLEX_CST)
+	   && !TREE_SIDE_EFFECTS (lhs))
     {
       tree x;
       gimple *t;


More information about the Gcc-cvs mailing list