This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [RFC, RFH PATCH, i386] Fix gcc.target/i386/pr61403.c FAIL with -mavx2
- From: Jakub Jelinek <jakub at redhat dot com>
- To: Uros Bizjak <ubizjak at gmail dot com>
- Cc: Evgeny Stupachenko <evstupac at gmail dot com>, "H.J. Lu" <hjl dot tools at gmail dot com>, GCC Patches <gcc-patches at gcc dot gnu dot org>
- Date: Fri, 3 Oct 2014 13:11:21 +0200
- Subject: Re: [RFC, RFH PATCH, i386] Fix gcc.target/i386/pr61403.c FAIL with -mavx2
- Authentication-results: sourceware.org; auth=none
- References: <CAFULd4bNS96-6hfHRBXA4xUGqOrsqFkD42W1EWro2ROnL1_edQ at mail dot gmail dot com>
- Reply-to: Jakub Jelinek <jakub at redhat dot com>
On Thu, Oct 02, 2014 at 08:34:40PM +0200, Uros Bizjak wrote:
> Index: i386.c
> ===================================================================
> --- i386.c (revision 215802)
> +++ i386.c (working copy)
> @@ -43407,8 +43407,10 @@ expand_vec_perm_pblendv (struct expand_vec_perm_d
> AVX and AVX2 as they require more than 2 instructions. */
> if (d->one_operand_p)
> return false;
> - if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
> + if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
> ;
> + else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
> + ;
> else
> return false;
>
> --cut here--
>
> The comment above expand_vec_perm_pblendv claims that:
>
> /* Use the same checks as in expand_vec_perm_blend, but skipping
> AVX and AVX2 as they require more than 2 instructions. */
The comment is mostly right though, I'd say "as they sometimes require
more than 2 instructions".
> BTW: I have no access to avx2 target, so I can't test the patch with a
> runtime tests. OTOH, it doesn't ICE for "GCC_TEST_RUN_EXPENSIVE=1 make
> check-gcc RUNTESTFLAGS='--target_board=unix/-mavx2
> dg-torture.exp=vshuf*.c'".
Even the expensive testsuite has very limited coverage.
As I wanted to prove your patch will ICE, I wrote the following generator:
/* Generator of random two-operand __builtin_shuffle tests for 32-byte
   vectors.  Compile with -DTYPE=<element type>; ITYPE is the element type
   of the shuffle mask vector and defaults to TYPE (override it for
   floating-point element types, e.g. -DTYPE=float -DITYPE=int).
   The generated test source is written to stdout.  */
#include <stdio.h>
#include <stdlib.h>	/* random () -- POSIX, not ISO C.  */

#ifndef ITYPE
#define ITYPE TYPE
#endif
/* Two-level stringification so that TYPE/ITYPE are macro-expanded before
   being turned into string literals.  */
#define S2(X) #X
#define S(X) S2(X)

int
main (void)
{
  /* nelt is the number of TYPE elements in a 32-byte vector.  */
  int i, j, nelt = 32 / sizeof (TYPE);

  /* Prologue of the generated file: vector typedefs, the operands and the
     T/S helper macros (S forces __LINE__ to expand before ## pasting).  */
  printf (
    "typedef " S(TYPE) " V __attribute__ ((vector_size (32)));\n"
    "typedef " S(ITYPE) " VI __attribute__ ((vector_size (32)));\n"
    "V a, b, c;\n"
    "\n"
    "#define T(n, m...) void foo##n (void) { c = __builtin_shuffle (a, b, (VI) m); }\n"
    "#define S(n, m...) T(n, m)\n");

  /* Emit 100000 functions, one per line, each with a random mask.  */
  for (i = 0; i < 100000; i++)
    {
      printf ("S (__LINE__, { ");
      for (j = 0; j < nelt; j++)
	{
	  int k = random () & 3;
	  /* Default: identity index j.  */
	  int v = j;

	  /* When bit 0 of k is set, replace it by a random index in
	     [0, nelt) offset by 0 or by nelt depending on bit 1 of k.
	     The "& (nelt - 1)" mask relies on nelt being a power of two.  */
	  if (k & 1)
	    v = ((k & 2) ? nelt : 0) + (random () & (nelt - 1));
	  printf ("%d%s", v, j < (nelt - 1) ? ", " : " })\n");
	}
    }
  return 0;
}
which can be compiled e.g. with
-DTYPE=char
-DTYPE=short
-DTYPE=int
-DTYPE=long
-DTYPE=float -DITYPE=int
-DTYPE=double -DITYPE=long
and then in each case generates 100000 tests (sort -u on the output plus manual
fixup can decrease that, for the V4DI/V4DF cases substantially). The first one
triggered an ICE almost immediately; I've added it to vshuf-32.inc (non-expensive).
With the following updated patch all those generated testcases don't ICE
(-mavx2 for the first four, -mavx for the last two).
Also tested with
GCC_TEST_RUN_EXPENSIVE=1 make check-gcc RUNTESTFLAGS='--target_board=unix/-mavx2 dg-torture.exp=vshuf*.c'
The pr61403.c testcase can be simplified into:
/* Reduced testcase: V is a 32-byte vector of float and VI is the 32-byte
   vector of int used as the shuffle mask type.  */
typedef float V __attribute__ ((vector_size (32)));
typedef int VI __attribute__ ((vector_size (32)));
/* a and b are the shuffle operands, c receives the result.  */
V a, b, c;
/* T (n, mask) defines foo<n>, which stores the two-operand
   __builtin_shuffle of a and b under the constant 8-element mask into c.  */
#define T(n, m...) void foo##n (void) { c = __builtin_shuffle (a, b, (VI) m); }
T (0, { 0, 1, 2, 3, 4, 5, 10, 13 })
T (1, { 0, 1, 2, 3, 4, 8, 11, 14 })
T (2, { 0, 1, 2, 3, 4, 9, 12, 15 })
T (3, { 0, 13, 2, 3, 14, 5, 6, 15 })
T (4, { 0, 1, 8, 3, 4, 9, 6, 7 })
T (5, { 0, 3, 11, 0, 4, 12, 0, 5 })
T (6, { 0, 3, 6, 9, 12, 15, 0, 0 })
T (7, { 0, 8, 0, 1, 9, 0, 2, 10 })
T (8, { 10, 1, 2, 11, 4, 5, 12, 7 })
T (9, { 13, 0, 6, 14, 0, 7, 15, 0 })
T (10, { 1, 4, 7, 10, 13, 0, 0, 0 })
T (11, { 2, 5, 8, 11, 14, 0, 0, 0 })
permutations, where both your and my patch optimize
foo{0,1,2,3,4,8}.
2014-10-03 Jakub Jelinek <jakub@redhat.com>
Uros Bizjak <ubizjak@gmail.com>
PR tree-optimization/61403
* config/i386/i386.c (expand_vec_perm_pblendv): Fix a spelling
error in comment. Also optimize 256-bit vectors for AVX2
or AVX (floating vectors only), provided the first permutation
can be performed in one insn.
* gcc.dg/torture/vshuf-32.inc: Add a new test 29.
--- gcc/config/i386/i386.c.jj 2014-10-03 09:26:14.000000000 +0200
+++ gcc/config/i386/i386.c 2014-10-03 12:39:24.040185310 +0200
@@ -43422,7 +43422,7 @@ expand_vec_perm_palignr (struct expand_v
/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
the permutation using the SSE4_1 pblendv instruction. Potentially
- reduces permutaion from 2 pshufb and or to 1 pshufb and pblendv. */
+ reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */
static bool
expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
@@ -43432,11 +43432,14 @@ expand_vec_perm_pblendv (struct expand_v
enum machine_mode vmode = d->vmode;
bool ok;
- /* Use the same checks as in expand_vec_perm_blend, but skipping
- AVX and AVX2 as they require more than 2 instructions. */
+ /* Use the same checks as in expand_vec_perm_blend. */
if (d->one_operand_p)
return false;
- if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
+ if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
+ ;
+ else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
+ ;
+ else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
;
else
return false;
@@ -43458,7 +43461,7 @@ expand_vec_perm_pblendv (struct expand_v
respective lanes and 8 >= 8, but 2 not. */
if (which != 1 && which != 2)
return false;
- if (d->testing_p)
+ if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
return true;
/* First we apply one operand permutation to the part where
@@ -43474,7 +43477,12 @@ expand_vec_perm_pblendv (struct expand_v
dcopy.perm[i] = d->perm[i] & (nelt - 1);
ok = expand_vec_perm_1 (&dcopy);
- gcc_assert (ok);
+ if (GET_MODE_SIZE (vmode) != 16 && !ok)
+ return false;
+ else
+ gcc_assert (ok);
+ if (d->testing_p)
+ return true;
/* Next we put permuted elements into their positions. */
dcopy1 = *d;
--- gcc/testsuite/gcc.dg/torture/vshuf-32.inc.jj 2014-10-03 09:26:14.000000000 +0200
+++ gcc/testsuite/gcc.dg/torture/vshuf-32.inc 2014-10-03 12:34:26.780882721 +0200
@@ -28,7 +28,8 @@ T (24, 0, 3, 6, 9, 12, 15, 18, 21, 24, 2
T (25, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42) \
T (26, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52) \
T (27, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53) \
-T (28, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 0, 1, 2, 3, 4, 5, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 16, 17, 18, 19, 20, 21)
+T (28, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 0, 1, 2, 3, 4, 5, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 16, 17, 18, 19, 20, 21) \
+T (29, 0, 43, 2, 3, 57, 5, 6, 7, 8, 53, 40, 11, 12, 13, 42, 15, 16, 40, 18, 19, 20, 21, 22, 23, 24, 25, 36, 58, 36, 29, 30, 31)
#define EXPTESTS \
T (116, 13, 38, 47, 3, 17, 8, 38, 20, 59, 61, 39, 26, 7, 49, 63, 43, 57, 16, 40, 19, 4, 32, 27, 7, 52, 19, 46, 55, 36, 41, 48, 6) \
T (117, 39, 35, 59, 20, 56, 18, 58, 63, 57, 14, 2, 16, 5, 61, 35, 4, 53, 9, 52, 51, 27, 33, 61, 12, 3, 35, 36, 40, 37, 7, 45, 42) \
Jakub