This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCH] Extend shift permutations on power of 2 cases
- From: Evgeny Stupachenko <evstupac at gmail dot com>
- To: Uros Bizjak <ubizjak at gmail dot com>
- Cc: GCC Patches <gcc-patches at gcc dot gnu dot org>
- Date: Wed, 12 Nov 2014 16:15:41 +0300
- Subject: Re: [PATCH] Extend shift permutations on power of 2 cases
- Authentication-results: sourceware.org; auth=none
- References: <CAOvf_xwSEVJfhvvKETqSOkkt3oHCZN9Ek15q=0OUOmqyv0JooA at mail dot gmail dot com> <CAFiYyc2OzjXGkT0=zc1zQJ1C4uauijtdR4RJWGjm=C22iokLvA at mail dot gmail dot com> <CAOvf_xxeRzHv9nHmXxWhjsOw9sto43a1mtd3aV_KYy_1OLGpHQ at mail dot gmail dot com>
To avoid any misunderstanding: I haven't committed this obvious fix yet.
Is it ok?
On Wed, Nov 12, 2014 at 2:15 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
> Committed r217359.
> However, it turned out that AVX2 uses vperm2i128 for the shift here
> (instead of the palignr used for SSSE3/AVX). To handle the AVX2 case we
> need to modify the test case:
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c b/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c
> index 1fbd258..020e983 100644
> --- a/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c
> +++ b/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c
> @@ -19,4 +19,4 @@ pair_mul_sum(byte *in, byte *out, int size)
> }
> }
>
> -/* { dg-final { scan-assembler "palignr" } } */
> +/* { dg-final { scan-assembler "perm2i128|palignr" } } */
>
> On Tue, Nov 11, 2014 at 5:28 PM, Richard Biener
> <richard.guenther@gmail.com> wrote:
>> On Tue, Nov 11, 2014 at 3:21 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
>>> Hi,
>>>
>>> The patch extends the shift permutation technique to power-of-2 cases
>>> (previously even/odd transformations were used unconditionally).
>>> Basically the patch just adds a loop around the code for a load group of
>>> length 2, as is done in the "vect_permute_load_chain" function.
>>>
>>> For Silvermont it reduces the insn sequence for a load group of length 4
>>> from 31 to 20 insns.
>>> Performance for the test in the patch improved by ~20%.
>>>
>>> Bootstrap passed.
>>> Make check in progress.
>>>
>>> Is it ok?
>>
>> Ok.
>>
>> Thanks,
>> Richard.
>>
>>> 2014-11-11 Evgeny Stupachenko <evstupac@gmail.com>
>>>
>>> gcc/testsuite
>>> * gcc.target/i386/pr52252-atom-1.c: New.
>>>
>>> gcc/
>>> * tree-vect-data-refs.c (vect_shift_permute_load_chain): Extend shift
>>> permutations on power of 2 cases.
>>>
>>> diff --git a/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c b/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c
>>> new file mode 100644
>>> index 0000000..1fbd258
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c
>>> @@ -0,0 +1,22 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-require-effective-target ssse3 } */
>>> +/* { dg-options "-O2 -ftree-vectorize -mssse3 -mtune=slm" } */
>>> +#define byte unsigned char
>>> +
>>> +void
>>> +pair_mul_sum(byte *in, byte *out, int size)
>>> +{
>>> + int j;
>>> + for(j = 0; j < size; j++)
>>> + {
>>> + byte a = in[0];
>>> + byte b = in[1];
>>> + byte c = in[2];
>>> + byte d = in[3];
>>> + out[0] = (byte)(a * b) + (byte)(b * c) + (byte)(c * d) + (byte)(d * a);
>>> + in += 4;
>>> + out += 1;
>>> + }
>>> +}
>>> +
>>> +/* { dg-final { scan-assembler "palignr" } } */
>>> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
>>> index 0bc0356..d2e0e93 100644
>>> --- a/gcc/tree-vect-data-refs.c
>>> +++ b/gcc/tree-vect-data-refs.c
>>> @@ -5379,8 +5379,9 @@ vect_shift_permute_load_chain (vec<tree> dr_chain,
>>> memcpy (result_chain->address (), dr_chain.address (),
>>> length * sizeof (tree));
>>>
>>> - if (length == 2 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
>>> + if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
>>> {
>>> + unsigned int j, log_length = exact_log2 (length);
>>> for (i = 0; i < nelt / 2; ++i)
>>> sel[i] = i * 2;
>>> for (i = 0; i < nelt / 2; ++i)
>>> @@ -5441,37 +5442,44 @@ vect_shift_permute_load_chain (vec<tree> dr_chain,
>>> select_mask = vect_gen_perm_mask (vectype, sel);
>>> gcc_assert (select_mask != NULL);
>>>
>>> - first_vect = dr_chain[0];
>>> - second_vect = dr_chain[1];
>>> -
>>> - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
>>> - perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>>> - first_vect, first_vect,
>>> - perm2_mask1);
>>> - vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>>> - vect[0] = data_ref;
>>> + for (i = 0; i < log_length; i++)
>>> + {
>>> + for (j = 0; j < length; j += 2)
>>> + {
>>> + first_vect = dr_chain[j];
>>> + second_vect = dr_chain[j + 1];
>>>
>>> - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
>>> - perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>>> - second_vect, second_vect,
>>> - perm2_mask2);
>>> - vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>>> - vect[1] = data_ref;
>>> + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
>>> + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>>> + first_vect, first_vect,
>>> + perm2_mask1);
>>> + vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>>> + vect[0] = data_ref;
>>>
>>> - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
>>> - perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>>> - vect[0], vect[1],
>>> - shift1_mask);
>>> - vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>>> - (*result_chain)[1] = data_ref;
>>> + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
>>> + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>>> + second_vect, second_vect,
>>> + perm2_mask2);
>>> + vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>>> + vect[1] = data_ref;
>>>
>>> - data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
>>> - perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>>> - vect[0], vect[1],
>>> - select_mask);
>>> - vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>>> - (*result_chain)[0] = data_ref;
>>> + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
>>> + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>>> + vect[0], vect[1],
>>> + shift1_mask);
>>> + vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>>> + (*result_chain)[j/2 + length/2] = data_ref;
>>>
>>> + data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
>>> + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>>> + vect[0], vect[1],
>>> + select_mask);
>>> + vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>>> + (*result_chain)[j/2] = data_ref;
>>> + }
>>> + memcpy (dr_chain.address (), result_chain->address (),
>>> + length * sizeof (tree));
>>> + }
>>> return true;
>>> }
>>> if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)