[PATCH] Improve vec_widen_?mult_odd_*
Uros Bizjak
ubizjak@gmail.com
Sun Apr 28 08:32:00 GMT 2013
On Sat, Apr 27, 2013 at 12:20 AM, Jakub Jelinek <jakub@redhat.com> wrote:
> On
> #define N 4096
> unsigned int b[N], d[N];
>
> void
> foo (void)
> {
> int i;
> for (i = 0; i < N; i++)
> d[i] = b[i] / 3;
> }
> testcase I was looking earlier today because of the XOP issues,
> I've noticed we generate unnecessary code:
> vmovdqa .LC0(%rip), %ymm2
> ...
> vpsrlq $32, %ymm2, %ymm3
> before the loop and in the loop:
> vmovdqa b(%rax), %ymm0
> vpmuludq b(%rax), %ymm2, %ymm1
> ...
> vpsrlq $32, %ymm0, %ymm0
> vpmuludq %ymm3, %ymm0, %ymm0
> ...
> .LC0:
> .long -1431655765
> .long -1431655765
> .long -1431655765
> .long -1431655765
> .long -1431655765
> .long -1431655765
> .long -1431655765
> .long -1431655765
> The first vpsrlq and having an extra register live across the loop is not
> needed, if each pair of constants in the constant vector is equal, we can
> just use .LC0(%rip) (i.e. %ymm2 above) in both places.
>
> Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux.
>
> Apparently ix86_expand_binop_builtin wasn't prepared for NULL predicates
> (but generic code is), alternatively perhaps I could add a predicate
> that would accept nonimmediate_operand or CONSTANT_VECTOR if that is
> preferrable that way.
Yes, please add a new predicate, the pattern is much more descriptive
this way. (Without the predicate, it looks like an expander that
generates a RTX fragment, used instead of gen_RTX... sequences).
OTOH, does vector mode "general_operand" still accept scalar
immediates? The predicate, proposed above is effectively
"general_vector_operand", where only operands (including immediates)
with vector modes should be accepted. IIRC, there were some problems
with VOIDmode CONST0_RTX leaking through vector expanders, and there
is some code in ix86_expand_..._builtin that deals with this
situation.
> Also, not sure if force_reg or copy_to_mode_reg is preferrable.
I think this approach was just copied from generic code - or perhaps
historical code due to CONST0_RTX leaks, as described above (I'm away
from my keyboard, and not in the position to check this issue in the
code ATM).
> 2013-04-26 Jakub Jelinek <jakub@redhat.com>
>
> * config/i386/i386.c (ix86_expand_binop_builtin): Allow NULL
> predicate.
> (const_vector_equal_evenodd_p): New function.
> (ix86_expand_mul_widen_evenodd): Force op1 resp. op2 into register
> if they aren't nonimmediate operands. If their original values
> satisfy const_vector_equal_evenodd_p, don't shift them.
> * config/i386/sse.md (mul<mode>3): Remove predicates. For the
> SSE4.1 case force operands[{1,2}] into registers if not
> nonimmediate_operand.
> (vec_widen_smult_even_v4si): Use nonimmediate_operand predicates
> instead of register_operand.
> (vec_widen_<s>mult_odd_<mode>): Remove predicates.
>
> --- gcc/config/i386/i386.c.jj 2013-04-26 15:11:37.000000000 +0200
> +++ gcc/config/i386/i386.c 2013-04-26 19:03:54.777293448 +0200
> @@ -30149,9 +30150,11 @@ ix86_expand_binop_builtin (enum insn_cod
> op1 = gen_lowpart (TImode, x);
> }
>
> - if (!insn_data[icode].operand[1].predicate (op0, mode0))
> + if (insn_data[icode].operand[1].predicate
> + && !insn_data[icode].operand[1].predicate (op0, mode0))
> op0 = copy_to_mode_reg (mode0, op0);
> - if (!insn_data[icode].operand[2].predicate (op1, mode1))
> + if (insn_data[icode].operand[2].predicate
> + && !insn_data[icode].operand[2].predicate (op1, mode1))
> op1 = copy_to_mode_reg (mode1, op1);
>
> pat = GEN_FCN (icode) (target, op0, op1);
> @@ -40826,6 +40829,24 @@ ix86_expand_vecop_qihi (enum rtx_code co
> gen_rtx_fmt_ee (code, qimode, op1, op2));
> }
>
> +/* Helper function of ix86_expand_mul_widen_evenodd. Return true
> + if op is CONST_VECTOR with all odd elements equal to their
> + preceeding element. */
> +
> +static bool
> +const_vector_equal_evenodd_p (rtx op)
> +{
> + enum machine_mode mode = GET_MODE (op);
> + int i, nunits = GET_MODE_NUNITS (mode);
> + if (GET_CODE (op) != CONST_VECTOR
> + || nunits != CONST_VECTOR_NUNITS (op))
> + return false;
> + for (i = 0; i < nunits; i += 2)
> + if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
> + return false;
> + return true;
> +}
> +
> void
> ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
> bool uns_p, bool odd_p)
> @@ -40833,6 +40854,12 @@ ix86_expand_mul_widen_evenodd (rtx dest,
> enum machine_mode mode = GET_MODE (op1);
> enum machine_mode wmode = GET_MODE (dest);
> rtx x;
> + rtx orig_op1 = op1, orig_op2 = op2;
> +
> + if (!nonimmediate_operand (op1, mode))
> + op1 = force_reg (mode, op1);
> + if (!nonimmediate_operand (op2, mode))
> + op2 = force_reg (mode, op2);
>
> /* We only play even/odd games with vectors of SImode. */
> gcc_assert (mode == V4SImode || mode == V8SImode);
> @@ -40849,10 +40876,12 @@ ix86_expand_mul_widen_evenodd (rtx dest,
> }
>
> x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
> - op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
> - x, NULL, 1, OPTAB_DIRECT);
> - op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
> - x, NULL, 1, OPTAB_DIRECT);
> + if (!const_vector_equal_evenodd_p (orig_op1))
> + op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
> + x, NULL, 1, OPTAB_DIRECT);
> + if (!const_vector_equal_evenodd_p (orig_op2))
> + op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
> + x, NULL, 1, OPTAB_DIRECT);
> op1 = gen_lowpart (mode, op1);
> op2 = gen_lowpart (mode, op2);
> }
> --- gcc/config/i386/sse.md.jj 2013-04-26 15:11:37.000000000 +0200
> +++ gcc/config/i386/sse.md 2013-04-26 18:59:03.838753277 +0200
> @@ -5631,14 +5631,16 @@ (define_insn "*sse2_pmaddwd"
> (define_expand "mul<mode>3"
> [(set (match_operand:VI4_AVX2 0 "register_operand")
> (mult:VI4_AVX2
> - (match_operand:VI4_AVX2 1 "nonimmediate_operand")
> - (match_operand:VI4_AVX2 2 "nonimmediate_operand")))]
> + (match_operand:VI4_AVX2 1)
> + (match_operand:VI4_AVX2 2)))]
> "TARGET_SSE2"
> {
> if (TARGET_SSE4_1)
> {
> - if (CONSTANT_P (operands[2]))
> - operands[2] = force_const_mem (<MODE>mode, operands[2]);
> + if (!nonimmediate_operand (operands[1], <MODE>mode))
> + operands[1] = force_reg (<MODE>mode, operands[1]);
> + if (!nonimmediate_operand (operands[2], <MODE>mode))
> + operands[2] = force_reg (<MODE>mode, operands[2]);
> ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
> }
> else
> @@ -5702,8 +5704,8 @@ (define_expand "vec_widen_<s>mult_lo_<mo
> ;; named patterns, but signed V4SI needs special help for plain SSE2.
> (define_expand "vec_widen_smult_even_v4si"
> [(match_operand:V2DI 0 "register_operand")
> - (match_operand:V4SI 1 "register_operand")
> - (match_operand:V4SI 2 "register_operand")]
> + (match_operand:V4SI 1 "nonimmediate_operand")
> + (match_operand:V4SI 2 "nonimmediate_operand")]
> "TARGET_SSE2"
> {
> ix86_expand_mul_widen_evenodd (operands[0], operands[1], operands[2],
> @@ -5714,8 +5716,8 @@ (define_expand "vec_widen_smult_even_v4s
> (define_expand "vec_widen_<s>mult_odd_<mode>"
> [(match_operand:<sseunpackmode> 0 "register_operand")
> (any_extend:<sseunpackmode>
> - (match_operand:VI4_AVX2 1 "register_operand"))
> - (match_operand:VI4_AVX2 2 "register_operand")]
> + (match_operand:VI4_AVX2 1))
> + (match_operand:VI4_AVX2 2)]
> "TARGET_SSE2"
> {
> ix86_expand_mul_widen_evenodd (operands[0], operands[1], operands[2],
>
> Jakub
More information about the Gcc-patches
mailing list