[PATCH] Improve vec_widen_?mult_odd_*

Uros Bizjak ubizjak@gmail.com
Sun Apr 28 08:32:00 GMT 2013


On Sat, Apr 27, 2013 at 12:20 AM, Jakub Jelinek <jakub@redhat.com> wrote:

> On
> #define N 4096
> unsigned int b[N], d[N];
>
> void
> foo (void)
> {
>   int i;
>   for (i = 0; i < N; i++)
>     d[i] = b[i] / 3;
> }
> testcase I was looking earlier today because of the XOP issues,
> I've noticed we generate unnecessary code:
>         vmovdqa .LC0(%rip), %ymm2
> ...
>         vpsrlq  $32, %ymm2, %ymm3
> before the loop and in the loop:
>         vmovdqa b(%rax), %ymm0
>         vpmuludq        b(%rax), %ymm2, %ymm1
> ...
>         vpsrlq  $32, %ymm0, %ymm0
>         vpmuludq        %ymm3, %ymm0, %ymm0
> ...
> .LC0:
>         .long   -1431655765
>         .long   -1431655765
>         .long   -1431655765
>         .long   -1431655765
>         .long   -1431655765
>         .long   -1431655765
>         .long   -1431655765
>         .long   -1431655765
> The first vpsrlq and having an extra register live across the loop is not
> needed, if each pair of constants in the constant vector is equal, we can
> just use .LC0(%rip) (i.e. %ymm2 above) in both places.
>
> Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux.
>
> Apparently ix86_expand_binop_builtin wasn't prepared for NULL predicates
> (but generic code is), alternatively perhaps I could add a predicate
> that would accept nonimmediate_operand or CONSTANT_VECTOR if that is
> preferrable that way.

Yes, please add a new predicate, the pattern is much more descriptive
this way. (Without the predicate, it looks like an expander that
generates a RTX fragment, used instead of gen_RTX... sequences).

OTOH, does vector mode "general_operand" still accept scalar
immediates? The predicate, proposed above is effectively
"general_vector_operand", where only operands (including immediates)
with vector modes should be accepted. IIRC, there were some problems
with VOIDmode CONST0_RTX leaking through vector expanders, and there
is some code in ix86_expand_..._builtin that deals with this
situation.

> Also, not sure if force_reg or copy_to_mode_reg is preferrable.

I think this approach was just copied from generic code - or perhaps
historical code due to CONST0_RTX leaks, as described above (I'm away
from my keyboard, and not in the position to check this issue in the
code ATM).

> 2013-04-26  Jakub Jelinek  <jakub@redhat.com>
>
>         * config/i386/i386.c (ix86_expand_binop_builtin): Allow NULL
>         predicate.
>         (const_vector_equal_evenodd_p): New function.
>         (ix86_expand_mul_widen_evenodd): Force op1 resp. op2 into register
>         if they aren't nonimmediate operands.  If their original values
>         satisfy const_vector_equal_evenodd_p, don't shift them.
>         * config/i386/sse.md (mul<mode>3): Remove predicates.  For the
>         SSE4.1 case force operands[{1,2}] into registers if not
>         nonimmediate_operand.
>         (vec_widen_smult_even_v4si): Use nonimmediate_operand predicates
>         instead of register_operand.
>         (vec_widen_<s>mult_odd_<mode>): Remove predicates.
>
> --- gcc/config/i386/i386.c.jj   2013-04-26 15:11:37.000000000 +0200
> +++ gcc/config/i386/i386.c      2013-04-26 19:03:54.777293448 +0200
> @@ -30149,9 +30150,11 @@ ix86_expand_binop_builtin (enum insn_cod
>        op1 = gen_lowpart (TImode, x);
>      }
>
> -  if (!insn_data[icode].operand[1].predicate (op0, mode0))
> +  if (insn_data[icode].operand[1].predicate
> +      && !insn_data[icode].operand[1].predicate (op0, mode0))
>      op0 = copy_to_mode_reg (mode0, op0);
> -  if (!insn_data[icode].operand[2].predicate (op1, mode1))
> +  if (insn_data[icode].operand[2].predicate
> +      && !insn_data[icode].operand[2].predicate (op1, mode1))
>      op1 = copy_to_mode_reg (mode1, op1);
>
>    pat = GEN_FCN (icode) (target, op0, op1);
> @@ -40826,6 +40829,24 @@ ix86_expand_vecop_qihi (enum rtx_code co
>                        gen_rtx_fmt_ee (code, qimode, op1, op2));
>  }
>
> +/* Helper function of ix86_expand_mul_widen_evenodd.  Return true
> +   if op is CONST_VECTOR with all odd elements equal to their
> +   preceeding element.  */
> +
> +static bool
> +const_vector_equal_evenodd_p (rtx op)
> +{
> +  enum machine_mode mode = GET_MODE (op);
> +  int i, nunits = GET_MODE_NUNITS (mode);
> +  if (GET_CODE (op) != CONST_VECTOR
> +      || nunits != CONST_VECTOR_NUNITS (op))
> +    return false;
> +  for (i = 0; i < nunits; i += 2)
> +    if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
> +      return false;
> +  return true;
> +}
> +
>  void
>  ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
>                                bool uns_p, bool odd_p)
> @@ -40833,6 +40854,12 @@ ix86_expand_mul_widen_evenodd (rtx dest,
>    enum machine_mode mode = GET_MODE (op1);
>    enum machine_mode wmode = GET_MODE (dest);
>    rtx x;
> +  rtx orig_op1 = op1, orig_op2 = op2;
> +
> +  if (!nonimmediate_operand (op1, mode))
> +    op1 = force_reg (mode, op1);
> +  if (!nonimmediate_operand (op2, mode))
> +    op2 = force_reg (mode, op2);
>
>    /* We only play even/odd games with vectors of SImode.  */
>    gcc_assert (mode == V4SImode || mode == V8SImode);
> @@ -40849,10 +40876,12 @@ ix86_expand_mul_widen_evenodd (rtx dest,
>         }
>
>        x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
> -      op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
> -                         x, NULL, 1, OPTAB_DIRECT);
> -      op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
> -                         x, NULL, 1, OPTAB_DIRECT);
> +      if (!const_vector_equal_evenodd_p (orig_op1))
> +       op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
> +                           x, NULL, 1, OPTAB_DIRECT);
> +      if (!const_vector_equal_evenodd_p (orig_op2))
> +       op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
> +                           x, NULL, 1, OPTAB_DIRECT);
>        op1 = gen_lowpart (mode, op1);
>        op2 = gen_lowpart (mode, op2);
>      }
> --- gcc/config/i386/sse.md.jj   2013-04-26 15:11:37.000000000 +0200
> +++ gcc/config/i386/sse.md      2013-04-26 18:59:03.838753277 +0200
> @@ -5631,14 +5631,16 @@ (define_insn "*sse2_pmaddwd"
>  (define_expand "mul<mode>3"
>    [(set (match_operand:VI4_AVX2 0 "register_operand")
>         (mult:VI4_AVX2
> -         (match_operand:VI4_AVX2 1 "nonimmediate_operand")
> -         (match_operand:VI4_AVX2 2 "nonimmediate_operand")))]
> +         (match_operand:VI4_AVX2 1)
> +         (match_operand:VI4_AVX2 2)))]
>    "TARGET_SSE2"
>  {
>    if (TARGET_SSE4_1)
>      {
> -      if (CONSTANT_P (operands[2]))
> -       operands[2] = force_const_mem (<MODE>mode, operands[2]);
> +      if (!nonimmediate_operand (operands[1], <MODE>mode))
> +       operands[1] = force_reg (<MODE>mode, operands[1]);
> +      if (!nonimmediate_operand (operands[2], <MODE>mode))
> +       operands[2] = force_reg (<MODE>mode, operands[2]);
>        ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
>      }
>    else
> @@ -5702,8 +5704,8 @@ (define_expand "vec_widen_<s>mult_lo_<mo
>  ;; named patterns, but signed V4SI needs special help for plain SSE2.
>  (define_expand "vec_widen_smult_even_v4si"
>    [(match_operand:V2DI 0 "register_operand")
> -   (match_operand:V4SI 1 "register_operand")
> -   (match_operand:V4SI 2 "register_operand")]
> +   (match_operand:V4SI 1 "nonimmediate_operand")
> +   (match_operand:V4SI 2 "nonimmediate_operand")]
>    "TARGET_SSE2"
>  {
>    ix86_expand_mul_widen_evenodd (operands[0], operands[1], operands[2],
> @@ -5714,8 +5716,8 @@ (define_expand "vec_widen_smult_even_v4s
>  (define_expand "vec_widen_<s>mult_odd_<mode>"
>    [(match_operand:<sseunpackmode> 0 "register_operand")
>     (any_extend:<sseunpackmode>
> -     (match_operand:VI4_AVX2 1 "register_operand"))
> -   (match_operand:VI4_AVX2 2 "register_operand")]
> +     (match_operand:VI4_AVX2 1))
> +   (match_operand:VI4_AVX2 2)]
>    "TARGET_SSE2"
>  {
>    ix86_expand_mul_widen_evenodd (operands[0], operands[1], operands[2],
>
>         Jakub



More information about the Gcc-patches mailing list