[PATCH 2/2] arm: Auto-vectorization for MVE: add pack/unpack patterns

Fri Jun 11 16:25:08 GMT 2021

Christophe Lyon <christophe.lyon@linaro.org> writes:
> Thanks for the feedback. How about v2 attached?
> Do you want me to merge neon_vec_unpack<US> and
> mve_vec_unpack<US>  and only have different assembly?
> if (TARGET_HAVE_MVE)
>   return "vmovlb.<US>%#<V_sz_elem> %q0, %q1";
> else
>   return "vmovlb.<US>%#<V_sz_elem> %q0, %q1";

I think it'd be better to keep them separate, given that they have
different attributes.

> @@ -2199,10 +2222,23 @@ (define_insn "mve_vmovnbq_<supf><mode>"
>    [(set_attr "type" "mve_move")
>  ])
>  
> +;; vmovnb pattern used by the vec_pack_trunc expander to avoid the
> +;; need for an extra register.

“to avoid the need for an uninitailized input operand” might
be clearer.

> +(define_insn "@mve_vec_pack_trunc_lo_<mode>"
> +  [
> +   (set (match_operand:<V_narrow_pack> 0 "s_register_operand" "=w")
> +	(unspec:<V_narrow_pack> [(match_operand:MVE_5 1 "s_register_operand" "w")]
> +	 VMOVNBQ_S))
> +  ]
> +  "TARGET_HAVE_MVE"
> +  "vmovnb.i%#<V_sz_elem>	%q0, %q1"
> +  [(set_attr "type" "mve_move")
> +])
> +
> […]
> diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> index 430a92ce966..3afb3e1d891 100644
> --- a/gcc/config/arm/vec-common.md
> +++ b/gcc/config/arm/vec-common.md
> @@ -632,3 +632,91 @@ (define_expand "clz<mode>2"
>    "ARM_HAVE_<MODE>_ARITH
>     && !TARGET_REALLY_IWMMXT"
>  )
> +
> +;; vmovl[tb] are not available for V4SI on MVE
> +(define_expand "vec_unpack<US>_hi_<mode>"
> +  [(match_operand:<V_unpack> 0 "register_operand")
> +   (SE:<V_unpack> (match_operand:VU 1 "register_operand"))]
> + "ARM_HAVE_<MODE>_ARITH
> +  && !TARGET_REALLY_IWMMXT
> +  && ! (<MODE>mode == V4SImode && TARGET_HAVE_MVE)
> +  && !BYTES_BIG_ENDIAN"
> +  {
> +    rtvec v = rtvec_alloc (<V_mode_nunits>/2);
> +    rtx t1;
> +    int i;
> +    for (i = 0; i < (<V_mode_nunits>/2); i++)
> +      RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits>/2) + i);
> +
> +    t1 = gen_rtx_PARALLEL (<MODE>mode, v);
> +
> +    if (TARGET_NEON)
> +      emit_insn (gen_neon_vec_unpack<US>_hi_<mode> (operands[0],
> +						    operands[1],
> +						    t1));
> +    else
> +      emit_insn (gen_mve_vec_unpack<US>_hi_<mode> (operands[0],
> +						   operands[1],
> +						   t1));
> +    DONE;
> +  }
> +)

Since the patterns are the same for both MVE and Neon (a good thing!),
it would be simpler to write this as:

(define_insn "mve_vec_unpack<US>_lo_<mode>"
  [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
	(SE:<V_unpack> (vec_select:<V_HALF>
			  (match_operand:VU 1 "register_operand" "w")
			  (match_dup 2))))]
  …

and then set operands[2] to what is t1 above.  There would then be
no need to call emit_insn & DONE, and we could use the natural
iterator (MVE_3) for the MVE patterns.

Same idea for the lo version.

> […]
> +;; vmovn[tb] are not available for V2DI on MVE
> +(define_expand "vec_pack_trunc_<mode>"
> + [(set (match_operand:<V_narrow_pack> 0 "register_operand")
> +       (vec_concat:<V_narrow_pack>
> +		(truncate:<V_narrow>
> +			(match_operand:VN 1 "register_operand"))
> +		(truncate:<V_narrow>
> +			(match_operand:VN 2 "register_operand"))))]
> + "ARM_HAVE_<MODE>_ARITH
> +  && !TARGET_REALLY_IWMMXT
> +  && ! (<MODE>mode == V2DImode && TARGET_HAVE_MVE)
> +  && !BYTES_BIG_ENDIAN"
> + {
> +   if (TARGET_NEON)
> +     {
> +       emit_insn (gen_neon_quad_vec_pack_trunc_<mode> (operands[0], operands[1],
> +						       operands[2]));
> +     }
> +   else
> +     {
> +       rtx tmpreg = gen_reg_rtx (<V_narrow_pack>mode);
> +       emit_insn (gen_mve_vec_pack_trunc_lo (<MODE>mode, tmpreg, operands[1]));
> +       emit_insn (gen_mve_vmovntq (VMOVNTQ_S, <MODE>mode,
> +       	                           tmpreg, tmpreg, operands[2]));
> +       emit_move_insn (operands[0], tmpreg);

vmovntq can assign directly to operands[0], without a separate move.

OK with those changes, thanks.

Richard

> +     }
> +   DONE;
> + }
> +)
> […]