[i386 PATCH] Improve vector initialization of (V4SF){0,a,0,0}

Roger Sayle roger@eyesopen.com
Mon Apr 10 17:47:00 GMT 2006


The following patch improves the functionality of ix86_expand_vector_init
to generalize the current code for constructing vectors containing one
variable non-zero element.

Consider the function f2 from the test case vecinit-1.c below:
vector float f2(void) { return (vector float){ 0.0, a, 0.0, 0.0}; }

Currently, with -O2 -msse2 on x86_64-unknown-linux-gnu we generate:

f2:     xorps   %xmm0, %xmm0
        movss   a(%rip), %xmm2
        movaps  %xmm0, %xmm1
        movss   %xmm2, %xmm0
        shufps  $225, %xmm1, %xmm0
        ret

With the patch below we now generate:

f2:     movss   a(%rip), %xmm0
        shufps  $81, %xmm0, %xmm0
        ret

I initially investigated improving the original code sequence in
the machine independent code in combine.  Unfortunately, this
exposes a number of significant deficiencies with GCC's current
rtx_cost mechanisms.  Consider the movaps, which has a latency of
six on Pentium4 machines, yet appears to be cheaper than an
xorps or pxor (both of latency two) during combine!

The patch below tackles this problem at its source in the x86 backend,
and lays some of the groundwork for resolving PRs 24073 and 24074.

The approach generalizes the API of ix86_expand_vector_init_low_nonzero
to instead become ix86_expand_vector_init_one_nonzero, where we now
additionally pass the position of the nonzero element (matching what we
currently do for ix86_expand_vector_init_one_var).  For the time being
I've only added support for V4SFmode and V4SImode, using the appropriate
shuffle instructions, but the infrastructure is now in place for the
other vector modes in follow-up patches.


The following patch has been tested on x86_64-unknown-linux-gnu with
a full "make bootstrap", all default languages, and regression tested
with a top-level "make -k check" with no new failures.

Ok for mainline?



2006-04-10  Roger Sayle  <roger@eyesopen.com>

	* config/i386/i386.c (ix86_expand_vector_init_one_nonzero): Renamed
	from ix86_expand_vector_init_low_nonzero.  Take an additional
	one_var argument indicating which element is non-zero.  Support
	one_var != 0 for V4SFmode and V4SImode by permuting the result.
	(ix86_expand_vector_init): Call ix86_expand_vector_init_one_nonzero
	with one_var instead of ix86_expand_vector_init_low_nonzero.

	* gcc.target/i386/vecinit-1.c: New test case.
	* gcc.target/i386/vecinit-2.c: Likewise.


Index: config/i386/i386.c
===================================================================
*** config/i386/i386.c	(revision 112626)
--- config/i386/i386.c	(working copy)
*************** ix86_expand_vector_init_duplicate (bool
*** 17846,17860 ****
  }

  /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
!    whose low element is VAR, and other elements are zero.  Return true
     if successful.  */

  static bool
! ix86_expand_vector_init_low_nonzero (bool mmx_ok, enum machine_mode mode,
! 				     rtx target, rtx var)
  {
    enum machine_mode vsimode;
!   rtx x;

    switch (mode)
      {
--- 17846,17861 ----
  }

  /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
!    whose ONE_VAR element is VAR, and other elements are zero.  Return true
     if successful.  */

  static bool
! ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
! 				     rtx target, rtx var, int one_var)
  {
    enum machine_mode vsimode;
!   rtx new_target;
!   rtx x, tmp;

    switch (mode)
      {
*************** ix86_expand_vector_init_low_nonzero (boo
*** 17866,17871 ****
--- 17867,17874 ----

      case V2DFmode:
      case V2DImode:
+       if (one_var != 0)
+ 	return false;
        var = force_reg (GET_MODE_INNER (mode), var);
        x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
        emit_insn (gen_rtx_SET (VOIDmode, target, x));
*************** ix86_expand_vector_init_low_nonzero (boo
*** 17873,17882 ****

      case V4SFmode:
      case V4SImode:
        var = force_reg (GET_MODE_INNER (mode), var);
        x = gen_rtx_VEC_DUPLICATE (mode, var);
        x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
!       emit_insn (gen_rtx_SET (VOIDmode, target, x));
        return true;

      case V8HImode:
--- 17876,17930 ----

      case V4SFmode:
      case V4SImode:
+       if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+ 	new_target = gen_reg_rtx (mode);
+       else
+ 	new_target = target;
        var = force_reg (GET_MODE_INNER (mode), var);
        x = gen_rtx_VEC_DUPLICATE (mode, var);
        x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
!       emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
!       if (one_var != 0)
! 	{
! 	  /* We need to shuffle the value to the correct position, so
! 	     create a new pseudo to store the intermediate result.  */
!
! 	  /* With SSE2, we can use the integer shuffle insns.  */
! 	  if (mode != V4SFmode && TARGET_SSE2)
! 	    {
! 	      emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
! 					    GEN_INT (1),
! 					    GEN_INT (one_var == 1 ? 0 : 1),
! 					    GEN_INT (one_var == 2 ? 0 : 1),
! 					    GEN_INT (one_var == 3 ? 0 : 1)));
! 	      if (target != new_target)
! 		emit_move_insn (target, new_target);
! 	      return true;
! 	    }
!
! 	  /* Otherwise convert the intermediate result to V4SFmode and
! 	     use the SSE1 shuffle instructions.  */
! 	  if (mode != V4SFmode)
! 	    {
! 	      tmp = gen_reg_rtx (V4SFmode);
! 	      emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
! 	    }
! 	  else
! 	    tmp = new_target;
!
! 	  emit_insn (gen_sse_shufps_1 (tmp, tmp, tmp,
! 				       GEN_INT (1),
! 				       GEN_INT (one_var == 1 ? 0 : 1),
! 				       GEN_INT (one_var == 2 ? 0+4 : 1+4),
! 				       GEN_INT (one_var == 3 ? 0+4 : 1+4)));
!
! 	  if (mode != V4SFmode)
! 	    emit_move_insn (target, gen_lowpart (V4SImode, tmp));
! 	  else if (tmp != target)
! 	    emit_move_insn (target, tmp);
! 	}
!       else if (target != new_target)
! 	emit_move_insn (target, new_target);
        return true;

      case V8HImode:
*************** ix86_expand_vector_init_low_nonzero (boo
*** 17890,17900 ****
        vsimode = V2SImode;
        goto widen;
      widen:
        /* Zero extend the variable element to SImode and recurse.  */
        var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);

        x = gen_reg_rtx (vsimode);
!       if (!ix86_expand_vector_init_low_nonzero (mmx_ok, vsimode, x, var))
  	gcc_unreachable ();

        emit_move_insn (target, gen_lowpart (mode, x));
--- 17938,17952 ----
        vsimode = V2SImode;
        goto widen;
      widen:
+       if (one_var != 0)
+ 	return false;
+
        /* Zero extend the variable element to SImode and recurse.  */
        var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);

        x = gen_reg_rtx (vsimode);
!       if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
! 						var, one_var))
  	gcc_unreachable ();

        emit_move_insn (target, gen_lowpart (mode, x));
*************** ix86_expand_vector_init (bool mmx_ok, rt
*** 18151,18159 ****
       the pool and overwritten via move later.  */
    if (n_var == 1)
      {
!       if (all_const_zero && one_var == 0
! 	  && ix86_expand_vector_init_low_nonzero (mmx_ok, mode, target,
! 						  XVECEXP (vals, 0, 0)))
  	return;

        if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
--- 18203,18212 ----
       the pool and overwritten via move later.  */
    if (n_var == 1)
      {
!       if (all_const_zero
! 	  && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
! 						  XVECEXP (vals, 0, one_var),
! 						  one_var))
  	return;

        if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))


/* { dg-do compile } */
/* { dg-options "-O2 -msse2" } */
#define vector __attribute__((vector_size(16)))

float a;
vector float f1(void) { return (vector float){ a, 0.0, 0.0, 0.0}; }
vector float f2(void) { return (vector float){ 0.0, a, 0.0, 0.0}; }
vector float f3(void) { return (vector float){ 0.0, 0.0, a, 0.0}; }
vector float f4(void) { return (vector float){ 0.0, 0.0, 0.0, a}; }
/* { dg-final { scan-assembler-not "movaps" } } */
/* { dg-final { scan-assembler-not "xor" } } */


/* { dg-do compile } */
/* { dg-options "-O2 -msse2" } */
#define vector __attribute__((vector_size(16)))

int a;
vector int f1(void) { return (vector int){ a, 0, 0, 0}; }
vector int f2(void) { return (vector int){ 0, a, 0, 0}; }
vector int f3(void) { return (vector int){ 0, 0, a, 0}; }
vector int f4(void) { return (vector int){ 0, 0, 0, a}; }
/* { dg-final { scan-assembler-not "movaps" } } */
/* { dg-final { scan-assembler-not "xor" } } */


Roger
--



More information about the Gcc-patches mailing list