[i386 PATCH] PR24076: Improve V16QI/V8HI broadcast initialization

Roger Sayle roger@eyesopen.com
Sun Apr 16 02:19:00 GMT 2006


The following patch is the next in the series to improve x86 vector
initialization.  This patch resolves PR target/24076 by using more
efficient initialization sequences with SSE2.  As mentioned in the
bugzilla PR, RTH rejected Dale's original patch, suggesting that the
correct approach was to fix ix86_expand_vector_init.  Andrew Pinski
then proposed such a solution, but it was never followed up.  The
patch below improves on these previous efforts, and also adds support
for V8HImode.

Currently for the new vecinit-3.c test case we generate:

f:      movzbl  a(%rip), %eax
        movl    %eax, %edx
        sall    $8, %edx
        orl     %eax, %edx
        movzwl  %dx, %edx
        movl    %edx, %eax
        sall    $16, %eax
        orl     %edx, %eax
        movd    %rax, %xmm1
        movq    %rax, -8(%rsp)
        pshufd  $0, %xmm1, %xmm0
        ret

with the patch below we now generate:

f:	movzbl  a(%rip), %eax
        movd    %eax, %xmm0
        punpcklbw       %xmm0, %xmm0
        punpcklbw       %xmm0, %xmm0
        pshufd  $0, %xmm0, %xmm0
        ret


The following patch has been tested on x86_64-unknown-linux-gnu with
a full "make bootstrap", all default languages and regression tested
with a top-level "make -k check" with no new failures.  In addition
to my two new tests, I've retained Dale's two test cases, now named
sse-18.c and sse-19.c.

Ok for mainline?



2006-04-15  Roger Sayle  <roger@eyesopen.com>
	    Andrew Pinski  <pinskia@gcc.gnu.org>
	    Dale Johannesen  <dalej@apple.com>

	PR target/24076
	* config/i386/i386.c (ix86_expand_vector_init_duplicate): Add
	special case code to implement V8HImode and V16QImode with SSE2.

	* gcc.target/i386/vecinit-3.c: New testcase.
	* gcc.target/i386/vecinit-4.c: Likewise.
	* gcc.target/i386/sse-18.c: Likewise.
	* gcc.target/i386/sse-19.c: Likewise.


Index: config/i386/i386.c
===================================================================
*** config/i386/i386.c	(revision 112968)
--- config/i386/i386.c	(working copy)
*************** ix86_expand_vector_init_duplicate (bool
*** 17851,17861 ****
--- 17851,17916 ----
        wvmode = V4HImode;
        goto widen;
      case V8HImode:
+       if (TARGET_SSE2)
+ 	{
+ 	  rtx tmp1, tmp2;
+ 	  /* Extend HImode to SImode using a paradoxical SUBREG.  */
+ 	  tmp1 = gen_reg_rtx (SImode);
+ 	  emit_move_insn (tmp1, gen_lowpart (SImode, val));
+ 	  /* Insert the SImode value as low element of V4SImode vector. */
+ 	  tmp2 = gen_reg_rtx (V4SImode);
+ 	  tmp1 = gen_rtx_VEC_MERGE (V4SImode,
+ 				    gen_rtx_VEC_DUPLICATE (V4SImode, tmp1),
+ 				    CONST0_RTX (V4SImode),
+ 				    const1_rtx);
+ 	  emit_insn (gen_rtx_SET (VOIDmode, tmp2, tmp1));
+ 	  /* Cast the V4SImode vector back to a V8HImode vector.  */
+ 	  tmp1 = gen_reg_rtx (V8HImode);
+ 	  emit_move_insn (tmp1, gen_lowpart (V8HImode, tmp2));
+ 	  /* Duplicate the low short through the whole low SImode word.  */
+ 	  emit_insn (gen_sse2_punpcklwd (tmp1, tmp1, tmp1));
+ 	  /* Cast the V8HImode vector back to a V4SImode vector.  */
+ 	  tmp2 = gen_reg_rtx (V4SImode);
+ 	  emit_move_insn (tmp2, gen_lowpart (V4SImode, tmp1));
+ 	  /* Replicate the low element of the V4SImode vector.  */
+ 	  emit_insn (gen_sse2_pshufd (tmp2, tmp2, const0_rtx));
+ 	  /* Cast the V4SImode back to V8HImode, and store in target.  */
+ 	  emit_move_insn (target, gen_lowpart (V8HImode, tmp2));
+ 	  return true;
+ 	}
        smode = HImode;
        wsmode = SImode;
        wvmode = V4SImode;
        goto widen;
      case V16QImode:
+       if (TARGET_SSE2)
+ 	{
+ 	  rtx tmp1, tmp2;
+ 	  /* Extend QImode to SImode using a paradoxical SUBREG.  */
+ 	  tmp1 = gen_reg_rtx (SImode);
+ 	  emit_move_insn (tmp1, gen_lowpart (SImode, val));
+ 	  /* Insert the SImode value as low element of V4SImode vector. */
+ 	  tmp2 = gen_reg_rtx (V4SImode);
+ 	  tmp1 = gen_rtx_VEC_MERGE (V4SImode,
+ 				    gen_rtx_VEC_DUPLICATE (V4SImode, tmp1),
+ 				    CONST0_RTX (V4SImode),
+ 				    const1_rtx);
+ 	  emit_insn (gen_rtx_SET (VOIDmode, tmp2, tmp1));
+ 	  /* Cast the V4SImode vector back to a V16QImode vector.  */
+ 	  tmp1 = gen_reg_rtx (V16QImode);
+ 	  emit_move_insn (tmp1, gen_lowpart (V16QImode, tmp2));
+ 	  /* Duplicate the low byte through the whole low SImode word.  */
+ 	  emit_insn (gen_sse2_punpcklbw (tmp1, tmp1, tmp1));
+ 	  emit_insn (gen_sse2_punpcklbw (tmp1, tmp1, tmp1));
+ 	  /* Cast the V16QImode vector back to a V4SImode vector.  */
+ 	  tmp2 = gen_reg_rtx (V4SImode);
+ 	  emit_move_insn (tmp2, gen_lowpart (V4SImode, tmp1));
+ 	  /* Replicate the low element of the V4SImode vector.  */
+ 	  emit_insn (gen_sse2_pshufd (tmp2, tmp2, const0_rtx));
+ 	  /* Cast the V4SImode back to V16QImode, and store in target.  */
+ 	  emit_move_insn (target, gen_lowpart (V16QImode, tmp2));
+ 	  return true;
+ 	}
        smode = QImode;
        wsmode = HImode;
        wvmode = V8HImode;


/* { dg-do compile } */
/* { dg-options "-O2 -msse2" } */
#define vector __attribute__((vector_size(16)))

char a;
vector char f(void) { return (vector char){ a, a, a, a, a, a, a, a,
					    a, a, a, a, a, a, a, a }; }
/* { dg-final { scan-assembler-not "sall" } } */


/* { dg-do compile } */
/* { dg-options "-O2 -msse2" } */
#define vector __attribute__((vector_size(16)))

short a;
vector short f(void) { return (vector short){ a, a, a, a, a, a, a, a }; }
/* { dg-final { scan-assembler-not "sall" } } */


/* { dg-do run } */
/* { dg-options "-O3 -msse2" } */
/* Run-time check that _mm_set1_epi8 and a sixteen-argument _mm_set_epi8
   call with identical arguments yield the same vector for every value
   in the range -128..127.  */
extern void abort(void);
#include <emmintrin.h>
#include "../../gcc.dg/i386-cpuid.h"

/* Broadcast X through the dedicated set1 intrinsic.  Kept out of line so
   that X is not a compile-time constant inside the function.  */
__m128i foo (char) __attribute__((noinline));
__m128i foo (char x) {
  return _mm_set1_epi8(x);
}

/* Broadcast X by passing it as each of the sixteen elements.  Kept out of
   line for the same reason as foo.  */
__m128i bar (char) __attribute__((noinline));
__m128i bar (char x) {
  return _mm_set_epi8 (x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x);
}

/* Returns 0 when every comparison matches, or when the host lacks the
   CPU features this file is compiled for; calls abort on a mismatch.  */
int main (void) {
  int i, j;
  /* Overlay giving byte-wise access to a vector result.  */
  union u { __m128i v; char c[16]; };
  union u x, y;
  unsigned long cpu_facilities;

  cpu_facilities = i386_cpuid ();

  /* The file is built with -msse2 and foo/bar use SSE2 intrinsics, so
     SSE2 itself has to be present before any of them is executed.
     NOTE(review): relies on bit_SSE2 being defined by i386-cpuid.h
     alongside the other bit_* masks -- confirm.  */
  if ((cpu_facilities & (bit_MMX | bit_SSE | bit_SSE2 | bit_CMOV))
      != (bit_MMX | bit_SSE | bit_SSE2 | bit_CMOV))
    /* If host has no vector support, pass.  */
    return 0;

  for (i = -128; i <= 127; i++)
    {
      x.v = foo ((char)i);
      y.v = bar ((char)i);
      for (j = 0; j < 16; j++)
	if (x.c[j] != y.c[j])
	  abort ();
    }
  return 0;
}


/* { dg-do compile } */
/* { dg-options "-O3 -msse2" } */
/* { dg-final { scan-assembler "punpcklbw" } } */
/* Compile-only check that broadcasting a char into a 16-byte vector is
   expanded using punpcklbw.  */
extern void abort (void);
#include <emmintrin.h>

/* Broadcast X through the dedicated set1 intrinsic.  Kept out of line so
   that X is not a compile-time constant inside the function.  */
__m128i foo (char) __attribute__((noinline));
__m128i foo (char x) {
  return _mm_set1_epi8(x);
}

/* Broadcast X by passing it as each of the sixteen elements.  Kept out of
   line for the same reason as foo.  */
__m128i bar (char) __attribute__((noinline));
__m128i bar (char x) {
  return _mm_set_epi8 (x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x);
}

/* Compares foo and bar for every value in -128..127; calls abort on the
   first differing byte and returns 0 otherwise.  */
int main (void) {
  int i, j;
  /* Overlay giving byte-wise access to a vector result.  */
  union u { __m128i v; char c[16]; };
  union u x, y;
  for (i = -128; i <= 127; i++)
    {
      x.v = foo ((char)i);
      y.v = bar ((char)i);
      for (j = 0; j < 16; j++)
	if (x.c[j] != y.c[j])
	  abort ();
    }
  return 0;
}


Roger
--



More information about the Gcc-patches mailing list