This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.



Re: [AVX] PATCH: Add vzeroall/vzeroupper patterns


On Sun, Apr 13, 2008 at 07:09:41AM +0200, Uros Bizjak wrote:
> H.J. Lu wrote:
>
>> When foo2 is called from a function that uses SSE instructions, the upper
>> 128 bits of all AVX registers are undefined, which incurs a performance
>> penalty. So we call _mm256_zeroall to clear all AVX registers, to improve
>> performance whenever the upper 128 bits of the AVX registers may be
>> undefined.
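>>
>> (avx.c itself is not shown in this message. Reconstructing from the
>> assembly below, it is roughly of the following shape; the callee bar
>> and the zeroed second argument are guesses based on the generated code:
>>
>> typedef int __m128i __attribute__ ((__vector_size__ (16)));
>>
>> extern void bar (__m128i, __m128i);
>>
>> void foo2 (__m128i x)
>> {
>>   /* Clear all AVX registers whose upper halves may be undefined.  */
>>   __builtin_ia32_vzeroall ();
>>   bar (x, (__m128i) { 0, 0, 0, 0 });
>> }
>>
>> In the generated code below, the vpxor zeroing the second argument is
>> redundant after vzeroall.)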
>>
>> With my implementation, I got
>>
>> bash-3.2$  /export/build/gnu/gcc-avx/build-x86_64-linux/gcc/xgcc
>> -B/export/build/gnu/gcc-avx/build-x86_64-linux/gcc/ -mavx -Wall  -S
>> avx.c -O2
>> bash-3.2$ cat avx.s
>>         .file   "avx.c"
>>         .text
>>         .p2align 4,,15
>> .globl foo2
>>         .type   foo2, @function
>> foo2:
>> .LFB686:
>>         movdqa  %xmm0, -24(%rsp)
>>         vzeroall
>>         vpxor   %xmm1, %xmm1, %xmm1
>>         movdqa  -24(%rsp), %xmm0
>>         jmp     bar
>> .LFE686:
>>         .size   foo2, .-foo2
>>
>> That is, we spill the argument 'x'. But I couldn't optimize out
>>
>> vpxor   %xmm1, %xmm1, %xmm1
>>
>> when I tried an approach similar to yours. Instead, vzeroall was optimized out.
>>   
>
> I see. We need to add an unspec_volatile into the parallel, but instead of 
> clobbers we can define the real operation of the pattern (setting the 
> registers to zero).
>
> Regarding the pxor: in addition to adding the unspec_volatile, I have 
> changed my test patch so that "vzeroall" now generates zeros in V4SFmode. 
> The following testcase:
>
> --cut here--
> typedef float __m128f __attribute__ ((__vector_size__ (16)));
>
> __m128f _mm_vsetzero (int i)
> {
>  __m128f x;
>
>  __builtin_ia32_vzeroall ();
>
>  x = (__m128f) { 0.0f, 0.0f, 0.0f, 0.0f };
>  return __builtin_ia32_cvtsi2ss (x, i);
> }
> --cut here--
>
> now compiles to:
>
> .LFB2:
>        vzeroall
>        cvtsi2ss        %edi, %xmm0
>        ret
>
> So the xorps was removed by the postreload pass, since xorps and vzeroall 
> both clear xmm0 in V4SFmode.
>
> It looks to me that the postreload pass needs to be extended a bit, due to 
> the fact that const0_rtx represents zero (aka 0x0....0) in every mode. This 
> way, the postreload pass would remove the xorps in the above example even 
> when vzeroall creates V4SImode zeros.
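>
> (Purely as a sketch of the idea, not existing postreload code, and with a
> made-up helper name, a mode-independent zero test could look like:
>
>   /* Return true if X is the zero constant of its vector mode, so that
>      e.g. a V4SImode zero and a V4SFmode zero sitting in the same hard
>      register can be recognized as the same value.  */
>   static bool
>   vector_zero_p (const_rtx x)
>   {
>     return (GET_CODE (x) == CONST_VECTOR
>             && x == CONST0_RTX (GET_MODE (x)));
>   }
>
> This relies on CONST0_RTX of a vector mode being a shared rtx, so pointer
> comparison suffices.)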
>


Hi Uros,

I adjusted your patch against the AVX branch. Does it look OK?

Thanks.


H.J.
---
2008-04-13  Uros Bizjak  <ubizjak@gmail.com>
	    H.J. Lu  <hongjiu.lu@intel.com>

	* config/i386/i386.md (XMM0_REG...XMM15_REG): Removed.
	* config/i386/sse.md (avx_vzeroall_rex64): Likewise.
	* config/i386/i386.c (ix86_expand_builtin): Updated.

2008-04-13  Uros Bizjak  <ubizjak@gmail.com>

	* config/i386/predicates.md (vzeroall_operation): New.

	* config/i386/sse.md (avx_vzeroall): Rewrite to use define_expand.
	(*avx_vzeroall): New.

Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md	(revision 134221)
+++ config/i386/i386.md	(working copy)
@@ -244,24 +244,8 @@
    (FLAGS_REG			17)
    (FPSR_REG			18)
    (FPCR_REG			19)
-   (XMM0_REG			21)
-   (XMM1_REG			22)
-   (XMM2_REG			23)
-   (XMM3_REG			24)
-   (XMM4_REG			25)
-   (XMM5_REG			26)
-   (XMM6_REG			27)
-   (XMM7_REG			28)
    (R10_REG			39)
    (R11_REG			40)
-   (XMM8_REG			45)
-   (XMM9_REG			46)
-   (XMM10_REG			47)
-   (XMM11_REG			48)
-   (XMM12_REG			49)
-   (XMM13_REG			50)
-   (XMM14_REG			51)
-   (XMM15_REG			52)
   ])
 
 ;; Insns whose names begin with "x86_" are emitted by gen_FOO calls
Index: config/i386/predicates.md
===================================================================
--- config/i386/predicates.md	(revision 134221)
+++ config/i386/predicates.md	(working copy)
@@ -1073,3 +1073,15 @@
 (define_predicate "misaligned_operand"
   (and (match_code "mem")
        (match_test "MEM_ALIGN (op) < GET_MODE_ALIGNMENT (mode)")))
+
+;; Return 1 if OP is a vzeroall operation, known to be a PARALLEL.
+(define_predicate "vzeroall_operation"
+  (match_code "parallel")
+{
+  int nregs = TARGET_64BIT ? 16 : 8;
+
+  if (XVECLEN (op, 0) != nregs + 1)
+    return 0;
+
+  return 1;
+})
Index: config/i386/sse.md
===================================================================
--- config/i386/sse.md	(revision 134221)
+++ config/i386/sse.md	(working copy)
@@ -8569,45 +8569,36 @@
    (set_attr "prefix_extra" "1")
    (set_attr "mode" "TI")])
 
-(define_insn "avx_vzeroall"
-  [(unspec_volatile [(const_int 0)] UNSPECV_VZEROALL)
-   (clobber (reg:V8SI XMM0_REG))
-   (clobber (reg:V8SI XMM1_REG))
-   (clobber (reg:V8SI XMM2_REG))
-   (clobber (reg:V8SI XMM3_REG))
-   (clobber (reg:V8SI XMM4_REG))
-   (clobber (reg:V8SI XMM5_REG))
-   (clobber (reg:V8SI XMM6_REG))
-   (clobber (reg:V8SI XMM7_REG))]
-  "TARGET_AVX && !TARGET_64BIT"
-  "vzeroall"
-  [(set_attr "type" "sse")
-   (set_attr "memory" "none")
-   (set_attr "mode" "OI")])
+(define_expand "avx_vzeroall"
+  [(match_par_dup 0 [(const_int 0)])]
+  "TARGET_AVX"
+{
+  int nregs = TARGET_64BIT ? 16 : 8;
+  int regno;
 
-(define_insn "avx_vzeroall_rex64"
-  [(unspec_volatile [(const_int 0)] UNSPECV_VZEROALL)
-   (clobber (reg:V8SI XMM0_REG))
-   (clobber (reg:V8SI XMM1_REG))
-   (clobber (reg:V8SI XMM2_REG))
-   (clobber (reg:V8SI XMM3_REG))
-   (clobber (reg:V8SI XMM4_REG))
-   (clobber (reg:V8SI XMM5_REG))
-   (clobber (reg:V8SI XMM6_REG))
-   (clobber (reg:V8SI XMM7_REG))
-   (clobber (reg:V8SI XMM8_REG))
-   (clobber (reg:V8SI XMM9_REG))
-   (clobber (reg:V8SI XMM10_REG))
-   (clobber (reg:V8SI XMM11_REG))
-   (clobber (reg:V8SI XMM12_REG))
-   (clobber (reg:V8SI XMM13_REG))
-   (clobber (reg:V8SI XMM14_REG))
-   (clobber (reg:V8SI XMM15_REG))]
-  "TARGET_AVX && TARGET_64BIT"
+  operands[0] = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (nregs + 1));
+
+  XVECEXP (operands[0], 0, 0)
+    = gen_rtx_UNSPEC_VOLATILE (VOIDmode, gen_rtvec (1, const0_rtx),
+			       UNSPECV_VZEROALL);
+
+  for (regno = 0; regno < nregs; regno++)
+    XVECEXP (operands[0], 0, regno + 1)
+      = gen_rtx_SET (VOIDmode,
+		     gen_rtx_REG (V8SFmode, SSE_REGNO (regno)),
+		     CONST0_RTX (V8SFmode));
+})
+
+(define_insn "*avx_vzeroall"
+  [(match_parallel 0 "vzeroall_operation"
+    [(unspec_volatile [(const_int 0)] UNSPECV_VZEROALL)
+     (set (match_operand 1 "register_operand" "=x")
+          (match_operand 2 "const0_operand" "X"))])]
+  "TARGET_AVX"
   "vzeroall"
   [(set_attr "type" "sse")
    (set_attr "memory" "none")
-   (set_attr "mode" "OI")])
+   (set_attr "mode" "V8SF")])
 
 ;; FIXME: It clobbers the upper 128bits of AVX registers.
 (define_insn "avx_vzeroupper"
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 134221)
+++ config/i386/i386.c	(working copy)
@@ -21225,10 +21225,7 @@ ix86_expand_builtin (tree exp, rtx targe
   switch (fcode)
     {
     case IX86_BUILTIN_VZEROALL:
-      if (TARGET_64BIT)
-	emit_insn (gen_avx_vzeroall_rex64 ());
-      else
-	emit_insn (gen_avx_vzeroall ());
+      emit_insn (gen_avx_vzeroall ());
       return 0;
 
     case IX86_BUILTIN_VZEROUPPER:

