[PATCH][middle-end][i386][Version 4] Add -fzero-call-used-regs=[skip|used-gpr-arg|used-arg|all-arg|used-gpr|all-gpr|used|all]

Qing Zhao QING.ZHAO@ORACLE.COM
Mon Oct 26 23:06:39 GMT 2020


Hi, Uros,

Could you please check the change compared to the previous version for i386.c as following:
Let me know any issue there.

Thanks a lot.

Qing

---
 gcc/config/i386/i386.c                             | 136 ++++++++++++++++++---
 .../gcc.target/i386/zero-scratch-regs-28.c         |  17 +++
 .../gcc.target/i386/zero-scratch-regs-29.c         |  11 ++
 .../gcc.target/i386/zero-scratch-regs-30.c         |  11 ++
 4 files changed, 155 insertions(+), 20 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c
 create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c
 create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index e66dcf0d587..65f778112d9 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -3554,17 +3554,17 @@ ix86_function_value_regno_p (const unsigned int regno)
 /* Check whether the register REGNO should be zeroed on X86.
    When ALL_SSE_ZEROED is true, all SSE registers have been zeroed
    together, no need to zero it again.
-   Stack registers (st0-st7) and mm0-mm7 are aliased with each other.
-   very hard to be zeroed individually, don't zero individual st or
-   mm registgers.  */
+   When NEED_ZERO_MMX is true, MMX registers should be cleared.  */
 
 static bool
 zero_call_used_regno_p (const unsigned int regno,
-			bool all_sse_zeroed)
+			bool all_sse_zeroed,
+			bool need_zero_mmx)
 {
   return GENERAL_REGNO_P (regno)
 	 || (!all_sse_zeroed && SSE_REGNO_P (regno))
-	 || MASK_REGNO_P (regno);
+	 || MASK_REGNO_P (regno)
+	 || (need_zero_mmx && MMX_REGNO_P (regno));
 }
 
 /* Return the machine_mode that is used to zero register REGNO.  */
@@ -3579,8 +3579,12 @@ zero_call_used_regno_mode (const unsigned int regno)
     return SImode;
   else if (SSE_REGNO_P (regno))
     return V4SFmode;
-  else
+  else if (MASK_REGNO_P (regno))
     return HImode;
+  else if (MMX_REGNO_P (regno))
+    return DImode;
+  else
+    gcc_unreachable ();
 }
 
 /* Generate a rtx to zero all vector registers together if possible,
@@ -3603,7 +3607,7 @@ zero_all_vector_registers (HARD_REG_SET need_zeroed_hardregs)
   return gen_avx_vzeroall ();
 }
 
-/* Generate insns to zero all st/mm registers together.
+/* Generate insns to zero all st registers together.
    Return true when zeroing instructions are generated.
    Assume the number of st registers that are zeroed is num_of_st,
    we will emit the following sequence to zero them together:
@@ -3616,23 +3620,50 @@ zero_all_vector_registers (HARD_REG_SET need_zeroed_hardregs)
 		  ...
 		  fstp %%st(0);
    i.e., num_of_st fldz followed by num_of_st fstp to clear the stack
-   mark stack slots empty.  */
+   mark stack slots empty.
+
+   How to compute the num_of_st?
+   There is no direct mapping from stack registers to hard register
+   numbers.  If one stack register need to be cleared, we don't know
+   where in the stack the value remains.  So, if any stack register 
+   need to be cleared, the whole stack should be cleared.  However,
+   x87 stack registers that hold the return value should be excluded.
+   x87 returns in the top (two for complex values) register, so
+   num_of_st should be 7/6 when x87 returns, otherwise it will be 8.  */
+
 
 static bool
-zero_all_st_mm_registers (HARD_REG_SET need_zeroed_hardregs)
+zero_all_st_registers (HARD_REG_SET need_zeroed_hardregs)
 {
   unsigned int num_of_st = 0;
   for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
-    if (STACK_REGNO_P (regno)
-	&& TEST_HARD_REG_BIT (need_zeroed_hardregs, regno)
-	/* When the corresponding mm register also need to be cleared too.  */
-	&& TEST_HARD_REG_BIT (need_zeroed_hardregs,
-			      (regno - FIRST_STACK_REG + FIRST_MMX_REG)))
-      num_of_st++;
+    if ((STACK_REGNO_P (regno) || MMX_REGNO_P (regno))
+	&& TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
+      {
+	num_of_st++;
+	break;
+      }
 
   if (num_of_st == 0)
     return false;
 
+  bool return_with_x87 = false;
+  return_with_x87 = (crtl->return_rtx
+		     && (GET_CODE (crtl->return_rtx) == REG)
+		     && (STACK_REG_P (crtl->return_rtx)));
+
+  bool complex_return = false;
+  complex_return = (crtl->return_rtx
+		    && COMPLEX_MODE_P (GET_MODE (crtl->return_rtx)));
+
+  if (return_with_x87)
+    if (complex_return)
+      num_of_st = 6;
+    else
+      num_of_st = 7;
+  else
+    num_of_st = 8;
+
   rtx st_reg = gen_rtx_REG (XFmode, FIRST_STACK_REG);
   for (unsigned int i = 0; i < num_of_st; i++)
     emit_insn (gen_rtx_SET (st_reg, CONST0_RTX (XFmode)));
@@ -3646,6 +3677,43 @@ zero_all_st_mm_registers (HARD_REG_SET need_zeroed_hardregs)
   return true;
 }
 
+
+/* When the routine exit with MMX mode, if there is any ST registers
+   need to be zeroed, we should clear all MMX registers except the
+   one that holds the return value RET_MMX_REGNO.  */
+static bool
+zero_all_mm_registers (HARD_REG_SET need_zeroed_hardregs,
+		       unsigned int ret_mmx_regno)
+{
+  bool need_zero_all_mm = false;
+  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+    if (STACK_REGNO_P (regno)
+	&& TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
+      {
+	need_zero_all_mm = true;
+	break;
+      }
+
+  if (!need_zero_all_mm)
+    return false;
+
+  rtx zero_mmx = NULL_RTX;
+  machine_mode mode = DImode;
+  for (unsigned int regno = FIRST_MMX_REG; regno <= LAST_MMX_REG; regno++)
+    if (regno != ret_mmx_regno)
+      {
+	rtx reg = gen_rtx_REG (mode, regno);
+	if (zero_mmx == NULL_RTX)
+	  {
+	    zero_mmx = reg;
+	    emit_insn (gen_rtx_SET (reg, const0_rtx));
+	  }
+	else
+	  emit_move_insn (reg, zero_mmx);
+      }
+  return true;
+}
+
 /* TARGET_ZERO_CALL_USED_REGS.  */
 /* Generate a sequence of instructions that zero registers specified by
    NEED_ZEROED_HARDREGS.  Return the ZEROED_HARDREGS that are actually
@@ -3655,7 +3723,8 @@ ix86_zero_call_used_regs (HARD_REG_SET need_zeroed_hardregs)
 {
   HARD_REG_SET zeroed_hardregs;
   bool all_sse_zeroed = false;
-  bool st_zeroed = false;
+  bool all_st_zeroed = false;
+  bool all_mm_zeroed = false;
 
   /* first, let's see whether we can zero all vector registers together.  */
   rtx zero_all_vec_insn = zero_all_vector_registers (need_zeroed_hardregs);
@@ -3665,24 +3734,42 @@ ix86_zero_call_used_regs (HARD_REG_SET need_zeroed_hardregs)
       all_sse_zeroed = true;
     }
 
-  /* then, let's see whether we can zero all st+mm registers togeter.  */
-  st_zeroed = zero_all_st_mm_registers (need_zeroed_hardregs);
+  /* Then, decide which mode (MMX mode or x87 mode) the function exit with.
+     In order to decide whether we need to clear the MMX registers or the
+     stack registers.  */
+
+  bool exit_with_mmx_mode = (crtl->return_rtx
+			     && (GET_CODE (crtl->return_rtx) == REG)
+			     && (MMX_REG_P (crtl->return_rtx)));
+
+  /* then, let's see whether we can zero all st registers together.  */
+  if (!exit_with_mmx_mode)
+    all_st_zeroed = zero_all_st_registers (need_zeroed_hardregs);
+  /* Or should we zero all MMX registers.  */
+  else 
+    {
+      unsigned int exit_mmx_regno = REGNO (crtl->return_rtx);
+      all_mm_zeroed = zero_all_mm_registers (need_zeroed_hardregs, 
+					     exit_mmx_regno);
+    }
 
   /* Now, generate instructions to zero all the registers.  */
 
   CLEAR_HARD_REG_SET (zeroed_hardregs);
-  if (st_zeroed)
+  if (all_st_zeroed)
     SET_HARD_REG_BIT (zeroed_hardregs, FIRST_STACK_REG);
 
   rtx zero_gpr = NULL_RTX;
   rtx zero_vector = NULL_RTX;
   rtx zero_mask = NULL_RTX;
+  rtx zero_mmx = NULL_RTX;
 
   for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
     {
       if (!TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
 	continue;
-      if (!zero_call_used_regno_p (regno, all_sse_zeroed))
+      if (!zero_call_used_regno_p (regno, all_sse_zeroed, 
+				   exit_with_mmx_mode && !all_mm_zeroed))
 	continue;
 
       SET_HARD_REG_BIT (zeroed_hardregs, regno);
@@ -3728,6 +3815,15 @@ ix86_zero_call_used_regs (HARD_REG_SET need_zeroed_hardregs)
 	  }
 	else
 	  emit_move_insn (reg, zero_mask);
+      else if (mode == DImode)
+	if (zero_mmx == NULL_RTX)
+	  {
+	    zero_mmx = reg;
+	    tmp = gen_rtx_SET (reg, const0_rtx);
+	    emit_insn (tmp);
+	  }
+	else
+	  emit_move_insn (reg, zero_mmx);
       else
 	gcc_unreachable ();
     }
diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c
new file mode 100644
index 00000000000..61c0bb7a35c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c
@@ -0,0 +1,17 @@
+/* { dg-do compile { target *-*-linux* } } */
+/* { dg-options "-O2 -m32 -mmmx -fzero-call-used-regs=all" } */
+
+typedef int __v2si __attribute__ ((vector_size (8)));
+
+__v2si ret_mmx (void)
+{
+  return (__v2si) { 123, 345 };
+}
+
+/* { dg-final { scan-assembler "pxor\[ \t\]*%mm1, %mm1" } } */
+/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm2" } } */
+/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm3" } } */
+/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm4" } } */
+/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm5" } } */
+/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm6" } } */
+/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm7" } } */
diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c
new file mode 100644
index 00000000000..db636654e70
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target *-*-linux* } } */
+/* { dg-options "-O2 -m32 -mmmx -fzero-call-used-regs=all" } */
+typedef int __v2si __attribute__ ((vector_size (8)));
+
+long double ret_x87 (void)
+{
+  return 1.1L;
+}
+
+/* { dg-final { scan-assembler-times "fldz" 7 } } */
+/* { dg-final { scan-assembler-times "fstp" 7 } } */
diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c
new file mode 100644
index 00000000000..7c20b569bfa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target *-*-linux* } } */
+/* { dg-options "-O2  -fzero-call-used-regs=all" } */
+typedef int __v2si __attribute__ ((vector_size (8)));
+
+_Complex long double ret_x87_cplx (void)
+{
+  return 1.1L + 1.2iL;
+}
+
+/* { dg-final { scan-assembler-times "fldz" 6 } } */
+/* { dg-final { scan-assembler-times "fstp" 6 } } */
-- 
2.11.0





> On Oct 26, 2020, at 4:23 PM, Qing Zhao via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
> 
> 
> 
>> On Oct 26, 2020, at 3:33 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
>> 
>> On Mon, Oct 26, 2020 at 9:05 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>>> 
>>> On Mon, Oct 26, 2020 at 8:10 PM Qing Zhao <QING.ZHAO@oracle.com> wrote:
>>>> 
>>>> 
>>>> 
>>>>> On Oct 26, 2020, at 1:42 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
>>>>> 
>>>>> On Mon, Oct 26, 2020 at 6:30 PM Qing Zhao <QING.ZHAO@oracle.com> wrote:
>>>>>> 
>>>>>> 
>>>>>> The following is the current change in i386.c, could you check whether the logic is good?
>>>>> 
>>>>> x87 handling looks good to me.
>>>>> 
>>>>> One remaining question: If the function uses MMX regs (either
>>>>> internally or as an argument register), but exits in x87 mode, does
>>>>> your logic clear the x87 stack?
>>>> 
>>>> Yes but not completely yes.
>>>> 
>>>> FIRST, As following:
>>>> 
>>>> /* Then, decide which mode (MMX mode or x87 mode) the function exit with.
>>>>    In order to decide whether we need to clear the MMX registers or the
>>>>    stack registers.  */
>>>> bool exit_with_mmx_mode = false;
>>>> 
>>>> exit_with_mmx_mode = ((GET_CODE (crtl->return_rtx) == REG)
>>>>                       && (MMX_REG_P (crtl->return_rtx)));
>>>> 
>>>> /* then, let's see whether we can zero all st registers togeter.  */
>>>> if (!exit_with_mmx_mode)
>>>>   st_zeroed = zero_all_st_registers (need_zeroed_hardregs);
>>>> 
>>>> 
>>>> We first check whether this routine exit with mmx mode, if Not then it’s X87 mode
>>>> (at exit, “EMMS” should already been called per ABI), then
>>>> The st/mm registers will be cleared as x87 stack registers.
>>>> 
>>>> However, within the routine “zero_all_st_registers”:
>>>> 
>>>> static bool
>>>> zero_all_st_registers (HARD_REG_SET need_zeroed_hardregs)
>>>> {
>>>> unsigned int num_of_st = 0;
>>>> for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
>>>>   if (STACK_REGNO_P (regno)
>>>>       && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
>>>>     {
>>>>       num_of_st++;
>>>>       break;
>>>>     }
>>>> 
>>>> if (num_of_st == 0)
>>>>   return false;
>>>> 
>>>> 
>>>> In the above, I currently only check whether any “Stack” registers need to be zeroed or not.
>>>> But looks like we should also check any “MMX” register need to be zeroed or not too. If there is any
>>>> “MMX” register need to be zeroed, we still need to clear the whole X87 stack?
>>> 
>>> I think so, but I have to check the details.
>> 
>> Please compile the following testcase with "-m32 -mmmx":
>> 
>> --cut here--
>> #include <stdio.h>
>> 
>> typedef int __v2si __attribute__ ((vector_size (8)));
>> 
>> __v2si zzz;
>> 
>> void
>> __attribute__ ((noinline))
>> mmx (__v2si a, __v2si b, __v2si c)
>> {
>> __v2si res;
>> 
>> res = __builtin_ia32_paddd (a, b);
>> zzz = __builtin_ia32_paddd (res, c);
>> 
>> __builtin_ia32_emms ();
>> }
>> 
>> 
>> int main ()
>> {
>> __v2si a = { 123, 345 };
>> __v2si b = { 234, 456 };
>> __v2si c = { 345, 567 };
>> 
>> mmx (a, b, c);
>> 
>> printf ("%i, %i\n", zzz[0], zzz[1]);
>> 
>> return 0;
>> }
>> --cut here--
>> 
>> at the end of mmx() function:
>> 
>> 0x080491ed in mmx ()
>> (gdb) disass
>> Dump of assembler code for function mmx:
>> 0x080491e0 <+0>:     paddd  %mm1,%mm0
>> 0x080491e3 <+3>:     paddd  %mm2,%mm0
>> 0x080491e6 <+6>:     movq   %mm0,0x804c020
>> => 0x080491ed <+13>:    emms
>> 0x080491ef <+15>:    ret
>> End of assembler dump.
>> (gdb) i r flo
>> st0            <invalid float value> (raw 0xffff00000558000002be)
>> st1            <invalid float value> (raw 0xffff000001c8000000ea)
>> st2            <invalid float value> (raw 0xffff0000023700000159)
>> st3            0                   (raw 0x00000000000000000000)
>> st4            0                   (raw 0x00000000000000000000)
>> st5            0                   (raw 0x00000000000000000000)
>> st6            0                   (raw 0x00000000000000000000)
>> st7            0                   (raw 0x00000000000000000000)
>> fctrl          0x37f               895
>> fstat          0x0                 0
>> ftag           0x556a              21866
>> fiseg          0x0                 0
>> fioff          0x0                 0
>> foseg          0x0                 0
>> fooff          0x0                 0
>> fop            0x0                 0
>> 
>> There are still values in the MMX registers. However, we are in x87
>> mode, so the whole stack has to be cleared.
> 
> Yes. And I just tried, my current implementation behaved correctly. 
>> 
>> Now, what to do if the function uses x87 registers and exits in MMX
>> mode? I guess we have to clear all MMX registers (modulo return value
>> reg).
> 
> Need to add this part.
> 
> thanks.
> Qing
>> 
>> Uros.



More information about the Gcc-patches mailing list