[PATCH, GCC, AARCH64, 3/6] Restrict indirect tail calls to x16 and x17


Hi

This patch is part of a series that enables ARMv8.5-A in GCC and
adds support for the Branch Target Identification mechanism.
(https://developer.arm.com/products/architecture/cpu-architecture/a-profile/exploration-tools)

This patch changes the registers that are allowed for indirect tail
calls. We are choosing to restrict these to only x16 or x17.

Indirect tail calls are special in that they convert a call statement
(BLR instruction) into a jump statement (BR instruction). To make the
best possible use of the Branch Target Identification mechanism, we
would like to place a "BTI C" (call) at the beginning of the function,
which is only compatible with BLRs and with BR X16/X17. In order to
make indirect tail calls compatible with this scheme, we restrict
TAILCALL_ADDR_REGS accordingly.
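
To make the effect concrete, here is a small example of the kind of
code this affects (my own illustration, not part of the patch); with
the restricted TAILCALL_ADDR_REGS the indirect sibcall below should
only ever go through x16 or x17:

  /* Hypothetical example: at -O2 the call through "callback" becomes
     an indirect tail call.  */
  void (*callback) (void);

  void
  dispatch (void)
  {
    /* Expect "br x16" or "br x17" here, which remains compatible with
       a "BTI C" landing pad at the start of the callee.  */
    callback ();
  }

For reference, if I am reading the REG_CLASS_CONTENTS masks correctly,
the old value 0x0004ffff covered x0-x15 and x18, while the new value
0x00030000 keeps only bits 16 and 17, i.e. x16 and x17.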

In order to free up x16/x17 for this purpose, we also had to change
their use in the prologue/epilogue handling: we now use x12 and x13,
named EP0_REGNUM and EP1_REGNUM, as the scratch registers for the
prologue and epilogue.
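
The user-visible effect of that part is simply which register holds
large stack adjustments. A rough sketch of the kind of function that
exercises it (a hypothetical example along the same lines as the
updated test_frame_17.c below):

  /* The frame is too large for an add/sub immediate, so the prologue
     and epilogue need a scratch register for the adjustment; with this
     patch we expect "mov x12, ..." instead of "mov x16, ...".  */
  int
  big_frame (int i)
  {
    int arr[1025];
    return arr[i];
  }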

Bootstrapped and regression tested on aarch64-none-linux-gnu, and the
existing test has been updated. Ran SPEC 2017 and saw no performance
regression.

Is this ok for trunk?

Thanks
Sudi


*** gcc/ChangeLog ***

2018-xx-xx  Sudakshina Das  <sudi.das@arm.com>

	* config/aarch64/aarch64.c (aarch64_expand_prologue): Use new
	epilogue/prologue scratch registers EP0_REGNUM and EP1_REGNUM.
	(aarch64_expand_epilogue): Likewise.
	(aarch64_output_mi_thunk): Likewise.
	* config/aarch64/aarch64.h (REG_CLASS_CONTENTS): Change
	TAILCALL_ADDR_REGS to x16 and x17.
	* config/aarch64/aarch64.md: Define EP0_REGNUM and EP1_REGNUM.

*** gcc/testsuite/ChangeLog ***

2018-xx-xx  Sudakshina Das  <sudi.das@arm.com>

	* gcc.target/aarch64/test_frame_17.c: Update to check for
	EP0_REGNUM instead of IP0_REGNUM and add test case.

diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 94184049c9c77d858fd5b3e2a8970a48b70f7529..8e7a8d54351cf7eb1774a474bfbfbebf58070e31 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -579,7 +579,7 @@ enum reg_class
 #define REG_CLASS_CONTENTS						\
 {									\
   { 0x00000000, 0x00000000, 0x00000000 },	/* NO_REGS */		\
-  { 0x0004ffff, 0x00000000, 0x00000000 },	/* TAILCALL_ADDR_REGS */\
+  { 0x00030000, 0x00000000, 0x00000000 },	/* TAILCALL_ADDR_REGS */\
   { 0x7fffffff, 0x00000000, 0x00000003 },	/* GENERAL_REGS */	\
   { 0x80000000, 0x00000000, 0x00000000 },	/* STACK_REG */		\
   { 0xffffffff, 0x00000000, 0x00000003 },	/* POINTER_REGS */	\
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 27f81b654a2bae3ddd87b99e4b7926cc588a95f5..f9a81f1734e6885662f6a9e6c97bdbcdac24211b 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -5317,8 +5317,8 @@ aarch64_expand_prologue (void)
 	aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
     }
 
-  rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
-  rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
+  rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
+  rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
 
   /* In theory we should never have both an initial adjustment
      and a callee save adjustment.  Verify that is the case since the
@@ -5328,7 +5328,7 @@ aarch64_expand_prologue (void)
   /* Will only probe if the initial adjustment is larger than the guard
      less the amount of the guard reserved for use by the caller's
      outgoing args.  */
-  aarch64_allocate_and_probe_stack_space (ip0_rtx, ip1_rtx, initial_adjust,
+  aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
 					  true, false);
 
   if (callee_adjust != 0)
@@ -5346,7 +5346,7 @@ aarch64_expand_prologue (void)
 	}
       aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
 			  stack_pointer_rtx, callee_offset,
-			  ip1_rtx, ip0_rtx, frame_pointer_needed);
+			  tmp1_rtx, tmp0_rtx, frame_pointer_needed);
       if (frame_pointer_needed && !frame_size.is_constant ())
 	{
 	  /* Variable-sized frames need to describe the save slot
@@ -5388,7 +5388,7 @@ aarch64_expand_prologue (void)
 
   /* We may need to probe the final adjustment if it is larger than the guard
      that is assumed by the called.  */
-  aarch64_allocate_and_probe_stack_space (ip1_rtx, ip0_rtx, final_adjust,
+  aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
 					  !frame_pointer_needed, true);
 }
 
@@ -5426,8 +5426,8 @@ aarch64_expand_epilogue (bool for_sibcall)
   unsigned reg2 = cfun->machine->frame.wb_candidate2;
   rtx cfi_ops = NULL;
   rtx_insn *insn;
-  /* A stack clash protection prologue may not have left IP0_REGNUM or
-     IP1_REGNUM in a usable state.  The same is true for allocations
+  /* A stack clash protection prologue may not have left EP0_REGNUM or
+     EP1_REGNUM in a usable state.  The same is true for allocations
      with an SVE component, since we then need both temporary registers
      for each allocation.  For stack clash we are in a usable state if
      the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER.  */
@@ -5439,11 +5439,9 @@ aarch64_expand_epilogue (bool for_sibcall)
      guard_size - guard_used_by_caller because we won't be doing any probes
      then.  In such situations the register should remain live with the correct
      value.  */
-  bool can_inherit_p = (initial_adjust.is_constant ()
-			&& final_adjust.is_constant ())
-			&& (!flag_stack_clash_protection
-			     || known_lt (initial_adjust,
-					  guard_size - guard_used_by_caller));
+  bool can_inherit_p = (!flag_stack_clash_protection
+			|| known_lt (initial_adjust,
+				     guard_size - guard_used_by_caller));
 
   /* We need to add memory barrier to prevent read from deallocated stack.  */
   bool need_barrier_p
@@ -5461,20 +5459,20 @@ aarch64_expand_epilogue (bool for_sibcall)
 
   /* Restore the stack pointer from the frame pointer if it may not
      be the same as the stack pointer.  */
-  rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
-  rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
+  rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
+  rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
   if (frame_pointer_needed
       && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
     /* If writeback is used when restoring callee-saves, the CFA
        is restored on the instruction doing the writeback.  */
     aarch64_add_offset (Pmode, stack_pointer_rtx,
 			hard_frame_pointer_rtx, -callee_offset,
-			ip1_rtx, ip0_rtx, callee_adjust == 0);
+			tmp1_rtx, tmp0_rtx, callee_adjust == 0);
   else
      /* The case where we need to re-use the register here is very rare, so
 	avoid the complicated condition and just always emit a move if the
 	immediate doesn't fit.  */
-     aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust, true);
+     aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
 
   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
 				callee_adjust != 0, &cfi_ops);
@@ -5497,8 +5495,11 @@ aarch64_expand_epilogue (bool for_sibcall)
       cfi_ops = NULL;
     }
 
-  aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
-		  !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
+  /* Liveness of EP0_REGNUM cannot be trusted across function calls either,
+     so restrict the emit_move optimization to leaf functions.  */
+  aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
+		  (!can_inherit_p || !crtl->is_leaf
+		   || df_regs_ever_live_p (EP0_REGNUM)));
 
   if (cfi_ops)
     {
@@ -5604,8 +5605,8 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
   emit_note (NOTE_INSN_PROLOGUE_END);
 
   this_rtx = gen_rtx_REG (Pmode, this_regno);
-  temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
-  temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
+  temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
+  temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
 
   if (vcall_offset == 0)
     aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index d7473418a8eb62b2757017cd1675493f86e41ef4..3367f65945a213616935981a5e81af933bae9685 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -35,6 +35,9 @@
     (R11_REGNUM		11)
     (R12_REGNUM		12)
     (R13_REGNUM		13)
+    ;; Scratch registers for prologue/epilogue use.
+    (EP0_REGNUM		12)
+    (EP1_REGNUM		13)
     (R14_REGNUM		14)
     (R15_REGNUM		15)
     (R16_REGNUM		16)
diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_17.c b/gcc/testsuite/gcc.target/aarch64/test_frame_17.c
index c214431999b60cce8a75204876a8c73ec6304128..44f132911286779f63db60157987d568d3c13d0d 100644
--- a/gcc/testsuite/gcc.target/aarch64/test_frame_17.c
+++ b/gcc/testsuite/gcc.target/aarch64/test_frame_17.c
@@ -1,16 +1,27 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 --save-temps" } */
+/* { dg-options "-O2" } */
 
 /* Test reuse of stack adjustment temporaries.  */
 
 void foo ();
 
+/* Should only use 1 mov and re-use it.  */
 int reuse_mov (int i)
 {
   int arr[1025];
   return arr[i];
 }
 
+/* Should use 2 movs because x12 is live.  */
+int no_reuse_mov_live (int i)
+{
+  int arr[1025];
+  register long long a __asm("x12");
+  a = a+1;
+  return arr[i] + a;
+}
+
+/* Should use 2 movs because it's not a leaf function.  */
 int no_reuse_mov (int i)
 {
   int arr[1025];
@@ -18,4 +29,4 @@ int no_reuse_mov (int i)
   return arr[i];
 }
 
-/* { dg-final { scan-assembler-times "mov\tx16, \[0-9\]+" 3 } } */
+/* { dg-final { scan-assembler-times "mov\tx12, \[0-9\]+" 5 } } */

