This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: RFC: Patch to implement Aarch64 SIMD ABI


Here is version 3 of my patch to implement the SIMD ABI on Aarch64.
I am having a problem with how to handle a SIMD function calling a
non-SIMD function.  When this happens the SIMD function needs to save
V8 to V23 because it cannot count on the non-SIMD function to save
all 128 bits of these registers.

I thought I had this working in the last patch but as I write test
cases, it appears that it is not working and I am not sure how to
implement it.  I tried adding clobbers in aarch64_expand_call but
that is not working (see code in this patch in aarch64_expand_call).
If I add them to 'call' which is a parallel insn, they are ignored.
If I find the underlying call instruction that is part of the parallel
then the clobbers get added to the instruction but then the call itself
is not recognized with the extra clobbers in place.  I don't think we
want to add new call instructions in aarch64.md to handle the vector
register saves and restores.  Am I trying to add the clobbers in the
wrong place?  Where and when should extra clobbers be added to a call
that is going to clobber more registers than what is indicated by
CALL_USED_REGISTERS?

I suppose I could use TARGET_HARD_REGNO_CALL_PART_CLOBBERED but I would
have to extend it to include the call instruction as an argument so that
the code could determine if the call being made was to a simd or non-simd
function.

Steve Ellcey
sellcey@cavium.com


2018-07-25  Steve Ellcey  <sellcey@cavium.com>

	* config/aarch64/aarch64.c (aarch64_attribute_table): New array.
	(aarch64_simd_decl_p): New function.
	(aarch64_reg_save_mode): New function.
	(aarch64_is_simd_call_p): New function.
	(aarch64_function_ok_for_sibcall): Check for simd calls.
	(aarch64_layout_frame): Check for simd function.
	(aarch64_gen_storewb_pair): Handle E_TFmode.
	(aarch64_push_regs): Use aarch64_reg_save_mode to get mode.
	(aarch64_gen_loadwb_pair): Handle E_TFmode.
	(aarch64_pop_regs): Use aarch64_reg_save_mode to get mode.
	(aarch64_components_for_bb): Check for simd function.
	(aarch64_process_components): Ditto.
	(aarch64_expand_prologue): Ditto.
	(aarch64_expand_epilogue): Ditto.
	(aarch64_expand_call): Ditto.
	(TARGET_ATTRIBUTE_TABLE): New define.
	* config/aarch64/aarch64.h (REG_ALLOC_ORDER): New define.
	(HONOR_REG_ALLOC_ORDER): Ditto.
	(FP_SIMD_SAVED_REGNUM_P): Ditto.
	* config/aarch64/aarch64.md (V23_REGNUM): New constant.
	(loadwb_pair<TX:mode>_<P:mode>): New instruction.
	(storewb_pair<TX:mode>_<P:mode>): Ditto.
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index fa01475..cc642f5 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1027,6 +1027,15 @@ static const struct processor *selected_tune;
 /* The current tuning set.  */
 struct tune_params aarch64_tune_params = generic_tunings;
 
+/* Table of machine attributes.  */
+static const struct attribute_spec aarch64_attribute_table[] =
+{
+  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
+       affects_type_identity, handler, exclude } */
+  { "aarch64_vector_pcs", 0, 0, true,  false, false, false, NULL, NULL },
+  { NULL,                 0, 0, false, false, false, false, NULL, NULL }
+};
+
 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
 
 /* An ISA extension in the co-processor and main instruction set space.  */
@@ -1405,6 +1414,26 @@ aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
   return false;
 }
 
+/* Return true if FNDECL is the declaration of a vectorized simd function.  */
+
+static bool
+aarch64_simd_decl_p (tree fndecl)
+{
+  if (lookup_attribute ("aarch64_vector_pcs", DECL_ATTRIBUTES (fndecl)) != NULL)
+    return true;
+  if (lookup_attribute ("simd", DECL_ATTRIBUTES (fndecl)) == NULL)
+    return false;
+  return (VECTOR_TYPE_P (TREE_TYPE (TREE_TYPE (fndecl))));
+}
+
+static
+machine_mode aarch64_reg_save_mode (tree fndecl, unsigned regno)
+{
+  return GP_REGNUM_P (regno)
+	   ? E_DImode
+	   : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
+}
+
 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
    the lower 64 bits of a 128-bit register.  Tell the compiler the callee
    clobbers the top 64 bits when restoring the bottom 64 bits.  */
@@ -1499,6 +1528,13 @@ aarch64_is_noplt_call_p (rtx sym)
   return false;
 }
 
+static bool
+aarch64_is_simd_call_p (rtx sym)
+{
+  tree decl = SYMBOL_REF_DECL (sym);
+  return  decl && aarch64_simd_decl_p (decl);
+}
+
 /* Return true if the offsets to a zero/sign-extract operation
    represent an expression that matches an extend operation.  The
    operands represent the paramters from
@@ -3269,10 +3305,11 @@ aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
 }
 
 static bool
-aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
-				 tree exp ATTRIBUTE_UNUSED)
+aarch64_function_ok_for_sibcall (tree decl, tree exp ATTRIBUTE_UNUSED)
 {
-  /* Currently, always true.  */
+  if (aarch64_simd_decl_p (cfun->decl) && (!decl || !aarch64_simd_decl_p (decl)))
+    return false;
+
   return true;
 }
 
@@ -4035,6 +4072,7 @@ aarch64_layout_frame (void)
 {
   HOST_WIDE_INT offset = 0;
   int regno, last_fp_reg = INVALID_REGNUM;
+  bool simd_function = aarch64_simd_decl_p (cfun->decl);
 
   if (reload_completed && cfun->machine->frame.laid_out)
     return;
@@ -4069,7 +4107,8 @@ aarch64_layout_frame (void)
 
   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
     if (df_regs_ever_live_p (regno)
-	&& !call_used_regs[regno])
+	&& (!call_used_regs[regno]
+	    || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
       {
 	cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
 	last_fp_reg = regno;
@@ -4106,7 +4145,8 @@ aarch64_layout_frame (void)
       {
 	/* If there is an alignment gap between integer and fp callee-saves,
 	   allocate the last fp register to it if possible.  */
-	if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
+	if (regno == last_fp_reg && has_align_gap
+	    && !simd_function && (offset & 8) == 0)
 	  {
 	    cfun->machine->frame.reg_offset[regno] = max_int_offset;
 	    break;
@@ -4118,7 +4158,7 @@ aarch64_layout_frame (void)
 	else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
 		 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
 	  cfun->machine->frame.wb_candidate2 = regno;
-	offset += UNITS_PER_WORD;
+	offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
       }
 
   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
@@ -4261,6 +4301,10 @@ aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
       return gen_storewb_pairdf_di (base, base, reg, reg2,
 				    GEN_INT (-adjustment),
 				    GEN_INT (UNITS_PER_WORD - adjustment));
+    case E_TFmode:
+      return gen_storewb_pairtf_di (base, base, reg, reg2,
+				    GEN_INT (-adjustment),
+				    GEN_INT (UNITS_PER_VREG - adjustment));
     default:
       gcc_unreachable ();
     }
@@ -4273,7 +4317,7 @@ static void
 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
 {
   rtx_insn *insn;
-  machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
+  machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
 
   if (regno2 == INVALID_REGNUM)
     return aarch64_pushwb_single_reg (mode, regno1, adjustment);
@@ -4303,6 +4347,9 @@ aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
     case E_DFmode:
       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
 				   GEN_INT (UNITS_PER_WORD));
+    case E_TFmode:
+      return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
+				   GEN_INT (UNITS_PER_VREG));
     default:
       gcc_unreachable ();
     }
@@ -4316,7 +4363,7 @@ static void
 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
 		  rtx *cfi_ops)
 {
-  machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
+  machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
   rtx reg1 = gen_rtx_REG (mode, regno1);
 
   *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
@@ -4629,13 +4676,15 @@ aarch64_components_for_bb (basic_block bb)
   bitmap in = DF_LIVE_IN (bb);
   bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
   bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
+  bool simd_function = aarch64_simd_decl_p (cfun->decl);
 
   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
   bitmap_clear (components);
 
   /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
-    if ((!call_used_regs[regno])
+    if ((!call_used_regs[regno]
+	|| (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
        && (bitmap_bit_p (in, regno)
 	   || bitmap_bit_p (gen, regno)
 	   || bitmap_bit_p (kill, regno)))
@@ -4707,8 +4756,10 @@ aarch64_process_components (sbitmap components, bool prologue_p)
   while (regno != last_regno)
     {
       /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
-	 so DFmode for the vector registers is enough.  */
-      machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
+	 so DFmode for the vector registers is enough.  For simd functions
+         we want to save the entire register.  */
+      machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
+      
       rtx reg = gen_rtx_REG (mode, regno);
       poly_int64 offset = cfun->machine->frame.reg_offset[regno];
       if (!frame_pointer_needed)
@@ -4737,6 +4788,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
 	 mergeable with the current one into a pair.  */
       if (!satisfies_constraint_Ump (mem)
 	  || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
+	  || (aarch64_simd_decl_p (cfun->decl) && (FP_REGNUM_P (regno)))
 	  || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
 		       GET_MODE_SIZE (mode)))
 	{
@@ -4959,8 +5011,12 @@ aarch64_expand_prologue (void)
 
   aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
 			     callee_adjust != 0 || emit_frame_chain);
-  aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
-			     callee_adjust != 0 || emit_frame_chain);
+  if (aarch64_simd_decl_p (cfun->decl))
+    aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+			       callee_adjust != 0 || emit_frame_chain);
+  else
+    aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+			       callee_adjust != 0 || emit_frame_chain);
   aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
 }
 
@@ -5041,8 +5097,12 @@ aarch64_expand_epilogue (bool for_sibcall)
 
   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
 				callee_adjust != 0, &cfi_ops);
-  aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
-				callee_adjust != 0, &cfi_ops);
+  if (aarch64_simd_decl_p (cfun->decl))
+    aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+				  callee_adjust != 0, &cfi_ops);
+  else
+    aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+				  callee_adjust != 0, &cfi_ops);
 
   if (need_barrier_p)
     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
@@ -6318,6 +6378,18 @@ aarch64_expand_call (rtx result, rtx mem, bool sibcall)
   vec = gen_rtvec (2, call, tmp);
   call = gen_rtx_PARALLEL (VOIDmode, vec);
 
+#if 1
+  if (aarch64_simd_decl_p (cfun->decl) && !aarch64_is_simd_call_p (callee))
+    {
+      rtx *fusage = &CALL_INSN_FUNCTION_USAGE (call);
+      int i;
+
+      for (i = V0_REGNUM; i <= V31_REGNUM; i++)
+	if (FP_SIMD_SAVED_REGNUM_P (i))
+	  clobber_reg (fusage, gen_rtx_REG (TFmode, i));
+    }
+#endif
+
   aarch64_emit_call_insn (call);
 }
 
@@ -18210,6 +18282,9 @@ aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_SELECT_EARLY_REMAT_MODES
 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
 
+#undef TARGET_ATTRIBUTE_TABLE
+#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
+
 #if CHECKING_P
 #undef TARGET_RUN_TARGET_SELFTESTS
 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index c121850..279dbed 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -392,6 +392,37 @@ extern unsigned aarch64_architecture_version;
     V_ALIASES(28), V_ALIASES(29), V_ALIASES(30), V_ALIASES(31)  \
   }
 
+/* This is here just to change the order of the vector registers so
+   that V24 to V31 are used before V16 to V23.  In SIMD functions
+   V16 to V23 are also callee saved so we want to use V24 to V31 first.
+
+   ADJUST_REG_ALLOC_ORDER does not work if REG_ALLOC_ORDER is not used.  */
+
+#define REG_ALLOC_ORDER				\
+{						\
+  /* Argument registers.  */			\
+  0, 1, 2, 3, 4, 5, 6, 7,			\
+  /* Caller-saved registers.  */		\
+  8, 9, 10, 11, 12, 13, 14, 15,			\
+  16, 17, 18, 					\
+  /* Callee-saved registers.  */		\
+  19, 20, 21, 22, 23, 24, 25, 26,		\
+  27, 28,					\
+  /* All other registers.  */			\
+  29, 30, 31,					\
+  /* Argument vregisters.  */			\
+  32, 33, 34, 35, 36, 37, 38, 39,		\
+  /* Caller-saved vregisters.  */		\
+  56, 57, 58, 59, 60, 61, 62, 63,		\
+  48, 49, 50, 51, 52, 53, 54, 55,		\
+  /* Callee-saved vregisters.  */		\
+  40, 41, 42, 43, 44, 45, 46, 47,		\
+  /* Other pseudo registers.  */		\
+  64, 65, 66					\
+}
+
+#define HONOR_REG_ALLOC_ORDER 1
+
 /* Say that the return address register is used by the epilogue, but only after
    epilogue generation is complete.  Note that in the case of sibcalls, the
    values "used by the epilogue" are considered live at the start of the called
@@ -503,6 +534,8 @@ extern unsigned aarch64_architecture_version;
 #define PR_LO_REGNUM_P(REGNO)\
   (((unsigned) (REGNO - P0_REGNUM)) <= (P7_REGNUM - P0_REGNUM))
 
+#define FP_SIMD_SAVED_REGNUM_P(REGNO)			\
+  (((unsigned) (REGNO - V8_REGNUM)) <= (V23_REGNUM - V8_REGNUM))
 
 /* Register and constant classes.  */
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index e9c16f9..74a4821 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -63,6 +63,7 @@
     (V15_REGNUM		47)
     (V16_REGNUM		48)
     (V20_REGNUM		52)
+    (V23_REGNUM		55)
     (V24_REGNUM		56)
     (V28_REGNUM		60)
     (V31_REGNUM		63)
@@ -1413,6 +1414,21 @@
   [(set_attr "type" "neon_load1_2reg")]
 )
 
+(define_insn "loadwb_pair<TX:mode>_<P:mode>"
+  [(parallel
+    [(set (match_operand:P 0 "register_operand" "=k")
+          (plus:P (match_operand:P 1 "register_operand" "0")
+                  (match_operand:P 4 "aarch64_mem_pair_offset" "n")))
+     (set (match_operand:TX 2 "register_operand" "=w")
+          (mem:TX (match_dup 1)))
+     (set (match_operand:TX 3 "register_operand" "=w")
+          (mem:TX (plus:P (match_dup 1)
+                  (match_operand:P 5 "const_int_operand" "n"))))])]
+  "INTVAL (operands[5]) == GET_MODE_SIZE (<TX:MODE>mode)"
+  "ldp\\t%q2, %q3, [%1], %4"
+  [(set_attr "type" "neon_load1_2reg")]
+)
+
 ;; Store pair with pre-index writeback.  This is primarily used in function
 ;; prologues.
 (define_insn "storewb_pair<GPI:mode>_<P:mode>"
@@ -1447,6 +1463,22 @@
   [(set_attr "type" "neon_store1_2reg<q>")]
 )
 
+(define_insn "storewb_pair<TX:mode>_<P:mode>"
+  [(parallel
+    [(set (match_operand:P 0 "register_operand" "=&k")
+          (plus:P (match_operand:P 1 "register_operand" "0")
+                  (match_operand:P 4 "aarch64_mem_pair_offset" "n")))
+     (set (mem:TX (plus:P (match_dup 0)
+                  (match_dup 4)))
+          (match_operand:TX 2 "register_operand" "w"))
+     (set (mem:TX (plus:P (match_dup 0)
+                  (match_operand:P 5 "const_int_operand" "n")))
+          (match_operand:TX 3 "register_operand" "w"))])]
+  "INTVAL (operands[5]) == INTVAL (operands[4]) + GET_MODE_SIZE (<TX:MODE>mode)"
+  "stp\\t%q2, %q3, [%0, %4]!"
+  [(set_attr "type" "neon_store1_2reg<q>")]
+)
+
 ;; -------------------------------------------------------------------
 ;; Sign/Zero extension
 ;; -------------------------------------------------------------------

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]