This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

RFC: Patch to implement Aarch64 SIMD ABI


This is a patch to support the Aarch64 SIMD ABI [1] in GCC.  I intend
to eventually follow this up with two more patches; one to define the
TARGET_SIMD_CLONE* macros and one to improve the GCC register
allocation/usage when calling SIMD functions.

The significant difference between the standard ARM ABI and the SIMD ABI
is that in the normal ABI a callee saves only the lower 64 bits of registers
V8-V15, in the SIMD ABI the callee must save all 128 bits of registers
V8-V23.

This patch checks for SIMD functions and saves the extra registers when
needed.  It does not change the caller behavour, so with just this patch
there may be values saved by both the caller and callee.  This is not
efficient, but it is correct code.

This patch bootstraps and passes the GCC testsuite but that only verifies
I haven't broken anything, it doesn't validate the handling of SIMD functions.
I tried to write some tests, but I could never get GCC to generate code
that would save the FP callee-save registers in the prologue.  Complex code
might generate spills and fills but it never triggered the prologue/epilogue
code to save V8-V23.  If anyone has ideas on how to write a test that would
cause GCC to generate this code I would appreciate some ideas.  Just doing
lots of calculations with lots of intermediate values doesn't seem to be enough.

Steve Ellcey
sellcey@cavium.com

[1] https://developer.arm.com/products/software-development-tools/hpc/arm-compiler-for-hpc/vector-function-abi


2018-07-18  Steve Ellcey  <sellcey@cavium.com>

	* config/aarch64/aarch64.c (aarch64_attribute_table): New array.
	(aarch64_simd_function_p): New function.
	(aarch64_layout_frame): Check for simd function.
	(aarch64_process_components): Ditto.
	(aarch64_expand_prologue): Ditto.
	(aarch64_expand_epilogue): Ditto.
	(TARGET_ATTRIBUTE_TABLE): New define.
	* config/aarch64/aarch64.h (FP_SIMD_SAVED_REGNUM_P): New define.
	* config/aarch64/aarch64.md (V23_REGNUM) New constant.
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 1369704..b25da11 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1026,6 +1026,15 @@ static const struct processor *selected_tune;
 /* The current tuning set.  */
 struct tune_params aarch64_tune_params = generic_tunings;
 
+/* Table of machine attributes.  */
+static const struct attribute_spec aarch64_attribute_table[] =
+{
+  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
+       affects_type_identity, handler, exclude } */
+  { "aarch64_vector_pcs", 0, 0, true,  false, false, false, NULL, NULL },
+  { NULL,                 0, 0, false, false, false, false, NULL, NULL }
+};
+
 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
 
 /* An ISA extension in the co-processor and main instruction set space.  */
@@ -1404,6 +1413,18 @@ aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
   return false;
 }
 
+/* Return true if this is a definition of a vectorized simd function.  */
+
+static bool
+aarch64_simd_function_p (tree fndecl)
+{
+  if (lookup_attribute ("aarch64_vector_pcs", DECL_ATTRIBUTES (fndecl)) != NULL)
+    return true;
+  if (lookup_attribute ("simd", DECL_ATTRIBUTES (fndecl)) == NULL)
+    return false;
+  return (VECTOR_TYPE_P (TREE_TYPE (TREE_TYPE (fndecl))));
+}
+
 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
    the lower 64 bits of a 128-bit register.  Tell the compiler the callee
    clobbers the top 64 bits when restoring the bottom 64 bits.  */
@@ -4034,6 +4055,7 @@ aarch64_layout_frame (void)
 {
   HOST_WIDE_INT offset = 0;
   int regno, last_fp_reg = INVALID_REGNUM;
+  bool simd_function = aarch64_simd_function_p (cfun->decl);
 
   if (reload_completed && cfun->machine->frame.laid_out)
     return;
@@ -4068,7 +4090,8 @@ aarch64_layout_frame (void)
 
   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
     if (df_regs_ever_live_p (regno)
-	&& !call_used_regs[regno])
+	&& (!call_used_regs[regno]
+	    || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
       {
 	cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
 	last_fp_reg = regno;
@@ -4105,7 +4128,8 @@ aarch64_layout_frame (void)
       {
 	/* If there is an alignment gap between integer and fp callee-saves,
 	   allocate the last fp register to it if possible.  */
-	if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
+	if (regno == last_fp_reg && has_align_gap
+	    && !simd_function && (offset & 8) == 0)
 	  {
 	    cfun->machine->frame.reg_offset[regno] = max_int_offset;
 	    break;
@@ -4117,7 +4141,7 @@ aarch64_layout_frame (void)
 	else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
 		 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
 	  cfun->machine->frame.wb_candidate2 = regno;
-	offset += UNITS_PER_WORD;
+	offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
       }
 
   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
@@ -4706,8 +4730,11 @@ aarch64_process_components (sbitmap components, bool prologue_p)
   while (regno != last_regno)
     {
       /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
-	 so DFmode for the vector registers is enough.  */
-      machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
+	 so DFmode for the vector registers is enough.  For simd functions
+         we want to save the entire register.  */
+      machine_mode mode = GP_REGNUM_P (regno) ? E_DImode
+	: (aarch64_simd_function_p (cfun->decl) ? E_TFmode : E_DFmode);
+      
       rtx reg = gen_rtx_REG (mode, regno);
       poly_int64 offset = cfun->machine->frame.reg_offset[regno];
       if (!frame_pointer_needed)
@@ -4736,6 +4763,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
 	 mergeable with the current one into a pair.  */
       if (!satisfies_constraint_Ump (mem)
 	  || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
+	  || (aarch64_simd_function_p (cfun->decl) && (FP_REGNUM_P (regno)))
 	  || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
 		       GET_MODE_SIZE (mode)))
 	{
@@ -4958,8 +4986,12 @@ aarch64_expand_prologue (void)
 
   aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
 			     callee_adjust != 0 || emit_frame_chain);
-  aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
-			     callee_adjust != 0 || emit_frame_chain);
+  if (aarch64_simd_function_p (cfun->decl))
+    aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+			       callee_adjust != 0 || emit_frame_chain);
+  else
+    aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+			       callee_adjust != 0 || emit_frame_chain);
   aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
 }
 
@@ -5040,8 +5072,12 @@ aarch64_expand_epilogue (bool for_sibcall)
 
   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
 				callee_adjust != 0, &cfi_ops);
-  aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
-				callee_adjust != 0, &cfi_ops);
+  if (aarch64_simd_function_p (cfun->decl))
+    aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+				  callee_adjust != 0, &cfi_ops);
+  else
+    aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+				  callee_adjust != 0, &cfi_ops);
 
   if (need_barrier_p)
     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
@@ -18070,6 +18106,9 @@ aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_SELECT_EARLY_REMAT_MODES
 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
 
+#undef TARGET_ATTRIBUTE_TABLE
+#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
+
 #if CHECKING_P
 #undef TARGET_RUN_TARGET_SELFTESTS
 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index f284e74..d11474e 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -500,6 +500,8 @@ extern unsigned aarch64_architecture_version;
 #define PR_LO_REGNUM_P(REGNO)\
   (((unsigned) (REGNO - P0_REGNUM)) <= (P7_REGNUM - P0_REGNUM))
 
+#define FP_SIMD_SAVED_REGNUM_P(REGNO)			\
+  (((unsigned) (REGNO - V8_REGNUM)) <= (V23_REGNUM - V8_REGNUM))
 
 /* Register and constant classes.  */
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index a014a01..d319430 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -63,6 +63,7 @@
     (V15_REGNUM		47)
     (V16_REGNUM		48)
     (V20_REGNUM		52)
+    (V23_REGNUM		55)
     (V24_REGNUM		56)
     (V28_REGNUM		60)
     (V31_REGNUM		63)

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]