[PATCH v3] AArch64: Improve GOT addressing

Wilco Dijkstra Wilco.Dijkstra@arm.com
Fri Jun 4 13:44:48 GMT 2021


Hi Richard,

This merges the v1 and v2 patches and removes the spurious MEM from
ldr_got_small_si/di. This has been rebased after [1], and the performance
gain has now doubled.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2021-June/571708.html

Improve GOT addressing by treating the instructions as a pair.  This reduces
register pressure and improves code quality significantly.  SPECINT2017 improves
by 0.6% with -fPIC and codesize is 0.73% smaller.  Perlbench has 0.9% smaller
codesize, 1.5% fewer executed instructions and is 1.8% faster on Neoverse N1.

Passes bootstrap and regress. OK for commit?

ChangeLog:
2021-06-04  Wilco Dijkstra  <wdijkstr@arm.com>

        * config/aarch64/aarch64.md (movsi): Split GOT accesses after reload.
        (movdi): Likewise.
        (ldr_got_small_<mode>): Remove MEM and LO_SUM, emit ADRP+LDR GOT sequence.
        (ldr_got_small_sidi): Likewise.
        * config/aarch64/aarch64.c (aarch64_load_symref_appropriately): Delay
        splitting of GOT accesses until after reload. Remove tmp_reg and MEM.
        (aarch64_print_operand): Correctly print got_lo12 in L specifier.
        (aarch64_rtx_costs): Set rematerialization cost for GOT accesses.
        (aarch64_mov_operand_p): Make GOT accesses valid move operands.

---

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 08245827daa3f8199b29031e754244c078f0f500..11ea33c70fb06194fadfe94322fdfa098e5320fc 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -3615,6 +3615,14 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm,
 
     case SYMBOL_SMALL_GOT_4G:
       {
+	/* Use movdi for GOT accesses until after reload - this improves
+	   CSE and rematerialization.  */
+	if (!reload_completed)
+	  {
+	    emit_insn (gen_rtx_SET (dest, imm));
+	    return;
+	  }
+
 	/* In ILP32, the mode of dest can be either SImode or DImode,
 	   while the got entry is always of SImode size.  The mode of
 	   dest depends on how dest is used: if dest is assigned to a
@@ -3624,34 +3632,21 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm,
 	   patterns here (two patterns for ILP32).  */
 
 	rtx insn;
-	rtx mem;
-	rtx tmp_reg = dest;
 	machine_mode mode = GET_MODE (dest);
 
-	if (can_create_pseudo_p ())
-	  tmp_reg = gen_reg_rtx (mode);
-
-	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
 	if (mode == ptr_mode)
 	  {
 	    if (mode == DImode)
-	      insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
+	      insn = gen_ldr_got_small_di (dest, imm);
 	    else
-	      insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
-
-	    mem = XVECEXP (SET_SRC (insn), 0, 0);
+	      insn = gen_ldr_got_small_si (dest, imm);
 	  }
 	else
 	  {
 	    gcc_assert (mode == Pmode);
-
-	    insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
-	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
+	    insn = gen_ldr_got_small_sidi (dest, imm);
 	  }
 
-	gcc_assert (MEM_P (mem));
-	MEM_READONLY_P (mem) = 1;
-	MEM_NOTRAP_P (mem) = 1;
 	emit_insn (insn);
 	return;
       }
@@ -11019,7 +11014,7 @@ aarch64_print_operand (FILE *f, rtx x, int code)
       switch (aarch64_classify_symbolic_expression (x))
 	{
 	case SYMBOL_SMALL_GOT_4G:
-	  asm_fprintf (asm_out_file, ":lo12:");
+	  asm_fprintf (asm_out_file, ":got_lo12:");
 	  break;
 
 	case SYMBOL_SMALL_TLSGD:
@@ -13452,6 +13447,12 @@ cost_plus:
 
     case SYMBOL_REF:
       *cost = 0;
+
+      /* Use a separate remateralization cost for GOT accesses.  */
+      if (aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC
+	  && aarch64_classify_symbol (x, 0) == SYMBOL_SMALL_GOT_4G)
+	*cost = COSTS_N_INSNS (1) / 2;
+
       return true;
 
     case HIGH:
@@ -19907,6 +19908,11 @@ aarch64_mov_operand_p (rtx x, machine_mode mode)
       return aarch64_simd_valid_immediate (x, NULL);
     }
 
+  /* GOT accesses are valid moves until after regalloc.  */
+  if (SYMBOL_REF_P (x)
+      && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
+    return true;
+
   x = strip_salt (x);
   if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
     return true;
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index abfd84526745d029ad4953eabad6dd17b159a218..30effca6f3562f6870a6cc8097750e63bb0d424d 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1283,8 +1283,11 @@ (define_insn_and_split "*movsi_aarch64"
    fmov\\t%w0, %s1
    fmov\\t%s0, %s1
    * return aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);"
-  "CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), SImode)
-    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
+  "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), SImode)
+    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0])))
+    || (reload_completed
+	&& (aarch64_classify_symbolic_expression (operands[1])
+	    == SYMBOL_SMALL_GOT_4G))"
    [(const_int 0)]
    "{
        aarch64_expand_mov_immediate (operands[0], operands[1]);
@@ -1319,8 +1322,11 @@ (define_insn_and_split "*movdi_aarch64"
    fmov\\t%x0, %d1
    fmov\\t%d0, %d1
    * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);"
-   "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), DImode))
-    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
+   "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), DImode)
+    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0])))
+    || (reload_completed
+	&& (aarch64_classify_symbolic_expression (operands[1])
+	    == SYMBOL_SMALL_GOT_4G))"
    [(const_int 0)]
    "{
        aarch64_expand_mov_immediate (operands[0], operands[1]);
@@ -6703,27 +6709,26 @@ (define_insn "add_losym_<mode>"
   [(set_attr "type" "alu_imm")]
 )
 
+;;; GOT accesses use a single insn so linkers can relax GOT relocations.
 (define_insn "ldr_got_small_<mode>"
   [(set (match_operand:PTR 0 "register_operand" "=r")
-	(unspec:PTR [(mem:PTR (lo_sum:PTR
-			      (match_operand:PTR 1 "register_operand" "r")
-			      (match_operand:PTR 2 "aarch64_valid_symref" "S")))]
+	(unspec:PTR [(match_operand:PTR 1 "aarch64_valid_symref" "S")]
 		    UNSPEC_GOTSMALLPIC))]
   ""
-  "ldr\\t%<w>0, [%1, #:got_lo12:%c2]"
-  [(set_attr "type" "load_<ldst_sz>")]
+  "adrp\\t%0, %A1\;ldr\\t%<w>0, [%0, %L1]"
+  [(set_attr "type" "load_<ldst_sz>")
+   (set_attr "length" "8")]
 )
 
 (define_insn "ldr_got_small_sidi"
   [(set (match_operand:DI 0 "register_operand" "=r")
 	(zero_extend:DI
-	 (unspec:SI [(mem:SI (lo_sum:DI
-			     (match_operand:DI 1 "register_operand" "r")
-			     (match_operand:DI 2 "aarch64_valid_symref" "S")))]
+	 (unspec:SI [(match_operand:DI 1 "aarch64_valid_symref" "S")]
 		    UNSPEC_GOTSMALLPIC)))]
   "TARGET_ILP32"
-  "ldr\\t%w0, [%1, #:got_lo12:%c2]"
-  [(set_attr "type" "load_4")]
+  "adrp\\t%0, %A1\;ldr\\t%w0, [%0, %L1]"
+  [(set_attr "type" "load_4")
+   (set_attr "length" "8")]
 )
 
 (define_insn "ldr_got_small_28k_<mode>"



More information about the Gcc-patches mailing list