This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[AArch64] Implement -fpic for -mcmodel=small


Currently, AArch64 doesn't differentiate between -fpic and -fPIC.

For -mcmodel=small, both allow a 4G GOT table size, so we always need
two instructions to address a GOT entry.

This patch implements -fpic for -mcmodel=small, which allows a 32K GOT table
size, smaller than -fPIC, but then we can use one instruction to address a
GOT entry, given that pic_offset_table_rtx is initialized properly.
(As we are using the page base, the first page may be wasted in the worst
scenario, leaving only 28K of space for the GOT.)

The generated instruction sequence for accessing a global variable is

  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]

  or "ldr reg, [pic_offset_table_rtx, #:gotpage_lo14:sym]" for ILP32
  
Only one instruction is needed. But we must initialize the global pointer
(pic_offset_table_rtx) properly. Currently, we initialize it for every
global access, and let CSE remove all the redundant ones.

The final instruction sequences will look like the following
for multiple global variable accesses.

  adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_

  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
  ...

instead of the following less efficient -fPIC version:

  adrp  rA, :got:sym1
  ldr   rA, [rA, #:got_lo12:sym1]
  adrp  rB, :got:sym2
  ldr   rB, [rB, #:got_lo12:sym2]
  adrp  rC, :got:sym3
  ldr   rC, [rC, #:got_lo12:sym3]
  ...
  
AArch64 doesn't reserve any register as gp; we use a pseudo pic reg, and let
the register allocator use any available one.

Binutils correspondent

test done
=========
gcc bootstrap OK on aarch64 board with BOOT_CFLAGS="-O2 -fpic".
built glibc under -fpic, code size slightly smaller.

Ok for trunk?

2015-05-20  Jiong Wang  <jiong.wang@arm.com>

gcc/
  * config/aarch64/aarch64.md (ldr_got_small_<mode>): Support new GOT relocation
  modifiers.
  (ldr_got_small_sidi): Ditto.
  * config/aarch64/iterators.md (got_modifier): New mode attribute.
  * config/aarch64/aarch64-opts.h (aarch64_code_model): New model.
  * config/aarch64/aarch64.c (aarch64_load_symref_appropriately): Support -fpic.
  (aarch64_rtx_costs): Add costs for new instruction sequences.
  (initialize_aarch64_code_model): Initialize new model.
  (aarch64_classify_symbol): Recognize new model.
  (aarch64_asm_preferred_eh_data_format): Support new model.
  (aarch64_load_symref_appropriately): Generate new instruction sequences for -fpic.
  (TARGET_USE_PSEUDO_PIC_REG): New definition.
  (aarch64_use_pseudo_pic_reg): New function.

gcc/testsuite/
  * gcc.target/aarch64/pic-small.c: New testcase.

-- 
Regards,
Jiong

diff --git a/gcc/config/aarch64/aarch64-opts.h b/gcc/config/aarch64/aarch64-opts.h
index ea64cf4..49a990a 100644
--- a/gcc/config/aarch64/aarch64-opts.h
+++ b/gcc/config/aarch64/aarch64-opts.h
@@ -53,6 +53,9 @@ enum aarch64_code_model {
   /* Static code and data fit within a 4GB region.
      The default non-PIC code model.  */
   AARCH64_CMODEL_SMALL,
+  /* -fpic for small memory model.
+     GOT size to 28KiB (4K*8-4K) or 3584 entries.  */
+  AARCH64_CMODEL_SMALL_SPIC,
   /* Static code, data and GOT/PLT fit within a 4GB region.
      The default PIC code model.  */
   AARCH64_CMODEL_SMALL_PIC,
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 7a34e49..4b6e648 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -840,10 +840,55 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm,
 	rtx tmp_reg = dest;
 	machine_mode mode = GET_MODE (dest);
 
-	if (can_create_pseudo_p ())
-	  tmp_reg = gen_reg_rtx (mode);
+	if (aarch64_cmodel != AARCH64_CMODEL_SMALL_SPIC)
+	  {
+	    if (can_create_pseudo_p ())
+	      tmp_reg = gen_reg_rtx (mode);
+
+	    emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
+	  }
+	/* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
+	   here before rtl expand.  Tree IVOPT will generate rtl pattern to
+	   decide rtx costs, in which case pic_offset_table_rtx is not
+	   initialized.  For that case no need to generate the first adrp
+	   instruction as the final cost for global variable access is
+	   one instruction.  */
+	else if (pic_offset_table_rtx != NULL_RTX)
+	  {
+	    /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
+	       using the page base as GOT base, the first page may be wasted,
+	       in the worst scenario, there is only 28K space for GOT).
+
+	       The generated instruction sequence for accessing global variable
+	       is:
+
+	         ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
+
+	       Only one instruction needed. But we must initialize
+	       pic_offset_table_rtx properly.  We generate initialize insn for
+	       every global access, and allow CSE to remove all redundant.
+
+	       The final instruction sequences will look like the following
+	       for multiple global variable accesses.
+
+	         adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
+
+	         ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
+	         ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
+	         ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
+	         ...  */
+
+
+	    rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
+	    tmp_reg = pic_offset_table_rtx;
+	    crtl->uses_pic_offset_table = 1;
+	    emit_move_insn (tmp_reg, gen_rtx_HIGH (Pmode, s));
+
+	    if (mode != GET_MODE (tmp_reg))
+	      tmp_reg = simplify_gen_subreg (mode, tmp_reg,
+					     GET_MODE (tmp_reg), 0);
+	  }
 
-	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
 	if (mode == ptr_mode)
 	  {
 	    if (mode == DImode)
@@ -4352,7 +4397,15 @@ aarch64_print_operand (FILE *f, rtx x, char code)
       switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
 	{
 	case SYMBOL_SMALL_GOT:
-	  asm_fprintf (asm_out_file, ":got:");
+         /* For SYMBOL_GOT symbol, don't generate GOT modifier for high part when
+            it's -fpic, because the high part will be:
+
+               adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
+
+            We need normal pc relative relocation against symbol value instead of
+	     against symbol's GOT entry.  */
+	  if (aarch64_cmodel != AARCH64_CMODEL_SMALL_SPIC)
+	    asm_fprintf (asm_out_file, ":got:");
 	  break;
 
 	case SYMBOL_SMALL_TLSGD:
@@ -6300,7 +6346,8 @@ cost_plus:
 
     case SYMBOL_REF:
 
-      if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
+      if (aarch64_cmodel == AARCH64_CMODEL_LARGE
+	  || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
 	{
 	  /* LDR.  */
 	  if (speed)
@@ -7108,7 +7155,9 @@ initialize_aarch64_code_model (void)
 	   aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
 	   break;
 	 case AARCH64_CMODEL_SMALL:
-	   aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
+	   aarch64_cmodel = (flag_pic == 2
+			     ? AARCH64_CMODEL_SMALL_PIC
+			     : AARCH64_CMODEL_SMALL_SPIC);
 	   break;
 	 case AARCH64_CMODEL_LARGE:
 	   sorry ("code model %qs with -f%s", "large",
@@ -7189,6 +7238,7 @@ aarch64_classify_symbol (rtx x, rtx offset,
 	case AARCH64_CMODEL_TINY:
 	  return SYMBOL_TINY_ABSOLUTE;
 
+	case AARCH64_CMODEL_SMALL_SPIC:
 	case AARCH64_CMODEL_SMALL_PIC:
 	case AARCH64_CMODEL_SMALL:
 	  return SYMBOL_SMALL_ABSOLUTE;
@@ -7236,6 +7286,7 @@ aarch64_classify_symbol (rtx x, rtx offset,
 	    return SYMBOL_TINY_GOT;
 	  return SYMBOL_TINY_ABSOLUTE;
 
+	case AARCH64_CMODEL_SMALL_SPIC:
 	case AARCH64_CMODEL_SMALL_PIC:
 	  if (!aarch64_symbol_binds_local_p (x))
 	    return SYMBOL_SMALL_GOT;
@@ -9118,6 +9169,7 @@ aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
      case AARCH64_CMODEL_TINY_PIC:
      case AARCH64_CMODEL_SMALL:
      case AARCH64_CMODEL_SMALL_PIC:
+     case AARCH64_CMODEL_SMALL_SPIC:
        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
 	  for everything.  */
        type = DW_EH_PE_sdata4;
@@ -11371,6 +11423,18 @@ aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
   return true;
 }
 
+/* Return 1 if pseudo register should be created and used to hold
+   GOT address for PIC code.  */
+
+bool
+aarch64_use_pseudo_pic_reg (void)
+{
+  if (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
+    return true;
+
+  return false;
+}
+
 #undef TARGET_ADDRESS_COST
 #define TARGET_ADDRESS_COST aarch64_address_cost
 
@@ -11649,6 +11712,9 @@ aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
 #undef TARGET_SCHED_FUSION_PRIORITY
 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
 
+#undef TARGET_USE_PSEUDO_PIC_REG
+#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-aarch64.h"
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 1c2c5fb..43ab44e 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4226,7 +4226,12 @@
 			      (match_operand:PTR 2 "aarch64_valid_symref" "S")))]
 		    UNSPEC_GOTSMALLPIC))]
   ""
-  "ldr\\t%<w>0, [%1, #:got_lo12:%a2]"
+  {
+    if (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
+      return "ldr\t%<w>0, [%1, #:<got_modifier>:%a2]";
+    else
+      return "ldr\t%<w>0, [%1, #:got_lo12:%a2]";
+  }
   [(set_attr "type" "load1")]
 )
 
@@ -4238,7 +4243,12 @@
 			     (match_operand:DI 2 "aarch64_valid_symref" "S")))]
 		    UNSPEC_GOTSMALLPIC)))]
   "TARGET_ILP32"
-  "ldr\\t%w0, [%1, #:got_lo12:%a2]"
+  {
+    if (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
+      return "ldr\t%w0, [%1, #:gotpage_lo14:%a2]";
+    else
+      return "ldr\t%w0, [%1, #:got_lo12:%a2]";
+  }
   [(set_attr "type" "load1")]
 )
 
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 498358a..5f6d0cc 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -657,6 +657,10 @@
 
 (define_mode_attr insn_count [(OI "8") (CI "12") (XI "16")])
 
+;; -fpic small model GOT reloc modifiers: gotpage_lo15/lo14 for LP64/ILP32.
+;; No need of iterator for -fPIC as it uses got_lo12 for both modes.
+(define_mode_attr got_modifier [(SI "gotpage_lo14") (DI "gotpage_lo15")])
+
 ;; -------------------------------------------------------------------
 ;; Code Iterators
 ;; -------------------------------------------------------------------
diff --git a/gcc/testsuite/gcc.target/aarch64/pic-small.c b/gcc/testsuite/gcc.target/aarch64/pic-small.c
new file mode 100644
index 0000000..b5156fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pic-small.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fpic -fno-inline --save-temps" } */
+
+void abort ();
+int global_a;
+
+int
+initialize (void)
+{
+  global_a = 0x10;
+  return global_a - 1;
+}
+
+int
+main (int argc, char **argv)
+{
+  int a = initialize ();
+
+  if (a != global_a - 1)
+    abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "adrp\tx\[0-9\]+, :got:_GLOBAL_OFFSET_TABLE" 2 } } */
+/* { dg-final { cleanup-saved-temps } } */

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]