This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[committed] Trampoline fix for hppa64 - take 2


My previous attempt at fixing trampolines on hppa64
<http://gcc.gnu.org/ml/gcc-patches/2003-11/msg00737.html> assumed
that all hppa64 (i.e., PA 2.0) machines have cache lines of 64 bytes.
This turns out not to be true: some early PA 2.0 machines (e.g., the
C200) use a cache line of 32 bytes.

This patch reverts to using a cache line length of 32 bytes.  The
length is now a single define (MIN_CACHELINE_SIZE), so it is easy to
change it to 16 bytes, but that is believed to be necessary only for
PA 1.0 machines, which we no longer support.

The patch has a number of other improvements.  The 32-bit PA 2.0
call sequence is now shorter.  Only the code in the trampoline is
flushed, and if that code happens to lie within a single cache line,
then only that one line is flushed.  The I and D cache flush patterns
(icacheflush and dcacheflush) are now capable of flushing an
arbitrary number of cache lines.

While working on the patch, I noticed that the anddi3 expander didn't
allow an `and' with an immediate constant operand when generating
64-bit code.  So, I fixed the anddi3 and iordi3 expanders to accept
suitable constants in 64-bit code.

Tested on hppa-unknown-linux-gnu, hppa64-hp-hpux11.11 and
hppa-hp-hpux11.00 with no regressions.  Committed to trunk.

Dave
-- 
J. David Anglin                                  dave.anglin@nrc-cnrc.gc.ca
National Research Council of Canada              (613) 990-0752 (FAX: 952-6602)

2003-12-20  John David Anglin  <dave.anglin@nrc-cnrc.gc.ca>

	* pa.h (TRAMPOLINE_TEMPLATE): Shorten sequence when generating PA
	2.0 code.
	(TRAMPOLINE_CODE_SIZE, MIN_CACHELINE_SIZE): New defines.
	(INITIALIZE_TRAMPOLINE): Rework to pass line length, and aligned start
	and end addresses to I and D cache instruction patterns.
	* pa.md (anddi3, iordi3): Change predicates of operands 1 and 2 to
	and_operand and ior_operand, respectively.  When generating 64-bit
	code, only one operand needs to be a register operand.
	(xordi3): Change predicates of operands 1 and 2 to register_operand.
	(one_cmpldi2): Change predicate of operand 1 to register_operand.
	(dcacheflush, icacheflush): Revise to flush an arbitrary number of
	cache lines.

Index: config/pa/pa.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/pa/pa.h,v
retrieving revision 1.207
diff -u -3 -p -r1.207 pa.h
--- config/pa/pa.h	14 Dec 2003 07:26:19 -0000	1.207
+++ config/pa/pa.h	19 Dec 2003 20:15:57 -0000
@@ -1003,10 +1003,20 @@ extern int may_call_alloca;
 	  fputs ("\tdepwi	0,31,2,%r21\n", FILE);			\
 	fputs ("\tldw	4(%r21),%r19\n", FILE);				\
 	fputs ("\tldw	0(%r21),%r21\n", FILE);				\
-	fputs ("\tldsid	(%r21),%r1\n", FILE);				\
-	fputs ("\tmtsp	%r1,%sr0\n", FILE);				\
-	fputs ("\tbe	0(%sr0,%r21)\n", FILE);				\
-	fputs ("\tldw	40(%r22),%r29\n", FILE);			\
+	if (TARGET_PA_20)						\
+	  {								\
+	    fputs ("\tbve	(%r21)\n", FILE);			\
+	    fputs ("\tldw	40(%r22),%r29\n", FILE);		\
+	    fputs ("\t.word	0\n", FILE);				\
+	    fputs ("\t.word	0\n", FILE);				\
+	  }								\
+	else								\
+	  {								\
+	    fputs ("\tldsid	(%r21),%r1\n", FILE);			\
+	    fputs ("\tmtsp	%r1,%sr0\n", FILE);			\
+	    fputs ("\tbe	0(%sr0,%r21)\n", FILE);			\
+	    fputs ("\tldw	40(%r22),%r29\n", FILE);		\
+	  }								\
 	fputs ("\t.word	0\n", FILE);					\
 	fputs ("\t.word	0\n", FILE);					\
 	fputs ("\t.word	0\n", FILE);					\
@@ -1029,16 +1039,21 @@ extern int may_call_alloca;
       }									\
   }
 
-/* Length in units of the trampoline for entering a nested function.
+/* Length in units of the trampoline for entering a nested function.  */
 
-   Flush the cache entries corresponding to the first and last addresses
-   of the trampoline.  This is necessary as the trampoline may cross two
-   cache lines.
+#define TRAMPOLINE_SIZE (TARGET_64BIT ? 72 : 52)
 
-   If the code part of the trampoline ever grows to > 32 bytes, then it
-   will become necessary to hack on the cacheflush pattern in pa.md.  */
+/* Length in units of the trampoline instruction code.  */
 
-#define TRAMPOLINE_SIZE (TARGET_64BIT ? 72 : 52)
+#define TRAMPOLINE_CODE_SIZE (TARGET_64BIT ? 24 : (TARGET_PA_20 ? 32 : 40))
+
+/* Minimum length of a cache line.  A length of 16 will work on all
+   PA-RISC processors.  All PA 1.1 processors have a cache line of
+   32 bytes.  Most but not all PA 2.0 processors have a cache line
+   of 64 bytes.  As cache flushes are expensive and we don't support
+   PA 1.0, we use a minimum length of 32.  */
+
+#define MIN_CACHELINE_SIZE 32
 
 /* Emit RTL insns to initialize the variable parts of a trampoline.
    FNADDR is an RTX for the address of the function's pure code.
@@ -1048,54 +1063,85 @@ extern int may_call_alloca;
    Move the static chain value to trampoline template at offset 40.
    Move the trampoline address to trampoline template at offset 44.
    Move r19 to trampoline template at offset 48.  The latter two
-   words create a plabel for the indirect call to the trampoline.  */
+   words create a plabel for the indirect call to the trampoline.
+
+   A similar sequence is used for the 64-bit port but the plabel is
+   at the beginning of the trampoline.
+
+   Finally, the cache entries for the trampoline code are flushed.
+   This is necessary to ensure that the trampoline instruction sequence
+   is written to memory prior to any attempts at prefetching the code
+   sequence.  */
 
 #define INITIALIZE_TRAMPOLINE(TRAMP, FNADDR, CXT) 			\
 {									\
+  rtx start_addr = gen_reg_rtx (Pmode);					\
+  rtx end_addr = gen_reg_rtx (Pmode);					\
+  rtx line_length = gen_reg_rtx (Pmode);				\
+  rtx tmp;								\
+									\
   if (!TARGET_64BIT)							\
     {									\
-      rtx start_addr, end_addr;						\
+      tmp = memory_address (Pmode, plus_constant ((TRAMP), 36));	\
+      emit_move_insn (gen_rtx_MEM (Pmode, tmp), (FNADDR));		\
+      tmp = memory_address (Pmode, plus_constant ((TRAMP), 40));	\
+      emit_move_insn (gen_rtx_MEM (Pmode, tmp), (CXT));			\
 									\
-      start_addr = memory_address (Pmode, plus_constant ((TRAMP), 36));	\
-      emit_move_insn (gen_rtx_MEM (Pmode, start_addr), (FNADDR));	\
-      start_addr = memory_address (Pmode, plus_constant ((TRAMP), 40));	\
-      emit_move_insn (gen_rtx_MEM (Pmode, start_addr), (CXT));		\
-      start_addr = memory_address (Pmode, plus_constant ((TRAMP), 44));	\
-      emit_move_insn (gen_rtx_MEM (Pmode, start_addr), (TRAMP));	\
-      start_addr = memory_address (Pmode, plus_constant ((TRAMP), 48));	\
-      emit_move_insn (gen_rtx_MEM (Pmode, start_addr),			\
+      /* Create a fat pointer for the trampoline.  */			\
+      tmp = memory_address (Pmode, plus_constant ((TRAMP), 44));	\
+      emit_move_insn (gen_rtx_MEM (Pmode, tmp), (TRAMP));		\
+      tmp = memory_address (Pmode, plus_constant ((TRAMP), 48));	\
+      emit_move_insn (gen_rtx_MEM (Pmode, tmp),				\
 		      gen_rtx_REG (Pmode, 19));				\
+									\
       /* fdc and fic only use registers for the address to flush,	\
-	 they do not accept integer displacements.  */ 			\
-      start_addr = force_reg (Pmode, (TRAMP));				\
-      end_addr = force_reg (Pmode, plus_constant ((TRAMP), 32));	\
-      emit_insn (gen_dcacheflush (start_addr, end_addr));		\
-      emit_insn (gen_icacheflush (start_addr, end_addr, start_addr,	\
+	 they do not accept integer displacements.  We align the	\
+	 start and end addresses to the beginning of their respective	\
+	 cache lines to minimize the number of lines flushed.  */	\
+      tmp = force_reg (Pmode, (TRAMP));					\
+      emit_insn (gen_andsi3 (start_addr, tmp,				\
+			     GEN_INT (-MIN_CACHELINE_SIZE)));		\
+      tmp = force_reg (Pmode,						\
+		       plus_constant (tmp, TRAMPOLINE_CODE_SIZE - 1));	\
+      emit_insn (gen_andsi3 (end_addr, tmp,				\
+			     GEN_INT (-MIN_CACHELINE_SIZE)));		\
+      emit_move_insn (line_length, GEN_INT (MIN_CACHELINE_SIZE));	\
+      emit_insn (gen_dcacheflush (start_addr, end_addr, line_length));	\
+      emit_insn (gen_icacheflush (start_addr, end_addr, line_length,	\
+				  gen_reg_rtx (Pmode),			\
 				  gen_reg_rtx (Pmode),			\
 				  gen_reg_rtx (Pmode)));		\
     }									\
   else									\
     {									\
-      rtx start_addr, end_addr;						\
+      tmp = memory_address (Pmode, plus_constant ((TRAMP), 56));	\
+      emit_move_insn (gen_rtx_MEM (Pmode, tmp), (FNADDR));		\
+      tmp = memory_address (Pmode, plus_constant ((TRAMP), 64));	\
+      emit_move_insn (gen_rtx_MEM (Pmode, tmp), (CXT));			\
 									\
-      start_addr = memory_address (Pmode, plus_constant ((TRAMP), 56));	\
-      emit_move_insn (gen_rtx_MEM (Pmode, start_addr), (FNADDR));	\
-      start_addr = memory_address (Pmode, plus_constant ((TRAMP), 64));	\
-      emit_move_insn (gen_rtx_MEM (Pmode, start_addr), (CXT));		\
       /* Create a fat pointer for the trampoline.  */			\
-      end_addr = force_reg (Pmode, plus_constant ((TRAMP), 32));	\
-      start_addr = memory_address (Pmode, plus_constant ((TRAMP), 16));	\
-      emit_move_insn (gen_rtx_MEM (Pmode, start_addr), end_addr);	\
-      end_addr = gen_rtx_REG (Pmode, 27);				\
-      start_addr = memory_address (Pmode, plus_constant ((TRAMP), 24));	\
-      emit_move_insn (gen_rtx_MEM (Pmode, start_addr), end_addr);	\
+      tmp = memory_address (Pmode, plus_constant ((TRAMP), 16));	\
+      emit_move_insn (gen_rtx_MEM (Pmode, tmp),				\
+		      force_reg (Pmode, plus_constant ((TRAMP), 32)));	\
+      tmp = memory_address (Pmode, plus_constant ((TRAMP), 24));	\
+      emit_move_insn (gen_rtx_MEM (Pmode, tmp),				\
+		      gen_rtx_REG (Pmode, 27));				\
+									\
       /* fdc and fic only use registers for the address to flush,	\
-	 they do not accept integer displacements.   PA 2.0 cache	\
-	 lines are 64 bytes.  */		 			\
-      start_addr = force_reg (Pmode, (TRAMP));				\
-      end_addr = force_reg (Pmode, plus_constant ((TRAMP), 64));	\
-      emit_insn (gen_dcacheflush (start_addr, end_addr));		\
-      emit_insn (gen_icacheflush (start_addr, end_addr, start_addr,	\
+	 they do not accept integer displacements.  We align the	\
+	 start and end addresses to the beginning of their respective	\
+	 cache lines to minimize the number of lines flushed.  */	\
+      tmp = force_reg (Pmode, plus_constant ((TRAMP), 32));		\
+      emit_insn (gen_anddi3 (start_addr, tmp,				\
+			     GEN_INT (-MIN_CACHELINE_SIZE)));		\
+      tmp = force_reg (Pmode,						\
+		       plus_constant (tmp, TRAMPOLINE_CODE_SIZE - 1));	\
+      emit_insn (gen_anddi3 (end_addr, tmp,				\
+			     GEN_INT (-MIN_CACHELINE_SIZE)));		\
+      emit_move_insn (line_length, GEN_INT (MIN_CACHELINE_SIZE));	\
+      emit_insn (gen_dcacheflush (start_addr, end_addr, line_length));	\
+      emit_insn (gen_icacheflush (start_addr, end_addr, line_length,	\
+				  gen_reg_rtx (Pmode),			\
 				  gen_reg_rtx (Pmode),			\
 				  gen_reg_rtx (Pmode)));		\
     }									\
Index: config/pa/pa.md
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/pa/pa.md,v
retrieving revision 1.136
diff -u -3 -p -r1.136 pa.md
--- config/pa/pa.md	14 Dec 2003 07:26:19 -0000	1.136
+++ config/pa/pa.md	19 Dec 2003 20:16:03 -0000
@@ -5322,15 +5322,25 @@
 
 (define_expand "anddi3"
   [(set (match_operand:DI 0 "register_operand" "")
-	(and:DI (match_operand:DI 1 "arith_double_operand" "")
-		(match_operand:DI 2 "arith_double_operand" "")))]
+	(and:DI (match_operand:DI 1 "and_operand" "")
+		(match_operand:DI 2 "and_operand" "")))]
   ""
   "
 {
-  if (! register_operand (operands[1], DImode)
-      || ! register_operand (operands[2], DImode))
-    /* Let GCC break this into word-at-a-time operations.  */
-    FAIL;
+  if (TARGET_64BIT)
+    {
+      /* One operand must be a register operand.  */
+      if (!register_operand (operands[1], DImode)
+	  && !register_operand (operands[2], DImode))
+	FAIL;
+    }
+  else
+    {
+      /* Both operands must be register operands.  */
+      if (!register_operand (operands[1], DImode)
+	  || !register_operand (operands[2], DImode))
+	FAIL;
+    }
 }")
 
 (define_insn ""
@@ -5391,15 +5401,25 @@
 
 (define_expand "iordi3"
   [(set (match_operand:DI 0 "register_operand" "")
-	(ior:DI (match_operand:DI 1 "arith_double_operand" "")
-		(match_operand:DI 2 "arith_double_operand" "")))]
+	(ior:DI (match_operand:DI 1 "ior_operand" "")
+		(match_operand:DI 2 "ior_operand" "")))]
   ""
   "
 {
-  if (! register_operand (operands[1], DImode)
-      || ! register_operand (operands[2], DImode))
-    /* Let GCC break this into word-at-a-time operations.  */
-    FAIL;
+  if (TARGET_64BIT)
+    {
+      /* One operand must be a register operand.  */
+      if (!register_operand (operands[1], DImode)
+	  && !register_operand (operands[2], DImode))
+	FAIL;
+    }
+  else
+    {
+      /* Both operands must be register operands.  */
+      if (!register_operand (operands[1], DImode)
+	  || !register_operand (operands[2], DImode))
+	FAIL;
+    }
 }")
 
 (define_insn ""
@@ -5462,15 +5482,11 @@
 
 (define_expand "xordi3"
   [(set (match_operand:DI 0 "register_operand" "")
-	(xor:DI (match_operand:DI 1 "arith_double_operand" "")
-		(match_operand:DI 2 "arith_double_operand" "")))]
+	(xor:DI (match_operand:DI 1 "register_operand" "")
+		(match_operand:DI 2 "register_operand" "")))]
   ""
   "
 {
-  if (! register_operand (operands[1], DImode)
-      || ! register_operand (operands[2], DImode))
-    /* Let GCC break this into word-at-a-time operations.  */
-    FAIL;
 }")
 
 (define_insn ""
@@ -5532,12 +5548,10 @@
 
 (define_expand "one_cmpldi2"
   [(set (match_operand:DI 0 "register_operand" "")
-	(not:DI (match_operand:DI 1 "arith_double_operand" "")))]
+	(not:DI (match_operand:DI 1 "register_operand" "")))]
   ""
   "
 {
-  if (! register_operand (operands[1], DImode))
-    FAIL;
 }")
 
 (define_insn ""
@@ -8828,29 +8842,59 @@ add,l %2,%3,%3\;bv,n %%r0(%3)"
   return \"\";
 }")
 
-;; Flush the I and D cache line found at the address in operand 0.
+;; Flush the I and D cache lines from the start address (operand0)
+;; to the end address (operand1).  No lines are flushed if the end
+;; address is less than the start address (unsigned).
+;;
+;; Because the range of memory flushed is variable and the size of
+;; a MEM can only be a CONST_INT, the patterns specify that they
+;; perform an unspecified volatile operation on all memory.
+;;
+;; The address range for an icache flush must lie within a single
+;; space on targets with non-equivalent space registers.
+;;
 ;; This is used by the trampoline code for nested functions.
-;; So long as the trampoline itself is less than 32 bytes this
-;; is sufficient.
-
+;;
+;; Operand 0 contains the start address.
+;; Operand 1 contains the end address.
+;; Operand 2 contains the line length to use.
+;; Operand 3 contains the start address (clobbered).
+;; Operands 4 and 5 (icacheflush) are clobbered scratch registers.
 (define_insn "dcacheflush"
-  [(unspec_volatile [(const_int 1)] 0)
-   (use (mem:SI (match_operand 0 "pmode_register_operand" "r")))
-   (use (mem:SI (match_operand 1 "pmode_register_operand" "r")))]
+  [(const_int 1)
+   (unspec_volatile [(mem:BLK (scratch))] 0)
+   (use (match_operand 0 "pmode_register_operand" "r"))
+   (use (match_operand 1 "pmode_register_operand" "r"))
+   (use (match_operand 2 "pmode_register_operand" "r"))
+   (clobber (match_scratch 3 "=&0"))]
   ""
-  "fdc 0(%0)\;fdc 0(%1)\;sync"
+  "*
+{
+  if (TARGET_64BIT)
+    return \"cmpb,*<<=,n %3,%1,.\;fdc,m %2(%3)\;sync\";
+  else
+    return \"cmpb,<<=,n %3,%1,.\;fdc,m %2(%3)\;sync\";
+}"
   [(set_attr "type" "multi")
    (set_attr "length" "12")])
 
 (define_insn "icacheflush"
-  [(unspec_volatile [(const_int 2)] 0)
-   (use (mem:SI (match_operand 0 "pmode_register_operand" "r")))
-   (use (mem:SI (match_operand 1 "pmode_register_operand" "r")))
+  [(const_int 2)
+   (unspec_volatile [(mem:BLK (scratch))] 0)
+   (use (match_operand 0 "pmode_register_operand" "r"))
+   (use (match_operand 1 "pmode_register_operand" "r"))
    (use (match_operand 2 "pmode_register_operand" "r"))
-   (clobber (match_operand 3 "pmode_register_operand" "=&r"))
-   (clobber (match_operand 4 "pmode_register_operand" "=&r"))]
+   (clobber (match_scratch 3 "=&0"))
+   (clobber (match_operand 4 "pmode_register_operand" "=&r"))
+   (clobber (match_operand 5 "pmode_register_operand" "=&r"))]
   ""
-  "mfsp %%sr0,%4\;ldsid (%2),%3\;mtsp %3,%%sr0\;fic 0(%%sr0,%0)\;fic 0(%%sr0,%1)\;sync\;mtsp %4,%%sr0\;nop\;nop\;nop\;nop\;nop\;nop"
+  "*
+{
+  if (TARGET_64BIT)
+    return \"mfsp %%sr0,%5\;ldsid (%3),%4\;mtsp %4,%%sr0\;cmpb,*<<=,n %3,%1,.\;fic,m %2(%%sr0,%3)\;sync\;mtsp %5,%%sr0\;nop\;nop\;nop\;nop\;nop\;nop\";
+  else
+    return \"mfsp %%sr0,%5\;ldsid (%3),%4\;mtsp %4,%%sr0\;cmpb,<<=,n %3,%1,.\;fic,m %2(%%sr0,%3)\;sync\;mtsp %5,%%sr0\;nop\;nop\;nop\;nop\;nop\;nop\";
+}"
   [(set_attr "type" "multi")
    (set_attr "length" "52")])
 


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]