This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

[PATCH][AArch64] Implement workaround for ARM Cortex-A53 erratum 835769

From: Kyrill Tkachov <kyrylo dot tkachov at arm dot com>
To: GCC Patches <gcc-patches at gcc dot gnu dot org>
Cc: Richard Earnshaw <Richard dot Earnshaw at arm dot com>, Marcus Shawcroft <marcus dot shawcroft at arm dot com>
Date: Fri, 10 Oct 2014 11:53:25 +0100
Subject: [PATCH][AArch64] Implement workaround for ARM Cortex-A53 erratum 835769
Authentication-results: sourceware.org; auth=none

Hi all,


Some early revisions of the Cortex-A53 have an erratum (835769) whereby
it is possible for a 64-bit multiply-accumulate instruction in
AArch64 state to generate an incorrect result.  The details are quite
complex and hard to determine statically, since branches in the code
may exist in some circumstances, but all cases end with a memory
(load, store, or prefetch) instruction followed immediately by the
multiply-accumulate operation.

The safest work-around for this issue is to make the compiler avoid
emitting multiply-accumulate instructions immediately after memory
instructions and the simplest way to do this is to insert a NOP. A
linker patching technique can also be used, by moving potentially
affected multiply-accumulate instruction into a patch region and
replacing the original instruction with a branch to the patch.

This patch achieves the compiler part by using the final prescan pass.
The insn length attribute is updated accordingly using ADJUST_INSN_LENGTH
so that conditional branches work properly.

The fix is disabled by default and can be turned on by the
-mfix-cortex-a53-835769 compiler option.

I'm attaching a trunk and a 4.9 version of the patch.
The 4.9 version is different only in that rtx_insn* is replaced by rtx.

Tested on aarch64-none-linux-gnu (and bootstrap with the option) and
built various large benchmarks with it.

Ok?

Thanks,
Kyrill

2014-10-10  Kyrylo Tkachov<kyrylo.tkachov@arm.com>
            Ramana Radhakrishnan<ramana.radhakrishnan@arm.com>

     * config/aarch64/aarch64.h (FINAL_PRESCAN_INSN): Define.
     (ADJUST_INSN_LENGTH): Define.
     * config/aarch64/aarch64.opt (mfix-cortex-a53-835769): New option.
     * config/aarch64/aarch64.c (is_mem_p): New function.
     (is_memory_op): Likewise.
     (aarch64_prev_real_insn): Likewise.
     (is_madd_op): Likewise.
     (dep_between_memop_and_next): Likewise.
     (aarch64_madd_needs_nop): Likewise.
     (aarch64_final_prescan_insn): Likewise.
     * doc/invoke.texi (AArch64 Options): Document -mfix-cortex-a53-835769
     and -mno-fix-cortex-a53-835769 options.

commit 6ee2bec04fbce15b13b59b54ef4fe11aeda74ff4
Author: Kyrill Tkachov <kyrylo.tkachov@arm.com>
Date:   Wed Oct 8 12:48:34 2014 +0000

    [AArch64] Add final prescan workaround for Cortex-A53 erratum.

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index b5f53d2..c57a467 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -308,6 +308,8 @@ aarch64_builtin_vectorized_function (tree fndecl,
 
 extern void aarch64_split_combinev16qi (rtx operands[3]);
 extern void aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel);
+extern bool aarch64_madd_needs_nop (rtx_insn *);
+extern void aarch64_final_prescan_insn (rtx_insn *);
 extern bool
 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel);
 void aarch64_atomic_assign_expand_fenv (tree *, tree *, tree *);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 5144c35..76a2480 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -7586,6 +7586,128 @@ aarch64_mangle_type (const_tree type)
   return NULL;
 }
 
+static int
+is_mem_p (rtx *x, void *data ATTRIBUTE_UNUSED)
+{
+  return MEM_P (*x);
+}
+
+static bool
+is_memory_op (rtx_insn *mem_insn)
+{
+   rtx pattern = PATTERN (mem_insn);
+   return for_each_rtx (&pattern, is_mem_p, NULL);
+}
+
+/* Find the first rtx_insn before insn that will generate an assembly
+   instruction.  */
+
+static rtx_insn *
+aarch64_prev_real_insn (rtx_insn *insn)
+{
+  if (!insn)
+    return NULL;
+
+  do
+    {
+      insn = prev_real_insn (insn);
+    }
+  while (insn && recog_memoized (insn) < 0);
+
+  return insn;
+}
+
+static bool
+is_madd_op (enum attr_type t1)
+{
+  unsigned int i;
+  /* A number of these may be AArch32 only.  */
+  enum attr_type mlatypes[] = {
+    TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
+    TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
+    TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
+  };
+
+  for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
+    {
+      if (t1 == mlatypes[i])
+	return true;
+    }
+
+  return false;
+}
+
+/* Check if there is a register dependency between a load and the insn
+   for which we hold recog_data.  */
+
+static bool
+dep_between_memop_and_curr (rtx memop)
+{
+  rtx load_reg;
+  int opno;
+
+  if (!memop)
+    return false;
+
+  if (!REG_P (SET_DEST (memop)))
+    return false;
+
+  load_reg = SET_DEST (memop);
+  for (opno = 0; opno < recog_data.n_operands; opno++)
+    {
+      rtx operand = recog_data.operand[opno];
+      if (REG_P (operand)
+          && reg_overlap_mentioned_p (load_reg, operand))
+        return true;
+
+    }
+  return false;
+}
+
+bool
+aarch64_madd_needs_nop (rtx_insn* insn)
+{
+  enum attr_type attr_type;
+  rtx_insn *prev;
+  rtx body;
+
+  if (!aarch64_fix_a53_err835769)
+    return false;
+
+  if (recog_memoized (insn) < 0)
+    return false;
+
+  attr_type = get_attr_type (insn);
+  if (!is_madd_op (attr_type))
+    return false;
+
+  prev = aarch64_prev_real_insn (insn);
+  if (!prev)
+    return false;
+
+  body = single_set (prev);
+
+  /* If the previous insn is a memory op and there is no dependency between
+     it and the madd, emit a nop between them.  If we know the previous insn is
+     a memory op but body is NULL, emit the nop to be safe, it's probably a
+     load/store pair insn.  */
+  if (is_memory_op (prev)
+      && GET_MODE (recog_data.operand[0]) == DImode
+      && (!dep_between_memop_and_curr (body)))
+    return true;
+
+  return false;
+
+}
+
+void
+aarch64_final_prescan_insn (rtx_insn *insn)
+{
+  if (aarch64_madd_needs_nop (insn))
+    fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
+}
+
+
 /* Return the equivalent letter for size.  */
 static char
 sizetochar (int size)
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index db950da..e9e5fd8 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -486,6 +486,15 @@ enum target_cpus
   (TARGET_CPU_generic | (AARCH64_CPU_DEFAULT_FLAGS << 6))
 #endif
 
+/* If inserting NOP before a mult-accumulate insn remember to adjust the
+   length so that conditional branching code is updated appropriately.  */
+#define ADJUST_INSN_LENGTH(insn, length)	\
+  if (aarch64_madd_needs_nop (insn))		\
+    length += 4;
+
+#define FINAL_PRESCAN_INSN(INSN, OPVEC, NOPERANDS)	\
+    aarch64_final_prescan_insn (INSN);			\
+
 /* The processor for which instructions should be scheduled.  */
 extern enum aarch64_processor aarch64_tune;
 
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index f5a15b7..77deb2e 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -67,6 +67,10 @@ mgeneral-regs-only
 Target Report RejectNegative Mask(GENERAL_REGS_ONLY)
 Generate code which uses only the general registers
 
+mfix-cortex-a53-835769
+Target Report Var(aarch64_fix_a53_err835769) Init(0)
+Workaround for ARM Cortex-A53 Erratum number 835769
+
 mlittle-endian
 Target Report RejectNegative InverseMask(BIG_END)
 Assume target CPU is configured as little endian
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 5fe7e15..acc9dd9 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -489,6 +489,7 @@ Objective-C and Objective-C++ Dialects}.
 -mstrict-align @gol
 -momit-leaf-frame-pointer  -mno-omit-leaf-frame-pointer @gol
 -mtls-dialect=desc  -mtls-dialect=traditional @gol
+-mfix-cortex-a53-835769  -mno-fix-cortex-a53-835769 @gol
 -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
 
 @emph{Adapteva Epiphany Options}
@@ -11744,6 +11745,14 @@ of TLS variables.  This is the default.
 Use traditional TLS as the thread-local storage mechanism for dynamic accesses
 of TLS variables.
 
+@item -mfix-cortex-a53-835769
+@itemx -mno-fix-cortex-a53-835769
+@opindex -mfix-cortex-a53-835769
+@opindex -mno-fix-cortex-a53-835769
+Enable or disable the workaround for the ARM Cortex-A53 erratum number 835769.
+This will involve inserting a NOP instruction between memory instructions and
+64-bit integer multiply-accumulate instructions.
+
 @item -march=@var{name}
 @opindex march
 Specify the name of the target architecture, optionally suffixed by one or

commit 48cc738e2c38dd372ae054d30964dd780478c4d1
Author: Kyrill Tkachov <kyrylo.tkachov@arm.com>
Date:   Tue Oct 7 12:42:01 2014 +0000

    [AArch64] Add workaround for Cortex-A53 erratum 835769
    
    2014-10-07  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
                Ramana Radhakrishnan  <ramana.radhakrishnan@arm.com>
    
    	* config/aarch64/aarch64.h (FINAL_PRESCAN_INSN): Define.
    	(ADJUST_INSN_LENGTH): Define.
    	* config/aarch64/aarch64.opt (mfix-cortex-a53-835769): New
    	* option.
    	* config/aarch64/aarch64.c (is_mem_p): New function.
    	(is_memory_op): Likewise.
    	(aarch64_prev_real_insn): Likewise.
    	(is_madd_op): Likewise.
    	(dep_between_memop_and_curr): Likewise.
    	(aarch64_madd_needs_nop): Likewise.
    	(aarch64_final_prescan_insn): Likewise.
    	* doc/invoke.texi (AArch64 Options): Document
    	* -mfix-cortex-a53-835769
    	and -mno-fix-cortex-a53-835769 options.

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 5542f02..bef58bf 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -287,6 +287,8 @@ aarch64_builtin_vectorized_function (tree fndecl,
 
 extern void aarch64_split_combinev16qi (rtx operands[3]);
 extern void aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel);
+extern bool aarch64_madd_needs_nop (rtx);
+extern void aarch64_final_prescan_insn (rtx);
 extern bool
 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel);
 #endif /* GCC_AARCH64_PROTOS_H */
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index bf35031..24a57e7 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -6452,6 +6452,128 @@ aarch64_mangle_type (const_tree type)
   return NULL;
 }
 
+static int
+is_mem_p (rtx *x, void *data ATTRIBUTE_UNUSED)
+{
+  return MEM_P (*x);
+}
+
+static bool
+is_memory_op (rtx mem_insn)
+{
+   rtx pattern = PATTERN (mem_insn);
+   return for_each_rtx (&pattern, is_mem_p, NULL);
+}
+
+/* Find the first rtx before insn that will generate an assembly
+   instruction.  */
+
+static rtx
+aarch64_prev_real_insn (rtx insn)
+{
+  if (!insn)
+    return NULL;
+
+  do
+    {
+      insn = prev_real_insn (insn);
+    }
+  while (insn && recog_memoized (insn) < 0);
+
+  return insn;
+}
+
+static bool
+is_madd_op (enum attr_type t1)
+{
+  unsigned int i;
+  /* A number of these may be AArch32 only.  */
+  enum attr_type mlatypes[] = {
+    TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
+    TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
+    TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
+  };
+
+  for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
+    {
+      if (t1 == mlatypes[i])
+	return true;
+    }
+
+  return false;
+}
+
+/* Check if there is a register dependency between a load and the insn
+   for which we hold recog_data.  */
+
+static bool
+dep_between_memop_and_curr (rtx memop)
+{
+  rtx load_reg;
+  int opno;
+
+  if (!memop)
+    return false;
+
+  if (!REG_P (SET_DEST (memop)))
+    return false;
+
+  load_reg = SET_DEST (memop);
+  for (opno = 0; opno < recog_data.n_operands; opno++)
+    {
+      rtx operand = recog_data.operand[opno];
+      if (REG_P (operand)
+          && reg_overlap_mentioned_p (load_reg, operand))
+        return true;
+
+    }
+  return false;
+}
+
+bool
+aarch64_madd_needs_nop (rtx insn)
+{
+  enum attr_type attr_type;
+  rtx prev;
+  rtx body;
+
+  if (!aarch64_fix_a53_err835769)
+    return false;
+
+  if (recog_memoized (insn) < 0)
+    return false;
+
+  attr_type = get_attr_type (insn);
+  if (!is_madd_op (attr_type))
+    return false;
+
+  prev = aarch64_prev_real_insn (insn);
+  if (!prev)
+    return false;
+
+  body = single_set (prev);
+
+  /* If the previous insn is a memory op and there is no dependency between
+     it and the madd, emit a nop between them.  If we know the previous insn is
+     a memory op but body is NULL, emit the nop to be safe, it's probably a
+     load/store pair insn.  */
+  if (is_memory_op (prev)
+      && GET_MODE (recog_data.operand[0]) == DImode
+      && (!dep_between_memop_and_curr (body)))
+    return true;
+
+  return false;
+
+}
+
+void
+aarch64_final_prescan_insn (rtx insn)
+{
+  if (aarch64_madd_needs_nop (insn))
+    fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
+}
+
+
 /* Return the equivalent letter for size.  */
 static char
 sizetochar (int size)
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 2fd6df4..77b2bb9 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -481,6 +481,15 @@ enum target_cpus
   (TARGET_CPU_generic | (AARCH64_CPU_DEFAULT_FLAGS << 6))
 #endif
 
+/* If inserting NOP before a mult-accumulate insn remember to adjust the
+   length so that conditional branching code is updated appropriately.  */
+#define ADJUST_INSN_LENGTH(insn, length)	\
+  if (aarch64_madd_needs_nop (insn))		\
+    length += 4;
+
+#define FINAL_PRESCAN_INSN(INSN, OPVEC, NOPERANDS)	\
+    aarch64_final_prescan_insn (INSN);			\
+
 /* The processor for which instructions should be scheduled.  */
 extern enum aarch64_processor aarch64_tune;
 
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index f5a15b7..77deb2e 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -67,6 +67,10 @@ mgeneral-regs-only
 Target Report RejectNegative Mask(GENERAL_REGS_ONLY)
 Generate code which uses only the general registers
 
+mfix-cortex-a53-835769
+Target Report Var(aarch64_fix_a53_err835769) Init(0)
+Workaround for ARM Cortex-A53 Erratum number 835769
+
 mlittle-endian
 Target Report RejectNegative InverseMask(BIG_END)
 Assume target CPU is configured as little endian
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 573925d..a152472 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -479,6 +479,7 @@ Objective-C and Objective-C++ Dialects}.
 -mstrict-align @gol
 -momit-leaf-frame-pointer  -mno-omit-leaf-frame-pointer @gol
 -mtls-dialect=desc  -mtls-dialect=traditional @gol
+-mfix-cortex-a53-835769  -mno-fix-cortex-a53-835769 @gol
 -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
 
 @emph{Adapteva Epiphany Options}
@@ -11407,6 +11408,14 @@ of TLS variables.  This is the default.
 Use traditional TLS as the thread-local storage mechanism for dynamic accesses
 of TLS variables.
 
+@item -mfix-cortex-a53-835769
+@itemx -mno-fix-cortex-a53-835769
+@opindex -mfix-cortex-a53-835769
+@opindex -mno-fix-cortex-a53-835769
+Enable or disable the workaround for the ARM Cortex-A53 erratum number 835769.
+This will involve inserting a NOP instruction between memory instructions and
+64-bit integer multiply-accumulate instructions.
+
 @item -march=@var{name}
 @opindex march
 Specify the name of the target architecture, optionally suffixed by one or

Follow-Ups:
- Re: [PATCH][AArch64] Implement workaround for ARM Cortex-A53 erratum 835769
  - From: Marcus Shawcroft
- Re: [PATCH][AArch64] Implement workaround for ARM Cortex-A53 erratum 835769
  - From: Richard Earnshaw
- Re: [PATCH][AArch64] Implement workaround for ARM Cortex-A53 erratum 835769
  - From: Andrew Pinski

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]