This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: Add support to trace comparison instructions and switch statements


On Fri, Jul 21, 2017 at 01:38:17PM +0800, 吴潍浠(此彼) wrote:
> Hi Jeff
> 
> I have signed the copyright assignment, and used the name 'Wish Wu' .
> Should I send you a copy of my assignment ?
> 
> The attachment is my new patch with small changes. 
> Codes are checked by ./contrib/check_GNU_style.sh, except some special files.

Please provide a ChangeLog entry, you can use ./contrib/mklog as a start.

@@ -975,6 +974,10 @@ fsanitize=
 Common Driver Report Joined
 Select what to sanitize.
 
+fsanitize-coverage=
+Common Driver Report Joined
+Select what to coverage sanitize.
+

Why Driver?  The reason fsanitize= needs it is that say for
-fsanitize=address we add libraries in the driver, etc., but that
isn't the case for the coverage, right?

--- gcc/flag-types.h	(revision 250199)
+++ gcc/flag-types.h	(working copy)
@@ -250,6 +250,14 @@ enum sanitize_code {
 				  | SANITIZE_BOUNDS_STRICT
 };
 
+/* Different trace modes.  */
+enum sanitize_coverage_code {
+  /* Trace PC.  */
+  SANITIZE_COV_TRACE_PC = 1UL << 0,
+  /* Trace Compare.  */
+  SANITIZE_COV_TRACE_CMP = 1UL << 1
+};

No need for UL suffixes, the reason sanitize_code uses them is
that it includes 1 << 16 and above and might be included even in target code
(for host we require 32-bit integers, for target code it might be just
16-bit).

--- gcc/opts.c	(revision 250199)
+++ gcc/opts.c	(working copy)
@@ -1519,6 +1519,17 @@ const struct sanitizer_opts_s sanitizer_opts[] =
   { NULL, 0U, 0UL, false }
 };
 
+/* -f{,no-}sanitize-coverage= suboptions.  */
+const struct sanitizer_opts_s coverage_sanitizer_opts[] =
+{
+#define SANITIZER_OPT(name, flags, recover) \
+    { #name, flags, sizeof #name - 1, recover }
+  SANITIZER_OPT (trace-pc, SANITIZE_COV_TRACE_PC, false),
+  SANITIZER_OPT (trace-cmp, SANITIZE_COV_TRACE_CMP, false),
+#undef SANITIZER_OPT
+  { NULL, 0U, 0UL, false }

No need to have the recover argument for the macro, just add false to it
(unless you want to use a different struct type that wouldn't even include
that member).

+/* Given ARG, an unrecognized coverage sanitizer option, return the best
+   matching coverage sanitizer option, or NULL if there isn't one.  */
+
+static const char *
+get_closest_coverage_sanitizer_option (const string_fragment &arg)
+{
+  best_match <const string_fragment &, const char*> bm (arg);
+  for (int i = 0; coverage_sanitizer_opts[i].name != NULL; ++i)
+    {
+      bm.consider (coverage_sanitizer_opts[i].name);
+    }

Body which contains just one line shouldn't be wrapped in {}s, just use
  for (int i = 0; coverage_sanitizer_opts[i].name != NULL; ++i)
    bm.consider (coverage_sanitizer_opts[i].name);

+unsigned int
+parse_coverage_sanitizer_options (const char *p, location_t loc,
+			 unsigned int flags, int value, bool complain)

Wrong formatting, unsigned int should go below const char *, like:

parse_coverage_sanitizer_options (const char *p, location_t loc,
				  unsigned int flags, int value, bool complain)

+{
+  while (*p != 0)
+    {
+      size_t len, i;
+      bool found = false;
+      const char *comma = strchr (p, ',');
+
+      if (comma == NULL)
+	len = strlen (p);
+      else
+	len = comma - p;
+      if (len == 0)
+	{
+	  p = comma + 1;
+	  continue;
+	}
+
+      /* Check to see if the string matches an option class name.  */
+      for (i = 0; coverage_sanitizer_opts[i].name != NULL; ++i)
+	if (len == coverage_sanitizer_opts[i].len
+	    && memcmp (p, coverage_sanitizer_opts[i].name, len) == 0)
+	  {
+	    if (value)
+	      flags |= coverage_sanitizer_opts[i].flag;
+	    else
+	      flags &= ~coverage_sanitizer_opts[i].flag;
+	    found = true;
+	    break;
+	  }
+
+      if (! found && complain)
+	{
+	  const char *hint
+	    = get_closest_coverage_sanitizer_option (string_fragment (p, len));
+
+	  if (hint)
+	    error_at (loc,
+		      "unrecognized argument to "
+		      "-f%ssanitize-coverage= option: %q.*s;"
+		      " did you mean %qs?",
+		      value ? "" : "no-",
+		      (int) len, p, hint);
+	  else
+	    error_at (loc,
+		      "unrecognized argument to "
+		      "-f%ssanitize-coverage= option: %q.*s",
+		      value ? "" : "no-",
+		      (int) len, p);
+	}
+
+      if (comma == NULL)
+	break;
+      p = comma + 1;
+    }
+  return flags;
+}

Though, looking at the above, it sounds like there is just way too much
duplication.  So, maybe better just use the parse_sanitizer_options
and get_closest_coverage_option functions for all of
-f{,no-}sanitize{,-recover,-coverage}= , add
  const struct sanitizer_opts_s *opts = sanitizer_opts;
  if (code == OPT_fsanitize_coverage_)
    opts = coverage_sanitizer_opts;
early in both functions, deal with the diagnostics (to print "-coverage"
when needed) and look if there is anything else that needs to be
conditionalized.

@@ -1943,6 +2032,12 @@ common_handle_option (struct gcc_options *opts,
 	  &= ~(SANITIZE_UNDEFINED | SANITIZE_UNDEFINED_NONDEFAULT);
       break;
 
+    case OPT_fsanitize_coverage_:
+      opts->x_flag_sanitize_coverage
+	= parse_coverage_sanitizer_options (arg, loc,
+				   opts->x_flag_sanitize_coverage, value, true);

Wrong formatting, opts should go below arg.  But if you use
process_sanitizer_options as mentioned above, it will be a non-issue.
+      break;
+
     case OPT_O:
     case OPT_Os:
     case OPT_Ofast:
Index: gcc/sancov.c
===================================================================
--- gcc/sancov.c	(revision 250199)
+++ gcc/sancov.c	(working copy)
@@ -1,6 +1,7 @@
 /* Code coverage instrumentation for fuzzing.
    Copyright (C) 2015-2017 Free Software Foundation, Inc.
-   Contributed by Dmitry Vyukov <dvyukov@google.com>
+   Contributed by Dmitry Vyukov <dvyukov@google.com> and
+   Wish Wu <wishwu007@gmail.com>
 
 This file is part of GCC.
 
@@ -29,31 +30,202 @@ along with GCC; see the file COPYING3.  If not see
 #include "flags.h"
 #include "stmt.h"
 #include "gimple-iterator.h"
+#include "tree-core.h"

tree-core.h is already included by tree.h, why do you need to include it
explicitly?

 #include "tree-cfg.h"
 #include "tree-pass.h"
 #include "tree-iterator.h"
+#include "fold-const.h"
+#include "stringpool.h"
+#include "output.h"
+#include "cgraph.h"
 #include "asan.h"
 
 namespace {
 
+static void
+instrument_cond (gimple_stmt_iterator *gsi, gimple *stmt)
+{
+  unsigned int bitno;
+  tree lhs = gimple_cond_lhs (stmt);
+  tree rhs = gimple_cond_rhs (stmt);
+
+  bitno = MAX (TYPE_PRECISION (TREE_TYPE (lhs)),
+	       TYPE_PRECISION (TREE_TYPE (rhs)));

1) this is C++, so you can mix declarations and code,
   in this case there are even just declarations before it,
   so you can use unsigned int bitno = ...;
2) better use prec than bitno for the variable name
3) MAX doesn't make sense, a GIMPLE_COND should have
   both operands of compatible types
4) TYPE_PRECISION is only meaningful for some types, and could
   mean something unrelated for some other types, I think you
   want to move it after the check for type kind
+
+  if (TREE_CODE (TREE_TYPE (lhs)) == INTEGER_TYPE)

What about ENUMERAL_TYPEs (and maybe BOOLEAN_TYPEs)?
Shouldn't they be handled like INTEGER_TYPE, i.e. use
  if (INTEGRAL_TYPE_P (TREE_TYPE (lhs)))
?
+    {
+      enum built_in_function fncode;
+      switch (bitno)
+	{
+	case 8:
+      	  fncode = BUILT_IN_SANITIZER_COV_TRACE_CMP1;
+	  break;
+
+	case 16:
+      	  fncode = BUILT_IN_SANITIZER_COV_TRACE_CMP2;
+	  break;
+
+	case 32:
+      	  fncode = BUILT_IN_SANITIZER_COV_TRACE_CMP4;
+	  break;
+
+	case 64:
+      	  fncode = BUILT_IN_SANITIZER_COV_TRACE_CMP8;
+	  break;
+
+	default:
+	  return;
+	}
+      tree fndecl = builtin_decl_implicit (fncode);

For signed integers, when all these builtins have unsigned arguments
it looks like pedantically invalid GIMPLE IL, you'd need to
cast the arguments to corresponding unsigned type.

+      gimple *gcall = gimple_build_call (fndecl, 2, lhs, rhs);
+      gimple_set_location (gcall, gimple_location (stmt));
+      gsi_insert_before (gsi, gcall, GSI_SAME_STMT);
+    }
+  else if (TREE_CODE (TREE_TYPE (lhs)) == REAL_TYPE)

  else if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (lhs))) ?

+    {
+      enum built_in_function fncode;
+      switch (bitno)
+	{
+	case 32:
+      	  fncode = BUILT_IN_SANITIZER_COV_TRACE_CMPF;
+	  break;
+
+	case 64:
+      	  fncode = BUILT_IN_SANITIZER_COV_TRACE_CMPD;
+	  break;

Not really sure it is a good idea to use a precision of
REAL_TYPE to decide which builtin to use.
I'd think it should be a check whether TYPE_MODE (TREE_TYPE (lhs))
is equal to TYPE_MODE (float_type_node) or TYPE_MODE (double_type_node).

+static void
+instrument_switch (gimple_stmt_iterator *gsi, gimple *stmt, function *fun)
+{
+  gswitch *switch_stmt = as_a<gswitch *> (stmt);
+  tree index = gimple_switch_index (switch_stmt);
+  unsigned bitno = TYPE_PRECISION (TREE_TYPE (index));

Again, please use prec instead of bitno.
Shouldn't you punt if prec > 64?

+  unsigned i, n = gimple_switch_num_labels (switch_stmt), num = 0;
+  for (i = 0; i < n; ++i)
+    {
+      tree label = gimple_switch_label (switch_stmt, i);
+      tree low_case = CASE_LOW (label);
+      if (low_case != NULL_TREE)
+	  num++;

Wrong formatting, num++ should be just one tab indented, not tab + 2 spaces.

+      tree high_case = CASE_HIGH (label);
+      if (high_case != NULL_TREE)
+	  num++;

Is that really how do you want to handle CASE_HIGH non-NULL?  That is for
cases like:
  case 'a' ... 'z':
In that case it is actually like 26 cases, not 2.

+    }
+
+  tree case_array_elem_type = build_type_variant (uint64_type_node, 1, 0);
+  tree case_array_type = build_array_type (case_array_elem_type,
+					   build_index_type (size_int (num + 2
+								       - 1)));

Better wrap like:
  tree case_array_type
    = build_array_type (...);

+  for (i = 0; i < n; ++i)
+    {
+      tree label = gimple_switch_label (switch_stmt, i);
+
+      tree low_case = CASE_LOW (label);
+      if (low_case != NULL_TREE)
+	CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
+				build_int_cst (uint64_type_node,
+					       TREE_INT_CST_LOW (low_case)));

You don't want build_int_cst, instead use fold_convert (uint64_type_node, low_case);

+  /* Insert callback to every compare statments.  */
+  if (flag_sanitize_coverage & SANITIZE_COV_TRACE_CMP)
+    {
+      FOR_EACH_BB_FN (bb, fun)
+	{
+	  gimple_stmt_iterator gsi;
+	  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+	    {
+	      gimple *stmt = gsi_stmt (gsi);
+	      switch (gimple_code (stmt))
+		{
+		case GIMPLE_COND:
+		  instrument_cond (&gsi, stmt);
+		  break;
+
+		case GIMPLE_SWITCH:
+		  instrument_switch (&gsi, stmt, fun);
+	    	  break;

If you are just looking for GIMPLE_COND and GIMPLE_SWITCH, then both of them
always satisfy stmt_ends_bb_p and thus you don't need to walk the whole bb
to find them; just look at the last stmt in the bb.

That said, the question is what you really want to instrument and why.
Consider simple:
void bar (void);
void
foo (int x)
{
  if (x == 21 || x == 64 || x == 98 || x == 135)
    bar ();
}
testcase, in GIMPLE IL that will be depending on branch cost e.g. on x86_64:
  <bb 2> [100.00%] [count: INV]:
  _1 = x_8(D) == 21;
  _2 = x_8(D) == 64;
  _3 = _1 | _2;
  if (_3 != 0)
    goto <bb 4>; [33.00%] [count: INV]
  else
    goto <bb 3>; [67.00%] [count: INV]

  <bb 3> [67.00%] [count: INV]:
  _4 = x_8(D) == 98;
  _5 = x_8(D) == 135;
  _6 = _4 | _5;
  if (_6 != 0)
    goto <bb 4>; [50.00%] [count: INV]
  else
    goto <bb 5>; [50.00%] [count: INV]

  <bb 4> [66.50%] [count: INV]:
  bar (); [tail call]

  <bb 5> [100.00%] [count: INV]:
  return;
but e.g. on powerpc64:
  <bb 2> [100.00%] [count: INV]:
  if (x_2(D) == 21)
    goto <bb 6>; [20.24%] [count: INV]
  else
    goto <bb 3>; [79.76%] [count: INV]

  <bb 3> [79.76%] [count: INV]:
  if (x_2(D) == 64)
    goto <bb 6>; [34.00%] [count: INV]
  else
    goto <bb 4>; [66.00%] [count: INV]

  <bb 4> [52.64%] [count: INV]:
  if (x_2(D) == 98)
    goto <bb 6>; [34.00%] [count: INV]
  else
    goto <bb 5>; [66.00%] [count: INV]

  <bb 5> [34.74%] [count: INV]:
  if (x_2(D) == 135)
    goto <bb 6>; [34.00%] [count: INV]
  else
    goto <bb 7>; [66.00%] [count: INV]

  <bb 6> [77.07%] [count: INV]:
  bar (); [tail call]

  <bb 7> [100.00%] [count: INV]:
  return;
So in the powerpc64 case where it is closer to the source you'd instrument
comparisons with the 4 constants, while for x86_64 you wouldn't instrument
anything at all (as you don't instrument GIMPLE_COND with BOOLEAN_TYPE
operands - those have precision 1 and aren't INTEGRAL_TYPE, and those
GIMPLE_CONDs in there are actually artificial anyway and the original
comparisons are GIMPLE_ASSIGNs with EQ_EXPR comparison code).
Do you trace or should you trace the comparison kind, or do you really want
to treat x == 64, x > 64, x < 64, x != 64 etc. the same?
And, should what you trace be tied to basic blocks at whose start you add
the __sanitizer_cov_trace_pc instrumentation for
-fsanitize-coverage=trace-pc or is that completely unrelated?

Perhaps for -fsanitize-coverage= it might be a good idea to force
LOGICAL_OP_NON_SHORT_CIRCUIT/BRANCH_COST or whatever affects GIMPLE
decisions mentioned above so that the IL is closer to what the user wrote.

	Jakub


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]