This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
cpplib preparation for identifier spelling preservation

From: "Joseph S. Myers" <joseph at codesourcery dot com>
To: gcc-patches at gcc dot gnu dot org
Date: Sun, 10 May 2009 14:28:30 +0000 (UTC)
Subject: cpplib preparation for identifier spelling preservation
One missing piece of extended identifiers support is preserving the
spellings of those identifiers for the preprocessor # operator and for
checks on duplicate macro definitions.  (There have been attempts to
explain the present implementation as a phase 1 conversion of UCNs in
identifiers to some standard form, but this has never been documented
as such and does not fit well with the standard phases of translation;
I think it's better to avoid complicated phase 1 conversions and
instead take the (multibyte) source characters as they appear in the
source file interpreted according to -finput-charset.  In fact I think
the special phase 1 handling of backslash-whitespace-newline is a bad
idea (cf. discussions of how this does not allow writing some C++0x
raw strings) and we should remove that as well and then close bug 8270
as fixed, but that may be more controversial.)

This patch makes mechanical changes to cpplib in preparation for
implementing this spelling preservation.  Identifiers will store two
cpp_hashnode pointers instead of one (one to the canonical spelling,
one to the original spelling), and CPP_MACRO_ARG tokens will store the
original spelling as well as the number of the macro argument they
refer to; in both cases, there is room to spare without increasing
memory usage.  This patch simply moves the relevant pointers /
integers from being direct members of union cpp_token_u to being
contained in separate structures; a subsequent patch will then add the
second members to the structures and arrange for them to be used when
appropriate.

Bootstrapped with no regressions on x86_64-unknown-linux-gnu.  Applied
to mainline.

gcc:
2009-05-10  Joseph Myers  <joseph@codesourcery.com>

	* c-lex.c (c_lex_with_flags): Expect cpp_hashnode in
	tok->val.node.node.

libcpp:
2009-05-10  Joseph Myers  <joseph@codesourcery.com>

	* include/cpplib.h (enum cpp_token_fld_kind): Add
	CPP_TOKEN_FLD_TOKEN_NO.
	(struct cpp_macro_arg, struct cpp_identifier): Define.
	(union cpp_token_u): Use struct cpp_identifier for identifiers.
	Use struct cpp_macro_arg for macro arguments.  Add token_no for
	CPP_PASTE token numbers.
	* directives.c (_cpp_handle_directive, lex_macro_node, do_pragma,
	do_pragma_poison, parse_assertion): Use val.node.node in place of
	val.node.
	* expr.c (parse_defined, eval_token): Use val.node.node in place
	of val.node.
	* lex.c (cpp_ideq, _cpp_lex_direct, cpp_token_len,
	cpp_spell_token, cpp_output_token, _cpp_equiv_tokens,
	cpp_token_val_index): Use val.macro_arg.arg_no or val.token_no in
	place of val.arg_no.  Use val.node.node in place of val.node.
	* macro.c (replace_args, cpp_get_token, parse_params,
	lex_expansion_token, create_iso_definition, cpp_macro_definition):
	Use val.macro_arg.arg_no or val.token_no in place of val.arg_no.
	Use val.node.node in place of val.node.

Index: gcc/c-lex.c
===================================================================
--- gcc/c-lex.c	(revision 147334)
+++ gcc/c-lex.c	(working copy)
@@ -313,7 +313,7 @@ c_lex_with_flags (tree *value, location_
       goto retry;
 
     case CPP_NAME:
-      *value = HT_IDENT_TO_GCC_IDENT (HT_NODE (tok->val.node));
+      *value = HT_IDENT_TO_GCC_IDENT (HT_NODE (tok->val.node.node));
       break;
 
     case CPP_NUMBER:
@@ -369,7 +369,7 @@ c_lex_with_flags (tree *value, location_
 	      break;
 
 	    case CPP_NAME:
-	      *value = HT_IDENT_TO_GCC_IDENT (HT_NODE (tok->val.node));
+	      *value = HT_IDENT_TO_GCC_IDENT (HT_NODE (tok->val.node.node));
 	      if (objc_is_reserved_word (*value))
 		{
 		  type = CPP_AT_NAME;
Index: libcpp/directives.c
===================================================================
--- libcpp/directives.c	(revision 147334)
+++ libcpp/directives.c	(working copy)
@@ -408,8 +408,8 @@ _cpp_handle_directive (cpp_reader *pfile
 
   if (dname->type == CPP_NAME)
     {
-      if (dname->val.node->is_directive)
-	dir = &dtable[dname->val.node->directive_index];
+      if (dname->val.node.node->is_directive)
+	dir = &dtable[dname->val.node.node->directive_index];
     }
   /* We do not recognize the # followed by a number extension in
      assembler code.  */
@@ -538,7 +538,7 @@ lex_macro_node (cpp_reader *pfile, bool 
 
   if (token->type == CPP_NAME)
     {
-      cpp_hashnode *node = token->val.node;
+      cpp_hashnode *node = token->val.node.node;
 
       if (is_def_or_undef && node == pfile->spec_nodes.n_defined)
 	cpp_error (pfile, CPP_DL_ERROR,
@@ -549,7 +549,7 @@ lex_macro_node (cpp_reader *pfile, bool 
   else if (token->flags & NAMED_OP)
     cpp_error (pfile, CPP_DL_ERROR,
        "\"%s\" cannot be used as a macro name as it is an operator in C++",
-	       NODE_NAME (token->val.node));
+	       NODE_NAME (token->val.node.node));
   else if (token->type == CPP_EOF)
     cpp_error (pfile, CPP_DL_ERROR, "no macro name given in #%s directive",
 	       pfile->directive->name);
@@ -1329,7 +1329,7 @@ do_pragma (cpp_reader *pfile)
   ns_token = *token;
   if (token->type == CPP_NAME)
     {
-      p = lookup_pragma_entry (pfile->pragmas, token->val.node);
+      p = lookup_pragma_entry (pfile->pragmas, token->val.node.node);
       if (p && p->is_nspace)
 	{
 	  bool allow_name_expansion = p->allow_expansion;
@@ -1337,7 +1337,7 @@ do_pragma (cpp_reader *pfile)
 	    pfile->state.prevent_expansion--;
 	  token = cpp_get_token (pfile);
 	  if (token->type == CPP_NAME)
-	    p = lookup_pragma_entry (p->u.space, token->val.node);
+	    p = lookup_pragma_entry (p->u.space, token->val.node.node);
 	  else
 	    p = NULL;
 	  if (allow_name_expansion)
@@ -1429,7 +1429,7 @@ do_pragma_poison (cpp_reader *pfile)
 	  break;
 	}
 
-      hp = tok->val.node;
+      hp = tok->val.node.node;
       if (hp->flags & NODE_POISONED)
 	continue;
 
@@ -1986,12 +1986,12 @@ parse_assertion (cpp_reader *pfile, stru
     cpp_error (pfile, CPP_DL_ERROR, "predicate must be an identifier");
   else if (parse_answer (pfile, answerp, type) == 0)
     {
-      unsigned int len = NODE_LEN (predicate->val.node);
+      unsigned int len = NODE_LEN (predicate->val.node.node);
       unsigned char *sym = (unsigned char *) alloca (len + 1);
 
       /* Prefix '#' to get it out of macro namespace.  */
       sym[0] = '#';
-      memcpy (sym + 1, NODE_NAME (predicate->val.node), len);
+      memcpy (sym + 1, NODE_NAME (predicate->val.node.node), len);
       result = cpp_lookup (pfile, sym, len + 1);
     }
 
Index: libcpp/macro.c
===================================================================
--- libcpp/macro.c	(revision 147334)
+++ libcpp/macro.c	(working copy)
@@ -946,7 +946,7 @@ replace_args (cpp_reader *pfile, cpp_has
 
 	/* We have an argument.  If it is not being stringified or
 	   pasted it is macro-replaced before insertion.  */
-	arg = &args[src->val.arg_no - 1];
+	arg = &args[src->val.macro_arg.arg_no - 1];
 
 	if (src->flags & STRINGIFY_ARG)
 	  {
@@ -982,7 +982,7 @@ replace_args (cpp_reader *pfile, cpp_has
 	}
 
       paste_flag = 0;
-      arg = &args[src->val.arg_no - 1];
+      arg = &args[src->val.macro_arg.arg_no - 1];
       if (src->flags & STRINGIFY_ARG)
 	count = 1, from = &arg->stringified;
       else if (src->flags & PASTE_LEFT)
@@ -994,7 +994,7 @@ replace_args (cpp_reader *pfile, cpp_has
 	    {
 	      if (dest[-1]->type == CPP_COMMA
 		  && macro->variadic
-		  && src->val.arg_no == macro->paramc)
+		  && src->val.macro_arg.arg_no == macro->paramc)
 		{
 		  /* Swallow a pasted comma if from == NULL, otherwise
 		     drop the paste flag.  */
@@ -1035,7 +1035,7 @@ replace_args (cpp_reader *pfile, cpp_has
 		     "empty macro arguments are undefined"
 		     " in ISO C90 and ISO C++98",
 		     NODE_NAME (node),
-		     src->val.arg_no);
+		     src->val.macro_arg.arg_no);
 	}
 
       /* Avoid paste on RHS (even case count == 0).  */
@@ -1261,7 +1261,7 @@ cpp_get_token (cpp_reader *pfile)
       if (result->type != CPP_NAME)
 	break;
 
-      node = result->val.node;
+      node = result->val.node.node;
 
       if (node->type != NT_MACRO || (result->flags & NO_EXPAND))
 	break;
@@ -1553,7 +1553,7 @@ parse_params (cpp_reader *pfile, cpp_mac
 	    }
 	  prev_ident = 1;
 
-	  if (_cpp_save_parameter (pfile, macro, token->val.node))
+	  if (_cpp_save_parameter (pfile, macro, token->val.node.node))
 	    return false;
 	  continue;
 
@@ -1626,10 +1626,10 @@ lex_expansion_token (cpp_reader *pfile, 
 
   /* Is this a parameter?  */
   if (token->type == CPP_NAME
-      && (token->val.node->flags & NODE_MACRO_ARG) != 0)
+      && (token->val.node.node->flags & NODE_MACRO_ARG) != 0)
     {
       token->type = CPP_MACRO_ARG;
-      token->val.arg_no = token->val.node->value.arg_index;
+      token->val.macro_arg.arg_no = token->val.node.node->value.arg_index;
     }
   else if (CPP_WTRADITIONAL (pfile) && macro->paramc > 0
 	   && (token->type == CPP_STRING || token->type == CPP_CHAR))
@@ -1771,7 +1771,7 @@ create_iso_definition (cpp_reader *pfile
 	    {
 	      macro->extra_tokens = 1;
 	      num_extra_tokens++;
-	      token->val.arg_no = macro->count - 1;
+	      token->val.token_no = macro->count - 1;
 	    }
 	  else
 	    {
@@ -2007,7 +2007,7 @@ cpp_macro_definition (cpp_reader *pfile,
 	  cpp_token *token = &macro->exp.tokens[i];
 
 	  if (token->type == CPP_MACRO_ARG)
-	    len += NODE_LEN (macro->params[token->val.arg_no - 1]);
+	    len += NODE_LEN (macro->params[token->val.macro_arg.arg_no - 1]);
 	  else
 	    len += cpp_token_len (token);
 
@@ -2079,9 +2079,9 @@ cpp_macro_definition (cpp_reader *pfile,
 	  if (token->type == CPP_MACRO_ARG)
 	    {
 	      memcpy (buffer,
-		      NODE_NAME (macro->params[token->val.arg_no - 1]),
-		      NODE_LEN (macro->params[token->val.arg_no - 1]));
-	      buffer += NODE_LEN (macro->params[token->val.arg_no - 1]);
+		      NODE_NAME (macro->params[token->val.macro_arg.arg_no - 1]),
+		      NODE_LEN (macro->params[token->val.macro_arg.arg_no - 1]));
+	      buffer += NODE_LEN (macro->params[token->val.macro_arg.arg_no - 1]);
 	    }
 	  else
 	    buffer = cpp_spell_token (pfile, token, buffer, false);
Index: libcpp/include/cpplib.h
===================================================================
--- libcpp/include/cpplib.h	(revision 147334)
+++ libcpp/include/cpplib.h	(working copy)
@@ -189,10 +189,27 @@ enum cpp_token_fld_kind {
   CPP_TOKEN_FLD_SOURCE,
   CPP_TOKEN_FLD_STR,
   CPP_TOKEN_FLD_ARG_NO,
+  CPP_TOKEN_FLD_TOKEN_NO,
   CPP_TOKEN_FLD_PRAGMA,
   CPP_TOKEN_FLD_NONE
 };
 
+/* A macro argument in the cpp_token union.  */
+struct GTY(()) cpp_macro_arg {
+  /* Argument number.  */
+  unsigned int arg_no;
+};
+
+/* An identifier in the cpp_token union.  */
+struct GTY(()) cpp_identifier {
+  /* The canonical (UTF-8) spelling of the identifier.  */
+  cpp_hashnode *
+    GTY ((nested_ptr (union tree_node,
+		"%h ? CPP_HASHNODE (GCC_IDENT_TO_HT_IDENT (%h)) : NULL",
+			"%h ? HT_IDENT_TO_GCC_IDENT (HT_NODE (%h)) : NULL")))
+       node;
+};
+
 /* A preprocessing token.  This has been carefully packed and should
    occupy 16 bytes on 32-bit hosts and 24 bytes on 64-bit hosts.  */
 struct GTY(()) cpp_token {
@@ -203,12 +220,7 @@ struct GTY(()) cpp_token {
   union cpp_token_u
   {
     /* An identifier.  */
-    cpp_hashnode *
-      GTY ((nested_ptr (union tree_node,
-		"%h ? CPP_HASHNODE (GCC_IDENT_TO_HT_IDENT (%h)) : NULL",
-			"%h ? HT_IDENT_TO_GCC_IDENT (HT_NODE (%h)) : NULL"),
-	    tag ("CPP_TOKEN_FLD_NODE")))
-	 node;
+    struct cpp_identifier GTY ((tag ("CPP_TOKEN_FLD_NODE"))) node;
 	 
     /* Inherit padding from this token.  */
     cpp_token * GTY ((tag ("CPP_TOKEN_FLD_SOURCE"))) source;
@@ -217,7 +229,11 @@ struct GTY(()) cpp_token {
     struct cpp_string GTY ((tag ("CPP_TOKEN_FLD_STR"))) str;
 
     /* Argument no. for a CPP_MACRO_ARG.  */
-    unsigned int GTY ((tag ("CPP_TOKEN_FLD_ARG_NO"))) arg_no;
+    struct cpp_macro_arg GTY ((tag ("CPP_TOKEN_FLD_ARG_NO"))) macro_arg;
+
+    /* Original token no. for a CPP_PASTE (from a sequence of
+       consecutive paste tokens in a macro expansion).  */
+    unsigned int GTY ((tag ("CPP_TOKEN_FLD_TOKEN_NO"))) token_no;
 
     /* Caller-supplied identifier for a CPP_PRAGMA.  */
     unsigned int GTY ((tag ("CPP_TOKEN_FLD_PRAGMA"))) pragma;
Index: libcpp/expr.c
===================================================================
--- libcpp/expr.c	(revision 147334)
+++ libcpp/expr.c	(working copy)
@@ -651,7 +651,7 @@ parse_defined (cpp_reader *pfile)
 
   if (token->type == CPP_NAME)
     {
-      node = token->val.node;
+      node = token->val.node.node;
       if (paren && cpp_get_token (pfile)->type != CPP_CLOSE_PAREN)
 	{
 	  cpp_error (pfile, CPP_DL_ERROR, "missing ')' after \"defined\"");
@@ -771,14 +771,14 @@ eval_token (cpp_reader *pfile, const cpp
       break;
 
     case CPP_NAME:
-      if (token->val.node == pfile->spec_nodes.n_defined)
+      if (token->val.node.node == pfile->spec_nodes.n_defined)
 	return parse_defined (pfile);
       else if (CPP_OPTION (pfile, cplusplus)
-	       && (token->val.node == pfile->spec_nodes.n_true
-		   || token->val.node == pfile->spec_nodes.n_false))
+	       && (token->val.node.node == pfile->spec_nodes.n_true
+		   || token->val.node.node == pfile->spec_nodes.n_false))
 	{
 	  result.high = 0;
-	  result.low = (token->val.node == pfile->spec_nodes.n_true);
+	  result.low = (token->val.node.node == pfile->spec_nodes.n_true);
 	}
       else
 	{
@@ -786,7 +786,7 @@ eval_token (cpp_reader *pfile, const cpp
 	  result.low = 0;
 	  if (CPP_OPTION (pfile, warn_undef) && !pfile->state.skip_eval)
 	    cpp_error (pfile, CPP_DL_WARNING, "\"%s\" is not defined",
-		       NODE_NAME (token->val.node));
+		       NODE_NAME (token->val.node.node));
 	}
       break;
 
Index: libcpp/lex.c
===================================================================
--- libcpp/lex.c	(revision 147334)
+++ libcpp/lex.c	(working copy)
@@ -76,7 +76,7 @@ cpp_ideq (const cpp_token *token, const 
   if (token->type != CPP_NAME)
     return 0;
 
-  return !ustrcmp (NODE_NAME (token->val.node), (const uchar *) string);
+  return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
 }
 
 /* Record a note TYPE at byte POS into the current cleaned logical
@@ -1120,16 +1120,16 @@ _cpp_lex_direct (cpp_reader *pfile)
       result->type = CPP_NAME;
       {
 	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
-	result->val.node = lex_identifier (pfile, buffer->cur - 1, false,
-					   &nst);
+	result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
+						&nst);
 	warn_about_normalization (pfile, result, &nst);
       }
 
       /* Convert named operators to their proper types.  */
-      if (result->val.node->flags & NODE_OPERATOR)
+      if (result->val.node.node->flags & NODE_OPERATOR)
 	{
 	  result->flags |= NAMED_OP;
-	  result->type = (enum cpp_ttype) result->val.node->directive_index;
+	  result->type = (enum cpp_ttype) result->val.node.node->directive_index;
 	}
       break;
 
@@ -1244,7 +1244,7 @@ _cpp_lex_direct (cpp_reader *pfile)
 	      result->flags |= DIGRAPH;
 	      result->type = CPP_HASH;
 	      if (*buffer->cur == '%' && buffer->cur[1] == ':')
-		buffer->cur += 2, result->type = CPP_PASTE, result->val.arg_no = 0;
+		buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
 	    }
 	  else if (*buffer->cur == '>')
 	    {
@@ -1325,7 +1325,7 @@ _cpp_lex_direct (cpp_reader *pfile)
     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
-    case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.arg_no = 0; break;
+    case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
 
     case '?': result->type = CPP_QUERY; break;
     case '~': result->type = CPP_COMPL; break;
@@ -1350,7 +1350,7 @@ _cpp_lex_direct (cpp_reader *pfile)
 	if (forms_identifier_p (pfile, true, &nst))
 	  {
 	    result->type = CPP_NAME;
-	    result->val.node = lex_identifier (pfile, base, true, &nst);
+	    result->val.node.node = lex_identifier (pfile, base, true, &nst);
 	    warn_about_normalization (pfile, result, &nst);
 	    break;
 	  }
@@ -1376,7 +1376,7 @@ cpp_token_len (const cpp_token *token)
     {
     default:		len = 6;				break;
     case SPELL_LITERAL:	len = token->val.str.len;		break;
-    case SPELL_IDENT:	len = NODE_LEN (token->val.node) * 10;	break;
+    case SPELL_IDENT:	len = NODE_LEN (token->val.node.node) * 10;	break;
     }
 
   return len;
@@ -1457,23 +1457,23 @@ cpp_spell_token (cpp_reader *pfile, cons
     case SPELL_IDENT:
       if (forstring)
 	{
-	  memcpy (buffer, NODE_NAME (token->val.node),
-		  NODE_LEN (token->val.node));
-	  buffer += NODE_LEN (token->val.node);
+	  memcpy (buffer, NODE_NAME (token->val.node.node),
+		  NODE_LEN (token->val.node.node));
+	  buffer += NODE_LEN (token->val.node.node);
 	}
       else
 	{
 	  size_t i;
-	  const unsigned char * name = NODE_NAME (token->val.node);
+	  const unsigned char * name = NODE_NAME (token->val.node.node);
 	  
-	  for (i = 0; i < NODE_LEN (token->val.node); i++)
+	  for (i = 0; i < NODE_LEN (token->val.node.node); i++)
 	    if (name[i] & ~0x7F)
 	      {
 		i += utf8_to_ucn (buffer, name + i) - 1;
 		buffer += 10;
 	      }
 	    else
-	      *buffer++ = NODE_NAME (token->val.node)[i];
+	      *buffer++ = NODE_NAME (token->val.node.node)[i];
 	}
       break;
 
@@ -1550,9 +1550,9 @@ cpp_output_token (const cpp_token *token
     case SPELL_IDENT:
       {
 	size_t i;
-	const unsigned char * name = NODE_NAME (token->val.node);
+	const unsigned char * name = NODE_NAME (token->val.node.node);
 	
-	for (i = 0; i < NODE_LEN (token->val.node); i++)
+	for (i = 0; i < NODE_LEN (token->val.node.node); i++)
 	  if (name[i] & ~0x7F)
 	    {
 	      unsigned char buffer[10];
@@ -1560,7 +1560,7 @@ cpp_output_token (const cpp_token *token
 	      fwrite (buffer, 1, 10, fp);
 	    }
 	  else
-	    fputc (NODE_NAME (token->val.node)[i], fp);
+	    fputc (NODE_NAME (token->val.node.node)[i], fp);
       }
       break;
 
@@ -1583,13 +1583,14 @@ _cpp_equiv_tokens (const cpp_token *a, c
       {
       default:			/* Keep compiler happy.  */
       case SPELL_OPERATOR:
-	/* arg_no is used to track where multiple consecutive ##
+	/* token_no is used to track where multiple consecutive ##
 	   tokens were originally located.  */
-	return (a->type != CPP_PASTE || a->val.arg_no == b->val.arg_no);
+	return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
       case SPELL_NONE:
-	return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
+	return (a->type != CPP_MACRO_ARG
+		|| a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
       case SPELL_IDENT:
-	return a->val.node == b->val.node;
+	return a->val.node.node == b->val.node.node;
       case SPELL_LITERAL:
 	return (a->val.str.len == b->val.str.len
 		&& !memcmp (a->val.str.text, b->val.str.text,
@@ -1901,7 +1902,7 @@ cpp_token_val_index (cpp_token *tok)
       return CPP_TOKEN_FLD_STR;
     case SPELL_OPERATOR:
       if (tok->type == CPP_PASTE)
-	return CPP_TOKEN_FLD_ARG_NO;
+	return CPP_TOKEN_FLD_TOKEN_NO;
       else
 	return CPP_TOKEN_FLD_NONE;
     case SPELL_NONE:

-- 
Joseph S. Myers
joseph@codesourcery.com
Follow-Ups:
- Re: cpplib preparation for identifier spelling preservation
  - From: Ian Lance Taylor
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]