cpplib: Some lexer cleanups

Sat Apr 19 10:25:00 GMT 2003

This is a first batch of cleanups for string, number and identifier
lexing, all of which are simplified.

There's a new function continues_identifier_p which returns whether
to accept a '$'; this is used by both number and identifier lexing
and will be the place where UCN recognition goes.  Which reminds
me, as for all of GCC 3.x, dollars_in_identifiers is still not done
right and we don't honour the command line switch.

With -pedantic, we'd warn about a dollar in an identifier at each
one we met.  IMO if you're a user of '$', this would make -pedantic
pretty unusable, so I've made it warn once per translation unit instead.

Neil.

	* cpphash.h (struct cpp_reader): New member warned_dollar.
	* cpplex.c (continues_identifier_p): New function.
	(parse_identifier, parse_number, parse_string): Rename to
	lex_identifier, lex_number and lex_string, and simplify.
	(parse_slow, unescaped_terminator_p): Die.
	(_cpp_lex_direct): Update.

Index: cpphash.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/cpphash.h,v
retrieving revision 1.181
diff -u -p -r1.181 cpphash.h

--- cpphash.h	19 Apr 2003 00:22:47 -0000	1.181
+++ cpphash.h	19 Apr 2003 10:17:39 -0000
@@ -381,6 +381,10 @@ struct cpp_reader
   cpp_token avoid_paste;
   cpp_token eof;
 
+  /* True if we have already warned about dollars in identifiers or
+     numbers for this translation unit.  */
+  bool warned_dollar;
+
   /* Opaque handle to the dependencies of mkdeps.c.  */
   struct deps *deps;
 
Index: cpplex.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/cpplex.c,v
retrieving revision 1.222
diff -u -p -r1.222 cpplex.c
--- cpplex.c	19 Apr 2003 07:41:15 -0000	1.222
+++ cpplex.c	19 Apr 2003 10:17:39 -0000
@@ -62,12 +62,10 @@ static cppchar_t get_effective_char PARA
 
 static int skip_line_comment PARAMS ((cpp_reader *));
 static void skip_whitespace PARAMS ((cpp_reader *, cppchar_t));
-static cpp_hashnode *parse_identifier PARAMS ((cpp_reader *));
-static uchar *parse_slow PARAMS ((cpp_reader *, const uchar *, int,
-				  unsigned int *));
-static void parse_number PARAMS ((cpp_reader *, cpp_string *, int));
-static int unescaped_terminator_p PARAMS ((cpp_reader *, const uchar *));
-static void parse_string PARAMS ((cpp_reader *, cpp_token *, cppchar_t));
+static cpp_hashnode *lex_identifier PARAMS ((cpp_reader *));
+static void lex_number PARAMS ((cpp_reader *, cpp_string *));
+static bool continues_identifier_p PARAMS ((cpp_reader *));
+static void lex_string PARAMS ((cpp_reader *, cpp_token *));
 static void save_comment PARAMS ((cpp_reader *, cpp_token *, const uchar *,
 				  cppchar_t));
 static int name_p PARAMS ((cpp_reader *, const cpp_string *));
@@ -377,46 +375,50 @@ name_p (pfile, string)
   return 1;
 }
 
-/* Parse an identifier, skipping embedded backslash-newlines.  This is
-   a critical inner loop.  The common case is an identifier which has
-   not been split by backslash-newline, does not contain a dollar
-   sign, and has already been scanned (roughly 10:1 ratio of
-   seen:unseen identifiers in normal code; the distribution is
-   Poisson-like).  Second most common case is a new identifier, not
-   split and no dollar sign.  The other possibilities are rare and
-   have been relegated to parse_slow.  */
+/* Returns TRUE if the sequence starting at buffer->cur is valid in an
+   identifier, advancing buffer->cur past it.  */
+static bool
+continues_identifier_p (pfile)
+     cpp_reader *pfile;
+{
+  if (*pfile->buffer->cur != '$')
+    return false;
+
+  if (CPP_PEDANTIC (pfile) && !pfile->state.skipping && !pfile->warned_dollar)
+    {
+      pfile->warned_dollar = true;
+      cpp_error (pfile, DL_PEDWARN, "'$' in identifier or number");
+    }
+  pfile->buffer->cur++;
+
+  return true;
+}
+
+/* Lex an identifier starting at BUFFER->CUR - 1.  */
 static cpp_hashnode *
-parse_identifier (pfile)
+lex_identifier (pfile)
      cpp_reader *pfile;
 {
   cpp_hashnode *result;
   const uchar *cur, *base;
 
-  /* Fast-path loop.  Skim over a normal identifier.
-     N.B. ISIDNUM does not include $.  */
-  cur = pfile->buffer->cur;
-  while (ISIDNUM (*cur))
-    cur++;
-
-  /* Check for slow-path cases.  */
-  if (*cur == '$')
+  base = pfile->buffer->cur - 1;
+  do
     {
-      unsigned int len;
+      cur = pfile->buffer->cur;
+
+      /* N.B. ISIDNUM does not include $.  */
+      while (ISIDNUM (*cur))
+	cur++;
 
-      base = parse_slow (pfile, cur, 0, &len);
-      result = (cpp_hashnode *)
-	ht_lookup (pfile->hash_table, base, len, HT_ALLOCED);
-    }
-  else
-    {
-      base = pfile->buffer->cur - 1;
       pfile->buffer->cur = cur;
-      result = (cpp_hashnode *)
-	ht_lookup (pfile->hash_table, base, cur - base, HT_ALLOC);
     }
+  while (continues_identifier_p (pfile));
+
+  result = (cpp_hashnode *)
+    ht_lookup (pfile->hash_table, base, cur - base, HT_ALLOC);
 
-  /* Rarely, identifiers require diagnostics when lexed.
-     XXX Has to be forced out of the fast path.  */
+  /* Rarely, identifiers require diagnostics when lexed.  */
   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
 			&& !pfile->state.skipping, 0))
     {
@@ -436,188 +438,66 @@ parse_identifier (pfile)
   return result;
 }
 
-/* Slow path.  This handles numbers and identifiers which have been
-   split, or contain dollar signs.  The part of the token from
-   PFILE->buffer->cur-1 to CUR has already been scanned.  NUMBER_P is
-   1 if it's a number, and 2 if it has a leading period.  Returns a
-   pointer to the token's NUL-terminated spelling in permanent
-   storage, and sets PLEN to its length.  */
-static uchar *
-parse_slow (pfile, cur, number_p, plen)
-     cpp_reader *pfile;
-     const uchar *cur;
-     int number_p;
-     unsigned int *plen;
-{
-  cpp_buffer *buffer = pfile->buffer;
-  const uchar *base = buffer->cur - 1;
-  struct obstack *stack = &pfile->hash_table->stack;
-  unsigned int c, prevc, saw_dollar = 0;
-
-  /* Place any leading period.  */
-  if (number_p == 2)
-    obstack_1grow (stack, '.');
-
-  /* Copy the part of the token which is known to be okay.  */
-  obstack_grow (stack, base, cur - base);
-
-  /* Now process the part which isn't.  We are looking at one of
-     '$', '\\', or '?' on entry to this loop.  */
-  prevc = cur[-1];
-  c = *cur++;
-  buffer->cur = cur;
-  for (;;)
-    {
-      /* Potential escaped newline?  */
-      buffer->backup_to = buffer->cur - 1;
-
-      if (!is_idchar (c))
-	{
-	  if (!number_p)
-	    break;
-	  if (c != '.' && !VALID_SIGN (c, prevc))
-	    break;
-	}
-
-      /* Handle normal identifier characters in this loop.  */
-      do
-	{
-	  prevc = c;
-	  obstack_1grow (stack, c);
-
-	  if (c == '$')
-	    saw_dollar++;
-
-	  c = *buffer->cur++;
-	}
-      while (is_idchar (c));
-    }
-
-  /* Step back over the unwanted char.  */
-  BACKUP ();
-
-  /* $ is not an identifier character in the standard, but is commonly
-     accepted as an extension.  Don't warn about it in skipped
-     conditional blocks.  */
-  if (saw_dollar && CPP_PEDANTIC (pfile) && ! pfile->state.skipping)
-    cpp_error (pfile, DL_PEDWARN, "'$' character(s) in identifier or number");
-
-  /* Identifiers and numbers are null-terminated.  */
-  *plen = obstack_object_size (stack);
-  obstack_1grow (stack, '\0');
-  return obstack_finish (stack);
-}
-
-/* Parse a number, beginning with character C, skipping embedded
-   backslash-newlines.  LEADING_PERIOD is nonzero if there was a "."
-   before C.  Place the result in NUMBER.  */
+/* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
 static void
-parse_number (pfile, number, leading_period)
+lex_number (pfile, number)
      cpp_reader *pfile;
      cpp_string *number;
-     int leading_period;
 {
   const uchar *cur;
+  const uchar *base;
+  uchar *dest;
 
-  /* Fast-path loop.  Skim over a normal number.
-     N.B. ISIDNUM does not include $.  */
-  cur = pfile->buffer->cur;
-  while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
-    cur++;
-
-  /* Check for slow-path cases.  */
-  if (*cur == '$')
-    number->text = parse_slow (pfile, cur, 1 + leading_period, &number->len);
-  else
+  base = pfile->buffer->cur - 1;
+  do
     {
-      const uchar *base = pfile->buffer->cur - 1;
-      uchar *dest;
+      cur = pfile->buffer->cur;
+
+      /* N.B. ISIDNUM does not include $.  */
+      while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
+	cur++;
 
-      number->len = cur - base + leading_period;
-      dest = _cpp_unaligned_alloc (pfile, number->len + 1);
-      dest[number->len] = '\0';
-      number->text = dest;
-
-      if (leading_period)
-	*dest++ = '.';
-      memcpy (dest, base, cur - base);
       pfile->buffer->cur = cur;
     }
-}
-
-/* Subroutine of parse_string.  */
-static int
-unescaped_terminator_p (pfile, dest)
-     cpp_reader *pfile;
-     const unsigned char *dest;
-{
-  const unsigned char *start, *temp;
+  while (continues_identifier_p (pfile));
 
-  /* In #include-style directives, terminators are not escapable.  */
-  if (pfile->state.angled_headers)
-    return 1;
-
-  start = BUFF_FRONT (pfile->u_buff);
-
-  /* An odd number of consecutive backslashes represents an escaped
-     terminator.  */
-  for (temp = dest; temp > start && temp[-1] == '\\'; temp--)
-    ;
-
-  return ((dest - temp) & 1) == 0;
+  number->len = cur - base;
+  dest = _cpp_unaligned_alloc (pfile, number->len + 1);
+  memcpy (dest, base, number->len);
+  dest[number->len] = '\0';
+  number->text = dest;
 }
 
-/* Parses a string, character constant, or angle-bracketed header file
-   name.  Handles embedded trigraphs and escaped newlines.  The stored
-   string is guaranteed NUL-terminated, but it is not guaranteed that
-   this is the first NUL since embedded NULs are preserved.
-
-   When this function returns, buffer->cur points to the next
-   character to be processed.  */
+/* Lexes a string, character constant, or angle-bracketed header file
+   name.  The stored string is guaranteed NUL-terminated, but it is
+   not guaranteed that this is the first NUL since embedded NULs are
+   preserved.  */
 static void
-parse_string (pfile, token, terminator)
+lex_string (pfile, token)
      cpp_reader *pfile;
      cpp_token *token;
-     cppchar_t terminator;
 {
   cpp_buffer *buffer = pfile->buffer;
-  unsigned char *dest, *limit;
-  cppchar_t c;
   bool warned_nulls = false;
-
-  dest = BUFF_FRONT (pfile->u_buff);
-  limit = BUFF_LIMIT (pfile->u_buff);
+  const uchar *base;
+  uchar *dest;
+  cppchar_t terminator;
+
+  base = buffer->cur;
+  terminator = base[-1];
+  if (terminator == '<')
+    terminator = '>';
 
   for (;;)
     {
-      /* We need room for another char, possibly the terminating NUL.  */
-      if ((size_t) (limit - dest) < 1)
-	{
-	  size_t len_so_far = dest - BUFF_FRONT (pfile->u_buff);
-	  _cpp_extend_buff (pfile, &pfile->u_buff, 2);
-	  dest = BUFF_FRONT (pfile->u_buff) + len_so_far;
-	  limit = BUFF_LIMIT (pfile->u_buff);
-	}
-
-      c = *buffer->cur++;
+      cppchar_t c = *buffer->cur++;
 
-      if (c == terminator)
-	{
-	  if (unescaped_terminator_p (pfile, dest))
-	    break;
-	}
-      else if (c == '\n')
-	{
-	  /* No string literal may extend over multiple lines.  In
-	     assembly language, suppress the error except for <>
-	     includes.  This is a kludge around not knowing where
-	     comments are.  */
-	  if (CPP_OPTION (pfile, lang) != CLK_ASM || terminator == '>')
-	    cpp_error (pfile, DL_ERROR, "missing terminating %c character",
-		       (int) terminator);
-	  buffer->cur--;
-	  break;
-	}
+      /* In #include-style directives, terminators are not escapable.
+	 \n can follow the '\\' if the file's last byte is '\\'.  */
+      if (c == '\\' && !pfile->state.angled_headers && *buffer->cur != '\n')
+	buffer->cur++;
+      else if (c == terminator || c == '\n')
+	break;
       else if (c == '\0')
 	{
 	  if (!warned_nulls)
@@ -627,14 +507,25 @@ parse_string (pfile, token, terminator)
 			 "null character(s) preserved in literal");
 	    }
 	}
-	*dest++ = c;
     }
 
-  *dest = '\0';
-
-  token->val.str.text = BUFF_FRONT (pfile->u_buff);
-  token->val.str.len = dest - BUFF_FRONT (pfile->u_buff);
-  BUFF_FRONT (pfile->u_buff) = dest + 1;
+  token->val.str.len = buffer->cur - base - 1;
+  dest = _cpp_unaligned_alloc (pfile, token->val.str.len + 1);
+  memcpy (dest, base, token->val.str.len);
+  dest[token->val.str.len] = '\0';
+  token->val.str.text = dest;
+
+  if (buffer->cur[-1] == '\n')
+    {
+      /* No string literal may extend over multiple lines.  In
+	 assembly language, suppress the error except for <>
+	 includes.  This is a kludge around not knowing where
+	 comments are.  */
+      if (CPP_OPTION (pfile, lang) != CLK_ASM || terminator == '>')
+	cpp_error (pfile, DL_ERROR, "missing terminating %c character",
+		   (int) terminator);
+      buffer->cur--;
+    }
 }
 
 /* The stored comment includes the comment start and any terminator.  */
@@ -916,23 +807,18 @@ _cpp_lex_direct (pfile)
     case '0': case '1': case '2': case '3': case '4':
     case '5': case '6': case '7': case '8': case '9':
       result->type = CPP_NUMBER;
-      parse_number (pfile, &result->val.str, 0);
+      lex_number (pfile, &result->val.str);
       break;
 
     case 'L':
       /* 'L' may introduce wide characters or strings.  */
-      {
-	const unsigned char *pos = buffer->cur;
-
-	c = get_effective_char (pfile);
-	if (c == '\'' || c == '"')
-	  {
-	    result->type = (c == '"' ? CPP_WSTRING: CPP_WCHAR);
-	    parse_string (pfile, result, c);
-	    break;
-	  }
-	buffer->cur = pos;
-      }
+      if (*buffer->cur == '\'' || *buffer->cur == '"')
+	{
+	  result->type = (*buffer->cur == '"' ? CPP_WSTRING: CPP_WCHAR);
+	  buffer->cur++;
+	  lex_string (pfile, result);
+	  break;
+	}
       /* Fall through.  */
 
     start_ident:
@@ -948,7 +834,7 @@ _cpp_lex_direct (pfile)
     case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
     case 'Y': case 'Z':
       result->type = CPP_NAME;
-      result->val.node = parse_identifier (pfile);
+      result->val.node = lex_identifier (pfile);
 
       /* Convert named operators to their proper types.  */
       if (result->val.node->flags & NODE_OPERATOR)
@@ -961,7 +847,7 @@ _cpp_lex_direct (pfile)
     case '\'':
     case '"':
       result->type = c == '"' ? CPP_STRING: CPP_CHAR;
-      parse_string (pfile, result, c);
+      lex_string (pfile, result);
       break;
 
     case '/':
@@ -1018,7 +904,7 @@ _cpp_lex_direct (pfile)
       if (pfile->state.angled_headers)
 	{
 	  result->type = CPP_HEADER_NAME;
-	  parse_string (pfile, result, '>');
+	  lex_string (pfile, result);
 	  break;
 	}
 
@@ -1108,8 +994,9 @@ _cpp_lex_direct (pfile)
       /* All known character sets have 0...9 contiguous.  */
       else if (ISDIGIT (c))
 	{
+	  buffer->cur--;
 	  result->type = CPP_NUMBER;
-	  parse_number (pfile, &result->val.str, 1);
+	  lex_number (pfile, &result->val.str);
 	}
       else if (c == '*' && CPP_OPTION (pfile, cplusplus))
 	result->type = CPP_DOT_STAR;