cpplib: new lexer

Neil Booth NeilB@earthling.net
Mon Apr 24 03:35:00 GMT 2000


Zack Weinberg wrote:-

> Once you've made these changes you may commit the patch.  Remember to
> send the revised patch to the list.

OK, here it is.  I'll post the big comment as a separate patch later,
when I've thought about it.  Besides, this is big enough as it is.

Neil.

	* cpphash.c: replace HSPACE_BEFORE with PREV_WHITESPACE.
	* cpphash.h (_cpp_check_directive): new.
	* cpplex.c (handle_newline, cpp_free_token_list,
	init_trigraph_map, trigraph_ok, trigraph_replace,
	backslash_start, skip_block_comment, skip_line_comment,
	skip_whitespace, parse_name, parse_number, parse_string,
	copy_comment, _cpp_lex_line, spell_char, spell_string,
	spell_comment, spell_name, spell_other, _cpp_lex_file,
	_cpp_output_list): new.
	(expand_name_space): take length argument.
	(init_token_list): add comment list initialisation.
	(_cpp_scan_line): use auto_expand_name_space.  PREV_WHITESPACE
	instead of HSPACE_BEFORE.

	* cpplib.c (_cpp_check_directive): new
	* cpplib.h (cpp_name, PREV_WHITESPACE, PREV_COMMENT,
	DIGRAPH, UNSIGNED_INT, TOK_NAME): new.
	(HSPACE_BEFORE): delete.
	(TTYPE_TABLE): rearrange.
	(struct cpp_toklist): update.	

Index: cpphash.c
===================================================================
RCS file: /cvs/gcc/egcs/gcc/cpphash.c,v
retrieving revision 1.74
diff -u -p -r1.74 cpphash.c
--- cpphash.c	2000/04/23 21:50:58	1.74
+++ cpphash.c	2000/04/24 10:21:17
@@ -459,7 +459,8 @@ collect_objlike_expansion (pfile, list)
 	default:;
 	}
 
-      if (i > 1 && !last_was_paste && (list->tokens[i].flags & HSPACE_BEFORE))
+      if (i > 1 && !last_was_paste
+	  && (list->tokens[i].flags & PREV_WHITESPACE))
 	CPP_PUTC (pfile, ' ');
 
       CPP_PUTS (pfile,
@@ -571,10 +572,10 @@ collect_funlike_expansion (pfile, list, 
 	}
 
       if (last_token != PASTE && last_token != START
-	  && (list->tokens[i].flags & HSPACE_BEFORE))
+	  && (list->tokens[i].flags & PREV_WHITESPACE))
 	CPP_PUTC (pfile, ' ');
       if (last_token == ARG && CPP_TRADITIONAL (pfile)
-	  && !(list->tokens[i].flags & HSPACE_BEFORE))
+	  && !(list->tokens[i].flags & PREV_WHITESPACE))
 	endpat->raw_after = 1;
 
       switch (token)
@@ -616,7 +617,7 @@ collect_funlike_expansion (pfile, list, 
       {
 	int raw_before = (last_token == PASTE
 			  || (CPP_TRADITIONAL (pfile)
-			      && !(list->tokens[i].flags & HSPACE_BEFORE)));
+			      && !(list->tokens[i].flags & PREV_WHITESPACE)));
       
 	add_pat (&pat, &endpat,
 		 CPP_WRITTEN (pfile) - last /* nchars */, j /* argno */,
@@ -865,7 +866,7 @@ _cpp_create_definition (pfile, list, hp)
   /* The macro is function-like only if the next character,
      with no intervening whitespace, is '('.  */
   else if (list->tokens[1].type == CPP_OPEN_PAREN
-	   && ! (list->tokens[1].flags & HSPACE_BEFORE))
+	   && ! (list->tokens[1].flags & PREV_WHITESPACE))
     {
       struct arglist args;
       int replacement;
Index: cpphash.h
===================================================================
RCS file: /cvs/gcc/egcs/gcc/cpphash.h,v
retrieving revision 1.37
diff -u -p -r1.37 cpphash.h
--- cpphash.h	2000/04/23 17:03:31	1.37
+++ cpphash.h	2000/04/24 10:21:18
@@ -317,5 +317,6 @@ extern void _cpp_scan_line		PARAMS ((cpp
 /* In cpplib.c */
 extern int _cpp_handle_directive	PARAMS ((cpp_reader *));
 extern void _cpp_handle_eof		PARAMS ((cpp_reader *));
+extern void _cpp_check_directive        PARAMS((cpp_toklist *, cpp_token *));
 
 #endif
Index: cpplex.c
===================================================================
RCS file: /cvs/gcc/egcs/gcc/cpplex.c,v
retrieving revision 1.22
diff -u -p -r1.22 cpplex.c
--- cpplex.c	2000/04/23 17:03:31	1.22
+++ cpplex.c	2000/04/24 10:21:21
@@ -4,6 +4,7 @@
    Based on CCCP program by Paul Rubin, June 1986
    Adapted to ANSI C, Richard Stallman, Jan 1987
    Broken out to separate file, Zack Weinberg, Mar 2000
+   Single-pass line tokenization by Neil Booth, April 2000
 
 This program is free software; you can redistribute it and/or modify it
 under the terms of the GNU General Public License as published by the
@@ -54,12 +55,15 @@ static void output_line_command	PARAMS (
 					 unsigned int));
 static void bump_column		PARAMS ((cpp_printer *, unsigned int,
 					 unsigned int));
-static void expand_name_space	PARAMS ((cpp_toklist *));
+static void expand_name_space   PARAMS ((cpp_toklist *, unsigned int));
 static void expand_token_space	PARAMS ((cpp_toklist *));
 static void init_token_list	PARAMS ((cpp_reader *, cpp_toklist *, int));
 static void pedantic_whitespace	PARAMS ((cpp_reader *, U_CHAR *,
 					 unsigned int));
 
+#define auto_expand_name_space(list) \
+    expand_name_space ((list), (list)->name_cap / 2)
+
 /* Re-allocates PFILE->token_buffer so it will hold at least N more chars.  */
 
 void
@@ -431,12 +441,12 @@ cpp_file_buffer (pfile)
 
 /* Expand a token list's string space.  */
 static void
-expand_name_space (list)
+expand_name_space (list, len)
      cpp_toklist *list;
-{  
-  list->name_cap *= 2;
-  list->namebuf = (unsigned char *) xrealloc (list->namebuf,
-					      list->name_cap);
+     unsigned int len;
+{
+  list->name_cap += len;
+  list->namebuf = (unsigned char *) xrealloc (list->namebuf, list->name_cap);
 }
 
 /* Expand the number of tokens in a list.  */
@@ -449,33 +459,38 @@ expand_token_space (list)
     xrealloc (list->tokens, list->tokens_cap * sizeof (cpp_token));
 }
 
-/* Initialise a token list.  */
+/* Initialize a token list.  We leave the first token unused, as this
+   allows us to always peek at the previous token without worrying
+   about underflowing the list.  */
 static void
 init_token_list (pfile, list, recycle)
      cpp_reader *pfile;
      cpp_toklist *list;
      int recycle;
 {
-  /* Recycling a used list saves 2 free-malloc pairs.  */
-  if (recycle)
-    {
-      list->tokens_used = 0;
-      list->name_used = 0;
-    }
-  else
+  /* Recycling a used list saves 3 free-malloc pairs.  */
+  if (!recycle)
     {
-      /* Initialise token space.  */
-      list->tokens_cap = 256;	/* 4K on Intel.	 */
-      list->tokens_used = 0;
+      /* Initialize token space.  */
+      list->tokens_cap = 256;	/* 4K's worth.  */
       list->tokens = (cpp_token *)
 	xmalloc (list->tokens_cap * sizeof (cpp_token));
 
-      /* Initialise name space.	 */
+      /* Initialize name space.  */
       list->name_cap = 1024;
-      list->name_used = 0;
       list->namebuf = (unsigned char *) xmalloc (list->name_cap);
+
+      /* Only create a comment space on demand.  */
+      list->comments_cap = 0;
+      list->comments = 0;
     }
 
+  /* Put a dummy token at the start that will fail matches.  */
+  list->tokens[0].type = CPP_EOF;
+
+  list->tokens_used = 1;
+  list->name_used = 0;
+  list->comments_used = 0;
   if (pfile->buffer)
     list->line = pfile->buffer->lineno;
   list->dir_handler = 0;
@@ -522,7 +537,7 @@ _cpp_scan_line (pfile, list)
       if (list->tokens_used >= list->tokens_cap)
 	expand_token_space (list);
       if (list->name_used + len >= list->name_cap)
-	expand_name_space (list);
+	auto_expand_name_space (list);
 
       if (type == CPP_MACRO)
 	type = CPP_NAME;
@@ -530,7 +545,7 @@ _cpp_scan_line (pfile, list)
       list->tokens_used++;
       list->tokens[i].type = type;
       list->tokens[i].col = col;
-      list->tokens[i].flags = space_before ? HSPACE_BEFORE : 0;
+      list->tokens[i].flags = space_before ? PREV_WHITESPACE : 0;
       
       if (type == CPP_VSPACE)
 	break;
@@ -2037,3 +2052,1334 @@ _cpp_init_input_buffer (pfile)
   pfile->input_buffer = tmp;
   pfile->input_buffer_len = 8192;
 }
+
+#if 0
+
+static void expand_comment_space PARAMS ((cpp_toklist *));
+void init_trigraph_map PARAMS ((void));
+static unsigned char* trigraph_replace PARAMS ((cpp_reader *, unsigned char *,
+						unsigned char *));
+static const unsigned char *backslash_start PARAMS ((cpp_reader *,
+						     const unsigned char *));
+static int skip_block_comment PARAMS ((cpp_reader *));
+static int skip_line_comment PARAMS ((cpp_reader *));
+static void skip_whitespace PARAMS ((cpp_reader *, int));
+static void parse_name PARAMS ((cpp_reader *, cpp_toklist *, cpp_name *));
+static void parse_number PARAMS ((cpp_reader *, cpp_toklist *, cpp_name *));
+static void parse_string PARAMS ((cpp_reader *, cpp_toklist *, cpp_name *,
+				  unsigned int));
+static int trigraph_ok PARAMS ((cpp_reader *, const unsigned char *));
+static void copy_comment PARAMS ((cpp_toklist *, const unsigned char *,
+				  unsigned int, unsigned int, unsigned int));
+void _cpp_lex_line PARAMS ((cpp_reader *, cpp_toklist *));
+
+static void _cpp_output_list PARAMS ((cpp_reader *, cpp_toklist *));
+
+unsigned int spell_char PARAMS ((unsigned char *, cpp_toklist *,
+				 cpp_token *token));
+unsigned int spell_string PARAMS ((unsigned char *, cpp_toklist *,
+				   cpp_token *token));
+unsigned int spell_comment PARAMS ((unsigned char *, cpp_toklist *,
+				    cpp_token *token));
+unsigned int spell_name PARAMS ((unsigned char *, cpp_toklist *,
+				 cpp_token *token));
+unsigned int spell_other PARAMS ((unsigned char *, cpp_toklist *,
+				  cpp_token *token));
+
+typedef unsigned int (* speller) PARAMS ((unsigned char *, cpp_toklist *,
+					  cpp_token *));
+
+/* Macros on a cpp_name.  */
+#define INIT_NAME(list, name) \
+  do {(name).len = 0; (name).offset = (list)->name_used;} while (0)
+
+/* Careful: 1-based.  First token is NTH_TOKEN (1).  */
+#define NTH_TOKEN(list, n) ((list)->tokens[n])
+#define IS_DIRECTIVE(list) (NTH_TOKEN(list, 1).type == CPP_HASH)
+#define COLUMN(cur) ((cur) - buffer->line_base)
+
+/* Maybe put these in the ISTABLE eventually.  */
+#define IS_HSPACE(c) ((c) == ' ' || (c) == '\t')
+#define IS_NEWLINE(c) ((c) == '\n' || (c) == '\r')
+
+/* Handle LF, CR, CR-LF and LF-CR newlines: C was just read, and CUR
+   skips its partner if present.  Uses PFILE from the enclosing scope.  */
+#define handle_newline(cur, limit, c) \
+  do {\
+  if ((cur) < (limit) && *(cur) == '\r' + '\n' - (c)) \
+    (cur)++; \
+  CPP_BUMP_LINE_CUR (pfile, (cur)); \
+  } while (0)
+
+#define IMMED_TOKEN() (!(cur_token->flags & PREV_WHITESPACE))
+#define PREV_TOKEN_TYPE (cur_token[-1].type)
+
+#define SPELL_TEXT     0
+#define SPELL_HANDLER  1
+#define SPELL_NONE     2
+#define SPELL_EOL      3
+
+#define T(e, s) {SPELL_TEXT, s},
+#define H(e, s) {SPELL_HANDLER, s},
+#define N(e, s) {SPELL_NONE, s},
+#define E(e, s) {SPELL_EOL, s},
+
+static const struct token_spelling
+{
+  char type;
+  PTR  speller;
+} token_spellings [N_TTYPES + 1] = {TTYPE_TABLE {0, 0} };
+
+#undef T
+#undef H
+#undef N
+#undef E
+
+static const unsigned char *digraph_spellings [] = {"%:", "%:%:", "<:",
+						    ":>", "<%", "%>"};
+
+static void
+expand_comment_space (list)
+     cpp_toklist *list;
+{
+  if (list->comments_cap == 0)	/* First use: allocate on demand.  */
+    {
+      list->comments_cap = 10;
+      list->comments = (cpp_token *)
+	xmalloc (list->comments_cap * sizeof (cpp_token));
+    }
+  else				/* Otherwise double the capacity.  */
+    {
+      list->comments_cap *= 2;	/* Counts tokens, not bytes.  */
+      list->comments = (cpp_token *)
+	xrealloc (list->comments, list->comments_cap * sizeof (cpp_token));
+    }
+}
+
+void
+cpp_free_token_list (list)
+     cpp_toklist *list;
+{
+  if (list->comments)
+    free (list->comments);
+  free (list->tokens);
+  free (list->namebuf);
+  free (list);
+}
+
+static char trigraph_map[256];
+
+void
+init_trigraph_map ()
+{
+  trigraph_map['='] = '#';
+  trigraph_map['('] = '[';
+  trigraph_map[')'] = ']';
+  trigraph_map['/'] = '\\';
+  trigraph_map['\''] = '^';
+  trigraph_map['<'] = '{';
+  trigraph_map['>'] = '}';
+  trigraph_map['!'] = '|';
+  trigraph_map['-'] = '~';
+}
+
+/* Call when a trigraph is encountered.  It warns if necessary, and
+   returns true if the trigraph should be honoured.  END is the third
+   character of a trigraph in the input stream.  */
+static int
+trigraph_ok (pfile, end)
+     cpp_reader *pfile;
+     const unsigned char *end;
+{
+  int accept = CPP_OPTION (pfile, trigraphs);
+  
+  if (CPP_OPTION (pfile, warn_trigraphs))
+    {
+      unsigned int col = end - 1 - pfile->buffer->line_base;
+      if (accept)
+	cpp_warning_with_line (pfile, pfile->buffer->lineno, col, 
+			       "trigraph ??%c converted to %c",
+			       (int) *end, (int) trigraph_map[*end]);
+      else
+	cpp_warning_with_line (pfile, pfile->buffer->lineno, col,
+			       "trigraph ??%c ignored", (int) *end);
+    }
+  return accept;
+}
+
+/* Scan a string for trigraphs, warning or replacing them inline as
+   appropriate.  When parsing a string, we must call this routine
+   before processing a newline character (if trigraphs are enabled),
+   since the newline might be escaped by a preceding backslash
+   trigraph sequence.  Returns a pointer to the end of the name after
+   replacement.  */
+
+static unsigned char*
+trigraph_replace (pfile, src, limit)
+     cpp_reader *pfile;
+     unsigned char *src;
+     unsigned char* limit;
+{
+  unsigned char *dest;
+
+  /* Starting with src[1], find two consecutive '?'.  The case of no
+     trigraphs is streamlined.  */
+  
+  for (; src + 1 < limit; src += 2)
+    {
+      if (src[0] != '?')
+	continue;
+
+      /* Make src point to the 1st (NOT 2nd) of two consecutive '?'s.  */
+      if (src[-1] == '?')
+	src--;
+      else if (src + 2 == limit || src[1] != '?')
+	continue;
+
+      /* Check if it really is a trigraph.  */
+      if (trigraph_map[src[2]] == 0)
+	continue;
+
+      dest = src;
+      goto trigraph_found;
+    }
+  return limit;
+
+  /* Now we have a trigraph, we need to scan the remaining buffer, and
+     copy-shifting its contents left if replacement is enabled.  */
+  for (; src + 2 < limit; dest++, src++)
+    if ((*dest = *src) == '?' && src[1] == '?' && trigraph_map[src[2]])
+      {
+      trigraph_found:
+	src += 2;
+	if (trigraph_ok (pfile, pfile->buffer->cur - (limit - src)))
+	  *dest = trigraph_map[*src];
+      }
+  
+  /* Copy remaining (at most 2) characters.  */
+  while (src < limit)
+    *dest++ = *src++;
+  return dest;
+}
+
+/* If CUR is a backslash or the end of a trigraphed backslash, return
+   a pointer to its beginning, otherwise NULL.  We don't read beyond
+   the buffer start, because there is the start of the comment in the
+   buffer.  */
+static const unsigned char *
+backslash_start (pfile, cur)
+     cpp_reader *pfile;
+     const unsigned char *cur;
+{
+  if (cur[0] == '\\')
+    return cur;
+  if (cur[0] == '/' && cur[-1] == '?' && cur[-2] == '?'
+      && trigraph_ok (pfile, cur))
+    return cur - 2;
+  return 0;
+}
+
+/* Skip a C-style block comment.  This is probably the trickiest
+   handler.  We find the end of the comment by seeing if an asterisk
+   is before every '/' we encounter.  The nasty complication is that a
+   previous asterisk may be separated by one or more escaped newlines.
+   Returns non-zero if comment terminated by EOF, zero otherwise.  */
+static int
+skip_block_comment (pfile)
+     cpp_reader *pfile;
+{
+  cpp_buffer *buffer = pfile->buffer;
+  const unsigned char *char_after_star = 0;
+  register const unsigned char *cur = buffer->cur;
+  int seen_eof = 0;
+  
+  /* Inner loop would think the comment has ended if the first comment
+     character is a '/'.  Avoid this and keep the inner loop clean by
+     skipping such a character.  */
+  if (cur < buffer->rlimit && cur[0] == '/')
+    cur++;
+
+  for (; cur < buffer->rlimit; )
+    {
+      unsigned char c = *cur++;
+
+      /* People like decorating comments with '*', so check for
+	 '/' instead for efficiency.  */
+      if (c == '/')
+	{
+	  if (cur[-2] == '*' || cur - 1 == char_after_star)
+	    goto out;
+
+	  /* Warn about potential nested comments, but not when
+	     the final character inside the comment is a '/'.
+	     Don't bother to get it right across escaped newlines.  */
+	  if (CPP_OPTION (pfile, warn_comments) && cur + 1 < buffer->rlimit
+	      && cur[0] == '*' && cur[1] != '/') 
+	    {
+	      buffer->cur = cur;
+	      cpp_warning (pfile, "'/*' within comment");
+	    }
+	}
+      else if (IS_NEWLINE(c))
+	{
+	  const unsigned char* bslash = backslash_start (pfile, cur - 2);
+
+	  handle_newline (cur, buffer->rlimit, c);
+	  /* Work correctly if there is an asterisk before an
+	     arbitrarily long sequence of escaped newlines.  */
+	  if (bslash && (bslash[-1] == '*' || bslash == char_after_star))
+	    char_after_star = cur;
+	  else
+	    char_after_star = 0;
+	}
+    }
+  seen_eof = 1;
+
+ out:
+  buffer->cur = cur;
+  return seen_eof;
+}
+
+/* Skip a C++ or Chill line comment.  Handles escaped newlines.
+   Returns non-zero if a multiline comment.  */
+static int
+skip_line_comment (pfile)
+     cpp_reader *pfile;
+{
+  cpp_buffer *buffer = pfile->buffer;
+  register const unsigned char *cur = buffer->cur;
+  int multiline = 0;
+
+  for (; cur < buffer->rlimit; )
+    {
+      unsigned char c = *cur++;
+
+      if (IS_NEWLINE (c))
+	{
+	  /* Check for a (trigraph?) backslash escaping the newline.  */
+	  if (!backslash_start (pfile, cur - 2))
+	    goto out;
+	  multiline = 1;
+	  handle_newline (cur, buffer->rlimit, c);
+	}
+    }
+  cur++;
+
+ out:
+  buffer->cur = cur - 1;	/* Leave newline for caller.  */
+  return multiline;
+}
+
+/* Skips whitespace, stopping at next non-whitespace character.  */
+static void
+skip_whitespace (pfile, in_directive)
+     cpp_reader *pfile;
+     int in_directive;
+{
+  cpp_buffer *buffer = pfile->buffer;
+  register const unsigned char *cur = buffer->cur;
+  unsigned short null_count = 0;
+
+  for (; cur < buffer->rlimit; )
+    {
+      unsigned char c = *cur++;
+
+      if (IS_HSPACE(c))		/* FIXME: Fix ISTABLE.  */
+	continue;
+      if (!is_space(c) || IS_NEWLINE (c)) /* Main loop handles newlines.  */
+	goto out;
+      if (c == '\0')
+	null_count++;
+      /* Must be '\f' or '\v'.  */
+      else if (in_directive && CPP_PEDANTIC (pfile))
+	cpp_pedwarn (pfile, "%s in preprocessing directive",
+		     c == '\f' ? "formfeed" : "vertical tab");
+    }
+  cur++;
+
+ out:
+  buffer->cur = cur - 1;
+  if (null_count)
+    cpp_warning (pfile, null_count > 1 ? "embedded null characters ignored"
+		 : "embedded null character ignored");
+}
+
+/* Parse (append) an identifier.  */
+static void
+parse_name (pfile, list, name)
+     cpp_reader *pfile;
+     cpp_toklist *list;
+     cpp_name *name;
+{
+  const unsigned char *name_limit;
+  unsigned char *namebuf;
+  cpp_buffer *buffer = pfile->buffer;
+  register const unsigned char *cur = buffer->cur;
+
+ expanded:
+  name_limit = list->namebuf + list->name_cap;
+  namebuf = list->namebuf + list->name_used;
+
+  for (; cur < buffer->rlimit && namebuf < name_limit; )
+    {
+      unsigned char c = *namebuf = *cur; /* Copy a single char.  */
+
+      if (! is_idchar(c))
+	goto out;
+      namebuf++;
+      cur++;
+      if (c == '$' && CPP_PEDANTIC (pfile))
+	{
+	  buffer->cur = cur;
+	  cpp_pedwarn (pfile, "'$' character in identifier");
+	}
+    }
+
+  /* Run out of name space?  */
+  if (cur < buffer->rlimit)
+    {
+      list->name_used = namebuf - list->namebuf;
+      auto_expand_name_space (list);
+      goto expanded;
+    }
+
+ out:
+  buffer->cur = cur;
+  name->len = namebuf - (list->namebuf + name->offset);
+  list->name_used = namebuf - list->namebuf;
+}
+
+/* Parse (append) a number.  */
+
+#define VALID_SIGN(c, prevc) \
+  (((c) == '+' || (c) == '-') && \
+   ((prevc) == 'e' || (prevc) == 'E' \
+    || (((prevc) == 'p' || (prevc) == 'P') && !CPP_OPTION (pfile, c89))))
+
+static void
+parse_number (pfile, list, name)
+     cpp_reader *pfile;
+     cpp_toklist *list;
+     cpp_name *name;
+{
+  const unsigned char *name_limit;
+  unsigned char *namebuf;
+  cpp_buffer *buffer = pfile->buffer;
+  register const unsigned char *cur = buffer->cur;
+
+ expanded:
+  name_limit = list->namebuf + list->name_cap;
+  namebuf = list->namebuf + list->name_used;
+
+  for (; cur < buffer->rlimit && namebuf < name_limit; )
+    {
+      unsigned char c = *namebuf = *cur; /* Copy a single char.  */
+
+      /* Perhaps we should accept '$' here if we accept it for
+         identifiers.  We know namebuf[-1] is safe, because for c to
+         be a sign we must have pushed at least one character.  */
+      if (!is_numchar (c) && c != '.' && ! VALID_SIGN (c, namebuf[-1]))
+	goto out;
+
+      namebuf++;
+      cur++;
+    }
+
+  /* Run out of name space?  */
+  if (cur < buffer->rlimit)
+    {
+      list->name_used = namebuf - list->namebuf;
+      auto_expand_name_space (list);
+      goto expanded;
+    }
+  
+ out:
+  buffer->cur = cur;
+  name->len = namebuf - (list->namebuf + name->offset);
+  list->name_used = namebuf - list->namebuf;
+}
+
+/* Places a string terminated by an unescaped TERMINATOR into a
+   cpp_name, which should be expandable and thus at the top of the
+   list's stack.  Handles embedded trigraphs, if necessary, and
+   escaped newlines.
+
+   Can be used for character constants (terminator = '\''), string
+   constants ('"'), angled headers ('>') and assertions (')').  */
+
+static void
+parse_string (pfile, list, name, terminator)
+     cpp_reader *pfile;
+     cpp_toklist *list;
+     cpp_name *name;
+     unsigned int terminator;
+{
+  cpp_buffer *buffer = pfile->buffer;
+  register const unsigned char *cur = buffer->cur;
+  const unsigned char *name_limit;
+  unsigned char *namebuf;
+  unsigned int null_count = 0;
+  int trigraphed_len = 0;
+
+ expanded:
+  name_limit = list->namebuf + list->name_cap;
+  namebuf = list->namebuf + list->name_used;
+
+  for (; cur < buffer->rlimit && namebuf < name_limit; )
+    {
+      unsigned int c = *namebuf++ = *cur++; /* Copy a single char.  */
+
+      if (c == '\0')
+	null_count++;
+      else if (c == terminator || IS_NEWLINE (c))
+	{
+	  unsigned char* name_start = list->namebuf + name->offset;
+
+	  /* Needed for trigraph_replace and multiline string warning.  */
+	  buffer->cur = cur;
+
+	  /* Scan for trigraphs before checking if backslash-escaped.  */
+	  if (CPP_OPTION (pfile, trigraphs)
+	      || CPP_OPTION (pfile, warn_trigraphs))
+	    {
+	      namebuf = trigraph_replace (pfile, name_start + trigraphed_len,
+					    namebuf);
+	      trigraphed_len = namebuf - 2 - (name_start + trigraphed_len);
+	      if (trigraphed_len < 0)
+		trigraphed_len = 0;
+	    }
+
+	  namebuf--;     /* Drop the newline / terminator from the name.  */
+	  if (IS_NEWLINE (c))
+	    {
+	      /* Drop a backslash newline, and continue. */
+	      if (namebuf[-1] == '\\')
+		{
+		  handle_newline (cur, buffer->rlimit, c);
+		  namebuf--;
+		  continue;
+		}
+
+	      cur--;
+
+	      /* In Fortran and assembly language, silently terminate
+		 strings of either variety at end of line.  This is a
+		 kludge around not knowing where comments are in these
+		 languages.  */
+	      if (CPP_OPTION (pfile, lang_fortran)
+		  || CPP_OPTION (pfile, lang_asm))
+		goto out;
+
+	      /* Character constants, headers and asserts may not
+		 extend over multiple lines.  In Standard C, neither
+		 may strings.  We accept multiline strings as an
+		 extension, but not in directives.  */
+	      if (terminator != '"' || IS_DIRECTIVE (list))
+		goto unterminated;
+		
+	      cur++;  /* Move forwards again.  */
+
+	      if (pfile->multiline_string_line == 0)
+		{
+		  pfile->multiline_string_line = list->line;
+		  if (CPP_PEDANTIC (pfile))
+		    cpp_pedwarn (pfile, "multi-line string constant");
+		}
+
+	      *namebuf++ = '\n';
+	      handle_newline (cur, buffer->rlimit, c);
+	    }
+	  else
+	    {
+	      unsigned char *temp;
+
+	      /* An odd number of consecutive backslashes represents
+		 an escaped terminator.  */
+	      temp = namebuf - 1;
+	      while (temp >= name_start && *temp == '\\')
+		temp--;
+
+	      if ((namebuf - temp) & 1)
+		goto out;
+	      namebuf++;
+	    }
+	}
+    }
+
+  /* Run out of name space?  */
+  if (cur < buffer->rlimit)
+    {
+      list->name_used = namebuf - list->namebuf;
+      auto_expand_name_space (list);
+      goto expanded;
+    }
+
+  /* We may not have trigraph-replaced the input for this code path,
+     but as the input is in error by being unterminated we don't
+     bother.  Prevent warnings about no newlines at EOF.  */
+  if (IS_NEWLINE(cur[-1]))
+    cur--;
+
+ unterminated:
+  cpp_error (pfile, "missing terminating %c character", (int) terminator);
+
+  if (terminator == '\"' && pfile->multiline_string_line != list->line
+      && pfile->multiline_string_line != 0)
+    {
+      cpp_error_with_line (pfile, pfile->multiline_string_line, -1,
+			   "possible start of unterminated string literal");
+      pfile->multiline_string_line = 0;
+    }
+  
+ out:
+  buffer->cur = cur;
+  name->len = namebuf - (list->namebuf + name->offset);
+  list->name_used = namebuf - list->namebuf;
+
+  if (null_count > 0)
+    cpp_warning (pfile, (null_count > 1 ? "null characters preserved"
+			 : "null character preserved"));
+}
+
+/* Record a comment of LEN bytes at FROM in LIST, copying its text into
+   the name buffer.  TYPE is the comment's token type and TOK_NO goes in
+   aux.  For simplicity the stored text includes any C-style terminator.  */
+static void
+copy_comment (list, from, len, tok_no, type)
+     cpp_toklist *list;
+     const unsigned char *from;
+     unsigned int len;
+     unsigned int tok_no;
+     unsigned int type;
+{
+  cpp_token *comment;
+
+  if (list->comments_used == list->comments_cap)
+    expand_comment_space (list);
+
+  if (list->name_used + len > list->name_cap)
+    expand_name_space (list, len);
+
+  comment = &list->comments[list->comments_used++];
+  comment->type = type;
+  comment->aux = tok_no;
+  comment->val.name.len = len;
+  comment->val.name.offset = list->name_used;
+
+  memcpy (list->namebuf + list->name_used, from, len);
+  list->name_used += len;
+}
+
+/*
+ *  The tokenizer's main loop.  Returns a token list, representing a
+ *  logical line in the input file, terminated with a CPP_VSPACE
+ *  token.  On EOF, a token list containing the single CPP_EOF token
+ *  is returned.
+ *
+ *  Implementation relies almost entirely on lookback, rather than
+ *  looking forwards.  This means that tokenization requires just
+ *  a single pass of the file, even in the presence of trigraphs and
+ *  escaped newlines, providing significant performance benefits.
+ *  Trigraph overhead is negligible if they are disabled, and low
+ *  even when enabled.
+ */
+
+#define PUSH_TOKEN(ttype) cur_token++->type = ttype
+#define REVISE_TOKEN(ttype) cur_token[-1].type = ttype
+#define BACKUP_TOKEN(ttype) (--cur_token)->type = ttype
+#define BACKUP_DIGRAPH(ttype) do { \
+  BACKUP_TOKEN(ttype); cur_token->flags |= DIGRAPH;} while (0)
+
+void
+_cpp_lex_line (pfile, list)
+     cpp_reader *pfile;
+     cpp_toklist *list;
+{
+  cpp_token *cur_token, *token_limit;
+  cpp_buffer *buffer = pfile->buffer;
+  register const unsigned char *cur = buffer->cur;
+  unsigned char flags = 0;
+
+ expanded:
+  token_limit = list->tokens + list->tokens_cap;
+  cur_token = list->tokens + list->tokens_used;
+
+  for (; cur < buffer->rlimit && cur_token < token_limit;)
+    {
+      unsigned char c = *cur++;
+
+      /* Optimize whitespace skipping, in particular the case of a
+	 single whitespace character, as every other token is probably
+	 whitespace. (' ' '\t' '\v' '\f' '\0').  */
+      if (is_hspace ((unsigned int) c))
+	{
+	  if (c == '\0' || (cur < buffer->rlimit && is_hspace (*cur)))
+	    {
+	      buffer->cur = cur - (c == '\0');	/* Get the null warning.  */
+	      skip_whitespace (pfile, IS_DIRECTIVE (list));
+	      cur = buffer->cur;
+	    }
+	  flags = PREV_WHITESPACE;
+	  if (cur == buffer->rlimit)
+	    break;
+	  c = *cur++;
+	}
+
+      /* Initialize current token.  Its type is set in the switch.  */
+      cur_token->col = COLUMN (cur);
+      cur_token->flags = flags;
+      flags = 0;
+
+      switch (c)
+	{
+	case '0': case '1': case '2': case '3': case '4':
+	case '5': case '6': case '7': case '8': case '9':
+	  /* Prepend an immediately previous CPP_DOT token.  */
+	  if (PREV_TOKEN_TYPE == CPP_DOT && IMMED_TOKEN ())
+	    {
+	      cur_token--;
+	      if (list->name_cap == list->name_used)
+		auto_expand_name_space (list);
+
+	      cur_token->val.name.len = 1;
+	      cur_token->val.name.offset = list->name_used;
+	      list->namebuf[list->name_used++] = '.';
+	    }
+	  else
+	    INIT_NAME (list, cur_token->val.name);
+	  cur--;		/* Backup character.  */
+
+	continue_number:
+	  buffer->cur = cur;
+	  parse_number (pfile, list, &cur_token->val.name);
+	  cur = buffer->cur;
+
+	  PUSH_TOKEN (CPP_NUMBER); /* Number not yet interpreted.  */
+	  break;
+
+	letter:
+	case '_':
+	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+	case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
+	case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
+	case 's': case 't': case 'u': case 'v': case 'w': case 'x':
+	case 'y': case 'z':
+	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+	case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
+	case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
+	case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
+	case 'Y': case 'Z':
+	  INIT_NAME (list, cur_token->val.name);
+	  cur--;		     /* Backup character.  */
+	  cur_token->type = CPP_NAME; /* Identifier, macro etc.  */
+
+	continue_name:
+	  buffer->cur = cur;
+	  parse_name (pfile, list, &cur_token->val.name);
+	  cur = buffer->cur;
+
+	  /* Find handler for newly created / extended directive.  */
+	  if (IS_DIRECTIVE (list) && cur_token == &NTH_TOKEN (list, 2))
+	    _cpp_check_directive (list, cur_token);
+	  cur_token++;
+	  break;
+
+	case '\'':
+	  /* Fall through.  */
+	case '\"':
+	  cur_token->type = c == '\'' ? CPP_CHAR : CPP_STRING;
+	  /* Do we have a wide string?  */
+	  if (cur_token[-1].type == CPP_NAME && IMMED_TOKEN ()
+	      && cur_token[-1].val.name.len == 1
+	      && TOK_NAME (list, cur_token - 1)[0] == 'L'
+	      && !CPP_TRADITIONAL (pfile))
+	    {
+	      /* No need for 'L' any more.  */
+	      list->name_used--;
+	      (--cur_token)->type = (c == '\'' ? CPP_WCHAR : CPP_WSTRING);
+	    }
+
+	do_parse_string:
+	  /* Here c is one of ' " > or ).  */
+	  INIT_NAME (list, cur_token->val.name);
+	  buffer->cur = cur;
+	  parse_string (pfile, list, &cur_token->val.name, c);
+	  cur = buffer->cur;
+	  cur_token++;
+	  break;
+
+	case '/':
+	  cur_token->type = CPP_DIV;
+	  if (IMMED_TOKEN ())
+	    {
+	      if (PREV_TOKEN_TYPE == CPP_DIV)
+		{
+		  /* We silently allow C++ comments in system headers,
+		     irrespective of conformance mode, because lots of
+		     broken systems do that and trying to clean it up
+		     in fixincludes is a nightmare.  */
+		  if (buffer->system_header_p)
+		    goto do_line_comment;
+		  else if (CPP_OPTION (pfile, cplusplus_comments))
+		    {
+		      if (CPP_OPTION (pfile, c89) && CPP_PEDANTIC (pfile)
+			  && ! buffer->warned_cplusplus_comments)
+			{
+			  buffer->cur = cur;
+			  cpp_pedwarn (pfile,
+			     "C++ style comments are not allowed in ISO C89");
+			  cpp_pedwarn (pfile,
+			  "(this will be reported only once per input file)");
+			  buffer->warned_cplusplus_comments = 1;
+			}
+		    do_line_comment:
+		      buffer->cur = cur;
+		      if (cur[-2] != c)
+			cpp_warning (pfile,
+				     "comment start split across lines");
+		      if (skip_line_comment (pfile))
+			cpp_error_with_line (pfile, list->line,
+					     cur_token[-1].col,
+					     "multi-line comment");
+		      if (!CPP_OPTION (pfile, discard_comments))
+			copy_comment (list, cur, buffer->cur - cur,
+				      cur_token - 1 - list->tokens, c == '/'
+				      ? CPP_CPP_COMMENT: CPP_CHILL_COMMENT);
+		      cur = buffer->cur;
+
+		      /* Back-up to first '-' or '/'.  */
+		      cur_token -= 2;
+		      if (!CPP_OPTION (pfile, traditional))
+			flags = PREV_WHITESPACE;
+		    }
+		}
+	    }
+	  cur_token++;
+	  break;
+		      
+	case '*':
+	  cur_token->type = CPP_MULT;
+	  if (IMMED_TOKEN ())
+	    {
+	      if (PREV_TOKEN_TYPE == CPP_DIV)
+		{
+		  buffer->cur = cur;
+		  if (cur[-2] != '/')
+		    cpp_warning (pfile,
+				 "comment start '/*' split across lines");
+		  if (skip_block_comment (pfile))
+		    cpp_error_with_line (pfile, list->line, cur_token[-1].col,
+					 "unterminated comment");
+		  else if (buffer->cur[-2] != '*')
+		    cpp_warning (pfile,
+				 "comment end '*/' split across lines");
+		  if (!CPP_OPTION (pfile, discard_comments))
+		    copy_comment (list, cur, buffer->cur - cur,
+				 cur_token - 1 - list->tokens, CPP_C_COMMENT);
+		  cur = buffer->cur;
+
+		  cur_token -= 2;
+		  if (!CPP_OPTION (pfile, traditional))
+		    flags = PREV_WHITESPACE;
+		}
+	      else if (CPP_OPTION (pfile, cplusplus))
+		{
+		  /* In C++, there are .* and ->* operators.  */
+		  if (PREV_TOKEN_TYPE == CPP_DEREF)
+		    BACKUP_TOKEN (CPP_DEREF_STAR);
+		  else if (PREV_TOKEN_TYPE == CPP_DOT)
+		    BACKUP_TOKEN (CPP_DOT_STAR);
+		}
+	    }
+	  cur_token++;
+	  break;
+
+	case '\n':
+	case '\r':
+	  handle_newline (cur, buffer->rlimit, c);
+	  if (PREV_TOKEN_TYPE != CPP_BACKSLASH || !IMMED_TOKEN ())
+	    {
+	      if (PREV_TOKEN_TYPE == CPP_BACKSLASH)
+		{
+		  buffer->cur = cur;
+		  cpp_warning (pfile,
+			       "backslash and newline separated by space");
+		}
+	      PUSH_TOKEN (CPP_VSPACE);
+	      goto out;
+	    }
+	  /* Remove the escaped newline.  Then continue to process
+	     any interrupted name or number.  */
+	  cur_token--;
+	  if (IMMED_TOKEN ())
+	    {
+	      cur_token--;
+	      if (cur_token->type == CPP_NAME)
+		goto continue_name;
+	      else if (cur_token->type == CPP_NUMBER)
+		goto continue_number;
+	      cur_token++;
+	    }
+	  break;
+
+	case '-':
+	  if (IMMED_TOKEN () && PREV_TOKEN_TYPE == CPP_MINUS)
+	    {
+	      if (CPP_OPTION (pfile, chill))
+		goto do_line_comment;
+	      REVISE_TOKEN (CPP_MINUS_MINUS);
+	    }
+	  else
+	    PUSH_TOKEN (CPP_MINUS);
+	  break;
+
+	  /* The digraph flag checking ensures that ## and %:%:
+	     are interpreted as CPP_PASTE, but #%: and %:# are not.  */
+	make_hash:
+	case '#':
+	  if (PREV_TOKEN_TYPE == CPP_HASH && IMMED_TOKEN ()
+	      && ((cur_token->flags ^ cur_token[-1].flags) & DIGRAPH) == 0)
+	    REVISE_TOKEN (CPP_PASTE);
+	  else
+	    PUSH_TOKEN (CPP_HASH);
+	  break;
+
+	case ':':
+	  cur_token->type = CPP_COLON;
+	  if (IMMED_TOKEN ())
+	    {
+	      if (PREV_TOKEN_TYPE == CPP_COLON
+		  && CPP_OPTION (pfile, cplusplus))
+		BACKUP_TOKEN (CPP_SCOPE);
+	      /* Digraph: "<:" is a '['  */
+	      else if (PREV_TOKEN_TYPE == CPP_LESS)
+		BACKUP_DIGRAPH (CPP_OPEN_SQUARE);
+	      /* Digraph: "%:" is a '#'  */
+	      else if (PREV_TOKEN_TYPE == CPP_MOD)
+		{
+		  (--cur_token)->flags |= DIGRAPH;
+		  goto make_hash;
+		}
+	    }
+	  cur_token++;
+	  break;
+
+	case '&':
+	  if (IMMED_TOKEN () && PREV_TOKEN_TYPE == CPP_AND)
+	    REVISE_TOKEN (CPP_AND_AND);
+	  else
+	    PUSH_TOKEN (CPP_AND);
+	  break;
+
+	make_or:
+	case '|':
+	  if (IMMED_TOKEN () && PREV_TOKEN_TYPE == CPP_OR)
+	    REVISE_TOKEN (CPP_OR_OR);
+	  else
+	    PUSH_TOKEN (CPP_OR);
+	  break;
+
+	case '+':
+	  if (IMMED_TOKEN () && PREV_TOKEN_TYPE == CPP_PLUS)
+	    REVISE_TOKEN (CPP_PLUS_PLUS);
+	  else
+	    PUSH_TOKEN (CPP_PLUS);
+	  break;
+
+	case '=':
+	    /* This relies on equidistance of "?=" and "?" tokens.  */
+	  if (IMMED_TOKEN () && PREV_TOKEN_TYPE <= CPP_LAST_EQ)
+	    REVISE_TOKEN (PREV_TOKEN_TYPE + (CPP_EQ_EQ - CPP_EQ));
+	  else
+	    PUSH_TOKEN (CPP_EQ);
+	  break;
+
+	case '>':
+	  cur_token->type = CPP_GREATER;
+	  if (IMMED_TOKEN ())
+	    {
+	      if (PREV_TOKEN_TYPE == CPP_GREATER)
+		BACKUP_TOKEN (CPP_RSHIFT);
+	      else if (PREV_TOKEN_TYPE == CPP_MINUS)
+		BACKUP_TOKEN (CPP_DEREF);
+	      /* Digraph: ":>" is a ']'  */
+	      else if (PREV_TOKEN_TYPE == CPP_COLON)
+		BACKUP_DIGRAPH (CPP_CLOSE_SQUARE);
+	      /* Digraph: "%>" is a '}'  */
+	      else if (PREV_TOKEN_TYPE == CPP_MOD)
+		BACKUP_DIGRAPH (CPP_CLOSE_BRACE);
+	    }
+	  cur_token++;
+	  break;
+	  
+	case '<':
+	  if (IMMED_TOKEN () && PREV_TOKEN_TYPE == CPP_LESS)
+	    {
+	      REVISE_TOKEN (CPP_LSHIFT);
+	      break;
+	    }
+	  /* Is this the beginning of a header name?  */
+	  if (list->dir_flags & SYNTAX_INCLUDE)
+	    {
+	      c = '>';	/* Terminator.  */
+	      cur_token->type = CPP_HEADER_NAME;
+	      goto do_parse_string;
+	    }
+	  PUSH_TOKEN (CPP_LESS);
+	  break;
+
+	case '%':
+	  /* Digraph: "<%" is a '{'  */
+	  cur_token->type = CPP_MOD;
+	  if (IMMED_TOKEN () && PREV_TOKEN_TYPE == CPP_LESS)
+	    BACKUP_DIGRAPH (CPP_OPEN_BRACE);
+	  cur_token++;
+	  break;
+
+	case ')':
+	  PUSH_TOKEN (CPP_CLOSE_PAREN);
+	  break;
+
+	case '(':
+	  /* Is this the beginning of an assertion string?  */
+	  if (list->dir_flags & SYNTAX_ASSERT)
+	    {
+	      c = ')';	/* Terminator.  */
+	      cur_token->type = CPP_ASSERTION;
+	      goto do_parse_string;
+	    }
+	  PUSH_TOKEN (CPP_OPEN_PAREN);
+	  break;
+
+	make_complement:
+	case '~':
+	  PUSH_TOKEN (CPP_COMPL);
+	  break;
+
+	case '?':
+	  if (cur + 1 < buffer->rlimit && *cur == '?'
+	      && trigraph_map[cur[1]] && trigraph_ok (pfile, cur + 1))
+	    {
+	      /* Handle trigraph.  */
+	      cur++;
+	      switch (*cur++)
+		{
+		case '(': goto make_open_square;
+		case ')': goto make_close_square;
+		case '<': goto make_open_brace;
+		case '>': goto make_close_brace;
+		case '=': goto make_hash;
+		case '!': goto make_or;
+		case '-': goto make_complement;
+		case '/': goto make_backslash;
+		case '\'': goto make_xor;
+		}
+	    }
+	  if (IMMED_TOKEN () && CPP_OPTION (pfile, cplusplus))
+	    {
+	      /* GNU C++ defines <? and >? operators.  */
+	      if (PREV_TOKEN_TYPE == CPP_LESS)
+		{
+		  REVISE_TOKEN (CPP_MIN);
+		  break;
+		}
+	      else if (PREV_TOKEN_TYPE == CPP_GREATER)
+		{
+		  REVISE_TOKEN (CPP_MAX);
+		  break;
+		}
+	    }
+	  PUSH_TOKEN (CPP_QUERY);
+	  break;
+
+	case '.':
+	  if (PREV_TOKEN_TYPE == CPP_DOT && cur_token[-2].type == CPP_DOT
+	      && IMMED_TOKEN ()
+	      && !(cur_token[-1].flags & PREV_WHITESPACE))
+	    {
+	      cur_token -= 2;
+	      PUSH_TOKEN (CPP_ELLIPSIS);
+	    }
+	  else
+	    PUSH_TOKEN (CPP_DOT);
+	  break;
+
+	make_xor:
+	case '^': PUSH_TOKEN (CPP_XOR); break;
+	make_open_brace:
+	case '{': PUSH_TOKEN (CPP_OPEN_BRACE); break;
+	make_close_brace:
+	case '}': PUSH_TOKEN (CPP_CLOSE_BRACE); break;
+	make_open_square:
+	case '[': PUSH_TOKEN (CPP_OPEN_SQUARE); break;
+	make_close_square:
+	case ']': PUSH_TOKEN (CPP_CLOSE_SQUARE); break;
+	make_backslash:
+	case '\\': PUSH_TOKEN (CPP_BACKSLASH); break;
+	case '!': PUSH_TOKEN (CPP_NOT); break;
+	case ',': PUSH_TOKEN (CPP_COMMA); break;
+	case ';': PUSH_TOKEN (CPP_SEMICOLON); break;
+
+	case '$':
+	  if (CPP_OPTION (pfile, dollars_in_ident))
+	    goto letter;
+	  /* Fall through */
+	default:
+	  cur_token->aux = c;
+	  PUSH_TOKEN (CPP_OTHER);
+	  break;
+	}
+    }
+
+  /* Run out of token space?  */
+  if (cur_token == token_limit)
+    {
+      list->tokens_used = cur_token - list->tokens;
+      expand_token_space (list);
+      goto expanded;
+    }
+
+  cur_token->type = CPP_EOF;
+  cur_token->flags = flags;
+
+  if (cur_token != &NTH_TOKEN (list, 1))
+    {
+      /* Next call back will get just a CPP_EOF.  */
+      buffer->cur = cur;
+      cpp_warning (pfile, "no newline at end of file");
+      PUSH_TOKEN (CPP_VSPACE);
+    }
+
+ out:
+  buffer->cur = cur;
+
+  list->tokens_used = cur_token - list->tokens;
+
+  /* FIXME:  take this check out and put it in the caller.
+     list->directive == 0 indicates an unknown directive (but null
+     directive is OK).  This is the first time we can be sure the
+     directive is invalid, and thus warn about it, because it might
+     have been split by escaped newlines.  Also, don't complain about
+     invalid directives in assembly source, we don't know where the
+     comments are, and # may introduce assembler pseudo-ops.  */
+
+  if (IS_DIRECTIVE (list) && list->dir_handler == 0
+      && NTH_TOKEN (list, 2).type != CPP_VSPACE
+      && !CPP_OPTION (pfile, lang_asm))
+    cpp_error_with_line (pfile, list->line, NTH_TOKEN(list, 2).col,
+			 "invalid preprocessing directive");
+}
+
+/* Token spelling functions.  Used for output of a preprocessed file,
+   stringizing and token pasting.  They all assume sufficient buffer
+   is allocated, and return exactly how much they used.  */
+
+/* Write TOKEN as [L]'text'; return bytes written.  Needs buffer of 3 + len.  */
+unsigned int
+spell_char (buffer, list, token)
+     unsigned char *buffer;
+     cpp_toklist *list;
+     cpp_token *token;
+{
+  unsigned char* orig_buff = buffer;
+  size_t len;
+
+  if (token->type == CPP_WCHAR)	/* Only wide constants get the L.  */
+    *buffer++ = 'L';
+  *buffer++ = '\'';
+
+  len = token->val.name.len;
+  memcpy (buffer, TOK_NAME (list, token), len);	/* Copied verbatim.  */
+  buffer += len;
+  *buffer++ = '\'';
+  return buffer - orig_buff;	/* 2 + len, or 3 + len with the L.  */
+}
+
+/* Write TOKEN as [L]"text"; return bytes written.  Needs buffer of 3 + len.  */
+unsigned int
+spell_string (buffer, list, token)
+     unsigned char *buffer;
+     cpp_toklist *list;
+     cpp_token *token;
+{
+  unsigned char* orig_buff = buffer;
+  size_t len;
+
+  if (token->type == CPP_WSTRING)	/* Only wide strings get the L.  */
+    *buffer++ = 'L';
+  *buffer++ = '"';
+
+  len = token->val.name.len;
+  memcpy (buffer, TOK_NAME (list, token), len);	/* Copied verbatim.  */
+  buffer += len;
+  *buffer++ = '"';
+  return buffer - orig_buff;	/* 2 + len, or 3 + len with the L.  */
+}
+
+/* Write a 2-char comment opener, then TOKEN's text.  Needs buffer of len + 2.  */
+unsigned int
+spell_comment (buffer, list, token)
+     unsigned char *buffer;
+     cpp_toklist *list;
+     cpp_token *token;
+{
+  size_t len;
+
+  if (token->type == CPP_C_COMMENT)
+    {
+      *buffer++ = '/';
+      *buffer++ = '*';
+    }
+  else if (token->type == CPP_CPP_COMMENT)
+    {
+      *buffer++ = '/';
+      *buffer++ = '/';
+    }
+  else /* Presumably CPP_CHILL_COMMENT; any other type also lands here.  */
+    {
+      *buffer++ = '-';
+      *buffer++ = '-';
+    }
+  /* No closer is written here; presumably the saved text includes it.  */
+  len = token->val.name.len;
+  memcpy (buffer, TOK_NAME (list, token), len);
+
+  return len + 2;	/* Opener plus text.  */
+}
+
+/* Copy TOKEN's text verbatim; return its length.  Needs buffer of len.  */
+unsigned int
+spell_name (buffer, list, token)
+     unsigned char *buffer;
+     cpp_toklist *list;
+     cpp_token *token;
+{
+  size_t len;
+
+  len = token->val.name.len;
+  memcpy (buffer, TOK_NAME (list, token), len);
+  buffer += len;	/* Local copy only; the caller's pointer is unaffected.  */
+
+  return len;
+}
+
+/* Write the character stored in TOKEN's aux; return 1.  Needs buffer of 1.  */
+unsigned int
+spell_other (buffer, list, token)
+     unsigned char *buffer;
+     cpp_toklist *list ATTRIBUTE_UNUSED;
+     cpp_token *token;
+{
+  *buffer++ = token->aux;	/* The lexer stores a CPP_OTHER's character in aux.  */
+  return 1;
+}
+
+void
+_cpp_lex_file (pfile)
+     cpp_reader* pfile;
+{
+  int recycle;	/* Passed to init_token_list: 0 for a new list, 1 after.  */
+  cpp_toklist* list;
+
+  init_trigraph_map ();
+  list = (cpp_toklist *) xmalloc (sizeof (cpp_toklist));
+  /* One _cpp_lex_line call per pass; stop when token 1 is CPP_EOF.  */
+  for (recycle = 0; ;)
+    {
+      init_token_list (pfile, list, recycle);
+      recycle = 1;
+
+      _cpp_lex_line (pfile, list);
+      if (NTH_TOKEN (list, 1).type == CPP_EOF)
+	break;
+
+      if (list->dir_handler)	/* Known directive: run it, print nothing.  */
+	{
+	  if (list->dir_handler (pfile))	/* Nonzero: continue with a new list.  */
+	    {
+	      list = (cpp_toklist *) xmalloc (sizeof (cpp_toklist));
+	      recycle = 0;	/* Presumably the handler kept the old list.  */
+	    }
+	}
+      else
+	_cpp_output_list (pfile, list);	/* No handler: write the line out.  */
+    }
+}
+
+static void
+_cpp_output_list (pfile, list)
+     cpp_reader *pfile;
+     cpp_toklist *list;
+{
+  unsigned int comment_no = 0;	/* Index of the next saved comment.  */
+  cpp_token *token, *comment_token = 0;
+  /* Spell LIST at pfile->limit, from token 1 through the SPELL_EOL token.  */
+  if (list->comments_used > 0)
+    comment_token = list->tokens + list->comments[0].aux;
+
+  CPP_RESERVE (pfile, 2);	/* Always have room for " \n".  */
+  for (token = &NTH_TOKEN(list, 1);; token++)
+    {
+      if (token->flags & PREV_WHITESPACE)
+	{
+	  /* Emit comments saved for this token (-C); otherwise one space.  */
+	  if (token == comment_token)
+	    {
+	      cpp_token *comment = &list->comments[comment_no];
+	      do
+		{
+		  /* Generous: spell_comment adds only 2 to the text.  */
+		  CPP_RESERVE (pfile, 4 + 2 + comment->val.name.len);
+		  pfile->limit += spell_comment (pfile->limit, list, comment);
+		  comment_no++, comment++;
+		  if (comment_no == list->comments_used)
+		    break;
+		  comment_token = comment->aux + list->tokens;
+		}
+	      while (comment_token == token);	/* Several may share a token.  */
+	    }
+	  else
+	    CPP_PUTC_Q (pfile, ' ');
+	}
+
+      switch (token_spellings[token->type].type)
+	{
+	case SPELL_TEXT:	/* Fixed spelling, with a digraph variant.  */
+	  {
+	    const unsigned char *spelling;
+	    unsigned char c;
+
+	    CPP_RESERVE (pfile, 4 + 2); /* Longest is 4.  */
+	    if (token->flags & DIGRAPH)
+	      spelling = digraph_spellings [token->type - CPP_FIRST_DIGRAPH];
+	    else
+	      spelling = token_spellings[token->type].speller;
+
+	    while ((c = *spelling++) != '\0')
+	      CPP_PUTC_Q (pfile, c);
+	  }
+	  break;
+
+	case SPELL_HANDLER:	/* Spelt by a function that returns its length.  */
+	  {
+	    speller s;
+
+	    s = (speller) token_spellings[token->type].speller;
+	    /* Longest wrapper is 4.  */
+	    CPP_RESERVE (pfile, 4 + 2 + token->val.name.len);
+	    pfile->limit += s (pfile->limit, list, token);
+	  }
+	  break;
+
+	case SPELL_EOL:	/* End of line: write the newline and stop.  */
+	  CPP_PUTC_Q (pfile, '\n');
+	  return;
+
+	case SPELL_NONE:	/* No spelling: report it and move on.  */
+	  cpp_error (pfile, "Unwriteable token");
+	  break;
+	}
+    }
+}
+
+#endif
Index: cpplib.c
===================================================================
RCS file: /cvs/gcc/egcs/gcc/cpplib.c,v
retrieving revision 1.153
diff -u -p -r1.153 cpplib.c
--- cpplib.c	2000/04/23 17:03:31	1.153
+++ cpplib.c	2000/04/24 10:21:26
@@ -150,6 +150,29 @@ DIRECTIVE_TABLE
 #undef D
 #undef DIRECTIVE_TABLE
 
+/* Look TOKEN's name up in dtable and set LIST's dir_handler and dir_flags
+   (both 0 when nothing matches).  Here so dtable need not be exported.  */
+void
+_cpp_check_directive (list, token)
+     cpp_toklist *list;
+     cpp_token *token;
+{
+  const char *name = list->namebuf + token->val.name.offset;
+  size_t len = token->val.name.len;
+  unsigned int i;
+
+  list->dir_handler = 0;	/* Defaults, kept when no entry matches.  */
+  list->dir_flags = 0;
+
+  for (i = 0; i < N_DIRECTIVES; i++)	/* Same length, then same LEN bytes.  */
+    if (dtable[i].length == len && !strncmp (dtable[i].name, name, len)) 
+      {
+	list->dir_handler = dtable[i].func;
+	list->dir_flags = dtable[i].flags;
+	break;	/* First match wins.  */
+      }
+}
+
 /* Handle a possible # directive.
    '#' has already been read.  */
 
Index: cpplib.h
===================================================================
RCS file: /cvs/gcc/egcs/gcc/cpplib.h,v
retrieving revision 1.81
diff -u -p -r1.81 cpplib.h
--- cpplib.h	2000/04/21 17:18:50	1.81
+++ cpplib.h	2000/04/24 10:21:27
@@ -34,13 +34,26 @@ typedef struct cpp_options cpp_options;
 typedef struct cpp_printer cpp_printer;
 typedef struct cpp_token cpp_token;
 typedef struct cpp_toklist cpp_toklist;
+typedef struct cpp_name cpp_name;
 
-  /* Put operators that can appear in a preprocessor expression first.
-     This allows a lookup table to be implemented in _cpp_parse_expr.
-     Ordering within this group is currently not significant, apart
-     from those ending in '=' being at the end.  */
+/* The first two groups, apart from '=', can appear in preprocessor
+   expressions.  This allows a lookup table to be implemented in
+   _cpp_parse_expr.
+
+   The first group, to CPP_LAST_EQ, can be immediately followed by an
+   '='.  The lexer needs operators ending in '=', like ">>=", to be in
+   the same order as their counterparts without the '=', like ">>".  */
+
+/* Positions in the table.  */
+#define CPP_LAST_EQ CPP_LSHIFT
+#define CPP_FIRST_DIGRAPH CPP_HASH
+
 #define TTYPE_TABLE				\
-  T(CPP_PLUS = 0,	"+")	/* math */	\
+  T(CPP_EQ = 0,		"=")			\
+  T(CPP_NOT,		"!")			\
+  T(CPP_GREATER,	">")	/* compare */	\
+  T(CPP_LESS,		"<")			\
+  T(CPP_PLUS,		"+")	/* math */	\
   T(CPP_MINUS,		"-")			\
   T(CPP_MULT,		"*")			\
   T(CPP_DIV,		"/")			\
@@ -51,22 +64,19 @@ typedef struct cpp_toklist cpp_toklist;
   T(CPP_COMPL,		"~")			\
   T(CPP_RSHIFT,		">>")			\
   T(CPP_LSHIFT,		"<<")			\
-  T(CPP_NOT,		"!")	/* logicals */	\
-  T(CPP_AND_AND,	"&&")			\
+\
+  T(CPP_AND_AND,	"&&")	/* logical */	\
   T(CPP_OR_OR,		"||")			\
   T(CPP_QUERY,		"?")			\
   T(CPP_COLON,		":")			\
   T(CPP_COMMA,		",")	/* grouping */	\
   T(CPP_OPEN_PAREN,	"(")			\
   T(CPP_CLOSE_PAREN,	")")			\
-  T(CPP_GREATER,	">")	/* compare */	\
-  T(CPP_LESS,		"<")			\
-  T(CPP_EQ_EQ,		"==")			\
+  T(CPP_EQ_EQ,		"==")	/* compare */	\
   T(CPP_NOT_EQ,		"!=")			\
   T(CPP_GREATER_EQ,	">=")			\
   T(CPP_LESS_EQ,	"<=")			\
 \
-  /* The remainder of the punctuation.  Order is not significant. */	\
   T(CPP_PLUS_EQ,	"+=")	/* math */	\
   T(CPP_MINUS_EQ,	"-=")			\
   T(CPP_MULT_EQ,	"*=")			\
@@ -78,55 +88,67 @@ typedef struct cpp_toklist cpp_toklist;
   T(CPP_COMPL_EQ,	"~=")			\
   T(CPP_RSHIFT_EQ,	">>=")			\
   T(CPP_LSHIFT_EQ,	"<<=")			\
-  T(CPP_EQ,		"=")	/* assign */	\
+  /* Digraphs together, beginning with CPP_FIRST_DIGRAPH.  */	\
+  T(CPP_HASH,		"#")	/* digraphs */	\
+  T(CPP_PASTE,		"##")			\
+  T(CPP_OPEN_SQUARE,	"[")			\
+  T(CPP_CLOSE_SQUARE,	"]")			\
+  T(CPP_OPEN_BRACE,	"{")			\
+  T(CPP_CLOSE_BRACE,	"}")			\
+  /* The remainder of the punctuation.  Order is not significant. */	\
+  T(CPP_SEMICOLON,	";")	/* structure */	\
+  T(CPP_ELLIPSIS,	"...")			\
+  T(CPP_BACKSLASH,	"\\")			\
   T(CPP_PLUS_PLUS,	"++")	/* increment */	\
   T(CPP_MINUS_MINUS,	"--")			\
   T(CPP_DEREF,		"->")	/* accessors */	\
   T(CPP_DOT,		".")			\
-  T(CPP_OPEN_SQUARE,	"[")			\
-  T(CPP_CLOSE_SQUARE,	"]")			\
   T(CPP_SCOPE,		"::")			\
   T(CPP_DEREF_STAR,	"->*")			\
   T(CPP_DOT_STAR,	".*")			\
-  T(CPP_OPEN_BRACE,	"{")	/* structure */	\
-  T(CPP_CLOSE_BRACE,	"}")			\
-  T(CPP_SEMICOLON,	";")			\
-  T(CPP_ELLIPSIS,	"...")			\
-  T(CPP_HASH,		"#")			\
-  T(CPP_PASTE,		"##")			\
-  T(CPP_BACKSLASH,	"\\")			\
   T(CPP_MIN,		"<?")	/* extension */	\
   T(CPP_MAX,		">?")			\
-  T(CPP_OTHER,		spell_other) /* stray punctuation */ \
+  H(CPP_OTHER,		spell_other) /* stray punctuation */ \
 \
-  T(CPP_NAME,		spell_name)	/* word */	\
-  T(CPP_INT,		0)		/* 23 */	\
-  T(CPP_FLOAT,		0)		/* 3.14159 */	\
-  T(CPP_NUMBER,		spell_name)	/* 34_be+ta  */	\
-  T(CPP_CHAR,		spell_char)	/* 'char' */	\
-  T(CPP_WCHAR,		spell_char)	/* L'char' */	\
-  T(CPP_STRING,		spell_string)	/* "string" */	\
-  T(CPP_WSTRING,	spell_string)	/* L"string" */	\
+  H(CPP_NAME,		spell_name)	/* word */	\
+  N(CPP_INT,		0)		/* 23 */	\
+  N(CPP_FLOAT,		0)		/* 3.14159 */	\
+  H(CPP_NUMBER,		spell_name)	/* 34_be+ta  */	\
+  H(CPP_CHAR,		spell_char)	/* 'char' */	\
+  H(CPP_WCHAR,		spell_char)	/* L'char' */	\
+  H(CPP_STRING,		spell_string)	/* "string" */	\
+  H(CPP_WSTRING,	spell_string)	/* L"string" */	\
 \
-  T(CPP_COMMENT,	spell_comment)	/* Only if output comments.  */ \
-  T(CPP_VSPACE,		"\n")		/* End of line.  */		\
-  T(CPP_EOF,		0)		/* End of file.  */		\
-  T(CPP_HEADER_NAME,	0)		/* <stdio.h> in #include */	\
-  T(CPP_ASSERTION,	0)		/* (...) in #assert */		\
+  H(CPP_C_COMMENT,	spell_comment)	/* Only if output comments.  */ \
+  H(CPP_CPP_COMMENT,	spell_comment)	/* Only if output comments.  */ \
+  H(CPP_CHILL_COMMENT,	spell_comment)	/* Only if output comments.  */ \
+  N(CPP_MACRO_ARG,      0)              /* Macro argument.  */          \
+  N(CPP_SUBLIST,        0)	        /* Sublist.  */                 \
+  E(CPP_VSPACE,		"\n")		/* End of line.  */		\
+  N(CPP_EOF,		0)		/* End of file.  */		\
+  N(CPP_HEADER_NAME,	0)		/* <stdio.h> in #include */	\
+  N(CPP_ASSERTION,	0)		/* (...) in #assert */		\
 \
   /* Obsolete - will be removed when no code uses them still.  */	\
-  T(CPP_HSPACE,		0)		/* Horizontal white space.  */	\
-  T(CPP_POP,		0)		/* End of buffer.  */		\
-  T(CPP_DIRECTIVE,	0)		/* #define and the like */	\
-  T(CPP_MACRO,		0)		/* Like a NAME, but expanded.  */
+  H(CPP_COMMENT,	0)		/* Only if output comments.  */ \
+  N(CPP_HSPACE,		0)		/* Horizontal white space.  */	\
+  N(CPP_POP,		0)		/* End of buffer.  */		\
+  N(CPP_DIRECTIVE,	0)		/* #define and the like */	\
+  N(CPP_MACRO,		0)		/* Like a NAME, but expanded.  */
 
 #define T(e, s) e,
+#define H(e, s) e,
+#define N(e, s) e,
+#define E(e, s) e,
 enum cpp_ttype
 {
   TTYPE_TABLE
   N_TTYPES
 };
 #undef T
+#undef H
+#undef N
+#undef E
 
 /* Payload of a NAME, NUMBER, FLOAT, STRING, or COMMENT token.  */
 struct cpp_name
@@ -134,9 +156,13 @@ struct cpp_name
   unsigned int len;
   unsigned int offset;		/* from list->namebuf */
 };
+
+#define TOK_NAME(list, token) ((list)->namebuf + (token)->val.name.offset)
 
-/* Per token flags.  */
-#define HSPACE_BEFORE	(1 << 0)	/* token preceded by hspace */
+/* Flags for the cpp_token structure.  */
+#define PREV_WHITESPACE     1	/* If whitespace before this token.  */
+#define DIGRAPH             2	/* If it was a digraph.  */
+#define UNSIGNED_INT        4   /* If int preprocessing token unsigned.  */
 
 /* A preprocessing token.
    This has been carefully packed and should occupy 16 bytes on
@@ -150,8 +176,9 @@ struct cpp_token
   unsigned char type;
 #endif
   unsigned char flags;			/* flags - see above */
-  unsigned int aux;			/* hash of a NAME, or something -
-					   see uses in the code */
+  unsigned int aux;			/* CPP_OTHER character.  Hash of a
+					   NAME, or something - see uses
+					   in the code */
   union
   {
     struct cpp_name name;		/* a string */
@@ -168,7 +195,7 @@ typedef int (*parse_cleanup_t) PARAMS ((
 
 struct cpp_toklist
 {
-  struct cpp_token *tokens;	/* actual tokens as an array */
+  cpp_token *tokens;		/* actual tokens as an array */
   unsigned int tokens_used;	/* tokens used */
   unsigned int tokens_cap;	/* tokens allocated */
 
@@ -178,6 +205,11 @@ struct cpp_toklist
 
   unsigned int line;		/* starting line number */
 
+  /* Comment copying.  */
+  cpp_token *comments;		/* comment tokens.  */
+  unsigned int comments_used;	/* comment tokens used.  */
+  unsigned int comments_cap;	/* comment token capacity.  */
+
   /* Only used if tokens[0].type == CPP_DIRECTIVE.  This is the
      handler to call after lexing the rest of this line.  The flags
      indicate whether the rest of the line gets special treatment
@@ -561,6 +593,7 @@ struct cpp_printer
 /* Name under which this program was invoked.  */
 extern const char *progname;
 
+extern void _cpp_lex_file PARAMS((cpp_reader *));
 extern int cpp_handle_options PARAMS ((cpp_reader *, int, char **));
 extern enum cpp_ttype cpp_get_token PARAMS ((cpp_reader *));
 extern enum cpp_ttype cpp_get_non_space_token PARAMS ((cpp_reader *));
@@ -579,6 +612,8 @@ extern void cpp_define PARAMS ((cpp_read
 extern void cpp_assert PARAMS ((cpp_reader *, const char *));
 extern void cpp_undef  PARAMS ((cpp_reader *, const char *));
 extern void cpp_unassert PARAMS ((cpp_reader *, const char *));
+
+extern void cpp_free_token_list PARAMS ((cpp_toklist *));
 
 /* N.B. The error-message-printer prototypes have not been nicely
    formatted because exgettext needs to see 'msgid' on the same line


More information about the Gcc-patches mailing list