This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

cpplib: Initial patch for token persistency


This is an initial patch towards cpplib making reasonable guarantees
about the persistency of tokens.  Such guarantees will help clients of
cpplib (like Mark's C++ parser) and even cpplib itself, in such places
as macro expansion, since we can just keep pointers to tokens rather
than tokens themselves (16 -> 4 or 24 -> 8 byte win on 32 / 64 bit
archs).

My original idea was basically to lex in a line of tokens at a time
(where lines separated by an escaped newline are considered to be a
single line), and when starting the next real line, to overwrite it by
starting afresh from the start of the token buffer unless a counter
"keep_tokens" is set.

This can be set by the client (though not yet) or by cpplib.  For
example, when looking for the '(' after a funlike macro name, cpplib
has to be sure that the original macro token is not overwritten if the
'(' appears on a subsequent line, so it increments the keep_tokens
counter.  It has to be kept incremented whilst reading in the
arguments to a macro for similar reasons.

If the counter is non-zero, then the lexer just appends tokens to the
current lexed token buffer.  Since we're guaranteeing that tokens
don't disappear from underneath the caller, when extending the buffer
like this we cannot simply realloc it if and when we run out of space.
So the token buffer essentially becomes a linked list of smaller
buffers, which I've called "token runs".

Lexing a line at a time would cause issues with __VA_ARGS__ and
poisoned identifier warnings, since they are dependent on context;
that context is not known to the lexer itself.  I was originally going
to kludge something, but then realised that by lexing a
token-at-a-time like we do now, but storing those tokens into lines in
the token buffer, we get the best of both worlds.

So, this patch does that.  It is a bit ugly, but will simplify a bit
in future; there is various duct tape in there to temporarily keep the
rest of cpplib happy.  It doesn't implement any of the wins yet, so is
a performance loss at present because of the extra token copy we do
for each lex.

Still, it's a first step to something definitely worthwhile; a bunch
of simplifications are just round the corner I hope.  Any comments,
Zack?

Bootstrapped x86 Linux; I'm about to apply it.

Neil.

	* cpphash.h (struct tokenrun): New.
	(struct cpp_context): New member bol.
	(struct cpp_reader): New members.
	(_cpp_init_tokenrun): New.
	* cppinit.c (cpp_create_reader): Set up the token runs.
	* cpplex.c (lex_directive, lex_token, next_tokenrun): New.
	(lex_token): New internalised version of _cpp_lex_token.  Don't
	handle directives or the multiple include optimisation here any
	more.  Simply lex a token.
	* cpplib.c (run_directive): Clear bol.
	(_cpp_pop_buffer): Set bol.
	* cppmacro.c (funlike_invocation_p): Keep tokens whilst parsing
	arguments.

Index: cpphash.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/cpphash.h,v
retrieving revision 1.122
diff -u -p -r1.122 cpphash.h
--- cpphash.h	2001/08/22 20:37:17	1.122
+++ cpphash.h	2001/09/10 23:05:19
@@ -102,6 +102,13 @@ struct toklist
   cpp_token *limit;
 };
 
+typedef struct tokenrun tokenrun;
+struct tokenrun
+{
+  tokenrun *next;
+  cpp_token *base, *limit;
+};
+
 typedef struct cpp_context cpp_context;
 struct cpp_context
 {
@@ -124,6 +131,9 @@ struct lexer_state
   /* True if we are skipping a failed conditional group.  */
   unsigned char skipping;
 
+  /* Nonzero if next token is the start of a line.  */
+  unsigned char bol;
+
   /* Nonzero if in a directive that takes angle-bracketed headers.  */
   unsigned char angled_headers;
 
@@ -258,6 +268,13 @@ struct cpp_reader
   const cpp_hashnode *mi_ind_cmacro;
   bool mi_valid;
 
+  /* Lexing.  */
+  cpp_token *cur_token;
+  tokenrun base_run, *cur_run;
+
+  /* Non-zero prevents the lexer from re-using the token runs.  */
+  unsigned int keep_tokens;
+
   /* Token lookahead.  */
   struct cpp_lookahead *la_read;	/* Read from this lookahead.  */
   struct cpp_lookahead *la_write;	/* Write to this lookahead.  */
@@ -397,6 +414,7 @@ extern int _cpp_parse_expr		PARAMS ((cpp
 extern void _cpp_lex_token		PARAMS ((cpp_reader *, cpp_token *));
 extern int _cpp_equiv_tokens		PARAMS ((const cpp_token *,
 						 const cpp_token *));
+extern void _cpp_init_tokenrun		PARAMS ((tokenrun *, unsigned int));
 extern void _cpp_init_pool		PARAMS ((cpp_pool *, unsigned int,
 						  unsigned int, unsigned int));
 extern void _cpp_free_pool		PARAMS ((cpp_pool *));
Index: cppinit.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/cppinit.c,v
retrieving revision 1.177
diff -u -p -r1.177 cppinit.c
--- cppinit.c	2001/09/01 10:22:15	1.177
+++ cppinit.c	2001/09/10 23:05:28
@@ -511,6 +511,12 @@ cpp_create_reader (table, lang)
   /* Indicate date and time not yet calculated.  */
   pfile->date.type = CPP_EOF;
 
+  /* Create a token buffer for the lexer.  */
+  _cpp_init_tokenrun (&pfile->base_run, 250);
+  pfile->cur_run = &pfile->base_run;
+  pfile->cur_token = pfile->base_run.base;
+  pfile->state.bol = 1;
+
   /* Initialise the base context.  */
   pfile->context = &pfile->base_context;
   pfile->base_context.macro = 0;
Index: cpplex.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/cpplex.c,v
retrieving revision 1.158
diff -u -p -r1.158 cpplex.c
--- cpplex.c	2001/09/10 22:34:03	1.158
+++ cpplex.c	2001/09/10 23:05:34
@@ -102,6 +102,9 @@ static void lex_dot PARAMS ((cpp_reader 
 static int name_p PARAMS ((cpp_reader *, const cpp_string *));
 static int maybe_read_ucs PARAMS ((cpp_reader *, const unsigned char **,
 				   const unsigned char *, unsigned int *));
+static int lex_directive PARAMS ((cpp_reader *));
+static void lex_token PARAMS ((cpp_reader *, cpp_token *, int));
+static tokenrun *next_tokenrun PARAMS ((tokenrun *));
 
 static cpp_chunk *new_chunk PARAMS ((unsigned int));
 static int chunk_suitable PARAMS ((cpp_pool *, cpp_chunk *, unsigned int));
@@ -903,103 +906,200 @@ lex_dot (pfile, result)
     }
 }
 
+/* Allocate COUNT tokens for RUN.  */
 void
+_cpp_init_tokenrun (run, count)
+     tokenrun *run;
+     unsigned int count;
+{
+  run->base = xnewvec (cpp_token, count);
+  run->limit = run->base + count;
+  run->next = NULL;
+}
+
+/* Returns the next tokenrun, or creates one if there is none.  */
+static tokenrun *
+next_tokenrun (run)
+     tokenrun *run;
+{
+  if (run->next == NULL)
+    {
+      run->next = xnew (tokenrun);
+      _cpp_init_tokenrun (run->next, 250);
+    }
+
+  return run->next;
+}
+
+static int
+lex_directive (pfile)
+     cpp_reader *pfile;
+{
+  /* 6.10.3 paragraph 11: If there are sequences of preprocessing
+     tokens within the list of arguments that would otherwise act as
+     preprocessing directives, the behavior is undefined.
+
+     This implementation will report a hard error, terminate the macro
+     invocation, and proceed to process the directive.  */
+  if (pfile->state.parsing_args)
+    {
+      pfile->lexer_pos.output_line = pfile->line;
+      if (pfile->state.parsing_args == 2)
+	{
+	  cpp_error (pfile,
+		     "directives may not be used inside a macro argument");
+	  pfile->state.bol = 1;
+	  pfile->buffer->cur = pfile->buffer->line_base;
+	  pfile->buffer->read_ahead = EOF;
+	  pfile->cur_token->type = CPP_EOF;
+	}
+
+      return 0;
+    }
+
+  /* This is a directive.  If the return value is false, it is an
+     assembler #.  */
+  {
+    /* FIXME: short-term kludge only - it doesn't handle the case that
+       the # is at the end of a run and we moved to the start of the
+       next one.  Easily fixed once we kill lookaheads.  */
+    cpp_token *token = pfile->cur_token++;
+    if (_cpp_handle_directive (pfile, token->flags & PREV_WHITE))
+      return 1;
+    pfile->cur_token = token;
+    return 0;
+  }
+}
+
+/* Lex a token into RESULT (external interface).  */
+void
 _cpp_lex_token (pfile, result)
      cpp_reader *pfile;
      cpp_token *result;
 {
+  if (pfile->cur_token == pfile->cur_run->limit)
+    {
+      pfile->cur_run = next_tokenrun (pfile->cur_run);
+      pfile->cur_token = pfile->cur_run->base;
+    }
+
+ next_token:
+  if (pfile->state.bol)
+    {
+    start_new_line:
+      pfile->state.bol = 0;
+
+      /* Return lexer back to base.  */
+      if (!pfile->keep_tokens)
+	{
+	  pfile->cur_run = &pfile->base_run;
+	  pfile->cur_token = pfile->base_run.base;
+	}
+
+      lex_token (pfile, pfile->cur_token, 1);
+      pfile->lexer_pos.output_line = pfile->cur_token->line;
+      if (pfile->cur_token->type == CPP_HASH && lex_directive (pfile))
+	goto start_new_line;
+    }
+  else
+    {
+      lex_token (pfile, pfile->cur_token, 0);
+      if (pfile->cur_token->type == CPP_EOF)
+	{
+	  if (!pfile->state.in_directive)
+	    goto start_new_line;
+	  /* Decrementing pfile->line allows directives to recognise
+	     that the newline has been seen, and also means that
+	     diagnostics don't point to the next line.  */
+	  pfile->lexer_pos.output_line = pfile->line--;
+	}
+    }
+
+  if (!pfile->state.in_directive)
+    {
+      if (pfile->state.skipping && pfile->cur_token->type != CPP_EOF)
+	goto next_token;
+
+      /* Outside a directive, invalidate controlling macros.  */
+      pfile->mi_valid = false;
+    }
+
+  *result = *pfile->cur_token++;
+}
+
+/* Lex a token into RESULT (internal interface).  */
+static void
+lex_token (pfile, result, skip_newlines)
+     cpp_reader *pfile;
+     cpp_token *result;
+     int skip_newlines;
+{
   cppchar_t c;
   cpp_buffer *buffer;
   const unsigned char *comment_start;
-  int bol;
 
- next_token:
+ fresh_line:
   buffer = pfile->buffer;
   result->flags = buffer->saved_flags;
   buffer->saved_flags = 0;
-  bol = (buffer->cur <= buffer->line_base + 1
-	 && pfile->lexer_pos.output_line == pfile->line);
- next_char:
+ update_tokens_line:
   pfile->lexer_pos.line = pfile->line;
   result->line = pfile->line;
- next_char2:
-  pfile->lexer_pos.col = CPP_BUF_COLUMN (buffer, buffer->cur);
 
+ skipped_white:
   c = buffer->read_ahead;
   if (c == EOF && buffer->cur < buffer->rlimit)
-    {
-      c = *buffer->cur++;
-      pfile->lexer_pos.col++;
-    }
-  result->col = pfile->lexer_pos.col;
-
- do_switch:
+    c = *buffer->cur++;
+  result->col = CPP_BUF_COLUMN (buffer, buffer->cur);
+  pfile->lexer_pos.col = result->col;
   buffer->read_ahead = EOF;
+
+ trigraph:
   switch (c)
     {
     case EOF:
-      /* Non-empty files should end in a newline.  Don't warn for
-	 command line and _Pragma buffers.  */
-      if (pfile->lexer_pos.col != 0)
-	{
-	  /* Account for the missing \n, prevent multiple warnings.  */
-	  pfile->line++;
-	  pfile->lexer_pos.col = 0;
-	  if (!buffer->from_stage3)
-	    cpp_pedwarn (pfile, "no newline at end of file");
-	}
-
-      /* To prevent bogus diagnostics, only pop the buffer when
-	 in-progress directives and arguments have been taken care of.
-	 Decrement the line to terminate an in-progress directive.  */
-      if (pfile->state.in_directive)
-	pfile->lexer_pos.output_line = pfile->line--;
-      else if (! pfile->state.parsing_args)
+      if (!pfile->state.parsing_args && !pfile->state.in_directive)
 	{
-	  /* Don't pop the last buffer.  */
-	  if (buffer->prev)
+	  if (buffer->cur == buffer->line_base)
 	    {
-	      unsigned char stop = buffer->return_at_eof;
+	      /* Don't pop the last buffer.  */
+	      if (buffer->prev)
+		{
+		  unsigned char stop = buffer->return_at_eof;
 
-	      _cpp_pop_buffer (pfile);
-	      if (!stop)
-		goto next_token;
+		  _cpp_pop_buffer (pfile);
+		  if (!stop)
+		    goto fresh_line;
+		}
 	    }
+	  else
+	    {
+	      /* Non-empty files should end in a newline.  Don't warn
+		 for command line and _Pragma buffers.  */
+	      if (!buffer->from_stage3)
+		cpp_pedwarn (pfile, "no newline at end of file");
+	      handle_newline (pfile, '\n');
+	    }
 	}
       result->type = CPP_EOF;
-      return;
+      break;
 
     case ' ': case '\t': case '\f': case '\v': case '\0':
       skip_whitespace (pfile, c);
       result->flags |= PREV_WHITE;
-      goto next_char2;
+      goto skipped_white;
 
     case '\n': case '\r':
-      if (pfile->state.in_directive)
+      if (pfile->state.in_directive && pfile->state.parsing_args)
+	buffer->read_ahead = c;
+      else
 	{
-	  result->type = CPP_EOF;
-	  if (pfile->state.parsing_args)
-	    buffer->read_ahead = c;
-	  else
-	    {
-	      handle_newline (pfile, c);
-	      /* Decrementing pfile->line allows directives to
-		 recognise that the newline has been seen, and also
-		 means that diagnostics don't point to the next line.  */
-	      pfile->lexer_pos.output_line = pfile->line--;
-	    }
-	  return;
+	  handle_newline (pfile, c);
+	  if (skip_newlines)
+	    goto fresh_line;
 	}
-
-      handle_newline (pfile, c);
-      /* This is a new line, so clear any white space flag.  Newlines
-	 in arguments are white space (6.10.3.10); parse_arg takes
-	 care of that.  */
-      result->flags &= ~(PREV_WHITE | AVOID_LPASTE);
-      bol = 1;
-      if (pfile->state.parsing_args != 2)
-	pfile->lexer_pos.output_line = pfile->line;
-      goto next_char;
+      result->type = CPP_EOF;
+      break;
 
     case '?':
     case '\\':
@@ -1013,7 +1113,7 @@ _cpp_lex_token (pfile, result)
 	  /* We had at least one escaped newline of some sort, and the
 	     next character is in buffer->read_ahead.  Update the
 	     token's line and column.  */
-	    goto next_char;
+	    goto update_tokens_line;
 
 	/* We are either the original '?' or '\\', or a trigraph.  */
 	result->type = CPP_QUERY;
@@ -1021,7 +1121,7 @@ _cpp_lex_token (pfile, result)
 	if (c == '\\')
 	  goto random_char;
 	else if (c != '?')
-	  goto do_switch;
+	  goto trigraph;
       }
       break;
 
@@ -1122,7 +1222,7 @@ _cpp_lex_token (pfile, result)
       if (!pfile->state.save_comments)
 	{
 	  result->flags |= PREV_WHITE;
-	  goto next_char;
+	  goto update_tokens_line;
 	}
 
       /* Save the comment as a token in its own right.  */
@@ -1187,8 +1287,6 @@ _cpp_lex_token (pfile, result)
 
     case '%':
       lex_percent (pfile, result);
-      if (result->type == CPP_HASH)
-	goto do_hash;
       break;
 
     case '.':
@@ -1248,49 +1346,9 @@ _cpp_lex_token (pfile, result)
       break;
 	  
     case '#':
-      c = buffer->extra_char;	/* Can be set by error condition below.  */
-      if (c != EOF)
-	{
-	  buffer->read_ahead = c;
-	  buffer->extra_char = EOF;
-	}
-      else
-	c = get_effective_char (pfile);
-
-      if (c == '#')
-	{
-	  ACCEPT_CHAR (CPP_PASTE);
-	  break;
-	}
-
       result->type = CPP_HASH;
-    do_hash:
-      if (!bol)
-	break;
-      /* 6.10.3 paragraph 11: If there are sequences of preprocessing
-	 tokens within the list of arguments that would otherwise act
-	 as preprocessing directives, the behavior is undefined.
-
-	 This implementation will report a hard error, terminate the
-	 macro invocation, and proceed to process the directive.  */
-      if (pfile->state.parsing_args)
-	{
-	  pfile->lexer_pos.output_line = pfile->line;
-	  if (pfile->state.parsing_args == 2)
-	    {
-	      cpp_error (pfile,
-			 "directives may not be used inside a macro argument");
-	      result->type = CPP_EOF;
-	    }
-	}
-      /* in_directive can be true inside a _Pragma.  */
-      else if (!pfile->state.in_directive)
-	{
-	  /* This is the hash introducing a directive.  If the return
-	     value is false, it is an assembler #.  */
-	  if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
-	    goto next_token;
-	}
+      if (get_effective_char (pfile) == '#')
+	  ACCEPT_CHAR (CPP_PASTE);
       break;
 
     case '|':
@@ -1339,13 +1397,6 @@ _cpp_lex_token (pfile, result)
       result->val.c = c;
       break;
     }
-
-  if (!pfile->state.in_directive && pfile->state.skipping)
-    goto next_char;
-
-  /* If not in a directive, this token invalidates controlling macros.  */
-  if (!pfile->state.in_directive)
-    pfile->mi_valid = false;
 }
 
 /* An upper bound on the number of bytes needed to spell a token,
Index: cpplib.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/cpplib.c,v
retrieving revision 1.271
diff -u -p -r1.271 cpplib.c
--- cpplib.c	2001/08/22 20:37:19	1.271
+++ cpplib.c	2001/09/10 23:05:38
@@ -402,6 +402,7 @@ run_directive (pfile, dir_no, buf, count
   cpp_push_buffer (pfile, (const U_CHAR *) buf, count,
 		   /* from_stage3 */ true, 1);
   start_directive (pfile);
+  pfile->state.bol = 0;
   pfile->state.prevent_expansion++;
   pfile->directive = &dtable[dir_no];
   (void) (*pfile->directive->handler) (pfile);
@@ -1782,6 +1783,7 @@ _cpp_pop_buffer (pfile)
      case of a missing #endif.  */
   pfile->lexer_pos.output_line = pfile->line;
   pfile->state.skipping = 0;
+  pfile->state.bol = 1;
 
   /* Update the reader's buffer before _cpp_do_file_change.  */
   pfile->buffer = buffer->prev;
Index: cppmacro.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/cppmacro.c,v
retrieving revision 1.64
diff -u -p -r1.64 cppmacro.c
--- cppmacro.c	2001/09/05 06:46:53	1.64
+++ cppmacro.c	2001/09/10 23:05:41
@@ -599,6 +599,7 @@ funlike_invocation_p (pfile, node, list)
   pfile->state.parsing_args = 1;
   pfile->state.prevent_expansion++;
 
+  pfile->keep_tokens++;
   cpp_start_lookahead (pfile);
   cpp_get_token (pfile, &maybe_paren);
   cpp_stop_lookahead (pfile, maybe_paren.type == CPP_OPEN_PAREN);
@@ -613,6 +614,7 @@ funlike_invocation_p (pfile, node, list)
 
   pfile->state.prevent_expansion--;
   pfile->state.parsing_args = 0;
+  pfile->keep_tokens--;
 
   /* Reset the position in case of failure.  If success, the macro's
      expansion appears where the name would have.  */


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]