This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

Re: bumming cycles out of parse_identifier()...


On Mon, Sep 10, 2001 at 08:15:09PM +0100, Neil Booth wrote:
> Zack Weinberg wrote:-
> 
> > Here's a counterproposal: When we read in the file, check its last
> > character to see if it is \r or \n.  If it isn't, copy the file and
> > add one.  Then we only ever have to check cur < limit at line
> > boundaries, where we do lots of expensive other stuff anyway.  We
> > would then have to set a flag on the buffer and remember to issue the
> > "no newline at end of file" warning later (when we know the line number).
> 
> That's too clever.  It would never have occurred to me.  Fancy coding
> it up?  You could have handle_newline return true / false depending
> upon whether you're at EOF or not, which is probably more useful than
> what it returns now.

After some experimentation, this doesn't seem to be much of a speed
win - less than half a second off the time to preprocess misc-inst.cc
100 times, and I'm not sure that isn't noise.  It is definitely a
readability win in cpplex.c, but I'm seeing quite a bit of additional
mess elsewhere, in all the places that call cpp_push_buffer.
cpp_push_buffer doesn't have enough information to know how to
enlarge the buffer it's given, and putting it into all the callers is
ugly - never mind that that's an exported interface (used at least by
fix-header...)

You want to have a tilt at it?  The half-done patch is appended.  It
works as long as it never sees any buffer that isn't \n terminated,
including from command line switches, and I only fixed -D.  

I'm definitely seeing how "never step back" leads to problems.  There
might be performance gains just from dropping that (esp skip_block_comment).

> I'm not an expert on this stuff by any means.  Yes, I think it's
> possible to avoid the copy (we can query with an nl_langinfo call or
> something to see if we're UTF-8).  Initially, I think we should go for
> locale setting only (and maybe a command line switch to turn the whole
> conversion lark off).  Marcus Kuhn is of the strongly held belief that
> locale is the right way, and that it's all we should need.  He knows
> more than a fair bit about these issues, so I thought we'd try that
> and see.

Try it and see, sure, but I'm confidently predicting that someone in
Russia will write a library with headers with comments in KOI8-R, and
someone in Japan will try to use it from their program with comments
in SJIS, and we'll get the bug report when it doesn't Just Work.  (For
arbitrary values of country and character set, of course.)

> Well, we won't be accepting those in identifiers, at least for now.
> The standard only mandates that we accept the UCS ugliness in addition
> to A-Za-z0-9_.  Since I don't think either of us is entirely sure
> what's going to be needed here, let's only add stuff as and when the
> need arises.

Well, (a) I think it's silly to allow \u09B6 and not the literal
character U+09B6, and (b) even if we just do \u escapes we still have
to canonicalize, because the scenario I described with
combined/uncombined accents can still happen with \u escapes.

On the other hand, I think it's reasonable if the first pass doesn't
bother supporting extended identifiers at all, not even \u escapes.

zw

===================================================================
Index: cpphash.h
--- cpphash.h	2001/08/22 20:37:17	1.122
+++ cpphash.h	2001/09/11 05:40:57
@@ -213,6 +213,10 @@ struct cpp_buffer
      token from the enclosing buffer is returned.  */
   bool return_at_eof;
 
+  /* Nonzero means this buffer lacked a trailing newline when we
+     pushed it, so one was added.  */
+  bool missing_last_newline;
+
   /* The directory of the this buffer's file.  Its NAME member is not
      allocated, so we don't need to worry about freeing it.  */
   struct search_path dir;
===================================================================
Index: cppinit.c
--- cppinit.c	2001/09/01 10:22:15	1.177
+++ cppinit.c	2001/09/11 05:40:58
@@ -750,26 +750,26 @@ init_builtins (pfile)
 
   if (CPP_OPTION (pfile, cplusplus))
     {
-      _cpp_define_builtin (pfile, "__cplusplus 1");
+      _cpp_define_builtin (pfile, "__cplusplus 1\n");
       if (SUPPORTS_ONE_ONLY)
-	_cpp_define_builtin (pfile, "__GXX_WEAK__ 1");
+	_cpp_define_builtin (pfile, "__GXX_WEAK__ 1\n");
       else
-	_cpp_define_builtin (pfile, "__GXX_WEAK__ 0");
+	_cpp_define_builtin (pfile, "__GXX_WEAK__ 0\n");
     }
   if (CPP_OPTION (pfile, objc))
     _cpp_define_builtin (pfile, "__OBJC__ 1");
 
   if (CPP_OPTION (pfile, lang) == CLK_STDC94)
-    _cpp_define_builtin (pfile, "__STDC_VERSION__ 199409L");
+    _cpp_define_builtin (pfile, "__STDC_VERSION__ 199409L\n");
   else if (CPP_OPTION (pfile, c99))
-    _cpp_define_builtin (pfile, "__STDC_VERSION__ 199901L");
+    _cpp_define_builtin (pfile, "__STDC_VERSION__ 199901L\n");
 
   if (CPP_OPTION (pfile, lang) == CLK_STDC89
       || CPP_OPTION (pfile, lang) == CLK_STDC94
       || CPP_OPTION (pfile, lang) == CLK_STDC99)
-    _cpp_define_builtin (pfile, "__STRICT_ANSI__ 1");
+    _cpp_define_builtin (pfile, "__STRICT_ANSI__ 1\n");
   else if (CPP_OPTION (pfile, lang) == CLK_ASM)
-    _cpp_define_builtin (pfile, "__ASSEMBLER__ 1");
+    _cpp_define_builtin (pfile, "__ASSEMBLER__ 1\n");
 }
 #undef BUILTIN
 #undef OPERATOR
===================================================================
Index: cpplex.c
--- cpplex.c	2001/09/10 22:34:03	1.158
+++ cpplex.c	2001/09/11 05:40:58
@@ -80,7 +80,7 @@ const struct token_spelling token_spelli
 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
 
-static cppchar_t handle_newline PARAMS ((cpp_reader *, cppchar_t));
+static bool handle_newline PARAMS ((cpp_reader *, cppchar_t));
 static cppchar_t skip_escaped_newlines PARAMS ((cpp_reader *, cppchar_t));
 static cppchar_t get_effective_char PARAMS ((cpp_reader *));
 
@@ -123,37 +123,34 @@ cpp_ideq (token, string)
   return !ustrcmp (NODE_NAME (token->val.node), (const U_CHAR *) string);
 }
 
-/* Call when meeting a newline.  Returns the character after the newline
-   (or carriage-return newline combination), or EOF.  */
-static cppchar_t
+/* Call when meeting a newline.  Returns true at EOF, false otherwise.  */
+static bool
 handle_newline (pfile, newline_char)
      cpp_reader *pfile;
      cppchar_t newline_char;
 {
-  cpp_buffer *buffer;
-  cppchar_t next = EOF;
+  cpp_buffer *buffer = pfile->buffer;
 
   pfile->line++;
-  buffer = pfile->buffer;
-  buffer->col_adjust = 0;
-  buffer->line_base = buffer->cur;
+  buffer->read_ahead = EOF;
 
-  /* Handle CR-LF and LF-CR combinations, get the next character.  */
-  if (buffer->cur < buffer->rlimit)
-    {
-      next = *buffer->cur++;
-      if (next + newline_char == '\r' + '\n')
-	{
-	  buffer->line_base = buffer->cur;
-	  if (buffer->cur < buffer->rlimit)
-	    next = *buffer->cur++;
-	  else
-	    next = EOF;
-	}
+  /* Check for a CR-LF or LF-CR combination.  */
+  if (buffer->cur < buffer->rlimit
+      && *buffer->cur + newline_char == '\r' + '\n')
+    buffer->cur++;
+
+  if (buffer->cur == buffer->rlimit)
+    {
+      /* EOF.  Step back the cur pointer once so we'll come back here
+	 in case the buffer isn't popped immediately, and don't
+	 advance line_base etc.  */
+      buffer->cur--;
+      return true;
     }
 
-  buffer->read_ahead = next;
-  return next;
+  buffer->col_adjust = 0;
+  buffer->line_base = buffer->cur;
+  return false;
 }
 
 /* Subroutine of skip_escaped_newlines; called when a trigraph is
@@ -222,14 +219,11 @@ skip_escaped_newlines (pfile, next)
 
       do
 	{
-	  if (buffer->cur == buffer->rlimit)
-	    break;
-      
 	  SAVE_STATE ();
 	  if (next == '?')
 	    {
 	      next1 = *buffer->cur++;
-	      if (next1 != '?' || buffer->cur == buffer->rlimit)
+	      if (next1 != '?')
 		{
 		  RESTORE_STATE ();
 		  break;
@@ -245,21 +239,20 @@ skip_escaped_newlines (pfile, next)
 
 	      /* We have a full trigraph here.  */
 	      next = _cpp_trigraph_map[next1];
-	      if (next != '\\' || buffer->cur == buffer->rlimit)
+	      if (next != '\\')
 		break;
 	      SAVE_STATE ();
 	    }
 
 	  /* We have a backslash, and room for at least one more character.  */
 	  space = 0;
-	  do
+	  for (;;)
 	    {
 	      next1 = *buffer->cur++;
 	      if (!is_nvspace (next1))
 		break;
 	      space = 1;
 	    }
-	  while (buffer->cur < buffer->rlimit);
 
 	  if (!is_vspace (next1))
 	    {
@@ -270,9 +263,13 @@ skip_escaped_newlines (pfile, next)
 	  if (space && !pfile->state.lexing_comment)
 	    cpp_warning (pfile, "backslash and newline separated by space");
 
-	  next = handle_newline (pfile, next1);
-	  if (next == EOF)
-	    cpp_pedwarn (pfile, "backslash-newline at end of file");
+	  if (handle_newline (pfile, next1))
+	    {
+	      cpp_pedwarn (pfile, "backslash-newline at end of file");
+	      next = EOF;
+	    }
+	  else
+	    next = *buffer->cur++;
 	}
       while (next == '\\' || next == '?');
     }
@@ -289,20 +286,15 @@ get_effective_char (pfile)
      cpp_reader *pfile;
 {
   cpp_buffer *buffer = pfile->buffer;
-  cppchar_t next = EOF;
+  cppchar_t next = *buffer->cur++;
 
-  if (buffer->cur < buffer->rlimit)
-    {
-      next = *buffer->cur++;
+  /* '?' can introduce trigraphs (and therefore backslash); '\\'
+     can introduce escaped newlines, which we want to skip, or
+     UCNs, which, depending upon lexer state, we will handle in
+     the future.  */
+  if (next == '?' || next == '\\')
+    next = skip_escaped_newlines (pfile, next);
 
-      /* '?' can introduce trigraphs (and therefore backslash); '\\'
-	 can introduce escaped newlines, which we want to skip, or
-	 UCNs, which, depending upon lexer state, we will handle in
-	 the future.  */
-      if (next == '?' || next == '\\')
-	next = skip_escaped_newlines (pfile, next);
-    }
-
   buffer->read_ahead = next;
   return next;
 }
@@ -318,7 +310,7 @@ skip_block_comment (pfile)
   cppchar_t c = EOF, prevc = EOF;
 
   pfile->state.lexing_comment = 1;
-  while (buffer->cur != buffer->rlimit)
+  for (;;)
     {
       prevc = c, c = *buffer->cur++;
 
@@ -338,11 +330,10 @@ skip_block_comment (pfile)
 	  /* Warn about potential nested comments, but not if the '/'
 	     comes immediately before the true comment delimeter.
 	     Don't bother to get it right across escaped newlines.  */
-	  if (CPP_OPTION (pfile, warn_comments)
-	      && buffer->cur != buffer->rlimit)
+	  if (CPP_OPTION (pfile, warn_comments))
 	    {
 	      prevc = c, c = *buffer->cur++;
-	      if (c == '*' && buffer->cur != buffer->rlimit)
+	      if (c == '*')
 		{
 		  prevc = c, c = *buffer->cur++;
 		  if (c != '/') 
@@ -355,8 +346,11 @@ skip_block_comment (pfile)
 	}
       else if (is_vspace (c))
 	{
-	  prevc = c, c = handle_newline (pfile, c);
-	  goto next_char;
+	  if (handle_newline (pfile, c))
+	    {
+	      c = EOF;
+	      break;
+	    }
 	}
       else if (c == '\t')
 	adjust_column (pfile);
@@ -381,10 +375,6 @@ skip_line_comment (pfile)
   pfile->state.lexing_comment = 1;
   do
     {
-      c = EOF;
-      if (buffer->cur == buffer->rlimit)
-	break;
-
       c = *buffer->cur++;
       if (c == '?' || c == '\\')
 	c = skip_escaped_newlines (pfile, c);
@@ -443,10 +433,6 @@ skip_whitespace (pfile, c)
 			       CPP_BUF_COL (buffer),
 			       "%s in preprocessing directive",
 			       c == '\f' ? "form feed" : "vertical tab");
-
-      c = EOF;
-      if (buffer->cur == buffer->rlimit)
-	break;
       c = *buffer->cur++;
     }
   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
@@ -494,10 +480,10 @@ parse_identifier (pfile)
   rlimit = pfile->buffer->rlimit;
   do
     cur++;
-  while (cur < rlimit && ISIDNUM (*cur));
+  while (ISIDNUM (*cur));
 
   /* Check for slow-path cases.  */
-  if (cur < rlimit && (*cur == '?' || *cur == '\\' || *cur == '$'))
+  if (*cur == '?' || *cur == '\\' || *cur == '$')
     result = parse_identifier_slow (pfile, cur);
   else
     {
@@ -557,10 +543,6 @@ parse_identifier_slow (pfile, cur)
           if (c == '$')
             saw_dollar++;
 
-          c = EOF;
-          if (buffer->cur == buffer->rlimit)
-            break;
-
           c = *buffer->cur++;
         }
 
@@ -620,10 +602,6 @@ parse_number (pfile, number, c, leading_
 	    limit = _cpp_next_chunk (pool, 0, &dest);
 	  *dest++ = c;
 
-	  c = EOF;
-	  if (buffer->cur == buffer->rlimit)
-	    break;
-
 	  c = *buffer->cur++;
 	}
       while (is_numchar (c) || c == '.' || VALID_SIGN (c, dest[-1]));
@@ -709,12 +687,8 @@ parse_string (pfile, token, terminator)
 
   for (;;)
     {
-      if (buffer->cur == buffer->rlimit)
-	c = EOF;
-      else
-	c = *buffer->cur++;
+      c = *buffer->cur++;
 
-    have_char:
       /* We need space for the terminating NUL.  */
       if (dest >= limit)
 	limit = _cpp_next_chunk (pool, 0, &dest);
@@ -746,12 +720,16 @@ parse_string (pfile, token, terminator)
 	     multiple lines.  In Standard C, neither may strings.
 	     Unfortunately, we accept multiline strings as an
 	     extension, except in #include family directives.  */
-	  if (terminator != '"' || pfile->state.angled_headers)
+	  if (terminator != '"' || pfile->state.angled_headers
+	      || handle_newline (pfile, c))
 	    {
 	      unterminated (pfile, terminator);
 	      break;
 	    }
 
+	  /* Always render this as '\n'.  */
+	  c = '\n';
+
 	  if (!warned_multi)
 	    {
 	      warned_multi = true;
@@ -760,10 +738,6 @@ parse_string (pfile, token, terminator)
 
 	  if (pfile->mlstring_pos.line == 0)
 	    pfile->mlstring_pos = pfile->lexer_pos;
-	      
-	  c = handle_newline (pfile, c);
-	  *dest++ = '\n';
-	  goto have_char;
 	}
       else if (c == '\0' && !warned_nulls)
 	{
@@ -926,7 +900,7 @@ _cpp_lex_token (pfile, result)
   pfile->lexer_pos.col = CPP_BUF_COLUMN (buffer, buffer->cur);
 
   c = buffer->read_ahead;
-  if (c == EOF && buffer->cur < buffer->rlimit)
+  if (c == EOF)
     {
       c = *buffer->cur++;
       pfile->lexer_pos.col++;
@@ -937,38 +911,6 @@ _cpp_lex_token (pfile, result)
   buffer->read_ahead = EOF;
   switch (c)
     {
-    case EOF:
-      /* Non-empty files should end in a newline.  Don't warn for
-	 command line and _Pragma buffers.  */
-      if (pfile->lexer_pos.col != 0)
-	{
-	  /* Account for the missing \n, prevent multiple warnings.  */
-	  pfile->line++;
-	  pfile->lexer_pos.col = 0;
-	  if (!buffer->from_stage3)
-	    cpp_pedwarn (pfile, "no newline at end of file");
-	}
-
-      /* To prevent bogus diagnostics, only pop the buffer when
-	 in-progress directives and arguments have been taken care of.
-	 Decrement the line to terminate an in-progress directive.  */
-      if (pfile->state.in_directive)
-	pfile->lexer_pos.output_line = pfile->line--;
-      else if (! pfile->state.parsing_args)
-	{
-	  /* Don't pop the last buffer.  */
-	  if (buffer->prev)
-	    {
-	      unsigned char stop = buffer->return_at_eof;
-
-	      _cpp_pop_buffer (pfile);
-	      if (!stop)
-		goto next_token;
-	    }
-	}
-      result->type = CPP_EOF;
-      return;
-
     case ' ': case '\t': case '\f': case '\v': case '\0':
       skip_whitespace (pfile, c);
       result->flags |= PREV_WHITE;
@@ -982,7 +924,9 @@ _cpp_lex_token (pfile, result)
 	    buffer->read_ahead = c;
 	  else
 	    {
+	      /* Ignore EOF until we get out of directive processing.  */
 	      handle_newline (pfile, c);
+
 	      /* Decrementing pfile->line allows directives to
 		 recognise that the newline has been seen, and also
 		 means that diagnostics don't point to the next line.  */
@@ -991,7 +935,9 @@ _cpp_lex_token (pfile, result)
 	  return;
 	}
 
-      handle_newline (pfile, c);
+      if (handle_newline (pfile, c))
+	goto end_of_file;
+    
       /* This is a new line, so clear any white space flag.  Newlines
 	 in arguments are white space (6.10.3.10); parse_arg takes
 	 care of that.  */
@@ -1054,7 +1000,7 @@ _cpp_lex_token (pfile, result)
       if (result->val.node == pfile->spec_nodes.n_L)
 	{
 	  c = buffer->read_ahead;
-	  if (c == EOF && buffer->cur < buffer->rlimit)
+	  if (c == EOF)
 	    c = *buffer->cur;
 	  if (c == '\'' || c == '"')
 	    {
@@ -1346,6 +1292,32 @@ _cpp_lex_token (pfile, result)
   /* If not in a directive, this token invalidates controlling macros.  */
   if (!pfile->state.in_directive)
     pfile->mi_valid = false;
+  return;
+
+ end_of_file:
+  if (buffer->missing_last_newline && !buffer->from_stage3)
+    {
+      cpp_pedwarn (pfile, "no newline at end of file");
+      /* Account for the missing \n, prevent multiple warnings.  */
+      pfile->line++;
+      buffer->missing_last_newline = 0;
+    }
+
+  /* To prevent bogus diagnostics, only pop the buffer
+     after we exit all active argument lists (these will
+     cause parse errors at a higher level).  Also, don't
+     pop the last buffer.  */
+  if (!pfile->state.parsing_args && buffer->prev)
+    {
+      unsigned char stop = buffer->return_at_eof;
+
+      _cpp_pop_buffer (pfile);
+      if (!stop)
+	goto next_token;
+    }
+
+  result->type = CPP_EOF;
+  return;
 }
 
 /* An upper bound on the number of bytes needed to spell a token,
===================================================================
Index: cpplib.c
--- cpplib.c	2001/08/22 20:37:19	1.271
+++ cpplib.c	2001/09/11 05:40:58
@@ -1608,11 +1608,11 @@ cpp_define (pfile, str)
 
   /* Copy the entire option so we can modify it. 
      Change the first "=" in the string to a space.  If there is none,
-     tack " 1" on the end.  */
+     tack " 1" on the end.  Always tack on a trailing newline.  */
 
   /* Length including the null.  */  
   count = strlen (str);
-  buf = (char *) alloca (count + 2);
+  buf = (char *) alloca (count + 3);
   memcpy (buf, str, count);
 
   p = strchr (str, '=');
@@ -1623,6 +1623,7 @@ cpp_define (pfile, str)
       buf[count++] = ' ';
       buf[count++] = '1';
     }
+  buf[count++] = '\n';
 
   run_directive (pfile, T_DEFINE, buf, count);
 }


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]