[Fwd: Patch for Recovery from Bad Multibyte Characters]

Jeffrey A Law law@cygnus.com
Tue Jun 1 00:52:00 GMT 1999


  In message <3752D581.C2ED469B@cygnus.com>, you write:
  > This patch corrects a problem in the C and C++ compilers in which the
  > compiler sometimes misses the end of a string or character literal
  > while attempting to complete a multibyte character. It replaces all
  > but the first character back into the input stream. In order to do
  > this, I had to add the capability to put back more than one character
  > to both the C and C++ front ends. I was careful to make sure that the
  > overhead to sub_getch is still the same as before for the case where
  > no character is in the buffer.
Well, you know more about this code than anyone else.  If you're happy with
it, I see no reason to object. 

I don't think SJIS bugs are critical for the gcc-2.95 release, so installing it
into just the mainline sources ought to be OK.
jeff 

  > This fixes the testcase g++.old-deja/g++.mike/net48.C when compiled
  > with SJIS as the multibyte character set (set the LANG environment
  > variable to C-SJIS when configured with --enable-c-mbchar). The
  > warnings are still valid, however the literals now have the correct
  > value and the end of the literals is not missed.
  > 
  > Dave
  > 
  > --------------22196983260D8D98C779157E
  > Content-Type: text/plain; charset=us-ascii; name="101093-CR-1.ChangeLog"
  > Content-Transfer-Encoding: 7bit
  > Content-Disposition: inline; filename="101093-CR-1.ChangeLog"
  > 
  > Wed May 26 13:47:29 1999  Dave Brolley  <brolley@cygnus.com>
  > 
  > 	* c-lex.c (GETC): Redefine to call getch.
  > 	(UNGETC): Redefine to call put_back.
  > 	(putback_buffer): New structure type.
  > 	(putback): New static structure.
  > 	(getch): New function.
  > 	(put_back): New function.
  > 	(yylex): Replace unused bytes from bad multibyte character.
  > 
  > 
  > --------------22196983260D8D98C779157E
  > Content-Type: text/plain; charset=us-ascii; name="101093-CR-1.ChangeLog"
  > Content-Transfer-Encoding: 7bit
  > Content-Disposition: inline; filename="101093-CR-1.ChangeLog"
  > 
  > Wed May 26 13:50:39 1999  Dave Brolley  <brolley@cygnus.com>
  > 
  > 	* lex.c (real_yylex): Replace unused bytes from bad multibyte char.
  > 
  > 	* input.c (putback_buffer): New structure type.
  > 	(putback): Replaces putback_char member.
  > 	(putback): Replaces putback_char static variable.
  > 	(feed_input): Use putback.
  > 	(end_input): Use putback.
  > 	(sub_getch): Use putback.
  > 	(put_back): Use putback.
  > 
  > 
  > --------------22196983260D8D98C779157E
  > Content-Type: text/plain; charset=us-ascii; name="101093-CR-1.txt"
  > Content-Transfer-Encoding: 7bit
  > Content-Disposition: inline; filename="101093-CR-1.txt"
  > 
  > Index: c-lex.c
  > ===================================================================
  > RCS file: /egcs/carton/cvsfiles/egcs/gcc/c-lex.c,v
  > retrieving revision 1.49
  > diff -c -p -c -p -r1.49 c-lex.c
  > *** c-lex.c	1999/03/20 19:21:23	1.49
  > --- c-lex.c	1999/05/26 21:02:50
  > *************** extern int yy_get_token ();
  > *** 71,80 ****
  >   
  >   #define GETC() (yy_cur < yy_lim ? *yy_cur++ : yy_get_token ())
  >   #define UNGETC(c) ((c) == EOF ? 0 : yy_cur--)
  > ! #else
  > ! #define GETC() getc (finput)
  > ! #define UNGETC(c) ungetc (c, finput)
  > ! #endif
  >   
  >   /* the declaration found for the last IDENTIFIER token read in.
  >      yylex must look this up to detect typedefs, which get token type TYPENAME,
  > --- 71,117 ----
  >   
  >   #define GETC() (yy_cur < yy_lim ? *yy_cur++ : yy_get_token ())
  >   #define UNGETC(c) ((c) == EOF ? 0 : yy_cur--)
  > ! 
  > ! #else /* ! USE_CPPLIB */
  > ! 
  > ! #define GETC() getch ()
  > ! #define UNGETC(c) put_back (c)
  > ! 
  > ! struct putback_buffer {
  > !   char *buffer;
  > !   int   buffer_size;
  > !   int   index;
  > ! };
  > ! 
  > ! static struct putback_buffer putback = {NULL, 0, -1};
  > ! 
  > ! static inline int
  > ! getch ()
  > ! {
  > !   if (putback.index != -1)
  > !     {
  > !       int ch = putback.buffer[putback.index];
  > !       --putback.index;
  > !       return ch;
  > !     }
  > !   return getc (finput);
  > ! }
  > ! 
  > ! static inline void
  > ! put_back (ch)
  > !      int ch;
  > ! {
  > !   if (ch != EOF)
  > !     {
  > !       if (putback.index == putback.buffer_size - 1)
  > ! 	{
  > ! 	  putback.buffer_size += 16;
  > ! 	  putback.buffer = xrealloc (putback.buffer, putback.buffer_size);
  > ! 	}
  > !       putback.buffer[++putback.index] = ch;
  > !     }
  > ! }
  > ! #endif /* ! USE_CPPLIB */
  >   
  >   /* the declaration found for the last IDENTIFIER token read in.
  >      yylex must look this up to detect typedefs, which get token type TYPENAME,
  > *************** yylex ()
  > *** 1972,1983 ****
  >   		else
  >   		  {
  >   		    if (char_len == -1)
  > ! 		      warning ("Ignoring invalid multibyte character");
  > ! 		    if (wide_flag)
  > ! 		      c = wc;
  >   #ifdef MAP_CHARACTER
  > ! 		    else
  > ! 		      c = MAP_CHARACTER (c);
  >   #endif
  >   		  }
  >   #else /* ! MULTIBYTE_CHARS */
  > --- 2009,2025 ----
  >   		else
  >   		  {
  >   		    if (char_len == -1)
  > ! 		      {
  > ! 			warning ("Ignoring invalid multibyte character");
  > ! 			/* Replace all but the first byte.  */
  > ! 			for (--i; i > 1; --i)
  > ! 			  UNGETC (token_buffer[i]);
  > ! 			wc = token_buffer[1];
  > ! 		      }
  >   #ifdef MAP_CHARACTER
  > ! 		      c = MAP_CHARACTER (wc);
  > ! #else
  > ! 		      c = wc;
  >   #endif
  >   		  }
  >   #else /* ! MULTIBYTE_CHARS */
  > *************** yylex ()
  > *** 2095,2114 ****
  >   		    c = GETC ();
  >   		  }
  >   		if (char_len == -1)
  > - 		  warning ("Ignoring invalid multibyte character");
  > - 		else
  >   		  {
  > ! 		    /* mbtowc sometimes needs an extra char before accepting */
  > ! 		    if (char_len <= i)
  > ! 		      UNGETC (c);
  > ! 		    if (! wide_flag)
  > ! 		      {
  > ! 			p += (i + 1);
  > ! 			c = GETC ();
  > ! 			continue;
  > ! 		      }
  > ! 		    c = wc;
  >   		  }
  >   #endif /* MULTIBYTE_CHARS */
  >   	      }
  >   
  > --- 2137,2160 ----
  >   		    c = GETC ();
  >   		  }
  >   		if (char_len == -1)
  >   		  {
  > ! 		    warning ("Ignoring invalid multibyte character");
  > ! 		    /* Replace all except the first byte.  */
  > ! 		    UNGETC (c);
  > ! 		    for (--i; i > 0; --i)
  > ! 		      UNGETC (p[i]);
  > ! 		    char_len = 1;
  > ! 		  }
  > ! 		/* mbtowc sometimes needs an extra char before accepting */
  > ! 		if (char_len <= i)
  > ! 		  UNGETC (c);
  > ! 		if (! wide_flag)
  > ! 		  {
  > ! 		    p += (i + 1);
  > ! 		    c = GETC ();
  > ! 		    continue;
  >   		  }
  > + 		c = wc;
  >   #endif /* MULTIBYTE_CHARS */
  >   	      }
  >   
  > 
  > --------------22196983260D8D98C779157E
  > Content-Type: text/plain; charset=us-ascii; name="101093-CR-1.txt"
  > Content-Transfer-Encoding: 7bit
  > Content-Disposition: inline; filename="101093-CR-1.txt"
  > 
  > Index: input.c
  > ===================================================================
  > RCS file: /egcs/carton/cvsfiles/egcs/gcc/cp/input.c,v
  > retrieving revision 1.9
  > diff -c -p -c -p -r1.9 input.c
  > *** input.c	1999/02/01 07:01:50	1.9
  > --- input.c	1999/05/26 21:02:51
  > *************** Boston, MA 02111-1307, USA.  */
  > *** 33,38 ****
  > --- 33,44 ----
  >   
  >   extern FILE *finput;
  >   
  > + struct putback_buffer {
  > +   char *buffer;
  > +   int   buffer_size;
  > +   int   index;
  > + };
  > + 
  >   struct input_source {
  >     /* saved string */
  >     char *str;
  > *************** struct input_source {
  > *** 45,51 ****
  >     char *filename;
  >     int lineno;
  >     struct pending_input *input;
  > !   int putback_char;
  >   };
  >   
  >   static struct input_source *input, *free_inputs;
  > --- 51,57 ----
  >     char *filename;
  >     int lineno;
  >     struct pending_input *input;
  > !   struct putback_buffer putback;
  >   };
  >   
  >   static struct input_source *input, *free_inputs;
  > *************** free_input (inp)
  > *** 98,104 ****
  >     free_inputs = inp;
  >   }
  >   
  > ! static int putback_char = -1;
  >   
  >   /* Some of these external functions are declared inline in case this file
  >      is included in lex.c.  */
  > --- 104,110 ----
  >     free_inputs = inp;
  >   }
  >   
  > ! static struct putback_buffer putback = {NULL, 0, -1};
  >   
  >   /* Some of these external functions are declared inline in case this file
  >      is included in lex.c.  */
  > *************** feed_input (str, len)
  > *** 122,129 ****
  >     inp->filename = input_filename;
  >     inp->lineno = lineno;
  >     inp->input = save_pending_input ();
  > !   inp->putback_char = putback_char;
  > !   putback_char = -1;
  >     input = inp;
  >   }
  >   
  > --- 128,137 ----
  >     inp->filename = input_filename;
  >     inp->lineno = lineno;
  >     inp->input = save_pending_input ();
  > !   inp->putback = putback;
  > !   putback.buffer = NULL;
  > !   putback.buffer_size = 0;
  > !   putback.index = -1;
  >     input = inp;
  >   }
  >   
  > *************** end_input ()
  > *** 141,147 ****
  >     lineno = inp->lineno;
  >     /* Get interface/implementation back in sync.  */
  >     extract_interface_info ();
  > !   putback_char = inp->putback_char;
  >     restore_pending_input (inp->input);
  >     free_input (inp);
  >   }
  > --- 149,155 ----
  >     lineno = inp->lineno;
  >     /* Get interface/implementation back in sync.  */
  >     extract_interface_info ();
  > !   putback = inp->putback;
  >     restore_pending_input (inp->input);
  >     free_input (inp);
  >   }
  > *************** end_input ()
  > *** 149,165 ****
  >   static inline int
  >   sub_getch ()
  >   {
  > !   if (putback_char != -1)
  >       {
  > !       int ch = putback_char;
  > !       putback_char = -1;
  >         return ch;
  >       }
  >     if (input)
  >       {
  >         if (input->offset >= input->length)
  >   	{
  > ! 	  my_friendly_assert (putback_char == -1, 223);
  >   	  ++(input->offset);
  >   	  if (input->offset - input->length < 64)
  >   	    return EOF;
  > --- 157,173 ----
  >   static inline int
  >   sub_getch ()
  >   {
  > !   if (putback.index != -1)
  >       {
  > !       int ch = putback.buffer[putback.index];
  > !       --putback.index;
  >         return ch;
  >       }
  >     if (input)
  >       {
  >         if (input->offset >= input->length)
  >   	{
  > ! 	  my_friendly_assert (putback.index == -1, 223);
  >   	  ++(input->offset);
  >   	  if (input->offset - input->length < 64)
  >   	    return EOF;
  > *************** put_back (ch)
  > *** 180,187 ****
  >   {
  >     if (ch != EOF)
  >       {
  > !       my_friendly_assert (putback_char == -1, 224);
  > !       putback_char = ch;
  >       }
  >   }
  >   
  > --- 188,200 ----
  >   {
  >     if (ch != EOF)
  >       {
  > !       if (putback.index == putback.buffer_size - 1)
  > ! 	{
  > ! 	  putback.buffer_size += 16;
  > ! 	  putback.buffer = xrealloc (putback.buffer, putback.buffer_size);
  > ! 	}
  > !       my_friendly_assert (putback.buffer != NULL, 224);
  > !       putback.buffer[++putback.index] = ch;
  >       }
  >   }
  >   
  > Index: lex.c
  > ===================================================================
  > RCS file: /egcs/carton/cvsfiles/egcs/gcc/cp/lex.c,v
  > retrieving revision 1.112
  > diff -c -p -c -p -r1.112 lex.c
  > *** lex.c	1999/05/24 00:46:52	1.112
  > --- lex.c	1999/05/26 21:02:52
  > *************** real_yylex ()
  > *** 4074,4085 ****
  >   		else
  >   		  {
  >   		    if (char_len == -1)
  > ! 		      warning ("Ignoring invalid multibyte character");
  > ! 		    if (wide_flag)
  > ! 		      c = wc;
  >   #ifdef MAP_CHARACTER
  > ! 		    else
  > ! 		      c = MAP_CHARACTER (c);
  >   #endif
  >   		  }
  >   #else /* ! MULTIBYTE_CHARS */
  > --- 4074,4090 ----
  >   		else
  >   		  {
  >   		    if (char_len == -1)
  > ! 		      {
  > ! 			warning ("Ignoring invalid multibyte character");
  > ! 			/* Replace all but the first byte.  */
  > ! 			for (--i; i > 1; --i)
  > ! 			  put_back (token_buffer[i]);
  > ! 			wc = token_buffer[1];
  > ! 		      }
  >   #ifdef MAP_CHARACTER
  > ! 		      c = MAP_CHARACTER (wc);
  > ! #else
  > ! 		      c = wc;
  >   #endif
  >   		  }
  >   #else /* ! MULTIBYTE_CHARS */
  > *************** real_yylex ()
  > *** 4203,4222 ****
  >   		    c = getch ();
  >   		  }
  >   		if (char_len == -1)
  > - 		  warning ("Ignoring invalid multibyte character");
  > - 		else
  >   		  {
  > ! 		    /* mbtowc sometimes needs an extra char before accepting */
  > ! 		    if (char_len <= i)
  > ! 		      put_back (c);
  > ! 		    if (! wide_flag)
  > ! 		      {
  > ! 			p += (i + 1);
  > ! 			c = getch ();
  > ! 			continue;
  > ! 		      }
  > ! 		    c = wc;
  >   		  }
  >   #endif /* MULTIBYTE_CHARS */
  >   	      }
  >   
  > --- 4208,4231 ----
  >   		    c = getch ();
  >   		  }
  >   		if (char_len == -1)
  >   		  {
  > ! 		    warning ("Ignoring invalid multibyte character");
  > ! 		    /* Replace all except the first byte.  */
  > ! 		    put_back (c);
  > ! 		    for (--i; i > 0; --i)
  > ! 		      put_back (p[i]);
  > ! 		    char_len = 1;
  > ! 		  }
  > ! 		/* mbtowc sometimes needs an extra char before accepting */
  > ! 		if (char_len <= i)
  > ! 		  put_back (c);
  > ! 		if (! wide_flag)
  > ! 		  {
  > ! 		    p += (i + 1);
  > ! 		    c = getch ();
  > ! 		    continue;
  >   		  }
  > + 		c = wc;
  >   #endif /* MULTIBYTE_CHARS */
  >   	      }
  >   
  > 
  > --------------22196983260D8D98C779157E--
  > 
  > 
  > 
  > --------------F133F5721FECA46830369BC4--
  > 




More information about the Gcc-patches mailing list