This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

[Fwd: Patch for Recovery from Bad Multibyte Characters]





Sorry, I am unable to deliver your mail note to egcs-patches@cygnus.com,
that list name is no longer used.  Please try resending your mail
note to the new list name:  egcs-patches@egcs.cygnus.com.

If you have any questions about this, send mail to sourcemaster@cygnus.com.
This is a machine-generated message.

Here is a copy of the note you sent.
-----------------------------------------------------------------------------

Hi,

After some further discussion with Jason Merrill, I used gdb to trace through the
code on his system and I'm satisfied that the code is behaving correctly.
MB_CUR_MAX is 2 on his platform and mbtowc is returning -1 (invalid character) when
called against <\352> and <\352>". The compiler is correctly generating a warning,
treating ,<\352> as a single character and terminating the string at the double
quote.

I've committed this patch to the mainline sources.

Dave

Dave Brolley wrote:

> Hi,
>
> This patch corrects a problem in the C and C++ compilers in which the
> compiler sometimes misses the end of a string or character literal
> while attempting to complete a multibyte character. It replaces all
> but the first character back into the input stream. In order to do
> this, I had to add the capability to put back more than one character
> to both the C and C++ front ends. I was careful to make sure that the
> overhead to sub_getch is still the same as before for the case where
> no character is in the buffer.
>
> This fixes the testcase g++.old-deja/g++.mike/net48.C when compiled
> with SJIS as the multibyte character set (set the LANG environment
> variable to C-SJIS when configured with --enable-c-mbchar). The
> warnings are still valid, however the literals now have the correct
> value and the end of the literals is not missed.
>
> Dave
>
>   ----------------------------------------------------------------------
> Wed May 26 13:47:29 1999  Dave Brolley  <brolley@cygnus.com>
>
>         * c-lex.c (GETC): Redefine to call getch.
>         (UNGETC): Redefine to call put_back.
>         (putback_buffer): New structure type.
>         (putback): New static structure.
>         (getch): New function.
>         (put_back): New function.
>         (yylex): Replace unused bytes from bad multibyte character.
>
>   ----------------------------------------------------------------------
> Wed May 26 13:50:39 1999  Dave Brolley  <brolley@cygnus.com>
>
>         * lex.c (real_yylex): Replace unused bytes from bad multibyte char.
>
>         * input.c (putback_buffer): New structure type.
>         (struct input_source): putback replaces putback_char member.
>         (putback): Replaces putback_char static variable.
>         (feed_input): Use putback.
>         (end_input): Use putback.
>         (sub_getch): Use putback.
>         (put_back): Use putback.
>
>   ----------------------------------------------------------------------
> Index: c-lex.c
> ===================================================================
> RCS file: /egcs/carton/cvsfiles/egcs/gcc/c-lex.c,v
> retrieving revision 1.49
> diff -c -p -c -p -r1.49 c-lex.c
> *** c-lex.c     1999/03/20 19:21:23     1.49
> --- c-lex.c     1999/05/26 21:02:50
> *************** extern int yy_get_token ();
> *** 71,80 ****
>
>   #define GETC() (yy_cur < yy_lim ? *yy_cur++ : yy_get_token ())
>   #define UNGETC(c) ((c) == EOF ? 0 : yy_cur--)
> ! #else
> ! #define GETC() getc (finput)
> ! #define UNGETC(c) ungetc (c, finput)
> ! #endif
>
>   /* the declaration found for the last IDENTIFIER token read in.
>      yylex must look this up to detect typedefs, which get token type TYPENAME,
> --- 71,117 ----
>
>   #define GETC() (yy_cur < yy_lim ? *yy_cur++ : yy_get_token ())
>   #define UNGETC(c) ((c) == EOF ? 0 : yy_cur--)
> !
> ! #else /* ! USE_CPPLIB */
> !
> ! #define GETC() getch ()
> ! #define UNGETC(c) put_back (c)
> !
> ! struct putback_buffer {
> !   char *buffer;
> !   int   buffer_size;
> !   int   index;
> ! };
> !
> ! static struct putback_buffer putback = {NULL, 0, -1};
> !
> ! static inline int
> ! getch ()
> ! {
> !   if (putback.index != -1)
> !     {
> !       int ch = putback.buffer[putback.index];
> !       --putback.index;
> !       return ch;
> !     }
> !   return getc (finput);
> ! }
> !
> ! static inline void
> ! put_back (ch)
> !      int ch;
> ! {
> !   if (ch != EOF)
> !     {
> !       if (putback.index == putback.buffer_size - 1)
> !       {
> !         putback.buffer_size += 16;
> !         putback.buffer = xrealloc (putback.buffer, putback.buffer_size);
> !       }
> !       putback.buffer[++putback.index] = ch;
> !     }
> ! }
> ! #endif /* ! USE_CPPLIB */
>
>   /* the declaration found for the last IDENTIFIER token read in.
>      yylex must look this up to detect typedefs, which get token type TYPENAME,
> *************** yylex ()
> *** 1972,1983 ****
>                 else
>                   {
>                     if (char_len == -1)
> !                     warning ("Ignoring invalid multibyte character");
> !                   if (wide_flag)
> !                     c = wc;
>   #ifdef MAP_CHARACTER
> !                   else
> !                     c = MAP_CHARACTER (c);
>   #endif
>                   }
>   #else /* ! MULTIBYTE_CHARS */
> --- 2009,2025 ----
>                 else
>                   {
>                     if (char_len == -1)
> !                     {
> !                       warning ("Ignoring invalid multibyte character");
> !                       /* Replace all but the first byte.  */
> !                       for (--i; i > 1; --i)
> !                         UNGETC (token_buffer[i]);
> !                       wc = token_buffer[1];
> !                     }
>   #ifdef MAP_CHARACTER
> !                     c = MAP_CHARACTER (wc);
> ! #else
> !                     c = wc;
>   #endif
>                   }
>   #else /* ! MULTIBYTE_CHARS */
> *************** yylex ()
> *** 2095,2114 ****
>                     c = GETC ();
>                   }
>                 if (char_len == -1)
> -                 warning ("Ignoring invalid multibyte character");
> -               else
>                   {
> !                   /* mbtowc sometimes needs an extra char before accepting */
> !                   if (char_len <= i)
> !                     UNGETC (c);
> !                   if (! wide_flag)
> !                     {
> !                       p += (i + 1);
> !                       c = GETC ();
> !                       continue;
> !                     }
> !                   c = wc;
>                   }
>   #endif /* MULTIBYTE_CHARS */
>               }
>
> --- 2137,2160 ----
>                     c = GETC ();
>                   }
>                 if (char_len == -1)
>                   {
> !                   warning ("Ignoring invalid multibyte character");
> !                   /* Replace all except the first byte.  */
> !                   UNGETC (c);
> !                   for (--i; i > 0; --i)
> !                     UNGETC (p[i]);
> !                   char_len = 1;
> !                 }
> !               /* mbtowc sometimes needs an extra char before accepting */
> !               if (char_len <= i)
> !                 UNGETC (c);
> !               if (! wide_flag)
> !                 {
> !                   p += (i + 1);
> !                   c = GETC ();
> !                   continue;
>                   }
> +               c = wc;
>   #endif /* MULTIBYTE_CHARS */
>               }
>
>
>   ----------------------------------------------------------------------
> Index: input.c
> ===================================================================
> RCS file: /egcs/carton/cvsfiles/egcs/gcc/cp/input.c,v
> retrieving revision 1.9
> diff -c -p -c -p -r1.9 input.c
> *** input.c     1999/02/01 07:01:50     1.9
> --- input.c     1999/05/26 21:02:51
> *************** Boston, MA 02111-1307, USA.  */
> *** 33,38 ****
> --- 33,44 ----
>
>   extern FILE *finput;
>
> + struct putback_buffer {
> +   char *buffer;
> +   int   buffer_size;
> +   int   index;
> + };
> +
>   struct input_source {
>     /* saved string */
>     char *str;
> *************** struct input_source {
> *** 45,51 ****
>     char *filename;
>     int lineno;
>     struct pending_input *input;
> !   int putback_char;
>   };
>
>   static struct input_source *input, *free_inputs;
> --- 51,57 ----
>     char *filename;
>     int lineno;
>     struct pending_input *input;
> !   struct putback_buffer putback;
>   };
>
>   static struct input_source *input, *free_inputs;
> *************** free_input (inp)
> *** 98,104 ****
>     free_inputs = inp;
>   }
>
> ! static int putback_char = -1;
>
>   /* Some of these external functions are declared inline in case this file
>      is included in lex.c.  */
> --- 104,110 ----
>     free_inputs = inp;
>   }
>
> ! static struct putback_buffer putback = {NULL, 0, -1};
>
>   /* Some of these external functions are declared inline in case this file
>      is included in lex.c.  */
> *************** feed_input (str, len)
> *** 122,129 ****
>     inp->filename = input_filename;
>     inp->lineno = lineno;
>     inp->input = save_pending_input ();
> !   inp->putback_char = putback_char;
> !   putback_char = -1;
>     input = inp;
>   }
>
> --- 128,137 ----
>     inp->filename = input_filename;
>     inp->lineno = lineno;
>     inp->input = save_pending_input ();
> !   inp->putback = putback;
> !   putback.buffer = NULL;
> !   putback.buffer_size = 0;
> !   putback.index = -1;
>     input = inp;
>   }
>
> *************** end_input ()
> *** 141,147 ****
>     lineno = inp->lineno;
>     /* Get interface/implementation back in sync.  */
>     extract_interface_info ();
> !   putback_char = inp->putback_char;
>     restore_pending_input (inp->input);
>     free_input (inp);
>   }
> --- 149,155 ----
>     lineno = inp->lineno;
>     /* Get interface/implementation back in sync.  */
>     extract_interface_info ();
> !   putback = inp->putback;
>     restore_pending_input (inp->input);
>     free_input (inp);
>   }
> *************** end_input ()
> *** 149,165 ****
>   static inline int
>   sub_getch ()
>   {
> !   if (putback_char != -1)
>       {
> !       int ch = putback_char;
> !       putback_char = -1;
>         return ch;
>       }
>     if (input)
>       {
>         if (input->offset >= input->length)
>         {
> !         my_friendly_assert (putback_char == -1, 223);
>           ++(input->offset);
>           if (input->offset - input->length < 64)
>             return EOF;
> --- 157,173 ----
>   static inline int
>   sub_getch ()
>   {
> !   if (putback.index != -1)
>       {
> !       int ch = putback.buffer[putback.index];
> !       --putback.index;
>         return ch;
>       }
>     if (input)
>       {
>         if (input->offset >= input->length)
>         {
> !         my_friendly_assert (putback.index == -1, 223);
>           ++(input->offset);
>           if (input->offset - input->length < 64)
>             return EOF;
> *************** put_back (ch)
> *** 180,187 ****
>   {
>     if (ch != EOF)
>       {
> !       my_friendly_assert (putback_char == -1, 224);
> !       putback_char = ch;
>       }
>   }
>
> --- 188,200 ----
>   {
>     if (ch != EOF)
>       {
> !       if (putback.index == putback.buffer_size - 1)
> !       {
> !         putback.buffer_size += 16;
> !         putback.buffer = xrealloc (putback.buffer, putback.buffer_size);
> !       }
> !       my_friendly_assert (putback.buffer != NULL, 224);
> !       putback.buffer[++putback.index] = ch;
>       }
>   }
>
> Index: lex.c
> ===================================================================
> RCS file: /egcs/carton/cvsfiles/egcs/gcc/cp/lex.c,v
> retrieving revision 1.112
> diff -c -p -c -p -r1.112 lex.c
> *** lex.c       1999/05/24 00:46:52     1.112
> --- lex.c       1999/05/26 21:02:52
> *************** real_yylex ()
> *** 4074,4085 ****
>                 else
>                   {
>                     if (char_len == -1)
> !                     warning ("Ignoring invalid multibyte character");
> !                   if (wide_flag)
> !                     c = wc;
>   #ifdef MAP_CHARACTER
> !                   else
> !                     c = MAP_CHARACTER (c);
>   #endif
>                   }
>   #else /* ! MULTIBYTE_CHARS */
> --- 4074,4090 ----
>                 else
>                   {
>                     if (char_len == -1)
> !                     {
> !                       warning ("Ignoring invalid multibyte character");
> !                       /* Replace all but the first byte.  */
> !                       for (--i; i > 1; --i)
> !                         put_back (token_buffer[i]);
> !                       wc = token_buffer[1];
> !                     }
>   #ifdef MAP_CHARACTER
> !                     c = MAP_CHARACTER (wc);
> ! #else
> !                     c = wc;
>   #endif
>                   }
>   #else /* ! MULTIBYTE_CHARS */
> *************** real_yylex ()
> *** 4203,4222 ****
>                     c = getch ();
>                   }
>                 if (char_len == -1)
> -                 warning ("Ignoring invalid multibyte character");
> -               else
>                   {
> !                   /* mbtowc sometimes needs an extra char before accepting */
> !                   if (char_len <= i)
> !                     put_back (c);
> !                   if (! wide_flag)
> !                     {
> !                       p += (i + 1);
> !                       c = getch ();
> !                       continue;
> !                     }
> !                   c = wc;
>                   }
>   #endif /* MULTIBYTE_CHARS */
>               }
>
> --- 4208,4231 ----
>                     c = getch ();
>                   }
>                 if (char_len == -1)
>                   {
> !                   warning ("Ignoring invalid multibyte character");
> !                   /* Replace all except the first byte.  */
> !                   put_back (c);
> !                   for (--i; i > 0; --i)
> !                     put_back (p[i]);
> !                   char_len = 1;
> !                 }
> !               /* mbtowc sometimes needs an extra char before accepting */
> !               if (char_len <= i)
> !                 put_back (c);
> !               if (! wide_flag)
> !                 {
> !                   p += (i + 1);
> !                   c = getch ();
> !                   continue;
>                   }
> +               c = wc;
>   #endif /* MULTIBYTE_CHARS */
>               }







Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]