This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [Fwd: Patch for Recovery from Bad Multibyte Characters]
- To: Dave Brolley <brolley at cygnus dot com>
- Subject: Re: [Fwd: Patch for Recovery from Bad Multibyte Characters]
- From: Jeffrey A Law <law at cygnus dot com>
- Date: Tue, 01 Jun 1999 01:47:33 -0600
- Cc: egcs-patches at egcs dot cygnus dot com
- Reply-To: law at cygnus dot com
In message <3752D581.C2ED469B@cygnus.com> you write:
> This patch corrects a problem in the C and C++ compilers in which the
> compiler sometimes misses the end of a string or character literal
> while attempting to complete a multibyte character. It replaces all
> but the first character back into the input stream. In order to do
> this, I had to add the capability to put back more than one character
> to both the C and C++ front ends. I was careful to make sure that the
> overhead to sub_getch is still the same as before for the case where
> no character is in the buffer.
Well, you know more about this code than anyone else. If you're happy with
it, I see no reason to object.
I don't think SJIS bugs are critical for the gcc-2.95 release, so installing it
into just the mainline sources ought to be OK.
jeff
> This fixes the testcase g++.old-deja/g++.mike/net48.C when compiled
> with SJIS as the multibyte character set (set the LANG environment
> variable to C-SJIS when configured with --enable-c-mbchar). The
> warnings are still valid, however the literals now have the correct
> value and the end of the literals is not missed.
>
> Dave
>
> --------------22196983260D8D98C779157E
> Content-Type: text/plain; charset=us-ascii; name="101093-CR-1.ChangeLog"
> Content-Transfer-Encoding: 7bit
> Content-Disposition: inline; filename="101093-CR-1.ChangeLog"
>
> Wed May 26 13:47:29 1999 Dave Brolley <brolley@cygnus.com>
>
> * c-lex.c (GETC): Redefine to call getch.
> (UNGETC): Redefine to call put_back.
> (putback_buffer): New structure type.
> (putback): New static structure.
> (getch): New function.
> (put_back): New function.
> (yylex): Replace unused bytes from bad multibyte character.
>
>
> --------------22196983260D8D98C779157E
> Content-Type: text/plain; charset=us-ascii; name="101093-CR-1.ChangeLog"
> Content-Transfer-Encoding: 7bit
> Content-Disposition: inline; filename="101093-CR-1.ChangeLog"
>
> Wed May 26 13:50:39 1999 Dave Brolley <brolley@cygnus.com>
>
> * lex.c (real_yylex): Replace unused bytes from bad multibyte char.
>
> * input.c (putback_buffer): New structure type.
> (putback): Replaces putback_char member.
> (putback): Replaces putback_char static variable.
> (feed_input): Use putback.
> (end_input): Use putback.
> (sub_getch): Use putback.
> (put_back): Use putback.
>
>
> --------------22196983260D8D98C779157E
> Content-Type: text/plain; charset=us-ascii; name="101093-CR-1.txt"
> Content-Transfer-Encoding: 7bit
> Content-Disposition: inline; filename="101093-CR-1.txt"
>
> Index: c-lex.c
> ===================================================================
> RCS file: /egcs/carton/cvsfiles/egcs/gcc/c-lex.c,v
> retrieving revision 1.49
> diff -c -p -c -p -r1.49 c-lex.c
> *** c-lex.c 1999/03/20 19:21:23 1.49
> --- c-lex.c 1999/05/26 21:02:50
> *************** extern int yy_get_token ();
> *** 71,80 ****
>
> #define GETC() (yy_cur < yy_lim ? *yy_cur++ : yy_get_token ())
> #define UNGETC(c) ((c) == EOF ? 0 : yy_cur--)
> ! #else
> ! #define GETC() getc (finput)
> ! #define UNGETC(c) ungetc (c, finput)
> ! #endif
>
> /* the declaration found for the last IDENTIFIER token read in.
> yylex must look this up to detect typedefs, which get token type TYPENAME,
> --- 71,117 ----
>
> #define GETC() (yy_cur < yy_lim ? *yy_cur++ : yy_get_token ())
> #define UNGETC(c) ((c) == EOF ? 0 : yy_cur--)
> !
> ! #else /* ! USE_CPPLIB */
> !
> ! #define GETC() getch ()
> ! #define UNGETC(c) put_back (c)
> !
> ! struct putback_buffer {
> ! char *buffer;
> ! int buffer_size;
> ! int index;
> ! };
> !
> ! static struct putback_buffer putback = {NULL, 0, -1};
> !
> ! static inline int
> ! getch ()
> ! {
> ! if (putback.index != -1)
> ! {
> ! int ch = putback.buffer[putback.index];
> ! --putback.index;
> ! return ch;
> ! }
> ! return getc (finput);
> ! }
> !
> ! static inline void
> ! put_back (ch)
> ! int ch;
> ! {
> ! if (ch != EOF)
> ! {
> ! if (putback.index == putback.buffer_size - 1)
> ! {
> ! putback.buffer_size += 16;
> ! putback.buffer = xrealloc (putback.buffer, putback.buffer_size);
> ! }
> ! putback.buffer[++putback.index] = ch;
> ! }
> ! }
> ! #endif /* ! USE_CPPLIB */
>
> /* the declaration found for the last IDENTIFIER token read in.
> yylex must look this up to detect typedefs, which get token type TYPENAME,
> *************** yylex ()
> *** 1972,1983 ****
> else
> {
> if (char_len == -1)
> ! warning ("Ignoring invalid multibyte character");
> ! if (wide_flag)
> ! c = wc;
> #ifdef MAP_CHARACTER
> ! else
> ! c = MAP_CHARACTER (c);
> #endif
> }
> #else /* ! MULTIBYTE_CHARS */
> --- 2009,2025 ----
> else
> {
> if (char_len == -1)
> ! {
> ! warning ("Ignoring invalid multibyte character");
> ! /* Replace all but the first byte. */
> ! for (--i; i > 1; --i)
> ! UNGETC (token_buffer[i]);
> ! wc = token_buffer[1];
> ! }
> #ifdef MAP_CHARACTER
> ! c = MAP_CHARACTER (wc);
> ! #else
> ! c = wc;
> #endif
> }
> #else /* ! MULTIBYTE_CHARS */
> *************** yylex ()
> *** 2095,2114 ****
> c = GETC ();
> }
> if (char_len == -1)
> - warning ("Ignoring invalid multibyte character");
> - else
> {
> ! /* mbtowc sometimes needs an extra char before accepting */
> ! if (char_len <= i)
> ! UNGETC (c);
> ! if (! wide_flag)
> ! {
> ! p += (i + 1);
> ! c = GETC ();
> ! continue;
> ! }
> ! c = wc;
> }
> #endif /* MULTIBYTE_CHARS */
> }
>
> --- 2137,2160 ----
> c = GETC ();
> }
> if (char_len == -1)
> {
> ! warning ("Ignoring invalid multibyte character");
> ! /* Replace all except the first byte. */
> ! UNGETC (c);
> ! for (--i; i > 0; --i)
> ! UNGETC (p[i]);
> ! char_len = 1;
> ! }
> ! /* mbtowc sometimes needs an extra char before accepting */
> ! if (char_len <= i)
> ! UNGETC (c);
> ! if (! wide_flag)
> ! {
> ! p += (i + 1);
> ! c = GETC ();
> ! continue;
> }
> + c = wc;
> #endif /* MULTIBYTE_CHARS */
> }
>
>
> --------------22196983260D8D98C779157E
> Content-Type: text/plain; charset=us-ascii; name="101093-CR-1.txt"
> Content-Transfer-Encoding: 7bit
> Content-Disposition: inline; filename="101093-CR-1.txt"
>
> Index: input.c
> ===================================================================
> RCS file: /egcs/carton/cvsfiles/egcs/gcc/cp/input.c,v
> retrieving revision 1.9
> diff -c -p -c -p -r1.9 input.c
> *** input.c 1999/02/01 07:01:50 1.9
> --- input.c 1999/05/26 21:02:51
> *************** Boston, MA 02111-1307, USA. */
> *** 33,38 ****
> --- 33,44 ----
>
> extern FILE *finput;
>
> + struct putback_buffer {
> + char *buffer;
> + int buffer_size;
> + int index;
> + };
> +
> struct input_source {
> /* saved string */
> char *str;
> *************** struct input_source {
> *** 45,51 ****
> char *filename;
> int lineno;
> struct pending_input *input;
> ! int putback_char;
> };
>
> static struct input_source *input, *free_inputs;
> --- 51,57 ----
> char *filename;
> int lineno;
> struct pending_input *input;
> ! struct putback_buffer putback;
> };
>
> static struct input_source *input, *free_inputs;
> *************** free_input (inp)
> *** 98,104 ****
> free_inputs = inp;
> }
>
> ! static int putback_char = -1;
>
> /* Some of these external functions are declared inline in case this file
> is included in lex.c. */
> --- 104,110 ----
> free_inputs = inp;
> }
>
> ! static struct putback_buffer putback = {NULL, 0, -1};
>
> /* Some of these external functions are declared inline in case this file
> is included in lex.c. */
> *************** feed_input (str, len)
> *** 122,129 ****
> inp->filename = input_filename;
> inp->lineno = lineno;
> inp->input = save_pending_input ();
> ! inp->putback_char = putback_char;
> ! putback_char = -1;
> input = inp;
> }
>
> --- 128,137 ----
> inp->filename = input_filename;
> inp->lineno = lineno;
> inp->input = save_pending_input ();
> ! inp->putback = putback;
> ! putback.buffer = NULL;
> ! putback.buffer_size = 0;
> ! putback.index = -1;
> input = inp;
> }
>
> *************** end_input ()
> *** 141,147 ****
> lineno = inp->lineno;
> /* Get interface/implementation back in sync. */
> extract_interface_info ();
> ! putback_char = inp->putback_char;
> restore_pending_input (inp->input);
> free_input (inp);
> }
> --- 149,155 ----
> lineno = inp->lineno;
> /* Get interface/implementation back in sync. */
> extract_interface_info ();
> ! putback = inp->putback;
> restore_pending_input (inp->input);
> free_input (inp);
> }
> *************** end_input ()
> *** 149,165 ****
> static inline int
> sub_getch ()
> {
> ! if (putback_char != -1)
> {
> ! int ch = putback_char;
> ! putback_char = -1;
> return ch;
> }
> if (input)
> {
> if (input->offset >= input->length)
> {
> ! my_friendly_assert (putback_char == -1, 223);
> ++(input->offset);
> if (input->offset - input->length < 64)
> return EOF;
> --- 157,173 ----
> static inline int
> sub_getch ()
> {
> ! if (putback.index != -1)
> {
> ! int ch = putback.buffer[putback.index];
> ! --putback.index;
> return ch;
> }
> if (input)
> {
> if (input->offset >= input->length)
> {
> ! my_friendly_assert (putback.index == -1, 223);
> ++(input->offset);
> if (input->offset - input->length < 64)
> return EOF;
> *************** put_back (ch)
> *** 180,187 ****
> {
> if (ch != EOF)
> {
> ! my_friendly_assert (putback_char == -1, 224);
> ! putback_char = ch;
> }
> }
>
> --- 188,200 ----
> {
> if (ch != EOF)
> {
> ! if (putback.index == putback.buffer_size - 1)
> ! {
> ! putback.buffer_size += 16;
> ! putback.buffer = xrealloc (putback.buffer, putback.buffer_size);
> ! }
> ! my_friendly_assert (putback.buffer != NULL, 224);
> ! putback.buffer[++putback.index] = ch;
> }
> }
>
> Index: lex.c
> ===================================================================
> RCS file: /egcs/carton/cvsfiles/egcs/gcc/cp/lex.c,v
> retrieving revision 1.112
> diff -c -p -c -p -r1.112 lex.c
> *** lex.c 1999/05/24 00:46:52 1.112
> --- lex.c 1999/05/26 21:02:52
> *************** real_yylex ()
> *** 4074,4085 ****
> else
> {
> if (char_len == -1)
> ! warning ("Ignoring invalid multibyte character");
> ! if (wide_flag)
> ! c = wc;
> #ifdef MAP_CHARACTER
> ! else
> ! c = MAP_CHARACTER (c);
> #endif
> }
> #else /* ! MULTIBYTE_CHARS */
> --- 4074,4090 ----
> else
> {
> if (char_len == -1)
> ! {
> ! warning ("Ignoring invalid multibyte character");
> ! /* Replace all but the first byte. */
> ! for (--i; i > 1; --i)
> ! put_back (token_buffer[i]);
> ! wc = token_buffer[1];
> ! }
> #ifdef MAP_CHARACTER
> ! c = MAP_CHARACTER (wc);
> ! #else
> ! c = wc;
> #endif
> }
> #else /* ! MULTIBYTE_CHARS */
> *************** real_yylex ()
> *** 4203,4222 ****
> c = getch ();
> }
> if (char_len == -1)
> - warning ("Ignoring invalid multibyte character");
> - else
> {
> ! /* mbtowc sometimes needs an extra char before accepting */
> ! if (char_len <= i)
> ! put_back (c);
> ! if (! wide_flag)
> ! {
> ! p += (i + 1);
> ! c = getch ();
> ! continue;
> ! }
> ! c = wc;
> }
> #endif /* MULTIBYTE_CHARS */
> }
>
> --- 4208,4231 ----
> c = getch ();
> }
> if (char_len == -1)
> {
> ! warning ("Ignoring invalid multibyte character");
> ! /* Replace all except the first byte. */
> ! put_back (c);
> ! for (--i; i > 0; --i)
> ! put_back (p[i]);
> ! char_len = 1;
> ! }
> ! /* mbtowc sometimes needs an extra char before accepting */
> ! if (char_len <= i)
> ! put_back (c);
> ! if (! wide_flag)
> ! {
> ! p += (i + 1);
> ! c = getch ();
> ! continue;
> }
> + c = wc;
> #endif /* MULTIBYTE_CHARS */
> }
>
>
> --------------22196983260D8D98C779157E--
>
>
>
> --------------F133F5721FECA46830369BC4--
>