This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

Patch: iconv conversion for gcj


Here's a revised version of my gcj iconv patch.  It fixes all the
known bugs in the previous version of the patch, plus some lurking
bugs that already existed in the lexer.  (Notably, the patch changes
gcj to react gracefully if UEOF is reached in the middle of a string
literal.)

There is a corresponding (trivial) gcc configury patch which checks
for the iconv function.  If iconv is not found, gcj assumes that all
source files are UTF-8 encoded and will give an error if an invalid
encoding is found.

I'll submit the configure patch once this one is accepted.  I can send
it to you if you need it for testing though.  I also have a simple
test case that you can use to see the patch in action.  (There might
also be a test case in PR gcj/33, but I haven't looked recently.)

Ok to commit?

2000-08-26  Tom Tromey  <tromey@cygnus.com>

	Fix for PR gcj/33:
	* jv-scan.c (help): Document --encoding.
	(options): Added `encoding' entry.
	(OPT_ENCODING): New define.
	(main): Handle --encoding.
	Include <langinfo.h> if nl_langinfo exists.
	* lang-options.h: Document --classpath, --CLASSPATH, --main, and
	--encoding.
	* jcf-parse.c: Include <langinfo.h> if we have nl_langinfo.
	(parse_source_file): Correctly call java_init_lex.  Added `finput'
	argument.  Use nl_langinfo to determine default encoding.
	* java-tree.h (current_encoding): Declare.
	* parse.y (java_parser_context_restore_global): Don't restore
	`finput'.
	(java_parser_context_save_global): Don't set `finput' field.
	(java_pop_parser_context): Don't restore `finput'.  Free old lexer
	if required.
	* lang.c (current_encoding): New global.
	(lang_decode_option): Recognize `-fencoding='.
	(finish_parse): Don't close finput.
	* parse.h (struct parser_ctxt): Removed `finput' and
	`unget_utf8_value' fields.  Added `lexer' field.
	(java_init_lex): Fixed declaration.
	* lex.c (java_new_lexer): New function.
	(java_destroy_lexer): Likewise.
	(java_read_char): Added `lex' argument.  Handle iconv case.
	(java_read_unicode): Added `lex' argument.  Count backslashes in
	lexer structure.
	(java_init_lex): Added `finput' and `encoding' arguments.  Set
	`lexer' field in ctxp.
	(BAD_UTF8_VALUE): Removed.
	(java_lex): Handle seeing UEOF in the middle of a string literal.
	* lex.h: Include <iconv.h> if HAVE_ICONV defined.
	(java_lexer): New structure.
	(UNGETC): Removed.
	(GETC): Removed.
	(DEFAULT_ENCODING): New define.
	(java_destroy_lexer): Declare.

Tom

Index: java-tree.h
===================================================================
RCS file: /cvs/gcc/egcs/gcc/java/java-tree.h,v
retrieving revision 1.79
diff -u -r1.79 java-tree.h
--- java-tree.h	2000/08/24 01:44:00	1.79
+++ java-tree.h	2000/08/26 18:00:07
@@ -169,6 +169,9 @@
    object to its synchronization structure.  */
 extern int flag_hash_synchronization;
 
+/* Encoding used for source files.  */
+extern char *current_encoding;
+
 /* The Java .class file that provides main_class;  the main input file. */
 extern struct JCF *current_jcf;
 
Index: jcf-parse.c
===================================================================
RCS file: /cvs/gcc/egcs/gcc/java/jcf-parse.c,v
retrieving revision 1.51
diff -u -r1.51 jcf-parse.c
--- jcf-parse.c	2000/08/11 22:01:37	1.51
+++ jcf-parse.c	2000/08/26 18:00:08
@@ -35,6 +35,10 @@
 #include "toplev.h"
 #include "parse.h"
 
+#ifdef HAVE_NL_LANGINFO
+#include <langinfo.h>
+#endif
+
 /* A CONSTANT_Utf8 element is converted to an IDENTIFIER_NODE at parse time. */
 #define JPOOL_UTF(JCF, INDEX) CPOOL_UTF(&(JCF)->cpool, INDEX)
 #define JPOOL_UTF_LENGTH(JCF, INDEX) IDENTIFIER_LENGTH (JPOOL_UTF (JCF, INDEX))
@@ -83,7 +87,7 @@
 static tree give_name_to_class PARAMS ((JCF *jcf, int index));
 static void parse_zip_file_entries PARAMS ((void));
 static void process_zip_dir PARAMS ((void));
-static void parse_source_file PARAMS ((tree));
+static void parse_source_file PARAMS ((tree, FILE *));
 static void jcf_parse_source PARAMS ((void));
 static int jcf_figure_file_type PARAMS ((JCF *));
 static int find_in_current_zip PARAMS ((const char *, struct JCF **));
@@ -564,6 +568,7 @@
 jcf_parse_source ()
 {
   tree file;
+  FILE *finput;
 
   java_parser_context_save_global ();
   java_push_parser_context ();
@@ -576,7 +581,7 @@
       if (!(finput = fopen (input_filename, "r")))
 	fatal ("input file `%s' just disappeared - jcf_parse_source",
 	       input_filename);
-      parse_source_file (file);
+      parse_source_file (file, finput);
       if (fclose (finput))
 	fatal ("can't close input file `%s' stream - jcf_parse_source",
 	       input_filename);
@@ -754,8 +759,9 @@
 /* Parse a source file, as pointed by the current value of INPUT_FILENAME. */
 
 static void
-parse_source_file (file)
+parse_source_file (file, finput)
      tree file;
+     FILE *finput;
 {
   int save_error_count = java_error_count;
   /* Mark the file as parsed */
@@ -764,8 +770,22 @@
   jcf_dependency_add_file (input_filename, 0);
 
   lang_init_source (1);		    /* Error msgs have no method prototypes */
+
+  /* There's no point in trying to find the current encoding unless we
+     are going to do something intelligent with it -- hence the test
+     for iconv.  */
+#ifdef HAVE_ICONV
+#ifdef HAVE_NL_LANGINFO
+  setlocale (LC_CTYPE, "");
+  if (current_encoding == NULL)
+    current_encoding = nl_langinfo (CODESET);
+#endif /* HAVE_NL_LANGINFO */
+#endif /* HAVE_ICONV */
+  if (current_encoding == NULL || *current_encoding == '\0')
+    current_encoding = DEFAULT_ENCODING;
 
-  java_init_lex ();		    /* Initialize the parser */
+  /* Initialize the parser */
+  java_init_lex (finput, current_encoding);
   java_parse_abort_on_error ();
 
   java_parse ();		    /* Parse and build partial tree nodes. */
@@ -796,6 +816,7 @@
   int several_files = 0;
   char *list = xstrdup (input_filename), *next;
   tree node, current_file_list = NULL_TREE;
+  FILE *finput;
 
   do 
     {
@@ -901,7 +922,7 @@
 	case JCF_SOURCE:
 	  java_push_parser_context ();
 	  java_parser_context_save_global ();
-	  parse_source_file (name);
+	  parse_source_file (name, finput);
 	  java_parser_context_restore_global ();
 	  java_pop_parser_context (1);
 	  break;
Index: jv-scan.c
===================================================================
RCS file: /cvs/gcc/egcs/gcc/java/jv-scan.c,v
retrieving revision 1.17
diff -u -r1.17 jv-scan.c
--- jv-scan.c	2000/02/18 12:26:50	1.17
+++ jv-scan.c	2000/08/26 18:00:08
@@ -26,6 +26,10 @@
 
 #include "version.h"
 
+#ifdef HAVE_NL_LANGINFO
+#include <langinfo.h>
+#endif
+
 #include <getopt.h>
 
 void fatal PARAMS ((const char *s, ...)) ATTRIBUTE_PRINTF_1 ATTRIBUTE_NORETURN;
@@ -61,6 +65,7 @@
 
 #define OPT_HELP      LONG_OPT (0)
 #define OPT_VERSION   LONG_OPT (1)
+#define OPT_ENCODING  LONG_OPT (2)
 
 static struct option options[] =
 {
@@ -69,6 +74,7 @@
   { "print-main", no_argument,      &flag_find_main, 1 },
   { "list-filename", no_argument,   &flag_list_filename, 1 },
   { "list-class", no_argument,      &flag_dump_class, 1 },
+  { "encoding",  required_argument, NULL, OPT_ENCODING },
   { NULL,        no_argument,       NULL, 0 }
 };
 
@@ -84,6 +90,7 @@
 {
   printf ("Usage: jv-scan [OPTION]... FILE...\n\n");
   printf ("Print useful information read from Java source files.\n\n");
+  printf ("  --encoding NAME         Specify encoding of input file\n");
   printf ("  --print-main            Print name of class containing `main'\n");
   printf ("  --list-class            List all classes defined in file\n");
   printf ("  --list-filename         Print input filename when listing class names\n");
@@ -114,6 +121,7 @@
 {
   int i = 1;
   const char *output_file = NULL;
+  const char *encoding = NULL;
   long ft;
   int opt;
 
@@ -144,6 +152,10 @@
 	  version ();
 	  break;
 
+	case OPT_ENCODING:
+	  encoding = optarg;
+	  break;
+
 	default:
 	  usage ();
 	  break;
@@ -172,7 +184,20 @@
 	input_filename = argv [i];
 	if ( (finput = fopen (argv [i], "r")) )
 	  {
-	    java_init_lex ();
+	    /* There's no point in trying to find the current encoding
+	       unless we are going to do something intelligent with it
+	       -- hence the test for iconv.  */
+#ifdef HAVE_ICONV
+#ifdef HAVE_NL_LANGINFO
+	    setlocale (LC_CTYPE, "");
+	    if (encoding == NULL)
+	      encoding = nl_langinfo (CODESET);
+#endif /* HAVE_NL_LANGINFO */
+#endif /* HAVE_ICONV */
+	    if (encoding == NULL || *encoding == '\0')
+	      encoding = DEFAULT_ENCODING;
+
+	    java_init_lex (finput, encoding);
 	    yyparse ();
 	    if (ftell (out) != ft)
 	      fputc ('\n', out);
Index: lang-options.h
===================================================================
RCS file: /cvs/gcc/egcs/gcc/java/lang-options.h,v
retrieving revision 1.16
diff -u -r1.16 lang-options.h
--- lang-options.h	2000/08/24 01:44:00	1.16
+++ lang-options.h	2000/08/26 18:00:08
@@ -42,8 +42,10 @@
   { "-M", "Print dependencies to stdout" },
   { "-MM", "Print dependencies to stdout" },
 #endif /* ! USE_CPPLIB */
-  { "-fclasspath", "Set class path and suppress system path" },
-  { "-fCLASSPATH", "Set class path" },
+  { "--classpath", "Set class path and suppress system path" },
+  { "--CLASSPATH", "Set class path" },
+  { "--main", "Choose class whose main method should be used" },
+  { "--encoding", "Choose input encoding (default is UTF-8)" },
   { "-I", "Add directory to class path" },
   { "-foutput-class-dir", "Directory where class files should be written" },
   { "-fuse-divide-subroutine", "" },
Index: lang.c
===================================================================
RCS file: /cvs/gcc/egcs/gcc/java/lang.c,v
retrieving revision 1.47
diff -u -r1.47 lang.c
--- lang.c	2000/08/24 20:34:39	1.47
+++ lang.c	2000/08/26 18:00:08
@@ -121,6 +121,9 @@
    JNI, not CNI.  */
 int flag_jni = 0;
 
+/* The encoding of the source file.  */
+char *current_encoding = NULL;
+
 /* When non zero, report the now deprecated empty statements.  */
 int flag_extraneous_semicolon;
 
@@ -222,6 +225,13 @@
       return 1;
     }
 #undef ARG
+#define ARG "-fencoding="
+  if (strncmp (p, ARG, sizeof (ARG) - 1) == 0)
+    {
+      current_encoding = p + sizeof (ARG) - 1;
+      return 1;
+    }
+#undef ARG
 
   if (p[0] == '-' && p[1] == 'f')
     {
@@ -309,7 +319,9 @@
   return 0;
 }
 
+/* Global open file.  */
 FILE *finput;
+
 const char *
 init_parse (filename)
      const char *filename;
@@ -362,6 +374,7 @@
 	    }
 	}
     }
+
   init_lex ();
 
   return filename;
@@ -370,7 +383,6 @@
 void
 finish_parse ()
 {
-  fclose (finput);
   jcf_dependency_write ();
 }
 
Index: lex.c
===================================================================
RCS file: /cvs/gcc/egcs/gcc/java/lex.c,v
retrieving revision 1.40
diff -u -r1.40 lex.c
--- lex.c	2000/06/29 17:03:49	1.40
+++ lex.c	2000/08/26 18:00:09
@@ -24,15 +24,15 @@
 The Free Software Foundation is independent of Sun Microsystems, Inc.  */
 
 /* It defines java_lex (yylex) that reads a Java ASCII source file
-possibly containing Unicode escape sequence or utf8 encoded characters
-and returns a token for everything found but comments, white spaces
-and line terminators. When necessary, it also fills the java_lval
-(yylval) union. It's implemented to be called by a re-entrant parser
-generated by Bison.
-
-The lexical analysis conforms to the Java grammar described in "The
-Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele.
-Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html)  */
+   possibly containing Unicode escape sequence or utf8 encoded
+   characters and returns a token for everything found but comments,
+   white spaces and line terminators. When necessary, it also fills
+   the java_lval (yylval) union. It's implemented to be called by a
+   re-entrant parser generated by Bison.
+
+   The lexical analysis conforms to the Java grammar described in "The
+   Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele.
+   Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */
 
 #include "keyword.h"
 
@@ -55,15 +55,18 @@
 static int java_parse_doc_section PARAMS ((unicode_t));
 static void java_parse_end_comment PARAMS ((unicode_t));
 static unicode_t java_get_unicode PARAMS ((void));
-static unicode_t java_read_unicode PARAMS ((int, int *));
+static unicode_t java_read_unicode PARAMS ((java_lexer *, int, int *));
 static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int));
-static unicode_t java_read_char PARAMS ((void));
+static unicode_t java_read_char PARAMS ((java_lexer *));
 static void java_allocate_new_line PARAMS ((void));
 static void java_unget_unicode PARAMS ((void));
 static unicode_t java_sneak_unicode PARAMS ((void));
+java_lexer *java_new_lexer PARAMS ((FILE *, const char *));
 
 void
-java_init_lex ()
+java_init_lex (finput, encoding)
+     FILE *finput;
+     const char *encoding;
 {
 #ifndef JC1_LITE
   int java_lang_imported = 0;
@@ -112,9 +115,9 @@
   ctxp->lineno = lineno = 0;
   ctxp->p_line = NULL;
   ctxp->c_line = NULL;
-  ctxp->unget_utf8_value = 0;
   ctxp->minus_seen = 0;
   ctxp->java_error_flag = 0;
+  ctxp->lexer = java_new_lexer (finput, encoding);
 }
 
 static char *
@@ -192,59 +195,180 @@
   ctxp->c_line->white_space_only = 1;
 }
 
-#define BAD_UTF8_VALUE 0xFFFE
+/* Create a new lexer object.  */
+java_lexer *
+java_new_lexer (finput, encoding)
+     FILE *finput;
+     const char *encoding;
+{
+  java_lexer *lex = (java_lexer *) xmalloc (sizeof (java_lexer));
+  int enc_error = 0;
 
-static unicode_t
-java_read_char ()
+  lex->finput = finput;
+  lex->bs_count = 0;
+  lex->unget_value = 0;
+
+#ifdef HAVE_ICONV
+  lex->handle = iconv_open ("UCS-2", encoding);
+  if (lex->handle == (iconv_t) -1)
+    {
+      /* FIXME: we should give a nice error based on errno here.  */
+      enc_error = 1;
+    }
+  lex->first = -1;
+  lex->last = -1;
+#else /* HAVE_ICONV */
+  if (strcmp (encoding, DEFAULT_ENCODING))
+    enc_error = 1;
+#endif /* HAVE_ICONV */
+
+  if (enc_error)
+    fatal ("unknown encoding: `%s'", encoding);
+
+  return lex;
+}
+
+void
+java_destroy_lexer (lex)
+     java_lexer *lex;
 {
-  int c;
-  int c1, c2;
+#ifdef HAVE_ICONV
+  iconv_close (lex->handle);
+#endif
+  free (lex);
+}
 
-  if (ctxp->unget_utf8_value)
+static unicode_t
+java_read_char (lex)
+     java_lexer *lex;
+{
+  if (lex->unget_value)
     {
-      int to_return = ctxp->unget_utf8_value;
-      ctxp->unget_utf8_value = 0;
-      return (to_return);
+      unicode_t r = lex->unget_value;
+      lex->unget_value = 0;
+      return r;
     }
 
-  c = GETC ();
+#ifdef HAVE_ICONV
+  {
+    char out[2];
+    size_t ir, inbytesleft, in_save, out_count;
+    char *inp, *outp;
 
-  if (c < 128)
-    return (unicode_t)c;
-  if (c == EOF)
-    return UEOF;
-  else
-    {
-      if ((c & 0xe0) == 0xc0)
-        {
-          c1 = GETC ();
-	  if ((c1 & 0xc0) == 0x80)
-	    return (unicode_t)(((c &0x1f) << 6) + (c1 & 0x3f));
-	  c = c1;
-	}
-      else if ((c & 0xf0) == 0xe0)
-        {
-          c1 = GETC ();
-	  if ((c1 & 0xc0) == 0x80)
-	    {
-	      c2 = GETC ();
-	      if ((c2 & 0xc0) == 0x80)
-	        return (unicode_t)(((c & 0xf) << 12) + 
-				   (( c1 & 0x3f) << 6) + (c2 & 0x3f));
-	      else
-		c = c2;
-	    }
-	  else
+    while (1)
+      {
+	/* See if we need to read more data.  If FIRST == 0 then the
+	   previous conversion attempt ended in the middle of a
+	   character at the end of the buffer.  Otherwise we only have
+	   to read if the buffer is empty.  */
+	if (lex->first == 0 || lex->first >= lex->last)
+	  {
+	    int r;
+
+	    if (lex->first >= lex->last)
+	      {
+		lex->first = 0;
+		lex->last = 0;
+	      }
+	    if (feof (lex->finput))
+	      return UEOF;
+	    r = fread (&lex->buffer[lex->last], 1,
+		       sizeof (lex->buffer) - lex->last,
+		       lex->finput);
+	    lex->last += r;
+	  }
+
+	inbytesleft = lex->last - lex->first;
+
+	if (inbytesleft == 0)
+	  {
+	    /* We've tried to read and there is nothing left.  */
+	    return UEOF;
+	  }
+
+	in_save = inbytesleft;
+	out_count = 2;
+	inp = &lex->buffer[lex->first];
+	outp = out;
+	ir = iconv (lex->handle, (const char **) &inp, &inbytesleft,
+		    &outp, &out_count);
+	lex->first += in_save - inbytesleft;
+
+	if (out_count == 0)
+	  {
+	    /* Success.  We assume that UCS-2 is big-endian.  This
+	       appears to be an ok assumption.  */
+	    unicode_t result;
+	    result = (((unsigned char) out[0]) << 8) | (unsigned char) out[1];
+	    return result;
+	  }
+
+	if (ir == (size_t) -1)
+	  {
+	    if (errno == EINVAL)
+	      {
+		/* This is ok.  This means that the end of our buffer
+		   is in the middle of a character sequence.  We just
+		   move the valid part of the buffer to the beginning
+		   to force a read.  */
+		/* We use bcopy() because it should work for
+		   overlapping strings.  Use memmove() instead... */
+		bcopy (&lex->buffer[lex->first], &lex->buffer[0],
+		       lex->last - lex->first);
+		lex->last -= lex->first;
+		lex->first = 0;
+	      }
+	    else
+	      {
+		/* A more serious error.  */
+		java_lex_error ("unrecognized character in input stream", 0);
+		return UEOF;
+	      }
+	  }
+      }
+  }
+#else /* HAVE_ICONV */
+  {
+    int c, c1, c2;
+    c = getc (lex->finput);
+
+    if (c < 128)
+      return (unicode_t)c;
+    if (c == EOF)
+      return UEOF;
+    else
+      {
+	if ((c & 0xe0) == 0xc0)
+	  {
+	    c1 = getc (lex->finput);
+	    if ((c1 & 0xc0) == 0x80)
+	      return (unicode_t)(((c &0x1f) << 6) + (c1 & 0x3f));
 	    c = c1;
-	}
-      /* We looked for a UTF8 multi-byte sequence (since we saw an initial
-	 byte with the high bit set), but found invalid bytes instead.
-	 If the most recent byte was Ascii (and not EOF), we should
-	 unget it, in case it was a comment terminator or other delimitor. */
-      if ((c & 0x80) == 0)
-	UNGETC (c);
-      return BAD_UTF8_VALUE;
-    }
+	  }
+	else if ((c & 0xf0) == 0xe0)
+	  {
+	    c1 = getc (lex->finput);
+	    if ((c1 & 0xc0) == 0x80)
+	      {
+		c2 = getc (lex->finput);
+		if ((c2 & 0xc0) == 0x80)
+		  return (unicode_t)(((c & 0xf) << 12) + 
+				     (( c1 & 0x3f) << 6) + (c2 & 0x3f));
+		else
+		  c = c2;
+	      }
+	    else
+	      c = c1;
+	  }
+
+	/* We simply don't support invalid characters.  */
+	java_lex_error ("malformed UTF-8 character", 0);
+      }
+  }
+#endif /* HAVE_ICONV */
+
+  /* We only get here on error.  */
+  return UEOF;
 }
 
 static void
@@ -265,56 +389,54 @@
 }
 
 static unicode_t
-java_read_unicode (term_context, unicode_escape_p)
-    int term_context;
-    int *unicode_escape_p;
+java_read_unicode (lex, term_context, unicode_escape_p)
+     java_lexer *lex;
+     int term_context;
+     int *unicode_escape_p;
 {
   unicode_t c;
-  long i, base;
 
-  c = java_read_char ();
+  c = java_read_char (lex);
   *unicode_escape_p = 0;
 
   if (c != '\\')
-    return ((term_context ? c : 
-	     java_lineterminator (c) ? '\n' : (unicode_t)c));
-
-  /* Count the number of preceeding '\' */
-  for (base = ftell (finput), i = base-2; c == '\\';)
-    { 
-      fseek (finput, i--, SEEK_SET);
-      c = java_read_char ();	/* Will fail if reading utf8 stream. FIXME */
+    {
+      lex->bs_count = 0;
+      return (term_context ? c : (java_lineterminator (c)
+				  ? '\n'
+				  : (unicode_t) c));
     }
-  fseek (finput, base, SEEK_SET);
-  if ((base-i-3)%2 == 0)	/* If odd number of \ seen */
+
+  ++lex->bs_count;
+  if ((lex->bs_count) % 2 == 1)
     {
-      c = java_read_char ();
+      /* Odd number of \ seen.  */
+      c = java_read_char (lex);
       if (c == 'u')
         {
-	  unsigned short unicode = 0;
+	  unicode_t unicode = 0;
 	  int shift = 12;
 	  /* Next should be 4 hex digits, otherwise it's an error.
 	     The hex value is converted into the unicode, pushed into
 	     the Unicode stream.  */
 	  for (shift = 12; shift >= 0; shift -= 4)
 	    {
-	      if ((c = java_read_char ()) == UEOF)
+	      if ((c = java_read_char (lex)) == UEOF)
 	        return UEOF;
 	      if (c >= '0' && c <= '9')
 		unicode |= (unicode_t)((c-'0') << shift);
 	      else if ((c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'))
 	        unicode |= (unicode_t)((10+(c | 0x20)-'a') << shift);
 	      else
-		  java_lex_error 
-		    ("Non hex digit in Unicode escape sequence", 0);
+		java_lex_error ("Non hex digit in Unicode escape sequence", 0);
 	    }
 	  *unicode_escape_p = 1;
-	  return (term_context ? unicode :
-		  (java_lineterminator (c) ? '\n' : unicode));
+	  return (term_context
+		  ? unicode : (java_lineterminator (c) ? '\n' : unicode));
 	}
-      ctxp->unget_utf8_value = c;
+      lex->unget_value = c;
     }
-  return (unicode_t)'\\';
+  return (unicode_t) '\\';
 }
 
 static unicode_t
@@ -329,7 +451,7 @@
 	for (;;)
 	  {
 	    int unicode_escape_p;
-	    c = java_read_unicode (0, &unicode_escape_p);
+	    c = java_read_unicode (ctxp->lexer, 0, &unicode_escape_p);
 	    java_store_unicode (ctxp->c_line, c, unicode_escape_p);
 	    if (ctxp->c_line->white_space_only 
 		&& !JAVA_WHITE_SPACE_P (c) && c!='\n')
@@ -352,7 +474,7 @@
   else if (c == '\r')		/* CR */
     {
       int unicode_escape_p;
-      c = java_read_unicode (1, &unicode_escape_p);
+      c = java_read_unicode (ctxp->lexer, 1, &unicode_escape_p);
       if (c == '\r')
 	{
 	  /* In this case we will have another terminator.  For some
@@ -361,7 +483,7 @@
 	     up in the actual text of the line, causing an error.  So
 	     instead we choose a very low-level method.  FIXME: this
 	     is incredibly ugly.  */
-	  UNGETC (c);
+	  ctxp->lexer->unget_value = c;
 	}
       else if (c != '\n')
 	{
@@ -937,7 +1059,7 @@
       char *string;
 
       for (no_error = 1, c = java_get_unicode (); 
-	   c != '"' && c != '\n'; c = java_get_unicode ())
+	   c != UEOF && c != '"' && c != '\n'; c = java_get_unicode ())
 	{
 	  if (c == '\\')
 	    c = java_parse_escape_sequence ();
Index: lex.h
===================================================================
RCS file: /cvs/gcc/egcs/gcc/java/lex.h,v
retrieving revision 1.15
diff -u -r1.15 lex.h
--- lex.h	2000/02/17 04:19:34	1.15
+++ lex.h	2000/08/26 18:00:10
@@ -35,6 +35,13 @@
 /* A Unicode character, as read from the input file  */
 typedef unsigned short unicode_t;
 
+#ifdef HAVE_ICONV
+#include <iconv.h>
+#endif /* HAVE_ICONV */
+
+/* Default encoding to use if no encoding is specified.  */
+#define DEFAULT_ENCODING "UTF-8"
+
 /* Debug macro to print-out what we match  */
 #ifdef JAVA_LEX_DEBUG
 #ifdef JAVA_LEX_DEBUG_CHAR
@@ -96,12 +103,38 @@
   int col;
 } java_lc;
 
+typedef struct java_lexer
+{
+  /* The file from which we're reading.  */
+  FILE *finput;
+
+  /* Number of consecutive backslashes we've read.  */
+  int bs_count;
+
+  /* If nonzero, a value that was pushed back.  */
+  unicode_t unget_value;
+
+#ifdef HAVE_ICONV
+  /* The handle for the iconv converter we're using.  */
+  iconv_t handle;
+
+  /* Bytes we've read from the file but have not sent to iconv.  */
+  char buffer[1024];
+
+  /* Index of first valid character in buffer, -1 if no valid
+     characters.  */
+  int first;
+
+  /* Index of last valid character in buffer, plus one.  -1 if no
+     valid characters in buffer.  */
+  int last;
+#endif /* HAVE_ICONV */
+} java_lexer;
 
-#define JAVA_LINE_MAX 80
+/* Destroy a lexer object.  */
+extern void java_destroy_lexer PARAMS ((java_lexer *));
 
-/* Macro to read and unread bytes */
-#define UNGETC(c) ungetc(c, finput)
-#define GETC()    getc(finput)
+#define JAVA_LINE_MAX 80
 
 /* Build a location compound integer */
 #define BUILD_LOCATION() ((ctxp->elc.line << 12) | (ctxp->elc.col & 0xfff))
Index: parse.h
===================================================================
RCS file: /cvs/gcc/egcs/gcc/java/parse.h,v
retrieving revision 1.57
diff -u -r1.57 parse.h
--- parse.h	2000/08/11 22:01:37	1.57
+++ parse.h	2000/08/26 18:00:11
@@ -728,13 +728,12 @@
 struct parser_ctxt {
 
   const char *filename;		    /* Current filename */
-  FILE *finput;			    /* Current file input stream */
   struct parser_ctxt *next;
 
+  java_lexer *lexer;		     /* Current lexer state */
   char marker_begining;		     /* Marker. Should be a sub-struct */
   struct java_line *p_line, *c_line; /* Previous and current line */
   java_lc elc;			     /* Error's line column info */
-  unicode_t unget_utf8_value;        /* An unget utf8 value */
   int ccb_indent;		     /* Keep track of {} indent, lexer */
   int first_ccb_indent1;	     /* First { at ident level 1 */
   int last_ccb_indent1;		     /* Last } at ident level 1 */
@@ -928,7 +927,7 @@
 /* Always in use, no matter what you compile */
 void java_push_parser_context PARAMS ((void));
 void java_pop_parser_context PARAMS ((int));
-void java_init_lex PARAMS ((void));
+void java_init_lex PARAMS ((FILE *, const char *));
 extern void java_parser_context_save_global PARAMS ((void));
 extern void java_parser_context_restore_global PARAMS ((void));
 int yyparse PARAMS ((void));
Index: parse.y
===================================================================
RCS file: /cvs/gcc/egcs/gcc/java/parse.y,v
retrieving revision 1.204
diff -u -r1.204 parse.y
--- parse.y	2000/08/24 20:34:39	1.204
+++ parse.y	2000/08/26 18:00:22
@@ -2615,10 +2615,13 @@
       next->incomplete_class = ctxp->incomplete_class;
       next->gclass_list = ctxp->gclass_list;
       lineno = ctxp->lineno;
-      finput = ctxp->finput;
       current_class = ctxp->current_class;
     }
 
+  /* If the old and new lexers differ, then free the old one.  */
+  if (ctxp->lexer && next && ctxp->lexer != next->lexer)
+    java_destroy_lexer (ctxp->lexer);
+
   /* Set the single import class file flag to 0 for the current list
      of imported things */
   for (current = ctxp->import_list; current; current = TREE_CHAIN (current))
@@ -2658,7 +2661,6 @@
   else if (ctxp->saved_data)
     create_new_parser_context (1);
 
-  ctxp->finput = finput;
   ctxp->lineno = lineno;
   ctxp->current_class = current_class;
   ctxp->filename = input_filename;
@@ -2672,7 +2674,6 @@
 void
 java_parser_context_restore_global ()
 {
-  finput = ctxp->finput;
   lineno = ctxp->lineno;
   current_class = ctxp->current_class;
   input_filename = ctxp->filename;

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]