Quick 'n dirty charset support

Neil Booth neil@daikokuya.co.uk
Sun Apr 20 22:32:00 GMT 2003


This patch, which I'm not proposing applying, implements mapping
physical source file characters to the source character set (UTF-8).
With very limited testing, it appears to work on a recent Debian
machine of mine.

I happen to run that machine in the en_GB locale, for which
nl_langinfo (CODESET) returns ISO-8859-1, which means that every source
file read is now being converted (the patch elides C and UTF-8 locales).
This kinda sucks.

I'm looking for suggestions and ideas on where possibly to go next.
Having to deal with just UTF-8 in the lexer is attractive, but being
able to use ISO 8859 charsets without much iconv-ing would be nice too.

Neil.

Index: cppfiles.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/cppfiles.c,v
retrieving revision 1.167
diff -u -p -r1.167 cppfiles.c
--- cppfiles.c	19 Apr 2003 00:22:47 -0000	1.167
+++ cppfiles.c	20 Apr 2003 22:23:30 -0000
@@ -44,32 +44,6 @@ Foundation, 59 Temple Place - Suite 330,
 # define ENOTDIR 0
 #endif
 
-/* Suppress warning about function macros used w/o arguments in traditional
-   C.  It is unlikely that glibc's strcmp macro helps this file at all.  */
-#undef strcmp
-
-/* This structure is used for the table of all includes.  */
-struct include_file {
-  const char *name;		/* actual path name of file */
-  const char *header_name;	/* the original header found */
-  const cpp_hashnode *cmacro;	/* macro, if any, preventing reinclusion.  */
-  const struct cpp_path *foundhere;
-				/* location in search path where file was
-				   found, for #include_next and sysp.  */
-  const unsigned char *buffer;	/* pointer to cached file contents */
-  struct stat st;		/* copy of stat(2) data for file */
-  int fd;			/* fd open on file (short term storage only) */
-  int err_no;			/* errno obtained if opening a file failed */
-  unsigned short include_count;	/* number of times file has been read */
-  unsigned char pch;		/* 0: file not known to be a PCH.
-				   1: file is a PCH 
-				      (on return from find_include_file).
-				   2: file is not and never will be a valid
-				      precompiled header.
-				   3: file is always a valid precompiled
-				      header.  */
-};
-
 /* Variable length record files on VMS will have a stat size that includes
    record control characters that won't be included in the read size.  */
 #ifdef VMS
@@ -425,7 +399,7 @@ stack_include_file (pfile, inc)
     inc->include_count++;
 
   /* Push a buffer.  */
-  fp = cpp_push_buffer (pfile, inc->buffer, inc->st.st_size,
+  fp = cpp_push_buffer (pfile, (uchar *) inc->buffer, inc->st.st_size,
 			/* from_stage3 */ CPP_OPTION (pfile, preprocessed), 0);
   fp->inc = inc;
 
@@ -462,7 +436,7 @@ read_include_file (pfile, inc)
      struct include_file *inc;
 {
   ssize_t size, offset, count;
-  uchar *buf;
+  char *buf;
 
   if (S_ISREG (inc->st.st_mode))
     {
@@ -482,7 +456,7 @@ read_include_file (pfile, inc)
       size = inc->st.st_size;
 
 	{
-	  buf = (uchar *) xmalloc (size + 1);
+	  buf = xmalloc (size + 1);
 	  offset = 0;
 	  while (offset < size)
 	    {
@@ -501,8 +475,6 @@ read_include_file (pfile, inc)
 		}
 	      offset += count;
 	    }
-	  /* The lexer requires that the buffer be \n-terminated.  */
-	  buf[size] = '\n';
 	}
     }
   else if (S_ISBLK (inc->st.st_mode))
@@ -517,7 +489,7 @@ read_include_file (pfile, inc)
 	 bigger than the majority of C source files.  */
       size = 8 * 1024;
 
-      buf = (uchar *) xmalloc (size + 1);
+      buf = xmalloc (size + 1);
       offset = 0;
       while ((count = read (inc->fd, buf + offset, size - offset)) > 0)
 	{
@@ -535,12 +507,11 @@ read_include_file (pfile, inc)
 	buf = xrealloc (buf, offset + 1);
 
       /* The lexer requires that the buffer be \n-terminated.  */
-      buf[offset] = '\n';
       inc->st.st_size = offset;
     }
 
   inc->buffer = buf;
-  return 0;
+  return _cpp_convert_buffer (pfile, inc);
 
  perror_fail:
   cpp_errno (pfile, DL_ERROR, inc->name);
Index: cppcharset.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/cppcharset.c,v
retrieving revision 1.1
diff -u -p -r1.1 cppcharset.c
--- cppcharset.c	20 Apr 2003 07:29:20 -0000	1.1
+++ cppcharset.c	20 Apr 2003 22:23:30 -0000
@@ -27,6 +27,135 @@ Foundation, 59 Temple Place - Suite 330,
 
 static int ucn_valid_in_identifier PARAMS ((cpp_reader *, cppchar_t));
 
+#if HAVE_ICONV
+#include <langinfo.h>
+
+static int iconv_file PARAMS ((cpp_reader *, struct include_file *));
+static int codeset_needs_conversion PARAMS ((const char *));
+
+/* Convert inc->buffer to UTF-8 with the descriptor pfile->src_cd,
+   replacing inc->buffer and inc->st.st_size with the converted text.
+   The new buffer always has room for a terminating '\n'.  Returns
+   zero on success.  On a conversion error a diagnostic is issued,
+   the buffer is left empty and non-zero is returned.  */
+static int
+iconv_file (pfile, inc)
+     cpp_reader *pfile;
+     struct include_file *inc;
+{
+  size_t src_len, dest_len, dest_written, result;
+  size_t inbytes, outbytes;
+  char *src, *dest, *dest_buf;
+  int failed = 0;
+
+  src = inc->buffer;
+  src_len = inc->st.st_size;
+  /* An initial guess at a sufficiently big output buffer.  */
+  dest_len = src_len + src_len / 4 + 1000;
+  /* Allocate an extra byte for a terminating '\n'.  */
+  dest_buf = xmalloc (dest_len + 1);
+  dest = dest_buf, dest_written = 0;
+  inbytes = src_len;
+
+  /* Reset conversion state.  */
+  iconv (pfile->src_cd, NULL, NULL, NULL, NULL);
+
+  for (;;)
+    {
+      /* Convert the buffer.  */
+      outbytes = dest_len - dest_written;
+      result = iconv (pfile->src_cd, &src, &inbytes, &dest, &outbytes);
+      dest_written = dest_len - outbytes;
+      if (result != (size_t) -1)
+	break;
+
+      if (errno == E2BIG)
+	{
+	  /* Attempt to guess a new buffer size that will be just a
+	     bit too big for the rest of the input.  */
+	  dest_len = dest_len * (double) src_len / (src_len - inbytes);
+	  dest_len += dest_len / 20 + 1000;
+
+	  /* Allocate an extra byte for a terminating '\n'.  The buffer
+	     may have moved, so recompute the write position.  */
+	  dest_buf = xrealloc (dest_buf, dest_len + 1);
+	  dest = dest_buf + dest_written;
+	  continue;
+	}
+
+      /* Any other error is fatal for this file: report it, discard
+	 the partial output and tell the caller.  */
+      cpp_errno (pfile, DL_ERROR, inc->name);
+      dest_written = 0;
+      failed = 1;
+      break;
+    }
+
+  /* Free the original buffer.  */
+  free (inc->buffer);
+  inc->buffer = dest_buf;
+  inc->st.st_size = dest_written;
+  return failed;
+}
+
+/* Return non-zero if source text in CODESET, a name returned by
+   nl_langinfo (CODESET), has to be converted to UTF-8.  UTF-8 needs
+   no conversion, and the C locale is passed through untouched.
+   nl_langinfo reports the C locale under an ASCII codeset name rather
+   than as "C", so those names are recognized too.  */
+static int
+codeset_needs_conversion (codeset)
+     const char *codeset;
+{
+  static const char *const passthrough[] =
+    { "UTF-8", "C", "ANSI_X3.4-1968", "US-ASCII", "646" };
+  size_t i;
+
+  for (i = 0; i < sizeof passthrough / sizeof passthrough[0]; i++)
+    if (strcmp (codeset, passthrough[i]) == 0)
+      return 0;
+  return 1;
+}
+#endif
+
+/* Convert inc->buffer to UTF-8 if necessary.  On return the buffer is
+   guaranteed \n-terminated.  Returns non-zero if the conversion of
+   this file failed.  If no conversion descriptor can be opened, an
+   error is reported once and files are passed through unconverted.  */
+int
+_cpp_convert_buffer (pfile, inc)
+     cpp_reader *pfile ATTRIBUTE_UNUSED;
+     struct include_file *inc;
+{
+  int result = 0;
+
+#if HAVE_ICONV
+  if (pfile->iconv_state == 0)
+    {
+      const char *locale = nl_langinfo (CODESET);
+
+      /* Assume no conversion until a descriptor has been opened.  */
+      pfile->iconv_state = 2;
+
+      /* Elide iconv conversions for C and UTF-8 locales.  */
+      if (codeset_needs_conversion (locale))
+	{
+	  pfile->src_cd = iconv_open ("UTF-8", locale);
+	  if (pfile->src_cd == (iconv_t) -1)
+	    cpp_errno (pfile, DL_ERROR, progname);
+	  else
+	    pfile->iconv_state = 1;
+	}
+    }
+
+  if (pfile->iconv_state == 1)
+    result = iconv_file (pfile, inc);
+#endif
+
+  inc->buffer[inc->st.st_size] = '\n';
+  return result;
+}
+
 /* [lex.charset]: The character designated by the universal character
    name \UNNNNNNNN is that character whose character short name in
    ISO/IEC 10646 is NNNNNNNN; the character designated by the
Index: cpphash.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/cpphash.h,v
retrieving revision 1.185
diff -u -p -r1.185 cpphash.h
--- cpphash.h	20 Apr 2003 19:02:53 -0000	1.185
+++ cpphash.h	20 Apr 2003 22:23:30 -0000
@@ -23,6 +23,9 @@ Foundation, 59 Temple Place - Suite 330,
 #ifndef GCC_CPPHASH_H
 #define GCC_CPPHASH_H
 
+#if HAVE_ICONV
+#include <iconv.h>
+#endif
 #include "hashtable.h"
 
 struct directive;		/* Deliberately incomplete.  */
@@ -370,6 +373,16 @@ struct cpp_reader
      for include files.  (Altered as we get more of them.)  */
   unsigned int max_include_len;
 
+#if HAVE_ICONV
+  /* A tristate variable.  If 0, the need for iconv is untested.  If
+     1, use iconv to convert input files to UTF-8 with the open
+     descriptor src_cd.  If 2, input files are not converted.  */
+  uchar iconv_state;
+
+  /* The opened descriptor to use on each input file.  */
+  iconv_t src_cd;
+#endif
+
   /* Date and time text.  Calculated together if either is requested.  */
   const uchar *date;
   const uchar *time;
@@ -438,6 +451,29 @@ struct cpp_reader
   struct cpp_savedstate *savedstate;
 };
 
+/* Entry in the table of all includes; also used by cppcharset.c.  */
+struct include_file
+{
+  const char *name;		/* actual path name of file */
+  const char *header_name;	/* the original header found */
+  const cpp_hashnode *cmacro;	/* macro, if any, preventing reinclusion.  */
+  const struct cpp_path *foundhere;
+				/* location in search path where file was
+				   found, for #include_next and sysp.  */
+  char *buffer;			/* cached UTF-8 file contents, \n-terminated */
+  struct stat st;		/* stat(2) data; st_size is buffer's length */
+  int fd;			/* fd open on file (short term storage only) */
+  int err_no;			/* errno obtained if opening a file failed */
+  unsigned short include_count;	/* number of times file has been read */
+  unsigned char pch;		/* 0: file not known to be a PCH.
+				   1: file is a PCH
+				      (on return from find_include_file).
+				   2: file is not and never will be a valid
+				      precompiled header.
+				   3: file is always a valid precompiled
+				      header.  */
+};
+
 /* Character classes.  Based on the more primitive macros in safe-ctype.h.
    If the definition of `numchar' looks odd to you, please look up the
    definition of a pp-number in the C standard [section 6.4.8 of C99].
@@ -555,8 +591,9 @@ extern uchar *_cpp_copy_replacement_text
 extern size_t _cpp_replacement_text_len PARAMS ((const cpp_macro *));
 
 /* In cppcharset.c.  */
-cppchar_t _cpp_valid_ucn PARAMS ((cpp_reader *, const uchar **,
-				  int identifer_p));
+extern cppchar_t _cpp_valid_ucn PARAMS ((cpp_reader *, const uchar **,
+					 int identifer_p));
+extern int _cpp_convert_buffer PARAMS ((cpp_reader *, struct include_file *));
 
 /* Utility routines and macros.  */
 #define DSC(str) (const uchar *)str, sizeof str - 1



More information about the Gcc-patches mailing list