Quick 'n dirty charset support
Neil Booth
neil@daikokuya.co.uk
Sun Apr 20 22:32:00 GMT 2003
This patch, which I'm not proposing applying, implements mapping
physical source file characters to the source character set (UTF-8).
With very limited testing, it appears to work on a recent Debian
machine of mine.
I happen to run that machine in the en_GB locale, for which nl_langinfo (CODESET)
reports ISO-8859-1, which means that every source file read is now
being converted (the patch elides C and UTF-8 locales). This kinda sucks.
I'm looking for suggestions and ideas on where possibly to go next.
Having to deal with just UTF-8 in the lexer is attractive, but being
able to use ISO 8859 charsets without much iconv-ing would be nice too.
Neil.
Index: cppfiles.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/cppfiles.c,v
retrieving revision 1.167
diff -u -p -r1.167 cppfiles.c
--- cppfiles.c 19 Apr 2003 00:22:47 -0000 1.167
+++ cppfiles.c 20 Apr 2003 22:23:30 -0000
@@ -44,32 +44,6 @@ Foundation, 59 Temple Place - Suite 330,
# define ENOTDIR 0
#endif
-/* Suppress warning about function macros used w/o arguments in traditional
- C. It is unlikely that glibc's strcmp macro helps this file at all. */
-#undef strcmp
-
-/* This structure is used for the table of all includes. */
-struct include_file {
- const char *name; /* actual path name of file */
- const char *header_name; /* the original header found */
- const cpp_hashnode *cmacro; /* macro, if any, preventing reinclusion. */
- const struct cpp_path *foundhere;
- /* location in search path where file was
- found, for #include_next and sysp. */
- const unsigned char *buffer; /* pointer to cached file contents */
- struct stat st; /* copy of stat(2) data for file */
- int fd; /* fd open on file (short term storage only) */
- int err_no; /* errno obtained if opening a file failed */
- unsigned short include_count; /* number of times file has been read */
- unsigned char pch; /* 0: file not known to be a PCH.
- 1: file is a PCH
- (on return from find_include_file).
- 2: file is not and never will be a valid
- precompiled header.
- 3: file is always a valid precompiled
- header. */
-};
-
/* Variable length record files on VMS will have a stat size that includes
record control characters that won't be included in the read size. */
#ifdef VMS
@@ -425,7 +399,7 @@ stack_include_file (pfile, inc)
inc->include_count++;
/* Push a buffer. */
- fp = cpp_push_buffer (pfile, inc->buffer, inc->st.st_size,
+ fp = cpp_push_buffer (pfile, (uchar *) inc->buffer, inc->st.st_size,
/* from_stage3 */ CPP_OPTION (pfile, preprocessed), 0);
fp->inc = inc;
@@ -462,7 +436,7 @@ read_include_file (pfile, inc)
struct include_file *inc;
{
ssize_t size, offset, count;
- uchar *buf;
+ char *buf;
if (S_ISREG (inc->st.st_mode))
{
@@ -482,7 +456,7 @@ read_include_file (pfile, inc)
size = inc->st.st_size;
{
- buf = (uchar *) xmalloc (size + 1);
+ buf = xmalloc (size + 1);
offset = 0;
while (offset < size)
{
@@ -501,8 +475,6 @@ read_include_file (pfile, inc)
}
offset += count;
}
- /* The lexer requires that the buffer be \n-terminated. */
- buf[size] = '\n';
}
}
else if (S_ISBLK (inc->st.st_mode))
@@ -517,7 +489,7 @@ read_include_file (pfile, inc)
bigger than the majority of C source files. */
size = 8 * 1024;
- buf = (uchar *) xmalloc (size + 1);
+ buf = xmalloc (size + 1);
offset = 0;
while ((count = read (inc->fd, buf + offset, size - offset)) > 0)
{
@@ -535,12 +507,11 @@ read_include_file (pfile, inc)
buf = xrealloc (buf, offset + 1);
/* The lexer requires that the buffer be \n-terminated. */
- buf[offset] = '\n';
inc->st.st_size = offset;
}
inc->buffer = buf;
- return 0;
+ return _cpp_convert_buffer (pfile, inc);
perror_fail:
cpp_errno (pfile, DL_ERROR, inc->name);
Index: cppcharset.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/cppcharset.c,v
retrieving revision 1.1
diff -u -p -r1.1 cppcharset.c
--- cppcharset.c 20 Apr 2003 07:29:20 -0000 1.1
+++ cppcharset.c 20 Apr 2003 22:23:30 -0000
@@ -27,6 +27,103 @@ Foundation, 59 Temple Place - Suite 330,
static int ucn_valid_in_identifier PARAMS ((cpp_reader *, cppchar_t));
+#if HAVE_ICONV
+#include <langinfo.h>
+
+static int iconv_file PARAMS ((cpp_reader *, struct include_file *));
+
+/* Convert INC->buffer to UTF-8 using the conversion descriptor
+   PFILE->src_cd.  INC->buffer is freed and replaced by a newly
+   allocated buffer, and INC->st.st_size is set to the number of
+   converted bytes.  The new buffer always has room for one byte past
+   st_size so that the caller can store the terminating '\n'.
+
+   Returns zero on success.  If iconv fails for any reason other than
+   a full output buffer, the error is reported with cpp_errno, the
+   replacement buffer is left empty (st_size == 0) and non-zero is
+   returned.  */
+static int
+iconv_file (pfile, inc)
+     cpp_reader *pfile;
+     struct include_file *inc;
+{
+  size_t src_len, dest_len, dest_written, result;
+  size_t inbytes, outbytes;
+  char *src, *dest, *dest_buf;
+  int failed = 0;
+
+  src = inc->buffer;
+  src_len = inc->st.st_size;
+  /* An initial guess at a sufficiently big output buffer.  */
+  dest_len = src_len + src_len / 4 + 1000;
+  /* Allocate an extra byte for a terminating '\n'.  */
+  dest_buf = xmalloc (dest_len + 1);
+  dest = dest_buf;
+  dest_written = 0;
+  inbytes = src_len;
+
+  /* Reset conversion state.  */
+  iconv (pfile->src_cd, NULL, NULL, NULL, NULL);
+
+  for (;;)
+    {
+      size_t consumed;
+
+      /* Convert the buffer.  iconv advances SRC and DEST and
+         decrements INBYTES and OUTBYTES as it makes progress.  */
+      outbytes = dest_len - dest_written;
+      result = iconv (pfile->src_cd, &src, &inbytes, &dest, &outbytes);
+      dest_written = dest_len - outbytes;
+      if (result != (size_t) -1)
+        break;
+
+      /* An error occurred.  Anything other than a full output buffer
+         cannot be recovered from: report it and hand back an empty
+         buffer.  */
+      if (errno != E2BIG)
+        {
+          cpp_errno (pfile, DL_ERROR, inc->name);
+          dest_written = 0;
+          failed = 1;
+          break;
+        }
+
+      /* The output buffer is full.  Attempt to guess a new buffer
+         size that will be just a bit too big for the rest of the
+         input, by scaling the current size by the fraction of the
+         input consumed so far.  Skip the scaling if nothing has been
+         consumed, which would otherwise divide by zero; the fixed
+         increment below still guarantees growth.  */
+      consumed = src_len - inbytes;
+      if (consumed != 0)
+        dest_len = dest_len * (double) src_len / consumed;
+      dest_len += dest_len / 20 + 1000;
+
+      /* Allocate an extra byte for a terminating '\n'.  */
+      dest_buf = xrealloc (dest_buf, dest_len + 1);
+      dest = dest_buf + dest_written;
+    }
+
+  /* Free the original buffer.  */
+  free (inc->buffer);
+  inc->buffer = dest_buf;
+  inc->st.st_size = dest_written;
+  return failed;
+}
+#endif
+
+/* Convert inc->buffer to UTF-8 if necessary.  On return the buffer is
+   guaranteed \n-terminated.  Returns non-zero on failure.
+
+   The first call decides whether conversion is needed and records the
+   answer in pfile->iconv_state (1: convert with pfile->src_cd,
+   2: do not convert).  If iconv_open fails the error is reported
+   once and this and all later buffers are passed through unconverted
+   with a zero return; only a failed conversion of an individual
+   buffer yields a non-zero return.  */
+int
+_cpp_convert_buffer (pfile, inc)
+     cpp_reader *pfile ATTRIBUTE_UNUSED;
+     struct include_file *inc;
+{
+  int result = 0;
+
+#if HAVE_ICONV
+  if (pfile->iconv_state == 0)
+    {
+      const char *locale = nl_langinfo (CODESET);
+
+      pfile->iconv_state = 2;
+
+      /* Elide iconv conversions for C and UTF-8 locales.
+         NOTE(review): nl_langinfo (CODESET) yields a codeset name;
+         glibc reports the C locale's codeset as "ANSI_X3.4-1968"
+         rather than "C" -- confirm that the "C" test ever matches.  */
+      if (strcmp (locale, "C" ) && strcmp (locale, "UTF-8"))
+        {
+          pfile->src_cd = iconv_open ("UTF-8", locale);
+          if (pfile->src_cd == (iconv_t) -1)
+            cpp_errno (pfile, DL_ERROR, progname);
+          else
+            pfile->iconv_state = 1;
+        }
+    }
+
+  if (pfile->iconv_state == 1)
+    result = iconv_file (pfile, inc);
+#endif
+
+  /* Both read_include_file and iconv_file allocate one byte beyond
+     st_size, so this store is in bounds.  */
+  inc->buffer[inc->st.st_size] = '\n';
+  return result;
+}
+
+
/* [lex.charset]: The character designated by the universal character
name \UNNNNNNNN is that character whose character short name in
ISO/IEC 10646 is NNNNNNNN; the character designated by the
Index: cpphash.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/cpphash.h,v
retrieving revision 1.185
diff -u -p -r1.185 cpphash.h
--- cpphash.h 20 Apr 2003 19:02:53 -0000 1.185
+++ cpphash.h 20 Apr 2003 22:23:30 -0000
@@ -23,6 +23,9 @@ Foundation, 59 Temple Place - Suite 330,
#ifndef GCC_CPPHASH_H
#define GCC_CPPHASH_H
+#if HAVE_ICONV
+#include <iconv.h>
+#endif
#include "hashtable.h"
struct directive; /* Deliberately incomplete. */
@@ -370,6 +373,16 @@ struct cpp_reader
for include files. (Altered as we get more of them.) */
unsigned int max_include_len;
+#if HAVE_ICONV
+ /* A tristate variable. If 0, the need for iconv is untested. If
+ 1, use iconv to convert input files to UTF-8 with the open
+ descriptor src_cd, otherwise 2. */
+ uchar iconv_state;
+
+ /* The opened descriptor to use on each input file. */
+ iconv_t src_cd;
+#endif
+
/* Date and time text. Calculated together if either is requested. */
const uchar *date;
const uchar *time;
@@ -438,6 +451,29 @@ struct cpp_reader
struct cpp_savedstate *savedstate;
};
+/* This structure is used for the table of all includes. It is declared
+ in this header because both cppfiles.c and cppcharset.c use it. */
+struct include_file
+{
+ const char *name; /* actual path name of file */
+ const char *header_name; /* the original header found */
+ const cpp_hashnode *cmacro; /* macro, if any, preventing reinclusion. */
+ const struct cpp_path *foundhere;
+ /* location in search path where file was
+ found, for #include_next and sysp. */
+ char *buffer; /* pointer to cached UTF-8 file contents (left
+ unconverted when iconv is not used);
+ _cpp_convert_buffer stores a '\n' at
+ buffer[st.st_size] */
+ struct stat st; /* copy of stat(2) data for file */
+ int fd; /* fd open on file (short term storage only) */
+ int err_no; /* errno obtained if opening a file failed */
+ unsigned short include_count; /* number of times file has been read */
+ unsigned char pch; /* 0: file not known to be a PCH.
+ 1: file is a PCH
+ (on return from find_include_file).
+ 2: file is not and never will be a valid
+ precompiled header.
+ 3: file is always a valid precompiled
+ header. */
+};
+
+
/* Character classes. Based on the more primitive macros in safe-ctype.h.
If the definition of `numchar' looks odd to you, please look up the
definition of a pp-number in the C standard [section 6.4.8 of C99].
@@ -555,8 +591,9 @@ extern uchar *_cpp_copy_replacement_text
extern size_t _cpp_replacement_text_len PARAMS ((const cpp_macro *));
/* In cppcharset.c. */
-cppchar_t _cpp_valid_ucn PARAMS ((cpp_reader *, const uchar **,
- int identifer_p));
+extern cppchar_t _cpp_valid_ucn PARAMS ((cpp_reader *, const uchar **,
+ int identifer_p));
+extern int _cpp_convert_buffer PARAMS ((cpp_reader *, struct include_file *));
/* Utility routines and macros. */
#define DSC(str) (const uchar *)str, sizeof str - 1
More information about the Gcc-patches
mailing list