This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
patch to convert input character set to source character set
- From: Chandra Chavva <cchavva at redhat dot com>
- To: neil at daikokuya dot co dot uk, neil at daikokuya dot co dot uk, gcc-patches at gcc dot gnu dot org
- Date: Tue, 23 Sep 2003 08:50:21 -0700
- Subject: patch to convert input character set to source character set
I have attached a patch which converts the input character set to the
source character set, UTF-8 or UTF-EBCDIC depending on the basic execution
character set. It also introduces a new option, -finput-charset=, to specify
the default character set of the source file. The input character set is
set to "ISO-8859-1" by default.
Zack has already approved this patch.
Please let me know if this patch is okay to check in.
Thanks
Chandra
2003-08-22 Chandrakala Chavva <cchavva@redhat.com>
* c.opt (-finput-charset=): New option.
* c-opts.c (c_common_handle_option): Handle OPT_finput_charset_.
* cpplib.h (cpp_options): Add input_charset.
(_cpp_input_to_utf8): Declare.
* cppinit.c (cpp_create_reader): Initialize input_charset.
* cppcharset.c (one_iso88591_to_utf8): New function.
(convert_iso88591_utf8): New function.
Add convert_iso88591_utf8 to conversion_tab.
(_cpp_input_to_utf8): New function.
(_cpp_init_iconv_buffer): New function.
(_cpp_close_iconv_buffer): New function.
* cpphash.h (_cpp_init_iconv_buffer): Declare.
(_cpp_close_iconv_buffer): Declare.
(struct cpp_buffer): Add input_cset_desc.
* cpplib.c (cpp_push_buffer): Call _cpp_init_iconv_buffer to
initialize the format of input file.
(_cpp_pop_buffer): Call _cpp_close_iconv_buffer.
* cpplex.c (_cpp_clean_line): Call _cpp_input_to_utf8 to convert
the complete line to UTF-8 format.
(_cpp_process_line_notes): Check buffer->next_line instead of
buffer->cur.
(_cpp_lex_direct): Same.
Index: c.opt
===================================================================
RCS file: /cvsroot/gcc/gcc/gcc/c.opt,v
retrieving revision 1.15
diff -p -r1.15 c.opt
*** c.opt 19 Aug 2003 20:29:00 -0000 1.15
--- c.opt 25 Aug 2003 22:58:30 -0000
*************** fexec-charset=
*** 478,483 ****
--- 478,487 ----
C ObjC C++ ObjC++ Joined RejectNegative
-fexec-charset=<cset> Convert all strings and character constants to character set <cset>
+ finput-charset=
+ C ObjC C++ ObjC++ Joined RejectNegative
+ -finput-charset=<codeset> Specify the default character set for source files.
+
fexternal-templates
C++ ObjC++
Index: c-opts.c
===================================================================
RCS file: /cvsroot/gcc/gcc/gcc/c-opts.c,v
retrieving revision 1.88
diff -p -r1.88 c-opts.c
*** c-opts.c 19 Aug 2003 20:29:00 -0000 1.88
--- c-opts.c 25 Aug 2003 22:58:30 -0000
*************** c_common_handle_option (size_t scode, co
*** 899,904 ****
--- 899,908 ----
cpp_opts->wide_charset = arg;
break;
+ case OPT_finput_charset_:
+ cpp_opts->input_charset = arg;
+ break;
+
case OPT_ftemplate_depth_:
max_tinst_depth = value;
break;
Index: cpplib.h
===================================================================
RCS file: /cvsroot/gcc/gcc/gcc/cpplib.h,v
retrieving revision 1.266
diff -p -r1.266 cpplib.h
*** cpplib.h 19 Aug 2003 21:04:38 -0000 1.266
--- cpplib.h 25 Aug 2003 22:58:30 -0000
*************** struct cpp_options
*** 332,337 ****
--- 332,340 ----
/* Holds the name of the target wide character set. */
const char *wide_charset;
+ /* Holds the default character set for source files. */
+ char *input_charset;
+
/* True to warn about precompiled header files we couldn't use. */
bool warn_invalid_pch;
*************** extern const char *cpp_type2name (enum c
*** 695,700 ****
--- 698,704 ----
string literal. Handles all relevant diagnostics. */
extern cppchar_t cpp_parse_escape (cpp_reader *, const unsigned char ** pstr,
const unsigned char *limit, int wide);
+ extern char *_cpp_input_to_utf8 (cpp_reader *, const unsigned char *, cppchar_t);
/* In cpphash.c */
Index: cppinit.c
===================================================================
RCS file: /cvsroot/gcc/gcc/gcc/cppinit.c,v
retrieving revision 1.295
diff -p -r1.295 cppinit.c
*** cppinit.c 19 Aug 2003 21:04:38 -0000 1.295
--- cppinit.c 25 Aug 2003 22:58:31 -0000
*************** Foundation, 59 Temple Place - Suite 330,
*** 24,29 ****
--- 24,30 ----
#include "cpplib.h"
#include "cpphash.h"
#include "mkdeps.h"
static void init_library (void);
static void mark_named_operators (cpp_reader *);
*************** cpp_create_reader (enum c_lang lang, has
*** 160,165 ****
--- 161,169 ----
/* Default to no charset conversion. */
CPP_OPTION (pfile, narrow_charset) = 0;
CPP_OPTION (pfile, wide_charset) = 0;
+
+ /* Default the input character set to iso-8859-1 for now. */
+ CPP_OPTION (pfile, input_charset) = "ISO-8859-1";
/* A fake empty "directory" used as the starting point for files
looked up without a search path. Name cannot be '/' because we
Index: cppcharset.c
===================================================================
RCS file: /cvsroot/gcc/gcc/gcc/cppcharset.c,v
retrieving revision 1.11
diff -p -r1.11 cppcharset.c
*** cppcharset.c 13 Jul 2003 17:34:17 -0000 1.11
--- cppcharset.c 25 Aug 2003 22:58:31 -0000
*************** one_utf16_to_utf8 (iconv_t bigend, const
*** 444,449 ****
--- 444,473 ----
return 0;
}
+ /* The 256 characters of ISO-8859-1 are identical to the first 256
+ code points of Unicode, therefore the input byte can be passed directly
+ to one_cppchar_to_utf8. */
+
+ static int
+ one_iso88591_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
+ uchar **outbufp, size_t *outbytesleftp)
+ {
+ const uchar *inbuf = *inbufp;
+ int rval;
+
+ if (*inbytesleftp < 1)
+ return EINVAL;
+
+ rval = one_cppchar_to_utf8 (*inbuf, outbufp, outbytesleftp);
+ if (rval)
+ return rval;
+
+ *inbufp += 1;
+ *inbytesleftp -= 1;
+
+ return 0;
+ }
+
/* Helper routine for the next few functions. The 'const' on
one_conversion means that we promise not to modify what function is
pointed to, which lets the inliner see through it. */
*************** convert_utf32_utf8 (iconv_t cd, const uc
*** 527,532 ****
--- 551,564 ----
return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
}
+ static bool
+ convert_iso88591_utf8 (iconv_t cd, const uchar *from, size_t flen,
+ struct strbuf *to)
+ {
+ return conversion_loop (one_iso88591_to_utf8, cd, from, flen, to);
+ }
+
+
/* Identity conversion, used when we have no alternative. */
static bool
convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
*************** static const struct conversion conversio
*** 604,609 ****
--- 636,642 ----
{ "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
{ "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
{ "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
+ { "ISO-8859-1/UTF-8", convert_iso88591_utf8, (iconv_t)0 },
};
/* Subroutine of cpp_init_iconv: initialize and return a
*************** cpp_interpret_charconst (cpp_reader *pfi
*** 1343,1346 ****
--- 1376,1422 ----
free ((void *)str.text);
return result;
+ }
+
+ char *
+ _cpp_input_to_utf8 (cpp_reader *pfile, const uchar *input, cppchar_t length)
+ {
+ struct strbuf tbuf;
+ struct cset_converter cvt = pfile->buffer->input_cset_desc;
+
+ tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, length);
+ tbuf.text = xmalloc (tbuf.asize);
+ tbuf.len = 0;
+
+ if (!APPLY_CONVERSION (cvt, input, length, &tbuf))
+ {
+ cpp_error (pfile, DL_ERROR, "converting input to source character set.");
+ return NULL;
+ }
+
+ if (length)
+ tbuf.text[tbuf.len] = '\n';
+ else
+ tbuf.text[0] = '\n';
+
+ return tbuf.text;
+ }
+
+ /* Check the input file format. At present assuming the input file
+ is in iso-8859-1 format. Convert this input character set to
+ source character set format (UTF-8). */
+
+ void
+ _cpp_init_iconv_buffer (cpp_reader *pfile, const char *from)
+ {
+ pfile->buffer->input_cset_desc = init_iconv_desc (pfile, SOURCE_CHARSET,
+ from);
+ }
+
+ void
+ _cpp_close_iconv_buffer (cpp_reader *pfile)
+ {
+ if (HAVE_ICONV
+ && pfile->buffer->input_cset_desc.func == convert_using_iconv)
+ iconv_close (pfile->buffer->input_cset_desc.cd);
}
Index: cpphash.h
===================================================================
RCS file: /cvsroot/gcc/gcc/gcc/cpphash.h,v
retrieving revision 1.197
diff -p -r1.197 cpphash.h
*** cpphash.h 2 Aug 2003 16:29:44 -0000 1.197
--- cpphash.h 25 Aug 2003 22:58:31 -0000
*************** struct cpp_buffer
*** 318,323 ****
--- 318,327 ----
/* Used for buffer overlays by cpptrad.c. */
const uchar *saved_cur, *saved_rlimit;
+
+ /* Descriptor for converting from the input character set to the
+ source character set. */
+ struct cset_converter input_cset_desc;
};
/* A cpp_reader encapsulates the "state" of a pre-processor run.
*************** extern void _cpp_init_internal_pragmas (
*** 556,561 ****
--- 560,567 ----
extern void _cpp_do_file_change (cpp_reader *, enum lc_reason, const char *,
unsigned int, unsigned int);
extern void _cpp_pop_buffer (cpp_reader *);
+ extern void _cpp_init_iconv_buffer (cpp_reader *, const char *);
+ extern void _cpp_close_iconv_buffer (cpp_reader *);
/* In cpptrad.c. */
extern bool _cpp_scan_out_logical_line (cpp_reader *, cpp_macro *);
Index: cpplib.c
===================================================================
RCS file: /cvsroot/gcc/gcc/gcc/cpplib.c,v
retrieving revision 1.350
diff -p -r1.350 cpplib.c
*** cpplib.c 2 Aug 2003 16:29:44 -0000 1.350
--- cpplib.c 25 Aug 2003 22:58:31 -0000
*************** cpp_push_buffer (cpp_reader *pfile, cons
*** 1912,1917 ****
--- 1912,1918 ----
int from_stage3, int return_at_eof)
{
cpp_buffer *new = xobnew (&pfile->buffer_ob, cpp_buffer);
+ const char *from = CPP_OPTION (pfile, input_charset);
/* Clears, amongst other things, if_stack and mi_cmacro. */
memset (new, 0, sizeof (cpp_buffer));
*************** cpp_push_buffer (cpp_reader *pfile, cons
*** 1924,1929 ****
--- 1925,1932 ----
new->need_line = true;
pfile->buffer = new;
+ _cpp_init_iconv_buffer (pfile, from);
+
return new;
}
*************** _cpp_pop_buffer (cpp_reader *pfile)
*** 1944,1949 ****
--- 1947,1954 ----
/* In case of a missing #endif. */
pfile->state.skipping = 0;
+
+ _cpp_close_iconv_buffer (pfile);
/* _cpp_do_file_change expects pfile->buffer to be the new one. */
pfile->buffer = buffer->prev;
Index: cpplex.c
===================================================================
RCS file: /cvsroot/gcc/gcc/gcc/cpplex.c,v
retrieving revision 1.246
diff -p -r1.246 cpplex.c
*** cpplex.c 21 Aug 2003 15:57:50 -0000 1.246
--- cpplex.c 25 Aug 2003 22:58:31 -0000
*************** void
*** 103,116 ****
_cpp_clean_line (cpp_reader *pfile)
{
cpp_buffer *buffer;
! const uchar *s;
uchar c, *d, *p;
buffer = pfile->buffer;
buffer->cur_note = buffer->notes_used = 0;
! buffer->cur = buffer->line_base = buffer->next_line;
buffer->need_line = false;
s = buffer->next_line - 1;
if (!buffer->from_stage3)
{
--- 103,118 ----
_cpp_clean_line (cpp_reader *pfile)
{
cpp_buffer *buffer;
! const uchar *s, *start;
uchar c, *d, *p;
+ cppchar_t len;
buffer = pfile->buffer;
buffer->cur_note = buffer->notes_used = 0;
! buffer->line_base = buffer->next_line;
buffer->need_line = false;
s = buffer->next_line - 1;
+ start = buffer->next_line;
if (!buffer->from_stage3)
{
*************** _cpp_clean_line (cpp_reader *pfile)
*** 164,169 ****
--- 166,175 ----
s++;
}
+ len = s - start;
+
+ pfile->buffer->cur = _cpp_input_to_utf8 (pfile, start, len);
+
*d = '\n';
/* A sentinel note that should never be processed. */
add_line_note (buffer, d + 1, '\n');
*************** _cpp_process_line_notes (cpp_reader *pfi
*** 210,216 ****
_cpp_line_note *note = &buffer->notes[buffer->cur_note];
unsigned int col;
! if (note->pos > buffer->cur)
break;
buffer->cur_note++;
--- 216,222 ----
_cpp_line_note *note = &buffer->notes[buffer->cur_note];
unsigned int col;
! if (note->pos == buffer->next_line)
break;
buffer->cur_note++;
*************** _cpp_lex_direct (cpp_reader *pfile)
*** 787,793 ****
result->line = pfile->line;
skipped_white:
! if (buffer->cur >= buffer->notes[buffer->cur_note].pos
&& !pfile->overlaid_buffer)
{
_cpp_process_line_notes (pfile, false);
--- 793,799 ----
result->line = pfile->line;
skipped_white:
! if (buffer->next_line >= buffer->notes[buffer->cur_note].pos
&& !pfile->overlaid_buffer)
{
_cpp_process_line_notes (pfile, false);