This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

patch to convert input character set to source character set


I have attached the patch which converts the input character set to the source character set, UTF-8 or UTF-EBCDIC depending on the basic execution character set. It also introduces a new option, -finput-charset=, to specify the default character set of the source file. The input character set is set to "ISO-8859-1" by default.

Zack has already approved this patch.

Please let me know if this patch is okay to check in.

Thanks
Chandra

2003-08-22  Chandrakala Chavva  <cchavva@redhat.com>

        * c.opt (-finput-charset=): New option.
        * c-opts.c (c_common_handle_option): Handle OPT_finput_charset_.
        * cpplib.h (cpp_options): Add input_charset.
        (_cpp_input_to_utf8): Declare.
        * cppinit.c (cpp_create_reader): Initialize input_charset.
        * cppcharset.c (one_iso88591_to_utf8): New function.
        (convert_iso88591_utf8): New function.
        Add convert_iso88591_utf8 to conversion_tab.
        (_cpp_input_to_utf8): New function.
        (_cpp_init_iconv_buffer): New function.
        (_cpp_close_iconv_buffer): New function.
        * cpphash.h (_cpp_init_iconv_buffer): Declare.
        (_cpp_close_iconv_buffer): Declare.
        (struct cpp_buffer): Add input_cset_desc.
        * cpplib.c (cpp_push_buffer): Call _cpp_init_iconv_buffer to 
        initialize the format of input file.
        (_cpp_pop_buffer): Call _cpp_close_iconv_buffer.
        * cpplex.c (_cpp_clean_line): Call _cpp_input_to_utf8 to convert
        the complete line to UTF-8 format.
        (_cpp_process_line_notes): Check buffer->next_line instead of
        buffer->cur.
        (_cpp_lex_direct): Same.

Index: c.opt
===================================================================
RCS file: /cvsroot/gcc/gcc/gcc/c.opt,v
retrieving revision 1.15
diff -p -r1.15 c.opt
*** c.opt	19 Aug 2003 20:29:00 -0000	1.15
--- c.opt	25 Aug 2003 22:58:30 -0000
*************** fexec-charset=
*** 478,483 ****
--- 478,487 ----
  C ObjC C++ ObjC++ Joined RejectNegative
  -fexec-charset=<cset>	Convert all strings and character constants to character set <cset>
  
+ finput-charset=
+ C ObjC C++ ObjC++ Joined RejectNegative
+ -finput-charset=<codeset>	Specify the default character set for source files.
+ 
  fexternal-templates
  C++ ObjC++
  
Index: c-opts.c
===================================================================
RCS file: /cvsroot/gcc/gcc/gcc/c-opts.c,v
retrieving revision 1.88
diff -p -r1.88 c-opts.c
*** c-opts.c	19 Aug 2003 20:29:00 -0000	1.88
--- c-opts.c	25 Aug 2003 22:58:30 -0000
*************** c_common_handle_option (size_t scode, co
*** 899,904 ****
--- 899,908 ----
        cpp_opts->wide_charset = arg;
        break;
  
+     case OPT_finput_charset_:
+       cpp_opts->input_charset = arg;
+       break;
+ 
      case OPT_ftemplate_depth_:
        max_tinst_depth = value;
        break;
Index: cpplib.h
===================================================================
RCS file: /cvsroot/gcc/gcc/gcc/cpplib.h,v
retrieving revision 1.266
diff -p -r1.266 cpplib.h
*** cpplib.h	19 Aug 2003 21:04:38 -0000	1.266
--- cpplib.h	25 Aug 2003 22:58:30 -0000
*************** struct cpp_options
*** 332,337 ****
--- 332,340 ----
    /* Holds the name of the target wide character set.  */
    const char *wide_charset;
  
+   /* Holds the default character set for source files. */
+   char *input_charset;
+ 
    /* True to warn about precompiled header files we couldn't use.  */
    bool warn_invalid_pch;
  
*************** extern const char *cpp_type2name (enum c
*** 695,700 ****
--- 698,704 ----
     string literal.  Handles all relevant diagnostics.  */
  extern cppchar_t cpp_parse_escape (cpp_reader *, const unsigned char ** pstr,
  				   const unsigned char *limit, int wide);
+ extern char *_cpp_input_to_utf8 (cpp_reader *, const unsigned char *, cppchar_t);
  
  /* In cpphash.c */
  
Index: cppinit.c
===================================================================
RCS file: /cvsroot/gcc/gcc/gcc/cppinit.c,v
retrieving revision 1.295
diff -p -r1.295 cppinit.c
*** cppinit.c	19 Aug 2003 21:04:38 -0000	1.295
--- cppinit.c	25 Aug 2003 22:58:31 -0000
*************** Foundation, 59 Temple Place - Suite 330,
*** 24,29 ****
--- 24,30 ----
  #include "cpplib.h"
  #include "cpphash.h"
  #include "mkdeps.h"
  
  static void init_library (void);
  static void mark_named_operators (cpp_reader *);
*************** cpp_create_reader (enum c_lang lang, has
*** 160,165 ****
--- 161,169 ----
    /* Default to no charset conversion.  */
    CPP_OPTION (pfile, narrow_charset) = 0;
    CPP_OPTION (pfile, wide_charset) = 0;
+ 
+   /* Default the input character set to iso-8859-1 for now. */
+   CPP_OPTION (pfile, input_charset) = "ISO-8859-1";
  
    /* A fake empty "directory" used as the starting point for files
       looked up without a search path.  Name cannot be '/' because we
Index: cppcharset.c
===================================================================
RCS file: /cvsroot/gcc/gcc/gcc/cppcharset.c,v
retrieving revision 1.11
diff -p -r1.11 cppcharset.c
*** cppcharset.c	13 Jul 2003 17:34:17 -0000	1.11
--- cppcharset.c	25 Aug 2003 22:58:31 -0000
*************** one_utf16_to_utf8 (iconv_t bigend, const
*** 444,449 ****
--- 444,473 ----
    return 0;
  }
  
+ /* The 256 characters of ISO-8859-1 are identical to the first 256
+    code points of Unicode, so each input byte can be passed directly
+    to one_cppchar_to_utf8.  */
+   
+ static int 
+ one_iso88591_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
+                       uchar **outbufp, size_t *outbytesleftp)
+ {
+   const uchar *inbuf = *inbufp;
+   int rval;
+ 
+   if (*inbytesleftp < 1)
+     return EINVAL;
+ 
+   rval = one_cppchar_to_utf8 (*inbuf, outbufp, outbytesleftp);
+   if (rval)
+     return rval;
+ 
+   *inbufp += 1;
+   *inbytesleftp -= 1;
+ 
+   return 0;
+ } 
+ 
  /* Helper routine for the next few functions.  The 'const' on
     one_conversion means that we promise not to modify what function is
     pointed to, which lets the inliner see through it.  */
*************** convert_utf32_utf8 (iconv_t cd, const uc
*** 527,532 ****
--- 551,564 ----
    return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
  }
  
+ static bool
+ convert_iso88591_utf8 (iconv_t cd, const uchar *from, size_t flen,
+                        struct strbuf *to)
+ {
+   return conversion_loop (one_iso88591_to_utf8, cd, from, flen, to);
+ }
+ 
+ 
  /* Identity conversion, used when we have no alternative.  */
  static bool
  convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
*************** static const struct conversion conversio
*** 604,609 ****
--- 636,642 ----
    { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
    { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
    { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
+   { "ISO-8859-1/UTF-8", convert_iso88591_utf8, (iconv_t)0 },
  };
  
  /* Subroutine of cpp_init_iconv: initialize and return a
*************** cpp_interpret_charconst (cpp_reader *pfi
*** 1343,1346 ****
--- 1376,1422 ----
      free ((void *)str.text);
  
    return result;
+ }
+ 
+ char *
+ _cpp_input_to_utf8 (cpp_reader *pfile, const uchar *input, cppchar_t length)
+ {
+   struct strbuf tbuf;
+   struct cset_converter cvt = pfile->buffer->input_cset_desc;
+ 
+   tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, length);
+   tbuf.text = xmalloc (tbuf.asize);
+   tbuf.len = 0;
+ 
+   if (!APPLY_CONVERSION (cvt, input, length, &tbuf))
+    {
+       cpp_error (pfile, DL_ERROR, "converting input to source character set.");
+       return NULL;
+    }
+ 
+   if (length)
+     tbuf.text[tbuf.len] = '\n';
+   else 
+     tbuf.text[0] = '\n';
+ 
+   return tbuf.text;
+ }
+   
+   /* Check the input file format. At present assuming the input file
+      is in iso-8859-1 format. Convert this input character set to
+      source character set format (UTF-8). */
+ 
+ void
+ _cpp_init_iconv_buffer (cpp_reader *pfile, const char *from)
+ {
+   pfile->buffer->input_cset_desc = init_iconv_desc (pfile, SOURCE_CHARSET, 
+ 						    from);
+ }
+ 
+ void 
+ _cpp_close_iconv_buffer (cpp_reader *pfile)
+ {
+   if (HAVE_ICONV 
+       && pfile->buffer->input_cset_desc.func == convert_using_iconv)
+     iconv_close (pfile->buffer->input_cset_desc.cd);
  }
Index: cpphash.h
===================================================================
RCS file: /cvsroot/gcc/gcc/gcc/cpphash.h,v
retrieving revision 1.197
diff -p -r1.197 cpphash.h
*** cpphash.h	2 Aug 2003 16:29:44 -0000	1.197
--- cpphash.h	25 Aug 2003 22:58:31 -0000
*************** struct cpp_buffer
*** 318,323 ****
--- 318,327 ----
  
    /* Used for buffer overlays by cpptrad.c.  */
    const uchar *saved_cur, *saved_rlimit;
+   
+   /* Descriptor for converting from the input character set to the
+      source character set.  */
+   struct cset_converter input_cset_desc; 
  };
  
  /* A cpp_reader encapsulates the "state" of a pre-processor run.
*************** extern void _cpp_init_internal_pragmas (
*** 556,561 ****
--- 560,567 ----
  extern void _cpp_do_file_change (cpp_reader *, enum lc_reason, const char *,
  				 unsigned int, unsigned int);
  extern void _cpp_pop_buffer (cpp_reader *);
+ extern void _cpp_init_iconv_buffer (cpp_reader *, const char *);
+ extern void _cpp_close_iconv_buffer (cpp_reader *);
  
  /* In cpptrad.c.  */
  extern bool _cpp_scan_out_logical_line (cpp_reader *, cpp_macro *);
Index: cpplib.c
===================================================================
RCS file: /cvsroot/gcc/gcc/gcc/cpplib.c,v
retrieving revision 1.350
diff -p -r1.350 cpplib.c
*** cpplib.c	2 Aug 2003 16:29:44 -0000	1.350
--- cpplib.c	25 Aug 2003 22:58:31 -0000
*************** cpp_push_buffer (cpp_reader *pfile, cons
*** 1912,1917 ****
--- 1912,1918 ----
  		 int from_stage3, int return_at_eof)
  {
    cpp_buffer *new = xobnew (&pfile->buffer_ob, cpp_buffer);
+   const char *from = CPP_OPTION (pfile, input_charset);
  
    /* Clears, amongst other things, if_stack and mi_cmacro.  */
    memset (new, 0, sizeof (cpp_buffer));
*************** cpp_push_buffer (cpp_reader *pfile, cons
*** 1924,1929 ****
--- 1925,1932 ----
    new->need_line = true;
  
    pfile->buffer = new;
+   _cpp_init_iconv_buffer (pfile, from);
+ 
    return new;
  }
  
*************** _cpp_pop_buffer (cpp_reader *pfile)
*** 1944,1949 ****
--- 1947,1954 ----
  
    /* In case of a missing #endif.  */
    pfile->state.skipping = 0;
+ 
+   _cpp_close_iconv_buffer (pfile);
  
    /* _cpp_do_file_change expects pfile->buffer to be the new one.  */
    pfile->buffer = buffer->prev;
Index: cpplex.c
===================================================================
RCS file: /cvsroot/gcc/gcc/gcc/cpplex.c,v
retrieving revision 1.246
diff -p -r1.246 cpplex.c
*** cpplex.c	21 Aug 2003 15:57:50 -0000	1.246
--- cpplex.c	25 Aug 2003 22:58:31 -0000
*************** void
*** 103,116 ****
  _cpp_clean_line (cpp_reader *pfile)
  {
    cpp_buffer *buffer;
!   const uchar *s;
    uchar c, *d, *p;
  
    buffer = pfile->buffer;
    buffer->cur_note = buffer->notes_used = 0;
!   buffer->cur = buffer->line_base = buffer->next_line;
    buffer->need_line = false;
    s = buffer->next_line - 1;
  
    if (!buffer->from_stage3)
      {
--- 103,118 ----
  _cpp_clean_line (cpp_reader *pfile)
  {
    cpp_buffer *buffer;
!   const uchar *s, *start;
    uchar c, *d, *p;
+   cppchar_t len;
  
    buffer = pfile->buffer;
    buffer->cur_note = buffer->notes_used = 0;
!   buffer->line_base = buffer->next_line;
    buffer->need_line = false;
    s = buffer->next_line - 1;
+   start = buffer->next_line;
  
    if (!buffer->from_stage3)
      {
*************** _cpp_clean_line (cpp_reader *pfile)
*** 164,169 ****
--- 166,175 ----
  	s++;
      }
  
+   len = s - start;
+ 
+   pfile->buffer->cur = _cpp_input_to_utf8 (pfile, start, len);
+ 
    *d = '\n';
    /* A sentinel note that should never be processed.  */
    add_line_note (buffer, d + 1, '\n');
*************** _cpp_process_line_notes (cpp_reader *pfi
*** 210,216 ****
        _cpp_line_note *note = &buffer->notes[buffer->cur_note];
        unsigned int col;
  
!       if (note->pos > buffer->cur)
  	break;
  
        buffer->cur_note++;
--- 216,222 ----
        _cpp_line_note *note = &buffer->notes[buffer->cur_note];
        unsigned int col;
  
!       if (note->pos == buffer->next_line)
  	break;
  
        buffer->cur_note++;
*************** _cpp_lex_direct (cpp_reader *pfile)
*** 787,793 ****
    result->line = pfile->line;
  
   skipped_white:
!   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
        && !pfile->overlaid_buffer)
      {
        _cpp_process_line_notes (pfile, false);
--- 793,799 ----
    result->line = pfile->line;
  
   skipped_white:
!   if (buffer->next_line >= buffer->notes[buffer->cur_note].pos
        && !pfile->overlaid_buffer)
      {
        _cpp_process_line_notes (pfile, false);

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]