This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
Iconv fallbacks - need testers

From: "Zack Weinberg" <zack at codesourcery dot com>
To: gcc-patches at gcc dot gnu dot org
Cc: "Kaveh R. Ghazi" <ghazi at caip dot rutgers dot edu>, Hans-Peter Nilsson <hp at bitrange dot com>
Date: Thu, 10 Jul 2003 09:15:42 -0700
Subject: Iconv fallbacks - need testers
This patch implements fallback conversion primitives for cpplib.
Fallback is not quite the proper word - to make sure they don't become
disused and break, they are first choice, even on systems where the
iconv library is up to scratch.  This should solve the problems where
cpplib bombs out because it can't convert from UTF-8 to UCS-[24][BL]E.

I've tested this on my system, so I know the conversion operations
work properly; I need people to try it out on systems that are
currently broken and let me know if it helps.

zw

        * cppcharset.c (one_utf8_to_cppchar, one_cppchar_to_utf8,
        one_utf8_to_utf32, one_utf32_to_utf8, one_utf8_to_utf16,
        one_utf16_to_utf8, conversion_loop, convert_utf8_utf16,
        convert_utf8_utf32, convert_utf16_utf8, convert_utf32_utf8,
        convert_no_conversion, convert_using_iconv): New functions.
        (APPLY_CONVERSION): New macro.
        (struct conversion, conversion_tab): New data structure.
        (init_iconv_desc): Check conversion_tab for a custom conversion
        primitive before trying to use iconv.
        (convert_cset): Deleted.
        (cpp_init_iconv): Use UTF- terminology, not UCS-.
        (_cpp_destroy_iconv): Update to match.
        (_cpp_valid_ucn): We don't need iconv to implement UCNs.
        (convert_ucn): Use one_cppchar_to_utf8 and APPLY_CONVERSION.
        (convert_escape, cpp_interpret_string): Use APPLY_CONVERSION.
        (_cpp_interpret_string_notranslate): New function, moved here
        from cpplib.c.

        * cpphash.h (convert_f, struct cset_converter): New types.
        (struct cpp_reader): narrow_cset_desc and wide_cset_desc
        are now struct cset_converter, not bare iconv_t.
        Update prototypes.
        * cpplib.c (interpret_string_notranslate): Moved to cppcharset.c;
        all callers changed.

===================================================================
Index: cppcharset.c
--- cppcharset.c	7 Jul 2003 04:46:29 -0000	1.7
+++ cppcharset.c	10 Jul 2003 16:07:19 -0000
@@ -92,8 +92,7 @@ Foundation, 59 Temple Place - Suite 330,
 #error "Unrecognized basic host character set"
 #endif
 
-/* This structure is used for a resizable string buffer, mostly by
-   convert_cset and cpp_interpret_string.  */
+/* This structure is used for a resizable string buffer throughout.  */
 struct strbuf
 {
   uchar *text;
@@ -103,23 +102,545 @@ struct strbuf
 
 /* This is enough to hold any string that fits on a single 80-column
    line, even if iconv quadruples its size (e.g. conversion from
-   ASCII to UCS-4) rounded up to a power of two.  */
+   ASCII to UTF-32) rounded up to a power of two.  */
 #define OUTBUF_BLOCK_SIZE 256
 
-/* Subroutine of cpp_init_iconv: initialize and return an iconv
-   descriptor for conversion from FROM to TO.  If iconv_open() fails,
-   issue an error and return (iconv_t) -1.  Silently return
-   (iconv_t) -1 if FROM and TO are identical.  */
-static iconv_t
-init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
+/* Conversions between UTF-8 and UTF-16/32 are implemented by custom
+   logic.  This is because a depressing number of systems lack iconv,
+   or have iconv libraries that do not do these conversions, so
+   we need a fallback implementation for them.  To ensure the fallback
+   doesn't break due to neglect, it is used on all systems.
+
+   UTF-32 encoding is nice and simple: a four-byte binary number,
+   constrained to the range 00000000-7FFFFFFF to avoid questions of
+   signedness.  We do have to cope with big- and little-endian
+   variants.
+
+   UTF-16 encoding uses two-byte binary numbers, again in big- and
+   little-endian variants, for all values in the 00000000-0000FFFF
+   range.  Values in the 00010000-0010FFFF range are encoded as pairs
+   of two-byte numbers, called "surrogate pairs": given a number S in
+   this range, it is mapped to a pair (H, L) as follows:
+
+     H = (S - 0x10000) / 0x400 + 0xD800
+     L = (S - 0x10000) % 0x400 + 0xDC00
+
+   Two-byte values in the D800...DFFF range are ill-formed except as a
+   component of a surrogate pair.  Even if the encoding within a
+   two-byte value is little-endian, the H member of the surrogate pair
+   comes first.
+
+   There is no way to encode values in the 00110000-7FFFFFFF range,
+   which is not currently a problem as there are no assigned code
+   points in that range; however, the author expects that it will
+   eventually become necessary to abandon UTF-16 due to this
+   limitation.  Note also that, because of these pairs, UTF-16 does
+   not meet the requirements of the C standard for a wide character
+   encoding (see 3.7.3 and 6.4.4.4p11).
+
+   UTF-8 encoding looks like this:
+
+   value range	       encoded as
+   00000000-0000007F   0xxxxxxx
+   00000080-000007FF   110xxxxx 10xxxxxx
+   00000800-0000FFFF   1110xxxx 10xxxxxx 10xxxxxx
+   00010000-001FFFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+   00200000-03FFFFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+   04000000-7FFFFFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+   Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
+   which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
+   never occur.  Note also that any value that can be encoded by a
+   given row of the table can also be encoded by all successive rows,
+   but this is not done; only the shortest possible encoding for any
+   given value is valid.  For instance, the character 07C0 could be
+   encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
+   FC 80 80 80 9F 80.  Only the first is valid.
+
+   An implementation note: the transformation from UTF-16 to UTF-8, or
+   vice versa, is easiest done by using UTF-32 as an intermediary.  */
+
+/* Internal primitives which go from a UTF-8 byte stream to native-endian
+   UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
+   operation in several places below.  */
+static inline int
+one_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
+		     cppchar_t *cp)
 {
-  iconv_t dsc;
+  static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01 };
+  static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+  
+  cppchar_t c;
+  const uchar *inbuf = *inbufp;
+  size_t nbytes, i;
+  /* On success store the character in *CP, advance the input, return 0.  */
+  if (*inbytesleftp < 1)
+    return EINVAL;  /* No input at all.  */
+  /* Plain ASCII is a single byte holding its own value.  */
+  c = *inbuf;
+  if (c < 0x80)
+    {
+      *cp = c;
+      *inbytesleftp -= 1;
+      *inbufp += 1;
+      return 0;
+    }
+
+  /* The number of leading 1-bits in the first byte is the sequence length N;
+     masks[N-1] selects that byte's payload bits, patns[N-1] its prefix.  */
+  for (nbytes = 2; nbytes < 7; nbytes++)
+    if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
+      goto found;
+  return EILSEQ;  /* Not a valid lead byte.  */
+ found:
+  /* A sequence cut short by the end of input is incomplete, not invalid.  */
+  if (*inbytesleftp < nbytes)
+    return EINVAL;
 
-  if (!strcmp (to, from))
-    return (iconv_t) -1;
+  c = (c & masks[nbytes-1]);
+  inbuf++;
+  for (i = 1; i < nbytes; i++)
+    {
+      cppchar_t n = *inbuf++;
+      if ((n & 0xC0) != 0x80)
+	return EILSEQ;
+      c = ((c << 6) + (n & 0x3F));
+    }
+
+  /* Make sure the shortest possible encoding was used.  */
+  if (c <=      0x7F && nbytes > 1) return EILSEQ;
+  if (c <=     0x7FF && nbytes > 2) return EILSEQ;
+  if (c <=    0xFFFF && nbytes > 3) return EILSEQ;
+  if (c <=  0x1FFFFF && nbytes > 4) return EILSEQ;
+  if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
+
+  /* Make sure the character is valid.  */
+  if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
+
+  *cp = c;
+  *inbufp = inbuf;
+  *inbytesleftp -= nbytes;
+  return 0;
+}
+
+static inline int
+one_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp)
+{
+  static const uchar masks[6] =  { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+  static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
+  size_t nbytes;
+  uchar buf[6], *p = &buf[6];
+  uchar *outbuf = *outbufp;
 
-  dsc = iconv_open (to, from);
-  if (dsc == (iconv_t) -1)
+  nbytes = 1;
+  if (c < 0x80)
+    *--p = c;
+  else
+    {
+      do
+	{
+	  *--p = ((c & 0x3F) | 0x80);
+	  c >>= 6;
+	  nbytes++;
+	}
+      while (c >= 0x3F || (c & limits[nbytes-1]));
+      *--p = (c | masks[nbytes-1]);
+    }
+
+  if (*outbytesleftp < nbytes)
+    return E2BIG;
+
+  while (p < &buf[6])
+    *outbuf++ = *p++;
+  *outbytesleftp -= nbytes;
+  *outbufp = outbuf;
+  return 0;
+}
+
+/* The following four functions transform one character between the two
+   encodings named in the function name.  All have the signature
+   int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
+           uchar **outbufp, size_t *outbytesleftp)
+
+   BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
+   interpreted as a boolean indicating whether big-endian or
+   little-endian encoding is to be used for the member of the pair
+   that is not UTF-8.
+
+   INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
+   do for iconv.
+
+   The return value is either 0 for success, or an errno value for
+   failure, which may be E2BIG (need more space), EILSEQ (ill-formed
+   input sequence), or EINVAL (incomplete input sequence).  */
+   
+static inline int
+one_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
+		   uchar **outbufp, size_t *outbytesleftp)
+{
+  uchar *outbuf;
+  cppchar_t s;
+  int rval;
+
+  /* Check for space first, since we know exactly how much we need.  */
+  if (*outbytesleftp < 4)
+    return E2BIG;
+
+  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
+  if (rval)
+    return rval;
+
+  outbuf = *outbufp;
+  outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
+  outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
+  outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
+  outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
+
+  *outbufp += 4;
+  *outbytesleftp -= 4;
+  return 0;
+}
+
+static inline int
+one_utf32_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
+		   uchar **outbufp, size_t *outbytesleftp)
+{
+  cppchar_t s;
+  int rval;
+  const uchar *inbuf;
+  /* A UTF-32 character is always exactly four bytes; fewer is truncation.  */
+  if (*inbytesleftp < 4)
+    return EINVAL;
+
+  inbuf = *inbufp;
+  /* Cast first: a uchar promotes to int, and 0x80 << 24 overflows it.  */
+  s  = (cppchar_t) inbuf[bigend ? 0 : 3] << 24;
+  s += (cppchar_t) inbuf[bigend ? 1 : 2] << 16;
+  s += (cppchar_t) inbuf[bigend ? 2 : 1] << 8;
+  s += inbuf[bigend ? 3 : 0];
+  /* Valid range is 0-7FFFFFFF inclusive, minus the surrogate block.  */
+  if (s > 0x7FFFFFFF || (s >= 0xD800 && s <= 0xDFFF))
+    return EILSEQ;
+  /* If the output does not fit, the input pointers are left untouched.  */
+  rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
+  if (rval)
+    return rval;
+
+  *inbufp += 4;
+  *inbytesleftp -= 4;
+  return 0;
+}
+
+static inline int
+one_utf8_to_utf16 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
+		   uchar **outbufp, size_t *outbytesleftp)
+{
+  int rval;
+  cppchar_t s;
+  const uchar *save_inbuf = *inbufp;
+  size_t save_inbytesleft = *inbytesleftp;
+  uchar *outbuf = *outbufp;
+  /* Decode first; on any later failure the input position is restored.  */
+  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
+  if (rval)
+    return rval;
+  /* UTF-16 cannot represent anything above 0010FFFF.  */
+  if (s > 0x0010FFFF)
+    {
+      *inbufp = save_inbuf;
+      *inbytesleftp = save_inbytesleft;
+      return EILSEQ;
+    }
+  /* Everything up to and including FFFF is a single two-byte unit.  */
+  if (s <= 0xFFFF)
+    {
+      if (*outbytesleftp < 2)
+	{
+	  *inbufp = save_inbuf;
+	  *inbytesleftp = save_inbytesleft;
+	  return E2BIG;
+	}
+      outbuf[bigend ? 1 : 0] = (s & 0x00FF);
+      outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
+
+      *outbufp += 2;
+      *outbytesleftp -= 2;
+      return 0;
+    }
+  else
+    {
+      cppchar_t hi, lo;
+
+      if (*outbytesleftp < 4)
+	{
+	  *inbufp = save_inbuf;
+	  *inbytesleftp = save_inbytesleft;
+	  return E2BIG;
+	}
+
+      hi = (s - 0x10000) / 0x400 + 0xD800;
+      lo = (s - 0x10000) % 0x400 + 0xDC00;
+
+      /* Even if we are little-endian, put the high surrogate first.
+	 ??? Matches practice?  */
+      outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
+      outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
+      outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
+      outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
+
+      *outbufp += 4;
+      *outbytesleftp -= 4;
+      return 0;
+    }
+}
+
+static inline int
+one_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
+		   uchar **outbufp, size_t *outbytesleftp)
+{
+  cppchar_t s;
+  const uchar *inbuf = *inbufp;
+  int rval;
+
+  if (*inbytesleftp < 2)
+    return EINVAL;
+  s  = inbuf[bigend ? 0 : 1] << 8;
+  s += inbuf[bigend ? 1 : 0];
+
+  /* Low surrogate without immediately preceding high surrogate is invalid.  */
+  if (s >= 0xDC00 && s <= 0xDFFF)
+    return EILSEQ;
+  /* High surrogate must have a following low surrogate.  */
+  else if (s >= 0xD800 && s <= 0xDBFF)
+    {
+      cppchar_t hi = s, lo;
+      if (*inbytesleftp < 4)
+	return EINVAL;
+
+      lo  = inbuf[bigend ? 2 : 3] << 8;
+      lo += inbuf[bigend ? 3 : 2];
+
+      if (lo < 0xDC00 || lo > 0xDFFF)
+	return EILSEQ;
+
+      s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
+    }
+
+  rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
+  if (rval)
+    return rval;
+
+  /* Success - update the input pointers (one_cppchar_to_utf8 has done
+     the output pointers for us).  */
+  if (s <= 0xFFFF)
+    {
+      *inbufp += 2;
+      *inbytesleftp -= 2;
+    }
+  else
+    {
+      *inbufp += 4;
+      *inbytesleftp -= 4;
+    }
+  return 0;
+}
+
+/* Helper routine for the next few functions.  The 'const' on
+   one_conversion means that we promise not to modify what function is
+   pointed to, which lets the inliner see through it. */
+
+static inline bool
+conversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *,
+					     uchar **, size_t *),
+		 iconv_t cd, const uchar *from, size_t flen, struct strbuf *to)
+{
+  const uchar *inbuf;
+  uchar *outbuf;
+  size_t inbytesleft, outbytesleft;
+  int rval;
+
+  inbuf = from;
+  inbytesleft = flen;
+  outbuf = to->text + to->len;
+  outbytesleft = to->asize - to->len;
+
+  for (;;)
+    {
+      do
+	rval = one_conversion (cd, &inbuf, &inbytesleft,
+			       &outbuf, &outbytesleft);
+      while (inbytesleft && !rval);
+
+      if (__builtin_expect (inbytesleft == 0, 1))
+	{
+	  to->len = to->asize - outbytesleft;
+	  return true;
+	}
+      if (rval != E2BIG)
+	{
+	  errno = rval;
+	  return false;
+	}
+
+      outbytesleft += OUTBUF_BLOCK_SIZE;
+      to->asize += OUTBUF_BLOCK_SIZE;
+      to->text = xrealloc (to->text, to->asize);
+      outbuf = to->text + to->asize - outbytesleft;
+    }
+}
+		 
+
+/* These functions convert entire strings between character sets.
+   They all have the signature
+
+   bool (*)(iconv_t cd, const uchar *from, size_t flen, struct strbuf *to);
+
+   The input string FROM is converted as specified by the function
+   name plus the iconv descriptor CD (which may be fake), and the
+   result appended to TO.  On any error, false is returned, otherwise true.  */
+
+/* These four use the custom conversion code above.  */
+static bool
+convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
+		    struct strbuf *to)
+{
+  return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
+}
+
+static bool
+convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
+		    struct strbuf *to)
+{
+  return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
+}
+
+static bool
+convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
+		    struct strbuf *to)
+{
+  return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
+}
+
+static bool
+convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
+		    struct strbuf *to)
+{
+  return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
+}
+
+/* Identity conversion, used when we have no alternative.  */
+static bool
+convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
+		       const uchar *from, size_t flen, struct strbuf *to)
+{
+  if (to->len + flen > to->asize)
+    {
+      to->asize = to->len + flen;
+      to->text = xrealloc (to->text, to->asize);
+    }
+  memcpy (to->text + to->len, from, flen);
+  to->len += flen;
+  return true;
+}
+
+/* And this one uses the system iconv primitive.  It's a little
+   different, since iconv's interface is a little different.  */
+
+static bool
+convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
+		     struct strbuf *to)
+{
+  ICONV_CONST char *inbuf;
+  char *outbuf;
+  size_t inbytesleft, outbytesleft;
+
+  /* Reset conversion descriptor and check that it is valid.  */
+  if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
+    return false;
+
+  inbuf = (ICONV_CONST char *)from;
+  inbytesleft = flen;
+  outbuf = (char *)to->text + to->len;
+  outbytesleft = to->asize - to->len;
+
+  for (;;)
+    {
+      iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
+      if (__builtin_expect (inbytesleft == 0, 1))
+	{
+	  to->len = to->asize - outbytesleft;
+	  return true;
+	}
+      if (errno != E2BIG)
+	return false;
+
+      outbytesleft += OUTBUF_BLOCK_SIZE;
+      to->asize += OUTBUF_BLOCK_SIZE;
+      to->text = xrealloc (to->text, to->asize);
+      outbuf = (char *)to->text + to->asize - outbytesleft;
+    }
+}
+
+/* Arrange for the above custom conversion logic to be used automatically
+   when conversion between a suitable pair of character sets is requested.  */
+
+#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
+   CONVERTER.func (CONVERTER.cd, FROM, FLEN, TO)
+
+struct conversion
+{
+  const char *pair;
+  convert_f func;
+  iconv_t fake_cd;
+};
+static const struct conversion conversion_tab[] = {
+  { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
+  { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
+  { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
+  { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
+  { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
+  { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
+  { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
+  { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
+};
+
+/* Subroutine of cpp_init_iconv: initialize and return a
+   cset_converter structure for conversion from FROM to TO.  If
+   iconv_open() fails, issue an error and return an identity
+   converter.  Silently return an identity converter if FROM and TO
+   are identical.  */
+static struct cset_converter
+init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
+{
+  struct cset_converter ret;
+  char *pair;
+  size_t i;
+  
+  if (!strcasecmp (to, from))
+    {
+      ret.func = convert_no_conversion;
+      ret.cd = (iconv_t) -1;
+      return ret;
+    }
+
+  pair = alloca(strlen(to) + strlen(from) + 2);
+
+  strcpy(pair, from);
+  strcat(pair, "/");
+  strcat(pair, to);
+  for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
+    if (!strcasecmp (pair, conversion_tab[i].pair))
+      {
+	ret.func = conversion_tab[i].func;
+	ret.cd = conversion_tab[i].fake_cd;
+	return ret;
+      }
+
+  /* No custom converter - try iconv.  */
+  ret.func = convert_using_iconv;
+  ret.cd = iconv_open (to, from);
+
+  if (ret.cd == (iconv_t) -1)
     {
       if (errno == EINVAL)
 	cpp_error (pfile, DL_ERROR, /* XXX should be DL_SORRY */
@@ -127,8 +648,10 @@ init_iconv_desc (cpp_reader *pfile, cons
 		   from, to);
       else
 	cpp_errno (pfile, DL_ERROR, "iconv_open");
+
+      ret.func = convert_no_conversion;
     }
-  return dsc;
+  return ret;
 }
 
 /* If charset conversion is requested, initialize iconv(3) descriptors
@@ -146,9 +669,9 @@ cpp_init_iconv (cpp_reader *pfile)
   bool be = CPP_OPTION (pfile, bytes_big_endian);
 
   if (CPP_OPTION (pfile, wchar_precision) >= 32)
-    default_wcset = be ? "UCS-4BE" : "UCS-4LE";
+    default_wcset = be ? "UTF-32BE" : "UTF-32LE";
   else if (CPP_OPTION (pfile, wchar_precision) >= 16)
-    default_wcset = be ? "UCS-2BE" : "UCS-2LE";
+    default_wcset = be ? "UTF-16BE" : "UTF-16LE";
   else
     /* This effectively means that wide strings are not supported,
        so don't do any conversion at all.  */
@@ -181,67 +704,13 @@ _cpp_destroy_iconv (cpp_reader *pfile)
 {
   if (HAVE_ICONV)
     {
-      if (pfile->narrow_cset_desc != (iconv_t) -1)
-	iconv_close (pfile->narrow_cset_desc);
-      if (pfile->wide_cset_desc != (iconv_t) -1)
-	iconv_close (pfile->wide_cset_desc);
+      if (pfile->narrow_cset_desc.func == convert_using_iconv)
+	iconv_close (pfile->narrow_cset_desc.cd);
+      if (pfile->wide_cset_desc.func == convert_using_iconv)
+	iconv_close (pfile->wide_cset_desc.cd);
     }
 }
 
-/* iconv(3) utility wrapper.  Convert the string FROM, of length FLEN,
-   according to the iconv descriptor CD.  The result is appended to
-   the string buffer TO.  If DESC is (iconv_t)-1 or iconv is not
-   available, the string is simply copied into TO.
-
-   Returns true on success, false on error.  */
-
-static bool
-convert_cset (iconv_t cd, const uchar *from, size_t flen, struct strbuf *to)
-{
-  if (!HAVE_ICONV || cd == (iconv_t)-1)
-    {
-      if (to->len + flen > to->asize)
-	{
-	  to->asize = to->len + flen;
-	  to->text = xrealloc (to->text, to->asize);
-	}
-      memcpy (to->text + to->len, from, flen);
-      to->len += flen;
-      return true;
-    }
-  else
-    {
-      ICONV_CONST char *inbuf;
-      char *outbuf;
-      size_t inbytesleft, outbytesleft;
-
-      /* Reset conversion descriptor and check that it is valid.  */
-      if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
-	return false;
-
-      inbuf = (ICONV_CONST char *)from;
-      inbytesleft = flen;
-      outbuf = (char *)to->text + to->len;
-      outbytesleft = to->asize - to->len;
-
-      for (;;)
-	{
-	  iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
-	  if (__builtin_expect (inbytesleft == 0, 1))
-	    {
-	      to->len = to->asize - outbytesleft;
-	      return true;
-	    }
-	  if (errno != E2BIG)
-	    return false;
-
-	  outbytesleft += OUTBUF_BLOCK_SIZE;
-	  to->asize += OUTBUF_BLOCK_SIZE;
-	  to->text = xrealloc (to->text, to->asize);
-	  outbuf = (char *)to->text + to->asize - outbytesleft;
-	}
-    }
-}
 
 /* Utility routine that computes a mask of the form 0000...111... with
    WIDTH 1-bits.  */
@@ -390,15 +859,6 @@ _cpp_valid_ucn (cpp_reader *pfile, const
    "universal character %.*s is not valid at the start of an identifier",
 		   (int) (str - base), base);
     }
-  /* We don't accept UCNs if iconv is not available or will not
-     convert to the target wide character set.  */
-  else if (!HAVE_ICONV || pfile->wide_cset_desc == (iconv_t) -1)
-    {
-      /* XXX should be DL_SORRY */
-      cpp_error (pfile, DL_ERROR,
-	"universal character names are not supported in this configuration");
-    }
-
 
   if (result == 0)
     result = 1;
@@ -408,58 +868,31 @@ _cpp_valid_ucn (cpp_reader *pfile, const
 
 /* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
    it to the execution character set and write the result into TBUF.
-   An advanced pointer is returned.  Issues all relevant diagnostics.
-
-   UTF-8 encoding looks like this:
-
-   value range	       encoded as
-   00000000-0000007F   0xxxxxxx
-   00000080-000007FF   110xxxxx 10xxxxxx
-   00000800-0000FFFF   1110xxxx 10xxxxxx 10xxxxxx
-   00010000-001FFFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-   00200000-03FFFFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
-   04000000-7FFFFFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+   An advanced pointer is returned.  Issues all relevant diagnostics.  */
 
-   Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
-   which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
-   never occur.  Note also that any value that can be encoded by a
-   given row of the table can also be encoded by all successive rows,
-   but this is not done; only the shortest possible encoding for any
-   given value is valid.  For instance, the character 07C0 could be
-   encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
-   FC 80 80 80 9F 80.  Only the first is valid.  */
 
 static const uchar *
 convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
 	     struct strbuf *tbuf, bool wide)
 {
-  int nbytes;
-  uchar buf[6], *p = &buf[6];
-  static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
   cppchar_t ucn;
+  uchar buf[6];
+  uchar *bufp = buf;
+  size_t bytesleft = 6;
+  int rval;
+  struct cset_converter cvt
+    = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
 
-  from++; /* skip u/U */
+  from++;  /* skip u/U */
   ucn = _cpp_valid_ucn (pfile, &from, limit, 0);
-  if (!ucn)
-    return from;
 
-  nbytes = 1;
-  if (ucn < 0x80)
-    *--p = ucn;
-  else
+  rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
+  if (rval)
     {
-      do
-	{
-	  *--p = ((ucn & 0x3F) | 0x80);
-	  ucn >>= 6;
-	  nbytes++;
-	}
-      while (ucn >= 0x3F || (ucn & masks[nbytes-1]));
-      *--p = (ucn | masks[nbytes-1]);
+      errno = rval;
+      cpp_errno (pfile, DL_ERROR, "converting UCN to source character set");
     }
-
-  if (!convert_cset (wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc,
-		     p, nbytes, tbuf))
+  else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
     cpp_errno (pfile, DL_ERROR, "converting UCN to execution character set");
 
   return from;
@@ -615,6 +1048,8 @@ convert_escape (cpp_reader *pfile, const
 #endif
 
   uchar c;
+  struct cset_converter cvt
+    = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
 
   c = *from;
   switch (c)
@@ -676,8 +1111,7 @@ convert_escape (cpp_reader *pfile, const
     }
 
   /* Now convert what we have to the execution character set.  */
-  if (!convert_cset (wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc,
-		     &c, 1, tbuf))
+  if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
     cpp_errno (pfile, DL_ERROR,
 	       "converting escape sequence to execution character set");
 
@@ -697,7 +1131,8 @@ cpp_interpret_string (cpp_reader *pfile,
   struct strbuf tbuf;
   const uchar *p, *base, *limit;
   size_t i;
-  iconv_t cd = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
+  struct cset_converter cvt
+    = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
 
   tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
   tbuf.text = xmalloc (tbuf.asize);
@@ -719,7 +1154,7 @@ cpp_interpret_string (cpp_reader *pfile,
 	    {
 	      /* We have a run of normal characters; these can be fed
 		 directly to convert_cset.  */
-	      if (!convert_cset (cd, base, p - base, &tbuf))
+	      if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
 		goto fail;
 	    }
 	  if (p == limit)
@@ -741,6 +1176,25 @@ cpp_interpret_string (cpp_reader *pfile,
   free (tbuf.text);
   return false;
 }
+
+/* Subroutine of do_line and do_linemarker.  Convert escape sequences
+   in a string, but do not perform character set conversion.  */
+bool
+_cpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *in,
+				   cpp_string *out)
+{
+  struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
+  bool retval;
+
+  pfile->narrow_cset_desc.func = convert_no_conversion;
+  pfile->narrow_cset_desc.cd = (iconv_t) -1;
+
+  retval = cpp_interpret_string (pfile, in, 1, out, false);
+
+  pfile->narrow_cset_desc = save_narrow_cset_desc;
+  return retval;
+}
+
 
 /* Subroutine of cpp_interpret_charconst which performs the conversion
    to a number, for narrow strings.  STR is the string structure returned
===================================================================
Index: cpphash.h
--- cpphash.h	5 Jul 2003 00:23:47 -0000	1.192
+++ cpphash.h	10 Jul 2003 16:07:19 -0000
@@ -35,6 +35,15 @@ typedef int iconv_t;  /* dummy */
 struct directive;		/* Deliberately incomplete.  */
 struct pending_option;
 struct op;
+struct strbuf;
+
+typedef bool (*convert_f) (iconv_t, const unsigned char *, size_t,
+			   struct strbuf *);
+struct cset_converter
+{
+  convert_f func;
+  iconv_t cd;
+};
 
 #ifndef HAVE_UCHAR
 typedef unsigned char uchar;
@@ -369,14 +378,13 @@ struct cpp_reader
   unsigned char *macro_buffer;
   unsigned int macro_buffer_len;
 
-  /* Iconv descriptor for converting from the source character set
-     to the execution character set.  (iconv_t)-1 for no conversion.  */
-  iconv_t narrow_cset_desc;
-
-  /* Iconv descriptor for converting from the execution character set
-     to the wide execution character set.  (iconv_t)-1 for no conversion
-     other than zero-extending each character to the width of wchar_t.  */
-  iconv_t wide_cset_desc;
+  /* Descriptor for converting from the source character set to the
+     execution character set.  */
+  struct cset_converter narrow_cset_desc;
+
+  /* Descriptor for converting from the source character set to the
+     wide execution character set.  */
+  struct cset_converter wide_cset_desc;
 
   /* Tree of other included files.  See cppfiles.c.  */
   struct splay_tree_s *all_include_files;
@@ -555,8 +563,11 @@ extern uchar *_cpp_copy_replacement_text
 extern size_t _cpp_replacement_text_len (const cpp_macro *);
 
 /* In cppcharset.c.  */
-cppchar_t _cpp_valid_ucn (cpp_reader *, const uchar **, const uchar *, int);
-void _cpp_destroy_iconv (cpp_reader *);
+extern cppchar_t _cpp_valid_ucn (cpp_reader *, const uchar **,
+				 const uchar *, int);
+extern void _cpp_destroy_iconv (cpp_reader *);
+extern bool _cpp_interpret_string_notranslate (cpp_reader *, const cpp_string *,
+					       cpp_string *);
 
 /* Utility routines and macros.  */
 #define DSC(str) (const uchar *)str, sizeof str - 1
===================================================================
Index: cpplib.c
--- cpplib.c	5 Jul 2003 00:23:52 -0000	1.341
+++ cpplib.c	10 Jul 2003 16:07:19 -0000
@@ -733,21 +733,6 @@ strtoul_for_line (const uchar *str, unsi
   return 0;
 }
 
-/* Subroutine of do_line and do_linemarker.  Convert escape sequences
-   in a string, but do not perform character set conversion.  */
-static bool
-interpret_string_notranslate (cpp_reader *pfile, const cpp_string *in,
-			      cpp_string *out)
-{
-  iconv_t save_narrow_cset_desc = pfile->narrow_cset_desc;
-  bool retval;
-
-  pfile->narrow_cset_desc = (iconv_t) -1;
-  retval = cpp_interpret_string (pfile, in, 1, out, false);
-  pfile->narrow_cset_desc = save_narrow_cset_desc;
-  return retval;
-}
-
 /* Interpret #line command.
    Note that the filename string (if any) is a true string constant
    (escapes are interpreted), unlike in #line.  */
@@ -780,7 +765,7 @@ do_line (cpp_reader *pfile)
   if (token->type == CPP_STRING)
     {
       cpp_string s = { 0, 0 };
-      if (interpret_string_notranslate (pfile, &token->val.str, &s))
+      if (_cpp_interpret_string_notranslate (pfile, &token->val.str, &s))
 	new_file = (const char *)s.text;
       check_eol (pfile);
     }
@@ -829,7 +814,7 @@ do_linemarker (cpp_reader *pfile)
   if (token->type == CPP_STRING)
     {
       cpp_string s = { 0, 0 };
-      if (interpret_string_notranslate (pfile, &token->val.str, &s))
+      if (_cpp_interpret_string_notranslate (pfile, &token->val.str, &s))
 	new_file = (const char *)s.text;
       
       new_sysp = 0;
Follow-Ups:
- Re: Iconv fallbacks - need testers
  - From: Kaveh R. Ghazi
- Re: Iconv fallbacks - need testers
  - From: Hans-Peter Nilsson
- Re: Iconv fallbacks - need testers
  - From: Zack Weinberg
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]