This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

cpplib: Preliminary implementation of UCNs


This does the CPP side of UCN implementation, at least until we decide
if we want to permit non-ASCII chars in identifiers and how to handle
them.

It handles UCNs in identifiers, with the valid UCNs of C++98 and C99
depending on the language (with the typo fix for C++98).  Inputting the
C99 stuff was awfully dull.  The valid ranges differ considerably between
C++ and C; IMO that's dumb, and in future it would be better to accept the
same set for both (it appears Comeau does this, accepting the union of the
two).  We accept the union of the two unless -pedantic is given.
As C99 requires, its UCN digits are not permitted to start identifiers.
If we get serious about UCN support I suggest we use 2 generated bitmaps
to do most of the work of detecting a valid UCN, rather than traversing
the mile-long chain of "ifs".

The patch also correctly handles token pasting and paste avoidance.

UCNs are presently stored in cpplib in the form they appear in the
source file.  This is because it's easy for -E output, and I'm not sure
what we want to do as far as mangling is concerned, and I don't think
it's a good idea to make \u00aa and \u00AA the same identifier, never
mind making it the same as the character itself.  I can't find anything
in the standard that requires making them the same.  Comeau doesn't make
them the same from what I can see.

The UCN code is unified in a new file cppcharset.c, used by identifiers,
charconsts and string literals.  Simple tests for new functionality
are included; I'm not going to pretend they're comprehensive.

I expect there is little consensus about how we develop this going
forwards; certainly there wasn't the last time this was discussed.  I
don't think that should prevent this going in as a starting point.

Neil.

	* Makefile.in (LIBCPP_OBJS): Add cppcharset.o.
	(cppcharset.o): New target.
	* c-lex.c (is_extended_char): Move to cppcharset.c.
	(utf8_extend_token): Delete.
	* cppcharset.c: New file.
	* cpphash.h (_cpp_valid_ucn): New.
	* cpplex.c (lex_identifier): Update prototype.
	(continues_identifier_p): Rename forms_identifier_p.  Handle UCN
	escapes.
	(maybe_read_ucs): Rename maybe_read_ucn.  Update to use code
	in cppcharset.c.
	(lex_number, lex_identifier, cpp_parse_escape): Update.
	(_cpp_lex_direct): Update to handle UCNs.
	(cpp_avoid_paste): Don't paste to form a UCN.
testsuite:
	* ucn-1.c: New tests.
	* ucs.c: Update diagnostic messages.

Index: Makefile.in
===================================================================
RCS file: /cvs/gcc/gcc/gcc/Makefile.in,v
retrieving revision 1.1036
diff -u -p -r1.1036 Makefile.in
--- Makefile.in	19 Apr 2003 18:57:23 -0000	1.1036
+++ Makefile.in	19 Apr 2003 23:35:00 -0000
@@ -2320,7 +2320,7 @@ PREPROCESSOR_DEFINES = \
   @TARGET_SYSTEM_ROOT_DEFINE@
 
 LIBCPP_OBJS =	cpplib.o cpplex.o cppmacro.o cppexp.o cppfiles.o cpptrad.o \
-		cpphash.o cpperror.o cppinit.o \
+		cpphash.o cpperror.o cppinit.o cppcharset.o \
 		hashtable.o line-map.o mkdeps.o mbchar.o cpppch.o
 
 LIBCPP_DEPS =	$(CPPLIB_H) cpphash.h line-map.h hashtable.h intl.h \
@@ -2333,6 +2333,7 @@ libcpp.a: $(LIBCPP_OBJS)
 	$(AR) $(AR_FLAGS) libcpp.a $(LIBCPP_OBJS)
 	-$(RANLIB) libcpp.a
 
+cppcharset.o: cppcharset.c $(LIBCPP_DEPS)
 cpperror.o: cpperror.c $(LIBCPP_DEPS)
 cppexp.o:   cppexp.c   $(LIBCPP_DEPS)
 cpplex.o:   cpplex.c   $(LIBCPP_DEPS) mbchar.h
Index: c-lex.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/c-lex.c,v
retrieving revision 1.199
diff -u -p -r1.199 c-lex.c
--- c-lex.c	15 Mar 2003 12:18:44 -0000	1.199
+++ c-lex.c	19 Apr 2003 23:35:00 -0000
@@ -322,315 +322,6 @@ cb_undef (pfile, line, node)
   (*debug_hooks->undef) (SOURCE_LINE (map, line),
 			 (const char *) NODE_NAME (node));
 }
-
-#if 0 /* not yet */
-/* Returns nonzero if C is a universal-character-name.  Give an error if it
-   is not one which may appear in an identifier, as per [extendid].
-
-   Note that extended character support in identifiers has not yet been
-   implemented.  It is my personal opinion that this is not a desirable
-   feature.  Portable code cannot count on support for more than the basic
-   identifier character set.  */
-
-static inline int
-is_extended_char (c)
-     int c;
-{
-#ifdef TARGET_EBCDIC
-  return 0;
-#else
-  /* ASCII.  */
-  if (c < 0x7f)
-    return 0;
-
-  /* None of the valid chars are outside the Basic Multilingual Plane (the
-     low 16 bits).  */
-  if (c > 0xffff)
-    {
-      error ("universal-character-name '\\U%08x' not valid in identifier", c);
-      return 1;
-    }
-  
-  /* Latin */
-  if ((c >= 0x00c0 && c <= 0x00d6)
-      || (c >= 0x00d8 && c <= 0x00f6)
-      || (c >= 0x00f8 && c <= 0x01f5)
-      || (c >= 0x01fa && c <= 0x0217)
-      || (c >= 0x0250 && c <= 0x02a8)
-      || (c >= 0x1e00 && c <= 0x1e9a)
-      || (c >= 0x1ea0 && c <= 0x1ef9))
-    return 1;
-
-  /* Greek */
-  if ((c == 0x0384)
-      || (c >= 0x0388 && c <= 0x038a)
-      || (c == 0x038c)
-      || (c >= 0x038e && c <= 0x03a1)
-      || (c >= 0x03a3 && c <= 0x03ce)
-      || (c >= 0x03d0 && c <= 0x03d6)
-      || (c == 0x03da)
-      || (c == 0x03dc)
-      || (c == 0x03de)
-      || (c == 0x03e0)
-      || (c >= 0x03e2 && c <= 0x03f3)
-      || (c >= 0x1f00 && c <= 0x1f15)
-      || (c >= 0x1f18 && c <= 0x1f1d)
-      || (c >= 0x1f20 && c <= 0x1f45)
-      || (c >= 0x1f48 && c <= 0x1f4d)
-      || (c >= 0x1f50 && c <= 0x1f57)
-      || (c == 0x1f59)
-      || (c == 0x1f5b)
-      || (c == 0x1f5d)
-      || (c >= 0x1f5f && c <= 0x1f7d)
-      || (c >= 0x1f80 && c <= 0x1fb4)
-      || (c >= 0x1fb6 && c <= 0x1fbc)
-      || (c >= 0x1fc2 && c <= 0x1fc4)
-      || (c >= 0x1fc6 && c <= 0x1fcc)
-      || (c >= 0x1fd0 && c <= 0x1fd3)
-      || (c >= 0x1fd6 && c <= 0x1fdb)
-      || (c >= 0x1fe0 && c <= 0x1fec)
-      || (c >= 0x1ff2 && c <= 0x1ff4)
-      || (c >= 0x1ff6 && c <= 0x1ffc))
-    return 1;
-
-  /* Cyrillic */
-  if ((c >= 0x0401 && c <= 0x040d)
-      || (c >= 0x040f && c <= 0x044f)
-      || (c >= 0x0451 && c <= 0x045c)
-      || (c >= 0x045e && c <= 0x0481)
-      || (c >= 0x0490 && c <= 0x04c4)
-      || (c >= 0x04c7 && c <= 0x04c8)
-      || (c >= 0x04cb && c <= 0x04cc)
-      || (c >= 0x04d0 && c <= 0x04eb)
-      || (c >= 0x04ee && c <= 0x04f5)
-      || (c >= 0x04f8 && c <= 0x04f9))
-    return 1;
-
-  /* Armenian */
-  if ((c >= 0x0531 && c <= 0x0556)
-      || (c >= 0x0561 && c <= 0x0587))
-    return 1;
-
-  /* Hebrew */
-  if ((c >= 0x05d0 && c <= 0x05ea)
-      || (c >= 0x05f0 && c <= 0x05f4))
-    return 1;
-
-  /* Arabic */
-  if ((c >= 0x0621 && c <= 0x063a)
-      || (c >= 0x0640 && c <= 0x0652)
-      || (c >= 0x0670 && c <= 0x06b7)
-      || (c >= 0x06ba && c <= 0x06be)
-      || (c >= 0x06c0 && c <= 0x06ce)
-      || (c >= 0x06e5 && c <= 0x06e7))
-    return 1;
-
-  /* Devanagari */
-  if ((c >= 0x0905 && c <= 0x0939)
-      || (c >= 0x0958 && c <= 0x0962))
-    return 1;
-
-  /* Bengali */
-  if ((c >= 0x0985 && c <= 0x098c)
-      || (c >= 0x098f && c <= 0x0990)
-      || (c >= 0x0993 && c <= 0x09a8)
-      || (c >= 0x09aa && c <= 0x09b0)
-      || (c == 0x09b2)
-      || (c >= 0x09b6 && c <= 0x09b9)
-      || (c >= 0x09dc && c <= 0x09dd)
-      || (c >= 0x09df && c <= 0x09e1)
-      || (c >= 0x09f0 && c <= 0x09f1))
-    return 1;
-
-  /* Gurmukhi */
-  if ((c >= 0x0a05 && c <= 0x0a0a)
-      || (c >= 0x0a0f && c <= 0x0a10)
-      || (c >= 0x0a13 && c <= 0x0a28)
-      || (c >= 0x0a2a && c <= 0x0a30)
-      || (c >= 0x0a32 && c <= 0x0a33)
-      || (c >= 0x0a35 && c <= 0x0a36)
-      || (c >= 0x0a38 && c <= 0x0a39)
-      || (c >= 0x0a59 && c <= 0x0a5c)
-      || (c == 0x0a5e))
-    return 1;
-
-  /* Gujarati */
-  if ((c >= 0x0a85 && c <= 0x0a8b)
-      || (c == 0x0a8d)
-      || (c >= 0x0a8f && c <= 0x0a91)
-      || (c >= 0x0a93 && c <= 0x0aa8)
-      || (c >= 0x0aaa && c <= 0x0ab0)
-      || (c >= 0x0ab2 && c <= 0x0ab3)
-      || (c >= 0x0ab5 && c <= 0x0ab9)
-      || (c == 0x0ae0))
-    return 1;
-
-  /* Oriya */
-  if ((c >= 0x0b05 && c <= 0x0b0c)
-      || (c >= 0x0b0f && c <= 0x0b10)
-      || (c >= 0x0b13 && c <= 0x0b28)
-      || (c >= 0x0b2a && c <= 0x0b30)
-      || (c >= 0x0b32 && c <= 0x0b33)
-      || (c >= 0x0b36 && c <= 0x0b39)
-      || (c >= 0x0b5c && c <= 0x0b5d)
-      || (c >= 0x0b5f && c <= 0x0b61))
-    return 1;
-
-  /* Tamil */
-  if ((c >= 0x0b85 && c <= 0x0b8a)
-      || (c >= 0x0b8e && c <= 0x0b90)
-      || (c >= 0x0b92 && c <= 0x0b95)
-      || (c >= 0x0b99 && c <= 0x0b9a)
-      || (c == 0x0b9c)
-      || (c >= 0x0b9e && c <= 0x0b9f)
-      || (c >= 0x0ba3 && c <= 0x0ba4)
-      || (c >= 0x0ba8 && c <= 0x0baa)
-      || (c >= 0x0bae && c <= 0x0bb5)
-      || (c >= 0x0bb7 && c <= 0x0bb9))
-    return 1;
-
-  /* Telugu */
-  if ((c >= 0x0c05 && c <= 0x0c0c)
-      || (c >= 0x0c0e && c <= 0x0c10)
-      || (c >= 0x0c12 && c <= 0x0c28)
-      || (c >= 0x0c2a && c <= 0x0c33)
-      || (c >= 0x0c35 && c <= 0x0c39)
-      || (c >= 0x0c60 && c <= 0x0c61))
-    return 1;
-
-  /* Kannada */
-  if ((c >= 0x0c85 && c <= 0x0c8c)
-      || (c >= 0x0c8e && c <= 0x0c90)
-      || (c >= 0x0c92 && c <= 0x0ca8)
-      || (c >= 0x0caa && c <= 0x0cb3)
-      || (c >= 0x0cb5 && c <= 0x0cb9)
-      || (c >= 0x0ce0 && c <= 0x0ce1))
-    return 1;
-
-  /* Malayalam */
-  if ((c >= 0x0d05 && c <= 0x0d0c)
-      || (c >= 0x0d0e && c <= 0x0d10)
-      || (c >= 0x0d12 && c <= 0x0d28)
-      || (c >= 0x0d2a && c <= 0x0d39)
-      || (c >= 0x0d60 && c <= 0x0d61))
-    return 1;
-
-  /* Thai */
-  if ((c >= 0x0e01 && c <= 0x0e30)
-      || (c >= 0x0e32 && c <= 0x0e33)
-      || (c >= 0x0e40 && c <= 0x0e46)
-      || (c >= 0x0e4f && c <= 0x0e5b))
-    return 1;
-
-  /* Lao */
-  if ((c >= 0x0e81 && c <= 0x0e82)
-      || (c == 0x0e84)
-      || (c == 0x0e87)
-      || (c == 0x0e88)
-      || (c == 0x0e8a)
-      || (c == 0x0e0d)
-      || (c >= 0x0e94 && c <= 0x0e97)
-      || (c >= 0x0e99 && c <= 0x0e9f)
-      || (c >= 0x0ea1 && c <= 0x0ea3)
-      || (c == 0x0ea5)
-      || (c == 0x0ea7)
-      || (c == 0x0eaa)
-      || (c == 0x0eab)
-      || (c >= 0x0ead && c <= 0x0eb0)
-      || (c == 0x0eb2)
-      || (c == 0x0eb3)
-      || (c == 0x0ebd)
-      || (c >= 0x0ec0 && c <= 0x0ec4)
-      || (c == 0x0ec6))
-    return 1;
-
-  /* Georgian */
-  if ((c >= 0x10a0 && c <= 0x10c5)
-      || (c >= 0x10d0 && c <= 0x10f6))
-    return 1;
-
-  /* Hiragana */
-  if ((c >= 0x3041 && c <= 0x3094)
-      || (c >= 0x309b && c <= 0x309e))
-    return 1;
-
-  /* Katakana */
-  if ((c >= 0x30a1 && c <= 0x30fe))
-    return 1;
-
-  /* Bopmofo */
-  if ((c >= 0x3105 && c <= 0x312c))
-    return 1;
-
-  /* Hangul */
-  if ((c >= 0x1100 && c <= 0x1159)
-      || (c >= 0x1161 && c <= 0x11a2)
-      || (c >= 0x11a8 && c <= 0x11f9))
-    return 1;
-
-  /* CJK Unified Ideographs */
-  if ((c >= 0xf900 && c <= 0xfa2d)
-      || (c >= 0xfb1f && c <= 0xfb36)
-      || (c >= 0xfb38 && c <= 0xfb3c)
-      || (c == 0xfb3e)
-      || (c >= 0xfb40 && c <= 0xfb41)
-      || (c >= 0xfb42 && c <= 0xfb44)
-      || (c >= 0xfb46 && c <= 0xfbb1)
-      || (c >= 0xfbd3 && c <= 0xfd3f)
-      || (c >= 0xfd50 && c <= 0xfd8f)
-      || (c >= 0xfd92 && c <= 0xfdc7)
-      || (c >= 0xfdf0 && c <= 0xfdfb)
-      || (c >= 0xfe70 && c <= 0xfe72)
-      || (c == 0xfe74)
-      || (c >= 0xfe76 && c <= 0xfefc)
-      || (c >= 0xff21 && c <= 0xff3a)
-      || (c >= 0xff41 && c <= 0xff5a)
-      || (c >= 0xff66 && c <= 0xffbe)
-      || (c >= 0xffc2 && c <= 0xffc7)
-      || (c >= 0xffca && c <= 0xffcf)
-      || (c >= 0xffd2 && c <= 0xffd7)
-      || (c >= 0xffda && c <= 0xffdc)
-      || (c >= 0x4e00 && c <= 0x9fa5))
-    return 1;
-
-  error ("universal-character-name '\\u%04x' not valid in identifier", c);
-  return 1;
-#endif
-}
-
-/* Add the UTF-8 representation of C to the token_buffer.  */
-
-static void
-utf8_extend_token (c)
-     int c;
-{
-  int shift, mask;
-
-  if      (c <= 0x0000007f)
-    {
-      extend_token (c);
-      return;
-    }
-  else if (c <= 0x000007ff)
-    shift = 6, mask = 0xc0;
-  else if (c <= 0x0000ffff)
-    shift = 12, mask = 0xe0;
-  else if (c <= 0x001fffff)
-    shift = 18, mask = 0xf0;
-  else if (c <= 0x03ffffff)
-    shift = 24, mask = 0xf8;
-  else
-    shift = 30, mask = 0xfc;
-
-  extend_token (mask | (c >> shift));
-  do
-    {
-      shift -= 6;
-      extend_token ((unsigned char) (0x80 | (c >> shift)));
-    }
-  while (shift);
-}
-#endif
 
 int
 c_lex (value)
Index: cppcharset.c
===================================================================
RCS file: cppcharset.c
diff -N cppcharset.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ cppcharset.c	19 Apr 2003 23:35:00 -0000
@@ -0,0 +1,591 @@
+/* CPP Library - charsets
+   Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003
+   Free Software Foundation, Inc.
+
+   Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "cpplib.h"
+#include "cpphash.h"
+
+static int ucn_valid_in_identifier PARAMS ((cpp_reader *, cppchar_t));
+
+/* [lex.charset]: The character designated by the universal character
+   name \UNNNNNNNN is that character whose character short name in
+   ISO/IEC 10646 is NNNNNNNN; the character designated by the
+   universal character name \uNNNN is that character whose character
+   short name in ISO/IEC 10646 is 0000NNNN.  If the hexadecimal value
+   for a universal character name is less than 0x20 or in the range
+   0x7F-0x9F (inclusive), or if the universal character name
+   designates a character in the basic source character set, then the
+   program is ill-formed.
+
+   *PSTR must be preceded by "\u" or "\U"; it is assumed that the
+   buffer end is delimited by a non-hex digit.  Returns zero, leaving
+   *PSTR untouched, only if UCNs are not part of the relevant standard
+   or the target is EBCDIC.
+
+   Otherwise a non-zero value is returned: the value of the UCN,
+   whether valid or invalid, or 1 if that value is zero.  Diagnostics
+   are emitted for incomplete and invalid UCNs.  PSTR is updated to
+   point one beyond the UCN, or to the first non-hex-digit character.
+
+   IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
+   an identifier, or 2 otherwise.
+*/
+
+cppchar_t
+_cpp_valid_ucn (pfile, pstr, identifier_pos)
+     cpp_reader *pfile;
+     const uchar **pstr;
+     int identifier_pos;
+{
+  cppchar_t result, c;
+  unsigned int length;
+  const uchar *str = *pstr;
+  const uchar *base = str - 2;
+
+  /* Only attempt to interpret a UCS for C++ and C99.  */
+  if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
+    return 0;
+
+  /* We don't accept UCNs for an EBCDIC target.  */
+  if (CPP_OPTION (pfile, EBCDIC))
+    return 0;
+
+  if (str[-1] == 'u')
+    length = 4;
+  else if (str[-1] == 'U')
+    length = 8;
+  else
+    abort ();
+
+  result = 0;
+  do
+    {
+      c = *str;
+      if (!ISXDIGIT (c))
+	break;
+      str++;
+      result = (result << 4) + hex_value (c);
+    }
+  while (--length);
+
+  *pstr = str;
+  if (length)
+    /* Fewer hex digits than the \u or \U form requires.  */
+    cpp_error (pfile, DL_ERROR, "incomplete universal character name %.*s",
+	       (int) (str - base), base);
+  /* The standard permits $, @ and ` to be specified as UCNs.  We use
+     hex escapes so that this also works with EBCDIC hosts.  */
+  else if ((result < 0xa0
+	    && (result != 0x24 && result != 0x40 && result != 0x60))
+	   || (result & 0x80000000)
+	   || (result >= 0xD800 && result <= 0xDFFF))
+    {
+      cpp_error (pfile, DL_ERROR, "%.*s is not a valid universal character",
+		 (int) (str - base), base);
+    }
+  else if (identifier_pos)
+    {
+      int validity = ucn_valid_in_identifier (pfile, result);
+
+      if (validity == 0)
+	cpp_error (pfile, DL_ERROR,
+		   "universal character %.*s is not valid in an identifier",
+		   (int) (str - base), base);
+      else if (validity == 2 && identifier_pos == 1)
+	cpp_error (pfile, DL_ERROR,
+   "universal character %.*s is not valid at the start of an identifier",
+		   (int) (str - base), base);
+    }
+
+  if (result == 0)
+    result = 1;
+
+  return result;
+}
+
+/* Returns 1 if C is valid anywhere in an identifier, 2 if valid except
+   as its first character (C99 digits), and 0 if not valid at all.
+   PFILE's c99, cplusplus and pedantic options select the ranges.  We
+   assume C has already gone through the checks of _cpp_valid_ucn.  */
+static int
+ucn_valid_in_identifier (pfile, c)
+     cpp_reader *pfile;
+     cppchar_t c;
+{
+  /* None of the valid chars are outside the Basic Multilingual Plane (the
+     low 16 bits).  */
+  if (c > 0xffff)
+    return 0;
+
+  if (CPP_OPTION (pfile, c99) || !CPP_PEDANTIC (pfile))
+    {
+      /* Latin.  */
+      if (c == 0x00aa || c == 0x00ba || c == 0x207f || c == 0x1e9b)
+	return 1;
+
+      /* Greek.  */
+      if (c == 0x0386)
+	return 1;
+
+      /* Cyrillic.  */
+      if (c == 0x040c)
+	return 1;
+
+      /* Hebrew.  */
+      if ((c >= 0x05b0 && c <= 0x05b9)
+	  || (c >= 0x05bb && c <= 0x05bd)
+	  || c == 0x05bf
+	  || (c >= 0x05c1 && c <= 0x05c2))
+	return 1;
+
+      /* Arabic.  */
+      if ((c >= 0x06d0 && c <= 0x06dc)
+	  || c == 0x06e8
+	  || (c >= 0x06ea && c <= 0x06ed))
+	return 1;
+
+      /* Devanagari */
+      if ((c >= 0x0901 && c <= 0x0903)
+	  || (c >= 0x093e && c <= 0x094d)
+	  || (c >= 0x0950 && c <= 0x0952)
+	  || c == 0x0963)
+	return 1;
+
+      /* Bengali */
+      if ((c >= 0x0981 && c <= 0x0983)
+	  || (c >= 0x09be && c <= 0x09c4)
+	  || (c >= 0x09c7 && c <= 0x09c8)
+	  || (c >= 0x09cb && c <= 0x09cd)
+	  || (c >= 0x09e2 && c <= 0x09e3))
+	return 1;
+
+      /* Gurmukhi */
+      if (c == 0x0a02
+	  || (c >= 0x0a3e && c <= 0x0a42)
+	  || (c >= 0x0a47 && c <= 0x0a48)
+	  || (c >= 0x0a4b && c <= 0x0a4d)
+	  || (c == 0x0a74))
+	return 1;
+
+      /* Gujarati */
+      if ((c >= 0x0a81 && c <= 0x0a83)
+	  || (c >= 0x0abd && c <= 0x0ac5)
+	  || (c >= 0x0ac7 && c <= 0x0ac9)
+	  || (c >= 0x0acb && c <= 0x0acd)
+	  || (c == 0x0ad0))
+	return 1;
+
+      /* Oriya */
+      if ((c >= 0x0b01 && c <= 0x0b03)
+	  || (c >= 0x0b3e && c <= 0x0b43)
+	  || (c >= 0x0b47 && c <= 0x0b48)
+	  || (c >= 0x0b4b && c <= 0x0b4d))
+	return 1;
+
+      /* Tamil */
+      if ((c >= 0x0b82 && c <= 0x0b83)
+	  || (c >= 0x0bbe && c <= 0x0bc2)
+	  || (c >= 0x0bc6 && c <= 0x0bc8)
+	  || (c >= 0x0bca && c <= 0x0bcd))
+	return 1;
+
+      /* Telugu */
+      if ((c >= 0x0c01 && c <= 0x0c03)
+	  || (c >= 0x0c3e && c <= 0x0c44)
+	  || (c >= 0x0c46 && c <= 0x0c48)
+	  || (c >= 0x0c4a && c <= 0x0c4d))
+	return 1;
+
+      /* Kannada */
+      if ((c >= 0x0c82 && c <= 0x0c83)
+	  || (c >= 0x0cbe && c <= 0x0cc4)
+	  || (c >= 0x0cc6 && c <= 0x0cc8)
+	  || (c >= 0x0cca && c <= 0x0ccd)
+	  || c == 0x0cde)
+	return 1;
+
+      /* Malayalam */
+      if ((c >= 0x0d02 && c <= 0x0d03)
+	  || (c >= 0x0d3e && c <= 0x0d43)
+	  || (c >= 0x0d46 && c <= 0x0d48)
+	  || (c >= 0x0d4a && c <= 0x0d4d))
+	return 1;
+
+      /* Thai */
+      if ((c >= 0x0e01 && c <= 0x0e3a)
+	  || (c >= 0x0e40 && c <= 0x0e5b))
+	return 1;
+
+      /* Lao */
+      if ((c >= 0x0ead && c <= 0x0eae)
+	  || (c >= 0x0eb0 && c <= 0x0eb9)
+	  || (c >= 0x0ebb && c <= 0x0ebd)
+	  || (c >= 0x0ec0 && c <= 0x0ec4)
+	  || c == 0x0ec6
+	  || (c >= 0x0ec8 && c <= 0x0ecd)
+	  || (c >= 0x0edc && c <= 0x0edd))
+	return 1;
+
+      /* Tibetan.  */
+      if (c == 0x0f00
+	  || (c >= 0x0f18 && c <= 0x0f19)
+	  || c == 0x0f35
+	  || c == 0x0f37
+	  || c == 0x0f39
+	  || (c >= 0x0f3e && c <= 0x0f47)
+	  || (c >= 0x0f49 && c <= 0x0f69)
+	  || (c >= 0x0f71 && c <= 0x0f84)
+	  || (c >= 0x0f86 && c <= 0x0f8b)
+	  || (c >= 0x0f90 && c <= 0x0f95)
+	  || c == 0x0f97
+	  || (c >= 0x0f99 && c <= 0x0fad)
+	  || (c >= 0x0fb1 && c <= 0x0fb7)
+	  || c == 0x0fb9)
+	return 1;
+
+      /* Katakana */
+      if ((c >= 0x30a1 && c <= 0x30f6)
+	  || (c >= 0x30fb && c <= 0x30fc))
+	return 1;
+
+      /* CJK Unified Ideographs.  */
+      if (c >= 0x4e00 && c <= 0x9fa5)
+	return 1;
+
+      /* Hangul.  */
+      if (c >= 0xac00 && c <= 0xd7a3)
+	return 1;
+
+      /* Digits.  */
+      if ((c >= 0x0660 && c <= 0x0669)
+	  || (c >= 0x06f0 && c <= 0x06f9)
+	  || (c >= 0x0966 && c <= 0x096f)
+	  || (c >= 0x09e6 && c <= 0x09ef)
+	  || (c >= 0x0a66 && c <= 0x0a6f)
+	  || (c >= 0x0ae6 && c <= 0x0aef)
+	  || (c >= 0x0b66 && c <= 0x0b6f)
+	  || (c >= 0x0be7 && c <= 0x0bef)
+	  || (c >= 0x0c66 && c <= 0x0c6f)
+	  || (c >= 0x0ce6 && c <= 0x0cef)
+	  || (c >= 0x0d66 && c <= 0x0d6f)
+	  || (c >= 0x0e50 && c <= 0x0e59)
+	  || (c >= 0x0ed0 && c <= 0x0ed9)
+	  || (c >= 0x0f20 && c <= 0x0f33))
+	return 2;
+
+      /* Special characters.  */
+      if (c == 0x00b5
+	  || c == 0x00b7
+	  || (c >= 0x02b0 && c <= 0x02b8)
+	  || c == 0x02bb
+	  || (c >= 0x02bd && c <= 0x02c1)
+	  || (c >= 0x02d0 && c <= 0x02d1)
+	  || (c >= 0x02e0 && c <= 0x02e4)
+	  || c == 0x037a
+	  || c == 0x0559
+	  || c == 0x093d
+	  || c == 0x0b3d
+	  || c == 0x1fbe
+	  || (c >= 0x203f && c <= 0x2040)
+	  || c == 0x2102
+	  || c == 0x2107
+	  || (c >= 0x210a && c <= 0x2113)
+	  || c == 0x2115
+	  || (c >= 0x2118 && c <= 0x211d)
+	  || c == 0x2124
+	  || c == 0x2126
+	  || c == 0x2128
+	  || (c >= 0x212a && c <= 0x2131)
+	  || (c >= 0x2133 && c <= 0x2138)
+	  || (c >= 0x2160 && c <= 0x2182)
+	  || (c >= 0x3005 && c <= 0x3007)
+	  || (c >= 0x3021 && c <= 0x3029))
+	return 1;
+    }
+
+  if (CPP_OPTION (pfile, cplusplus) || !CPP_PEDANTIC (pfile))
+    {
+      /* Greek.  */
+      if (c == 0x0384)
+	return 1;
+
+      /* Cyrillic.  */
+      if (c == 0x040d)
+	return 1;
+
+      /* Hebrew.  */
+      if (c >= 0x05f3 && c <= 0x05f4)
+	return 1;
+
+      /* Lao.  */
+      if ((c >= 0x0ead && c <= 0x0eb0)
+	  || (c == 0x0eb2)
+	  || (c == 0x0eb3)
+	  || (c == 0x0ebd)
+	  || (c >= 0x0ec0 && c <= 0x0ec4)
+	  || (c == 0x0ec6))
+	return 1;
+
+      /* Hiragana */
+      if (c == 0x3094
+	  || (c >= 0x309d && c <= 0x309e))
+	return 1;
+
+      /* Katakana */
+      if ((c >= 0x30a1 && c <= 0x30fe))
+	return 1;
+
+      /* Hangul */
+      if ((c >= 0x1100 && c <= 0x1159)
+	  || (c >= 0x1161 && c <= 0x11a2)
+	  || (c >= 0x11a8 && c <= 0x11f9))
+	return 1;
+
+      /* CJK Unified Ideographs */
+      if ((c >= 0xf900 && c <= 0xfa2d)
+	  || (c >= 0xfb1f && c <= 0xfb36)
+	  || (c >= 0xfb38 && c <= 0xfb3c)
+	  || (c == 0xfb3e)
+	  || (c >= 0xfb40 && c <= 0xfb41)
+	  || (c >= 0xfb42 && c <= 0xfb44)
+	  || (c >= 0xfb46 && c <= 0xfbb1)
+	  || (c >= 0xfbd3 && c <= 0xfd3f)
+	  || (c >= 0xfd50 && c <= 0xfd8f)
+	  || (c >= 0xfd92 && c <= 0xfdc7)
+	  || (c >= 0xfdf0 && c <= 0xfdfb)
+	  || (c >= 0xfe70 && c <= 0xfe72)
+	  || (c == 0xfe74)
+	  || (c >= 0xfe76 && c <= 0xfefc)
+	  || (c >= 0xff21 && c <= 0xff3a)
+	  || (c >= 0xff41 && c <= 0xff5a)
+	  || (c >= 0xff66 && c <= 0xffbe)
+	  || (c >= 0xffc2 && c <= 0xffc7)
+	  || (c >= 0xffca && c <= 0xffcf)
+	  || (c >= 0xffd2 && c <= 0xffd7)
+	  || (c >= 0xffda && c <= 0xffdc)
+	  || (c >= 0x4e00 && c <= 0x9fa5))
+	return 1;
+    }
+
+  /* Latin */
+  if ((c >= 0x00c0 && c <= 0x00d6)
+      || (c >= 0x00d8 && c <= 0x00f6)
+      || (c >= 0x00f8 && c <= 0x01f5)
+      || (c >= 0x01fa && c <= 0x0217)
+      || (c >= 0x0250 && c <= 0x02a8)
+      || (c >= 0x1e00 && c <= 0x1e9a)
+      || (c >= 0x1ea0 && c <= 0x1ef9))
+    return 1;
+
+  /* Greek */
+  if ((c >= 0x0388 && c <= 0x038a)
+      || (c == 0x038c)
+      || (c >= 0x038e && c <= 0x03a1)
+      || (c >= 0x03a3 && c <= 0x03ce)
+      || (c >= 0x03d0 && c <= 0x03d6)
+      || (c == 0x03da)
+      || (c == 0x03dc)
+      || (c == 0x03de)
+      || (c == 0x03e0)
+      || (c >= 0x03e2 && c <= 0x03f3)
+      || (c >= 0x1f00 && c <= 0x1f15)
+      || (c >= 0x1f18 && c <= 0x1f1d)
+      || (c >= 0x1f20 && c <= 0x1f45)
+      || (c >= 0x1f48 && c <= 0x1f4d)
+      || (c >= 0x1f50 && c <= 0x1f57)
+      || (c == 0x1f59)
+      || (c == 0x1f5b)
+      || (c == 0x1f5d)
+      || (c >= 0x1f5f && c <= 0x1f7d)
+      || (c >= 0x1f80 && c <= 0x1fb4)
+      || (c >= 0x1fb6 && c <= 0x1fbc)
+      || (c >= 0x1fc2 && c <= 0x1fc4)
+      || (c >= 0x1fc6 && c <= 0x1fcc)
+      || (c >= 0x1fd0 && c <= 0x1fd3)
+      || (c >= 0x1fd6 && c <= 0x1fdb)
+      || (c >= 0x1fe0 && c <= 0x1fec)
+      || (c >= 0x1ff2 && c <= 0x1ff4)
+      || (c >= 0x1ff6 && c <= 0x1ffc))
+    return 1;
+
+  /* Cyrillic */
+  if ((c >= 0x0401 && c <= 0x040c)
+      || (c >= 0x040f && c <= 0x044f)
+      || (c >= 0x0451 && c <= 0x045c)
+      || (c >= 0x045e && c <= 0x0481)
+      || (c >= 0x0490 && c <= 0x04c4)
+      || (c >= 0x04c7 && c <= 0x04c8)
+      || (c >= 0x04cb && c <= 0x04cc)
+      || (c >= 0x04d0 && c <= 0x04eb)
+      || (c >= 0x04ee && c <= 0x04f5)
+      || (c >= 0x04f8 && c <= 0x04f9))
+    return 1;
+
+  /* Armenian */
+  if ((c >= 0x0531 && c <= 0x0556)
+      || (c >= 0x0561 && c <= 0x0587))
+    return 1;
+
+  /* Hebrew */
+  if ((c >= 0x05d0 && c <= 0x05ea)
+      || (c >= 0x05f0 && c <= 0x05f2))
+    return 1;
+
+  /* Arabic */
+  if ((c >= 0x0621 && c <= 0x063a)
+      || (c >= 0x0640 && c <= 0x0652)
+      || (c >= 0x0670 && c <= 0x06b7)
+      || (c >= 0x06ba && c <= 0x06be)
+      || (c >= 0x06c0 && c <= 0x06ce)
+      || (c >= 0x06e5 && c <= 0x06e7))
+    return 1;
+
+  /* Devanagari */
+  if ((c >= 0x0905 && c <= 0x0939)
+      || (c >= 0x0958 && c <= 0x0962))
+    return 1;
+
+  /* Bengali */
+  if ((c >= 0x0985 && c <= 0x098c)
+      || (c >= 0x098f && c <= 0x0990)
+      || (c >= 0x0993 && c <= 0x09a8)
+      || (c >= 0x09aa && c <= 0x09b0)
+      || (c == 0x09b2)
+      || (c >= 0x09b6 && c <= 0x09b9)
+      || (c >= 0x09dc && c <= 0x09dd)
+      || (c >= 0x09df && c <= 0x09e1)
+      || (c >= 0x09f0 && c <= 0x09f1))
+    return 1;
+
+  /* Gurmukhi */
+  if ((c >= 0x0a05 && c <= 0x0a0a)
+      || (c >= 0x0a0f && c <= 0x0a10)
+      || (c >= 0x0a13 && c <= 0x0a28)
+      || (c >= 0x0a2a && c <= 0x0a30)
+      || (c >= 0x0a32 && c <= 0x0a33)
+      || (c >= 0x0a35 && c <= 0x0a36)
+      || (c >= 0x0a38 && c <= 0x0a39)
+      || (c >= 0x0a59 && c <= 0x0a5c)
+      || (c == 0x0a5e))
+    return 1;
+
+  /* Gujarati */
+  if ((c >= 0x0a85 && c <= 0x0a8b)
+      || (c == 0x0a8d)
+      || (c >= 0x0a8f && c <= 0x0a91)
+      || (c >= 0x0a93 && c <= 0x0aa8)
+      || (c >= 0x0aaa && c <= 0x0ab0)
+      || (c >= 0x0ab2 && c <= 0x0ab3)
+      || (c >= 0x0ab5 && c <= 0x0ab9)
+      || (c == 0x0ae0))
+    return 1;
+
+  /* Oriya */
+  if ((c >= 0x0b05 && c <= 0x0b0c)
+      || (c >= 0x0b0f && c <= 0x0b10)
+      || (c >= 0x0b13 && c <= 0x0b28)
+      || (c >= 0x0b2a && c <= 0x0b30)
+      || (c >= 0x0b32 && c <= 0x0b33)
+      || (c >= 0x0b36 && c <= 0x0b39)
+      || (c >= 0x0b5c && c <= 0x0b5d)
+      || (c >= 0x0b5f && c <= 0x0b61))
+    return 1;
+
+  /* Tamil */
+  if ((c >= 0x0b85 && c <= 0x0b8a)
+      || (c >= 0x0b8e && c <= 0x0b90)
+      || (c >= 0x0b92 && c <= 0x0b95)
+      || (c >= 0x0b99 && c <= 0x0b9a)
+      || (c == 0x0b9c)
+      || (c >= 0x0b9e && c <= 0x0b9f)
+      || (c >= 0x0ba3 && c <= 0x0ba4)
+      || (c >= 0x0ba8 && c <= 0x0baa)
+      || (c >= 0x0bae && c <= 0x0bb5)
+      || (c >= 0x0bb7 && c <= 0x0bb9))
+    return 1;
+
+  /* Telugu */
+  if ((c >= 0x0c05 && c <= 0x0c0c)
+      || (c >= 0x0c0e && c <= 0x0c10)
+      || (c >= 0x0c12 && c <= 0x0c28)
+      || (c >= 0x0c2a && c <= 0x0c33)
+      || (c >= 0x0c35 && c <= 0x0c39)
+      || (c >= 0x0c60 && c <= 0x0c61))
+    return 1;
+
+  /* Kannada */
+  if ((c >= 0x0c85 && c <= 0x0c8c)
+      || (c >= 0x0c8e && c <= 0x0c90)
+      || (c >= 0x0c92 && c <= 0x0ca8)
+      || (c >= 0x0caa && c <= 0x0cb3)
+      || (c >= 0x0cb5 && c <= 0x0cb9)
+      || (c >= 0x0ce0 && c <= 0x0ce1))
+    return 1;
+
+  /* Malayalam */
+  if ((c >= 0x0d05 && c <= 0x0d0c)
+      || (c >= 0x0d0e && c <= 0x0d10)
+      || (c >= 0x0d12 && c <= 0x0d28)
+      || (c >= 0x0d2a && c <= 0x0d39)
+      || (c >= 0x0d60 && c <= 0x0d61))
+    return 1;
+
+  /* Thai */
+  if ((c >= 0x0e01 && c <= 0x0e30)
+      || (c >= 0x0e32 && c <= 0x0e33)
+      || (c >= 0x0e40 && c <= 0x0e46)
+      || (c >= 0x0e4f && c <= 0x0e5b))
+    return 1;
+
+  /* Lao */
+  if ((c >= 0x0e81 && c <= 0x0e82)
+      || (c == 0x0e84)
+      || (c == 0x0e87)
+      || (c == 0x0e88)
+      || (c == 0x0e8a)
+      || (c == 0x0e8d)
+      || (c >= 0x0e94 && c <= 0x0e97)
+      || (c >= 0x0e99 && c <= 0x0e9f)
+      || (c >= 0x0ea1 && c <= 0x0ea3)
+      || (c == 0x0ea5)
+      || (c == 0x0ea7)
+      || (c == 0x0eaa)
+      || (c == 0x0eab))
+    return 1;
+
+  /* Georgian */
+  if ((c >= 0x10a0 && c <= 0x10c5)
+      || (c >= 0x10d0 && c <= 0x10f6))
+    return 1;
+
+  /* Hiragana */
+  if ((c >= 0x3041 && c <= 0x3093)
+      || (c >= 0x309b && c <= 0x309c))
+    return 1;
+
+  /* Bopomofo */
+  if ((c >= 0x3105 && c <= 0x312c))
+    return 1;
+
+  return 0;
+}
Index: cpphash.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/cpphash.h,v
retrieving revision 1.183
diff -u -p -r1.183 cpphash.h
--- cpphash.h	19 Apr 2003 16:34:33 -0000	1.183
+++ cpphash.h	19 Apr 2003 23:35:00 -0000
@@ -555,6 +555,10 @@ extern bool _cpp_expansions_different_tr
 extern uchar *_cpp_copy_replacement_text PARAMS ((const cpp_macro *, uchar *));
 extern size_t _cpp_replacement_text_len PARAMS ((const cpp_macro *));
 
+/* In cppcharset.c.  */
+extern cppchar_t _cpp_valid_ucn PARAMS ((cpp_reader *, const uchar **,
+					 int identifier_pos));
+
 /* Utility routines and macros.  */
 #define DSC(str) (const uchar *)str, sizeof str - 1
 #define xnew(T)		(T *) xmalloc (sizeof(T))
Index: cpplex.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/cpplex.c,v
retrieving revision 1.224
diff -u -p -r1.224 cpplex.c
--- cpplex.c	19 Apr 2003 16:34:33 -0000	1.224
+++ cpplex.c	19 Apr 2003 23:35:01 -0000
@@ -59,15 +59,14 @@ static const struct token_spelling token
 static void add_line_note PARAMS ((cpp_buffer *, const uchar *, unsigned int));
 static int skip_line_comment PARAMS ((cpp_reader *));
 static void skip_whitespace PARAMS ((cpp_reader *, cppchar_t));
-static cpp_hashnode *lex_identifier PARAMS ((cpp_reader *));
+static cpp_hashnode *lex_identifier PARAMS ((cpp_reader *, const uchar *));
 static void lex_number PARAMS ((cpp_reader *, cpp_string *));
-static bool continues_identifier_p PARAMS ((cpp_reader *));
+static bool forms_identifier_p PARAMS ((cpp_reader *, int));
 static void lex_string PARAMS ((cpp_reader *, cpp_token *));
 static void save_comment PARAMS ((cpp_reader *, cpp_token *, const uchar *,
 				  cppchar_t));
 static int name_p PARAMS ((cpp_reader *, const cpp_string *));
-static int maybe_read_ucs PARAMS ((cpp_reader *, const unsigned char **,
-				   const unsigned char *, cppchar_t *));
+static cppchar_t maybe_read_ucn PARAMS ((cpp_reader *, const uchar **));
 static tokenrun *next_tokenrun PARAMS ((tokenrun *));
 
 static unsigned int hex_digit_value PARAMS ((unsigned int));
@@ -361,33 +360,53 @@ name_p (pfile, string)
 }
 
-/* Returns TRUE if the sequence starting at buffer->cur is invalid in
-   an identifier.  */
+/* Returns TRUE if the sequence starting at buffer->cur is valid in
+   an identifier.  FIRST is TRUE if this starts an identifier.  */
 static bool
-continues_identifier_p (pfile)
+forms_identifier_p (pfile, first)
      cpp_reader *pfile;
+     int first;
 {
-  if (*pfile->buffer->cur != '$' || !CPP_OPTION (pfile, dollars_in_ident))
-    return false;
+  cpp_buffer *buffer = pfile->buffer;
+
+  if (*buffer->cur == '$')
+    {
+      if (!CPP_OPTION (pfile, dollars_in_ident))
+	return false;
+
+      buffer->cur++;
+      if (CPP_PEDANTIC (pfile)
+	  && !pfile->state.skipping
+	  && !pfile->warned_dollar)
+	{
+	  pfile->warned_dollar = true;
+	  cpp_error (pfile, DL_PEDWARN, "'$' in identifier or number");
+	}
+
+      return true;
+    }
 
-  if (CPP_PEDANTIC (pfile) && !pfile->state.skipping && !pfile->warned_dollar)
+  /* Is this a syntactically valid UCN?  */
+  if (*buffer->cur == '\\'
+      && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
     {
-      pfile->warned_dollar = true;
-      cpp_error (pfile, DL_PEDWARN, "'$' in identifier or number");
+      buffer->cur += 2;
+      if (_cpp_valid_ucn (pfile, &buffer->cur, 1 + !first))
+	return true;
+      buffer->cur -= 2;
     }
-  pfile->buffer->cur++;
 
-  return true;
+  return false;
 }
 
-/* Lex an identifier starting at BUFFER->CUR - 1.  */
+/* Lex an identifier starting at BASE.  */
 static cpp_hashnode *
-lex_identifier (pfile)
+lex_identifier (pfile, base)
      cpp_reader *pfile;
+     const uchar *base;
 {
   cpp_hashnode *result;
-  const uchar *cur, *base;
+  const uchar *cur;
 
-  base = pfile->buffer->cur - 1;
   do
     {
       cur = pfile->buffer->cur;
@@ -398,7 +417,7 @@ lex_identifier (pfile)
 
       pfile->buffer->cur = cur;
     }
-  while (continues_identifier_p (pfile));
+  while (forms_identifier_p (pfile, false));
 
   result = (cpp_hashnode *)
     ht_lookup (pfile->hash_table, base, cur - base, HT_ALLOC);
@@ -444,7 +463,7 @@ lex_number (pfile, number)
 
       pfile->buffer->cur = cur;
     }
-  while (continues_identifier_p (pfile));
+  while (forms_identifier_p (pfile, false));
 
   number->len = cur - base;
   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
@@ -803,7 +822,6 @@ _cpp_lex_direct (pfile)
 	}
       /* Fall through.  */
 
-    start_ident:
     case '_':
     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
@@ -816,7 +834,7 @@ _cpp_lex_direct (pfile)
     case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
     case 'Y': case 'Z':
       result->type = CPP_NAME;
-      result->val.node = lex_identifier (pfile);
+      result->val.node = lex_identifier (pfile, buffer->cur - 1);
 
       /* Convert named operators to their proper types.  */
       if (result->val.node->flags & NODE_OPERATOR)
@@ -1044,14 +1062,23 @@ _cpp_lex_direct (pfile)
     case '@': result->type = CPP_ATSIGN; break;
 
     case '$':
-      if (CPP_OPTION (pfile, dollars_in_ident))
-	goto start_ident;
-      /* Fall through...  */
-
-    default:
-      result->type = CPP_OTHER;
-      result->val.c = c;
-      break;
+    case '\\':
+      {
+	const uchar *base = --buffer->cur;
+
+	if (forms_identifier_p (pfile, true))
+	  {
+	    result->type = CPP_NAME;
+	    result->val.node = lex_identifier (pfile, base);
+	    break;
+	  }
+	buffer->cur++;
+
+      default:
+	result->type = CPP_OTHER;
+	result->val.c = c;
+	break;
+      }
     }
 
   return result;
@@ -1321,9 +1348,11 @@ cpp_avoid_paste (pfile, token1, token2)
 				|| b == CPP_CHAR || b == CPP_STRING); /* L */
     case CPP_NUMBER:	return (b == CPP_NUMBER || b == CPP_NAME
 				|| c == '.' || c == '+' || c == '-');
-    case CPP_OTHER:	return (CPP_OPTION (pfile, objc)
-				&& token1->val.c == '@'
-				&& (b == CPP_NAME || b == CPP_STRING));
+				      /* UCNs */
+    case CPP_OTHER:	return ((token1->val.c == '\\' && b == CPP_NAME)
+				|| (CPP_OPTION (pfile, objc)
+				    && token1->val.c == '@'
+				    && (b == CPP_NAME || b == CPP_STRING)));
     default:		break;
     }
 
@@ -1363,93 +1392,31 @@ hex_digit_value (c)
     abort ();
 }
 
-/* Parse a '\uNNNN' or '\UNNNNNNNN' sequence.  Returns 1 to indicate
-   failure if cpplib is not parsing C++ or C99.  Such failure is
-   silent, and no variables are updated.  Otherwise returns 0, and
-   warns if -Wtraditional.
-
-   [lex.charset]: The character designated by the universal character
-   name \UNNNNNNNN is that character whose character short name in
-   ISO/IEC 10646 is NNNNNNNN; the character designated by the
-   universal character name \uNNNN is that character whose character
-   short name in ISO/IEC 10646 is 0000NNNN.  If the hexadecimal value
-   for a universal character name is less than 0x20 or in the range
-   0x7F-0x9F (inclusive), or if the universal character name
-   designates a character in the basic source character set, then the
-   program is ill-formed.
-
-   We assume that wchar_t is Unicode, so we don't need to do any
-   mapping.  Is this ever wrong?
-
-   PC points to the 'u' or 'U', PSTR is points to the byte after PC,
-   LIMIT is the end of the string or charconst.  PSTR is updated to
-   point after the UCS on return, and the UCS is written into PC.  */
-
-static int
-maybe_read_ucs (pfile, pstr, limit, pc)
+/* Read a possible universal character name starting at *PSTR.  */
+static cppchar_t
+maybe_read_ucn (pfile, pstr)
      cpp_reader *pfile;
-     const unsigned char **pstr;
-     const unsigned char *limit;
-     cppchar_t *pc;
+     const uchar **pstr;
 {
-  const unsigned char *p = *pstr;
-  unsigned int code = 0;
-  unsigned int c = *pc, length;
-
-  /* Only attempt to interpret a UCS for C++ and C99.  */
-  if (! (CPP_OPTION (pfile, cplusplus) || CPP_OPTION (pfile, c99)))
-    return 1;
+  cppchar_t result, c = (*pstr)[-1];
 
-  if (CPP_WTRADITIONAL (pfile))
-    cpp_error (pfile, DL_WARNING,
-	       "the meaning of '\\%c' is different in traditional C", c);
-
-  length = (c == 'u' ? 4: 8);
-
-  if ((size_t) (limit - p) < length)
-    {
-      cpp_error (pfile, DL_ERROR, "incomplete universal-character-name");
-      /* Skip to the end to avoid more diagnostics.  */
-      p = limit;
-    }
-  else
+  result = _cpp_valid_ucn (pfile, pstr, false);
+  if (result)
     {
-      for (; length; length--, p++)
+      if (CPP_WTRADITIONAL (pfile))
+	cpp_error (pfile, DL_WARNING,
+		   "the meaning of '\\%c' is different in traditional C",
+		   (int) c);
+
+      if (CPP_OPTION (pfile, EBCDIC))
 	{
-	  c = *p;
-	  if (ISXDIGIT (c))
-	    code = (code << 4) + hex_digit_value (c);
-	  else
-	    {
-	      cpp_error (pfile, DL_ERROR,
-			 "non-hex digit '%c' in universal-character-name", c);
-	      /* We shouldn't skip in case there are multibyte chars.  */
-	      break;
-	    }
+	  cpp_error (pfile, DL_ERROR,
+		     "universal character with an EBCDIC target");
+	  result = 0x3f;  /* EBCDIC invalid character */
 	}
     }
 
-  if (CPP_OPTION (pfile, EBCDIC))
-    {
-      cpp_error (pfile, DL_ERROR, "universal-character-name on EBCDIC target");
-      code = 0x3f;  /* EBCDIC invalid character */
-    }
-  /* True extended characters are OK.  */
-  else if (code >= 0xa0
-	   && !(code & 0x80000000)
-	   && !(code >= 0xD800 && code <= 0xDFFF))
-    ;
-  /* The standard permits $, @ and ` to be specified as UCNs.  We use
-     hex escapes so that this also works with EBCDIC hosts.  */
-  else if (code == 0x24 || code == 0x40 || code == 0x60)
-    ;
-  /* Don't give another error if one occurred above.  */
-  else if (length == 0)
-    cpp_error (pfile, DL_ERROR, "universal-character-name out of range");
-
-  *pstr = p;
-  *pc = code;
-  return 0;
+  return result;
 }
 
 /* Returns the value of an escape sequence, truncated to the correct
@@ -1470,7 +1437,7 @@ cpp_parse_escape (pfile, pstr, limit, wi
 
   int unknown = 0;
   const unsigned char *str = *pstr, *charconsts;
-  cppchar_t c, mask;
+  cppchar_t c, ucn, mask;
   unsigned int width;
 
   if (CPP_OPTION (pfile, EBCDIC))
@@ -1519,7 +1486,11 @@ cpp_parse_escape (pfile, pstr, limit, wi
       break;
 
     case 'u': case 'U':
-      unknown = maybe_read_ucs (pfile, &str, limit, &c);
+      ucn = maybe_read_ucn (pfile, &str);
+      if (ucn)
+	c = ucn;
+      else
+	unknown = true;
       break;
 
     case 'x':
Index: testsuite/gcc.dg/cpp/ucn-1.c
===================================================================
RCS file: testsuite/gcc.dg/cpp/ucn-1.c
diff -N testsuite/gcc.dg/cpp/ucn-1.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ testsuite/gcc.dg/cpp/ucn-1.c	19 Apr 2003 23:35:05 -0000
@@ -0,0 +1,50 @@
+/* Copyright (C) 2003 Free Software Foundation, Inc.  */
+
+/* { dg-do preprocess } */
+/* { dg-options "-std=c99 -fno-show-column" } */
+
+/* This tests universal character sequences in identifiers and numbers
+   for C99.   Neil Booth, 19 Apr 2003.  */
+
+/* First, stuff that should be rejected.  */
+
+\u123;				/* { dg-error "incomplete universal" } */
+\U1234				/* { dg-error "incomplete universal" } */
+
+\u0020				/* { dg-error "not a valid" } */
+\U00000020			/* { dg-error "not a valid" } */
+
+\u0040				/* { dg-error "not valid in an ident" } */
+\U00000040			/* { dg-error "not valid in an ident" } */
+
+\u06f0				/* { dg-error "not valid at the start" } */
+
+/* Now, acceptable uses.  */
+
+\u00c0				/* { dg-bogus "invalid universal" } */
+\U00000401			/* { dg-bogus "invalid universal" } */
+
+\u00c0\u06f0			/* { dg-bogus "not valid at the start" } */
+
+/* Do they work in macros?  */
+
+#define \u00c0 1
+#if \u00c0 != 1
+# error simple UCN macro	/* { dg-bogus "simple UCN macro" } */
+#endif
+
+#define \u00c0\u0401 2
+#if \u00c0\u0401 != 2
+# error compound UCN macro	/* { dg-bogus "compound UCN macro" } */
+#endif
+
+/* Token pasting?  */
+
+#define f(x, y) x ## y
+#if f(\, u00c0) != 1
+# error simple token pasting	/* { dg-bogus "simple token pasting" } */
+#endif
+
+#if f(\u00c0, \u0401) != 2
+# error compound token pasting	/* { dg-bogus "compound token pasting" } */
+#endif
Index: testsuite/gcc.dg/cpp/ucs.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/testsuite/gcc.dg/cpp/ucs.c,v
retrieving revision 1.3
diff -u -p -r1.3 ucs.c
--- testsuite/gcc.dg/cpp/ucs.c	3 Apr 2002 21:59:03 -0000	1.3
+++ testsuite/gcc.dg/cpp/ucs.c	19 Apr 2003 23:35:05 -0000
@@ -51,7 +51,7 @@ void foo ()
   c = L'\ubad';		/* { dg-error "incomplete" "incompete UCN 1" } */
   c = L"\U1234"[0];	/* { dg-error "incomplete" "incompete UCN 2" } */
 
-  c = L'\u000x';	/* { dg-error "non-hex" "non-hex digit in UCN" } */
+  c = L'\u000x';	/* { dg-error "incomplete" "non-hex digit in UCN" } */
   /* If sizeof(HOST_WIDE_INT) > sizeof(wchar_t), we can get a multi-character
      constant warning even for wide characters.  */
   /* { dg-warning "too long|multi-character" "" { target *-*-* } 54 } */
@@ -61,7 +61,7 @@ void foo ()
   c = '\u00a0';		/* { dg-bogus "invalid" "00a0 is a valid UCN" } */
   c = '\U00000060';	/* { dg-bogus "invalid" "0060 is a valid UCN" } */
 
-  c = '\u0025';		/* { dg-error "range" "0025 is an invalid UCN" } */
-  c = L"\uD800"[0];	/* { dg-error "range" "D800 is an invalid UCN" } */
-  c = L'\U0000DFFF';	/* { dg-error "range" "DFFF is an invalid UCN" } */
+  c = '\u0025';		/* { dg-error "not a valid" "0025 invalid UCN" } */
+  c = L"\uD800"[0];	/* { dg-error "not a valid" "D800 invalid UCN" } */
+  c = L'\U0000DFFF';	/* { dg-error "not a valid" "DFFF invalid UCN" } */
 }


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]