This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
Implementing Universal Character Names in identifiers

From: "Martin v. Löwis" <loewis at informatik dot hu-berlin dot de>
To: gcc-patches at gcc dot gnu dot org
Date: Mon, 28 Oct 2002 08:15:39 +0100 (CET)
Subject: Implementing Universal Character Names in identifiers
This patch implements UCNs in cpplib. It does so by converting the
UCN to UTF-8, putting the UTF-8 bytes into the internal
representation of the identifier.

The back-ends will transparently output the UTF-8 identifiers into the
assembler file. If GNU as is used (or any other assembler supporting
non-ASCII identifiers), these UTF-8 strings will be copied transparently
into the object file. If the assembler does not support UTF-8, it
will produce a diagnostic.

As a result of this strategy, UCNs are now allowed in all places
mandated by the relevant standards, i.e. both in C99 and C++, and in
all identifiers, including macro names.

Regards,
Martin

2002-10-27  Martin v. Löwis  <loewis@informatik.hu-berlin.de>

	* c-lex.c (is_extended_char, utf8_extend_token): Remove.
	* cpplex.c (identifier_ucs_p, utf8_extend_token, 
	utf8_to_char): New functions.
	(parse_slow): Add utf8 parameter. Parse UCS names.
	(parse_identifier, parse_number): Adjust.
	(_cpp_lex_direct): Parse UCS names.
	(cpp_output_token): Print UCS names.
	* cpplib.h (NODE_UTF8): New flag.

Index: c-lex.c
===================================================================
RCS file: /cvsroot/gcc/gcc/gcc/c-lex.c,v
retrieving revision 1.190
diff -c -p -r1.190 c-lex.c
*** c-lex.c	16 Sep 2002 16:36:31 -0000	1.190
--- c-lex.c	27 Oct 2002 17:35:33 -0000
*************** cb_undef (pfile, line, node)
*** 356,669 ****
  			 (const char *) NODE_NAME (node));
  }
  
- #if 0 /* not yet */
- /* Returns nonzero if C is a universal-character-name.  Give an error if it
-    is not one which may appear in an identifier, as per [extendid].
- 
-    Note that extended character support in identifiers has not yet been
-    implemented.  It is my personal opinion that this is not a desirable
-    feature.  Portable code cannot count on support for more than the basic
-    identifier character set.  */
- 
- static inline int
- is_extended_char (c)
-      int c;
- {
- #ifdef TARGET_EBCDIC
-   return 0;
- #else
-   /* ASCII.  */
-   if (c < 0x7f)
-     return 0;
- 
-   /* None of the valid chars are outside the Basic Multilingual Plane (the
-      low 16 bits).  */
-   if (c > 0xffff)
-     {
-       error ("universal-character-name '\\U%08x' not valid in identifier", c);
-       return 1;
-     }
-   
-   /* Latin */
-   if ((c >= 0x00c0 && c <= 0x00d6)
-       || (c >= 0x00d8 && c <= 0x00f6)
-       || (c >= 0x00f8 && c <= 0x01f5)
-       || (c >= 0x01fa && c <= 0x0217)
-       || (c >= 0x0250 && c <= 0x02a8)
-       || (c >= 0x1e00 && c <= 0x1e9a)
-       || (c >= 0x1ea0 && c <= 0x1ef9))
-     return 1;
- 
-   /* Greek */
-   if ((c == 0x0384)
-       || (c >= 0x0388 && c <= 0x038a)
-       || (c == 0x038c)
-       || (c >= 0x038e && c <= 0x03a1)
-       || (c >= 0x03a3 && c <= 0x03ce)
-       || (c >= 0x03d0 && c <= 0x03d6)
-       || (c == 0x03da)
-       || (c == 0x03dc)
-       || (c == 0x03de)
-       || (c == 0x03e0)
-       || (c >= 0x03e2 && c <= 0x03f3)
-       || (c >= 0x1f00 && c <= 0x1f15)
-       || (c >= 0x1f18 && c <= 0x1f1d)
-       || (c >= 0x1f20 && c <= 0x1f45)
-       || (c >= 0x1f48 && c <= 0x1f4d)
-       || (c >= 0x1f50 && c <= 0x1f57)
-       || (c == 0x1f59)
-       || (c == 0x1f5b)
-       || (c == 0x1f5d)
-       || (c >= 0x1f5f && c <= 0x1f7d)
-       || (c >= 0x1f80 && c <= 0x1fb4)
-       || (c >= 0x1fb6 && c <= 0x1fbc)
-       || (c >= 0x1fc2 && c <= 0x1fc4)
-       || (c >= 0x1fc6 && c <= 0x1fcc)
-       || (c >= 0x1fd0 && c <= 0x1fd3)
-       || (c >= 0x1fd6 && c <= 0x1fdb)
-       || (c >= 0x1fe0 && c <= 0x1fec)
-       || (c >= 0x1ff2 && c <= 0x1ff4)
-       || (c >= 0x1ff6 && c <= 0x1ffc))
-     return 1;
- 
-   /* Cyrillic */
-   if ((c >= 0x0401 && c <= 0x040d)
-       || (c >= 0x040f && c <= 0x044f)
-       || (c >= 0x0451 && c <= 0x045c)
-       || (c >= 0x045e && c <= 0x0481)
-       || (c >= 0x0490 && c <= 0x04c4)
-       || (c >= 0x04c7 && c <= 0x04c8)
-       || (c >= 0x04cb && c <= 0x04cc)
-       || (c >= 0x04d0 && c <= 0x04eb)
-       || (c >= 0x04ee && c <= 0x04f5)
-       || (c >= 0x04f8 && c <= 0x04f9))
-     return 1;
- 
-   /* Armenian */
-   if ((c >= 0x0531 && c <= 0x0556)
-       || (c >= 0x0561 && c <= 0x0587))
-     return 1;
- 
-   /* Hebrew */
-   if ((c >= 0x05d0 && c <= 0x05ea)
-       || (c >= 0x05f0 && c <= 0x05f4))
-     return 1;
- 
-   /* Arabic */
-   if ((c >= 0x0621 && c <= 0x063a)
-       || (c >= 0x0640 && c <= 0x0652)
-       || (c >= 0x0670 && c <= 0x06b7)
-       || (c >= 0x06ba && c <= 0x06be)
-       || (c >= 0x06c0 && c <= 0x06ce)
-       || (c >= 0x06e5 && c <= 0x06e7))
-     return 1;
- 
-   /* Devanagari */
-   if ((c >= 0x0905 && c <= 0x0939)
-       || (c >= 0x0958 && c <= 0x0962))
-     return 1;
- 
-   /* Bengali */
-   if ((c >= 0x0985 && c <= 0x098c)
-       || (c >= 0x098f && c <= 0x0990)
-       || (c >= 0x0993 && c <= 0x09a8)
-       || (c >= 0x09aa && c <= 0x09b0)
-       || (c == 0x09b2)
-       || (c >= 0x09b6 && c <= 0x09b9)
-       || (c >= 0x09dc && c <= 0x09dd)
-       || (c >= 0x09df && c <= 0x09e1)
-       || (c >= 0x09f0 && c <= 0x09f1))
-     return 1;
- 
-   /* Gurmukhi */
-   if ((c >= 0x0a05 && c <= 0x0a0a)
-       || (c >= 0x0a0f && c <= 0x0a10)
-       || (c >= 0x0a13 && c <= 0x0a28)
-       || (c >= 0x0a2a && c <= 0x0a30)
-       || (c >= 0x0a32 && c <= 0x0a33)
-       || (c >= 0x0a35 && c <= 0x0a36)
-       || (c >= 0x0a38 && c <= 0x0a39)
-       || (c >= 0x0a59 && c <= 0x0a5c)
-       || (c == 0x0a5e))
-     return 1;
- 
-   /* Gujarati */
-   if ((c >= 0x0a85 && c <= 0x0a8b)
-       || (c == 0x0a8d)
-       || (c >= 0x0a8f && c <= 0x0a91)
-       || (c >= 0x0a93 && c <= 0x0aa8)
-       || (c >= 0x0aaa && c <= 0x0ab0)
-       || (c >= 0x0ab2 && c <= 0x0ab3)
-       || (c >= 0x0ab5 && c <= 0x0ab9)
-       || (c == 0x0ae0))
-     return 1;
- 
-   /* Oriya */
-   if ((c >= 0x0b05 && c <= 0x0b0c)
-       || (c >= 0x0b0f && c <= 0x0b10)
-       || (c >= 0x0b13 && c <= 0x0b28)
-       || (c >= 0x0b2a && c <= 0x0b30)
-       || (c >= 0x0b32 && c <= 0x0b33)
-       || (c >= 0x0b36 && c <= 0x0b39)
-       || (c >= 0x0b5c && c <= 0x0b5d)
-       || (c >= 0x0b5f && c <= 0x0b61))
-     return 1;
- 
-   /* Tamil */
-   if ((c >= 0x0b85 && c <= 0x0b8a)
-       || (c >= 0x0b8e && c <= 0x0b90)
-       || (c >= 0x0b92 && c <= 0x0b95)
-       || (c >= 0x0b99 && c <= 0x0b9a)
-       || (c == 0x0b9c)
-       || (c >= 0x0b9e && c <= 0x0b9f)
-       || (c >= 0x0ba3 && c <= 0x0ba4)
-       || (c >= 0x0ba8 && c <= 0x0baa)
-       || (c >= 0x0bae && c <= 0x0bb5)
-       || (c >= 0x0bb7 && c <= 0x0bb9))
-     return 1;
- 
-   /* Telugu */
-   if ((c >= 0x0c05 && c <= 0x0c0c)
-       || (c >= 0x0c0e && c <= 0x0c10)
-       || (c >= 0x0c12 && c <= 0x0c28)
-       || (c >= 0x0c2a && c <= 0x0c33)
-       || (c >= 0x0c35 && c <= 0x0c39)
-       || (c >= 0x0c60 && c <= 0x0c61))
-     return 1;
- 
-   /* Kannada */
-   if ((c >= 0x0c85 && c <= 0x0c8c)
-       || (c >= 0x0c8e && c <= 0x0c90)
-       || (c >= 0x0c92 && c <= 0x0ca8)
-       || (c >= 0x0caa && c <= 0x0cb3)
-       || (c >= 0x0cb5 && c <= 0x0cb9)
-       || (c >= 0x0ce0 && c <= 0x0ce1))
-     return 1;
- 
-   /* Malayalam */
-   if ((c >= 0x0d05 && c <= 0x0d0c)
-       || (c >= 0x0d0e && c <= 0x0d10)
-       || (c >= 0x0d12 && c <= 0x0d28)
-       || (c >= 0x0d2a && c <= 0x0d39)
-       || (c >= 0x0d60 && c <= 0x0d61))
-     return 1;
- 
-   /* Thai */
-   if ((c >= 0x0e01 && c <= 0x0e30)
-       || (c >= 0x0e32 && c <= 0x0e33)
-       || (c >= 0x0e40 && c <= 0x0e46)
-       || (c >= 0x0e4f && c <= 0x0e5b))
-     return 1;
- 
-   /* Lao */
-   if ((c >= 0x0e81 && c <= 0x0e82)
-       || (c == 0x0e84)
-       || (c == 0x0e87)
-       || (c == 0x0e88)
-       || (c == 0x0e8a)
-       || (c == 0x0e0d)
-       || (c >= 0x0e94 && c <= 0x0e97)
-       || (c >= 0x0e99 && c <= 0x0e9f)
-       || (c >= 0x0ea1 && c <= 0x0ea3)
-       || (c == 0x0ea5)
-       || (c == 0x0ea7)
-       || (c == 0x0eaa)
-       || (c == 0x0eab)
-       || (c >= 0x0ead && c <= 0x0eb0)
-       || (c == 0x0eb2)
-       || (c == 0x0eb3)
-       || (c == 0x0ebd)
-       || (c >= 0x0ec0 && c <= 0x0ec4)
-       || (c == 0x0ec6))
-     return 1;
- 
-   /* Georgian */
-   if ((c >= 0x10a0 && c <= 0x10c5)
-       || (c >= 0x10d0 && c <= 0x10f6))
-     return 1;
- 
-   /* Hiragana */
-   if ((c >= 0x3041 && c <= 0x3094)
-       || (c >= 0x309b && c <= 0x309e))
-     return 1;
- 
-   /* Katakana */
-   if ((c >= 0x30a1 && c <= 0x30fe))
-     return 1;
- 
-   /* Bopmofo */
-   if ((c >= 0x3105 && c <= 0x312c))
-     return 1;
- 
-   /* Hangul */
-   if ((c >= 0x1100 && c <= 0x1159)
-       || (c >= 0x1161 && c <= 0x11a2)
-       || (c >= 0x11a8 && c <= 0x11f9))
-     return 1;
- 
-   /* CJK Unified Ideographs */
-   if ((c >= 0xf900 && c <= 0xfa2d)
-       || (c >= 0xfb1f && c <= 0xfb36)
-       || (c >= 0xfb38 && c <= 0xfb3c)
-       || (c == 0xfb3e)
-       || (c >= 0xfb40 && c <= 0xfb41)
-       || (c >= 0xfb42 && c <= 0xfb44)
-       || (c >= 0xfb46 && c <= 0xfbb1)
-       || (c >= 0xfbd3 && c <= 0xfd3f)
-       || (c >= 0xfd50 && c <= 0xfd8f)
-       || (c >= 0xfd92 && c <= 0xfdc7)
-       || (c >= 0xfdf0 && c <= 0xfdfb)
-       || (c >= 0xfe70 && c <= 0xfe72)
-       || (c == 0xfe74)
-       || (c >= 0xfe76 && c <= 0xfefc)
-       || (c >= 0xff21 && c <= 0xff3a)
-       || (c >= 0xff41 && c <= 0xff5a)
-       || (c >= 0xff66 && c <= 0xffbe)
-       || (c >= 0xffc2 && c <= 0xffc7)
-       || (c >= 0xffca && c <= 0xffcf)
-       || (c >= 0xffd2 && c <= 0xffd7)
-       || (c >= 0xffda && c <= 0xffdc)
-       || (c >= 0x4e00 && c <= 0x9fa5))
-     return 1;
- 
-   error ("universal-character-name '\\u%04x' not valid in identifier", c);
-   return 1;
- #endif
- }
- 
- /* Add the UTF-8 representation of C to the token_buffer.  */
- 
- static void
- utf8_extend_token (c)
-      int c;
- {
-   int shift, mask;
- 
-   if      (c <= 0x0000007f)
-     {
-       extend_token (c);
-       return;
-     }
-   else if (c <= 0x000007ff)
-     shift = 6, mask = 0xc0;
-   else if (c <= 0x0000ffff)
-     shift = 12, mask = 0xe0;
-   else if (c <= 0x001fffff)
-     shift = 18, mask = 0xf0;
-   else if (c <= 0x03ffffff)
-     shift = 24, mask = 0xf8;
-   else
-     shift = 30, mask = 0xfc;
- 
-   extend_token (mask | (c >> shift));
-   do
-     {
-       shift -= 6;
-       extend_token ((unsigned char) (0x80 | (c >> shift)));
-     }
-   while (shift);
- }
- #endif
  
  int
  c_lex (value)
--- 356,361 ----
Index: cpplex.c
===================================================================
RCS file: /cvsroot/gcc/gcc/gcc/cpplex.c,v
retrieving revision 1.215
diff -c -p -r1.215 cpplex.c
*** cpplex.c	26 Sep 2002 22:25:12 -0000	1.215
--- cpplex.c	27 Oct 2002 17:35:33 -0000
*************** static void adjust_column PARAMS ((cpp_r
*** 71,77 ****
  static int skip_whitespace PARAMS ((cpp_reader *, cppchar_t));
  static cpp_hashnode *parse_identifier PARAMS ((cpp_reader *));
  static uchar *parse_slow PARAMS ((cpp_reader *, const uchar *, int,
! 				  unsigned int *));
  static void parse_number PARAMS ((cpp_reader *, cpp_string *, int));
  static int unescaped_terminator_p PARAMS ((cpp_reader *, const uchar *));
  static void parse_string PARAMS ((cpp_reader *, cpp_token *, cppchar_t));
--- 71,77 ----
  static int skip_whitespace PARAMS ((cpp_reader *, cppchar_t));
  static cpp_hashnode *parse_identifier PARAMS ((cpp_reader *));
  static uchar *parse_slow PARAMS ((cpp_reader *, const uchar *, int,
! 				  unsigned int *, unsigned int *));
  static void parse_number PARAMS ((cpp_reader *, cpp_string *, int));
  static int unescaped_terminator_p PARAMS ((cpp_reader *, const uchar *));
  static void parse_string PARAMS ((cpp_reader *, cpp_token *, cppchar_t));
*************** static tokenrun *next_tokenrun PARAMS ((
*** 86,91 ****
--- 86,95 ----
  
  static unsigned int hex_digit_value PARAMS ((unsigned int));
  static _cpp_buff *new_buff PARAMS ((size_t));
+ static bool identifier_ucs_p PARAMS ((cpp_reader *, cppchar_t));
+ static void utf8_extend_token PARAMS ((struct obstack *, int));
+ static cppchar_t utf8_to_char PARAMS((const unsigned char **));
+ 
  
  /* Utility routine:
  
*************** trigraph_p (pfile)
*** 161,166 ****
--- 165,529 ----
    return accept;
  }
  
+ /* Returns nonzero if C, the value of a universal-character-name, may
+    appear in an identifier, as per [extendid].  Returns zero silently for
+    values below 0x7f, and zero after reporting an error through PFILE for
+    any other value that matches none of the ranges below (no value above
+    0xffff is accepted).  Always returns zero when TARGET_EBCDIC is defined.
+ 
+    Each group of ranges below is labelled with the script it covers.  */
+ 
+ static bool
+ identifier_ucs_p (pfile, c)
+      cpp_reader *pfile;
+      cppchar_t c;
+ {
+ #ifdef TARGET_EBCDIC
+   return 0;
+ #else
+   /* ASCII.  */
+   if (c < 0x7f)
+     return 0;
+ 
+   /* None of the valid chars are outside the Basic Multilingual Plane (the
+      low 16 bits).  */
+   if (c > 0xffff)
+     {
+       cpp_error_with_line (pfile, DL_ERROR,
+                            pfile->line, 1, /* XXX */
+                            "universal-character-name '\\U%08x' not valid in identifier", (int)c);
+       return 0;
+     }
+   
+   /* Latin */
+   if ((c >= 0x00c0 && c <= 0x00d6)
+       || (c >= 0x00d8 && c <= 0x00f6)
+       || (c >= 0x00f8 && c <= 0x01f5)
+       || (c >= 0x01fa && c <= 0x0217)
+       || (c >= 0x0250 && c <= 0x02a8)
+       || (c >= 0x1e00 && c <= 0x1e9a)
+       || (c >= 0x1ea0 && c <= 0x1ef9))
+     return 1;
+ 
+   /* Greek */
+   if ((c == 0x0384)
+       || (c >= 0x0388 && c <= 0x038a)
+       || (c == 0x038c)
+       || (c >= 0x038e && c <= 0x03a1)
+       || (c >= 0x03a3 && c <= 0x03ce)
+       || (c >= 0x03d0 && c <= 0x03d6)
+       || (c == 0x03da)
+       || (c == 0x03dc)
+       || (c == 0x03de)
+       || (c == 0x03e0)
+       || (c >= 0x03e2 && c <= 0x03f3)
+       || (c >= 0x1f00 && c <= 0x1f15)
+       || (c >= 0x1f18 && c <= 0x1f1d)
+       || (c >= 0x1f20 && c <= 0x1f45)
+       || (c >= 0x1f48 && c <= 0x1f4d)
+       || (c >= 0x1f50 && c <= 0x1f57)
+       || (c == 0x1f59)
+       || (c == 0x1f5b)
+       || (c == 0x1f5d)
+       || (c >= 0x1f5f && c <= 0x1f7d)
+       || (c >= 0x1f80 && c <= 0x1fb4)
+       || (c >= 0x1fb6 && c <= 0x1fbc)
+       || (c >= 0x1fc2 && c <= 0x1fc4)
+       || (c >= 0x1fc6 && c <= 0x1fcc)
+       || (c >= 0x1fd0 && c <= 0x1fd3)
+       || (c >= 0x1fd6 && c <= 0x1fdb)
+       || (c >= 0x1fe0 && c <= 0x1fec)
+       || (c >= 0x1ff2 && c <= 0x1ff4)
+       || (c >= 0x1ff6 && c <= 0x1ffc))
+     return 1;
+ 
+   /* Cyrillic */
+   if ((c >= 0x0401 && c <= 0x040d)
+       || (c >= 0x040f && c <= 0x044f)
+       || (c >= 0x0451 && c <= 0x045c)
+       || (c >= 0x045e && c <= 0x0481)
+       || (c >= 0x0490 && c <= 0x04c4)
+       || (c >= 0x04c7 && c <= 0x04c8)
+       || (c >= 0x04cb && c <= 0x04cc)
+       || (c >= 0x04d0 && c <= 0x04eb)
+       || (c >= 0x04ee && c <= 0x04f5)
+       || (c >= 0x04f8 && c <= 0x04f9))
+     return 1;
+ 
+   /* Armenian */
+   if ((c >= 0x0531 && c <= 0x0556)
+       || (c >= 0x0561 && c <= 0x0587))
+     return 1;
+ 
+   /* Hebrew */
+   if ((c >= 0x05d0 && c <= 0x05ea)
+       || (c >= 0x05f0 && c <= 0x05f4))
+     return 1;
+ 
+   /* Arabic */
+   if ((c >= 0x0621 && c <= 0x063a)
+       || (c >= 0x0640 && c <= 0x0652)
+       || (c >= 0x0670 && c <= 0x06b7)
+       || (c >= 0x06ba && c <= 0x06be)
+       || (c >= 0x06c0 && c <= 0x06ce)
+       || (c >= 0x06e5 && c <= 0x06e7))
+     return 1;
+ 
+   /* Devanagari */
+   if ((c >= 0x0905 && c <= 0x0939)
+       || (c >= 0x0958 && c <= 0x0962))
+     return 1;
+ 
+   /* Bengali */
+   if ((c >= 0x0985 && c <= 0x098c)
+       || (c >= 0x098f && c <= 0x0990)
+       || (c >= 0x0993 && c <= 0x09a8)
+       || (c >= 0x09aa && c <= 0x09b0)
+       || (c == 0x09b2)
+       || (c >= 0x09b6 && c <= 0x09b9)
+       || (c >= 0x09dc && c <= 0x09dd)
+       || (c >= 0x09df && c <= 0x09e1)
+       || (c >= 0x09f0 && c <= 0x09f1))
+     return 1;
+ 
+   /* Gurmukhi */
+   if ((c >= 0x0a05 && c <= 0x0a0a)
+       || (c >= 0x0a0f && c <= 0x0a10)
+       || (c >= 0x0a13 && c <= 0x0a28)
+       || (c >= 0x0a2a && c <= 0x0a30)
+       || (c >= 0x0a32 && c <= 0x0a33)
+       || (c >= 0x0a35 && c <= 0x0a36)
+       || (c >= 0x0a38 && c <= 0x0a39)
+       || (c >= 0x0a59 && c <= 0x0a5c)
+       || (c == 0x0a5e))
+     return 1;
+ 
+   /* Gujarati */
+   if ((c >= 0x0a85 && c <= 0x0a8b)
+       || (c == 0x0a8d)
+       || (c >= 0x0a8f && c <= 0x0a91)
+       || (c >= 0x0a93 && c <= 0x0aa8)
+       || (c >= 0x0aaa && c <= 0x0ab0)
+       || (c >= 0x0ab2 && c <= 0x0ab3)
+       || (c >= 0x0ab5 && c <= 0x0ab9)
+       || (c == 0x0ae0))
+     return 1;
+ 
+   /* Oriya */
+   if ((c >= 0x0b05 && c <= 0x0b0c)
+       || (c >= 0x0b0f && c <= 0x0b10)
+       || (c >= 0x0b13 && c <= 0x0b28)
+       || (c >= 0x0b2a && c <= 0x0b30)
+       || (c >= 0x0b32 && c <= 0x0b33)
+       || (c >= 0x0b36 && c <= 0x0b39)
+       || (c >= 0x0b5c && c <= 0x0b5d)
+       || (c >= 0x0b5f && c <= 0x0b61))
+     return 1;
+ 
+   /* Tamil */
+   if ((c >= 0x0b85 && c <= 0x0b8a)
+       || (c >= 0x0b8e && c <= 0x0b90)
+       || (c >= 0x0b92 && c <= 0x0b95)
+       || (c >= 0x0b99 && c <= 0x0b9a)
+       || (c == 0x0b9c)
+       || (c >= 0x0b9e && c <= 0x0b9f)
+       || (c >= 0x0ba3 && c <= 0x0ba4)
+       || (c >= 0x0ba8 && c <= 0x0baa)
+       || (c >= 0x0bae && c <= 0x0bb5)
+       || (c >= 0x0bb7 && c <= 0x0bb9))
+     return 1;
+ 
+   /* Telugu */
+   if ((c >= 0x0c05 && c <= 0x0c0c)
+       || (c >= 0x0c0e && c <= 0x0c10)
+       || (c >= 0x0c12 && c <= 0x0c28)
+       || (c >= 0x0c2a && c <= 0x0c33)
+       || (c >= 0x0c35 && c <= 0x0c39)
+       || (c >= 0x0c60 && c <= 0x0c61))
+     return 1;
+ 
+   /* Kannada */
+   if ((c >= 0x0c85 && c <= 0x0c8c)
+       || (c >= 0x0c8e && c <= 0x0c90)
+       || (c >= 0x0c92 && c <= 0x0ca8)
+       || (c >= 0x0caa && c <= 0x0cb3)
+       || (c >= 0x0cb5 && c <= 0x0cb9)
+       || (c >= 0x0ce0 && c <= 0x0ce1))
+     return 1;
+ 
+   /* Malayalam */
+   if ((c >= 0x0d05 && c <= 0x0d0c)
+       || (c >= 0x0d0e && c <= 0x0d10)
+       || (c >= 0x0d12 && c <= 0x0d28)
+       || (c >= 0x0d2a && c <= 0x0d39)
+       || (c >= 0x0d60 && c <= 0x0d61))
+     return 1;
+ 
+   /* Thai */
+   if ((c >= 0x0e01 && c <= 0x0e30)
+       || (c >= 0x0e32 && c <= 0x0e33)
+       || (c >= 0x0e40 && c <= 0x0e46)
+       || (c >= 0x0e4f && c <= 0x0e5b))
+     return 1;
+ 
+   /* Lao */
+   if ((c >= 0x0e81 && c <= 0x0e82)
+       || (c == 0x0e84)
+       || (c == 0x0e87)
+       || (c == 0x0e88)
+       || (c == 0x0e8a)
+       || (c == 0x0e8d)
+       || (c >= 0x0e94 && c <= 0x0e97)
+       || (c >= 0x0e99 && c <= 0x0e9f)
+       || (c >= 0x0ea1 && c <= 0x0ea3)
+       || (c == 0x0ea5)
+       || (c == 0x0ea7)
+       || (c == 0x0eaa)
+       || (c == 0x0eab)
+       || (c >= 0x0ead && c <= 0x0eb0)
+       || (c == 0x0eb2)
+       || (c == 0x0eb3)
+       || (c == 0x0ebd)
+       || (c >= 0x0ec0 && c <= 0x0ec4)
+       || (c == 0x0ec6))
+     return 1;
+ 
+   /* Georgian */
+   if ((c >= 0x10a0 && c <= 0x10c5)
+       || (c >= 0x10d0 && c <= 0x10f6))
+     return 1;
+ 
+   /* Hiragana */
+   if ((c >= 0x3041 && c <= 0x3094)
+       || (c >= 0x309b && c <= 0x309e))
+     return 1;
+ 
+   /* Katakana */
+   if ((c >= 0x30a1 && c <= 0x30fe))
+     return 1;
+ 
+   /* Bopomofo */
+   if ((c >= 0x3105 && c <= 0x312c))
+     return 1;
+ 
+   /* Hangul */
+   if ((c >= 0x1100 && c <= 0x1159)
+       || (c >= 0x1161 && c <= 0x11a2)
+       || (c >= 0x11a8 && c <= 0x11f9))
+     return 1;
+ 
+   /* CJK Unified Ideographs */
+   if ((c >= 0xf900 && c <= 0xfa2d)
+       || (c >= 0xfb1f && c <= 0xfb36)
+       || (c >= 0xfb38 && c <= 0xfb3c)
+       || (c == 0xfb3e)
+       || (c >= 0xfb40 && c <= 0xfb41)
+       || (c >= 0xfb42 && c <= 0xfb44)
+       || (c >= 0xfb46 && c <= 0xfbb1)
+       || (c >= 0xfbd3 && c <= 0xfd3f)
+       || (c >= 0xfd50 && c <= 0xfd8f)
+       || (c >= 0xfd92 && c <= 0xfdc7)
+       || (c >= 0xfdf0 && c <= 0xfdfb)
+       || (c >= 0xfe70 && c <= 0xfe72)
+       || (c == 0xfe74)
+       || (c >= 0xfe76 && c <= 0xfefc)
+       || (c >= 0xff21 && c <= 0xff3a)
+       || (c >= 0xff41 && c <= 0xff5a)
+       || (c >= 0xff66 && c <= 0xffbe)
+       || (c >= 0xffc2 && c <= 0xffc7)
+       || (c >= 0xffca && c <= 0xffcf)
+       || (c >= 0xffd2 && c <= 0xffd7)
+       || (c >= 0xffda && c <= 0xffdc)
+       || (c >= 0x4e00 && c <= 0x9fa5))
+     return 1;
+ 
+   cpp_error_with_line (pfile, DL_ERROR,
+                        pfile->line, 1, /* XXX */
+                        "universal-character-name '\\u%04x' not valid in identifier", (int)c);
+   return 0;
+ #endif
+ }
+ 
+ /* Append to STACK the UTF-8 encoding of code point C: a single byte up
+    to 0x7f, else a lead byte followed by six-bit continuation bytes.  */
+ 
+ static void
+ utf8_extend_token (stack, c)
+      struct obstack *stack;
+      int c;
+ {
+   int lead;	/* Length-marker bits of the first byte.  */
+   int bits;	/* Number of payload bits below the first byte.  */
+ 
+   if (c <= 0x7f)
+     {
+       obstack_1grow (stack, c);
+       return;
+     }
+ 
+   if (c <= 0x7ff)
+     lead = 0xc0, bits = 6;
+   else if (c <= 0xffff)
+     lead = 0xe0, bits = 12;
+   else if (c <= 0x1fffff)
+     lead = 0xf0, bits = 18;
+   else if (c <= 0x3ffffff)
+     lead = 0xf8, bits = 24;
+   else
+     lead = 0xfc, bits = 30;
+ 
+   obstack_1grow (stack, lead | (c >> bits));
+   /* Emit the continuation bytes, most significant group first.  */
+   for (bits -= 6; bits >= 0; bits -= 6)
+     obstack_1grow (stack, (unsigned char) (0x80 | ((c >> bits) & 0x3f)));
+ }
+ 
+ /* Decode the UTF-8 sequence starting at *POS, advance *POS past it and
+    return the code point.  Sequences of one to four bytes are handled;
+    a continuation byte or a lead byte of 0xf8 or above calls abort.  */
+ static cppchar_t
+ utf8_to_char (pos)
+      const unsigned char **pos;
+ {
+   const unsigned char *p = *pos;
+   const unsigned char first = p[0];
+   cppchar_t value = 0;
+   int trail = 0;	/* Number of continuation bytes after FIRST.  */
+ 
+   if (first < 0x80)
+     {
+       /* Single byte: the value is the byte itself.  */
+       value = first;
+       trail = 0;
+     }
+   else if (first < 0xc0)
+     /* A continuation byte cannot start a sequence.  */
+     abort ();
+   else if (first < 0xe0)
+     {
+       value = first & 0x1f;
+       trail = 1;
+     }
+   else if (first < 0xf0)
+     {
+       value = first & 0xf;
+       trail = 2;
+     }
+   else if (first < 0xf8)
+     {
+       value = first & 0x7;
+       trail = 3;
+     }
+   else
+     /* Longer forms are reserved.  */
+     abort ();
+ 
+   *pos = p + trail + 1;
+   while (trail-- > 0)
+     value = (value << 6) + (*++p & 0x3f);
+   return value;
+ }
+ 
  /* Skips any escaped newlines introduced by '?' or a '\\', assumed to
     lie in buffer->cur[-1].  Returns the next byte, which will be in
     buffer->cur[-1].  This routine performs preprocessing stages 1 and
*************** parse_identifier (pfile)
*** 451,461 ****
    /* Check for slow-path cases.  */
    if (*cur == '?' || *cur == '\\' || *cur == '$')
      {
!       unsigned int len;
  
!       base = parse_slow (pfile, cur, 0, &len);
        result = (cpp_hashnode *)
  	ht_lookup (pfile->hash_table, base, len, HT_ALLOCED);
      }
    else
      {
--- 814,826 ----
    /* Check for slow-path cases.  */
    if (*cur == '?' || *cur == '\\' || *cur == '$')
      {
!       unsigned int len, utf8;
  
!       base = parse_slow (pfile, cur, 0, &len, &utf8);
        result = (cpp_hashnode *)
  	ht_lookup (pfile->hash_table, base, len, HT_ALLOCED);
+       if (utf8)
+         result->flags |= NODE_UTF8;
      }
    else
      {
*************** parse_identifier (pfile)
*** 493,503 ****
     pointer to the token's NUL-terminated spelling in permanent
     storage, and sets PLEN to its length.  */
  static uchar *
! parse_slow (pfile, cur, number_p, plen)
       cpp_reader *pfile;
       const uchar *cur;
       int number_p;
       unsigned int *plen;
  {
    cpp_buffer *buffer = pfile->buffer;
    const uchar *base = buffer->cur - 1;
--- 858,869 ----
     pointer to the token's NUL-terminated spelling in permanent
     storage, and sets PLEN to its length.  */
  static uchar *
! parse_slow (pfile, cur, number_p, plen, utf8)
       cpp_reader *pfile;
       const uchar *cur;
       int number_p;
       unsigned int *plen;
+      unsigned int *utf8;
  {
    cpp_buffer *buffer = pfile->buffer;
    const uchar *base = buffer->cur - 1;
*************** parse_slow (pfile, cur, number_p, plen)
*** 516,523 ****
--- 882,906 ----
    prevc = cur[-1];
    c = *cur++;
    buffer->cur = cur;
+   *utf8 = 0;
    for (;;)
      {
+       if (c == '\\' && (*buffer->cur == 'u'
+                         || *buffer->cur == 'U'))
+         {
+           cur = buffer->cur - 1;
+           c = *buffer->cur++;
+           if (maybe_read_ucs (pfile, &buffer->cur, buffer->rlimit, &c) == 0
+               && identifier_ucs_p (pfile, c))
+             {
+               utf8_extend_token (stack, c);
+               c = *buffer->cur++;
+               *utf8 = 1;
+               continue;
+             }
+           buffer->cur = cur;
+           c = *buffer->cur++;
+         }
        /* Potential escaped newline?  */
        buffer->backup_to = buffer->cur - 1;
        if (c == '?' || c == '\\')
*************** parse_number (pfile, number, leading_per
*** 570,575 ****
--- 953,959 ----
       int leading_period;
  {
    const uchar *cur;
+   unsigned int ignored;
  
    /* Fast-path loop.  Skim over a normal number.
       N.B. ISIDNUM does not include $.  */
*************** parse_number (pfile, number, leading_per
*** 579,585 ****
  
    /* Check for slow-path cases.  */
    if (*cur == '?' || *cur == '\\' || *cur == '$')
!     number->text = parse_slow (pfile, cur, 1 + leading_period, &number->len);
    else
      {
        const uchar *base = pfile->buffer->cur - 1;
--- 963,970 ----
  
    /* Check for slow-path cases.  */
    if (*cur == '?' || *cur == '\\' || *cur == '$')
!     number->text = parse_slow (pfile, cur, 1 + leading_period,
!                                &number->len, &ignored);
    else
      {
        const uchar *base = pfile->buffer->cur - 1;
*************** _cpp_lex_direct (pfile)
*** 1025,1031 ****
        if (c == '?')
  	result->type = CPP_QUERY;
        else if (c == '\\')
! 	goto random_char;
        else
  	goto trigraph;
        break;
--- 1410,1434 ----
        if (c == '?')
  	result->type = CPP_QUERY;
        else if (c == '\\')
!         {
!           const unsigned char *pos = buffer->cur;
!           
!           c = *buffer->cur++;
!           if ((c == 'u' || c == 'U')
!               && maybe_read_ucs (pfile, &buffer->cur,
!                                  buffer->rlimit, &c) == 0
!               && identifier_ucs_p (pfile, c))
!             {
!               buffer->cur = pos;
!               goto start_ident;
!             }
!           else
!             {
!               c = '\\';
!               buffer->cur = pos;
!               goto random_char;
!             }
!         }
        else
  	goto trigraph;
        break;
*************** cpp_output_token (token, fp)
*** 1503,1509 ****
  
      spell_ident:
      case SPELL_IDENT:
!       fwrite (NODE_NAME (token->val.node), 1, NODE_LEN (token->val.node), fp);
      break;
  
      case SPELL_NUMBER:
--- 1906,1937 ----
  
      spell_ident:
      case SPELL_IDENT:
!       if ((token->val.node->flags & NODE_UTF8) == 0)
!         fwrite (NODE_NAME (token->val.node), 1, NODE_LEN (token->val.node), fp);
!       else
!         {
!           const unsigned char *s = NODE_NAME (token->val.node);
!           int len = NODE_LEN (token->val.node);  /* Bytes still to spell.  */
!           while (len > 0)
!             {
!               if (*s < 128)
!                 {
!                   fwrite (s, 1, 1, fp);  /* ASCII byte: copy verbatim.  */
!                   s++;
!                   len--;
!                 }
!               else
!                 {
!                   const unsigned char *old = s;
!                   cppchar_t code = utf8_to_char (&s);  /* Advances S.  */
!                   if (code < 0x10000)
!                     fprintf (fp, "\\u%.4x", (unsigned int) code);
!                   else
!                     fprintf (fp, "\\U%.8x", (unsigned int) code);
!                   len -= s - old;  /* Consume the whole UTF-8 sequence.  */
!                 }
!             }
!         }
      break;
  
      case SPELL_NUMBER:
Index: cpplib.h
===================================================================
RCS file: /cvsroot/gcc/gcc/gcc/cpplib.h,v
retrieving revision 1.237
diff -c -p -r1.237 cpplib.h
*** cpplib.h	26 Sep 2002 22:25:12 -0000	1.237
--- cpplib.h	27 Oct 2002 17:35:33 -0000
*************** extern const char *progname;
*** 443,448 ****
--- 443,449 ----
  #define NODE_DIAGNOSTIC (1 << 3)	/* Possible diagnostic when lexed.  */
  #define NODE_WARN	(1 << 4)	/* Warn if redefined or undefined.  */
  #define NODE_DISABLED	(1 << 5)	/* A disabled macro.  */
+ #define NODE_UTF8       (1 << 6)        /* Node has UTF-8 bytes in it */
  
  /* Different flavors of hash node.  */
  enum node_type
Follow-Ups:
- Re: Implementing Universal Character Names in identifiers
  - From: Fergus Henderson
- Re: Implementing Universal Character Names in identifiers
  - From: Zack Weinberg
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]