This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[4.1] UCNs in identifiers


This patch implements UCNs in identifiers, as in:

char * \u00fc = "u-umlaut";

int main(void)
{
  char * \u00fd = "y-acute";
  
  if (\u00fd[0] != 'y' || \u00fc[0] != 'u')
    abort ();
  return 0;
}

for Darwin.  (Actually, not quite: an assembler patch is also needed,
but this is the compiler side of the work.)

I'd commit it right now, but I am not quite sure what to do for other
ports.  Darwin works by declaring that the input to the assembler is
in UTF-8.  For other ports, the choices are:

(a) Send UTF-8
(b) Mangle the names somehow
(c) Refuse UCNs in identifiers on those ports

At present, I'm leaning towards (c).  (b) is clearly not the right
thing, because it would impose an ABI that might not be the right one.
(a) is also a possibility; it is certainly the least work.

Any opinions?

-- 
- Geoffrey Keating <geoffk@apple.com>

===File ~/patches/gcc-ucns-1.patch==========================
2005-01-06  Geoffrey Keating  <geoffk@apple.com>

	* internal.h (_cpp_interpret_identifier): New.
	* charset.c (_cpp_interpret_identifier): New.
	* lex.c (lex_identifier): Add extra parameter to indicate if initial
	character was '$' or '\'.  Support identifiers with UCNs.
	(forms_identifier_p): Allow UCNs.
	(_cpp_lex_direct): Pass extra parameter to lex_identifier.

Index: libcpp/charset.c
===================================================================
RCS file: /cvs/gcc/gcc/libcpp/charset.c,v
retrieving revision 1.3
diff -u -p -u -p -r1.3 charset.c
--- libcpp/charset.c	18 Sep 2004 00:56:19 -0000	1.3
+++ libcpp/charset.c	7 Jan 2005 02:23:18 -0000
@@ -1357,7 +1357,60 @@ cpp_interpret_charconst (cpp_reader *pfi
 
   return result;
 }
+
+/* Convert an identifier denoted by ID and LEN, which might contain
+   UCN escapes, to the source character set, either UTF-8 or
+   UTF-EBCDIC.  Assumes that the identifier is actually a valid identifier.  */
+cpp_hashnode *
+_cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
+{
+  /* It turns out that a UCN escape always turns into fewer characters
+     than the escape itself, so we can allocate a temporary in advance.  */
+  uchar * buf = alloca (len + 1);
+  uchar * bufp = buf;
+  size_t idp;
+  
+  for (idp = 0; idp < len; idp++)
+    if (id[idp] != '\\')
+      *bufp++ = id[idp];
+    else
+      {
+	unsigned length = id[idp+1] == 'u' ? 4 : 8;
+	cppchar_t value = 0;
+	size_t bufleft = len - (bufp - buf);
+	int rval;
+
+	idp += 2;
+	while (length && idp < len && ISXDIGIT (id[idp]))
+	  {
+	    value = (value << 4) + hex_value (id[idp]);
+	    idp++;
+	    length--;
+	  }
+	idp--;
+
+	/* Special case for EBCDIC: if the identifier contains
+	   a '$' specified using a UCN, translate it to EBCDIC.  */
+	if (value == 0x24)
+	  {
+	    *bufp++ = '$';
+	    continue;
+	  }
+
+	rval = one_cppchar_to_utf8 (value, &bufp, &bufleft);
+	if (rval)
+	  {
+	    errno = rval;
+	    cpp_errno (pfile, CPP_DL_ERROR,
+		       "converting UCN to source character set");
+	    break;
+	  }
+      }
 
+  return CPP_HASHNODE (ht_lookup (pfile->hash_table, 
+				  buf, bufp - buf, HT_ALLOC));
+}
+
 /* Convert an input buffer (containing the complete contents of one
    source file) from INPUT_CHARSET to the source character set.  INPUT
    points to the input buffer, SIZE is its allocated size, and LEN is
Index: libcpp/internal.h
===================================================================
RCS file: /cvs/gcc/gcc/libcpp/internal.h,v
retrieving revision 1.11
diff -u -p -u -p -r1.11 internal.h
--- libcpp/internal.h	2 Jan 2005 01:32:21 -0000	1.11
+++ libcpp/internal.h	7 Jan 2005 02:23:18 -0000
@@ -571,6 +571,9 @@ extern unsigned char *_cpp_convert_input
 					  unsigned char *, size_t, size_t,
 					  off_t *);
 extern const char *_cpp_default_encoding (void);
+extern cpp_hashnode * _cpp_interpret_identifier (cpp_reader *pfile,
+						 const unsigned char *id,
+						 size_t len);
 
 /* Utility routines and macros.  */
 #define DSC(str) (const unsigned char *)str, sizeof str - 1
Index: libcpp/lex.c
===================================================================
RCS file: /cvs/gcc/gcc/libcpp/lex.c,v
retrieving revision 1.5
diff -u -p -u -p -r1.5 lex.c
--- libcpp/lex.c	9 Sep 2004 19:16:55 -0000	1.5
+++ libcpp/lex.c	7 Jan 2005 02:23:18 -0000
@@ -53,7 +53,7 @@ static const struct token_spelling token
 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
 static int skip_line_comment (cpp_reader *);
 static void skip_whitespace (cpp_reader *, cppchar_t);
-static cpp_hashnode *lex_identifier (cpp_reader *, const uchar *);
+static cpp_hashnode *lex_identifier (cpp_reader *, const uchar *, bool);
 static void lex_number (cpp_reader *, cpp_string *);
 static bool forms_identifier_p (cpp_reader *, int);
 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
@@ -453,7 +453,7 @@ forms_identifier_p (cpp_reader *pfile, i
     }
 
   /* Is this a syntactically valid UCN?  */
-  if (0 && *buffer->cur == '\\'
+  if (*buffer->cur == '\\'
       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
     {
       buffer->cur += 2;
@@ -467,39 +467,39 @@ forms_identifier_p (cpp_reader *pfile, i
 
 /* Lex an identifier starting at BUFFER->CUR - 1.  */
 static cpp_hashnode *
-lex_identifier (cpp_reader *pfile, const uchar *base)
+lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn)
 {
   cpp_hashnode *result;
-  const uchar *cur, *limit;
+  const uchar *cur;
   unsigned int len;
   unsigned int hash = HT_HASHSTEP (0, *base);
 
   cur = pfile->buffer->cur;
-  for (;;)
+  if (! starts_ucn)
+    while (ISIDNUM (*cur))
+      {
+	hash = HT_HASHSTEP (hash, *cur);
+	cur++;
+      }
+  pfile->buffer->cur = cur;
+  if (starts_ucn || forms_identifier_p (pfile, false))
     {
-      /* N.B. ISIDNUM does not include $.  */
-      while (ISIDNUM (*cur))
-	{
-	  hash = HT_HASHSTEP (hash, *cur);
-	  cur++;
-	}
-
-      pfile->buffer->cur = cur;
-      if (!forms_identifier_p (pfile, false))
-	break;
-
-      limit = pfile->buffer->cur;
-      while (cur < limit)
-	{
-	  hash = HT_HASHSTEP (hash, *cur);
-	  cur++;
-	}
+      /* Slower version for identifiers containing UCNs (or $).  */
+      do {
+	while (ISIDNUM (*pfile->buffer->cur))
+	  pfile->buffer->cur++;
+      } while (forms_identifier_p (pfile, false));
+      result = _cpp_interpret_identifier (pfile, base,
+					  pfile->buffer->cur - base);
     }
-  len = cur - base;
-  hash = HT_HASHFINISH (hash, len);
+  else
+    {
+      len = cur - base;
+      hash = HT_HASHFINISH (hash, len);
 
-  result = (cpp_hashnode *)
-    ht_lookup_with_hash (pfile->hash_table, base, len, hash, HT_ALLOC);
+      result = (cpp_hashnode *)
+	ht_lookup_with_hash (pfile->hash_table, base, len, hash, HT_ALLOC);
+    }
 
   /* Rarely, identifiers require diagnostics when lexed.  */
   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
@@ -922,7 +922,7 @@ _cpp_lex_direct (cpp_reader *pfile)
     case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
     case 'Y': case 'Z':
       result->type = CPP_NAME;
-      result->val.node = lex_identifier (pfile, buffer->cur - 1);
+      result->val.node = lex_identifier (pfile, buffer->cur - 1, false);
 
       /* Convert named operators to their proper types.  */
       if (result->val.node->flags & NODE_OPERATOR)
@@ -1155,7 +1155,7 @@ _cpp_lex_direct (cpp_reader *pfile)
 	if (forms_identifier_p (pfile, true))
 	  {
 	    result->type = CPP_NAME;
-	    result->val.node = lex_identifier (pfile, base);
+	    result->val.node = lex_identifier (pfile, base, true);
 	    break;
 	  }
 	buffer->cur++;
============================================================


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]