This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
UTF-8 in gas

To: egcs-patches at cygnus dot com
Subject: UTF-8 in gas
From: Martin von Loewis <martin at mira dot isdn dot cs dot tu-berlin dot de>
Date: Mon, 2 Nov 1998 21:39:43 +0100
This is a patch to enable wide characters in gas. The intent is to get
the entire compiler chain (g++/gcc/gas/ld) aware of Unicode
identifiers; encoding them in UTF-8 has a number of advantages:

a) It is backwards-compatible. ASCII characters in UTF-8 appear as a
   single byte; everything else uses bytes above 127 (which are
   currently rejected), and the encoding is null-byte-free.

b) It works for both C and C++. gcc 2.8 contains a proposal to provide
   mangling for universal characters that appear in C++ identifiers.
   The upcoming C9x will also add universal character names, but for C
   mangling is not an option.

c) It is already supported by the ELF tools (and probably other binary
   formats). For some reason, the GNU ld does not care what bytes are
   in an identifier, as long as it is null-byte free.
   [Note: this is just an observation. The ELF spec might make
   stronger or weaker guarantees]

The supporting patches for g++ and c++filt are under development; g++
will fall back to the proposed mangling if the assembler does not
support UTF-8 input.

At this point, I'd like to get some feedback on whether this feature is
acceptable for gas, and on what changes I should make to the patch. The
code is based on binutils 2.9.1.0.3; I can update to more recent code.

The lexers for some platforms implement custom operand analysis, which
often restricts operands to ASCII. This has been widened only for
i386, so far.

Thanks for your attention,
Martin

1998-11-02  Martin von Löwis  <loewis@informatik.hu-berlin.de>

	* app.c (do_scrub_begin): Initialize lex above 127.
	(do_scrub_chars): Process unsigned characters.
	* expr.c (get_symbol_end): When getting characters above 127,
	verify that they form a proper UTF-8 sequence.
	* read.h (LEX_UTF8_BEGIN): New constant.
	(is_utf8_beginner): New macro.
	* read.c (lex_type): Initialize UTF-8 beginners.

diff -urp binutils-2.9.1.0.3/gas/app.c gas/app.c
--- binutils-2.9.1.0.3/gas/app.c	Mon Mar  2 18:27:54 1998
+++ gas/app.c	Sun Nov  1 14:08:57 1998
@@ -81,6 +81,7 @@ do_scrub_begin (m68k_mri)
      int m68k_mri;
 {
   const char *p;
+  int c;
 
   scrub_m68k_mri = m68k_mri;
 
@@ -114,6 +115,9 @@ do_scrub_begin (m68k_mri)
       lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
     }				/* declare symbol characters */
 
+  for (c = 128; c <= 255; c++)
+    lex [c] = LEX_IS_SYMBOL_COMPONENT;
+
   /* The m68k backend wants to be able to change comment_chars.  */
 #ifndef tc_comment_chars
 #define tc_comment_chars comment_chars
@@ -335,7 +339,7 @@ do_scrub_chars (get, tostart, tolen)
       fromend = from + fromlen,		\
       (fromlen == 0			\
        ? EOF				\
-       : *from++)))
+       : *(unsigned char*)from++)))
 
   /* This macro pushes a character back on the input stream.  */
 
@@ -1076,7 +1080,7 @@ do_scrub_chars (get, tostart, tolen)
 		{
 		  int type;
 
-		  ch2 = *s;
+		  ch2 = *(unsigned char*)s;
 		  type = lex[ch2];
 		  if (type != 0
 		      && type != LEX_IS_SYMBOL_COMPONENT)
diff -urp binutils-2.9.1.0.3/gas/config/tc-i386.c gas/config/tc-i386.c
--- binutils-2.9.1.0.3/gas/config/tc-i386.c	Tue Apr 28 21:47:53 1998
+++ gas/config/tc-i386.c	Sun Nov  1 14:38:54 1998
@@ -572,6 +572,9 @@ md_begin ()
 	if (isupper (c) || islower (c) || isdigit (c))
 	  operand_chars[c] = c;
 
+	if (c >= 128)
+	  operand_chars[c] = identifier_chars[c] = c;
+
 	if (isdigit (c) || c == '-')
 	  digit_chars[c] = c;
 
diff -urp binutils-2.9.1.0.3/gas/expr.c gas/expr.c
--- binutils-2.9.1.0.3/gas/expr.c	Wed Apr  1 04:40:03 1998
+++ gas/expr.c	Sun Nov  1 14:32:19 1998
@@ -1786,8 +1786,25 @@ get_symbol_end ()
      constructed string.  */
   if (is_name_beginner (c = *input_line_pointer++) || c == '\001')
     while (is_part_of_name (c = *input_line_pointer++)
-	   || c == '\001')
-      ;
+	   || c == '\001'
+	   || is_utf8_beginner (c))
+      /* See whether this is a valid utf8 sequence.  */
+      if (is_utf8_beginner (c))
+	{
+	  int bytecount = 1;
+	  if ((unsigned char)c >= 224)
+	    bytecount++;
+	  if ((unsigned char)c >= 240)
+	    bytecount++;
+	  if ((unsigned char)c >= 248)
+	    bytecount++;
+	  if ((unsigned char)c >= 252)
+	    bytecount++;
+	  for (; bytecount; bytecount--)
+	    if ((*(unsigned char*)input_line_pointer++ & 0xC0) != 0x80)
+	      as_bad ("Invalid byte %d in UTF-8 character", 
+		      (int)*(unsigned char*)(input_line_pointer - 1));
+	}
   *--input_line_pointer = 0;
   return (c);
 }
diff -urp binutils-2.9.1.0.3/gas/read.c gas/read.c
--- binutils-2.9.1.0.3/gas/read.c	Mon Apr 27 23:22:48 1998
+++ gas/read.c	Sat Oct 31 20:29:59 1998
@@ -114,9 +114,10 @@ char lex_type[256] =
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,       /* 192 */
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,       /* 208 */
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,       /* 224 */
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,       /* 240 */
 };
 
 
diff -urp binutils-2.9.1.0.3/gas/read.h gas/read.h
--- binutils-2.9.1.0.3/gas/read.h	Wed Feb  4 22:55:48 1998
+++ gas/read.h	Sat Oct 31 20:19:49 1998
@@ -35,11 +35,14 @@ extern char *input_line_pointer;/* -> ch
 
 #define	LEX_NAME	(1)	/* may continue a name */
 #define LEX_BEGIN_NAME	(2)	/* may begin a name */
+#define LEX_UTF8_BEGIN    (4)     /* starts Unicode names. */
 
 #define is_name_beginner(c) \
   ( lex_type[(unsigned char) (c)] & LEX_BEGIN_NAME )
 #define is_part_of_name(c) \
   ( lex_type[(unsigned char) (c)] & LEX_NAME       )
+#define is_utf8_beginner(c) \
+  ( lex_type[(unsigned char) (c)] & LEX_UTF8_BEGIN   )
 
 #ifndef is_a_char
 #define CHAR_MASK	(0xff)
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]