[PATCH] libcpp: For C++23 treat UCNs and UTF-8 chars not valid in identifiers as separate tokens

Jakub Jelinek jakub@redhat.com
Fri Aug 6 14:47:57 GMT 2021


On Fri, Aug 06, 2021 at 11:53:56AM +0200, Jakub Jelinek via Gcc-patches wrote:
> Actually, there is another change in P1949R7 that I haven't touched
> in the patch and not sure what the implications are.
> 
> To the preprocessing-token non-terminal it adds
> 	each universal-character-name that cannot be one of the above
> and changes the following paragraph:
>  ...
>  preprocessing operators and punctuators, and single
> +universal-character-names and
>  non-whitespace characters that do not lexically match the other
>  preprocessing token categories.
> +If a single universal-character-name does not match any of the other
> +preprocessing token categories, the program is ill-formed.
>  If a ' or a " character matches the last category, the behavior
>  is undefined.
>  ...

If the above (together with the identifier-start and identifier-continue
non-terminals only mentioning XID_Start plus 0x5F and XID_Continue UCNs)
means that we should indeed put each such UTF-8 character or UCN into a
separate CPP_OTHER token for C++23, then we need something like this
incremental patch.
The drawback is worse diagnostics, though, so maybe it would be useful if
the cpp_error diagnostics "... is not valid in an identifier" and
"... is not valid at the start of an identifier" were instead emitted as
warnings (and not emitted at all when skipping)?

2021-08-06  Jakub Jelinek  <jakub@redhat.com>

gcc/c-family/
	* c-lex.c (c_lex_with_flags): For CPP_OTHER with UCN in the
	text report the UCN instead of backslash in the error message.
gcc/testsuite/
	* g++.dg/cpp23/ucnid-1-utf8.C: Adjust expected diagnostics for
	C++23.
	* g++.dg/cpp23/ucnid-2-utf8.C: Likewise.
	* g++.dg/cpp23/normalize3.C: Likewise.
libcpp/
	* charset.c (_cpp_valid_ucn): For cxx23_identifiers, return false
	instead of emitting cpp_error for UCNs not valid in identifiers.
	(_cpp_valid_utf8): Similarly for UTF-8 characters not valid in
	identifiers.
	* lex.c (_cpp_lex_direct): For UCNs not valid in identifiers,
	create CPP_OTHER token for the whole UCN.

--- libcpp/charset.c.jj	2021-08-06 11:01:09.052644793 +0200
+++ libcpp/charset.c	2021-08-06 14:49:15.648388743 +0200
@@ -1158,6 +1158,15 @@ _cpp_valid_ucn (cpp_reader *pfile, const
     {
       int validity = ucn_valid_in_identifier (pfile, result, nst);
 
+      if (CPP_OPTION (pfile, cxx23_identifiers))
+	{
+	  if (validity == 0 || (validity == 2 && identifier_pos == 1))
+	    {
+	      *cp = result;
+	      *pstr = base + 2;
+	      return false;
+	    }
+	}
       if (validity == 0)
 	cpp_error (pfile, CPP_DL_ERROR,
 		   "universal character %.*s is not valid in an identifier",
@@ -1283,9 +1292,11 @@ _cpp_valid_utf8 (cpp_reader *pfile,
 	     because logically, the UTF-8 was converted to a UCN during
 	     translation phase 1 (even though we don't physically do it that
 	     way).  In C, this byte rather becomes grammatically a separate
-	     token.  */
+	     token.  In C++23, it should become grammatically a separate
+	     token as well.  */
 
-	  if (CPP_OPTION (pfile, cplusplus))
+	  if (CPP_OPTION (pfile, cplusplus)
+	      && !CPP_OPTION (pfile, cxx23_identifiers))
 	    cpp_error (pfile, CPP_DL_ERROR,
 		       "extended character %.*s is not valid in an identifier",
 		       (int) (*pstr - base), base);
@@ -1300,6 +1311,11 @@ _cpp_valid_utf8 (cpp_reader *pfile,
 	case 2:
 	  if (identifier_pos == 1)
 	    {
+	      if (CPP_OPTION (pfile, cxx23_identifiers))
+		{
+		  *pstr = base;
+		  return false;
+		}
 	      /* This is treated the same way in C++ or C99 -- lexed as an
 		 identifier which is then invalid because an identifier is
 		 not allowed to start with this character.  */
--- libcpp/lex.c.jj	2021-08-05 21:52:45.491176158 +0200
+++ libcpp/lex.c	2021-08-06 14:49:38.861067796 +0200
@@ -3398,6 +3398,16 @@ _cpp_lex_direct (cpp_reader *pfile)
 	    if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
 	      buffer->cur = pstr;
 	  }
+	else if (c == '\\'
+		 && (buffer->cur[0] == 'u' || buffer->cur[0] == 'U')
+		 && CPP_OPTION (pfile, cxx23_identifiers))
+	  {
+	    const uchar *pstr = base + 2;
+	    cppchar_t s;
+	    if (_cpp_valid_ucn (pfile, &pstr, buffer->rlimit, 0, NULL, &s,
+				NULL, NULL))
+	      buffer->cur = pstr;
+	  }
 	create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
 	break;
       }
--- gcc/c-family/c-lex.c.jj	2021-07-23 22:06:02.236084923 +0200
+++ gcc/c-family/c-lex.c	2021-08-06 15:04:29.817748967 +0200
@@ -613,6 +613,16 @@ c_lex_with_flags (tree *value, location_
 
 	if (c == '"' || c == '\'')
 	  error_at (*loc, "missing terminating %c character", (int) c);
+	else if (c == '\\'
+		 && tok->val.str.len == 6
+		 && tok->val.str.text[1] == 'u')
+	  error_at (*loc, "stray %<\\u%.4s%> in program",
+		    tok->val.str.text + 2);
+	else if (c == '\\'
+		 && tok->val.str.len == 10
+		 && tok->val.str.text[1] == 'U')
+	  error_at (*loc, "stray %<\\U%.8s%> in program",
+		    tok->val.str.text + 2);
 	else if (ISGRAPH (c))
 	  error_at (*loc, "stray %qc in program", (int) c);
 	else
--- gcc/testsuite/g++.dg/cpp23/ucnid-1-utf8.C.jj	2021-08-05 21:52:45.493176130 +0200
+++ gcc/testsuite/g++.dg/cpp23/ucnid-1-utf8.C	2021-08-06 16:27:09.965156132 +0200
@@ -3,16 +3,27 @@
 // { dg-options "" }
 
 bool 👷 = true;
-bool 👷‍♀ = false;	// { dg-error "is not valid in an identifier" }
-int ⏰ = 0;	// { dg-error "is not valid in an identifier" }
+bool 👷‍♀ = false;	// { dg-error "is not valid in an identifier" "" { target { ! c++23 } } }
+// { dg-error "stray \[^\n\r]* in program" "" { target c++23 } .-1 }
+int ⏰ = 0;	// { dg-error "is not valid in an identifier" "" { target { ! c++23 } } }
+// { dg-error "expected unqualified-id before '=' token" "" { target c++23 } .-1 }
+// { dg-error "stray \[^\n\r]* in program" "" { target c++23 } .-2 }
 int 🕐 = 0;
-int ☠ = 0;	// { dg-error "is not valid in an identifier" }
+int ☠ = 0;	// { dg-error "is not valid in an identifier" "" { target { ! c++23 } } }
+// { dg-error "expected unqualified-id before '=' token" "" { target c++23 } .-1 }
+// { dg-error "stray \[^\n\r]* in program" "" { target c++23 } .-2 }
 int 💀 = 0;
-int ✋ = 0;	// { dg-error "is not valid in an identifier" }
+int ✋ = 0;	// { dg-error "is not valid in an identifier" "" { target { ! c++23 } } }
+// { dg-error "expected unqualified-id before '=' token" "" { target c++23 } .-1 }
+// { dg-error "stray \[^\n\r]* in program" "" { target c++23 } .-2 }
 int 👊 = 0;
-int ✈ = 0;	// { dg-error "is not valid in an identifier" }
+int ✈ = 0;	// { dg-error "is not valid in an identifier" "" { target { ! c++23 } } }
+// { dg-error "expected unqualified-id before '=' token" "" { target c++23 } .-1 }
+// { dg-error "stray \[^\n\r]* in program" "" { target c++23 } .-2 }
 int 🚀 = 0;
-int ☹ = 0;	// { dg-error "is not valid in an identifier" }
+int ☹ = 0;	// { dg-error "is not valid in an identifier" "" { target { ! c++23 } } }
+// { dg-error "expected unqualified-id before '=' token" "" { target c++23 } .-1 }
+// { dg-error "stray \[^\n\r]* in program" "" { target c++23 } .-2 }
 int 😀 = 0;
 struct E {};
 class 💩 : public E {};
--- gcc/testsuite/g++.dg/cpp23/ucnid-2-utf8.C.jj	2021-08-05 21:52:45.493176130 +0200
+++ gcc/testsuite/g++.dg/cpp23/ucnid-2-utf8.C	2021-08-06 16:35:30.628232650 +0200
@@ -2,17 +2,44 @@
 // { dg-do compile }
 // { dg-options "-pedantic-errors" }
 
-bool 👷 = true;	// { dg-error "is not valid in an identifier" "" { target { c++98_only || c++23 } } }
-bool 👷‍♀ = false;	// { dg-error "is not valid in an identifier" }
-int ⏰ = 0;	// { dg-error "is not valid in an identifier" }
-int 🕐 = 0;	// { dg-error "is not valid in an identifier" "" { target { c++98_only || c++23 } } }
-int ☠ = 0;	// { dg-error "is not valid in an identifier" }
-int 💀 = 0;	// { dg-error "is not valid in an identifier" "" { target { c++98_only || c++23 } } }
-int ✋ = 0;	// { dg-error "is not valid in an identifier" }
-int 👊 = 0;	// { dg-error "is not valid in an identifier" "" { target { c++98_only || c++23 } } }
-int ✈ = 0;	// { dg-error "is not valid in an identifier" }
-int 🚀 = 0;	// { dg-error "is not valid in an identifier" "" { target { c++98_only || c++23 } } }
-int ☹ = 0;	// { dg-error "is not valid in an identifier" }
-int 😀 = 0;	// { dg-error "is not valid in an identifier" "" { target { c++98_only || c++23 } } }
+bool 👷 = true;	// { dg-error "is not valid in an identifier" "" { target { c++98_only } } }
+// { dg-error "expected unqualified-id before '=' token" "" { target c++23 } .-1 }
+// { dg-error "stray \[^\n\r]* in program" "" { target c++23 } .-2 }
+bool 👷‍♀ = false;	// { dg-error "is not valid in an identifier" "" { target { ! c++23 } } }
+// { dg-error "expected unqualified-id before '=' token" "" { target c++23 } .-1 }
+// { dg-error "stray \[^\n\r]* in program" "" { target c++23 } .-2 }
+int ⏰ = 0;	// { dg-error "is not valid in an identifier" "" { target { ! c++23 } } }
+// { dg-error "expected unqualified-id before '=' token" "" { target c++23 } .-1 }
+// { dg-error "stray \[^\n\r]* in program" "" { target c++23 } .-2 }
+int 🕐 = 0;	// { dg-error "is not valid in an identifier" "" { target { c++98_only } } }
+// { dg-error "expected unqualified-id before '=' token" "" { target c++23 } .-1 }
+// { dg-error "stray \[^\n\r]* in program" "" { target c++23 } .-2 }
+int ☠ = 0;	// { dg-error "is not valid in an identifier" "" { target { ! c++23 } } }
+// { dg-error "expected unqualified-id before '=' token" "" { target c++23 } .-1 }
+// { dg-error "stray \[^\n\r]* in program" "" { target c++23 } .-2 }
+int 💀 = 0;	// { dg-error "is not valid in an identifier" "" { target { c++98_only } } }
+// { dg-error "expected unqualified-id before '=' token" "" { target c++23 } .-1 }
+// { dg-error "stray \[^\n\r]* in program" "" { target c++23 } .-2 }
+int ✋ = 0;	// { dg-error "is not valid in an identifier" "" { target { ! c++23 } } }
+// { dg-error "expected unqualified-id before '=' token" "" { target c++23 } .-1 }
+// { dg-error "stray \[^\n\r]* in program" "" { target c++23 } .-2 }
+int 👊 = 0;	// { dg-error "is not valid in an identifier" "" { target { c++98_only } } }
+// { dg-error "expected unqualified-id before '=' token" "" { target c++23 } .-1 }
+// { dg-error "stray \[^\n\r]* in program" "" { target c++23 } .-2 }
+int ✈ = 0;	// { dg-error "is not valid in an identifier" "" { target { ! c++23 } } }
+// { dg-error "expected unqualified-id before '=' token" "" { target c++23 } .-1 }
+// { dg-error "stray \[^\n\r]* in program" "" { target c++23 } .-2 }
+int 🚀 = 0;	// { dg-error "is not valid in an identifier" "" { target { c++98_only } } }
+// { dg-error "expected unqualified-id before '=' token" "" { target c++23 } .-1 }
+// { dg-error "stray \[^\n\r]* in program" "" { target c++23 } .-2 }
+int ☹ = 0;	// { dg-error "is not valid in an identifier" "" { target { ! c++23 } } }
+// { dg-error "expected unqualified-id before '=' token" "" { target c++23 } .-1 }
+// { dg-error "stray \[^\n\r]* in program" "" { target c++23 } .-2 }
+int 😀 = 0;	// { dg-error "is not valid in an identifier" "" { target { c++98_only } } }
+// { dg-error "expected unqualified-id before '=' token" "" { target c++23 } .-1 }
+// { dg-error "stray \[^\n\r]* in program" "" { target c++23 } .-2 }
 struct E {};
-class 💩 : public E {};	// { dg-error "is not valid in an identifier" "" { target { c++98_only || c++23 } } }
+class 💩 : public E {};	// { dg-error "is not valid in an identifier" "" { target { c++98_only } } }
+// { dg-error "anonymous struct" "" { target c++23 } .-1 }
+// { dg-error "used as declaration" "" { target c++23 } .-2 }
+// { dg-error "stray \[^\n\r]* in program" "" { target c++23 } .-3 }
--- gcc/testsuite/g++.dg/cpp23/normalize3.C.jj	2021-08-05 21:52:45.493176130 +0200
+++ gcc/testsuite/g++.dg/cpp23/normalize3.C	2021-08-06 15:12:34.171050731 +0200
@@ -2,31 +2,24 @@
 // { dg-options "-pedantic-errors" }
 
 \u00AA
-\u00B7	// { dg-error "is not valid at the start of an identifier" }
+\u00B7
 \u0F43  // { dg-error "not in NFC" }
 a\u05B8\u05B9\u05B9\u05BBb
  a\u05BB\u05B9\u05B8\u05B9b  // { dg-error "not in NFC" }
-\u09CB	// { dg-error "is not valid at the start of an identifier" }
-\u09C7\u09BE // { dg-error "not in NFC" }
-	// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
-\u0B4B	// { dg-error "is not valid at the start of an identifier" }
-\u0B47\u0B3E // { dg-error "not in NFC" }
-	// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
-\u0BCA	// { dg-error "is not valid at the start of an identifier" }
-\u0BC6\u0BBE // { dg-error "not in NFC" }
-	// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
-\u0BCB	// { dg-error "is not valid at the start of an identifier" }
-\u0BC7\u0BBE // { dg-error "not in NFC" }
-	// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
-\u0CCA	// { dg-error "is not valid at the start of an identifier" }
-\u0CC6\u0CC2 // { dg-error "not in NFC" }
-	// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
-\u0D4A	// { dg-error "is not valid at the start of an identifier" }
-\u0D46\u0D3E // { dg-error "not in NFC" }
-	// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
-\u0D4B	// { dg-error "is not valid at the start of an identifier" }
-\u0D47\u0D3E // { dg-error "not in NFC" }
-	// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
+\u09CB
+\u09C7\u09BE
+\u0B4B
+\u0B47\u0B3E
+\u0BCA
+\u0BC6\u0BBE
+\u0BCB
+\u0BC7\u0BBE
+\u0CCA
+\u0CC6\u0CC2
+\u0D4A
+\u0D46\u0D3E
+\u0D4B
+\u0D47\u0D3E
 
 K
 \u212A // { dg-error "not in NFC" }
@@ -41,31 +34,24 @@ K
 \uAC00\u11A8 // { dg-error "not in NFC" }
 
 ª
-·	// { dg-error "is not valid at the start of an identifier" }
+·
 གྷ  // { dg-error "not in NFC" }
 aָֹֹֻb
  aָֹֹֻb  // { dg-error "not in NFC" }
-ো	// { dg-error "is not valid at the start of an identifier" }
-ো // { dg-error "not in NFC" }
-	// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
-ୋ	// { dg-error "is not valid at the start of an identifier" }
-ୋ // { dg-error "not in NFC" }
-	// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
-ொ	// { dg-error "is not valid at the start of an identifier" }
-ொ // { dg-error "not in NFC" }
-	// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
-ோ	// { dg-error "is not valid at the start of an identifier" }
-ோ // { dg-error "not in NFC" }
-	// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
-ೊ	// { dg-error "is not valid at the start of an identifier" }
-ೊ // { dg-error "not in NFC" }
-	// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
-ൊ	// { dg-error "is not valid at the start of an identifier" }
-ൊ // { dg-error "not in NFC" }
-	// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
-ോ	// { dg-error "is not valid at the start of an identifier" }
-ോ // { dg-error "not in NFC" }
-	// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
+ো
+ো
+ୋ
+ୋ
+ொ
+ொ
+ோ
+ோ
+ೊ
+ೊ
+ൊ
+ൊ
+ോ
+ോ
 
 K
 K // { dg-error "not in NFC" }

	Jakub



More information about the Gcc-patches mailing list