[PATCH] c++: Implement C++26 P1854R4 - Making non-encodable string literals ill-formed [PR110341]

Fri Aug 25 20:49:24 GMT 2023

Hi!

This paper, voted in as a DR, makes some multi-character literals ill-formed.
'abcd' stays valid, but e.g. 'á' is newly invalid with the UTF-8 exec charset
while it stays valid e.g. with ISO-8859-1, because in UTF-8 it is a single
character which needs 2 bytes to be encoded.

The following patch implements that by checking (only when pedantic,
especially because it is a DR), in the cases where we'd emit a -Wmultichar
warning because the character constant has more than one byte in it, whether
the number of bytes in the narrow string matches the number of bytes in the
CPP_STRING32 interpretation divided by the char32_t size in bytes.  If it
does, it is a normal multi-character literal constant and is diagnosed
normally with -Wmultichar; if the number of bytes is larger, at least one of
the c-chars in the sequence was encoded as 2+ bytes.

Now, doing it this way has 2 drawbacks.  Some of the diagnostics which don't
result in cpp_interpret_string_1 failures can be printed twice, once
when calling cpp_interpret_string_1 for CPP_CHAR, once for CPP_STRING32.
And, functionally I think it must work 100% correctly if host source
character set is UTF-8 (because all valid UTF-8 chars are encodable in
UTF-32), but might not work for some control codes in UTF-EBCDIC if
that is the source character set (though I don't know if we really actually
support it, e.g. Linux iconv certainly doesn't).
All we actually need is to count the number of c-chars in the literal; an
alternative would be to write a custom character counter which would quietly
interpret/skip over + count escape sequences and decode UTF-8 characters
in between those escape sequences.  But we'd need to have something similar
also for UTF-EBCDIC if it works at all, and from what I've looked, we don't
have anything like that implemented in libcpp nor anywhere else in GCC.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
Or ok with some tweaks to avoid the second round of diagnostics from
cpp_interpret_string_1/convert_escape?  Or reimplement that second time and
count manually?

2023-08-25  Jakub Jelinek  <jakub@redhat.com>

	PR c++/110341
libcpp/
	* charset.cc: Implement C++26 P1854R4 - Making non-encodable string
	literals ill-formed.
	(narrow_str_to_charconst): Change type of last argument from cpp_ttype
	to const cpp_token *.  For C++ if pedantic and i > 1 in CPP_CHAR
	interpret token also as CPP_STRING32 and if number of characters
	in the CPP_STRING32 is smaller than number of bytes in CPP_CHAR,
	pedwarn on it.
	(cpp_interpret_charconst): Adjust narrow_str_to_charconst caller.
gcc/testsuite/
	* g++.dg/cpp26/literals1.C: New test.
	* g++.dg/cpp26/literals2.C: New test.
	* g++.dg/cpp23/wchar-multi1.C (c, d): Expect an error rather than
	warning.

--- libcpp/charset.cc.jj	2023-08-24 15:36:59.000000000 +0200
+++ libcpp/charset.cc	2023-08-25 17:14:14.098733396 +0200
@@ -2567,18 +2567,20 @@ cpp_interpret_string_notranslate (cpp_re
 /* Subroutine of cpp_interpret_charconst which performs the conversion
    to a number, for narrow strings.  STR is the string structure returned
    by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
-   cpp_interpret_charconst.  TYPE is the token type.  */
+   cpp_interpret_charconst.  TOKEN is the token.  */
 static cppchar_t
 narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
 			 unsigned int *pchars_seen, int *unsignedp,
-			 enum cpp_ttype type)
+			 const cpp_token *token)
 {
+  enum cpp_ttype type = token->type;
   size_t width = CPP_OPTION (pfile, char_precision);
   size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
   size_t mask = width_to_mask (width);
   size_t i;
   cppchar_t result, c;
   bool unsigned_p;
+  bool diagnosed = false;
 
   /* The value of a multi-character character constant, or a
      single-character character constant whose representation in the
@@ -2602,7 +2604,37 @@ narrow_str_to_charconst (cpp_reader *pfi
 
   if (type == CPP_UTF8CHAR)
     max_chars = 1;
-  if (i > max_chars)
+  else if (i > 1 && CPP_OPTION (pfile, cplusplus) && CPP_PEDANTIC (pfile))
+    {
+      /* C++ as a DR since
+	 P1854R4 - Making non-encodable string literals ill-formed
+	 makes multi-character narrow character literals if any of the
+	 characters in the literal isn't encodable in char/unsigned char
+	 ill-formed.  We need to count the number of c-chars and compare
+	 that to str.len.  */
+      cpp_string str2 = { 0, 0 };
+      if (cpp_interpret_string (pfile, &token->val.str, 1, &str2,
+				CPP_STRING32))
+	{
+	  size_t width32 = converter_for_type (pfile, CPP_STRING32).width;
+	  size_t nbwc = width32 / width;
+	  size_t len = str2.len / nbwc;
+	  if (str2.text != token->val.str.text)
+	    free ((void *)str2.text);
+	  if (str.len > len)
+	    {
+	      diagnosed
+		= cpp_error (pfile, CPP_DL_PEDWARN,
+			     "character too large for character literal "
+			     "type");
+	      if (diagnosed && i > max_chars)
+		i = max_chars;
+	    }
+	}
+    }
+  if (diagnosed)
+    /* Already diagnosed above.  */;
+  else if (i > max_chars)
     {
       i = max_chars;
       cpp_error (pfile, type == CPP_UTF8CHAR ? CPP_DL_ERROR : CPP_DL_WARNING,
@@ -2747,7 +2779,7 @@ cpp_interpret_charconst (cpp_reader *pfi
 				    token->type);
   else
     result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp,
-				      token->type);
+				      token);
 
   if (str.text != token->val.str.text)
     free ((void *)str.text);
--- gcc/testsuite/g++.dg/cpp26/literals1.C.jj	2023-08-25 17:23:06.662878355 +0200
+++ gcc/testsuite/g++.dg/cpp26/literals1.C	2023-08-25 17:37:03.085132304 +0200
@@ -0,0 +1,65 @@
+// C++26 P1854R4 - Making non-encodable string literals ill-formed
+// { dg-do compile { target c++11 } }
+// { dg-require-effective-target int32 }
+// { dg-options "-pedantic-errors -finput-charset=UTF-8 -fexec-charset=UTF-8" }
+
+int a = 'abcd';						// { dg-warning "multi-character character constant" }
+int b = '\x61\x62\x63\x64';				// { dg-warning "multi-character character constant" }
+int c = 'á';						// { dg-error "character too large for character literal type" }
+int d = '😁';						// { dg-error "character too large for character literal type" }
+int e = '\N{FACE WITH TEARS OF JOY}';			// { dg-error "character too large for character literal type" }
+							// { dg-error "named universal character escapes are only valid in" "" { target c++20_down } .-1 }
+int f = '\U0001F602';					// { dg-error "character too large for character literal type" }
+wchar_t g = L'abcd';					// { dg-error "character constant too long for its type" "" { target c++23 } }
+							// { dg-warning "character constant too long for its type" "" { target c++20_down } .-1 }
+wchar_t h = L'\x61\x62\x63\x64';			// { dg-error "character constant too long for its type" "" { target c++23 } }
+							// { dg-warning "character constant too long for its type" "" { target c++20_down } .-1 }
+wchar_t i = L'á';
+char16_t j = u'abcd';					// { dg-error "character constant too long for its type" }
+char16_t k = u'\x61\x62\x63\x64';			// { dg-error "character constant too long for its type" }
+char16_t l = u'á';
+char16_t m = u'😁';					// { dg-error "character constant too long for its type" }
+char16_t n = u'\N{FACE WITH TEARS OF JOY}';		// { dg-error "character constant too long for its type" { target c++23 } }
+							// { dg-error "named universal character escapes are only valid in" "" { target c++20_down } .-1 }
+char16_t o = u'\U0001F602';				// { dg-error "character constant too long for its type" }
+char32_t p = U'abcd';					// { dg-error "character constant too long for its type" }
+char32_t q = U'\x61\x62\x63\x64';			// { dg-error "character constant too long for its type" }
+char32_t r = U'á';
+char32_t s = U'😁';
+char32_t t = U'\N{FACE WITH TEARS OF JOY}';		// { dg-error "named universal character escapes are only valid in" "" { target c++20_down } }
+char32_t u = U'\U0001F602';
+#if __cpp_unicode_characters >= 201411L
+auto v = u8'abcd';					// { dg-error "character constant too long for its type" "" { target c++17 } }
+auto w = u8'\x61\x62\x63\x64';				// { dg-error "character constant too long for its type" "" { target c++17 } }
+auto x = u8'á';						// { dg-error "character constant too long for its type" "" { target c++17 } }
+auto y = u8'😁';					// { dg-error "character constant too long for its type" "" { target c++17 } }
+auto z = u8'\N{FACE WITH TEARS OF JOY}';		// { dg-error "character constant too long for its type" "" { target c++17 } }
+							// { dg-error "named universal character escapes are only valid in" "" { target { c++17 && c++20_down } } .-1 }
+auto aa = u8'\U0001F602';				// { dg-error "character constant too long for its type" "" { target c++17 } }
+#endif
+const char *ab = "😁";
+const char *ac = "\N{FACE WITH TEARS OF JOY}";		// { dg-error "named universal character escapes are only valid in" "" { target c++20_down } }
+const char *ad = "\U0001F602";
+const char16_t *ae = u"😁";
+const char16_t *af = u"\N{FACE WITH TEARS OF JOY}";	// { dg-error "named universal character escapes are only valid in" "" { target c++20_down } }
+const char16_t *ag = u"\U0001F602";
+const char32_t *ah = U"😁";
+const char32_t *ai = U"\N{FACE WITH TEARS OF JOY}";	// { dg-error "named universal character escapes are only valid in" "" { target c++20_down } }
+const char32_t *aj = U"\U0001F602";
+auto ak = u8"😁";
+auto al = u8"\N{FACE WITH TEARS OF JOY}";		// { dg-error "named universal character escapes are only valid in" "" { target c++20_down } }
+auto am = u8"\U0001F602";
+int an = '\x123456789';					// { dg-error "hex escape sequence out of range" }
+wchar_t ao = L'\x123456789abcdef0';			// { dg-error "hex escape sequence out of range" }
+char16_t ap = u'\x12345678';				// { dg-error "hex escape sequence out of range" }
+char32_t aq = U'\x123456789abcdef0';			// { dg-error "hex escape sequence out of range" }
+#if __cpp_unicode_characters >= 201411L
+auto ar = u8'\x123456789abcdef0';			// { dg-error "hex escape sequence out of range" "" { target c++17 } }
+#endif
+char as = '\xff';
+#if __SIZEOF_WCHAR_T__ * __CHAR_BIT__ == 32
+wchar_t at = L'\xffffffff';
+#elif __SIZEOF_WCHAR_T__ * __CHAR_BIT__ == 16
+wchar_t at = L'\xffff';
+#endif
+int au = '\x1234';					// { dg-error "hex escape sequence out of range" }
--- gcc/testsuite/g++.dg/cpp26/literals2.C.jj	2023-08-25 17:37:34.549728535 +0200
+++ gcc/testsuite/g++.dg/cpp26/literals2.C	2023-08-25 17:41:03.923041763 +0200
@@ -0,0 +1,67 @@
+// C++26 P1854R4 - Making non-encodable string literals ill-formed
+// { dg-do compile { target c++11 } }
+// { dg-require-effective-target int32 }
+// { dg-options "-pedantic-errors -finput-charset=UTF-8 -fexec-charset=ISO-8859-1" }
+/* { dg-require-iconv "ISO-8859-1" } */
+
+int a = 'abcd';						// { dg-warning "multi-character character constant" }
+int b = '\x61\x62\x63\x64';				// { dg-warning "multi-character character constant" }
+int c = 'á';
+int d = '😁';						// { dg-error "converting to execution character set" }
+int e = '\N{FACE WITH TEARS OF JOY}';			// { dg-error "converting UCN to execution character set" }
+							// { dg-error "named universal character escapes are only valid in" "" { target c++20_down } .-1 }
+int f = '\U0001F602';					// { dg-error "converting UCN to execution character set" }
+wchar_t g = L'abcd';					// { dg-error "character constant too long for its type" "" { target c++23 } }
+							// { dg-warning "character constant too long for its type" "" { target c++20_down } .-1 }
+wchar_t h = L'\x61\x62\x63\x64';			// { dg-error "character constant too long for its type" "" { target c++23 } }
+							// { dg-warning "character constant too long for its type" "" { target c++20_down } .-1 }
+wchar_t i = L'á';
+char16_t j = u'abcd';					// { dg-error "character constant too long for its type" }
+char16_t k = u'\x61\x62\x63\x64';			// { dg-error "character constant too long for its type" }
+char16_t l = u'á';
+char16_t m = u'😁';					// { dg-error "character constant too long for its type" }
+char16_t n = u'\N{FACE WITH TEARS OF JOY}';		// { dg-error "character constant too long for its type" { target c++23 } }
+							// { dg-error "named universal character escapes are only valid in" "" { target c++20_down } .-1 }
+char16_t o = u'\U0001F602';				// { dg-error "character constant too long for its type" }
+char32_t p = U'abcd';					// { dg-error "character constant too long for its type" }
+char32_t q = U'\x61\x62\x63\x64';			// { dg-error "character constant too long for its type" }
+char32_t r = U'á';
+char32_t s = U'😁';
+char32_t t = U'\N{FACE WITH TEARS OF JOY}';		// { dg-error "named universal character escapes are only valid in" "" { target c++20_down } }
+char32_t u = U'\U0001F602';
+#if __cpp_unicode_characters >= 201411L
+auto v = u8'abcd';					// { dg-error "character constant too long for its type" "" { target c++17 } }
+auto w = u8'\x61\x62\x63\x64';				// { dg-error "character constant too long for its type" "" { target c++17 } }
+auto x = u8'á';						// { dg-error "character constant too long for its type" "" { target c++17 } }
+auto y = u8'😁';					// { dg-error "character constant too long for its type" "" { target c++17 } }
+auto z = u8'\N{FACE WITH TEARS OF JOY}';		// { dg-error "character constant too long for its type" "" { target c++17 } }
+							// { dg-error "named universal character escapes are only valid in" "" { target { c++17 && c++20_down } } .-1 }
+auto aa = u8'\U0001F602';				// { dg-error "character constant too long for its type" "" { target c++17 } }
+#endif
+const char *ab = "😁";					// { dg-error "converting to execution character set" }
+const char *ac = "\N{FACE WITH TEARS OF JOY}";		// { dg-error "converting UCN to execution character set" }
+							// { dg-error "named universal character escapes are only valid in" "" { target c++20_down } .-1 }
+const char *ad = "\U0001F602";				// { dg-error "converting UCN to execution character set" }
+const char16_t *ae = u"😁";
+const char16_t *af = u"\N{FACE WITH TEARS OF JOY}";	// { dg-error "named universal character escapes are only valid in" "" { target c++20_down } }
+const char16_t *ag = u"\U0001F602";
+const char32_t *ah = U"😁";
+const char32_t *ai = U"\N{FACE WITH TEARS OF JOY}";	// { dg-error "named universal character escapes are only valid in" "" { target c++20_down } }
+const char32_t *aj = U"\U0001F602";
+auto ak = u8"😁";
+auto al = u8"\N{FACE WITH TEARS OF JOY}";		// { dg-error "named universal character escapes are only valid in" "" { target c++20_down } }
+auto am = u8"\U0001F602";
+int an = '\x123456789';					// { dg-error "hex escape sequence out of range" }
+wchar_t ao = L'\x123456789abcdef0';			// { dg-error "hex escape sequence out of range" }
+char16_t ap = u'\x12345678';				// { dg-error "hex escape sequence out of range" }
+char32_t aq = U'\x123456789abcdef0';			// { dg-error "hex escape sequence out of range" }
+#if __cpp_unicode_characters >= 201411L
+auto ar = u8'\x123456789abcdef0';			// { dg-error "hex escape sequence out of range" "" { target c++17 } }
+#endif
+char as = '\xff';
+#if __SIZEOF_WCHAR_T__ * __CHAR_BIT__ == 32
+wchar_t at = L'\xffffffff';
+#elif __SIZEOF_WCHAR_T__ * __CHAR_BIT__ == 16
+wchar_t at = L'\xffff';
+#endif
+int au = '\x1234';					// { dg-error "hex escape sequence out of range" }
--- gcc/testsuite/g++.dg/cpp23/wchar-multi1.C.jj	2022-08-27 23:01:28.321565931 +0200
+++ gcc/testsuite/g++.dg/cpp23/wchar-multi1.C	2023-08-25 22:20:42.772015922 +0200
@@ -4,9 +4,9 @@
 
 char a = 'a';
 int b = 'ab';			// { dg-warning "multi-character character constant" }
-int c = '\u05D9';		// { dg-warning "multi-character character constant" }
+int c = '\u05D9';		// { dg-error "character too large for character literal type" }
 #if __SIZEOF_INT__ > 2
-int d = '\U0001F525';		// { dg-warning "multi-character character constant" "" { target int32 } }
+int d = '\U0001F525';		// { dg-error "character too large for character literal type" "" { target int32 } }
 #endif
 int e = 'abcd';			// { dg-warning "multi-character character constant" }
 wchar_t f = L'f';

	Jakub