Bug 100977 - [C++23] Implement C++ Identifier Syntax using Unicode Standard Annex 31
Summary: [C++23] Implement C++ Identifier Syntax using Unicode Standard Annex 31
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: c++ (show other bugs)
Version: 12.0
Importance: P3 normal
Target Milestone: ---
Assignee: Jakub Jelinek
URL:
Keywords:
Depends on:
Blocks: c++23-core
  Show dependency tree / graph
 
Reported: 2021-06-08 18:15 UTC by Jason Merrill
Modified: 2021-12-01 09:22 UTC (History)
3 users (show)

See Also:
Host:
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed: 2021-06-08 00:00:00


Attachments
gcc12-pr100977-1.patch (4.54 KB, patch)
2021-08-04 14:08 UTC, Jakub Jelinek
Details | Diff
gcc12-pr100977-2-wip.patch (22.26 KB, patch)
2021-08-04 18:40 UTC, Jakub Jelinek
Details | Diff
gcc12-pr100977-2.patch (25.17 KB, patch)
2021-08-05 10:17 UTC, Jakub Jelinek
Details | Diff

Note: You need to log in before you can comment on or make changes to this bug.
Description Jason Merrill 2021-06-08 18:15:40 UTC
https://wg21.link/p1949r7

This seems like largely a matter of adding another category to libcpp/ucnid.tab.
Comment 1 Jakub Jelinek 2021-08-04 13:39:29 UTC
I think it might be better to make makeucnid also parse the https://www.unicode.org/Public/13.0.0/ucd/DerivedCoreProperties.txt
file and read the XID_Start and XID_Continue properties from there.
But when I just regenerate ucnid.h using Unicode 13.0.0 txt files, the difference is:
--- /usr/src/gcc/libcpp/ucnid.h	2021-08-04 15:04:46.053701822 +0200
+++ ucnid.h	2021-08-04 15:05:36.773996631 +0200
@@ -505,6 +505,7 @@ static const struct ucnrange ucnranges[]
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x07f0 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x07f1 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 220, 0x07f2 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x07fc },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x0815 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x0816 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x0817 },
@@ -529,7 +530,23 @@ static const struct ucnrange ucnranges[]
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x0858 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 220, 0x0859 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 220, 0x085a },
-{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x08e3 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x08d2 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 220, 0x08d3 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x08d4 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x08d5 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x08d6 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x08d7 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x08d8 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x08d9 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x08da },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x08db },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x08dc },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x08dd },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x08de },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x08df },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x08e0 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x08e2 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 220, 0x08e3 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x08e4 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x08e5 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 220, 0x08e6 },
@@ -556,6 +573,7 @@ static const struct ucnrange ucnranges[]
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x08fb },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x08fc },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x08fd },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x08fe },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x0900 },
 { C99|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x0903 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x0904 },
@@ -615,6 +633,7 @@ static const struct ucnrange ucnranges[]
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x09e5 },
 { C99|N99|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x09ef },
 { C99|  0|CXX|C11|  0|CID|NFC|NKC|  0,   0, 0x09f1 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x09fd },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x0a01 },
 { C99|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x0a02 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x0a04 },
@@ -820,6 +839,8 @@ static const struct ucnrange ucnranges[]
 { C99|  0|CXX|C11|  0|CID|NFC|NKC|  0,   0, 0x0d28 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x0d29 },
 { C99|  0|CXX|C11|  0|CID|NFC|NKC|  0,   0, 0x0d39 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x0d3a },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   9, 0x0d3b },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x0d3d },
 { C99|  0|  0|C11|  0|CID|NFC|NKC|CTX,   0, 0x0d3e },
 { C99|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x0d43 },
@@ -894,7 +915,7 @@ static const struct ucnrange ucnranges[]
 { C99|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x0eb7 },
 { C99|  0|  0|C11|  0|CID|NFC|NKC|  0, 118, 0x0eb8 },
 { C99|  0|  0|C11|  0|CID|NFC|NKC|  0, 118, 0x0eb9 },
-{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x0eba },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   9, 0x0eba },
 { C99|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x0ebc },
 { C99|  0|CXX|C11|  0|CID|NFC|NKC|  0,   0, 0x0ebd },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x0ebf },
@@ -1031,6 +1052,22 @@ static const struct ucnrange ucnranges[]
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1a7a },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1a7b },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1a7e },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1aaf },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1ab0 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1ab1 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1ab2 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1ab3 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1ab4 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 220, 0x1ab5 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 220, 0x1ab6 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 220, 0x1ab7 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 220, 0x1ab8 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 220, 0x1ab9 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 220, 0x1aba },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1abb },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1abc },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1abe },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 220, 0x1abf },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1b05 },
 {   0|  0|  0|C11|  0|  0|NFC|NKC|  0,   0, 0x1b06 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1b07 },
@@ -1094,6 +1131,8 @@ static const struct ucnrange ucnranges[]
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   1, 0x1ce7 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1cec },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1cf3 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1cf7 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1cf8 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1d2b },
 {   0|  0|  0|C11|  0|CID|NFC|  0|  0,   0, 0x1d2e },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1d2f },
@@ -1144,7 +1183,27 @@ static const struct ucnrange ucnranges[]
 {   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0x1de3 },
 {   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0x1de4 },
 {   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0x1de5 },
-{   0|  0|  0|C11|N11|CID|NFC|NKC|  0,   0, 0x1dfb },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0x1de6 },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0x1de7 },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0x1de8 },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0x1de9 },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0x1dea },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0x1deb },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0x1dec },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0x1ded },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0x1dee },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0x1def },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0x1df0 },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0x1df1 },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0x1df2 },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0x1df3 },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0x1df4 },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0x1df5 },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 232, 0x1df6 },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 228, 0x1df7 },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 228, 0x1df8 },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0,   0, 0x1dfa },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0x1dfb },
 {   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 233, 0x1dfc },
 {   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 220, 0x1dfd },
 {   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0x1dfe },
@@ -1527,8 +1586,6 @@ static const struct ucnrange ucnranges[]
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x324f },
 {   0|  0|  0|C11|  0|CID|NFC|  0|  0,   0, 0x327e },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x327f },
-{   0|  0|  0|C11|  0|CID|NFC|  0|  0,   0, 0x32fe },
-{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x32ff },
 {   0|  0|  0|C11|  0|CID|NFC|  0|  0,   0, 0x33ff },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x4dff },
 { C99|  0|CXX|C11|  0|CID|NFC|NKC|  0,   0, 0x9fa5 },
@@ -1543,7 +1600,9 @@ static const struct ucnrange ucnranges[]
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0xa67a },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0xa67b },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0xa67c },
-{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0xa69e },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0xa69b },
+{   0|  0|  0|C11|  0|CID|NFC|  0|  0,   0, 0xa69d },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0xa69e },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0xa6ef },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0xa6f0 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0xa76f },
@@ -1551,6 +1610,7 @@ static const struct ucnrange ucnranges[]
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0xa7f7 },
 {   0|  0|  0|C11|  0|CID|NFC|  0|  0,   0, 0xa7f9 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0xa805 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0xa82b },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0xa8c3 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0xa8df },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0xa8e0 },
@@ -1586,6 +1646,10 @@ static const struct ucnrange ucnranges[]
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0xaabe },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0xaac0 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0xaaf5 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0xab5b },
+{   0|  0|  0|C11|  0|CID|NFC|  0|  0,   0, 0xab5f },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0xab68 },
+{   0|  0|  0|C11|  0|CID|NFC|  0|  0,   0, 0xab69 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0xabec },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0xabff },
 { C99|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0xd7a3 },
@@ -1650,7 +1714,16 @@ static const struct ucnrange ucnranges[]
 {   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0xfe23 },
 {   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0xfe24 },
 {   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0xfe25 },
-{   0|  0|  0|C11|N11|CID|NFC|NKC|  0,   0, 0xfe2f },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0xfe26 },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 220, 0xfe27 },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 220, 0xfe28 },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 220, 0xfe29 },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 220, 0xfe2a },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 220, 0xfe2b },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 220, 0xfe2c },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 220, 0xfe2d },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0xfe2e },
+{   0|  0|  0|C11|N11|CID|NFC|NKC|  0, 230, 0xfe2f },
 {   0|  0|  0|C11|  0|CID|NFC|  0|  0,   0, 0xfe44 },
 {   0|  0|  0|  0|  0|CID|NFC|NKC|  0,   0, 0xfe46 },
 {   0|  0|  0|C11|  0|CID|NFC|  0|  0,   0, 0xfe52 },
@@ -1686,13 +1759,39 @@ static const struct ucnrange ucnranges[]
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0xfffd },
 {   0|  0|  0|  0|  0|CID|NFC|NKC|  0,   0, 0xffff },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x101fc },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x102df },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x10375 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x10376 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x10377 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x10378 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x10379 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x10a0c },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x10a0e },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x10a37 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x10a38 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   1, 0x10a39 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x10a3e },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x10ae4 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x10ae5 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x10d23 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x10d24 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x10d25 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x10d26 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x10eaa },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x10eab },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x10f45 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 220, 0x10f46 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 220, 0x10f47 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x10f48 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x10f49 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x10f4a },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 220, 0x10f4b },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x10f4c },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 220, 0x10f4d },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 220, 0x10f4e },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 220, 0x10f4f },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x11045 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1107e },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x11099 },
 {   0|  0|  0|C11|  0|  0|NFC|NKC|  0,   0, 0x1109a },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1109b },
@@ -1711,9 +1810,88 @@ static const struct ucnrange ucnranges[]
 {   0|  0|  0|C11|  0|  0|NFC|NKC|  0,   0, 0x1112f },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x11132 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   9, 0x11133 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x11172 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x111bf },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x111c9 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x11234 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   9, 0x11235 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x112e8 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   7, 0x112e9 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1133a },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   7, 0x1133b },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1133d },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|CTX,   0, 0x1133e },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1134a },
+{   0|  0|  0|C11|  0|  0|NFC|NKC|  0,   0, 0x1134c },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x11356 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|CTX,   0, 0x11357 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x11365 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x11366 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x11367 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x11368 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x11369 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1136a },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1136b },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1136f },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x11370 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x11371 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x11372 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x11373 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x11441 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x11445 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1145d },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x114af },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|CTX,   0, 0x114b0 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x114b9 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|CTX,   0, 0x114ba },
+{   0|  0|  0|C11|  0|  0|NFC|NKC|  0,   0, 0x114bc },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|CTX,   0, 0x114bd },
+{   0|  0|  0|C11|  0|  0|NFC|NKC|  0,   0, 0x114be },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x114c1 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   9, 0x114c2 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x115ae },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|CTX,   0, 0x115af },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x115b9 },
+{   0|  0|  0|C11|  0|  0|NFC|NKC|  0,   0, 0x115bb },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x115be },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   9, 0x115bf },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1163e },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x116b5 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   9, 0x116b6 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1172a },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x11838 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   9, 0x11839 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1192f },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|CTX,   0, 0x11930 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x11937 },
+{   0|  0|  0|C11|  0|  0|NFC|NKC|  0,   0, 0x11938 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1193c },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   9, 0x1193d },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x11942 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x119df },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x11a33 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x11a46 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x11a98 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x11c3e },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x11d41 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x11d43 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   9, 0x11d44 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x11d96 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x16aef },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   1, 0x16af0 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   1, 0x16af1 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   1, 0x16af2 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   1, 0x16af3 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x16b2f },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x16b30 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x16b31 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x16b32 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x16b33 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x16b34 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x16b35 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x16fef },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   6, 0x16ff0 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1bc9d },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1d15d },
 {   0|  0|  0|C11|  0|  0|  0|  0|  0,   0, 0x1d164 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 216, 0x1d165 },
@@ -1792,6 +1970,69 @@ static const struct ucnrange ucnranges[]
 {   0|  0|  0|C11|  0|CID|NFC|  0|  0,   0, 0x1d7cb },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1d7cd },
 {   0|  0|  0|C11|  0|CID|NFC|  0|  0,   0, 0x1d7ff },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1dfff },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e000 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e001 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e002 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e003 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e004 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e005 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1e007 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e008 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e009 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e00a },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e00b },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e00c },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e00d },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e00e },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e00f },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e010 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e011 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e012 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e013 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e014 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e015 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e016 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e017 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1e01a },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e01b },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e01c },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e01d },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e01e },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e01f },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e020 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1e022 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e023 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1e025 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e026 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e027 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e028 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e029 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1e12f },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e130 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e131 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e132 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e133 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e134 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e135 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1e2eb },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e2ec },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e2ed },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e2ee },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1e8cf },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 220, 0x1e8d0 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 220, 0x1e8d1 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 220, 0x1e8d2 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 220, 0x1e8d3 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 220, 0x1e8d4 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 220, 0x1e8d5 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1e943 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e944 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e945 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e946 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e947 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e948 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x1e949 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1edff },
 {   0|  0|  0|C11|  0|CID|NFC|  0|  0,   0, 0x1ee03 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1ee04 },
@@ -1865,17 +2106,19 @@ static const struct ucnrange ucnranges[]
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1f12f },
 {   0|  0|  0|C11|  0|CID|NFC|  0|  0,   0, 0x1f14f },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1f169 },
-{   0|  0|  0|C11|  0|CID|NFC|  0|  0,   0, 0x1f16b },
+{   0|  0|  0|C11|  0|CID|NFC|  0|  0,   0, 0x1f16c },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1f18f },
 {   0|  0|  0|C11|  0|CID|NFC|  0|  0,   0, 0x1f190 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1f1ff },
 {   0|  0|  0|C11|  0|CID|NFC|  0|  0,   0, 0x1f202 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1f20f },
-{   0|  0|  0|C11|  0|CID|NFC|  0|  0,   0, 0x1f23a },
+{   0|  0|  0|C11|  0|CID|NFC|  0|  0,   0, 0x1f23b },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1f23f },
 {   0|  0|  0|C11|  0|CID|NFC|  0|  0,   0, 0x1f248 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1f24f },
 {   0|  0|  0|C11|  0|CID|NFC|  0|  0,   0, 0x1f251 },
+{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1fbef },
+{   0|  0|  0|C11|  0|CID|NFC|  0|  0,   0, 0x1fbf9 },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x1fffd },
 {   0|  0|  0|  0|  0|CID|NFC|NKC|  0,   0, 0x1ffff },
 {   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x2f7ff },

plus various changes in the check_nfc function.
So, the first question is whether the C11/N11/C99 etc. stuff should use Unicode 4.1 (or whatever was used when it was generated) tables and only CXX20/NXX20 should use Unicode 13.0 tables (what about NFC/NKC?), or whether it is ok to just regenerate everything using Unicode 13.0 files and add parsing of the DerivedCoreProperties.txt file too (picking the XID_Start and XID_Continue properties from there, throwing away everything < 0x80 and otherwise computing the CXX20 flag
as XID_Continue and the NXX20 flag as XID_Continue \ XID_Start).
Comment 2 Jakub Jelinek 2021-08-04 14:08:58 UTC
Created attachment 51258 [details]
gcc12-pr100977-1.patch

I think I found a bug in the makeucnid.c program. Sometimes the ranges are split
even when they contain identical flags and combining value (which results in an
unnecessarily large table), while in other cases a range boundary is misplaced.
For example, U+0483 to U+0487 inclusive are combining 230 and U+0488 is combining 0,
but the generated file had:
{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x0482 },
{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x0483 },
{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x0484 },
{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x0485 },
{   0|  0|  0|C11|  0|CID|NFC|NKC|  0, 230, 0x0486 },
{   0|  0|  0|C11|  0|CID|NFC|NKC|  0,   0, 0x048f },
i.e. 0x0487 would be handled as non-combining.
Comment 3 Jakub Jelinek 2021-08-04 16:14:50 UTC
Incrementally, here is a makeucnid.c patch to also emit CXX23 and NXX23 flags (CXX23 for characters valid in a C++23 identifier and NXX23 for characters valid in a C++23 identifier but not as its first character); it doesn't contain the changes to actually handle them on the libcpp side.

--- libcpp/makeucnid.c.jj	2021-08-04 17:35:35.995944075 +0200
+++ libcpp/makeucnid.c	2021-08-04 18:13:56.399062234 +0200
@@ -17,7 +17,7 @@ along with this program; see the file CO
 
 /* Run this program as
    ./makeucnid ucnid.tab UnicodeData.txt DerivedNormalizationProps.txt \
-       > ucnid.h
+      DerivedCoreProperties.txt > ucnid.h
 */
 
 #include <stdio.h>
@@ -32,10 +32,12 @@ enum {
   N99 = 4,
   C11 = 8,
   N11 = 16,
-  all_languages = C99 | CXX | C11,
-  not_NFC = 32,
-  not_NFKC = 64,
-  maybe_not_NFC = 128
+  CXX23 = 32,
+  NXX23 = 64,
+  all_languages = C99 | CXX | C11 | CXX23 | NXX23,
+  not_NFC = 128,
+  not_NFKC = 256,
+  maybe_not_NFC = 512
 };
 
 #define NUM_CODE_POINTS 0x110000
@@ -241,6 +243,74 @@ read_derived (const char *fname)
   fclose (f);
 }
 
+/* Read DerivedCoreProperties.txt and fill in languages version in
+   flags from the XID_Start and XID_Continue properties.  */
+
+static void
+read_derivedcore (char *fname)
+{
+  FILE * f = fopen (fname, "r");
+  
+  if (!f)
+    fail ("opening DerivedCoreProperties.txt");
+  for (;;)
+    {
+      char line[256];
+      unsigned long codepoint_start, codepoint_end;
+      char *l;
+      int i, j;
+
+      if (!fgets (line, sizeof (line), f))
+	break;
+      if (line[0] == '#' || line[0] == '\n' || line[0] == '\r')
+	continue;
+      codepoint_start = strtoul (line, &l, 16);
+      if (l == line)
+	fail ("parsing DerivedCoreProperties.txt, reading code point");
+      if (codepoint_start > MAX_CODE_POINT)
+	fail ("parsing DerivedCoreProperties.txt, code point too large");
+      
+      if (*l == '.' && l[1] == '.')
+	{
+	  char *l2 = l + 2;
+	  codepoint_end = strtoul (l + 2, &l, 16);
+	  if (l == l2 || codepoint_end < codepoint_start)
+	    fail ("parsing DerivedCoreProperties.txt, reading code point");
+	  if (codepoint_end > MAX_CODE_POINT)
+	    fail ("parsing DerivedCoreProperties.txt, code point too large");
+	}
+      else
+	codepoint_end = codepoint_start;
+
+      while (*l == ' ')
+	l++;
+      if (*l++ != ';')
+	fail ("parsing DerivedCoreProperties.txt, reading code point");
+
+      while (*l == ' ')
+	l++;
+
+      if (codepoint_end < 0x80)
+        continue;
+
+      if (strncmp (l, "XID_Start ", 10) == 0)
+	{
+	  for (; codepoint_start <= codepoint_end; codepoint_start++)
+	    flags[codepoint_start]
+	      = (flags[codepoint_start] | CXX23) & ~NXX23;
+	}
+      else if (strncmp (l, "XID_Continue ", 13) == 0)
+	{
+	  for (; codepoint_start <= codepoint_end; codepoint_start++)
+	    if ((flags[codepoint_start] & CXX23) == 0)
+	      flags[codepoint_start] |= CXX23 | NXX23;
+	}
+    }
+  if (ferror (f))
+    fail ("reading DerivedCoreProperties.txt");
+  fclose (f);
+}
+
 /* Write out the table.
    The table consists of two words per entry.  The first word is the flags
    for the unicode code points up to and including the second word.  */
@@ -261,12 +331,14 @@ write_table (void)
 	|| really_safe != (decomp[i][0] == 0)
 	|| combining_value[i] != last_combine)
       {
-	printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
+	printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
 		last_flag & C99 ? "C99" : "  0",
 		last_flag & N99 ? "N99" : "  0",
 		last_flag & CXX ? "CXX" : "  0",
 		last_flag & C11 ? "C11" : "  0",
 		last_flag & N11 ? "N11" : "  0",
+		last_flag & CXX23 ? "CXX23" : "    0",
+		last_flag & NXX23 ? "NXX23" : "    0",
 		really_safe ? "CID" : "  0",
 		last_flag & not_NFC ? "  0" : "NFC",
 		last_flag & not_NFKC ? "  0" : "NKC",
@@ -439,11 +511,12 @@ write_copyright (void)
 int
 main(int argc, char ** argv)
 {
-  if (argc != 4)
+  if (argc != 5)
     fail ("too few arguments to makeucn");
   read_ucnid (argv[1]);
   read_table (argv[2]);
   read_derived (argv[3]);
+  read_derivedcore (argv[4]);
 
   write_copyright ();
   write_table ();
Comment 4 jsm-csl@polyomino.org.uk 2021-08-04 18:34:59 UTC
On Wed, 4 Aug 2021, jakub at gcc dot gnu.org via Gcc-bugs wrote:

> plus various changes in the check_nfc function.
> So, the first question is if the C11/N11/C99 etc. stuff should use Unicode 4.1
> (or what was used when it was generated) tables and only CXX20/NXX20 should use
> Unicode 13.0 tables (what about NFC/NKC?), or if it is ok to just regenerate
> everything using Unicode 13.0 files, add parsing of the
> DerivedCoreProperties.txt file too (and pick XID_Start and XID_Continue
> properties there, throw away everything < 0x80 and otherwise compute CXX20 flag
> as XID_Continue and NXX20 flag as XID_Continue \ XID_Start).

I think it's fine for the normalization tests for older standard versions 
to use the latest Unicode version, so changing each time we update from 
newer Unicode data (as per 
<https://gcc.gnu.org/legacy-ml/gcc-patches/2013-11/msg01901.html> I used 
Unicode 6.3.0 at that time).

A trickier question is whether the XID_Start and XID_Continue sets of 
characters used for C++23 are meant to be fixed to a particular Unicode 
version (possibly updated for future C++ versions) or whether the set used 
for C++23 is meant to be updated for each future Unicode release as it 
comes out.

(Note also that identifiers not in NFC become ill-formed, i.e. 
-Wnormalized=nfc needs to be a pedwarn for C++23.)
Comment 5 Jakub Jelinek 2021-08-04 18:40:54 UTC
Created attachment 51260 [details]
gcc12-pr100977-2-wip.patch

Here is WIP incremental patch, but I'd prefer to do it in steps, first the above
mentioned bug, then separately update to latest Unicode, then do the
cxx23_identifiers change and need to add there some testsuite coverage and deal
with the nfc.
Comment 6 Uroš Bizjak 2021-08-04 19:06:59 UTC
(In reply to Jakub Jelinek from comment #3)

> -	printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
> +	printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",

BTW: You can also use width with strings (e.g. %3s) to avoid spaces below.

>  		last_flag & C99 ? "C99" : "  0",
>  		last_flag & N99 ? "N99" : "  0",
>  		last_flag & CXX ? "CXX" : "  0",
>  		last_flag & C11 ? "C11" : "  0",
>  		last_flag & N11 ? "N11" : "  0",
> +		last_flag & CXX23 ? "CXX23" : "    0",
> +		last_flag & NXX23 ? "NXX23" : "    0",
>  		really_safe ? "CID" : "  0",
>  		last_flag & not_NFC ? "  0" : "NFC",
>  		last_flag & not_NFKC ? "  0" : "NKC",
Comment 7 Jakub Jelinek 2021-08-04 19:20:24 UTC
True, but is it worth changing in a tool that is run twice in a decade?
Comment 8 Uroš Bizjak 2021-08-04 19:25:32 UTC
(In reply to Jakub Jelinek from comment #7)
> True, but is it worth changing in a tool that is run twice in a decade?

Well, the question is self-answering ;)
Comment 9 Jakub Jelinek 2021-08-05 10:17:32 UTC
Created attachment 51265 [details]
gcc12-pr100977-2.patch

Here is an updated (but so far only very lightly tested) patch on top of the
https://gcc.gnu.org/pipermail/gcc-patches/2021-August/576748.html
and
https://gcc.gnu.org/pipermail/gcc-patches/2021-August/576749.html
patches, including the pedwarn for "is not in NFC" and testsuite coverage.
Comment 10 GCC Commits 2021-08-05 15:34:53 UTC
The master branch has been updated by Jakub Jelinek <jakub@gcc.gnu.org>:

https://gcc.gnu.org/g:4805b92a32637b987f924463d6af9dcf95b21f63

commit r12-2771-g4805b92a32637b987f924463d6af9dcf95b21f63
Author: Jakub Jelinek <jakub@redhat.com>
Date:   Thu Aug 5 17:34:16 2021 +0200

    libcpp: Fix makeucnid bug with combining values [PR100977]
    
    I've noticed in ucnid.h two adjacent lines that had all flags and combine
    values identical and as such were supposed to be merged.
    
    This is due to a bug in makeucnid.c, which records last_flag,
    last_combine and really_safe of what has just been printed, but
    because of a typo mishandles it for last_combine, always compares against
    the combining_value[0] which is 0.
    
    This has two effects on the table, one is that often the table is
    unnecessarily large, as for non-zero .combine every character has its own
    record instead of adjacent characters with the same flags and combine
    being merged.  This means larger tables.
    The other is that sometimes the last char that has combine set doesn't
    actually have it in the tables, because the code is printing entries only
    upon seeing the next character and if that character does have
    combining_value of 0 and flags are otherwise the same as previously printed,
    it will not print anything.
    
    The following patch fixes that, for clarity what exactly it affects
    I've regenerated with the same Unicode files as last time it has
    been regenerated.
    
    2021-08-05  Jakub Jelinek  <jakub@redhat.com>
    
            PR c++/100977
            * makeucnid.c (write_table): Fix computation of last_combine.
            * ucnid.h: Regenerated using Unicode 6.3.0 files.
Comment 11 GCC Commits 2021-08-05 15:35:42 UTC
The master branch has been updated by Jakub Jelinek <jakub@gcc.gnu.org>:

https://gcc.gnu.org/g:4739344d36e6d24764cbedde44a3fff6edc70f6c

commit r12-2772-g4739344d36e6d24764cbedde44a3fff6edc70f6c
Author: Jakub Jelinek <jakub@redhat.com>
Date:   Thu Aug 5 17:35:20 2021 +0200

    libcpp: Regenerate ucnid.h using Unicode 13.0.0 files [PR100977]
    
    The following patch (incremental to the makeucnid.c fix) regenerates
    ucnid.h with https://www.unicode.org/Public/13.0.0/ucd/ files.
    
    2021-08-05  Jakub Jelinek  <jakub@redhat.com>
    
            PR c++/100977
            * ucnid.h: Regenerated using Unicode 13.0.0 files.
Comment 12 GCC Commits 2021-09-01 20:37:29 UTC
The master branch has been updated by Jakub Jelinek <jakub@gcc.gnu.org>:

https://gcc.gnu.org/g:c4d6dcacfca1b804504515496e6d9de176d7f51e

commit r12-3302-gc4d6dcacfca1b804504515496e6d9de176d7f51e
Author: Jakub Jelinek <jakub@redhat.com>
Date:   Wed Sep 1 22:33:06 2021 +0200

    libcpp: Implement C++23 P1949R7 - C++ Identifier Syntax using Unicode Standard Annex 31
    
    The following patch implements the
    P1949R7 - C++ Identifier Syntax using Unicode Standard Annex 31
    paper.  We already allow UTF-8 characters in the source, so that part
    is already implemented, so IMHO all we need to do is pedwarn instead of
    just warn for the (default) -Wnormalized=nfc (or for -Wnormalized={id,nfkc})
    if the character is not in NFC and to use the unicode XID_Start and
    XID_Continue derived code properties to find out what characters are allowed
    (the standard actually adds U+005F to XID_Start, but we are handling the
    ASCII compatible characters differently already and they aren't allowed
    in UCNs in identifiers).  Instead of hardcoding the large tables
    in ucnid.tab, this patch makes makeucnid.c read them from the Unicode
    tables (13.0.0 version at this point).
    
    For non-pedantic mode, we accept as 2nd+ char in identifiers a union
    of valid characters in all supported modes, but for the 1st char it
    was actually pedantically requiring that it is not any of the characters
    that may not appear in the currently chosen standard as the first character.
    This patch changes it such that also what is allowed at the start of an
    identifier is a union of characters valid at the start of an identifier
    in any of the pedantic modes.
    
    2021-09-01  Jakub Jelinek  <jakub@redhat.com>
    
            PR c++/100977
    libcpp/
            * include/cpplib.h (struct cpp_options): Add cxx23_identifiers.
            * charset.c (CXX23, NXX23): New enumerators.
            (CID, NFC, NKC, CTX): Renumber.
            (ucn_valid_in_identifier): Implement P1949R7 - use CXX23 and
            NXX23 flags for cxx23_identifiers.  For start character in
            non-pedantic mode, allow characters that are allowed as start
            characters in any of the supported language modes, rather than
            disallowing characters allowed only as non-start characters in
            current mode but for characters from other language modes allowing
            them even if they are never allowed at start.
            * init.c (struct lang_flags): Add cxx23_identifiers.
            (lang_defaults): Add cxx23_identifiers column.
            (cpp_set_lang): Initialize CPP_OPTION (pfile, cxx23_identifiers).
            * lex.c (warn_about_normalization): If cxx23_identifiers, use
            cpp_pedwarning_with_line instead of cpp_warning_with_line for
            "is not in NFC" diagnostics.
            * makeucnid.c: Adjust usage comment.
            (CXX23, NXX23): New enumerators.
            (all_languages): Add CXX23.
            (not_NFC, not_NFKC, maybe_not_NFC): Renumber.
            (read_derivedcore): New function.
            (write_table): Print also CXX23 and NXX23 columns.
            (main): Require 5 arguments instead of 4, call read_derivedcore.
            * ucnid.h: Regenerated using Unicode 13.0.0 files.
    gcc/testsuite/
            * g++.dg/cpp23/normalize1.C: New test.
            * g++.dg/cpp23/normalize2.C: New test.
            * g++.dg/cpp23/normalize3.C: New test.
            * g++.dg/cpp23/normalize4.C: New test.
            * g++.dg/cpp23/normalize5.C: New test.
            * g++.dg/cpp23/normalize6.C: New test.
            * g++.dg/cpp23/normalize7.C: New test.
            * g++.dg/cpp23/ucnid-1-utf8.C: New test.
            * g++.dg/cpp23/ucnid-2-utf8.C: New test.
            * gcc.dg/cpp/ucnid-4.c: Don't expect
            "not valid at the start of an identifier" errors.
            * gcc.dg/cpp/ucnid-4-utf8.c: Likewise.
            * gcc.dg/cpp/ucnid-5-utf8.c: New test.
Comment 13 Jakub Jelinek 2021-09-01 20:38:55 UTC
Implemented for GCC 12.
Comment 14 GCC Commits 2021-11-30 08:51:40 UTC
The master branch has been updated by Jakub Jelinek <jakub@gcc.gnu.org>:

https://gcc.gnu.org/g:7abcc9ca20d4e17deabb308b5f483aaccc3dc02c

commit r12-5597-g7abcc9ca20d4e17deabb308b5f483aaccc3dc02c
Author: Jakub Jelinek <jakub@redhat.com>
Date:   Tue Nov 30 09:50:52 2021 +0100

    libcpp: Enable P1949R7 for C++11 and up as it was a DR [PR100977]
    
    Jonathan mentioned on IRC that:
    "Accept P1949R7 (C++ Identifier Syntax using Unicode Standard Annex 31) as
    a Defect Report and apply the changes therein to the C++ working paper."
    while I've actually implemented it only for -std={gnu,c}++{23,2b}.
    As the C++98 rules were significantly different, I'm not trying to change
    anything for C++98.
    
    2021-11-30  Jakub Jelinek  <jakub@redhat.com>
    
            PR c++/100977
            * init.c (lang_defaults): Enable cxx23_identifiers for
            -std={gnu,c}++{11,14,17,20} too.
    
            * c-c++-common/cpp/ucnid-2011-1-utf8.c: Expect errors in C++.
            * c-c++-common/cpp/ucnid-2011-1.c: Likewise.
            * g++.dg/cpp/ucnid-4-utf8.C: Add missing space to dg-options.
            * g++.dg/cpp23/normalize3.C: Enable for c++11 rather than just c++23.
            * g++.dg/cpp23/normalize4.C: Likewise.
            * g++.dg/cpp23/normalize5.C: Likewise.
            * g++.dg/cpp23/normalize7.C: Expect errors rather than just warnings
            for c++11 and up rather than just c++23.
            * g++.dg/cpp23/ucnid-2-utf8.C: Expect errors even for c++11 .. c++20.
Comment 15 GCC Commits 2021-12-01 09:22:27 UTC
The master branch has been updated by Jakub Jelinek <jakub@gcc.gnu.org>:

https://gcc.gnu.org/g:c264208e161830a5642ee3125871c23110508462

commit r12-5653-gc264208e161830a5642ee3125871c23110508462
Author: Jakub Jelinek <jakub@redhat.com>
Date:   Wed Dec 1 10:21:20 2021 +0100

    libcpp: Enable P1949R7 for C++98 too [PR100977]
    
    On Mon, Nov 29, 2021 at 05:53:58PM -0500, Jason Merrill wrote:
    > I'm inclined to go ahead and change C++98 as well; I doubt anyone is relying
    > on the particular C++98 extended character set rules, and we already accept
    > the union of the different sets when not pedantic.
    
    Ok, here is an incremental patch to do that also for -std={c,gnu}++98.
    
    2021-12-01  Jakub Jelinek  <jakub@redhat.com>
    
            PR c++/100977
            * init.c (struct lang_flags): Remove cxx23_identifiers.
            (lang_defaults): Remove cxx23_identifiers initializers.
            (cpp_set_lang): Don't copy cxx23_identifiers.
            * include/cpplib.h (struct cpp_options): Adjust comment about
            c11_identifiers.  Remove cxx23_identifiers field.
            * lex.c (warn_about_normalization): Use cplusplus instead of
            cxx23_identifiers.
            * charset.c (ucn_valid_in_identifier): Likewise.
    
            * g++.dg/cpp/ucnid-1.C: Adjust expected diagnostics.
            * g++.dg/cpp/ucnid-1-utf8.C: Likewise.