https://wg21.link/p1949r7 This seems like largely a matter of adding another category to libcpp/ucnid.tab.
I think it might be better to make makeucnid parse also the https://www.unicode.org/Public/13.0.0/ucd/DerivedCoreProperties.txt file and read the XID_Start and XID_End properties from there. But when I just regenerate ucnid.h using Unicode 13.0.0 txt files, the difference is: --- /usr/src/gcc/libcpp/ucnid.h 2021-08-04 15:04:46.053701822 +0200 +++ ucnid.h 2021-08-04 15:05:36.773996631 +0200 @@ -505,6 +505,7 @@ static const struct ucnrange ucnranges[] { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x07f0 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x07f1 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 220, 0x07f2 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x07fc }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x0815 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x0816 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x0817 }, @@ -529,7 +530,23 @@ static const struct ucnrange ucnranges[] { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x0858 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 220, 0x0859 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 220, 0x085a }, -{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x08e3 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x08d2 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 220, 0x08d3 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x08d4 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x08d5 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x08d6 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x08d7 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x08d8 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x08d9 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x08da }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x08db }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x08dc }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x08dd }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x08de }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x08df }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x08e0 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x08e2 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 220, 0x08e3 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x08e4 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x08e5 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 
220, 0x08e6 }, @@ -556,6 +573,7 @@ static const struct ucnrange ucnranges[] { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x08fb }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x08fc }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x08fd }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x08fe }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x0900 }, { C99| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x0903 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x0904 }, @@ -615,6 +633,7 @@ static const struct ucnrange ucnranges[] { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x09e5 }, { C99|N99| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x09ef }, { C99| 0|CXX|C11| 0|CID|NFC|NKC| 0, 0, 0x09f1 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x09fd }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x0a01 }, { C99| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x0a02 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x0a04 }, @@ -820,6 +839,8 @@ static const struct ucnrange ucnranges[] { C99| 0|CXX|C11| 0|CID|NFC|NKC| 0, 0, 0x0d28 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x0d29 }, { C99| 0|CXX|C11| 0|CID|NFC|NKC| 0, 0, 0x0d39 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x0d3a }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 9, 0x0d3b }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x0d3d }, { C99| 0| 0|C11| 0|CID|NFC|NKC|CTX, 0, 0x0d3e }, { C99| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x0d43 }, @@ -894,7 +915,7 @@ static const struct ucnrange ucnranges[] { C99| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x0eb7 }, { C99| 0| 0|C11| 0|CID|NFC|NKC| 0, 118, 0x0eb8 }, { C99| 0| 0|C11| 0|CID|NFC|NKC| 0, 118, 0x0eb9 }, -{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x0eba }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 9, 0x0eba }, { C99| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x0ebc }, { C99| 0|CXX|C11| 0|CID|NFC|NKC| 0, 0, 0x0ebd }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x0ebf }, @@ -1031,6 +1052,22 @@ static const struct ucnrange ucnranges[] { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1a7a }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1a7b }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1a7e }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1aaf }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1ab0 }, +{ 0| 0| 
0|C11| 0|CID|NFC|NKC| 0, 230, 0x1ab1 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1ab2 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1ab3 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1ab4 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 220, 0x1ab5 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 220, 0x1ab6 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 220, 0x1ab7 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 220, 0x1ab8 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 220, 0x1ab9 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 220, 0x1aba }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1abb }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1abc }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1abe }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 220, 0x1abf }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1b05 }, { 0| 0| 0|C11| 0| 0|NFC|NKC| 0, 0, 0x1b06 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1b07 }, @@ -1094,6 +1131,8 @@ static const struct ucnrange ucnranges[] { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 1, 0x1ce7 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1cec }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1cf3 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1cf7 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1cf8 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1d2b }, { 0| 0| 0|C11| 0|CID|NFC| 0| 0, 0, 0x1d2e }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1d2f }, @@ -1144,7 +1183,27 @@ static const struct ucnrange ucnranges[] { 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 0x1de3 }, { 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 0x1de4 }, { 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 0x1de5 }, -{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 0, 0x1dfb }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 0x1de6 }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 0x1de7 }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 0x1de8 }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 0x1de9 }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 0x1dea }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 0x1deb }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 0x1dec }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 0x1ded }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 0x1dee }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 
0x1def }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 0x1df0 }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 0x1df1 }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 0x1df2 }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 0x1df3 }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 0x1df4 }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 0x1df5 }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 232, 0x1df6 }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 228, 0x1df7 }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 228, 0x1df8 }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 0, 0x1dfa }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 0x1dfb }, { 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 233, 0x1dfc }, { 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 220, 0x1dfd }, { 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 0x1dfe }, @@ -1527,8 +1586,6 @@ static const struct ucnrange ucnranges[] { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x324f }, { 0| 0| 0|C11| 0|CID|NFC| 0| 0, 0, 0x327e }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x327f }, -{ 0| 0| 0|C11| 0|CID|NFC| 0| 0, 0, 0x32fe }, -{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x32ff }, { 0| 0| 0|C11| 0|CID|NFC| 0| 0, 0, 0x33ff }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x4dff }, { C99| 0|CXX|C11| 0|CID|NFC|NKC| 0, 0, 0x9fa5 }, @@ -1543,7 +1600,9 @@ static const struct ucnrange ucnranges[] { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0xa67a }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0xa67b }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0xa67c }, -{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0xa69e }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0xa69b }, +{ 0| 0| 0|C11| 0|CID|NFC| 0| 0, 0, 0xa69d }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0xa69e }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0xa6ef }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0xa6f0 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0xa76f }, @@ -1551,6 +1610,7 @@ static const struct ucnrange ucnranges[] { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0xa7f7 }, { 0| 0| 0|C11| 0|CID|NFC| 0| 0, 0, 0xa7f9 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0xa805 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0xa82b }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0xa8c3 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 
0, 0xa8df }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0xa8e0 }, @@ -1586,6 +1646,10 @@ static const struct ucnrange ucnranges[] { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0xaabe }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0xaac0 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0xaaf5 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0xab5b }, +{ 0| 0| 0|C11| 0|CID|NFC| 0| 0, 0, 0xab5f }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0xab68 }, +{ 0| 0| 0|C11| 0|CID|NFC| 0| 0, 0, 0xab69 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0xabec }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0xabff }, { C99| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0xd7a3 }, @@ -1650,7 +1714,16 @@ static const struct ucnrange ucnranges[] { 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 0xfe23 }, { 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 0xfe24 }, { 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 0xfe25 }, -{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 0, 0xfe2f }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 0xfe26 }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 220, 0xfe27 }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 220, 0xfe28 }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 220, 0xfe29 }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 220, 0xfe2a }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 220, 0xfe2b }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 220, 0xfe2c }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 220, 0xfe2d }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 0xfe2e }, +{ 0| 0| 0|C11|N11|CID|NFC|NKC| 0, 230, 0xfe2f }, { 0| 0| 0|C11| 0|CID|NFC| 0| 0, 0, 0xfe44 }, { 0| 0| 0| 0| 0|CID|NFC|NKC| 0, 0, 0xfe46 }, { 0| 0| 0|C11| 0|CID|NFC| 0| 0, 0, 0xfe52 }, @@ -1686,13 +1759,39 @@ static const struct ucnrange ucnranges[] { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0xfffd }, { 0| 0| 0| 0| 0|CID|NFC|NKC| 0, 0, 0xffff }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x101fc }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x102df }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x10375 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x10376 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x10377 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x10378 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x10379 }, { 0| 0| 0|C11| 
0|CID|NFC|NKC| 0, 0, 0x10a0c }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x10a0e }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x10a37 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x10a38 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 1, 0x10a39 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x10a3e }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x10ae4 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x10ae5 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x10d23 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x10d24 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x10d25 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x10d26 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x10eaa }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x10eab }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x10f45 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 220, 0x10f46 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 220, 0x10f47 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x10f48 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x10f49 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x10f4a }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 220, 0x10f4b }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x10f4c }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 220, 0x10f4d }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 220, 0x10f4e }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 220, 0x10f4f }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x11045 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1107e }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x11099 }, { 0| 0| 0|C11| 0| 0|NFC|NKC| 0, 0, 0x1109a }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1109b }, @@ -1711,9 +1810,88 @@ static const struct ucnrange ucnranges[] { 0| 0| 0|C11| 0| 0|NFC|NKC| 0, 0, 0x1112f }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x11132 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 9, 0x11133 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x11172 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x111bf }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x111c9 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x11234 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 9, 0x11235 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x112e8 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 7, 0x112e9 }, +{ 0| 0| 
0|C11| 0|CID|NFC|NKC| 0, 0, 0x1133a }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 7, 0x1133b }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1133d }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC|CTX, 0, 0x1133e }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1134a }, +{ 0| 0| 0|C11| 0| 0|NFC|NKC| 0, 0, 0x1134c }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x11356 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC|CTX, 0, 0x11357 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x11365 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x11366 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x11367 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x11368 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x11369 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1136a }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1136b }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1136f }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x11370 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x11371 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x11372 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x11373 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x11441 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x11445 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1145d }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x114af }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC|CTX, 0, 0x114b0 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x114b9 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC|CTX, 0, 0x114ba }, +{ 0| 0| 0|C11| 0| 0|NFC|NKC| 0, 0, 0x114bc }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC|CTX, 0, 0x114bd }, +{ 0| 0| 0|C11| 0| 0|NFC|NKC| 0, 0, 0x114be }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x114c1 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 9, 0x114c2 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x115ae }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC|CTX, 0, 0x115af }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x115b9 }, +{ 0| 0| 0|C11| 0| 0|NFC|NKC| 0, 0, 0x115bb }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x115be }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 9, 0x115bf }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1163e }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x116b5 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 9, 0x116b6 }, +{ 0| 0| 0|C11| 
0|CID|NFC|NKC| 0, 0, 0x1172a }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x11838 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 9, 0x11839 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1192f }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC|CTX, 0, 0x11930 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x11937 }, +{ 0| 0| 0|C11| 0| 0|NFC|NKC| 0, 0, 0x11938 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1193c }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 9, 0x1193d }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x11942 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x119df }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x11a33 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x11a46 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x11a98 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x11c3e }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x11d41 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x11d43 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 9, 0x11d44 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x11d96 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x16aef }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 1, 0x16af0 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 1, 0x16af1 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 1, 0x16af2 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 1, 0x16af3 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x16b2f }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x16b30 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x16b31 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x16b32 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x16b33 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x16b34 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x16b35 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x16fef }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 6, 0x16ff0 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1bc9d }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1d15d }, { 0| 0| 0|C11| 0| 0| 0| 0| 0, 0, 0x1d164 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 216, 0x1d165 }, @@ -1792,6 +1970,69 @@ static const struct ucnrange ucnranges[] { 0| 0| 0|C11| 0|CID|NFC| 0| 0, 0, 0x1d7cb }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1d7cd }, { 0| 0| 0|C11| 0|CID|NFC| 0| 0, 0, 0x1d7ff }, +{ 0| 0| 0|C11| 
0|CID|NFC|NKC| 0, 0, 0x1dfff }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e000 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e001 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e002 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e003 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e004 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e005 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1e007 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e008 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e009 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e00a }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e00b }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e00c }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e00d }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e00e }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e00f }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e010 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e011 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e012 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e013 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e014 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e015 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e016 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e017 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1e01a }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e01b }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e01c }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e01d }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e01e }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e01f }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e020 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1e022 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e023 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1e025 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e026 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e027 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e028 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e029 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1e12f }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e130 }, +{ 0| 0| 0|C11| 
0|CID|NFC|NKC| 0, 230, 0x1e131 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e132 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e133 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e134 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e135 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1e2eb }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e2ec }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e2ed }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e2ee }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1e8cf }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 220, 0x1e8d0 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 220, 0x1e8d1 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 220, 0x1e8d2 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 220, 0x1e8d3 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 220, 0x1e8d4 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 220, 0x1e8d5 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1e943 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e944 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e945 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e946 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e947 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e948 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x1e949 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1edff }, { 0| 0| 0|C11| 0|CID|NFC| 0| 0, 0, 0x1ee03 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1ee04 }, @@ -1865,17 +2106,19 @@ static const struct ucnrange ucnranges[] { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1f12f }, { 0| 0| 0|C11| 0|CID|NFC| 0| 0, 0, 0x1f14f }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1f169 }, -{ 0| 0| 0|C11| 0|CID|NFC| 0| 0, 0, 0x1f16b }, +{ 0| 0| 0|C11| 0|CID|NFC| 0| 0, 0, 0x1f16c }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1f18f }, { 0| 0| 0|C11| 0|CID|NFC| 0| 0, 0, 0x1f190 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1f1ff }, { 0| 0| 0|C11| 0|CID|NFC| 0| 0, 0, 0x1f202 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1f20f }, -{ 0| 0| 0|C11| 0|CID|NFC| 0| 0, 0, 0x1f23a }, +{ 0| 0| 0|C11| 0|CID|NFC| 0| 0, 0, 0x1f23b }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1f23f }, { 0| 0| 0|C11| 0|CID|NFC| 0| 0, 0, 0x1f248 }, { 0| 0| 
0|C11| 0|CID|NFC|NKC| 0, 0, 0x1f24f }, { 0| 0| 0|C11| 0|CID|NFC| 0| 0, 0, 0x1f251 }, +{ 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1fbef }, +{ 0| 0| 0|C11| 0|CID|NFC| 0| 0, 0, 0x1fbf9 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x1fffd }, { 0| 0| 0| 0| 0|CID|NFC|NKC| 0, 0, 0x1ffff }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x2f7ff }, plus various changes in the check_nfc function. So, the first question is whether the C11/N11/C99 etc. stuff should use Unicode 4.1 (or what was used when it was generated) tables and only CXX20/NXX20 should use Unicode 13.0 tables (what about NFC/NKC?), or whether it is ok to just regenerate everything using Unicode 13.0 files and add parsing of the DerivedCoreProperties.txt file too (and pick XID_Start and XID_Continue properties there, throw away everything < 0x80 and otherwise compute CXX20 flag as XID_Continue and NXX20 flag as XID_Continue \ XID_Start)?
Created attachment 51258 [details] gcc12-pr100977-1.patch I think I found a bug in the makeucnid.c program, sometimes the ranges are split even when they contain the identical flags and combining value (which results in unnecessarily large table), but in other cases, e.g. U+0483 to U+0487 inclusive are combining 230 and U+0488 is combining 0, but the generated file had: { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x0482 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x0483 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x0484 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x0485 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 230, 0x0486 }, { 0| 0| 0|C11| 0|CID|NFC|NKC| 0, 0, 0x048f }, i.e. 0x0487 would be handled as non-combining.
Incrementally, here is a makeucnid.c patch to also emit CXX23 and NXX23 flags (CXX23 for valid as C++23 identifier and NXX23 for valid as C++23 identifier but not as the first character), but doesn't contain changes to actually handle it on the libcpp side. --- libcpp/makeucnid.c.jj 2021-08-04 17:35:35.995944075 +0200 +++ libcpp/makeucnid.c 2021-08-04 18:13:56.399062234 +0200 @@ -17,7 +17,7 @@ along with this program; see the file CO /* Run this program as ./makeucnid ucnid.tab UnicodeData.txt DerivedNormalizationProps.txt \ - > ucnid.h + DerivedCoreProperties.txt > ucnid.h */ #include <stdio.h> @@ -32,10 +32,12 @@ enum { N99 = 4, C11 = 8, N11 = 16, - all_languages = C99 | CXX | C11, - not_NFC = 32, - not_NFKC = 64, - maybe_not_NFC = 128 + CXX23 = 32, + NXX23 = 64, + all_languages = C99 | CXX | C11 | CXX23 | NXX23, + not_NFC = 128, + not_NFKC = 256, + maybe_not_NFC = 512 }; #define NUM_CODE_POINTS 0x110000 @@ -241,6 +243,74 @@ read_derived (const char *fname) fclose (f); } +/* Read DerivedCoreProperties.txt and fill in languages version in + flags from the XID_Start and XID_Continue properties. */ + +static void +read_derivedcore (char *fname) +{ + FILE * f = fopen (fname, "r"); + + if (!f) + fail ("opening DerivedCoreProperties.txt"); + for (;;) + { + char line[256]; + unsigned long codepoint_start, codepoint_end; + char *l; + int i, j; + + if (!fgets (line, sizeof (line), f)) + break; + if (line[0] == '#' || line[0] == '\n' || line[0] == '\r') + continue; + codepoint_start = strtoul (line, &l, 16); + if (l == line) + fail ("parsing DerivedCoreProperties.txt, reading code point"); + if (codepoint_start > MAX_CODE_POINT) + fail ("parsing DerivedCoreProperties.txt, code point too large"); + + if (*l == '.' 
&& l[1] == '.') + { + char *l2 = l + 2; + codepoint_end = strtoul (l + 2, &l, 16); + if (l == l2 || codepoint_end < codepoint_start) + fail ("parsing DerivedCoreProperties.txt, reading code point"); + if (codepoint_end > MAX_CODE_POINT) + fail ("parsing DerivedCoreProperties.txt, code point too large"); + } + else + codepoint_end = codepoint_start; + + while (*l == ' ') + l++; + if (*l++ != ';') + fail ("parsing DerivedCoreProperties.txt, reading code point"); + + while (*l == ' ') + l++; + + if (codepoint_end < 0x80) + continue; + + if (strncmp (l, "XID_Start ", 10) == 0) + { + for (; codepoint_start <= codepoint_end; codepoint_start++) + flags[codepoint_start] + = (flags[codepoint_start] | CXX23) & ~NXX23; + } + else if (strncmp (l, "XID_Continue ", 13) == 0) + { + for (; codepoint_start <= codepoint_end; codepoint_start++) + if ((flags[codepoint_start] & CXX23) == 0) + flags[codepoint_start] |= CXX23 | NXX23; + } + } + if (ferror (f)) + fail ("reading DerivedCoreProperties.txt"); + fclose (f); +} + /* Write out the table. The table consists of two words per entry. The first word is the flags for the unicode code points up to and including the second word. */ @@ -261,12 +331,14 @@ write_table (void) || really_safe != (decomp[i][0] == 0) || combining_value[i] != last_combine) { - printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n", + printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n", last_flag & C99 ? "C99" : " 0", last_flag & N99 ? "N99" : " 0", last_flag & CXX ? "CXX" : " 0", last_flag & C11 ? "C11" : " 0", last_flag & N11 ? "N11" : " 0", + last_flag & CXX23 ? "CXX23" : " 0", + last_flag & NXX23 ? "NXX23" : " 0", really_safe ? "CID" : " 0", last_flag & not_NFC ? " 0" : "NFC", last_flag & not_NFKC ? 
" 0" : "NKC", @@ -439,11 +511,12 @@ write_copyright (void) int main(int argc, char ** argv) { - if (argc != 4) + if (argc != 5) fail ("too few arguments to makeucn"); read_ucnid (argv[1]); read_table (argv[2]); read_derived (argv[3]); + read_derivedcore (argv[4]); write_copyright (); write_table ();
On Wed, 4 Aug 2021, jakub at gcc dot gnu.org via Gcc-bugs wrote: > plus various changes in the check_nfc function. > So, the first question is if the C11/N11/C99 etc. stuff should use Unicode 4.1 > (or what was used when it was generated) tables and only CXX20/NXX20 should use > Unicode 13.0 tables (what about NFC/NKC?), or if it is ok to just regenerate > everything using Unicode 13.0 files, add parsing of the > DerivedCoreProperties.txt file too (and pick XID_Start and XID_Continue > properties there, throw away everything < 0x80 and otherwise compute CXX20 flag > as XID_Continue and NXX20 flag as XID_Continue \ XID_Start. I think it's fine for the normalization tests for older standard versions to use the latest Unicode version, so changing each time we update from newer Unicode data (as per <https://gcc.gnu.org/legacy-ml/gcc-patches/2013-11/msg01901.html> I used Unicode 6.3.0 at that time). A trickier question is whether the XID_Start and XID_Continue sets of characters used for C++23 are meant to be fixed to a particular Unicode version (possibly updated for future C++ versions) or whether the set used for C++23 is meant to be updated for each future Unicode release as it comes out. (Note also that identifiers not in NFC become ill-formed, i.e. -Wnormalized=nfc needs to be a pedwarn for C++23.)
Created attachment 51260 [details] gcc12-pr100977-2-wip.patch Here is a WIP incremental patch, but I'd prefer to do it in steps: first fix the above-mentioned bug, then separately update to the latest Unicode, and then do the cxx23_identifiers change. I still need to add some testsuite coverage there and deal with the NFC handling.
(In reply to Jakub Jelinek from comment #3) > - printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n", > + printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n", BTW: You can also use width with strings (e.g. %3s) to avoid the spaces below. > last_flag & C99 ? "C99" : " 0", > last_flag & N99 ? "N99" : " 0", > last_flag & CXX ? "CXX" : " 0", > last_flag & C11 ? "C11" : " 0", > last_flag & N11 ? "N11" : " 0", > + last_flag & CXX23 ? "CXX23" : " 0", > + last_flag & NXX23 ? "NXX23" : " 0", > really_safe ? "CID" : " 0", > last_flag & not_NFC ? " 0" : "NFC", > last_flag & not_NFKC ? " 0" : "NKC",
True, but is it worth changing in a tool that is run twice in a decade?
(In reply to Jakub Jelinek from comment #7) > True, but is it worth changing on a tool that is one twice in a decade? Well, the question is self-answering ;)
Created attachment 51265 [details] gcc12-pr100977-2.patch Here is an updated (but so far only very lightly tested) patch on top of the https://gcc.gnu.org/pipermail/gcc-patches/2021-August/576748.html and https://gcc.gnu.org/pipermail/gcc-patches/2021-August/576749.html patches, including the pedwarn for "is not in NFC" and testsuite coverage.
The master branch has been updated by Jakub Jelinek <jakub@gcc.gnu.org>: https://gcc.gnu.org/g:4805b92a32637b987f924463d6af9dcf95b21f63 commit r12-2771-g4805b92a32637b987f924463d6af9dcf95b21f63 Author: Jakub Jelinek <jakub@redhat.com> Date: Thu Aug 5 17:34:16 2021 +0200 libcpp: Fix makeucnid bug with combining values [PR100977] I've noticed in ucnid.h two adjacent lines that had all flags and combine values identical and as such were supposed to be merged. This is due to a bug in makeucnid.c, which records last_flag, last_combine and really_safe of what has just been printed, but because of a typo mishandles it for last_combine, always compares against the combining_value[0] which is 0. This has two effects on the table, one is that often the table is unnecessarily large, as for non-zero .combine every character has its own record instead of adjacent characters with the same flags and combine being merged. This means larger tables. The other is that sometimes the last char that has combine set doesn't actually have it in the tables, because the code is printing entries only upon seeing the next character and if that character does have combining_value of 0 and flags are otherwise the same as previously printed, it will not print anything. The following patch fixes that, for clarity what exactly it affects I've regenerated with the same Unicode files as last time it has been regenerated. 2021-08-05 Jakub Jelinek <jakub@redhat.com> PR c++/100977 * makeucnid.c (write_table): Fix computation of last_combine. * ucnid.h: Regenerated using Unicode 6.3.0 files.
The master branch has been updated by Jakub Jelinek <jakub@gcc.gnu.org>: https://gcc.gnu.org/g:4739344d36e6d24764cbedde44a3fff6edc70f6c commit r12-2772-g4739344d36e6d24764cbedde44a3fff6edc70f6c Author: Jakub Jelinek <jakub@redhat.com> Date: Thu Aug 5 17:35:20 2021 +0200 libcpp: Regenerate ucnid.h using Unicode 13.0.0 files [PR100977] The following patch (incremental to the makeucnid.c fix) regenerates ucnid.h with https://www.unicode.org/Public/13.0.0/ucd/ files. 2021-08-05 Jakub Jelinek <jakub@redhat.com> PR c++/100977 * ucnid.h: Regenerated using Unicode 13.0.0 files.
The master branch has been updated by Jakub Jelinek <jakub@gcc.gnu.org>: https://gcc.gnu.org/g:c4d6dcacfca1b804504515496e6d9de176d7f51e commit r12-3302-gc4d6dcacfca1b804504515496e6d9de176d7f51e Author: Jakub Jelinek <jakub@redhat.com> Date: Wed Sep 1 22:33:06 2021 +0200 libcpp: Implement C++23 P1949R7 - C++ Identifier Syntax using Unicode Standard Annex 31 The following patch implements the P1949R7 - C++ Identifier Syntax using Unicode Standard Annex 31 paper. We already allow UTF-8 characters in the source, so that part is already implemented, so IMHO all we need to do is pedwarn instead of just warn for the (default) -Wnormalized=nfc (or for -Wnormalized={id,nfkc}) if the character is not in NFC and to use the Unicode XID_Start and XID_Continue derived code properties to find out what characters are allowed (the standard actually adds U+005F to XID_Start, but we are handling the ASCII compatible characters differently already and they aren't allowed in UCNs in identifiers). Instead of hardcoding the large tables in ucnid.tab, this patch makes makeucnid.c read them from the Unicode tables (13.0.0 version at this point). For non-pedantic mode, we accept as 2nd+ char in identifiers a union of valid characters in all supported modes, but for the 1st char it was actually pedantically requiring that it is not any of the characters that may not appear in the currently chosen standard as the first character. This patch changes it such that also what is allowed at the start of an identifier is a union of characters valid at the start of an identifier in any of the pedantic modes. 2021-09-01 Jakub Jelinek <jakub@redhat.com> PR c++/100977 libcpp/ * include/cpplib.h (struct cpp_options): Add cxx23_identifiers. * charset.c (CXX23, NXX23): New enumerators. (CID, NFC, NKC, CTX): Renumber. (ucn_valid_in_identifier): Implement P1949R7 - use CXX23 and NXX23 flags for cxx23_identifiers. 
For start character in non-pedantic mode, allow characters that are allowed as start characters in any of the supported language modes, rather than disallowing characters allowed only as non-start characters in current mode but for characters from other language modes allowing them even if they are never allowed at start. * init.c (struct lang_flags): Add cxx23_identifiers. (lang_defaults): Add cxx23_identifiers column. (cpp_set_lang): Initialize CPP_OPTION (pfile, cxx23_identifiers). * lex.c (warn_about_normalization): If cxx23_identifiers, use cpp_pedwarning_with_line instead of cpp_warning_with_line for "is not in NFC" diagnostics. * makeucnid.c: Adjust usage comment. (CXX23, NXX23): New enumerators. (all_languages): Add CXX23. (not_NFC, not_NFKC, maybe_not_NFC): Renumber. (read_derivedcore): New function. (write_table): Print also CXX23 and NXX23 columns. (main): Require 5 arguments instead of 4, call read_derivedcore. * ucnid.h: Regenerated using Unicode 13.0.0 files. gcc/testsuite/ * g++.dg/cpp23/normalize1.C: New test. * g++.dg/cpp23/normalize2.C: New test. * g++.dg/cpp23/normalize3.C: New test. * g++.dg/cpp23/normalize4.C: New test. * g++.dg/cpp23/normalize5.C: New test. * g++.dg/cpp23/normalize6.C: New test. * g++.dg/cpp23/normalize7.C: New test. * g++.dg/cpp23/ucnid-1-utf8.C: New test. * g++.dg/cpp23/ucnid-2-utf8.C: New test. * gcc.dg/cpp/ucnid-4.c: Don't expect "not valid at the start of an identifier" errors. * gcc.dg/cpp/ucnid-4-utf8.c: Likewise. * gcc.dg/cpp/ucnid-5-utf8.c: New test.
Implemented for GCC 12.
The master branch has been updated by Jakub Jelinek <jakub@gcc.gnu.org>: https://gcc.gnu.org/g:7abcc9ca20d4e17deabb308b5f483aaccc3dc02c commit r12-5597-g7abcc9ca20d4e17deabb308b5f483aaccc3dc02c Author: Jakub Jelinek <jakub@redhat.com> Date: Tue Nov 30 09:50:52 2021 +0100 libcpp: Enable P1949R7 for C++11 and up as it was a DR [PR100977] Jonathan mentioned on IRC that: "Accept P1949R7 (C++ Identifier Syntax using Unicode Standard Annex 31) as a Defect Report and apply the changes therein to the C++ working paper." while I've actually implemented it only for -std={gnu,c}++{23,2b}. As the C++98 rules were significantly different, I'm not trying to change anything for C++98. 2021-11-30 Jakub Jelinek <jakub@redhat.com> PR c++/100977 * init.c (lang_defaults): Enable cxx23_identifiers for -std={gnu,c}++{11,14,17,20} too. * c-c++-common/cpp/ucnid-2011-1-utf8.c: Expect errors in C++. * c-c++-common/cpp/ucnid-2011-1.c: Likewise. * g++.dg/cpp/ucnid-4-utf8.C: Add missing space to dg-options. * g++.dg/cpp23/normalize3.C: Enable for c++11 rather than just c++23. * g++.dg/cpp23/normalize4.C: Likewise. * g++.dg/cpp23/normalize5.C: Likewise. * g++.dg/cpp23/normalize7.C: Expect errors rather than just warnings for c++11 and up rather than just c++23. * g++.dg/cpp23/ucnid-2-utf8.C: Expect errors even for c++11 .. c++20.
The master branch has been updated by Jakub Jelinek <jakub@gcc.gnu.org>: https://gcc.gnu.org/g:c264208e161830a5642ee3125871c23110508462 commit r12-5653-gc264208e161830a5642ee3125871c23110508462 Author: Jakub Jelinek <jakub@redhat.com> Date: Wed Dec 1 10:21:20 2021 +0100 libcpp: Enable P1949R7 for C++98 too [PR100977] On Mon, Nov 29, 2021 at 05:53:58PM -0500, Jason Merrill wrote: > I'm inclined to go ahead and change C++98 as well; I doubt anyone is relying > on the particular C++98 extended character set rules, and we already accept > the union of the different sets when not pedantic. Ok, here is an incremental patch to do that also for -std={c,gnu}++98. 2021-12-01 Jakub Jelinek <jakub@redhat.com> PR c++/100977 * init.c (struct lang_flags): Remove cxx23_identifiers. (lang_defaults): Remove cxx23_identifiers initializers. (cpp_set_lang): Don't copy cxx23_identifiers. * include/cpplib.h (struct cpp_options): Adjust comment about c11_identifiers. Remove cxx23_identifiers field. * lex.c (warn_about_normalization): Use cplusplus instead of cxx23_identifiers. * charset.c (ucn_valid_in_identifier): Likewise. * g++.dg/cpp/ucnid-1.C: Adjust expected diagnostics. * g++.dg/cpp/ucnid-1-utf8.C: Likewise.