[gcc r12-7643] libstdc++: Fix reading UTF-8 characters for 16-bit targets [PR104875]

Jonathan Wakely redi@gcc.gnu.org
Mon Mar 14 13:09:22 GMT 2022


https://gcc.gnu.org/g:8f7b7c1495f92c72da154d32317943a2cc276ca8

commit r12-7643-g8f7b7c1495f92c72da154d32317943a2cc276ca8
Author: Jonathan Wakely <jwakely@redhat.com>
Date:   Fri Mar 11 14:52:38 2022 +0000

    libstdc++: Fix reading UTF-8 characters for 16-bit targets [PR104875]
    
    The current code in read_utf8_code_point assumes that integer promotion
    will create a 32-bit int, but that's not true for 16-bit targets like
    msp430 and avr. This changes the intermediate variables used for each
    octet from unsigned char to char32_t, so that (c << N) works correctly
    when N > 8.
    
    libstdc++-v3/ChangeLog:
    
            PR libstdc++/104875
            * src/c++11/codecvt.cc (read_utf8_code_point): Use char32_t to
            hold octets that will be left-shifted.

Diff:
---
 libstdc++-v3/src/c++11/codecvt.cc | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/libstdc++-v3/src/c++11/codecvt.cc b/libstdc++-v3/src/c++11/codecvt.cc
index d9f2dacb647..9f8cb767732 100644
--- a/libstdc++-v3/src/c++11/codecvt.cc
+++ b/libstdc++-v3/src/c++11/codecvt.cc
@@ -254,7 +254,7 @@ namespace
     const size_t avail = from.size();
     if (avail == 0)
       return incomplete_mb_character;
-    unsigned char c1 = from[0];
+    char32_t c1 = (unsigned char) from[0];
     // https://en.wikipedia.org/wiki/UTF-8#Sample_code
     if (c1 < 0x80)
     {
@@ -267,7 +267,7 @@ namespace
     {
       if (avail < 2)
 	return incomplete_mb_character;
-      unsigned char c2 = from[1];
+      char32_t c2 = (unsigned char) from[1];
       if ((c2 & 0xC0) != 0x80)
 	return invalid_mb_sequence;
       char32_t c = (c1 << 6) + c2 - 0x3080;
@@ -279,12 +279,12 @@ namespace
     {
       if (avail < 3)
 	return incomplete_mb_character;
-      unsigned char c2 = from[1];
+      char32_t c2 = (unsigned char) from[1];
       if ((c2 & 0xC0) != 0x80)
 	return invalid_mb_sequence;
       if (c1 == 0xE0 && c2 < 0xA0) // overlong
 	return invalid_mb_sequence;
-      unsigned char c3 = from[2];
+      char32_t c3 = (unsigned char) from[2];
       if ((c3 & 0xC0) != 0x80)
 	return invalid_mb_sequence;
       char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080;
@@ -296,17 +296,17 @@ namespace
     {
       if (avail < 4)
 	return incomplete_mb_character;
-      unsigned char c2 = from[1];
+      char32_t c2 = (unsigned char) from[1];
       if ((c2 & 0xC0) != 0x80)
 	return invalid_mb_sequence;
       if (c1 == 0xF0 && c2 < 0x90) // overlong
 	return invalid_mb_sequence;
       if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF
       return invalid_mb_sequence;
-      unsigned char c3 = from[2];
+      char32_t c3 = (unsigned char) from[2];
       if ((c3 & 0xC0) != 0x80)
 	return invalid_mb_sequence;
-      unsigned char c4 = from[3];
+      char32_t c4 = (unsigned char) from[3];
       if ((c4 & 0xC0) != 0x80)
 	return invalid_mb_sequence;
       char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080;


More information about the Libstdc++-cvs mailing list