This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[patch, libfortran] UTF-8 Support, part 1
- From: Jerry DeLisle <jvdelisle at verizon dot net>
- To: Fortran List <fortran at gcc dot gnu dot org>, gcc-patches <gcc-patches at gcc dot gnu dot org>
- Date: Sun, 03 Aug 2008 21:39:45 -0700
- Subject: [patch, libfortran] UTF-8 Support, part 1
Hi all,
This initial patch provides partial support for ENCODING="utf-8" I/O.
The three test cases provided are for testing only. As I further develop this,
I will refine these into test cases for the test suite.
Testing and evaluation of this patch by others is welcome. There are a few
places where there is duplicated code related to processing delimiters. Its a
toss up to make this a macro or otherwise to consolidate these.
Also, please let me know if you see anything I have omitted.
The following items remain to be implemented:
1. Support of internal unit I/O to KIND=4 character strings/arrays. I plan this
as a separate patch. (Part 2)
2. Processing of T,TL, and TR format specifiers with UTF-8 encoded strings.
This will require positioning across variable width characters in the format
buffer. Another patch (Part 3)
I have regression tested this against current trunk on x86-64. While waiting
for review comments, I will proceed with Part 2. I have a scheme already worked
out for part 3.
OK to commit Part 1 after development of the test cases?
Regards,
Jerry
Index: gcc/fortran/io.c
===================================================================
--- gcc/fortran/io.c (revision 138226)
+++ gcc/fortran/io.c (working copy)
@@ -1706,8 +1706,7 @@ gfc_match_open (void)
if (open->encoding->expr_type == EXPR_CONSTANT)
{
- /* TODO: Implement UTF-8 here. */
- static const char * encoding[] = { "DEFAULT", NULL };
+ static const char * encoding[] = { "DEFAULT", "UTF-8", NULL };
if (!compare_to_allowed_values ("ENCODING", encoding, NULL, NULL,
open->encoding->value.character.string,
Index: gcc/fortran/simplify.c
===================================================================
--- gcc/fortran/simplify.c (revision 138226)
+++ gcc/fortran/simplify.c (working copy)
@@ -3795,6 +3795,8 @@ gfc_simplify_selected_char_kind (gfc_exp
if (gfc_compare_with_Cstring (e, "ascii", false) == 0
|| gfc_compare_with_Cstring (e, "default", false) == 0)
kind = 1;
+ else if (gfc_compare_with_Cstring (e, "iso_10646", false) == 0)
+ kind = 4;
else
kind = -1;
Index: libgfortran/intrinsics/selected_char_kind.c
===================================================================
--- libgfortran/intrinsics/selected_char_kind.c (revision 138226)
+++ libgfortran/intrinsics/selected_char_kind.c (working copy)
@@ -44,6 +44,8 @@ selected_char_kind (gfc_charlen_type nam
if ((len == 5 && strncasecmp (name, "ascii", 5) == 0)
|| (len == 7 && strncasecmp (name, "default", 7) == 0))
return 1;
+ else if (len == 9 && strncasecmp (name, "iso_10646", 9) == 0)
+ return 1;
else
return -1;
}
Index: libgfortran/io/read.c
===================================================================
--- libgfortran/io/read.c (revision 138226)
+++ libgfortran/io/read.c (working copy)
@@ -34,6 +34,8 @@ Boston, MA 02110-1301, USA. */
#include <ctype.h>
#include <stdlib.h>
+typedef unsigned char uchar;
+
/* read.c -- Deal with formatted reads */
@@ -270,44 +272,137 @@ read_a (st_parameter_dt *dtp, const fnod
memset (p + m, ' ', n);
}
-void
-read_a_char4 (st_parameter_dt *dtp, const fnode *f, char *p, int length)
+
+static void
+read_utf8_char4 (st_parameter_dt *dtp, void *p, int len, size_t width)
{
+ static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
+ static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+
+ static uchar buffer[6];
+ gfc_char4_t c, *dest;
+ int status, j;
+ size_t i, nbytes, nread;
char *s;
- gfc_char4_t *dest;
- int m, n, wi, status;
- size_t w;
- wi = f->u.w;
- if (wi == -1) /* '(A)' edit descriptor */
- wi = length;
+ len = ((int) width > len) ? len : (int) width;
- w = wi;
+ dest = (gfc_char4_t *) p;
- s = gfc_alloca (w);
+ /* Proceed with decoding one character at a time. */
+ for (j = 0; j < len; j++, dest++)
+ {
+ nbytes = 1;
+ s = (char *) &buffer[0];
+ status = read_block_form (dtp, s, &nbytes);
+ if (status == FAILURE)
+ return;
+
+ c = buffer[0];
+ if (c < 0x80)
+ goto saveit;
+
+ /* The number of leading 1-bits in the first byte indicates how many
+ bytes follow. */
+ for (nbytes = 2; nbytes < 7; nbytes++)
+ if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
+ goto found;
+ goto invalid;
+
+ found:
+ c = (c & masks[nbytes-1]);
+ nread = nbytes - 1;
+
+ s = (char *) &buffer[1];
+ status = read_block_form (dtp, s, &nread);
+ if (status == FAILURE)
+ return;
+ /* Decode the bytes read. */
+ for (i = 1; i < nbytes; i++)
+ {
+ gfc_char4_t n = *s++;
- /* Read in w bytes, treating comma as not a separator. */
- dtp->u.p.sf_read_comma = 0;
- status = read_block_form (dtp, s, &w);
- dtp->u.p.sf_read_comma =
- dtp->u.p.decimal_status == DECIMAL_COMMA ? 0 : 1;
+ if ((n & 0xC0) != 0x80)
+ goto invalid;
+
+ c = ((c << 6) + (n & 0x3F));
+ }
+
+ /* Make sure the shortest possible encoding was used. */
+ if (c <= 0x7F && nbytes > 1) goto invalid;
+ if (c <= 0x7FF && nbytes > 2) goto invalid;
+ if (c <= 0xFFFF && nbytes > 3) goto invalid;
+ if (c <= 0x1FFFFF && nbytes > 4) goto invalid;
+ if (c <= 0x3FFFFFF && nbytes > 5) goto invalid;
+
+ /* Make sure the character is valid. */
+ if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF))
+ goto invalid;
+ saveit:
+ *dest = c;
+ }
+
+ for (j = 0; j < len - (int) width; j++, dest++)
+ *dest = (unsigned char) ' ';
+ return;
+
+ invalid:
+ generate_error (&dtp->common, LIBERROR_READ_VALUE, "Invalid UTF-8 encoding");
+}
+
+
+static void
+read_default_char4 (st_parameter_dt *dtp, char *p, int len, size_t width)
+{
+ char *s;
+ gfc_char4_t *dest;
+ int m, n, status;
+
+ s = gfc_alloca (width);
+
+ status = read_block_form (dtp, s, &width);
if (status == FAILURE)
return;
- if (w > (size_t) length)
- s += (w - length);
+ if (width > (size_t) len)
+ s += (width - len);
- m = ((int) w > length) ? length : (int) w;
+ m = ((int) width > len) ? len : (int) width;
dest = (gfc_char4_t *) p;
for (n = 0; n < m; n++, dest++, s++)
*dest = (unsigned char ) *s;
- for (n = 0; n < length - (int) w; n++, dest++)
+ for (n = 0; n < len - (int) width; n++, dest++)
*dest = (unsigned char) ' ';
}
+
+void
+read_a_char4 (st_parameter_dt *dtp, const fnode *f, char *p, int length)
+{
+ int wi;
+ size_t w;
+
+ wi = f->u.w;
+ if (wi == -1) /* '(A)' edit descriptor */
+ wi = length;
+
+ w = wi;
+
+ /* Read in w characters, treating comma as not a separator. */
+ dtp->u.p.sf_read_comma = 0;
+
+ if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
+ read_utf8_char4 (dtp, p, length, w);
+ else
+ read_default_char4 (dtp, p, length, w);
+
+ dtp->u.p.sf_read_comma =
+ dtp->u.p.decimal_status == DECIMAL_COMMA ? 0 : 1;
+}
+
/* eat_leading_spaces()-- Given a character pointer and a width,
* ignore the leading spaces. */
Index: libgfortran/io/write.c
===================================================================
--- libgfortran/io/write.c (revision 138226)
+++ libgfortran/io/write.c (working copy)
@@ -36,10 +36,161 @@ Boston, MA 02110-1301, USA. */
#include <ctype.h>
#include <stdlib.h>
#include <stdbool.h>
+#include <errno.h>
#define star_fill(p, n) memset(p, '*', n)
#include "write_float.def"
+typedef unsigned char uchar;
+
+/* Write out default char4. */
+
+static void
+write_default_char4 (st_parameter_dt *dtp, gfc_char4_t *source,
+ int src_len, int w_len)
+{
+ char *p;
+ int j, k = 0;
+ gfc_char4_t c;
+ uchar d;
+
+ /* Take care of preceding blanks. */
+ if (w_len > src_len)
+ {
+ k = w_len - src_len;
+ p = write_block (dtp, k);
+ if (p == NULL)
+ return;
+ memset (p, ' ', k);
+ }
+
+ /* Get ready to handle delimiters if needed. */
+
+ switch (dtp->u.p.delim_status)
+ {
+ case DELIM_APOSTROPHE:
+ d = '\'';
+ break;
+ case DELIM_QUOTE:
+ d = '"';
+ break;
+ default:
+ d = ' ';
+ break;
+ }
+
+ /* Now process the remaining characters, one at a time. */
+ for (j = k; j < src_len; j++)
+ {
+ c = source[j];
+
+ /* Handle delimiters if any. */
+ if (c == d && d != ' ')
+ {
+ p = write_block (dtp, 2);
+ if (p == NULL)
+ return;
+ *p++ = (uchar) c;
+ }
+ else
+ {
+ p = write_block (dtp, 1);
+ if (p == NULL)
+ return;
+ }
+ *p = c > 255 ? '?' : (uchar) c;
+ }
+}
+
+
+/* Write out UTF-8 converted from char4. */
+
+static void
+write_utf8_char4 (st_parameter_dt *dtp, gfc_char4_t *source,
+ int src_len, int w_len)
+{
+ char *p;
+ int j, k = 0;
+ gfc_char4_t c;
+ static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+ static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
+ size_t nbytes;
+ uchar buf[6], d, *q;
+
+ /* Take care of preceding blanks. */
+ if (w_len > src_len)
+ {
+ k = w_len - src_len;
+ p = write_block (dtp, k);
+ if (p == NULL)
+ return;
+ memset (p, ' ', k);
+ }
+
+ /* Get ready to handle delimiters if needed. */
+
+ switch (dtp->u.p.delim_status)
+ {
+ case DELIM_APOSTROPHE:
+ d = '\'';
+ break;
+ case DELIM_QUOTE:
+ d = '"';
+ break;
+ default:
+ d = ' ';
+ break;
+ }
+
+ /* Now process the remaining characters, one at a time. */
+ for (j = k; j < src_len; j++)
+ {
+ c = source[j];
+ if (c < 0x80)
+ {
+ /* Handle the delimiters if any. */
+ if (c == d && d != ' ')
+ {
+ p = write_block (dtp, 2);
+ if (p == NULL)
+ return;
+ *p++ = (uchar) c;
+ }
+ else
+ {
+ p = write_block (dtp, 1);
+ if (p == NULL)
+ return;
+ }
+ *p = (uchar) c;
+ }
+ else
+ {
+ /* Convert to UTF-8 sequence. */
+ nbytes = 1;
+ q = &buf[6];
+
+ do
+ {
+ *--q = ((c & 0x3F) | 0x80);
+ c >>= 6;
+ nbytes++;
+ }
+ while (c >= 0x3F || (c & limits[nbytes-1]));
+
+ *--q = (c | masks[nbytes-1]);
+
+ p = write_block (dtp, nbytes);
+ if (p == NULL)
+ return;
+
+ while (q < &buf[6])
+ *p++ = *q++;
+ }
+ }
+}
+
+
void
write_a (st_parameter_dt *dtp, const fnode *f, const char *source, int len)
{
@@ -126,17 +277,16 @@ write_a (st_parameter_dt *dtp, const fno
/* The primary difference between write_a_char4 and write_a is that we have to
- deal with writing from the first byte of the 4-byte character and take care
- of endianess. This currently implements encoding="default" which means we
- write the lowest significant byte. If the 3 most significant bytes are
- not representable emit a '?'. TODO: Implement encoding="UTF-8"
- which will process all 4 bytes and translate to the encoded output. */
+ deal with writing from the first byte of the 4-byte character and pay
+ attention to the most significant bytes. For ENCODING="default" write the
+ lowest significant byte. If the 3 most significant bytes contain
+ non-zero values, emit a '?'. For ENCODING="utf-8", convert the UCS-32 value
+ to the UTF-8 encoded string before writing out. */
void
write_a_char4 (st_parameter_dt *dtp, const fnode *f, const char *source, int len)
{
int wlen;
- char *p;
gfc_char4_t *q;
wlen = f->u.string.length < 0
@@ -173,19 +323,15 @@ write_a_char4 (st_parameter_dt *dtp, con
/* Write out the previously scanned characters in the string. */
if (bytes > 0)
{
- p = write_block (dtp, bytes);
- if (p == NULL)
- return;
- for (j = 0; j < bytes; j++)
- p[j] = q[j] > 255 ? '?' : (unsigned char) q[j];
+ if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
+ write_utf8_char4 (dtp, q, bytes, 0);
+ else
+ write_default_char4 (dtp, q, bytes, 0);
bytes = 0;
}
/* Write out the CR_LF sequence. */
- p = write_block (dtp, 2);
- if (p == NULL)
- return;
- memcpy (p, crlf, 2);
+ write_default_char4 (dtp, crlf, 2, 0);
}
else
bytes++;
@@ -194,32 +340,19 @@ write_a_char4 (st_parameter_dt *dtp, con
/* Write out any remaining bytes if no LF was found. */
if (bytes > 0)
{
- p = write_block (dtp, bytes);
- if (p == NULL)
- return;
- for (j = 0; j < bytes; j++)
- p[j] = q[j] > 255 ? '?' : (unsigned char) q[j];
+ if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
+ write_utf8_char4 (dtp, q, bytes, 0);
+ else
+ write_default_char4 (dtp, q, bytes, 0);
}
}
else
{
#endif
- int j;
- p = write_block (dtp, wlen);
- if (p == NULL)
- return;
-
- if (wlen < len)
- {
- for (j = 0; j < wlen; j++)
- p[j] = q[j] > 255 ? '?' : (unsigned char) q[j];
- }
+ if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
+ write_utf8_char4 (dtp, q, len, wlen);
else
- {
- memset (p, ' ', wlen - len);
- for (j = wlen - len; j < wlen; j++)
- p[j] = q[j] > 255 ? '?' : (unsigned char) q[j];
- }
+ write_default_char4 (dtp, q, len, wlen);
#ifdef HAVE_CRLF
}
#endif
@@ -745,8 +878,6 @@ write_character (st_parameter_dt *dtp, c
{
int i, extra;
char *p, d;
- gfc_char4_t *q;
-
switch (dtp->u.p.delim_status)
{
@@ -769,9 +900,9 @@ write_character (st_parameter_dt *dtp, c
{
extra = 2;
- for (i = 0; i < length; i++)
- if (source[i] == d)
- extra++;
+ for (i = 0; i < length; i++)
+ if (source[i] == d)
+ extra++;
}
p = write_block (dtp, length + extra);
@@ -796,40 +927,24 @@ write_character (st_parameter_dt *dtp, c
}
else
{
- /* We have to scan the source string looking for delimiters to determine
- how large the write block needs to be. */
if (d == ' ')
- extra = 0;
- else
{
- extra = 2;
-
- q = (gfc_char4_t *) source;
- for (i = 0; i < length; i++, q++)
- if (*q == (gfc_char4_t) d)
- extra++;
- }
-
- p = write_block (dtp, length + extra);
- if (p == NULL)
- return;
-
- if (d == ' ')
- {
- q = (gfc_char4_t *) source;
- for (i = 0; i < length; i++, q++)
- p[i] = *q > 255 ? '?' : (unsigned char) *q;
+ if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
+ write_utf8_char4 (dtp, (gfc_char4_t *) source, length, 0);
+ else
+ write_default_char4 (dtp, (gfc_char4_t *) source, length, 0);
}
else
{
- *p++ = d;
- q = (gfc_char4_t *) source;
- for (i = 0; i < length; i++, q++)
- {
- *p++ = *q > 255 ? '?' : (unsigned char) *q;
- if (*q == (gfc_char4_t) d)
- *p++ = d;
- }
+ p = write_block (dtp, 1);
+ *p = d;
+
+ if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
+ write_utf8_char4 (dtp, (gfc_char4_t *) source, length, 0);
+ else
+ write_default_char4 (dtp, (gfc_char4_t *) source, length, 0);
+
+ p = write_block (dtp, 1);
*p = d;
}
}
program test3
INTEGER,PARAMETER :: ucs4 = selected_char_kind("ISO_10646")
CHARACTER(1,UCS4),PARAMETER :: nen=CHAR(INT(Z'5e74'),UCS4), & !year
gatsu=CHAR(INT(Z'6708'),ucs4), & !month
nichi=CHAR(INT(Z'65e5'),ucs4) !day
character(len= 25, kind=1) string
integer values(8)
call date_and_time (values=values)
open(6, encoding="utf-8")
write(6,1) values(1),nen,values(2),gatsu,values(3),nichi
!write(string,1) values(1),nen,values(2),gatsu,values(3),nichi
1 format(i0,a,i0,a,i0,a)
!print '(a)', string
end program test3
program test1
implicit none
integer, parameter :: k4 = 4
character(kind=4, len=30) :: string4, jerry4
string4 = k4_"This is Greek: \u039f\u03cd\u03c7\u03af"
jerry4 = k4_"Jerry in Japanese is: \u30b8\u30a8\u30ea\u30fc"
open(10, file="test.txt", encoding="utf-8", status="replace")
write(10,'(a)') string4
write(10,'(a)') jerry4
rewind(10)
string4 = k4_""
jerry4 = k4_""
read(10,'(a)') string4
read(10,'(a)') jerry4
open(6, encoding="utf-8")
write(6,*) string4
write(6,*) jerry4
end program test1
! ����
! Î?á½?Ï?ὶ
program test2
implicit none
integer, parameter :: k4 = 4
character(kind=4, len=30) :: string4, jerry4
string4 = k4_"This is Greek: \u039f\u03cd\u03c7\u03af"
jerry4 = k4_"Jerry in Japanese is: \u30b8\u30a8\u30ea\u30fc"
open(10, file="test.txt", encoding="utf-8", status="replace")
open(6, encoding="utf-8", delim="apostrophe")
!open(6, encoding="default")
write(10,'(a)') string4
write(10,'(a)') jerry4
rewind(10)
write(6,*) string4
write(6,*) jerry4
string4 = k4_""
jerry4 = k4_""
read(10,'(a)') string4
read(10,'(a)') jerry4
write(6,*) "k1_This "" is a test"
write(6,*) k4_"k4_This "" is a test"
print *, k4_"k4_This is"" Greek: \u039f\u03cd\u03c7\u03af"
write(6,*) "string4>",string4
write(6,*) "jerry4>",jerry4
end program test2
! ����
! Î?á½?Ï?ὶ