This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

[patch, libfortran] UTF-8 Support, part 1

From: Jerry DeLisle <jvdelisle at verizon dot net>
To: Fortran List <fortran at gcc dot gnu dot org>, gcc-patches <gcc-patches at gcc dot gnu dot org>
Date: Sun, 03 Aug 2008 21:39:45 -0700
Subject: [patch, libfortran] UTF-8 Support, part 1

Hi all,

This initial patch provides partial support for ENCODING="utf-8" I/O.

The three test cases provided are for testing only. As I further develop this, I will refine these into test cases for the test suite.

Testing and evaluation of this patch by others is welcome. There are a few places where there is duplicated code related to processing delimiters. Its a toss up to make this a macro or otherwise to consolidate these.

Also, please let me know if you see anything I have omitted.

The following items remain to be implemented:

1. Support of internal unit I/O to KIND=4 character strings/arrays. I plan this as a separate patch. (Part 2)

2. Processing of T,TL, and TR format specifiers with UTF-8 encoded strings. This will require positioning across variable width characters in the format buffer. Another patch (Part 3)

I have regression tested this against current trunk on x86-64. While waiting for review comments, I will proceed with Part 2. I have a scheme already worked out for part 3.

OK to commit Part 1 after development of the test cases?

Regards,

Jerry

Index: gcc/fortran/io.c
===================================================================
--- gcc/fortran/io.c	(revision 138226)
+++ gcc/fortran/io.c	(working copy)
@@ -1706,8 +1706,7 @@ gfc_match_open (void)
     
       if (open->encoding->expr_type == EXPR_CONSTANT)
 	{
-	  /* TODO: Implement UTF-8 here.  */
-	  static const char * encoding[] = { "DEFAULT", NULL };
+	  static const char * encoding[] = { "DEFAULT", "UTF-8", NULL };
 
 	  if (!compare_to_allowed_values ("ENCODING", encoding, NULL, NULL,
 					  open->encoding->value.character.string,
Index: gcc/fortran/simplify.c
===================================================================
--- gcc/fortran/simplify.c	(revision 138226)
+++ gcc/fortran/simplify.c	(working copy)
@@ -3795,6 +3795,8 @@ gfc_simplify_selected_char_kind (gfc_exp
   if (gfc_compare_with_Cstring (e, "ascii", false) == 0
       || gfc_compare_with_Cstring (e, "default", false) == 0)
     kind = 1;
+  else if (gfc_compare_with_Cstring (e, "iso_10646", false) == 0)
+    kind = 4;
   else
     kind = -1;
 
Index: libgfortran/intrinsics/selected_char_kind.c
===================================================================
--- libgfortran/intrinsics/selected_char_kind.c	(revision 138226)
+++ libgfortran/intrinsics/selected_char_kind.c	(working copy)
@@ -44,6 +44,8 @@ selected_char_kind (gfc_charlen_type nam
   if ((len == 5 && strncasecmp (name, "ascii", 5) == 0)
       || (len == 7 && strncasecmp (name, "default", 7) == 0))
     return 1;
+  else if (len == 9 && strncasecmp (name, "iso_10646", 9) == 0)
+    return 1;
   else
     return -1;
 }
Index: libgfortran/io/read.c
===================================================================
--- libgfortran/io/read.c	(revision 138226)
+++ libgfortran/io/read.c	(working copy)
@@ -34,6 +34,8 @@ Boston, MA 02110-1301, USA.  */
 #include <ctype.h>
 #include <stdlib.h>
 
+typedef unsigned char uchar;
+
 /* read.c -- Deal with formatted reads */
 
 
@@ -270,44 +272,137 @@ read_a (st_parameter_dt *dtp, const fnod
     memset (p + m, ' ', n);
 }
 
-void
-read_a_char4 (st_parameter_dt *dtp, const fnode *f, char *p, int length)
+
+static void
+read_utf8_char4 (st_parameter_dt *dtp, void *p, int len, size_t width)
 {
+  static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
+  static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+
+  static uchar buffer[6];
+  gfc_char4_t c, *dest;
+  int status, j;
+  size_t i, nbytes, nread;
   char *s;
-  gfc_char4_t *dest;
-  int m, n, wi, status;
-  size_t w;
 
-  wi = f->u.w;
-  if (wi == -1) /* '(A)' edit descriptor  */
-    wi = length;
+  len = ((int) width > len) ? len : (int) width;
 
-  w = wi;
+  dest = (gfc_char4_t *) p;
 
-  s = gfc_alloca (w);
+  /* Proceed with decoding one character at a time.  */
+  for (j = 0; j < len; j++, dest++)
+    {
+      nbytes = 1;
+      s = (char *) &buffer[0];
+      status = read_block_form (dtp, s, &nbytes);
+      if (status == FAILURE)
+	return;
+
+      c = buffer[0];
+      if (c < 0x80)
+	goto saveit;
+
+      /* The number of leading 1-bits in the first byte indicates how many
+	  bytes follow.  */
+      for (nbytes = 2; nbytes < 7; nbytes++)
+	if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
+	  goto found;
+      goto invalid;
+	
+ found:
+      c = (c & masks[nbytes-1]);
+      nread = nbytes - 1;
+
+      s = (char *) &buffer[1];
+      status = read_block_form (dtp, s, &nread);
+      if (status == FAILURE)
+	return;
+      /* Decode the bytes read.  */
+      for (i = 1; i < nbytes; i++)
+	{
+	  gfc_char4_t n = *s++;
 
-  /* Read in w bytes, treating comma as not a separator.  */
-  dtp->u.p.sf_read_comma = 0;
-  status = read_block_form (dtp, s, &w);
-  dtp->u.p.sf_read_comma =
-    dtp->u.p.decimal_status == DECIMAL_COMMA ? 0 : 1;
+	  if ((n & 0xC0) != 0x80)
+	    goto invalid;
+
+	  c = ((c << 6) + (n & 0x3F));
+	}
+
+      /* Make sure the shortest possible encoding was used.  */
+      if (c <=      0x7F && nbytes > 1) goto invalid;
+      if (c <=     0x7FF && nbytes > 2) goto invalid;
+      if (c <=    0xFFFF && nbytes > 3) goto invalid;
+      if (c <=  0x1FFFFF && nbytes > 4) goto invalid;
+      if (c <= 0x3FFFFFF && nbytes > 5) goto invalid;
+
+      /* Make sure the character is valid.  */
+      if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF))
+	goto invalid;
+ saveit:
+      *dest = c;
+    }
+
+  for (j = 0; j < len - (int) width; j++, dest++)
+    *dest = (unsigned char) ' ';
+  return;
+  
+ invalid:
+  generate_error (&dtp->common, LIBERROR_READ_VALUE, "Invalid UTF-8 encoding");
+}
+
+
+static void
+read_default_char4 (st_parameter_dt *dtp, char *p, int len, size_t width)
+{
+  char *s;
+  gfc_char4_t *dest;
+  int m, n, status;
+
+  s = gfc_alloca (width);
+
+  status = read_block_form (dtp, s, &width);
   
   if (status == FAILURE)
     return;
-  if (w > (size_t) length)
-     s += (w - length);
+  if (width > (size_t) len)
+     s += (width - len);
 
-  m = ((int) w > length) ? length : (int) w;
+  m = ((int) width > len) ? len : (int) width;
   
   dest = (gfc_char4_t *) p;
   
   for (n = 0; n < m; n++, dest++, s++)
     *dest = (unsigned char ) *s;
 
-  for (n = 0; n < length - (int) w; n++, dest++)
+  for (n = 0; n < len - (int) width; n++, dest++)
     *dest = (unsigned char) ' ';
 }
 
+
+void
+read_a_char4 (st_parameter_dt *dtp, const fnode *f, char *p, int length)
+{
+  int wi;
+  size_t w;
+
+  wi = f->u.w;
+  if (wi == -1) /* '(A)' edit descriptor  */
+    wi = length;
+
+  w = wi;
+
+  /* Read in w characters, treating comma as not a separator.  */
+  dtp->u.p.sf_read_comma = 0;
+
+  if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
+    read_utf8_char4 (dtp, p, length, w);
+  else
+    read_default_char4 (dtp, p, length, w);
+  
+  dtp->u.p.sf_read_comma =
+    dtp->u.p.decimal_status == DECIMAL_COMMA ? 0 : 1;
+}
+
 /* eat_leading_spaces()-- Given a character pointer and a width,
  * ignore the leading spaces.  */
 
Index: libgfortran/io/write.c
===================================================================
--- libgfortran/io/write.c	(revision 138226)
+++ libgfortran/io/write.c	(working copy)
@@ -36,10 +36,161 @@ Boston, MA 02110-1301, USA.  */
 #include <ctype.h>
 #include <stdlib.h>
 #include <stdbool.h>
+#include <errno.h>
 #define star_fill(p, n) memset(p, '*', n)
 
 #include "write_float.def"
 
+typedef unsigned char uchar;
+
+/* Write out default char4.  */
+
+static void
+write_default_char4 (st_parameter_dt *dtp, gfc_char4_t *source,
+		     int src_len, int w_len)
+{
+  char *p;
+  int j, k = 0;
+  gfc_char4_t c;
+  uchar d;
+      
+  /* Take care of preceding blanks.  */
+  if (w_len > src_len)
+    {
+      k = w_len - src_len;
+      p = write_block (dtp, k);
+      if (p == NULL)
+	return;
+      memset (p, ' ', k);
+    }
+
+  /* Get ready to handle delimiters if needed.  */
+
+  switch (dtp->u.p.delim_status)
+    {
+    case DELIM_APOSTROPHE:
+      d = '\'';
+      break;
+    case DELIM_QUOTE:
+      d = '"';
+      break;
+    default:
+      d = ' ';
+      break;
+    }
+
+  /* Now process the remaining characters, one at a time.  */
+  for (j = k; j < src_len; j++)
+    {
+      c = source[j];
+    
+      /* Handle delimiters if any.  */
+      if (c == d && d != ' ')
+	{
+	  p = write_block (dtp, 2);
+	  if (p == NULL)
+	    return;
+	  *p++ = (uchar) c;
+	}
+      else
+	{
+	  p = write_block (dtp, 1);
+	  if (p == NULL)
+	    return;
+	}
+      *p = c > 255 ? '?' : (uchar) c;
+    }
+}
+
+
+/* Write out UTF-8 converted from char4.  */
+
+static void
+write_utf8_char4 (st_parameter_dt *dtp, gfc_char4_t *source,
+		     int src_len, int w_len)
+{
+  char *p;
+  int j, k = 0;
+  gfc_char4_t c;
+  static const uchar masks[6] =  { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+  static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
+  size_t nbytes;
+  uchar buf[6], d, *q; 
+
+  /* Take care of preceding blanks.  */
+  if (w_len > src_len)
+    {
+      k = w_len - src_len;
+      p = write_block (dtp, k);
+      if (p == NULL)
+	return;
+      memset (p, ' ', k);
+    }
+
+  /* Get ready to handle delimiters if needed.  */
+
+  switch (dtp->u.p.delim_status)
+    {
+    case DELIM_APOSTROPHE:
+      d = '\'';
+      break;
+    case DELIM_QUOTE:
+      d = '"';
+      break;
+    default:
+      d = ' ';
+      break;
+    }
+
+  /* Now process the remaining characters, one at a time.  */
+  for (j = k; j < src_len; j++)
+    {
+      c = source[j];
+      if (c < 0x80)
+	{
+	  /* Handle the delimiters if any.  */
+	  if (c == d && d != ' ')
+	    {
+	      p = write_block (dtp, 2);
+	      if (p == NULL)
+		return;
+	      *p++ = (uchar) c;
+	    }
+	  else
+	    {
+	      p = write_block (dtp, 1);
+	      if (p == NULL)
+		return;
+	    }
+	  *p = (uchar) c;
+	}
+      else
+	{
+	  /* Convert to UTF-8 sequence.  */
+	  nbytes = 1;
+	  q = &buf[6];
+
+	  do
+	    {
+	      *--q = ((c & 0x3F) | 0x80);
+	      c >>= 6;
+	      nbytes++;
+	    }
+	  while (c >= 0x3F || (c & limits[nbytes-1]));
+
+	  *--q = (c | masks[nbytes-1]);
+
+	  p = write_block (dtp, nbytes);
+	  if (p == NULL)
+	    return;
+
+	  while (q < &buf[6])
+	    *p++ = *q++;
+	}
+    }
+}
+
+
 void
 write_a (st_parameter_dt *dtp, const fnode *f, const char *source, int len)
 {
@@ -126,17 +277,16 @@ write_a (st_parameter_dt *dtp, const fno
 
 
 /* The primary difference between write_a_char4 and write_a is that we have to
-   deal with writing from the first byte of the 4-byte character and take care
-   of endianess.  This currently implements encoding="default" which means we
-   write the lowest significant byte. If the 3 most significant bytes are
-   not representable emit a '?'.  TODO: Implement encoding="UTF-8"
-   which will process all 4 bytes and translate to the encoded output.  */
+   deal with writing from the first byte of the 4-byte character and pay
+   attention to the most significant bytes.  For ENCODING="default" write the
+   lowest significant byte. If the 3 most significant bytes contain
+   non-zero values, emit a '?'.  For ENCODING="utf-8", convert the UCS-32 value
+   to the UTF-8 encoded string before writing out.  */
 
 void
 write_a_char4 (st_parameter_dt *dtp, const fnode *f, const char *source, int len)
 {
   int wlen;
-  char *p;
   gfc_char4_t *q;
 
   wlen = f->u.string.length < 0
@@ -173,19 +323,15 @@ write_a_char4 (st_parameter_dt *dtp, con
 	      /* Write out the previously scanned characters in the string.  */
 	      if (bytes > 0)
 		{
-		  p = write_block (dtp, bytes);
-		  if (p == NULL)
-		    return;
-		  for (j = 0; j < bytes; j++)
-		    p[j] = q[j] > 255 ? '?' : (unsigned char) q[j];
+		  if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
+		    write_utf8_char4 (dtp, q, bytes, 0);
+		  else
+		    write_default_char4 (dtp, q, bytes, 0);
 		  bytes = 0;
 		}
 
 	      /* Write out the CR_LF sequence.  */ 
-	      p = write_block (dtp, 2);
-              if (p == NULL)
-                return;
-	      memcpy (p, crlf, 2);
+	      write_default_char4 (dtp, crlf, 2, 0);
 	    }
 	  else
 	    bytes++;
@@ -194,32 +340,19 @@ write_a_char4 (st_parameter_dt *dtp, con
       /*  Write out any remaining bytes if no LF was found.  */
       if (bytes > 0)
 	{
-	  p = write_block (dtp, bytes);
-	  if (p == NULL)
-	    return;
-	  for (j = 0; j < bytes; j++)
-	    p[j] = q[j] > 255 ? '?' : (unsigned char) q[j];
+	  if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
+	    write_utf8_char4 (dtp, q, bytes, 0);
+	  else
+	    write_default_char4 (dtp, q, bytes, 0);
 	}
     }
   else
     {
 #endif
-      int j;
-      p = write_block (dtp, wlen);
-      if (p == NULL)
-	return;
-
-      if (wlen < len)
-	{
-	  for (j = 0; j < wlen; j++)
-	    p[j] = q[j] > 255 ? '?' : (unsigned char) q[j];
-	}
+      if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
+	write_utf8_char4 (dtp, q, len, wlen);
       else
-	{
-	  memset (p, ' ', wlen - len);
-	  for (j = wlen - len; j < wlen; j++)
-	    p[j] = q[j] > 255 ? '?' : (unsigned char) q[j];
-	}
+	write_default_char4 (dtp, q, len, wlen);
 #ifdef HAVE_CRLF
     }
 #endif
@@ -745,8 +878,6 @@ write_character (st_parameter_dt *dtp, c
 {
   int i, extra;
   char *p, d;
-  gfc_char4_t *q;
-
 
   switch (dtp->u.p.delim_status)
     {
@@ -769,9 +900,9 @@ write_character (st_parameter_dt *dtp, c
 	{
 	  extra = 2;
 
-	    for (i = 0; i < length; i++)
-	      if (source[i] == d)
-		extra++;
+	  for (i = 0; i < length; i++)
+	    if (source[i] == d)
+	      extra++;
 	}
 
       p = write_block (dtp, length + extra);
@@ -796,40 +927,24 @@ write_character (st_parameter_dt *dtp, c
     }
   else
     {
-      /* We have to scan the source string looking for delimiters to determine
-	 how large the write block needs to be.  */
       if (d == ' ')
-	extra = 0;
-      else
 	{
-	  extra = 2;
-
-	  q = (gfc_char4_t *) source;
-	  for (i = 0; i < length; i++, q++)
-	    if (*q == (gfc_char4_t) d)
-	      extra++;
-	}
-
-      p = write_block (dtp, length + extra);
-      if (p == NULL)
-	return;
-
-      if (d == ' ')
-	{
-	  q = (gfc_char4_t *) source;
-	  for (i = 0; i < length; i++, q++)
-	    p[i] = *q > 255 ? '?' : (unsigned char) *q;
+	  if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
+	    write_utf8_char4 (dtp, (gfc_char4_t *) source, length, 0);
+	  else
+	    write_default_char4 (dtp, (gfc_char4_t *) source, length, 0);
 	}
       else
 	{
-	  *p++ = d;
-	  q = (gfc_char4_t *) source;
-	  for (i = 0; i < length; i++, q++)
-	    {
-	      *p++ = *q > 255 ? '?' : (unsigned char) *q;
-	      if (*q == (gfc_char4_t) d)
-		*p++ = d;
-	    }
+	  p = write_block (dtp, 1);
+	  *p = d;
+
+	  if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
+	    write_utf8_char4 (dtp, (gfc_char4_t *) source, length, 0);
+	  else
+	    write_default_char4 (dtp, (gfc_char4_t *) source, length, 0);
+
+	  p = write_block (dtp, 1);
 	  *p = d;
 	}
     }

program test3
  INTEGER,PARAMETER :: ucs4 = selected_char_kind("ISO_10646")
  CHARACTER(1,UCS4),PARAMETER :: nen=CHAR(INT(Z'5e74'),UCS4), & !year
    gatsu=CHAR(INT(Z'6708'),ucs4), & !month
    nichi=CHAR(INT(Z'65e5'),ucs4) !day
  character(len= 25, kind=1) string
  integer values(8)
  call date_and_time (values=values)
  open(6, encoding="utf-8")
  write(6,1) values(1),nen,values(2),gatsu,values(3),nichi
  !write(string,1) values(1),nen,values(2),gatsu,values(3),nichi
1 format(i0,a,i0,a,i0,a)
  !print '(a)', string
end program test3

program test1
  implicit none
  integer, parameter :: k4 = 4
  character(kind=4, len=30) :: string4, jerry4
  string4 = k4_"This is Greek:  \u039f\u03cd\u03c7\u03af"
  jerry4 = k4_"Jerry in Japanese is:    \u30b8\u30a8\u30ea\u30fc"
  open(10, file="test.txt", encoding="utf-8", status="replace")
  write(10,'(a)') string4
  write(10,'(a)') jerry4
  rewind(10)
  string4 = k4_""
  jerry4 = k4_""
  read(10,'(a)') string4
  read(10,'(a)') jerry4
  open(6, encoding="utf-8")
  write(6,*) string4
  write(6,*) jerry4
end program test1

! ã?¸ã?¨ã?ªã?¼
! Î?á½?Ï?á½¶

program test2
  implicit none
  integer, parameter :: k4 = 4
  character(kind=4, len=30) :: string4, jerry4
  string4 = k4_"This is Greek: \u039f\u03cd\u03c7\u03af"
  jerry4 = k4_"Jerry in Japanese is: \u30b8\u30a8\u30ea\u30fc"
  open(10, file="test.txt", encoding="utf-8", status="replace")
  open(6, encoding="utf-8", delim="apostrophe")
  !open(6, encoding="default")
  write(10,'(a)') string4
  write(10,'(a)') jerry4
  rewind(10)
  write(6,*) string4
  write(6,*) jerry4
  string4 = k4_""
  jerry4 = k4_""
  read(10,'(a)') string4
  read(10,'(a)') jerry4
  write(6,*) "k1_This "" is a test"
  write(6,*) k4_"k4_This "" is a test"
  print *, k4_"k4_This is"" Greek: \u039f\u03cd\u03c7\u03af"
  write(6,*) "string4>",string4
  write(6,*) "jerry4>",jerry4
end program test2

! ã?¸ã?¨ã?ªã?¼
! Î?á½?Ï?á½¶

Follow-Ups:
- Re: [patch, libfortran] UTF-8 Support, part 1
  - From: Tobias Burnus

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]