[Ada] Implement BOM recognition

Arnaud Charlet charlet@adacore.com
Wed Dec 19 16:51:00 GMT 2007


Tested on i686-linux, committed on trunk

This patch is in two parts, the first is the addition of the
new package GNAT.Byte_Order_Mark (g-byorma.ads/adb). The second
is the use of this package in the compiler scanner to set the
encoding of the file accordingly.

The following test program has a UTF-8 BOM at the start and
should compile cleanly:

with Text_IO; use Text_IO;

procedure Hello is
   ПрПверка_СвязО : String := "Hello world";
   -- ПрПверка связО
begin
   Put_Line (ПрПверка_СвязО);
end;

2007-12-19  Robert Dewar  <dewar@adacore.com>

	* g-byorma.adb, g-byorma.ads, g-decstr.adb, g-decstr.ads,
	g-deutst.ads, g-encstr.adb, g-encstr.ads, g-enutst.ads: New files.

	* scn.adb: Implement BOM recognition
-------------- next part --------------
Index: g-byorma.adb
===================================================================
--- g-byorma.adb	(revision 0)
+++ g-byorma.adb	(revision 0)
@@ -0,0 +1,191 @@
+------------------------------------------------------------------------------
+--                                                                          --
+--                         GNAT COMPILER COMPONENTS                         --
+--                                                                          --
+--                 G N A T . B Y T E _ O R D E R _ M A R K                  --
+--                                                                          --
+--                                 B o d y                                  --
+--                                                                          --
+--                     Copyright (C) 2006-2007, AdaCore                     --
+--                                                                          --
+-- GNAT is free software;  you can  redistribute it  and/or modify it under --
+-- terms of the  GNU General Public License as published  by the Free Soft- --
+-- ware  Foundation;  either version 2,  or (at your option) any later ver- --
+-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
+-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
+-- or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License --
+-- for  more details.  You should have  received  a copy of the GNU General --
+-- Public License  distributed with GNAT;  see file COPYING.  If not, write --
+-- to  the  Free Software Foundation,  51  Franklin  Street,  Fifth  Floor, --
+-- Boston, MA 02110-1301, USA.                                              --
+--                                                                          --
+-- As a special exception,  if other files  instantiate  generics from this --
+-- unit, or you link  this unit with other files  to produce an executable, --
+-- this  unit  does not  by itself cause  the resulting  executable  to  be --
+-- covered  by the  GNU  General  Public  License.  This exception does not --
+-- however invalidate  any other reasons why  the executable file  might be --
+-- covered by the  GNU Public License.                                      --
+--                                                                          --
+-- GNAT was originally developed  by the GNAT team at  New York University. --
+-- Extensive contributions were provided by Ada Core Technologies Inc.      --
+--                                                                          --
+------------------------------------------------------------------------------
+
+package body GNAT.Byte_Order_Mark is
+
+   --------------
+   -- Read_BOM --
+   --------------
+
+   procedure Read_BOM
+     (Str         : String;
+      Len         : out Natural;
+      BOM         : out BOM_Kind;
+      XML_Support : Boolean := False)
+   is
+   begin
+      --  UTF-16 (big-endian)
+
+      if Str'Length >= 2
+        and then Str (Str'First) = Character'Val (16#FE#)
+        and then Str (Str'First + 1) = Character'Val (16#FF#)
+      then
+         Len := 2;
+         BOM := UTF16_BE;
+
+      --  UTF-16 (little-endian)
+
+      elsif Str'Length >= 2
+        and then Str (Str'First) = Character'Val (16#FF#)
+        and then Str (Str'First + 1) = Character'Val (16#FE#)
+      then
+         Len := 2;
+         BOM := UTF16_LE;
+
+      --  UTF-32 (big-endian)
+
+      elsif Str'Length >= 4
+        and then Str (Str'First)     = Character'Val (16#00#)
+        and then Str (Str'First + 1) = Character'Val (16#00#)
+        and then Str (Str'First + 2) = Character'Val (16#FE#)
+        and then Str (Str'First + 3) = Character'Val (16#FF#)
+      then
+         Len := 4;
+         BOM := UTF32_BE;
+
+      --  UTF-32 (little-endian)
+
+      elsif Str'Length >= 4
+        and then Str (Str'First)     = Character'Val (16#FF#)
+        and then Str (Str'First + 1) = Character'Val (16#FE#)
+        and then Str (Str'First + 2) = Character'Val (16#00#)
+        and then Str (Str'First + 3) = Character'Val (16#00#)
+      then
+         Len := 4;
+         BOM := UTF32_LE;
+
+      --  UTF-8 (endian-independent)
+
+      elsif Str'Length >= 3
+        and then Str (Str'First)     = Character'Val (16#EF#)
+        and then Str (Str'First + 1) = Character'Val (16#BB#)
+        and then Str (Str'First + 2) = Character'Val (16#BF#)
+      then
+         Len := 3;
+         BOM := UTF8_All;
+
+      --  UCS-4 (big-endian) XML only
+
+      elsif XML_Support
+        and then Str'Length >= 4
+        and then Str (Str'First)     = Character'Val (16#00#)
+        and then Str (Str'First + 1) = Character'Val (16#00#)
+        and then Str (Str'First + 2) = Character'Val (16#00#)
+        and then Str (Str'First + 3) = Character'Val (16#3C#)
+      then
+         Len := 0;
+         BOM := UCS4_BE;
+
+      --  UCS-4 (little-endian) XML case
+
+      elsif XML_Support
+        and then Str'Length >= 4
+        and then Str (Str'First)     = Character'Val (16#3C#)
+        and then Str (Str'First + 1) = Character'Val (16#00#)
+        and then Str (Str'First + 2) = Character'Val (16#00#)
+        and then Str (Str'First + 3) = Character'Val (16#00#)
+      then
+         Len := 0;
+         BOM := UCS4_LE;
+
+      --  UCS-4 (unusual byte order 2143) XML case
+
+      elsif XML_Support
+        and then Str'Length >= 4
+        and then Str (Str'First)     = Character'Val (16#00#)
+        and then Str (Str'First + 1) = Character'Val (16#00#)
+        and then Str (Str'First + 2) = Character'Val (16#3C#)
+        and then Str (Str'First + 3) = Character'Val (16#00#)
+      then
+         Len := 0;
+         BOM := UCS4_2143;
+
+      --  UCS-4 (unusual byte order 3412) XML case
+
+      elsif XML_Support
+        and then Str'Length >= 4
+        and then Str (Str'First)     = Character'Val (16#00#)
+        and then Str (Str'First + 1) = Character'Val (16#3C#)
+        and then Str (Str'First + 2) = Character'Val (16#00#)
+        and then Str (Str'First + 3) = Character'Val (16#00#)
+      then
+         Len := 0;
+         BOM := UCS4_3412;
+
+      --  UTF-16 (big-endian) XML case
+
+      elsif XML_Support
+        and then Str'Length >= 4
+        and then Str (Str'First)     = Character'Val (16#00#)
+        and then Str (Str'First + 1) = Character'Val (16#3C#)
+        and then Str (Str'First + 2) = Character'Val (16#00#)
+        and then Str (Str'First + 3) = Character'Val (16#3F#)
+      then
+         Len := 0;
+         BOM := UTF16_BE;
+
+      --  UTF-32 (little-endian) XML case
+
+      elsif XML_Support
+        and then Str'Length >= 4
+        and then Str (Str'First)     = Character'Val (16#3C#)
+        and then Str (Str'First + 1) = Character'Val (16#00#)
+        and then Str (Str'First + 2) = Character'Val (16#3F#)
+        and then Str (Str'First + 3) = Character'Val (16#00#)
+      then
+         Len := 0;
+         BOM := UTF16_LE;
+
+      --  Unrecognized special encodings XML only
+
+      elsif XML_Support
+        and then Str'Length >= 4
+        and then Str (Str'First)     = Character'Val (16#3C#)
+        and then Str (Str'First + 1) = Character'Val (16#3F#)
+        and then Str (Str'First + 2) = Character'Val (16#78#)
+        and then Str (Str'First + 3) = Character'Val (16#6D#)
+      then
+         --  Utf8, ASCII, some part of ISO8859, Shift-JIS, EUC,...
+
+         Len := 0;
+         BOM := Unknown;
+
+      --  No BOM recognized
+
+      else
+         Len := 0;
+         BOM := Unknown;
+      end if;
+   end Read_BOM;
+
+end GNAT.Byte_Order_Mark;

Property changes on: g-byorma.adb
___________________________________________________________________
Name: svn:executable
   + *

Index: g-byorma.ads
===================================================================
--- g-byorma.ads	(revision 0)
+++ g-byorma.ads	(revision 0)
@@ -0,0 +1,104 @@
+------------------------------------------------------------------------------
+--                                                                          --
+--                         GNAT COMPILER COMPONENTS                         --
+--                                                                          --
+--                 G N A T . B Y T E _ O R D E R _ M A R K                  --
+--                                                                          --
+--                                 S p e c                                  --
+--                                                                          --
+--                     Copyright (C) 2006-2007, AdaCore                     --
+--                                                                          --
+-- GNAT is free software;  you can  redistribute it  and/or modify it under --
+-- terms of the  GNU General Public License as published  by the Free Soft- --
+-- ware  Foundation;  either version 2,  or (at your option) any later ver- --
+-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
+-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
+-- or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License --
+-- for  more details.  You should have  received  a copy of the GNU General --
+-- Public License  distributed with GNAT;  see file COPYING.  If not, write --
+-- to  the  Free Software Foundation,  51  Franklin  Street,  Fifth  Floor, --
+-- Boston, MA 02110-1301, USA.                                              --
+--                                                                          --
+-- As a special exception,  if other files  instantiate  generics from this --
+-- unit, or you link  this unit with other files  to produce an executable, --
+-- this  unit  does not  by itself cause  the resulting  executable  to  be --
+-- covered  by the  GNU  General  Public  License.  This exception does not --
+-- however invalidate  any other reasons why  the executable file  might be --
+-- covered by the  GNU Public License.                                      --
+--                                                                          --
+-- GNAT was originally developed  by the GNAT team at  New York University. --
+-- Extensive contributions were provided by Ada Core Technologies Inc.      --
+--                                                                          --
+------------------------------------------------------------------------------
+
+--  This package provides a procedure for reading and interpreting the BOM
+--  (byte order mark) used to publish the encoding method for a string (for
+--  example, a UTF-8 encoded file in windows will start with the appropriate
+--  BOM sequence to signal UTF-8 encoding.
+
+--  There are two cases
+
+--    Case 1. UTF encodings for Unicode files
+
+--      Here the convention is to have the first character of the file be a
+--      non-breaking zero width space character (16#0000_FEFF#). For the UTF
+--      encodings, the representation of this character can be used to uniquely
+--      determine the encoding. Furthermore, the possibility of any confusion
+--      with unencoded files is minimal, since for example the UTF-8 encoding
+--      of this character looks like the sequence:
+
+--        LC_I_Diaeresis
+--        Right_Angle_Quotation
+--        Fraction_One_Half
+
+--      which is so unlikely to occur legitimately in normal use that it can
+--      safely be ignored in most cases (for example, no legitimate Ada source
+--      file could start with this sequence of characters).
+
+--   Case 2. Specialized XML encodings
+
+--     The XML standard defines a number of other possible encodings and also
+--     defines standardized sequences for marking these encodings. This package
+--     can also optionally handle these XML defined BOM sequences. These XML
+--     cases depend on the first character of the XML file being < so that the
+--     encoding of this character can be recognized.
+
+pragma Warnings (Off);
+pragma Compiler_Unit;
+pragma Warnings (On);
+
+package GNAT.Byte_Order_Mark is
+
+   type BOM_Kind is
+     (UTF8_All,  --  UTF8-encoding
+      UTF16_LE,  --  UTF16 little-endian encoding
+      UTF16_BE,  --  UTF16 big-endian encoding
+      UTF32_LE,  --  UTF32 little-endian encoding
+      UTF32_BE,  --  UTF32 big-endian encoding
+
+      --  The following cases are for XML only
+
+      UCS4_BE,   --  UCS-4, big endian machine (1234 order)
+      UCS4_LE,   --  UCS-4, little endian machine (4321 order)
+      UCS4_2143, --  UCS-4, unusual byte order (2143 order)
+      UCS4_3412, --  UCS-4, unusual byte order (3412 order)
+
+      --  Value returned if no BOM recognized
+
+      Unknown);  --  Unknown, assumed to be ASCII compatible
+
+   procedure Read_BOM
+     (Str         : String;
+      Len         : out Natural;
+      BOM         : out BOM_Kind;
+      XML_Support : Boolean := False);
+   --  This is the routine to read the BOM from the start of the given string
+   --  Str. On return BOM is set to the appropriate BOM_Kind and Len is set to
+   --  its length. The caller will typically skip the first Len characters in
+   --  the string to ignore the BOM sequence. The special XML possibilities are
+   --  recognized only if flag XML_Support is set to True. Note that for the
+   --  XML cases, Len is always set to zero on return (not to the length of the
+   --  relevant sequence) since in the XML cases, the sequence recognized is
+   --  for the first real character in the file (<) which is not to be skipped.
+
+end GNAT.Byte_Order_Mark;

Property changes on: g-byorma.ads
___________________________________________________________________
Name: svn:executable
   + *

Index: g-decstr.adb
===================================================================
--- g-decstr.adb	(revision 0)
+++ g-decstr.adb	(revision 0)
@@ -0,0 +1,972 @@
+------------------------------------------------------------------------------
+--                                                                          --
+--                         GNAT RUN-TIME COMPONENTS                         --
+--                                                                          --
+--                    G N A T . D E C O D E _ S T R I N G                   --
+--                                                                          --
+--                                 S p e c                                  --
+--                                                                          --
+--                       Copyright (C) 2007, AdaCore                        --
+--                                                                          --
+-- GNAT is free software;  you can  redistribute it  and/or modify it under --
+-- terms of the  GNU General Public License as published  by the Free Soft- --
+-- ware  Foundation;  either version 2,  or (at your option) any later ver- --
+-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
+-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
+-- or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License --
+-- for  more details.  You should have  received  a copy of the GNU General --
+-- Public License  distributed with GNAT;  see file COPYING.  If not, write --
+-- to  the  Free Software Foundation,  51  Franklin  Street,  Fifth  Floor, --
+-- Boston, MA 02110-1301, USA.                                              --
+--                                                                          --
+-- As a special exception,  if other files  instantiate  generics from this --
+-- unit, or you link  this unit with other files  to produce an executable, --
+-- this  unit  does not  by itself cause  the resulting  executable  to  be --
+-- covered  by the  GNU  General  Public  License.  This exception does not --
+-- however invalidate  any other reasons why  the executable file  might be --
+-- covered by the  GNU Public License.                                      --
+--                                                                          --
+-- GNAT was originally developed  by the GNAT team at  New York University. --
+-- Extensive contributions were provided by Ada Core Technologies Inc.      --
+--                                                                          --
+------------------------------------------------------------------------------
+
+--  This package provides a utility routine for converting from an encoded
+--  string to a corresponding Wide_String or Wide_Wide_String value.
+
+with Interfaces; use Interfaces;
+
+with System.WCh_Cnv; use System.WCh_Cnv;
+with System.WCh_Con; use System.WCh_Con;
+
+package body GNAT.Decode_String is
+
+   -----------------------
+   -- Local Subprograms --
+   -----------------------
+
+   procedure Bad;
+   pragma No_Return (Bad);
+   --  Raise error for bad encoding
+
+   procedure Past_End;
+   pragma No_Return (Past_End);
+   --  Raise error for off end of string
+
+   ---------
+   -- Bad --
+   ---------
+
+   procedure Bad is
+   begin
+      raise Constraint_Error with
+        "bad encoding or character out of range";
+   end Bad;
+
+   ---------------------------
+   -- Decode_Wide_Character --
+   ---------------------------
+
+   procedure Decode_Wide_Character
+     (Input  : String;
+      Ptr    : in out Natural;
+      Result : out Wide_Character)
+   is
+      Char : Wide_Wide_Character;
+   begin
+      Decode_Wide_Wide_Character (Input, Ptr, Char);
+
+      if Wide_Wide_Character'Pos (Char) > 16#FFFF# then
+         Bad;
+      else
+         Result := Wide_Character'Val (Wide_Wide_Character'Pos (Char));
+      end if;
+   end Decode_Wide_Character;
+
+   ------------------------
+   -- Decode_Wide_String --
+   ------------------------
+
+   function Decode_Wide_String (S : String) return Wide_String is
+      Result : Wide_String (1 .. S'Length);
+      Length : Natural;
+   begin
+      Decode_Wide_String (S, Result, Length);
+      return Result (1 .. Length);
+   end Decode_Wide_String;
+
+   procedure Decode_Wide_String
+     (S      : String;
+      Result : out Wide_String;
+      Length : out Natural)
+   is
+      Ptr : Natural;
+
+   begin
+      Ptr := S'First;
+      Length := 0;
+      while Ptr <= S'Last loop
+         if Length >= Result'Last then
+            Past_End;
+         end if;
+
+         Length := Length + 1;
+         Decode_Wide_Character (S, Ptr, Result (Length));
+      end loop;
+   end Decode_Wide_String;
+
+   --------------------------------
+   -- Decode_Wide_Wide_Character --
+   --------------------------------
+
+   procedure Decode_Wide_Wide_Character
+     (Input  : String;
+      Ptr    : in out Natural;
+      Result : out Wide_Wide_Character)
+   is
+      C : Character;
+
+      function In_Char return Character;
+      pragma Inline (In_Char);
+      --  Function to get one input character
+
+      -------------
+      -- In_Char --
+      -------------
+
+      function In_Char return Character is
+      begin
+         if Ptr <= Input'Last then
+            Ptr := Ptr + 1;
+            return Input (Ptr - 1);
+         else
+            Past_End;
+         end if;
+      end In_Char;
+
+   --  Start of processing for Decode_Wide_Wide_Character
+
+   begin
+      C := In_Char;
+
+      --  Special fast processing for UTF-8 case
+
+      if Encoding_Method = WCEM_UTF8 then
+         UTF8 : declare
+            U : Unsigned_32;
+            W : Unsigned_32;
+
+            procedure Get_UTF_Byte;
+            pragma Inline (Get_UTF_Byte);
+            --  Used to interpret 2#10xxxxxx# continuation byte in UTF-8 mode.
+            --  Reads a byte, and raises CE if the first two bits are not 10.
+            --  Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
+
+            ------------------
+            -- Get_UTF_Byte --
+            ------------------
+
+            procedure Get_UTF_Byte is
+            begin
+               U := Unsigned_32 (Character'Pos (In_Char));
+
+               if (U and 2#11000000#) /= 2#10_000000# then
+                  Bad;
+               end if;
+
+               W := Shift_Left (W, 6) or (U and 2#00111111#);
+            end Get_UTF_Byte;
+
+         --  Start of processing for UTF8 case
+
+         begin
+            --  Note: for details of UTF8 encoding see RFC 3629
+
+            U := Unsigned_32 (Character'Pos (C));
+
+            --  16#00_0000#-16#00_007F#: 0xxxxxxx
+
+            if (U and 2#10000000#) = 2#00000000# then
+               Result := Wide_Wide_Character'Val (Character'Pos (C));
+
+            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
+
+            elsif (U and 2#11100000#) = 2#110_00000# then
+               W := U and 2#00011111#;
+               Get_UTF_Byte;
+               Result := Wide_Wide_Character'Val (W);
+
+            --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
+
+            elsif (U and 2#11110000#) = 2#1110_0000# then
+               W := U and 2#00001111#;
+               Get_UTF_Byte;
+               Get_UTF_Byte;
+               Result := Wide_Wide_Character'Val (W);
+
+            --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+            elsif (U and 2#11111000#) = 2#11110_000# then
+               W := U and 2#00000111#;
+
+               for K in 1 .. 3 loop
+                  Get_UTF_Byte;
+               end loop;
+
+               Result := Wide_Wide_Character'Val (W);
+
+            --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
+            --                               10xxxxxx 10xxxxxx
+
+            elsif (U and 2#11111100#) = 2#111110_00# then
+               W := U and 2#00000011#;
+
+               for K in 1 .. 4 loop
+                  Get_UTF_Byte;
+               end loop;
+
+               Result := Wide_Wide_Character'Val (W);
+
+            --  All other cases are invalid, note that this includes:
+
+            --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
+            --                               10xxxxxx 10xxxxxx 10xxxxxx
+
+            --  since Wide_Wide_Character does not include code values
+            --  greater than 16#03FF_FFFF#.
+
+            else
+               Bad;
+            end if;
+         end UTF8;
+
+      --  All encoding functions other than UTF-8
+
+      else
+         Non_UTF8 : declare
+            function Char_Sequence_To_UTF is
+              new Char_Sequence_To_UTF_32 (In_Char);
+
+         begin
+            --  For brackets, must test for specific case of [ not followed by
+            --  quotation, where we must not call Char_Sequence_To_UTF, but
+            --  instead just return the bracket unchanged.
+
+            if Encoding_Method = WCEM_Brackets
+              and then C = '['
+              and then (Ptr > Input'Last or else Input (Ptr) /= '"')
+            then
+               Result := '[';
+
+            --  All other cases including [" with Brackets
+
+            else
+               Result :=
+                 Wide_Wide_Character'Val
+                   (Char_Sequence_To_UTF (C, Encoding_Method));
+            end if;
+         end Non_UTF8;
+      end if;
+   end Decode_Wide_Wide_Character;
+
+   -----------------------------
+   -- Decode_Wide_Wide_String --
+   -----------------------------
+
+   function Decode_Wide_Wide_String (S : String) return Wide_Wide_String is
+      Result : Wide_Wide_String (1 .. S'Length);
+      Length : Natural;
+   begin
+      Decode_Wide_Wide_String (S, Result, Length);
+      return Result (1 .. Length);
+   end Decode_Wide_Wide_String;
+
+   procedure Decode_Wide_Wide_String
+     (S      : String;
+      Result : out Wide_Wide_String;
+      Length : out Natural)
+   is
+      Ptr : Natural;
+
+   begin
+      Ptr := S'First;
+      Length := 0;
+      while Ptr <= S'Last loop
+         if Length >= Result'Last then
+            Past_End;
+         end if;
+
+         Length := Length + 1;
+         Decode_Wide_Wide_Character (S, Ptr, Result (Length));
+      end loop;
+   end Decode_Wide_Wide_String;
+
+   -------------------------
+   -- Next_Wide_Character --
+   -------------------------
+
+   procedure Next_Wide_Character (Input : String; Ptr : in out Natural) is
+   begin
+      if Ptr < Input'First then
+         Past_End;
+      end if;
+
+      --  Special efficient encoding for UTF-8 case
+
+      if Encoding_Method = WCEM_UTF8 then
+         UTF8 : declare
+            U : Unsigned_32;
+
+            procedure Getc;
+            pragma Inline (Getc);
+            --  Gets the character at Input (Ptr) and returns code in U as
+            --  Unsigned_32 value. On return Ptr is bumped past the character.
+
+            procedure Skip_UTF_Byte;
+            pragma Inline (Skip_UTF_Byte);
+            --  Skips past one encoded byte which must be 2#10xxxxxx#
+
+            ----------
+            -- Getc --
+            ----------
+
+            procedure Getc is
+            begin
+               if Ptr > Input'Last then
+                  Past_End;
+               else
+                  U := Unsigned_32 (Character'Pos (Input (Ptr)));
+                  Ptr := Ptr + 1;
+               end if;
+            end Getc;
+
+            -------------------
+            -- Skip_UTF_Byte --
+            -------------------
+
+            procedure Skip_UTF_Byte is
+            begin
+               Getc;
+
+               if (U and 2#11000000#) /= 2#10_000000# then
+                  Bad;
+               end if;
+            end Skip_UTF_Byte;
+
+         --  Start of processing for UTF-8 case
+
+         begin
+            --  16#00_0000#-16#00_007F#: 0xxxxxxx
+
+            Getc;
+
+            if (U and 2#10000000#) = 2#00000000# then
+               return;
+
+            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
+
+            elsif (U and 2#11100000#) = 2#110_00000# then
+               Skip_UTF_Byte;
+
+            --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
+
+            elsif (U and 2#11110000#) = 2#1110_0000# then
+               Skip_UTF_Byte;
+               Skip_UTF_Byte;
+
+            --  Any other code is invalid, note that this includes:
+
+            --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+            --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
+            --                               10xxxxxx 10xxxxxx
+
+            --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
+            --                               10xxxxxx 10xxxxxx 10xxxxxx
+
+            --  since Wide_Character does not allow codes > 16#FFFF#
+
+            else
+               Bad;
+            end if;
+         end UTF8;
+
+      --  Non-UTF-8 cass
+
+      else
+         declare
+            Discard : Wide_Character;
+         begin
+            Decode_Wide_Character (Input, Ptr, Discard);
+         end;
+      end if;
+   end Next_Wide_Character;
+
+   ------------------------------
+   -- Next_Wide_Wide_Character --
+   ------------------------------
+
+   procedure Next_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
+   begin
+      --  Special efficient encoding for UTF-8 case
+
+      if Encoding_Method = WCEM_UTF8 then
+         UTF8 : declare
+            U : Unsigned_32;
+
+            procedure Getc;
+            pragma Inline (Getc);
+            --  Gets the character at Input (Ptr) and returns code in U as
+            --  Unsigned_32 value. On return Ptr is bumped past the character.
+
+            procedure Skip_UTF_Byte;
+            pragma Inline (Skip_UTF_Byte);
+            --  Skips past one encoded byte which must be 2#10xxxxxx#
+
+            ----------
+            -- Getc --
+            ----------
+
+            procedure Getc is
+            begin
+               if Ptr > Input'Last then
+                  Past_End;
+               else
+                  U := Unsigned_32 (Character'Pos (Input (Ptr)));
+                  Ptr := Ptr + 1;
+               end if;
+            end Getc;
+
+            -------------------
+            -- Skip_UTF_Byte --
+            -------------------
+
+            procedure Skip_UTF_Byte is
+            begin
+               Getc;
+
+               if (U and 2#11000000#) /= 2#10_000000# then
+                  Bad;
+               end if;
+            end Skip_UTF_Byte;
+
+         --  Start of processing for UTF-8 case
+
+         begin
+            if Ptr < Input'First then
+               Past_End;
+            end if;
+
+            --  16#00_0000#-16#00_007F#: 0xxxxxxx
+
+            Getc;
+
+            if (U and 2#10000000#) = 2#00000000# then
+               null;
+
+            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
+
+            elsif (U and 2#11100000#) = 2#110_00000# then
+               Skip_UTF_Byte;
+
+            --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
+
+            elsif (U and 2#11110000#) = 2#1110_0000# then
+               Skip_UTF_Byte;
+               Skip_UTF_Byte;
+
+            --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+            elsif (U and 2#11111000#) = 2#11110_000# then
+               for K in 1 .. 3 loop
+                  Skip_UTF_Byte;
+               end loop;
+
+            --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
+            --                               10xxxxxx 10xxxxxx
+
+            elsif (U and 2#11111100#) = 2#111110_00# then
+               for K in 1 .. 4 loop
+                  Skip_UTF_Byte;
+               end loop;
+
+            --  Any other code is invalid, note that this includes:
+
+            --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
+            --                               10xxxxxx 10xxxxxx 10xxxxxx
+
+            --  since Wide_Wide_Character does not allow codes > 16#03FF_FFFF#
+
+            else
+               Bad;
+            end if;
+         end UTF8;
+
+      --  Non-UTF-8 cass
+
+      else
+         declare
+            Discard : Wide_Wide_Character;
+         begin
+            Decode_Wide_Wide_Character (Input, Ptr, Discard);
+         end;
+      end if;
+   end Next_Wide_Wide_Character;
+
+   --------------
+   -- Past_End --
+   --------------
+
+   procedure Past_End is
+   begin
+      raise Constraint_Error with "past end of string";
+   end Past_End;
+
+   -------------------------
+   -- Prev_Wide_Character --
+   -------------------------
+
+   procedure Prev_Wide_Character (Input : String; Ptr : in out Natural) is
+   begin
+      if Ptr > Input'Last + 1 then
+         Past_End;
+      end if;
+
+      --  Special efficient encoding for UTF-8 case
+
+      if Encoding_Method = WCEM_UTF8 then
+         UTF8 : declare
+            U : Unsigned_32;
+
+            procedure Getc;
+            pragma Inline (Getc);
+            --  Gets the character at Input (Ptr - 1) and returns code in U as
+            --  Unsigned_32 value. On return Ptr is decremented by one.
+
+            procedure Skip_UTF_Byte;
+            pragma Inline (Skip_UTF_Byte);
+            --  Checks that U is 2#10xxxxxx# and then calls Get
+
+            ----------
+            -- Getc --
+            ----------
+
+            procedure Getc is
+            begin
+               if Ptr <= Input'First then
+                  Past_End;
+               else
+                  Ptr := Ptr - 1;
+                  U := Unsigned_32 (Character'Pos (Input (Ptr)));
+               end if;
+            end Getc;
+
+            -------------------
+            -- Skip_UTF_Byte --
+            -------------------
+
+            procedure Skip_UTF_Byte is
+            begin
+               if (U and 2#11000000#) = 2#10_000000# then
+                  Getc;
+               else
+                  Bad;
+               end if;
+            end Skip_UTF_Byte;
+
+         --  Start of processing for UTF-8 case
+
+         begin
+            --  16#00_0000#-16#00_007F#: 0xxxxxxx
+
+            Getc;
+
+            if (U and 2#10000000#) = 2#00000000# then
+               return;
+
+            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
+
+            else
+               Skip_UTF_Byte;
+
+               if (U and 2#11100000#) = 2#110_00000# then
+                  return;
+
+               --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
+
+               else
+                  Skip_UTF_Byte;
+
+                  if (U and 2#11110000#) = 2#1110_0000# then
+                     return;
+
+                     --  Any other code is invalid, note that this includes:
+
+                     --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
+                     --                           10xxxxxx
+
+                     --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
+                     --                               10xxxxxx 10xxxxxx
+                     --                               10xxxxxx
+
+                     --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
+                     --                               10xxxxxx 10xxxxxx
+                     --                               10xxxxxx 10xxxxxx
+
+                     --  since Wide_Character does not allow codes > 16#FFFF#
+
+                  else
+                     Bad;
+                  end if;
+               end if;
+            end if;
+         end UTF8;
+
+      --  Special efficient encoding for brackets case
+
+      elsif Encoding_Method = WCEM_Brackets then
+         Brackets : declare
+            P : Natural;
+            S : Natural;
+
+         begin
+            --  See if we have "] at end positions
+
+            if Ptr > Input'First + 1
+              and then Input (Ptr - 1) = ']'
+              and then Input (Ptr - 2) = '"'
+            then
+               P := Ptr - 2;
+
+               --  Loop back looking for [" at start
+
+               while P >= Ptr - 10 loop
+                  if P <= Input'First + 1 then
+                     Bad;
+
+                  elsif Input (P - 1) = '"'
+                    and then Input (P - 2) = '['
+                  then
+                     --  Found ["..."], scan forward to check it
+
+                     S := P - 2;
+                     P := S;
+                     Next_Wide_Character (Input, P);
+
+                     --  OK if at original pointer, else error
+
+                     if P = Ptr then
+                        Ptr := S;
+                        return;
+                     else
+                        Bad;
+                     end if;
+                  end if;
+
+                  P := P - 1;
+               end loop;
+
+               --  Falling through loop means more than 8 chars between the
+               --  enclosing brackets (or simply a missing left bracket)
+
+               Bad;
+
+            --  Here if no bracket sequence present
+
+            else
+               if Ptr = Input'First then
+                  Past_End;
+               else
+                  Ptr := Ptr - 1;
+               end if;
+            end if;
+         end Brackets;
+
+      --  Non-UTF-8/Brackets. These are the inefficient cases where we have to
+      --  go to the start of the string and skip forwards till Ptr matches.
+
+      else
+         Non_UTF_Brackets : declare
+            Discard : Wide_Character;
+            PtrS    : Natural;
+            PtrP    : Natural;
+
+         begin
+            PtrS := Input'First;
+
+            if Ptr <= PtrS then
+               Past_End;
+            end if;
+
+            loop
+               PtrP := PtrS;
+               Decode_Wide_Character (Input, PtrS, Discard);
+
+               if PtrS = Ptr then
+                  Ptr := PtrP;
+                  return;
+
+               elsif PtrS > Ptr then
+                  Bad;
+               end if;
+            end loop;
+
+         exception
+            when Constraint_Error =>
+               Bad;
+         end Non_UTF_Brackets;
+      end if;
+   end Prev_Wide_Character;
+
+   ------------------------------
+   -- Prev_Wide_Wide_Character --
+   ------------------------------
+
+   procedure Prev_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
+   begin
+      if Ptr > Input'Last + 1 then
+         Past_End;
+      end if;
+
+      --  Special efficient encoding for UTF-8 case
+
+      if Encoding_Method = WCEM_UTF8 then
+         UTF8 : declare
+            U : Unsigned_32;
+
+            procedure Getc;
+            pragma Inline (Getc);
+            --  Gets the character at Input (Ptr - 1) and returns code in U as
+            --  Unsigned_32 value. On return Ptr is decremented by one.
+
+            procedure Skip_UTF_Byte;
+            pragma Inline (Skip_UTF_Byte);
+            --  Checks that U is 2#10xxxxxx# and then calls Get
+
+            ----------
+            -- Getc --
+            ----------
+
+            procedure Getc is
+            begin
+               if Ptr <= Input'First then
+                  Past_End;
+               else
+                  Ptr := Ptr - 1;
+                  U := Unsigned_32 (Character'Pos (Input (Ptr)));
+               end if;
+            end Getc;
+
+            -------------------
+            -- Skip_UTF_Byte --
+            -------------------
+
+            procedure Skip_UTF_Byte is
+            begin
+               if (U and 2#11000000#) = 2#10_000000# then
+                  Getc;
+               else
+                  Bad;
+               end if;
+            end Skip_UTF_Byte;
+
+         --  Start of processing for UTF-8 case
+
+         begin
+            --  16#00_0000#-16#00_007F#: 0xxxxxxx
+
+            Getc;
+
+            if (U and 2#10000000#) = 2#00000000# then
+               return;
+
+            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
+
+            else
+               Skip_UTF_Byte;
+
+               if (U and 2#11100000#) = 2#110_00000# then
+                  return;
+
+               --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
+
+               else
+                  Skip_UTF_Byte;
+
+                  if (U and 2#11110000#) = 2#1110_0000# then
+                     return;
+
+                  --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
+                  --                           10xxxxxx
+
+                  else
+                     Skip_UTF_Byte;
+
+                     if (U and 2#11111000#) = 2#11110_000# then
+                        return;
+
+                     --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
+                     --                               10xxxxxx 10xxxxxx
+                     --                               10xxxxxx
+
+                     else
+                        Skip_UTF_Byte;
+
+                        if (U and 2#11111100#) = 2#111110_00# then
+                           return;
+
+                        --  Any other code is invalid, note that this includes:
+
+                        --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
+                        --                               10xxxxxx 10xxxxxx
+                        --                               10xxxxxx 10xxxxxx
+
+                        --  since Wide_Wide_Character does not allow codes
+                        --  greater than 16#03FF_FFFF#
+
+                        else
+                           Bad;
+                        end if;
+                     end if;
+                  end if;
+               end if;
+            end if;
+         end UTF8;
+
+      --  Special efficient encoding for brackets case
+
+      elsif Encoding_Method = WCEM_Brackets then
+         Brackets : declare
+            P : Natural;
+            S : Natural;
+
+         begin
+            --  See if we have "] at end positions
+
+            if Ptr > Input'First + 1
+              and then Input (Ptr - 1) = ']'
+              and then Input (Ptr - 2) = '"'
+            then
+               P := Ptr - 2;
+
+               --  Loop back looking for [" at start
+
+               while P >= Ptr - 10 loop
+                  if P <= Input'First + 1 then
+                     Bad;
+
+                  elsif Input (P - 1) = '"'
+                    and then Input (P - 2) = '['
+                  then
+                     --  Found ["..."], scan forward to check it
+
+                     S := P - 2;
+                     P := S;
+                     Next_Wide_Wide_Character (Input, P);
+
+                     --  OK if at original pointer, else error
+
+                     if P = Ptr then
+                        Ptr := S;
+                        return;
+                     else
+                        Bad;
+                     end if;
+                  end if;
+
+                  P := P - 1;
+               end loop;
+
+               --  Falling through loop means more than 8 chars between the
+               --  enclosing brackets (or simply a missing left bracket)
+
+               Bad;
+
+            --  Here if no bracket sequence present
+
+            else
+               if Ptr = Input'First then
+                  Past_End;
+               else
+                  Ptr := Ptr - 1;
+               end if;
+            end if;
+         end Brackets;
+
+      --  Non-UTF-8/Brackets. These are the inefficient cases where we have to
+      --  go to the start of the string and skip forwards till Ptr matches.
+
+      else
+         Non_UTF8_Brackets : declare
+            Discard : Wide_Wide_Character;
+            PtrS    : Natural;
+            PtrP    : Natural;
+
+         begin
+            PtrS := Input'First;
+
+            if Ptr <= PtrS then
+               Past_End;
+            end if;
+
+            loop
+               PtrP := PtrS;
+               Decode_Wide_Wide_Character (Input, PtrS, Discard);
+
+               if PtrS = Ptr then
+                  Ptr := PtrP;
+                  return;
+
+               elsif PtrS > Ptr then
+                  Bad;
+               end if;
+            end loop;
+
+         exception
+             when Constraint_Error =>
+               Bad;
+         end Non_UTF8_Brackets;
+      end if;
+   end Prev_Wide_Wide_Character;
+
+   --------------------------
+   -- Validate_Wide_String --
+   --------------------------
+
+   function Validate_Wide_String (S : String) return Boolean is
+      Ptr : Natural;
+
+   begin
+      Ptr := S'First;
+      while Ptr <= S'Last loop
+         Next_Wide_Character (S, Ptr);
+      end loop;
+
+      return True;
+
+   exception
+      when Constraint_Error =>
+         return False;
+   end Validate_Wide_String;
+
+   -------------------------------
+   -- Validate_Wide_Wide_String --
+   -------------------------------
+
+   function Validate_Wide_Wide_String (S : String) return Boolean is
+      Ptr : Natural;
+
+   begin
+      Ptr := S'First;
+      while Ptr <= S'Last loop
+         Next_Wide_Wide_Character (S, Ptr);
+      end loop;
+
+      return True;
+
+   exception
+      when Constraint_Error =>
+         return False;
+   end Validate_Wide_Wide_String;
+
+end GNAT.Decode_String;

Property changes on: g-decstr.adb
___________________________________________________________________
Name: svn:executable
   + *

Index: g-decstr.ads
===================================================================
--- g-decstr.ads	(revision 0)
+++ g-decstr.ads	(revision 0)
@@ -0,0 +1,163 @@
+------------------------------------------------------------------------------
+--                                                                          --
+--                         GNAT RUN-TIME COMPONENTS                         --
+--                                                                          --
+--                    G N A T . D E C O D E _ S T R I N G                   --
+--                                                                          --
+--                                 S p e c                                  --
+--                                                                          --
+--                       Copyright (C) 2007, AdaCore                        --
+--                                                                          --
+-- GNAT is free software;  you can  redistribute it  and/or modify it under --
+-- terms of the  GNU General Public License as published  by the Free Soft- --
+-- ware  Foundation;  either version 2,  or (at your option) any later ver- --
+-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
+-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
+-- or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License --
+-- for  more details.  You should have  received  a copy of the GNU General --
+-- Public License  distributed with GNAT;  see file COPYING.  If not, write --
+-- to  the  Free Software Foundation,  51  Franklin  Street,  Fifth  Floor, --
+-- Boston, MA 02110-1301, USA.                                              --
+--                                                                          --
+-- As a special exception,  if other files  instantiate  generics from this --
+-- unit, or you link  this unit with other files  to produce an executable, --
+-- this  unit  does not  by itself cause  the resulting  executable  to  be --
+-- covered  by the  GNU  General  Public  License.  This exception does not --
+-- however invalidate  any other reasons why  the executable file  might be --
+-- covered by the  GNU Public License.                                      --
+--                                                                          --
+-- GNAT was originally developed  by the GNAT team at  New York University. --
+-- Extensive contributions were provided by Ada Core Technologies Inc.      --
+--                                                                          --
+------------------------------------------------------------------------------
+
+--  This generic package provides utility routines for converting from an
+--  encoded string to a corresponding Wide_String or Wide_Wide_String value
+--  using a specified encoding convention, which is supplied as the generic
+--  parameter. UTF-8 is handled especially efficiently, and if the encoding
+--  method is known at compile time to be WCEM_UTF8, then the instantiation
+--  is specialized to handle only the UTF-8 case and exclude code for the
+--  other encoding methods. The package also provides positioning routines
+--  for skipping encoded characters in either direction, and for validating
+--  strings for correct encodings.
+
+--  Note: this package is only about decoding sequences of 8-bit characters
+--  into corresponding 16-bit Wide_String or 32-bit Wide_Wide_String values.
+--  It knows nothing at all about the character encodings being used for the
+--  resulting Wide_Character and Wide_Wide_Character values. Most often this
+--  will be Unicode/ISO-10646 as specified by the Ada RM, but this package
+--  does not make any assumptions about the character coding. See also the
+--  packages Ada.Wide_[Wide_]Characters.Unicode for unicode specific functions.
+
+--  Note on the use of brackets encoding (WCEM_Brackets). The brackets encoding
+--  method is ambiguous in the context of this package, since there is no way
+--  to tell if ["1234"] is eight unencoded characters or one encoded character.
+--  In the context of Ada sources, any sequence starting [" must be the start
+--  of an encoding (since that sequence is not valid in Ada source otherwise).
+--  The routines in this package use the same approach. If the input string
+--  contains the sequence [" then this is assumed to be the start of a brackets
+--  encoding sequence, and if it does not match the syntax, an error is raised.
+--  In the case of the Prev functions, a sequence ending with "] is assumed to
+--  be a valid brackets sequence, and an error is raised if it is not.
+
+with System.WCh_Con;
+
+generic
+   Encoding_Method : System.WCh_Con.WC_Encoding_Method;
+
+package GNAT.Decode_String is
+   pragma Pure;
+
+   function Decode_Wide_String (S : String) return Wide_String;
+   pragma Inline (Decode_Wide_String);
+   --  Decode the given String, which is encoded using the indicated coding
+   --  method, returning the corresponding decoded Wide_String value. If S
+   --  contains a character code that cannot be represented with the given
+   --  encoding, then Constraint_Error is raised.
+
+   procedure Decode_Wide_String
+     (S      : String;
+      Result : out Wide_String;
+      Length : out Natural);
+   --  Similar to the above function except that the result is stored in the
+   --  given Wide_String variable Result, starting at Result (Result'First). On
+   --  return, Length is set to the number of characters stored in Result. The
+   --  caller must ensure that Result is long enough (an easy choice is to set
+   --  the length equal to the S'Length, since decoding can never increase the
+   --  string length). If the length of Result is insufficient Constraint_Error
+   --  will be raised.
+
+   function Decode_Wide_Wide_String (S : String) return Wide_Wide_String;
+   pragma Inline (Decode_Wide_Wide_String);
+   --  Same as above function but for Wide_Wide_String output
+
+   procedure Decode_Wide_Wide_String
+     (S      : String;
+      Result : out Wide_Wide_String;
+      Length : out Natural);
+   --  Same as above procedure, but for Wide_Wide_String output
+
+   function Validate_Wide_String (S : String) return Boolean;
+   --  This function inspects the string S to determine if it contains only
+   --  valid encodings corresponding to Wide_Character values using the
+   --  given encoding. If a call to Decode_Wide_String (S) would return
+   --  without raising Constraint_Error, then Validate_Wide_String will
+   --  return True. If the call would have raised Constraint_Error, then
+   --  Validate_Wide_String will return False.
+
+   function Validate_Wide_Wide_String (S : String) return Boolean;
+   --  Similar to Validate_Wide_String, except that it succeeds if the string
+   --  contains only encodings corresponding to Wide_Wide_Character values.
+
+   procedure Decode_Wide_Character
+     (Input  : String;
+      Ptr    : in out Natural;
+      Result : out Wide_Character);
+   pragma Inline (Decode_Wide_Character);
+   --  This is a lower level procedure that decodes a single character using
+   --  the given encoding method. The encoded character is stored in Input,
+   --  starting at Input (Ptr). The resulting output character is stored in
+   --  Result, and on return Ptr is updated past the input character or
+   --  encoding sequence. Constraint_Error will be raised if the input has
+   --  has a character that cannot be represented using the given encoding,
+   --  or if Ptr is outside the bounds of the Input string.
+
+   procedure Decode_Wide_Wide_Character
+     (Input  : String;
+      Ptr    : in out Natural;
+      Result : out Wide_Wide_Character);
+   --  Same as above procedure but with Wide_Wide_Character input
+
+   procedure Next_Wide_Character (Input : String; Ptr : in out Natural);
+   --  This procedure examines the input string starting at Input (Ptr), and
+   --  advances Ptr past one character in the encoded string, so that on return
+   --  Ptr points to the next encoded character. Constraint_Error is raised if
+   --  an invalid encoding is encountered, or the end of the string is reached
+   --  or if Ptr is less than String'First on entry, or if the character
+   --  skipped is not a valid Wide_Character code. This call may be more
+   --  efficient than calling Decode_Wide_Character and discarding the result.
+
+   procedure Prev_Wide_Character (Input : String; Ptr : in out Natural);
+   --  This procedure is similar to Next_Encoded_Character except that it moves
+   --  backwards in the string, so that on return, Ptr is set to point to the
+   --  previous encoded character. Constraint_Error is raised if the start of
+   --  the string is encountered. It is valid for Ptr to be one past the end
+   --  of the string for this call (in which case on return it will point to
+   --  the last encoded character).
+   --
+   --  Note: it is not generally possible to do this function efficiently with
+   --  all encodings, the current implementation is only efficient for the case
+   --  of UTF-8 (Encoding_Method = WCEM_UTF8) and Brackets (Encoding_Method =
+   --  WCEM_Brackets). For all other encodings, we work by starting at the
+   --  beginning of the string and moving forward till Ptr is reached, which
+   --  is correct but slow.
+
+   procedure Next_Wide_Wide_Character (Input : String; Ptr : in out Natural);
+   --  Similar to Next_Wide_Character except that codes skipped must be valid
+   --  Wide_Wide_Character codes.
+
+   procedure Prev_Wide_Wide_Character (Input : String; Ptr : in out Natural);
+   --  Similar to Prev_Wide_Character except that codes skipped must be valid
+   --  Wide_Wide_Character codes.
+
+end GNAT.Decode_String;

Property changes on: g-decstr.ads
___________________________________________________________________
Name: svn:executable
   + *

Index: g-deutst.ads
===================================================================
--- g-deutst.ads	(revision 0)
+++ g-deutst.ads	(revision 0)
@@ -0,0 +1,45 @@
+------------------------------------------------------------------------------
+--                                                                          --
+--                         GNAT RUN-TIME COMPONENTS                         --
+--                                                                          --
+--               G N A T . D E C O D E _ U T F 8 _ S T R I N G              --
+--                                                                          --
+--                                 S p e c                                  --
+--                                                                          --
+--                       Copyright (C) 2007, AdaCore                        --
+--                                                                          --
+-- GNAT is free software;  you can  redistribute it  and/or modify it under --
+-- terms of the  GNU General Public License as published  by the Free Soft- --
+-- ware  Foundation;  either version 2,  or (at your option) any later ver- --
+-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
+-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
+-- or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License --
+-- for  more details.  You should have  received  a copy of the GNU General --
+-- Public License  distributed with GNAT;  see file COPYING.  If not, write --
+-- to  the  Free Software Foundation,  51  Franklin  Street,  Fifth  Floor, --
+-- Boston, MA 02110-1301, USA.                                              --
+--                                                                          --
+-- As a special exception,  if other files  instantiate  generics from this --
+-- unit, or you link  this unit with other files  to produce an executable, --
+-- this  unit  does not  by itself cause  the resulting  executable  to  be --
+-- covered  by the  GNU  General  Public  License.  This exception does not --
+-- however invalidate  any other reasons why  the executable file  might be --
+-- covered by the  GNU Public License.                                      --
+--                                                                          --
+-- GNAT was originally developed  by the GNAT team at  New York University. --
+-- Extensive contributions were provided by Ada Core Technologies Inc.      --
+--                                                                          --
+------------------------------------------------------------------------------
+
+--  This package provides a pre-instantiation of GNAT.Decode_String for the
+--  common case of UTF-8 encoding. As noted in the documentation of that
+--  package, this UTF-8 instantiation is efficient and specialized so that
+--  it has only the code for the UTF-8 case. See g-decstr.ads for full
+--  documentation on this package.
+
+with GNAT.Decode_String;
+
+with System.WCh_Con;
+
+package GNAT.Decode_UTF8_String is
+  new GNAT.Decode_String (System.WCh_Con.WCEM_UTF8);
Index: g-encstr.adb
===================================================================
--- g-encstr.adb	(revision 0)
+++ g-encstr.adb	(revision 0)
@@ -0,0 +1,260 @@
+------------------------------------------------------------------------------
+--                                                                          --
+--                         GNAT RUN-TIME COMPONENTS                         --
+--                                                                          --
+--                    G N A T . E N C O D E _ S T R I N G                   --
+--                                                                          --
+--                                 B o d y                                  --
+--                                                                          --
+--                       Copyright (C) 2007, AdaCore                        --
+--                                                                          --
+-- GNAT is free software;  you can  redistribute it  and/or modify it under --
+-- terms of the  GNU General Public License as published  by the Free Soft- --
+-- ware  Foundation;  either version 2,  or (at your option) any later ver- --
+-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
+-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
+-- or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License --
+-- for  more details.  You should have  received  a copy of the GNU General --
+-- Public License  distributed with GNAT;  see file COPYING.  If not, write --
+-- to  the  Free Software Foundation,  51  Franklin  Street,  Fifth  Floor, --
+-- Boston, MA 02110-1301, USA.                                              --
+--                                                                          --
+-- As a special exception,  if other files  instantiate  generics from this --
+-- unit, or you link  this unit with other files  to produce an executable, --
+-- this  unit  does not  by itself cause  the resulting  executable  to  be --
+-- covered  by the  GNU  General  Public  License.  This exception does not --
+-- however invalidate  any other reasons why  the executable file  might be --
+-- covered by the  GNU Public License.                                      --
+--                                                                          --
+-- GNAT was originally developed  by the GNAT team at  New York University. --
+-- Extensive contributions were provided by Ada Core Technologies Inc.      --
+--                                                                          --
+------------------------------------------------------------------------------
+
+with Interfaces; use Interfaces;
+
+with System.WCh_Con; use System.WCh_Con;
+with System.WCh_Cnv; use System.WCh_Cnv;
+
+package body GNAT.Encode_String is
+
+   -----------------------
+   -- Local Subprograms --
+   -----------------------
+
+   procedure Bad;
+   pragma No_Return (Bad);
+   --  Raise error for bad character code
+
+   procedure Past_End;
+   pragma No_Return (Past_End);
+   --  Raise error for off end of string
+
+   ---------
+   -- Bad --
+   ---------
+
+   procedure Bad is
+   begin
+      raise Constraint_Error with
+        "character cannot be encoded with given Encoding_Method";
+   end Bad;
+
+   ------------------------
+   -- Encode_Wide_String --
+   ------------------------
+
+   function Encode_Wide_String (S : Wide_String) return String is
+      Long : constant Natural := WC_Longest_Sequences (Encoding_Method);
+      Result : String (1 .. S'Length * Long);
+      Length : Natural;
+   begin
+      Encode_Wide_String (S, Result, Length);
+      return Result (1 .. Length);
+   end Encode_Wide_String;
+
+   procedure Encode_Wide_String
+     (S      : Wide_String;
+      Result : out String;
+      Length : out Natural)
+   is
+      Ptr : Natural;
+
+   begin
+      Ptr := S'First;
+      for J in S'Range loop
+         Encode_Wide_Character (S (J), Result, Ptr);
+      end loop;
+
+      Length := Ptr - S'First;
+   end Encode_Wide_String;
+
+   -----------------------------
+   -- Encode_Wide_Wide_String --
+   -----------------------------
+
+   function Encode_Wide_Wide_String (S : Wide_Wide_String) return String is
+      Long : constant Natural := WC_Longest_Sequences (Encoding_Method);
+      Result : String (1 .. S'Length * Long);
+      Length : Natural;
+   begin
+      Encode_Wide_Wide_String (S, Result, Length);
+      return Result (1 .. Length);
+   end Encode_Wide_Wide_String;
+
+   procedure Encode_Wide_Wide_String
+     (S      : Wide_Wide_String;
+      Result : out String;
+      Length : out Natural)
+   is
+      Ptr : Natural;
+
+   begin
+      Ptr := S'First;
+      for J in S'Range loop
+         Encode_Wide_Wide_Character (S (J), Result, Ptr);
+      end loop;
+
+      Length := Ptr - S'First;
+   end Encode_Wide_Wide_String;
+
+   ---------------------------
+   -- Encode_Wide_Character --
+   ---------------------------
+
+   procedure Encode_Wide_Character
+     (Char   : Wide_Character;
+      Result : in out String;
+      Ptr    : in out Natural)
+   is
+   begin
+      Encode_Wide_Wide_Character
+        (Wide_Wide_Character'Val (Wide_Character'Pos (Char)), Result, Ptr);
+
+   exception
+      when Constraint_Error =>
+         Bad;
+   end Encode_Wide_Character;
+
+   --------------------------------
+   -- Encode_Wide_Wide_Character --
+   --------------------------------
+
+   procedure Encode_Wide_Wide_Character
+     (Char   : Wide_Wide_Character;
+      Result : in out String;
+      Ptr    : in out Natural)
+   is
+      U : Unsigned_32;
+
+      procedure Out_Char (C : Character);
+      pragma Inline (Out_Char);
+      --  Procedure to store one character for instantiation below
+
+      --------------
+      -- Out_Char --
+      --------------
+
+      procedure Out_Char (C : Character) is
+      begin
+         if Ptr > Result'Last then
+            Past_End;
+         else
+            Result (Ptr) := C;
+            Ptr := Ptr + 1;
+         end if;
+      end Out_Char;
+
+   --  Start of processing for Encode_Wide_Wide_Character;
+
+   begin
+      --  Efficient code for UTF-8 case
+
+      if Encoding_Method = WCEM_UTF8 then
+
+         --  Note: for details of UTF8 encoding see RFC 3629
+
+         U := Unsigned_32 (Wide_Wide_Character'Pos (Char));
+
+         --  16#00_0000#-16#00_007F#: 0xxxxxxx
+
+         if U <= 16#00_007F# then
+            Out_Char (Character'Val (U));
+
+         --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
+
+         elsif U <= 16#00_07FF# then
+            Out_Char (Character'Val (2#11000000# or Shift_Right (U, 6)));
+            Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
+
+         --  16#00_0800#-16#00_FFFF#: 1110xxxx 10xxxxxx 10xxxxxx
+
+         elsif U <= 16#00_FFFF# then
+            Out_Char (Character'Val (2#11100000# or Shift_Right (U, 12)));
+            Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
+                                                          and 2#00111111#)));
+            Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
+
+         --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+         elsif U <= 16#10_FFFF# then
+            Out_Char (Character'Val (2#11110000# or Shift_Right (U, 18)));
+            Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12)
+                                                          and 2#00111111#)));
+            Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
+                                                       and 2#00111111#)));
+            Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
+
+         --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
+         --                               10xxxxxx 10xxxxxx
+
+         elsif U <= 16#03FF_FFFF# then
+            Out_Char (Character'Val (2#11111000# or Shift_Right (U, 24)));
+            Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 18)
+                                                       and 2#00111111#)));
+            Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12)
+                                                       and 2#00111111#)));
+            Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
+                                                       and 2#00111111#)));
+            Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
+
+         --  All other cases are invalid character codes, not this includes:
+
+         --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
+         --                               10xxxxxx 10xxxxxx 10xxxxxx
+
+         --  since Wide_Wide_Character values cannot exceed 16#3F_FFFF#
+
+         else
+            Bad;
+         end if;
+
+      --  All encoding methods other than UTF-8
+
+      else
+         Non_UTF8 : declare
+            procedure UTF_32_To_String is
+              new UTF_32_To_Char_Sequence (Out_Char);
+            --  Instantiate conversion procedure with above Out_Char routine
+
+         begin
+            UTF_32_To_String
+              (UTF_32_Code (Wide_Wide_Character'Pos (Char)), Encoding_Method);
+
+         exception
+            when Constraint_Error =>
+               Bad;
+         end Non_UTF8;
+      end if;
+   end Encode_Wide_Wide_Character;
+
+   --------------
+   -- Past_End --
+   --------------
+
+   procedure Past_End is
+   begin
+      raise Constraint_Error with "past end of string";
+   end Past_End;
+
+end GNAT.Encode_String;

Property changes on: g-encstr.adb
___________________________________________________________________
Name: svn:executable
   + *

Index: g-encstr.ads
===================================================================
--- g-encstr.ads	(revision 0)
+++ g-encstr.ads	(revision 0)
@@ -0,0 +1,111 @@
+------------------------------------------------------------------------------
+--                                                                          --
+--                         GNAT RUN-TIME COMPONENTS                         --
+--                                                                          --
+--                    G N A T . E N C O D E _ S T R I N G                   --
+--                                                                          --
+--                                 S p e c                                  --
+--                                                                          --
+--                       Copyright (C) 2007, AdaCore                        --
+--                                                                          --
+-- GNAT is free software;  you can  redistribute it  and/or modify it under --
+-- terms of the  GNU General Public License as published  by the Free Soft- --
+-- ware  Foundation;  either version 2,  or (at your option) any later ver- --
+-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
+-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
+-- or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License --
+-- for  more details.  You should have  received  a copy of the GNU General --
+-- Public License  distributed with GNAT;  see file COPYING.  If not, write --
+-- to  the  Free Software Foundation,  51  Franklin  Street,  Fifth  Floor, --
+-- Boston, MA 02110-1301, USA.                                              --
+--                                                                          --
+-- As a special exception,  if other files  instantiate  generics from this --
+-- unit, or you link  this unit with other files  to produce an executable, --
+-- this  unit  does not  by itself cause  the resulting  executable  to  be --
+-- covered  by the  GNU  General  Public  License.  This exception does not --
+-- however invalidate  any other reasons why  the executable file  might be --
+-- covered by the  GNU Public License.                                      --
+--                                                                          --
+-- GNAT was originally developed  by the GNAT team at  New York University. --
+-- Extensive contributions were provided by Ada Core Technologies Inc.      --
+--                                                                          --
+------------------------------------------------------------------------------
+
+--  This generic package provides utility routines for converting from
+--  Wide_String or Wide_Wide_String to encoded String using a specified
+--  encoding convention, which is supplied as the generic parameter. If
+--  this parameter is a known at compile time constant (e.g. a constant
+--  definned in System.WCh_Con), the instantiation is specialized so that
+--  it applies only to this specified coding.
+
+--  Note: this package is only about encoding sequences of 16- or 32-bit
+--  characters into a sequence of 8-bit codes. It knows nothing at all about
+--  the character encodings being used for the input Wide_Character and
+--  Wide_Wide_Character values, although some of the encoding methods (notably
+--  JIS and EUC) have built in assumptions about the range of possible input
+--  code values. Most often the input will be Unicode/ISO-10646 as specified by
+--  the Ada RM, but this package does not make any assumptions about the
+--  character coding, and in the case of UTF-8 all possible code values can be
+--  encoded. See also the packages Ada.Wide_[Wide_]Characters.Unicode for
+--  unicode specific functions.
+
+--  Note on brackets encoding (WCEM_Brackets). On input, upper half characters
+--  can be represented as ["hh"] but the routines in this package will only use
+--  brackets encodings for codes higher than 16#FF#, so upper half characters
+--  will be output as single Character values.
+
+with System.WCh_Con;
+
+generic
+   Encoding_Method : System.WCh_Con.WC_Encoding_Method;
+
+package GNAT.Encode_String is
+   pragma Pure;
+
+   function Encode_Wide_String (S : Wide_String) return String;
+   pragma Inline (Encode_Wide_String);
+   --  Encode the given Wide_String, returning a String encoded using the
+   --  given encoding method. Constraint_Error will be raised if the encoding
+   --  method cannot accomodate the input data.
+
+   procedure Encode_Wide_String
+     (S      : Wide_String;
+      Result : out String;
+      Length : out Natural);
+   --  Encode the given Wide_String, storing the encoded string in Result,
+   --  with Length being set to the length of the encoded string. The caller
+   --  must ensure that Result is long enough (see useful constants defined
+   --  in System.WCh_Con: WC_Longest_Sequence, WC_Longest_Sequences). If the
+   --  length of Result is insufficient Constraint_Error will be raised.
+   --  Constraint_Error will also be raised if the encoding method cannot
+   --  accomodate the input data.
+
+   function Encode_Wide_Wide_String (S : Wide_Wide_String) return String;
+   pragma Inline (Encode_Wide_Wide_String);
+   --  Same as above function but for Wide_Wide_String input
+
+   procedure Encode_Wide_Wide_String
+     (S      : Wide_Wide_String;
+      Result : out String;
+      Length : out Natural);
+   --  Same as above procedure, but for Wide_Wide_String input
+
+   procedure Encode_Wide_Character
+     (Char   : Wide_Character;
+      Result : in out String;
+      Ptr    : in out Natural);
+   pragma Inline (Encode_Wide_Character);
+   --  This is a lower level procedure that encodes the single character Char.
+   --  The output is stored in Result starting at Result (Ptr), and Ptr is
+   --  updated past the stored value. Constraint_Error is raised if Result
+   --  is not long enough to accomodate the result, or if the encoding method
+   --  specified does not accomodate the input character value, or if Ptr is
+   --  outside the bounds of the Result string.
+
+   procedure Encode_Wide_Wide_Character
+     (Char   : Wide_Wide_Character;
+      Result : in out String;
+      Ptr    : in out Natural);
+   --  Same as above procedure but with Wide_Wide_Character input
+
+end GNAT.Encode_String;

Property changes on: g-encstr.ads
___________________________________________________________________
Name: svn:executable
   + *

Index: g-enutst.ads
===================================================================
--- g-enutst.ads	(revision 0)
+++ g-enutst.ads	(revision 0)
@@ -0,0 +1,45 @@
+------------------------------------------------------------------------------
+--                                                                          --
+--                         GNAT RUN-TIME COMPONENTS                         --
+--                                                                          --
+--               G N A T . E N C O D E _ U T F 8 _ S T R I N G              --
+--                                                                          --
+--                                 S p e c                                  --
+--                                                                          --
+--                       Copyright (C) 2007, AdaCore                        --
+--                                                                          --
+-- GNAT is free software;  you can  redistribute it  and/or modify it under --
+-- terms of the  GNU General Public License as published  by the Free Soft- --
+-- ware  Foundation;  either version 2,  or (at your option) any later ver- --
+-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
+-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
+-- or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License --
+-- for  more details.  You should have  received  a copy of the GNU General --
+-- Public License  distributed with GNAT;  see file COPYING.  If not, write --
+-- to  the  Free Software Foundation,  51  Franklin  Street,  Fifth  Floor, --
+-- Boston, MA 02110-1301, USA.                                              --
+--                                                                          --
+-- As a special exception,  if other files  instantiate  generics from this --
+-- unit, or you link  this unit with other files  to produce an executable, --
+-- this  unit  does not  by itself cause  the resulting  executable  to  be --
+-- covered  by the  GNU  General  Public  License.  This exception does not --
+-- however invalidate  any other reasons why  the executable file  might be --
+-- covered by the  GNU Public License.                                      --
+--                                                                          --
+-- GNAT was originally developed  by the GNAT team at  New York University. --
+-- Extensive contributions were provided by Ada Core Technologies Inc.      --
+--                                                                          --
+------------------------------------------------------------------------------
+
+--  This package provides a pre-instantiation of GNAT.Encode_String for the
+--  common case of UTF-8 encoding. As noted in the documentation of that
+--  package, this UTF-8 instantiation is efficient and specialized so that
+--  it has only the code for the UTF-8 case. See g-encstr.ads for full
+--  documentation on this package.
+
+with GNAT.Encode_String;
+
+with System.WCh_Con;
+
+package GNAT.Encode_UTF8_String is
+  new GNAT.Encode_String (System.WCh_Con.WCEM_UTF8);
Index: scn.adb
===================================================================
--- scn.adb	(revision 130811)
+++ scn.adb	(working copy)
@@ -28,6 +28,7 @@ with Csets;    use Csets;
 with Hostparm; use Hostparm;
 with Namet;    use Namet;
 with Opt;      use Opt;
+with Output;   use Output;
 with Restrict; use Restrict;
 with Rident;   use Rident;
 with Scans;    use Scans;
@@ -35,6 +36,10 @@ with Sinfo;    use Sinfo;
 with Sinput;   use Sinput;
 with Uintp;    use Uintp;
 
+with GNAT.Byte_Order_Mark; use GNAT.Byte_Order_Mark;
+
+with System.WCh_Con; use System.WCh_Con;
+
 package body Scn is
 
    use ASCII;
@@ -266,6 +271,42 @@ package body Scn is
          Set_License (Current_Source_File, Determine_License);
       end if;
 
+      --  Check for BOM
+
+      declare
+         BOM : BOM_Kind;
+         Len : Natural;
+         Tst : String (1 .. 5);
+
+      begin
+         for J in 1 .. 5 loop
+            Tst (J) := Source (Scan_Ptr + Source_Ptr (J) - 1);
+         end loop;
+
+         Read_BOM (Tst, Len, BOM, False);
+
+         case BOM is
+            when UTF8_All =>
+               Scan_Ptr := Scan_Ptr + Source_Ptr (Len);
+               Wide_Character_Encoding_Method := WCEM_UTF8;
+               Upper_Half_Encoding := True;
+
+            when UTF16_LE | UTF16_BE =>
+               Write_Line ("UTF-16 encoding format not recognized");
+               raise Unrecoverable_Error;
+
+            when UTF32_LE | UTF32_BE =>
+               Write_Line ("UTF-32 encoding format not recognized");
+               raise Unrecoverable_Error;
+
+            when Unknown =>
+               null;
+
+            when others =>
+               raise Program_Error;
+         end case;
+      end;
+
       --  Because of the License stuff above, Scng.Initialize_Scanner cannot
       --  call Scan. Scan initial token (note this initializes Prev_Token,
       --  Prev_Token_Ptr).


More information about the Gcc-patches mailing list