CVS difference for ai05s/ai05-0137-1.txt

Differences between 1.3 and version 1.4
Log of other versions for file ai05s/ai05-0137-1.txt

--- ai05s/ai05-0137-1.txt	2009/07/11 03:06:22	1.3
+++ ai05s/ai05-0137-1.txt	2009/10/15 04:20:17	1.4
@@ -1,4 +1,4 @@
-!standard A.4.11                                       09-06-30  AI05-0137-1/03
+!standard A.4.11                                       09-10-12  AI05-0137-1/05
 !class Amendment 09-02-12
 !status Amendment 201Z 09-06-30
 !status ARG Approved  7-0-0  09-06-13
@@ -21,7 +21,7 @@
 construct or use such text strings.
 
 It would be useful for ASIS users, but also for the Ada
-community at-large to define a package to handle encoding/decoding
+community at large to define a package to handle encoding/decoding
 between Wide_String/Wide_Wide_String and UTF_8/UTF_16.
 
 !proposal
@@ -43,62 +43,63 @@
 
 package Ada.Strings.UTF_Encoding is
    pragma Pure (UTF_Encoding);
-   
+
    type Encoding_Scheme is (UTF_None, UTF_8, UTF_16BE, UTF_16LE, UTF_16);
    subtype Short_Encoding is Encoding_Scheme range UTF_8  .. UTF_16LE;
    subtype Long_Encoding  is Encoding_Scheme range UTF_16 .. UTF_16;
-   
-   BOM_8    : constant String := Character'Val (16#EF#) & Character'Val (16#BB#) &
-                 Character'Val (16#BF#);
-   BOM_16BE : constant String := Character'Val (16#FE#) & Character'Val (16#FF#);
-   BOM_16LE : constant String := Character'Val (16#FF#) & Character'Val (16#FE#);
+
+   BOM_8    : constant String := Character'Val(16#EF#) &
+                 Character'Val(16#BB#) & Character'Val(16#BF#);
+   BOM_16BE : constant String := Character'Val(16#FE#) & Character'Val(16#FF#);
+   BOM_16LE : constant String := Character'Val(16#FF#) & Character'Val(16#FE#);
 
-   BOM_16   : constant Wide_String := (1 => Wide_Character'Val (16#FEFF#));
+   BOM_16   : constant Wide_String := (1 => Wide_Character'Val(16#FEFF#));
 
-   function Encode (Item   : in Wide_String;      
+   function Encode (Item   : in Wide_String;
                     Scheme : in Short_Encoding := UTF_8)
             return String;
-   function Encode (Item   : in Wide_Wide_String; 
-                    Scheme : in Short_Encoding := UTF_8) 
+   function Encode (Item   : in Wide_Wide_String;
+                    Scheme : in Short_Encoding := UTF_8)
             return String;
-   function Decode (Item   : in String;           
-                    Scheme : in Short_Encoding := UTF_8) 
+   function Decode (Item   : in String;
+                    Scheme : in Short_Encoding := UTF_8)
             return Wide_String;
-   function Decode (Item   : in String;           
-                    Scheme : in Short_Encoding := UTF_8) 
+   function Decode (Item   : in String;
+                    Scheme : in Short_Encoding := UTF_8)
             return Wide_Wide_String;
 
-   function Encode (Item   : in Wide_Wide_String; 
-                    Scheme : in Long_Encoding := UTF_16) 
+   function Encode (Item   : in Wide_Wide_String;
+                    Scheme : in Long_Encoding := UTF_16)
             return Wide_String;
-   function Decode (Item   : in Wide_String;      
-                    Scheme : in Long_Encoding := UTF_16) 
+   function Decode (Item   : in Wide_String;
+                    Scheme : in Long_Encoding := UTF_16)
             return Wide_Wide_String;
 
    function Encoding (Item : in String) return Encoding_Scheme;
 
    function Encoding (Item : in Wide_String) return Encoding_Scheme;
-   
+
    Encoding_Error : exception;
 
 end Ada.Strings.UTF_Encoding;
 
-The type Encoding_Scheme defines encoding schemes. UTF_8 corresponds to the UTF-8
-encoding scheme defined by Annex D of ISO/IEC 106046. UTF_16 corresponds to the
-UTF-16 encoding scheme defined by Annex C of ISO/IEC 106046 stored in 16 bits;
-UTF_16BE corresponds to the UTF-16 encoding scheme stored in 8 bits, big endian;
-and UTF_16LE corresponds to the UTF-16 encoding scheme on 8 bits, little endian.
+The type Encoding_Scheme defines encoding schemes. UTF_8 corresponds to the
+UTF-8 encoding scheme defined by Annex D of ISO/IEC 10646. UTF_16 corresponds
+to the UTF-16 encoding scheme defined by Annex C of ISO/IEC 10646 stored in 16
+bits; UTF_16BE corresponds to the UTF-16 encoding scheme stored in 8 bits, big
+endian; and UTF_16LE corresponds to the UTF-16 encoding scheme stored in 8 bits,
+little endian.
 
 The subtype Short_Encoding covers the values of Encoding_Scheme for 8
-bits encoding schemes, and the subtype Long_Encoding covers the values of
-Encoding_Scheme for 16 bits encoding schemes.
+bit encoding schemes, and the subtype Long_Encoding covers the values of
+Encoding_Scheme for 16 bit encoding schemes.
 
 Each of the Encode functions takes a Wide_String (respectively
 Wide_Wide_String) Item parameter and returns a String (respectively
 Wide_String) whose characters have position values that correspond to
 the encoding of the Item parameter according to the encoding scheme
 specified by the Scheme parameter. For UTF_8, no overlong encoding
-is returned. The lower bound of the returned string shall be 1.
+is returned. The lower bound of the returned string is 1.
 
 Each of the Decode functions takes a String (respectively Wide_String)
 Item parameter which is assumed to contain characters whose position
@@ -116,7 +117,7 @@
   Scheme parameter, the sequence is ignored;
 - Otherwise, Encoding_Error is raised.
 
-The Encode functions do put BOM sequences in the result.
+The Encode functions do not put BOM sequences in the result.
 
 For each of the Encoding functions, if the initial characters of Item match
 a BOM, the corresponding encoding is returned; otherwise, UTF_None is returned.
@@ -126,13 +127,13 @@
 If an implementation supports other encoding schemes, another similar child
 of Ada.Strings should be defined.
 
-Note: A BOM can be included in a file or other entity to indicate the encoding;
-it is skipped when decoding. An explicit concatenation is needed to include a BOM
-in an encoded entity (it is not added automatically). Typically, only the first
-line of a file or other entity will contain a BOM. When decoding, the appropriate
-Encoding function can be used on the first line to determine the encoding; that
-encoding will then be used in subsequent calls to Decode to convert all of the
-lines to an internal format.
+Note: A BOM (Byte-Order Mark, code position 16#FEFF#) can be included in a file or
+other entity to indicate the encoding; it is skipped when decoding. An explicit
+concatenation is needed to include a BOM in an encoded entity (it is not added
+automatically). Typically, only the first line of a file or other entity will
+contain a BOM. When decoding, the appropriate Encoding function can be used on
+the first line to determine the encoding; that encoding will then be used in
+subsequent calls to Decode to convert all of the lines to an internal format.
 
 !discussion
 
@@ -142,28 +143,28 @@
 an integer value to each character; this value is called the
 code-point of the character. Normally, a character string should be
 represented as a sequence of code-points; however, it would waste a
-lot of space, since ISO 10646 defines 32 bit code-points. An encoding
+lot of space, since ISO 10646 defines 32-bit code-points. An encoding
 scheme is a representation of a string of characters, using a more
 economical representation. Typically, an encoding scheme uses a sequence
 of integer values, where each code-point is represented by one or
-several consecutive values. UTF-8 is an encoding scheme that uses 8
-bit values. In some cases, UTF-8 defines several possible encodings
+several consecutive values. UTF-8 is an encoding scheme that uses 8-bit
+values. In some cases, UTF-8 defines several possible encodings
 for a code-point; in this case, the shortest one should be used; other
-encodings are called overlong encodings. UTF-16 uses 16 bit
-values. UTF-32 uses 32 bit values, which is of little interest since
+encodings are called overlong encodings. UTF-16 uses 16-bit
+values. UTF-32 uses 32-bit values, which is of little interest since
 nothing is gained compared to UCS-4 (raw encoding).
 
 There is no problem when using a String to encode UTF-8, or a
 Wide_String to encode UTF-16. However, it is sometimes useful to
 encode/decode a UTF-16 (or even UTF-32) encoded text into/from a
-String; in that case, characters must be paired to form 16 bit values
-(or 32 bit values). This can be done in two ways, Big-Endian (high
+String; in that case, characters must be paired to form 16-bit values
+(or 32-bit values). This can be done in two ways, Big Endian (high
 order character first) or Little Endian (low order character first). A
 special value, called BOM (Byte Order Mark, 16#FEFF#), can be used at
 the beginning of an encoded text (with 4 leading zeroes for
 UTF-32). The BOM corresponds to no code-point, and is discarded when
 decoding, but it is used to recognize whether a stream of bytes is
-Big-Endian or Little-Endian UTF-16 or UTF-32. By extension, the
+Big Endian or Little Endian UTF-16 or UTF-32. By extension, the
 sequence 16#EF# 16#BB# 16#BF# can be used as BOM to identify UTF-8
 text (although there is no byte order issue in UTF-8; actually, use of
 BOM for UTF-8 is discouraged).
@@ -207,20 +208,20 @@
    necessary to slice the first line specially.
 
 For encoding, it does not seem useful to have the BOM handled by the
-encoding functions, since it is easy to catenate the appropriate
+encoding functions, since it is easy to concatenate the appropriate
 constant.
 
 
-Alternative designs: 
+Alternative designs:
 
 Short_Encoding and Long_Encoding could be different types rather than
-subtypes of a same type. 
+subtypes of the same type.
 
 Arrays of Unsigned_8 or Unsigned_16 could be used in place of
 (Wide_)String. That would enforce strong typing to differentiate
 between an Ada String and an encoded string. OTOH, it is likely to be
 more of a burden than a help to most casual users. Moreover, it would
-not allow to keep ASIS program text as a Wide_String. 
+not allow ASIS program text to be kept as a Wide_String.
 
 
 Existing similar packages:
@@ -235,7 +236,7 @@
 individual characters (not strings), does not support UTF-8, and is
 provided by generics that require a user-provided input/output formal
 function. Although more general, this solution would be too
-heavy-weight for the casual user.
+heavyweight for the casual user.
 
 !corrigendum A.4.11(0)
 
@@ -250,42 +251,42 @@
 
 @xcode<@b<package> Ada.Strings.UTF_Encoding @b<is>
    @b<pragma> Pure (UTF_Encoding);
-   
+
    @b<type> Encoding_Scheme @b<is> (UTF_None, UTF_8, UTF_16BE, UTF_16LE, UTF_16);
    @b<subtype> Short_Encoding @b<is> Encoding_Scheme @b<range> UTF_8  .. UTF_16LE;
    @b<subtype> Long_Encoding  @b<is> Encoding_Scheme @b<range> UTF_16 .. UTF_16;
-   
-   BOM_8    : @b<constant> String := Character'Val (16#EF#) & Character'Val (16#BB#) &
-                 Character'Val (16#BF#);
-   BOM_16BE : @b<constant> String := Character'Val (16#FE#) & Character'Val (16#FF#);
-   BOM_16LE : @b<constant> String := Character'Val (16#FF#) & Character'Val (16#FE#);
+
+   BOM_8    : @b<constant> String := Character'Val(16#EF#) &
+                 Character'Val(16#BB#) & Character'Val(16#BF#);
+   BOM_16BE : @b<constant> String := Character'Val(16#FE#) & Character'Val(16#FF#);
+   BOM_16LE : @b<constant> String := Character'Val(16#FF#) & Character'Val(16#FE#);
 
-   BOM_16   : @b<constant> Wide_String := (1 =@> Wide_Character'Val (16#FEFF#));
+   BOM_16   : @b<constant> Wide_String := (1 =@> Wide_Character'Val(16#FEFF#));
 
-   @b<function> Encode (Item   : @b<in> Wide_String;      
+   @b<function> Encode (Item   : @b<in> Wide_String;
                     Scheme : @b<in> Short_Encoding := UTF_8)
             @b<return> String;
-   @b<function> Encode (Item   : @b<in> Wide_Wide_String; 
-                    Scheme : @b<in> Short_Encoding := UTF_8) 
+   @b<function> Encode (Item   : @b<in> Wide_Wide_String;
+                    Scheme : @b<in> Short_Encoding := UTF_8)
             @b<return> String;
-   @b<function> Decode (Item   : @b<in> String;           
-                    Scheme : @b<in> Short_Encoding := UTF_8) 
+   @b<function> Decode (Item   : @b<in> String;
+                    Scheme : @b<in> Short_Encoding := UTF_8)
             @b<return> Wide_String;
-   @b<function> Decode (Item   : @b<in> String;           
-                    Scheme : @b<in> Short_Encoding := UTF_8) 
+   @b<function> Decode (Item   : @b<in> String;
+                    Scheme : @b<in> Short_Encoding := UTF_8)
             @b<return> Wide_Wide_String;
 
-   @b<function> Encode (Item   : @b<in> Wide_Wide_String; 
-                    Scheme : @b<in> Long_Encoding := UTF_16) 
+   @b<function> Encode (Item   : @b<in> Wide_Wide_String;
+                    Scheme : @b<in> Long_Encoding := UTF_16)
             @b<return> Wide_String;
-   @b<function> Decode (Item   : @b<in> Wide_String;      
-                    Scheme : @b<in> Long_Encoding := UTF_16) 
+   @b<function> Decode (Item   : @b<in> Wide_String;
+                    Scheme : @b<in> Long_Encoding := UTF_16)
             @b<return> Wide_Wide_String;
 
    @b<function> Encoding (Item : @b<in> String) @b<return> Encoding_Scheme;
 
    @b<function> Encoding (Item : @b<in> Wide_String) @b<return> Encoding_Scheme;
-   
+
    Encoding_Error : @b<exception>;
 
 @b<end> Ada.Strings.UTF_Encoding;>
@@ -296,16 +297,16 @@
 UTF_16BE corresponds to the UTF-16 encoding scheme stored in 8 bits, big endian;
 and UTF_16LE corresponds to the UTF-16 encoding scheme on 8 bits, little endian.
 
-The subtype Short_Encoding covers the values of Encoding_Scheme for 8
-bits encoding schemes, and the subtype Long_Encoding covers the values of
-Encoding_Scheme for 16 bits encoding schemes.
+The subtype Short_Encoding covers the values of Encoding_Scheme for 8-bit
+encoding schemes, and the subtype Long_Encoding covers the values of
+Encoding_Scheme for 16-bit encoding schemes.
 
 Each of the Encode functions takes a Wide_String (respectively
 Wide_Wide_String) Item parameter and returns a String (respectively
 Wide_String) whose characters have position values that correspond to
 the encoding of the Item parameter according to the encoding scheme
 specified by the Scheme parameter. For UTF_8, no overlong encoding
-is returned. The lower bound of the returned string shall be 1.
+is returned. The lower bound of the returned string is 1.
 
 Each of the Decode functions takes a String (respectively Wide_String)
 Item parameter which is assumed to contain characters whose position
@@ -323,7 +324,7 @@
 Scheme parameter, the sequence is ignored;>
 @xbullet<Otherwise, Encoding_Error is raised.>
 
-The Encode functions do put BOM sequences in the result.
+The Encode functions do not put BOM sequences in the result.
 
 For each of the Encoding functions, if the initial characters of Item match
 a BOM, the corresponding encoding is returned; otherwise, UTF_None is returned.
@@ -334,7 +335,8 @@
 of Ada.Strings should be defined.
 
 NOTE@hr
-@s9<14  A BOM can be included in a file or other entity to indicate the encoding;
+@s9<14  A BOM (Byte-Order Mark, code position 16#FEFF#) can be included in a file
+or other entity to indicate the encoding;
 it is skipped when decoding. An explicit concatenation is needed to include a BOM
 in an encoded entity (it is not added automatically). Typically, only the first
 line of a file or other entity will contain a BOM. When decoding, the appropriate
@@ -434,13 +436,13 @@
 The above is fine by me.  Anyone who goes and looks up UTF-16 in ISO-10646 will find
 the reference to Unicode.
 
-> AARM Note: How the UTF-16 encoding is stored in 8 and 16 bits is 
-> defined by reference to Unicode 4.0.0 in ISO/IEC 106046: "Unicode 
+> AARM Note: How the UTF-16 encoding is stored in 8 and 16 bits is
+> defined by reference to Unicode 4.0.0 in ISO/IEC 10646: "Unicode
 > Standard Annex, UAX#9, The Unicode Bidirectional Algorithm, Version 4.0.0, 2003-04-17."
 
 I don't think this is necessary, and in fact that isn't where UTF-16LE and UTF-16BE are
 defined.  ISO-10646 doesn't identify exactly where they are defined. It merely says they
 are defined somewhere in Unicode 4.0.  That reference I included was just to show you that
 they were willing to make a formal reference to the Unicode Standard.
- 
+
 ****************************************************************

Questions? Ask the ACAA Technical Agent