Patch: charset aliases merge from Classpath

Anthony Green green@redhat.com
Mon Feb 7 21:20:00 GMT 2005


I'm checking in this recent and highly anticipated change from GNU
Classpath.

AG


2005-02-07  Robert Schuster  <thebohemian@gmx.net>

	* gnu/java/nio/charset/ISO_8859_1.java,
	gnu/java/nio/charset/US_ASCII.java, 
	gnu/java/nio/charset/UTF_16.java,
	gnu/java/nio/charset/UTF_16LE.java,
	gnu/java/nio/charset/UTF_16BE.java,
	gnu/java/nio/charset/UTF_8.java: Fixed canonical names
	 and aliases according to
	 "http://www.iana.org/assignments/character-sets",
	 "http://java.sun.com/j2se/1.5.0/docs/guide/intl/encoding.doc.html"
	 and "http://oss.software.ibm.com/cgi-bin/icu/convexp?s=ALL".
	* gnu/java/nio/charset/Provider.java: Made charset lookup
	 case-insensitive which fixes bug #11740. 


Index: gnu/java/nio/charset/ISO_8859_1.java
===================================================================
RCS file: /cvs/gcc/gcc/libjava/gnu/java/nio/charset/ISO_8859_1.java,v
retrieving revision 1.2
diff -u -p -r1.2 ISO_8859_1.java
--- gnu/java/nio/charset/ISO_8859_1.java	6 Nov 2004 22:44:47 -0000	1.2
+++ gnu/java/nio/charset/ISO_8859_1.java	7 Feb 2005 20:28:13 -0000
@@ -1,5 +1,5 @@
 /* ISO_8859_1.java -- 
-   Copyright (C) 2002, 2004 Free Software Foundation, Inc.
+   Copyright (C) 2002, 2004, 2005 Free Software Foundation, Inc.
 
 This file is part of GNU Classpath.
 
@@ -53,7 +53,28 @@ final class ISO_8859_1 extends Charset
 {
   ISO_8859_1 ()
   {
-    super ("ISO-8859-1", new String[]{"ISO-LATIN-1"});
+    /* Canonical charset name chosen according to:
+     * http://java.sun.com/j2se/1.5.0/docs/guide/intl/encoding.doc.html
+     */
+    super ("ISO-8859-1", new String[] {
+        /* These names are provided by 
+         * http://www.iana.org/assignments/character-sets
+         */
+        "iso-ir-100",
+        "ISO_8859-1",
+        "latin1",
+        "l1",
+        "IBM819",
+        "CP819",
+        "csISOLatin1",
+        "8859_1",
+        /* These names are provided by
+         * http://oss.software.ibm.com/cgi-bin/icu/convexp?s=ALL
+         */
+        "ISO8859_1", "ISO_8859_1", "ibm-819", "ISO_8859-1:1987",
+        "819"
+        });
+
   }
 
   public boolean contains (Charset cs)
Index: gnu/java/nio/charset/Provider.java
===================================================================
RCS file: /cvs/gcc/gcc/libjava/gnu/java/nio/charset/Provider.java,v
retrieving revision 1.1
diff -u -p -r1.1 Provider.java
--- gnu/java/nio/charset/Provider.java	11 Nov 2002 07:36:41 -0000	1.1
+++ gnu/java/nio/charset/Provider.java	7 Feb 2005 20:28:13 -0000
@@ -1,5 +1,5 @@
 /* Provider.java -- 
-   Copyright (C) 2002 Free Software Foundation, Inc.
+   Copyright (C) 2002, 2005 Free Software Foundation, Inc.
 
 This file is part of GNU Classpath.
 
@@ -48,6 +48,7 @@ import java.util.Iterator;
  * {@link Charset#charsetForName} and * {@link Charset#availableCharsets}.
  *
  * @author Jesse Rosenstock
+ * @author Robert Schuster (thebohemian@gmx.net)
  * @see Charset
  */
 public final class Provider extends CharsetProvider
@@ -63,12 +64,14 @@ public final class Provider extends Char
   }
 
   /**
-   * Map from charset name to charset canonical name.
+   * Map from charset name to charset canonical name. The strings
+   * are all lower-case to allow case-insensitive retrieval of
+   * Charset instances. 
    */
   private final HashMap canonicalNames;
 
   /**
-   * Map from canonical name to Charset.
+   * Map from lower-case canonical name to Charset.
    * TODO: We may want to use soft references.  We would then need to keep
    * track of the class name to regenerate the object.
    */
@@ -76,8 +79,6 @@ public final class Provider extends Char
 
   private Provider ()
   {
-    // FIXME: We might need to make the name comparison case insensitive.
-    // Verify this with the Sun JDK.
     canonicalNames = new HashMap ();
     charsets = new HashMap ();
 
@@ -106,24 +107,42 @@ public final class Provider extends Char
                       .iterator ();
   }
 
+  /**
+   * Returns a Charset instance by converting the given
+   * name to lower-case, looking up the canonical charset
+   * name and finally looking up the Charset with that name.
+   * 
+   * <p>The lookup is therefore case-insensitive.</p>
+   * 
+   *  @return The Charset having <code>charsetName</code>
+   *  as its alias or null if no such Charset exists.
+   */
   public Charset charsetForName (String charsetName)
   {
-    return (Charset) charsets.get (canonicalize (charsetName));
-  }
-
-  private Object canonicalize (String charsetName)
-  {
-    Object o = canonicalNames.get (charsetName);
-    return o == null ? charsetName : o;
+    return (Charset) charsets.get(canonicalNames.get(charsetName.toLowerCase()));
   }
 
+  /**
+   * Puts a Charset under its canonical name into the 'charsets' map.
+   * Then puts a mapping from all its alias names to the canonical name.
+   * 
+   * <p>All names are converted to lower-case.</p>
+   * 
+   * @param cs the Charset to register under its canonical name and aliases
+   */
   private void addCharset (Charset cs)
   {
-    String canonicalName = cs.name ();
+    String canonicalName = cs.name().toLowerCase();
     charsets.put (canonicalName, cs);
+    
+    /* Maps the canonical name to itself, so that
+     * a lookup using the canonical name needs no
+     * special case.
+     */  
+    canonicalNames.put(canonicalName, canonicalName);
 
     for (Iterator i = cs.aliases ().iterator (); i.hasNext (); )
-      canonicalNames.put (i.next (), canonicalName);
+      canonicalNames.put (((String) i.next()).toLowerCase(), canonicalName);
   }
 
   public static synchronized Provider provider ()
Index: gnu/java/nio/charset/US_ASCII.java
===================================================================
RCS file: /cvs/gcc/gcc/libjava/gnu/java/nio/charset/US_ASCII.java,v
retrieving revision 1.2
diff -u -p -r1.2 US_ASCII.java
--- gnu/java/nio/charset/US_ASCII.java	6 Nov 2004 22:44:47 -0000	1.2
+++ gnu/java/nio/charset/US_ASCII.java	7 Feb 2005 20:28:13 -0000
@@ -1,5 +1,5 @@
 /* US_ASCII.java -- 
-   Copyright (C) 2002, 2004 Free Software Foundation, Inc.
+   Copyright (C) 2002, 2004, 2005 Free Software Foundation, Inc.
 
 This file is part of GNU Classpath.
 
@@ -53,7 +53,29 @@ final class US_ASCII extends Charset
 {
   US_ASCII ()
   {
-    super ("US-ASCII", new String[]{"ISO646-US"});
+    /* Canonical charset name chosen according to:
+     * http://java.sun.com/j2se/1.5.0/docs/guide/intl/encoding.doc.html
+     */
+    super ("US-ASCII", new String[] {
+        /* These names are provided by 
+         * http://www.iana.org/assignments/character-sets
+         */
+        "iso-ir-6",
+        "ANSI_X3.4-1986",
+        "ISO_646.irv:1991",
+        "ASCII",
+        "ISO646-US",
+        "ASCII",
+        "us",
+        "IBM367",
+        "cp367",
+        "csASCII",
+        /* These names are provided by
+         * http://oss.software.ibm.com/cgi-bin/icu/convexp?s=ALL
+         */
+        "ANSI_X3.4-1968", "iso_646.irv:1983", "ascii7", "646",
+        "windows-20127"
+        });
   }
 
   public boolean contains (Charset cs)
Index: gnu/java/nio/charset/UTF_16.java
===================================================================
RCS file: /cvs/gcc/gcc/libjava/gnu/java/nio/charset/UTF_16.java,v
retrieving revision 1.2
diff -u -p -r1.2 UTF_16.java
--- gnu/java/nio/charset/UTF_16.java	16 Oct 2004 18:06:02 -0000	1.2
+++ gnu/java/nio/charset/UTF_16.java	7 Feb 2005 20:28:13 -0000
@@ -1,5 +1,5 @@
 /* UTF_16.java -- 
-   Copyright (C) 2002, 2004  Free Software Foundation, Inc.
+   Copyright (C) 2002, 2004, 2005  Free Software Foundation, Inc.
 
 This file is part of GNU Classpath.
 
@@ -51,7 +51,14 @@ final class UTF_16 extends Charset
 {
   UTF_16 ()
   {
-    super ("UTF-16", null);
+    super ("UTF-16", new String[] {
+        // witnessed by the internet
+        "UTF16",
+        /* These names are provided by
+         * http://oss.software.ibm.com/cgi-bin/icu/convexp?s=ALL
+         */
+        "ISO-10646-UCS-2", "unicode", "csUnicode", "ucs-2"
+    });
   }
 
   public boolean contains (Charset cs)
Index: gnu/java/nio/charset/UTF_16BE.java
===================================================================
RCS file: /cvs/gcc/gcc/libjava/gnu/java/nio/charset/UTF_16BE.java,v
retrieving revision 1.2
diff -u -p -r1.2 UTF_16BE.java
--- gnu/java/nio/charset/UTF_16BE.java	16 Oct 2004 18:06:02 -0000	1.2
+++ gnu/java/nio/charset/UTF_16BE.java	7 Feb 2005 20:28:13 -0000
@@ -1,5 +1,5 @@
 /* UTF_16BE.java -- 
-   Copyright (C) 2002, 2004  Free Software Foundation, Inc.
+   Copyright (C) 2002, 2004, 2005  Free Software Foundation, Inc.
 
 This file is part of GNU Classpath.
 
@@ -51,7 +51,18 @@ final class UTF_16BE extends Charset
 {
   UTF_16BE ()
   {
-    super ("UTF-16BE", null);
+    super ("UTF-16BE",  new String[] {
+        // witnessed by the internet
+        "UTF16BE",
+        /* These names are provided by
+         * http://oss.software.ibm.com/cgi-bin/icu/convexp?s=ALL
+         */
+        "x-utf-16be", "ibm-1200", "ibm-1201", "ibm-5297",
+        "ibm-13488", "ibm-17584", "windows-1201", "cp1200", "cp1201",
+        "UTF16_BigEndian",
+        // see http://java.sun.com/j2se/1.5.0/docs/guide/intl/encoding.doc.html
+        "UnicodeBigUnmarked"
+    });
   }
 
   public boolean contains (Charset cs)
Index: gnu/java/nio/charset/UTF_16LE.java
===================================================================
RCS file: /cvs/gcc/gcc/libjava/gnu/java/nio/charset/UTF_16LE.java,v
retrieving revision 1.2
diff -u -p -r1.2 UTF_16LE.java
--- gnu/java/nio/charset/UTF_16LE.java	16 Oct 2004 18:06:02 -0000	1.2
+++ gnu/java/nio/charset/UTF_16LE.java	7 Feb 2005 20:28:14 -0000
@@ -1,5 +1,5 @@
 /* UTF_16LE.java -- 
-   Copyright (C) 2002, 2004  Free Software Foundation, Inc.
+   Copyright (C) 2002, 2004, 2005  Free Software Foundation, Inc.
 
 This file is part of GNU Classpath.
 
@@ -51,7 +51,17 @@ final class UTF_16LE extends Charset
 {
   UTF_16LE ()
   {
-    super ("UTF-16LE", null);
+    super ("UTF-16LE", new String[] {
+        // witnessed by the internet
+        "UTF16LE", 
+        /* These names are provided by
+         * http://oss.software.ibm.com/cgi-bin/icu/convexp?s=ALL
+         */
+        "x-utf-16le", "ibm-1202", "ibm-13490", "ibm-17586",
+        "UTF16_LittleEndian",
+        // see http://java.sun.com/j2se/1.5.0/docs/guide/intl/encoding.doc.html
+        "UnicodeLittleUnmarked"
+    });
   }
 
   public boolean contains (Charset cs)
Index: gnu/java/nio/charset/UTF_8.java
===================================================================
RCS file: /cvs/gcc/gcc/libjava/gnu/java/nio/charset/UTF_8.java,v
retrieving revision 1.2
diff -u -p -r1.2 UTF_8.java
--- gnu/java/nio/charset/UTF_8.java	6 Nov 2004 22:44:47 -0000	1.2
+++ gnu/java/nio/charset/UTF_8.java	7 Feb 2005 20:28:14 -0000
@@ -1,5 +1,5 @@
 /* UTF_8.java -- 
-   Copyright (C) 2002, 2004 Free Software Foundation, Inc.
+   Copyright (C) 2002, 2004, 2005  Free Software Foundation, Inc.
 
 This file is part of GNU Classpath.
 
@@ -62,7 +62,15 @@ final class UTF_8 extends Charset
 {
   UTF_8 ()
   {
-    super ("UTF-8", null);
+    super ("UTF-8", new String[] {
+        /* These names are provided by
+         * http://oss.software.ibm.com/cgi-bin/icu/convexp?s=ALL
+         */
+        "ibm-1208", "ibm-1209", "ibm-5304", "ibm-5305",
+        "windows-65001", "cp1208",
+        // see http://java.sun.com/j2se/1.5.0/docs/guide/intl/encoding.doc.html
+        "UTF8"
+    });
   }
 
   public boolean contains (Charset cs)





More information about the Java-patches mailing list