This is the mail archive of the java@gcc.gnu.org mailing list for the Java project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

RFC: String.getBytes(String) and Charsets...


Short story:

We are running JCIFS (see: http://jcifs.samba.org/ ) in libgcj.  There were
two modifications to libgcj required.  The first I committed yesterday.  See:

http://gcc.gnu.org/ml/java-patches/2004-q3/msg00985.html

The second change is that String.getBytes("UnicodeLittleUnmarked") must
work.  Sun's runtime has this conversion, and libgcj does not.

The initial approach I took for our internal use was to add
gnu.gcj.convert.Output_UnicodeLittleUnmarked to our build of libgcj.

My second (partially implemented) approach was the attached patch plus an
as-yet-unwritten java.nio.charset.Charset that would do the conversion.

I am now having second thoughts.  The attached UnicodeLittleTest.java shows
that Sun's runtime does not have a Charset for UnicodeLittleUnmarked but
can still do the encoding with String.getBytes().

This leads me to believe that they have a mechanism similar to
gnu.gcj.convert.Output that String uses in addition to using a Charset.

Perhaps the best bet would be just to add
gnu.gcj.convert.Output_UnicodeLittleUnmarked and forget about my second
approach.

Thoughts?

David Daney.
Index: gnu/gcj/convert/UnicodeToBytes.java
===================================================================
RCS file: /cvs/gcc/gcc/libjava/gnu/gcj/convert/UnicodeToBytes.java,v
retrieving revision 1.12
diff -u -r1.12 UnicodeToBytes.java
--- gnu/gcj/convert/UnicodeToBytes.java	22 Jun 2004 19:24:32 -0000	1.12
+++ gnu/gcj/convert/UnicodeToBytes.java	23 Sep 2004 18:30:47 -0000
@@ -8,6 +8,13 @@
 
 package gnu.gcj.convert; 
 
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CodingErrorAction;
+import java.nio.charset.CoderResult;
+
 public abstract class UnicodeToBytes extends IOConverter
 {
   /** Buffer to emit bytes to.
@@ -97,7 +104,18 @@
 	return (UnicodeToBytes) encodingClass.newInstance();
       } 
     catch (Throwable ex) 
-      { 
+      {
+        try
+          {
+            // Try finding a java.nio.charset.Charset and using
+            // the adaptor.  Use the original name as Charsets have
+            // their own canonical names.
+            return new CharsetAdaptor (Charset.forName(encoding));
+          }
+        catch (Exception _)
+          {
+            // Ignore, and try the next method.
+          }
 	try
 	  {
 	    // We pass the original name to iconv and let it handle
@@ -174,4 +192,106 @@
 	currCachePos = (currCachePos + 1) % CACHE_SIZE;
       }
   }
+
+  /**
+   * Adaptor class that allows any {@link Charset} to be used
+   * as a UnicodeToBytes converter.
+   */
+  static class CharsetAdaptor extends UnicodeToBytes
+  {
+    /**
+     * The CharsetEncoder that does all the work.
+     */
+    private final CharsetEncoder encoder;
+
+    /**
+     * ByteBuffer wrapper for this.buf.
+     */
+    private ByteBuffer outBuf;
+
+    /**
+     * Dummy ByteBuffer of size zero.
+     */
+    private ByteBuffer fullBuf;
+
+    /**
+     * Create a new CharsetAdaptor for the given Charset.
+     *
+     * @param cs The Charset.
+     */
+    CharsetAdaptor(Charset cs)
+    {
+      encoder = cs.newEncoder();
+      // Use default replacements on bad input so that we don't have to
+      // deal with errors.
+      encoder.onMalformedInput(CodingErrorAction.REPLACE);
+      encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
+    }
+
+    /**
+     * Return the encoder's name.  The backing Charset's name is
+     * returned.
+     *
+     * @return The name.
+     */
+    public String getName()
+    {
+      return encoder.charset().name();
+    }
+
+    public int write (char[] inbuffer, int inpos, int inlength)
+    {
+      // Wrap the char array so it can be used by the encoder.
+      CharBuffer b = CharBuffer.wrap(inbuffer, inpos, inlength);
+      write(b);
+      return b.position() - inpos; // Number of chars consumed.
+    }
+
+    public int write (String str, int inpos, int inlength, char work)
+    {
+      // NOTE(review): confirm `char work` matches the base-class signature.
+      // wrap(CharSequence, start, end) takes an END index, not a length.
+      CharBuffer b = CharBuffer.wrap(str, inpos, inpos + inlength);
+      write(b);
+      return b.position() - inpos; // Number of chars consumed.
+    }
+
+    /**
+     * Encode as much of inBuf as will fit in buf.  The number of
+     * chars consumed is reflected by the new position of inBuf.  The
+     * output is put in buf and count is incremented by the number of
+     * bytes written.
+     *
+     * @param inBuf The input.
+     */
+    private void write(CharBuffer inBuf)
+    {
+      // Reuse existing outBuf if it is still wrapping the same array
+      // it was created with.
+      if (outBuf == null || !outBuf.hasArray() || outBuf.array() != buf)
+        outBuf = ByteBuffer.wrap(buf);
+
+      // Set the current position.
+      outBuf.position(count);
+      // Do the conversion.
+      encoder.encode(inBuf, outBuf, false);
+      // Mark the new end of buf.
+      count = outBuf.position();
+    }
+
+    /**
+     * Check for cached output in the converter by flushing into a
+     * zero-capacity ByteBuffer and looking for OVERFLOW.
+     * NOTE(review): flush() after encode(.., false) may throw
+     * IllegalStateException on some runtimes -- confirm.
+     * @return true if cached output has not been written to buf.
+     */
+    public boolean havePendingBytes()
+    {
+      if (fullBuf == null)
+        fullBuf = ByteBuffer.allocate(0);
+      return CoderResult.OVERFLOW == encoder.flush(fullBuf);
+    }
+  }
+    
 }
import java.nio.charset.*;
import java.util.*;


/**
 * Diagnostic program: lists every Charset the runtime advertises
 * (with its aliases), then checks whether "UnicodeLittleUnmarked" is
 * reachable through Charset.forName() and through String.getBytes().
 */
public class UnicodeLittleTest
{
    /** Name of the encoding being probed. */
    private static final String ENCODING = "UnicodeLittleUnmarked";

    /**
     * Run the probe.  Any failure is reported with a stack trace
     * rather than propagated.
     *
     * @param args Ignored.
     */
    public static void main(String args[])
    {
        try {
            listCharsets();

            String sample = "Hello";

            // A failed Charset lookup is reported but must not stop
            // the getBytes() check that follows.
            try {
                Charset found = Charset.forName(ENCODING);
                System.out.println(found);
            }
            catch (Exception lookupFailure) {
                lookupFailure.printStackTrace();
            }

            dumpBytes(sample.getBytes(ENCODING));
        }
        catch (Exception failure) {
            failure.printStackTrace();
        }
    }

    /**
     * Print each available charset's canonical name on one line,
     * followed by an indented line holding its space-terminated aliases.
     */
    private static void listCharsets()
    {
        SortedMap charsets = Charset.availableCharsets();
        for (Iterator entries = charsets.entrySet().iterator();
             entries.hasNext(); ) {
            Map.Entry entry = (Map.Entry) entries.next();
            String canonicalName = (String) entry.getKey();
            Charset charset = (Charset) entry.getValue();

            System.out.println(canonicalName);
            System.out.print("  ");
            for (Iterator names = charset.aliases().iterator();
                 names.hasNext(); ) {
                String alias = (String) names.next();
                System.out.print(alias);
                System.out.print(" ");
            }
            System.out.println();
        }
    }

    /**
     * Print every byte of the array, one per line, as a signed
     * decimal value labelled with its index.
     *
     * @param encoded The bytes to print.
     */
    private static void dumpBytes(byte[] encoded)
    {
        int index = 0;
        while (index < encoded.length) {
            System.out.println("ba[" + index + "] = " + encoded[index]);
            index++;
        }
    }
}

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]