This is the mail archive of the
java@gcc.gnu.org
mailing list for the Java project.
RFC: String.getBytes(String) and Charsets...
- From: David Daney <ddaney at avtrex dot com>
- To: java at gcc dot gnu dot org
- Date: Thu, 23 Sep 2004 11:59:09 -0700
- Subject: RFC: String.getBytes(String) and Charsets...
Short story:
We are running JCIFS (see: http://jcifs.samba.org/ ) in libgcj. There were
two modifications to libgcj required. The first I committed yesterday. See:
http://gcc.gnu.org/ml/java-patches/2004-q3/msg00985.html
The second change is that String.getBytes("UnicodeLittleUnmarked") must
work. Sun's runtime has this conversion, and libgcj does not.
The initial approach I took for our internal use was to add
gnu.gcj.convert.Output_UnicodeLittleUnmarked to our build of libgcj.
My second (partially implemented) approach was the attached patch plus an
as-yet-unwritten java.nio.charset.Charset that would do the conversion.
I am now having second thoughts. The attached UnicodeLittleTest.java shows
that Sun's runtime does not have a Charset for UnicodeLittleUnmarked but
can still do the encoding with String.getBytes().
This leads me to believe that they have a mechanism similar to
gnu.gcj.convert.Output that String uses in addition to using a Charset.
Perhaps the best bet would be just to add
gnu.gcj.convert.Output_UnicodeLittleUnmarked and forget about my second
approach.
Thoughts?
David Daney.
Index: gnu/gcj/convert/UnicodeToBytes.java
===================================================================
RCS file: /cvs/gcc/gcc/libjava/gnu/gcj/convert/UnicodeToBytes.java,v
retrieving revision 1.12
diff -u -r1.12 UnicodeToBytes.java
--- gnu/gcj/convert/UnicodeToBytes.java 22 Jun 2004 19:24:32 -0000 1.12
+++ gnu/gcj/convert/UnicodeToBytes.java 23 Sep 2004 18:30:47 -0000
@@ -8,6 +8,13 @@
package gnu.gcj.convert;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CodingErrorAction;
+import java.nio.charset.CoderResult;
+
public abstract class UnicodeToBytes extends IOConverter
{
/** Buffer to emit bytes to.
@@ -97,7 +104,18 @@
return (UnicodeToBytes) encodingClass.newInstance();
}
catch (Throwable ex)
- {
+ {
+ try
+ {
+ // Try finding a java.nio.charset.Charset and using
+ // the adaptor. Use the original name as Charsets have
+ // their own canonical names.
+ return new CharsetAdaptor (Charset.forName(encoding));
+ }
+ catch (Exception _)
+ {
+ // Ignore, and try the next method.
+ }
try
{
// We pass the original name to iconv and let it handle
@@ -174,4 +192,133 @@
currCachePos = (currCachePos + 1) % CACHE_SIZE;
}
}
+
+  /**
+   * Adaptor class that allows any {@link Charset} to be used
+   * as a UnicodeToBytes converter.  Encoded bytes are written into
+   * buf starting at count, and count is advanced past them.
+   */
+  static class CharsetAdaptor extends UnicodeToBytes
+  {
+    /**
+     * The CharsetEncoder that does all the work.
+     */
+    private final CharsetEncoder encoder;
+
+    /**
+     * ByteBuffer wrapper for this.buf.
+     */
+    private ByteBuffer outBuf;
+
+    /**
+     * Dummy ByteBuffer of size zero.
+     */
+    private ByteBuffer fullBuf;
+
+
+    /**
+     * Create a new CharsetAdaptor for the given Charset.
+     *
+     * @param cs The Charset.
+     */
+    CharsetAdaptor(Charset cs)
+    {
+      encoder = cs.newEncoder();
+      // Use default replacements on bad input so that we don't have to
+      // deal with errors.
+      encoder.onMalformedInput(CodingErrorAction.REPLACE);
+      encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
+    }
+
+    /**
+     * Return the encoder's name.  The backing Charset's name is
+     * returned.
+     *
+     * @return The name.
+     */
+    public String getName()
+    {
+      return encoder.charset().name();
+    }
+
+    /**
+     * Encode chars from a char array into buf.
+     *
+     * @param inbuffer The source chars.
+     * @param inpos Index of the first char to encode.
+     * @param inlength Number of chars available starting at inpos.
+     * @return The number of chars consumed.
+     */
+    public int write (char[] inbuffer, int inpos, int inlength)
+    {
+      // Wrap the char array so it can be used by the encoder.
+      CharBuffer b = CharBuffer.wrap(inbuffer, inpos, inlength);
+      write(b);
+      return b.position() - inpos; // Number of chars consumed.
+    }
+
+    /**
+     * Encode chars from a String into buf.
+     *
+     * @param str The source String.
+     * @param inpos Index of the first char to encode.
+     * @param inlength Number of chars available starting at inpos.
+     * @param work Not used by this implementation.
+     * @return The number of chars consumed.
+     */
+    // NOTE(review): work is declared as a plain char here; confirm this
+    // matches the method declared by UnicodeToBytes (not shown in this hunk).
+    public int write (String str, int inpos, int inlength, char work)
+    {
+      // Wrap the String so it can be used by the encoder.  Unlike the
+      // char[] overload, CharBuffer.wrap(CharSequence, int, int) takes
+      // an end index rather than a length.
+      CharBuffer b = CharBuffer.wrap(str, inpos, inpos + inlength);
+      write(b);
+      return b.position() - inpos; // Number of chars consumed.
+    }
+
+    /**
+     * Encode as much of inBuf as will fit in buf.  The number of
+     * chars consumed is reflected by the new position of inBuf.  The
+     * output is put in buf and count is incremented by the number of
+     * bytes written.
+     *
+     * @param inBuf The input.
+     */
+    private void write(CharBuffer inBuf)
+    {
+      // Reuse existing outBuf if it is still wrapping the same array
+      // it was created with.
+      if (outBuf == null || !outBuf.hasArray() || outBuf.array() != buf)
+        outBuf = ByteBuffer.wrap(buf);
+
+      // Set the current position.
+      outBuf.position(count);
+      // Do the conversion.
+      encoder.encode(inBuf, outBuf, false);
+      // Mark the new end of buf.
+      count = outBuf.position();
+    }
+
+    /**
+     * Check for cached output in the converter.  This is done by
+     * checking the return value from a flush into a full ByteBuffer.
+     *
+     * @return true if there is cached output that has not been
+     * written to buf.
+     */
+    // NOTE(review): the name looks like a typo for havePendingBytes;
+    // confirm against UnicodeToBytes before renaming.
+    // NOTE(review): CharsetEncoder.flush is documented to throw
+    // IllegalStateException unless the previous step was flush or
+    // encode(..., true); write() only ever passes false -- verify.
+    public boolean havePendingByes()
+    {
+      if (fullBuf == null)
+        fullBuf = ByteBuffer.allocate(0);
+      return CoderResult.OVERFLOW == encoder.flush(fullBuf);
+    }
+  }
+
}
import java.nio.charset.*;
import java.util.*;
/**
 * Diagnostic program for the "UnicodeLittleUnmarked" encoding name.
 *
 * It prints every charset reported by Charset.availableCharsets() together
 * with its aliases, then tries the name through Charset.forName() and
 * finally through String.getBytes(), dumping the resulting bytes.
 */
public class UnicodeLittleTest
{
  /**
   * Runs the three probes in order.  Any exception escaping a probe is
   * reported with a stack trace and ends the run.
   *
   * @param args Ignored.
   */
  public static void main(String args[])
  {
    try
      {
        listCharsets();

        String text = "Hello";

        // A failed lookup is reported but must not stop the getBytes()
        // probe below, so it gets its own handler.
        try
          {
            Charset found = Charset.forName("UnicodeLittleUnmarked");
            System.out.println(found);
          }
        catch (Exception lookupFailure)
          {
            lookupFailure.printStackTrace();
          }

        byte[] encoded = text.getBytes("UnicodeLittleUnmarked");
        int idx = 0;
        while (idx < encoded.length)
          {
            System.out.println("ba[" + idx + "] = " + encoded[idx]);
            idx++;
          }
      }
    catch (Exception failure)
      {
        failure.printStackTrace();
      }
  }

  /**
   * Prints each available charset's key on one line, followed by a line
   * holding its aliases separated by single spaces.
   */
  private static void listCharsets()
  {
    SortedMap charsets = Charset.availableCharsets();
    for (Iterator entries = charsets.entrySet().iterator(); entries.hasNext(); )
      {
        Map.Entry entry = (Map.Entry) entries.next();
        String canonicalName = (String) entry.getKey();
        Charset charset = (Charset) entry.getValue();

        System.out.println(canonicalName);
        System.out.print(" ");
        for (Iterator aliases = charset.aliases().iterator(); aliases.hasNext(); )
          {
            String alias = (String) aliases.next();
            System.out.print(alias);
            System.out.print(" ");
          }
        System.out.println();
      }
  }
}