Name: rlT66838 Date: 07/06/99
1 - Compile and run the included program.
2 - program is included
3 - JDK 1.1 and 1.2 report surrogates as EOF ... wrong!!
JDK 1.1 moreover doesn't accept the standard "UTF-8" name.
4 - trace information is irrelevant
5 - Classic VM (build JDK-1.2.1-A, native threads)
... and on many other VMs
6 - no other data should be relevant ... except that I've known
about surrogate handling bugs in the JDK for some time, and
only recently had a reason to try to come up with a test
case that'd demonstrate them.
The "test4" encoded pairs are mostly, if not completely,
correct; they came from James Clark's XML test suite. If
there's an error there I'd assume it's in the decodings I
assigned to them ... but having eyeballed this carefully
and double checked using some other folks' work, I think
that is also correct.
REPRODUCE USING:
----------------
/**
* This program is subject to the terms of the
* GNU Library General Public License (LGPL) version 2.0
*
* It may freely be copied and modified, so long as the original
* licencing terms are not removed.
*/
import java.io.*;
/**
* Runs some UTF-8 test data through the java.io character conversion
* support for the UTF-8 encoding, then gives that character conversion
* support an overall pass or fail rating. It also sanity checks whether
* the decoder is lenient (accepting some erroneous encodings).
*
* <P> Some of the test cases here are taken from standard XML test suites;
* UTF-8 is one of the two encodings XML processors must support, so this
* encoding support should be very correct to support next generation
* web (and internet) applications with maximal interoperability. (Also, it
* should be fast -- the JDK 1.1 and 1.2 sun.io converters are slow.)
*
* <P> Note that JDK 1.1 and JDK 1.2 don't currently pass these tests;
* there are known problems in UTF-8 surrogate support at this time.
*
* @author David Brownell (###@###.###)
* @version July 1, 1999
*/
public class utf8
{
    //
    // "UTF-8" is the only IANA registered encoding name. However,
    // JDK 1.1.x and some other JVM implementations only accept a
    // Java-proprietary encoding name ("UTF8") ... for compatibility,
    // that name should also be tested, but for correctness it's
    // an error if that's the only name that's supported.
    //
    // Deliberately not final: main() switches this to "UTF8" when the
    // standard name is rejected, so the remaining tests can still run.
    //
    private static String encodingName = "UTF-8";

    /**
     * Positive test -- checks both output and input processing against
     * "known good" data.  The chars are first written through an
     * OutputStreamWriter and the bytes produced are compared with
     * <code>encoded</code>; then <code>encoded</code> is read back through
     * an InputStreamReader and compared, one char at a time, with
     * <code>decoded</code>.  Every mismatch is reported on System.err.
     *
     * @param encoded the expected UTF-8 byte sequence
     * @param decoded the expected chars (UTF-16 code units, so surrogate
     *  pairs are two entries)
     * @param label test name, used as the prefix of every diagnostic
     * @return true if nothing mismatched and no exception was thrown;
     *  false otherwise
     */
    private static boolean positive (
        byte[] encoded,
        char[] decoded,
        String label
    ) {
        boolean retval = true;

        // Declared outside the try block so that the catch clause can
        // report how far the loops below had got when the exception hit.
        int i = 0;

        try {
            //
            // Ensure that writing encodes correctly
            //
            ByteArrayOutputStream out = new ByteArrayOutputStream ();
            OutputStreamWriter writer
                = new OutputStreamWriter (out, encodingName);
            byte[] result;

            writer.write (decoded);
            // close before inspecting the bytes, so nothing is left
            // sitting in the writer's internal buffer
            writer.close ();
            result = out.toByteArray ();

            if (result.length != encoded.length) {
                System.err.println (label + ": write length wrong, "
                    + result.length
                    + " (should be " + encoded.length + ")");
                retval = false;
            }

            // Compare only the common prefix; a length difference has
            // already been reported above.
            for (i = 0; i < encoded.length && i < result.length; i++) {
                if (encoded [i] != result [i]) {
                    // mask with 0xff so negative bytes print as two
                    // hex digits rather than sign-extended ints
                    System.err.println (label + ": result [" + i + "] = 0x"
                        + Integer.toHexString (0x0ff & result [i])
                        + ", should be 0x"
                        + Integer.toHexString (0x0ff & encoded [i]));
                    retval = false;
                }
            }

            //
            // Ensure that reading decodes correctly
            //
            ByteArrayInputStream in = new ByteArrayInputStream (encoded);
            InputStreamReader reader
                = new InputStreamReader (in, encodingName);

            for (i = 0; i < decoded.length; i++) {
                int c = reader.read ();

                if (c != decoded [i]) {
                    System.err.print (label + ": read failed, char " + i);
                    System.err.print (" ... expected 0x"
                        + Integer.toHexString (decoded [i]));
                    if (c == -1) {
                        System.err.println (", got EOF");
                    } else {
                        System.err.println (", got 0x"
                            + Integer.toHexString (c));
                    }
                    retval = false;

                    // nothing more can be read after EOF; stop here
                    // instead of reporting every remaining char
                    if (c == -1) {
                        return retval;
                    }
                }
            }

            // all expected chars were consumed; the input must now be
            // exhausted too
            if (reader.read () != -1) {
                System.err.println (label + ": read failed, no EOF");
                return false;
            }

        } catch (Exception e) {
            System.err.println (label + ": failed "
                + "(i = " + i + "), "
                + e.getClass ().getName ()
                + ", " + e.getMessage ());
            // e.printStackTrace ();
            return false;
        }
        return retval;
    }

    /**
     * Negative test -- only for input processing; makes sure that an
     * invalid or corrupt encoding is rejected.  Exactly one char is read
     * from <code>encoded</code>, and the test passes only if that read
     * throws a CharConversionException.
     *
     * @param encoded a byte sequence the decoder is expected to reject
     * @param label test name, used as the prefix of every diagnostic
     * @return true if CharConversionException was thrown; false if a
     *  char or EOF was returned, or if anything else was thrown
     */
    private static boolean negative (byte[] encoded, String label)
    {
        try {
            ByteArrayInputStream in = new ByteArrayInputStream (encoded);
            InputStreamReader reader
                = new InputStreamReader (in, encodingName);
            int c = reader.read ();

            // Getting here at all means the bad data was not rejected.
            System.err.print (label + ": read failed, ");
            if (c == -1) {
                System.err.println ("reported EOF");
            } else {
                System.err.println ("returned char 0x"
                    + Integer.toHexString (c)
                    + ", expected exception");
            }
            return false;

        } catch (CharConversionException e) {
            // the one acceptable outcome
            return true;

        } catch (Throwable t) {
            System.err.println (label + ": failed, threw "
                + t.getClass ().getName ()
                + ", " + t.getMessage ());
        }
        return false;
    }

    //
    // TEST #0: Examples from RFC 2279
    // This is a positive test.
    //
    private static final byte[] test0_bytes = {
        // A<NOT IDENTICAL TO><ALPHA>.
        (byte)0x41,
        (byte)0xE2, (byte)0x89, (byte)0xA2,
        (byte)0xCE, (byte)0x91,
        (byte)0x2E,
        // Korean word "hangugo"
        (byte)0xED, (byte)0x95, (byte)0x9C,
        (byte)0xEA, (byte)0xB5, (byte)0xAD,
        (byte)0xEC, (byte)0x96, (byte)0xB4,
        // Japanese word "nihongo"
        (byte)0xE6, (byte)0x97, (byte)0xA5,
        (byte)0xE6, (byte)0x9C, (byte)0xAC,
        (byte)0xE8, (byte)0xAA, (byte)0x9E
    };
    private static final char[] test0_chars = {
        // A<NOT IDENTICAL TO><ALPHA>.
        0x0041, 0x2262, 0x0391, 0x002e,
        // Korean word "hangugo"
        0xD55C, 0xAD6D, 0xC5B4,
        // Japanese word "nihongo"
        0x65E5, 0x672C, 0x8A9E
    };

    //
    // From RFC 2279, the ranges which define the values we focus some
    // "organized" testing on -- test each boundary, and a little on each
    // side of the boundary.
    //
    // Note that some encodings are errors: the shortest encoding must be
    // used. On the "be lenient in what you accept" principle, those are
    // not tested as input cases; on the "be strict in what you send"
    // principle, they are tested as output cases instead.
    //
    // UCS-4 range (hex.)    UTF-8 octet sequence (binary)
    // 0000 0000-0000 007F   0xxxxxxx
    // 0000 0080-0000 07FF   110xxxxx 10xxxxxx
    // 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
    //
    // 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    // 0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    // 0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx
    //

    //
    // TEST #1: One byte encoded values. Works just like ASCII; these
    // values were chosen for boundary testing. This is a positive test.
    //
    // 0000 0000-0000 007F   0xxxxxxx
    //
    private static final byte[] test1_bytes = {
        (byte) 0x00, (byte) 0x01, (byte) 0x7e, (byte) 0x7f
    };
    private static final char[] test1_chars = {
        0x0000, 0x0001, 0x007e, 0x007f
    };

    //
    // TEST #2: Two byte encoded values, chosen for boundary testing.
    // This is a positive test.
    //
    // 0000 0080-0000 07FF   110xxxxx 10xxxxxx
    //
    // Encodings CX bb, with X = 0 or 1 and 'b' values irrelevant,
    // should have used a shorter encoding.
    //
    private static final byte[] test2_bytes = {
        (byte) 0xc2, (byte) 0x80,
        (byte) 0xc2, (byte) 0x81,
        (byte) 0xc3, (byte) 0xa0,
        (byte) 0xdf, (byte) 0xbe,
        (byte) 0xdf, (byte) 0xbf
    };
    private static final char[] test2_chars = {
        0x0080,
        0x0081,
        0x00E0,
        0x07FE,
        0x07FF
    };

    //
    // TEST #3: Three byte encoded values, chosen for boundary testing.
    // This is a positive test.
    //
    // 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
    //
    // Encodings E0 Xb bb, with X = 8 or 9 and 'b' values irrelevant,
    // should have used a shorter encoding.
    //
    private static final byte[] test3_bytes = {
        (byte) 0xe0, (byte) 0xa0, (byte) 0x80,
        (byte) 0xe0, (byte) 0xa0, (byte) 0x81,
        (byte) 0xe1, (byte) 0x80, (byte) 0x80,
        (byte) 0xe8, (byte) 0x80, (byte) 0x80,
        (byte) 0xef, (byte) 0xbf, (byte) 0xbe,
        (byte) 0xef, (byte) 0xbf, (byte) 0xbf
    };
    private static final char[] test3_chars = {
        0x0800,
        0x0801,
        0x1000,
        0x8000,
        0xFFFE,
        0xFFFF
    };

    //
    // TEST #4: Four byte encoded values, needing surrogate pairs,
    // chosen for boundary testing. This is a positive test.
    //
    // NOTE: some four byte encodings exceed the range of Unicode
    // with surrogate pairs (UTF-16); those MUST be negatively tested.
    //
    // 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    //
    // Encodings F0 8b bb bb, where again the 'b' values are irrelevant,
    // should have used a shorter encoding.
    //
    private static final byte[] test4_bytes = {
        (byte) 0xf0, (byte) 0x90, (byte) 0x80, (byte) 0x80,  // U+10000
        (byte) 0xf0, (byte) 0x90, (byte) 0x80, (byte) 0x81,  // U+10001
        (byte) 0xf0, (byte) 0x90, (byte) 0x88, (byte) 0x80,  // U+10200
        (byte) 0xf0, (byte) 0x90, (byte) 0x90, (byte) 0x80,  // U+10400
        (byte) 0xf0, (byte) 0x90, (byte) 0x8f, (byte) 0xbf,  // U+103FF
        (byte) 0xf1, (byte) 0x90, (byte) 0x8f, (byte) 0xbf,  // U+503FF
        (byte) 0xf2, (byte) 0x90, (byte) 0x8f, (byte) 0xbf,  // U+903FF
        (byte) 0xf4, (byte) 0x8f, (byte) 0xbf, (byte) 0xbf   // U+10FFFF
    };
    private static final char[] test4_chars = {
        0xD800, 0xDC00,
        0xD800, 0xDC01,
        0xD800, 0xDE00,
        0xD801, 0xDC00,
        0xD800, 0xDFFF,
        0xD900, 0xDFFF,
        0xDA00, 0xDFFF,
        0xDBFF, 0xDFFF,
    };

    //
    // NEGATIVE TESTS:
    //

    // four byte encodings that are out of range for UTF-16
    // as the result can't be encoded with surrogate pairs
    private static final byte[] test5_bytes     // U+1CFFFF
        = { (byte) 0xf7, (byte) 0x8f, (byte) 0xbf, (byte) 0xbf };
    // U+110000: the first value past the last one a surrogate pair can
    // represent (U+10FFFF, the final entry of test4_bytes).  This used
    // to be an exact copy of test5_bytes, which tested nothing new.
    private static final byte[] test6_bytes
        = { (byte) 0xf4, (byte) 0x90, (byte) 0x80, (byte) 0x80 };
    private static final byte[] test13_bytes    // U+1C0000
        = { (byte) 0xf7, (byte) 0x80, (byte) 0x80, (byte) 0x80 };

    // five and six byte encodings (leniency discouraged)
    private static final byte[] test7_bytes
        = { (byte) 0xf8, (byte) 0x80, (byte) 0x80,
            (byte) 0x80, (byte) 0x80 };
    private static final byte[] test8_bytes
        = { (byte) 0xf8, (byte) 0xbf, (byte) 0x80,
            (byte) 0x80, (byte) 0x80 };
    private static final byte[] test9_bytes
        = { (byte) 0xfc, (byte) 0x80, (byte) 0x80,
            (byte) 0x80, (byte) 0x80, (byte) 0x80 };
    private static final byte[] test10_bytes
        = { (byte) 0xfc, (byte) 0x80, (byte) 0x80,
            (byte) 0x80, (byte) 0x80, (byte) 0x81 };

    // orphan "extension" bytes (e.g. some ISO-8859-1 characters)
    private static final byte[] test11_bytes
        = { (byte) 0x80 };
    private static final byte[] test12_bytes
        = { (byte) 0xa9 };

    //
    // Just for information -- see if these cases are accepted; they're
    // all errors ("too short" encodings), but ones which generally
    // ought to be accepted for leniency (though see RFC 2279).
    //

    // three encodings of ASCII NUL
    private static final byte[] bad0_bytes
        = { (byte) 0xc0, (byte) 0x80 };
    private static final byte[] bad1_bytes
        = { (byte) 0xe0, (byte) 0x80, (byte) 0x80 };
    private static final byte[] bad2_bytes
        = { (byte) 0xf0, (byte) 0x80, (byte) 0x80, (byte) 0x80 };

    // ... and other values
    private static final byte[] bad3_bytes      // two byte form of 0x40
        = { (byte) 0xc1, (byte) 0x80 };
    private static final byte[] bad4_bytes      // three byte form of 0x40
        = { (byte) 0xe0, (byte) 0x81, (byte) 0x80 };
    private static final byte[] bad5_bytes      // three byte form of 0x0400
        = { (byte) 0xe0, (byte) 0x90, (byte) 0x80 };

    /**
     * Main program to give a pass or fail rating to a JVM's UTF-8 support.
     * No arguments needed.  Prints a PASS/FAIL line for the correctness
     * tests, then an informational strict/lenient line for the decoder,
     * and exits with status 0 on PASS or 1 on FAIL.  The leniency result
     * does not affect the exit status.
     *
     * @param argv ignored
     */
    public static void main (String[] argv)
    {
        boolean pass = true;

        System.out.println ("");
        System.out.println ("------ checking UTF-8 correctness ...");

        // Probe for the standard encoding name.  Needing the fallback
        // name is itself a failure, but the remaining tests still run
        // using that fallback.
        try {
            new InputStreamReader (System.in, "UTF-8");
        } catch (Exception e) {
            encodingName = "UTF8";
            System.out.println ("... requires nonstandard encoding name "
                + encodingName);
            pass = false;
        }

        //
        // Positive tests -- good data is dealt with correctly
        //
        // (non-short-circuit "&=" so every test runs and reports, even
        // after an earlier one has failed)
        //
        pass &= positive (test0_bytes, test0_chars, "RFC 2279 Examples");
        pass &= positive (test1_bytes, test1_chars, "One Byte Characters");
        pass &= positive (test2_bytes, test2_chars, "Two Byte Characters");
        pass &= positive (test3_bytes, test3_chars, "Three Byte Characters");
        pass &= positive (test4_bytes, test4_chars, "Surrogate Pairs");

        //
        // Negative tests -- "bad" data is dealt with correctly ... in
        // this case, "bad" is just out-of-range for Unicode systems,
        // rather than values encoded contrary to spec (such as NUL
        // being encoded as '0xc0 0x80', not '0x00').
        //
        pass &= negative (test5_bytes, "Four byte range error (0)");
        pass &= negative (test6_bytes, "Four byte range error (1)");
        pass &= negative (test13_bytes, "Four byte range error (2)");
        pass &= negative (test7_bytes, "Five byte error (0)");
        pass &= negative (test8_bytes, "Five byte error (1)");
        pass &= negative (test9_bytes, "Six byte error (0)");
        pass &= negative (test10_bytes, "Six byte error (1)");
        pass &= negative (test11_bytes, "Orphan continuation (1)");
        pass &= negative (test12_bytes, "Orphan continuation (2)");

        //
        // PASS/FAIL status is what the whole thing is about.
        //
        if (pass) {
            System.out.println ("PASS -- UTF-8 support works right!");
        } else {
            System.out.println ("FAIL -- incorrect UTF-8 support.");
        }

        //
        // Just for information (most are lenient)
        //
        boolean strict;

        System.out.println ("");
        System.out.println ("------ checking decoder leniency ...");

        strict = negative (bad0_bytes, "Fat zero (0)");
        strict &= negative (bad1_bytes, "Fat zero (1)");
        strict &= negative (bad2_bytes, "Fat zero (2)");
        strict &= negative (bad3_bytes, "Fat '@' (0)");
        strict &= negative (bad4_bytes, "Fat '@' (1)");
        strict &= negative (bad5_bytes, "Fat 0x0400");

        if (strict) {
            System.out.println ("... decoder is strict.");
        } else {
            System.out.println ("... decoder is lenient.");
        }

        System.exit (pass ? 0 : 1);
    }
}
(Review ID: 85136)
======================================================================