JDK-6795537 : UTF_8$Decoder returns wrong results
  • Type: Bug
  • Component: core-libs
  • Sub-Component: java.nio.charsets
  • Affected Version: 7
  • Priority: P4
  • Status: Resolved
  • Resolution: Fixed
  • OS: windows_xp
  • CPU: x86
  • Submitted: 2009-01-20
  • Updated: 2014-04-30
  • Resolved: 2014-04-09
The Version table provides details related to the release that this issue/RFE will be addressed.

Unresolved : Release in which this issue/RFE will be addressed.
Resolved: Release in which this issue/RFE has been resolved.
Fixed : Release in which this issue/RFE has been fixed. The release containing this fix may be available for download as an Early Access Release or a General Availability Release.

To download the current JDK release, click here.
JDK 8
8 Fixed
Description
FULL PRODUCT VERSION :
java version "1.7.0-ea"
Java(TM) SE Runtime Environment (build 1.7.0-ea-b43)
Java HotSpot(TM) Client VM (build 14.0-b10, mixed mode, sharing)

ADDITIONAL OS VERSION INFORMATION :
Windows XP SR-2

A DESCRIPTION OF THE PROBLEM :
UTF_8$Decoder returns wrong results, see results ...

STEPS TO FOLLOW TO REPRODUCE THE PROBLEM :
run source

EXPECTED VERSUS ACTUAL BEHAVIOR :
EXPECTED -
new byte[]{(byte)0xC0} ---> CoderResult.malformedForLength(1)
new byte[]{(byte)0xE1, (byte)0x40} ---> CoderResult.malformedForLength(1)
new byte[]{(byte)0xE1, (byte)0x80, (byte)0x42} ---> CoderResult.malformedForLength(1)

ACTUAL -
new byte[]{(byte)0xC0} ---> CoderResult.UNDERFLOW
new byte[]{(byte)0xE1, (byte)0x40} ---> CoderResult.UNDERFLOW
new byte[]{(byte)0xE1, (byte)0x80, (byte)0x42} ---> CoderResult.malformedForLength(2)


REPRODUCIBILITY :
This bug can be reproduced always.

---------- BEGIN SOURCE ----------
package java.nio.charset;

import java.lang.reflect.*;
import java.nio.*;

/**
 * Test-only {@link CharsetDecoder} that wraps another decoder and exposes its
 * loop methods to callers.
 *
 * <p>The implementation class is looked up by name as the {@code $Decoder}
 * nested class of the wrapped decoder's charset class; its
 * {@code decodeArrayLoop} and {@code decodeBufferLoop} methods are then made
 * accessible through reflection. {@link #decodeLoop} is forwarded to the
 * wrapped decoder directly (presumably the reason this class is declared in
 * {@code java.nio.charset}).
 *
 * @author Ulf Zibis <Ulf.Zibis at CoSoCo.de>
 */
public class TestCharsetDecoder extends CharsetDecoder {

    /** The decoder every call is forwarded to. */
    public CharsetDecoder decoder;
    /** The {@code <charset class>$Decoder} class resolved by name in the constructor. */
    public Class<CharsetDecoder> decoderClass;
    /** Reflective handle on {@code decodeArrayLoop(ByteBuffer, CharBuffer)}. */
    public Method decodeArrayLoopMethod;
    /** Reflective handle on {@code decodeBufferLoop(ByteBuffer, CharBuffer)}. */
    public Method decodeBufferLoopMethod;

    /**
     * Wraps {@code decoder}, copying its charset and chars-per-byte settings,
     * and resolves the reflective handles on its loop methods.
     *
     * @param decoder the decoder to wrap
     * @throws Exception if the {@code $Decoder} class or one of its loop
     *         methods cannot be found or made accessible
     */
    public TestCharsetDecoder(CharsetDecoder decoder) throws Exception {
        super(decoder.charset(), decoder.averageCharsPerByte(), decoder.maxCharsPerByte());
        this.decoder = decoder;

        String implName = decoder.charset().getClass().getName() + "$Decoder";
        @SuppressWarnings("unchecked") // same unchecked cast as before, now confined to this local
        Class<CharsetDecoder> implClass = (Class<CharsetDecoder>) Class.forName(implName);
        this.decoderClass = implClass;

        this.decodeArrayLoopMethod = accessibleLoop("decodeArrayLoop");
        this.decodeBufferLoopMethod = accessibleLoop("decodeBufferLoop");
    }

    /** Finds the named {@code (ByteBuffer, CharBuffer)} method on {@link #decoderClass} and opens it up. */
    private Method accessibleLoop(String methodName) throws NoSuchMethodException {
        Method loop = decoderClass.getDeclaredMethod(methodName, ByteBuffer.class, CharBuffer.class);
        loop.setAccessible(true);
        return loop;
    }

    /**
     * Forwards to the wrapped decoder's {@code decodeLoop}.
     *
     * @param in  the input bytes
     * @param out the output characters
     * @return whatever the wrapped decoder returns
     */
    @Override
    public CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
        return decoder.decodeLoop(in, out);
    }

    /**
     * Reflectively invokes {@code decodeArrayLoop} on the wrapped decoder.
     *
     * @param in  the input bytes
     * @param out the output characters
     * @return the result returned by the invoked method
     * @throws Exception if the reflective call fails
     */
    public CoderResult decodeArrayLoop(ByteBuffer in, CharBuffer out) throws Exception {
        Object outcome = decodeArrayLoopMethod.invoke(decoder, in, out);
        return (CoderResult) outcome;
    }

    /**
     * Reflectively invokes {@code decodeBufferLoop} on the wrapped decoder.
     *
     * @param in  the input bytes
     * @param out the output characters
     * @return the result returned by the invoked method
     * @throws Exception if the reflective call fails
     */
    public CoderResult decodeBufferLoop(ByteBuffer in, CharBuffer out) throws Exception {
        Object outcome = decodeBufferLoopMethod.invoke(decoder, in, out);
        return (CoderResult) outcome;
    }
}

package sun.nio.cs;

import java.io.*;
import java.nio.*;
import java.nio.charset.*;
import java.util.*;
import org.junit.*;
import static org.junit.Assert.*;
import static org.junit.Assume.*;
import org.junit.runner.*;
import org.junit.runners.*;
import org.junit.runners.Parameterized.*;

/**
 * Parameterized JUnit test that runs sample byte sequences through the UTF-8
 * decoder and checks the input position and the {@link CoderResult} it reports.
 *
 * <p>Each parameter row is
 * {@code { input bytes, expected input position, expected loop result }}.
 *
 * @author Ulf.Zibis @ CoSoCo.de
 */
@RunWith(Parameterized.class)
public class UTF_8Test {

    // Short names for the expected results, to keep the sample table readable.
    private static final CoderResult MAL_1 = CoderResult.malformedForLength(1);
    private static final CoderResult MAL_2 = CoderResult.malformedForLength(2);
    private static final CoderResult MAL_3 = CoderResult.malformedForLength(3);
    private static final CoderResult UFLOW = CoderResult.UNDERFLOW;

    /** Sample rows: input bytes, expected input position, expected result of the loop methods. */
    private static final Object[][] PARAMETERS = new Object[][]{
        /*   0 */ { new byte[]{(byte)0xC0, (byte)0x40}, 0, MAL_1 }, // !UTF, '@'
        /*   1 */ { new byte[]{(byte)0xC1, (byte)0x41}, 0, MAL_1 }, // !UTF, 'A'
        /*   2 */ { new byte[]{(byte)0xC2, (byte)0x42}, 0, MAL_2 }, // UTF_21, 'B'
        /*   3 */ { new byte[]{(byte)0xC3, (byte)0xA0}, 2, UFLOW }, // UTF_21, UTF_22
        /*   4 */ { new byte[]{(byte)0xC0}, 0, MAL_1 }, // !UTF
        /*   5 */ { new byte[]{(byte)0xC1}, 0, MAL_1 }, // !UTF
        /*   6 */ { new byte[]{(byte)0xC2}, 0, UFLOW }, // UTF_21
        /*   7 */ { new byte[]{(byte)0xE0, (byte)0x80, (byte)0x80}, 0, MAL_2 }, // UTF_31, !UTF_32, UTF_33
        /*   8 */ { new byte[]{(byte)0xE1, (byte)0x80, (byte)0x42}, 0, MAL_3 }, // UTF_31, UTF_32, 'B'
        /*   9 */ { new byte[]{(byte)0xE0, (byte)0xA0, (byte)0x80}, 3, UFLOW }, // UTF_31, UTF_32, UTF_33
        /*  10 */ { new byte[]{(byte)0xE1, (byte)0x80, (byte)0x81}, 3, UFLOW }, // UTF_31, UTF_32, UTF_33
        /*  11 */ { new byte[]{(byte)0xE1, (byte)0x40}, 0, MAL_2 }, // UTF_31, '@'
        /*  12 */ { new byte[]{(byte)0xE0, (byte)0xA0}, 0, UFLOW }, // UTF_31, UTF_32
        /*  13 */ { new byte[]{(byte)0xE1}, 0, UFLOW }, // UTF_31
        /*  14 */ { new byte[]{(byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80}, 0, MAL_1 }, // 5 * !UTF
        /*  15 */ { new byte[]{(byte)0xC0, (byte)0xC0, (byte)0xC0, (byte)0xC0, (byte)0xC0}, 0, MAL_1 }, // 5 * !UTF
        /*  16 */ { new byte[]{(byte)0xC0, (byte)0xC0, (byte)0x42}, 0, MAL_1 }, // !UTF, !UTF, 'B'
    };

    /** Negative: run every row. Otherwise: run only the row with this index. */
    private static final int PROCESS_SINGLE = -1;

    private static final Charset CHARSET = Charset.forName("UTF-8");

    /** Running number printed in the per-row header. */
    private static int parametersInTest = 0;
    /**
     * Input array of the most recently constructed instance. Static so it outlives
     * the individual test instances and the header is printed once per row only.
     */
    private static byte[] lastInput;

    // Row under test.
    private final ByteBuffer inBytes;
    private final int expectedPos;
    private final CoderResult expected;

    // Per-test fixtures, created in setUp().
    private TestCharsetDecoder decoder;
    private CharBuffer outChars;

    /** Set at the very end of a test method; tearDown() prints " > OK" when it is true. */
    private boolean ok = false;

    /**
     * Stores one parameter row and prints a header with the input bytes in binary
     * the first time a given input array is seen.
     *
     * @param inBytes  the bytes to decode
     * @param position the input position expected after decoding
     * @param result   the result expected from the decoder's loop methods
     * @throws IOException never thrown by this body; kept so the signature is unchanged
     */
    public UTF_8Test(byte[] inBytes, int position, CoderResult result) throws IOException {
        this.inBytes = ByteBuffer.wrap(inBytes);
        this.expectedPos = position;
        this.expected = result;

        // Identity comparison on purpose: the same row array is handed to every
        // instance created for that row.
        if (lastInput != inBytes) {
            StringBuilder header = new StringBuilder("\n>>> PARAMETERS[")
                    .append(parametersInTest++)
                    .append("]: [");
            for (int i = 0; i < inBytes.length; i++) {
                if (i > 0) {
                    header.append(", ");
                }
                header.append(Integer.toBinaryString(inBytes[i] & 0xFF));
            }
            System.out.println(header.append("]"));
        }
        lastInput = inBytes;
    }

    /**
     * Supplies the parameter rows to the Parameterized runner.
     *
     * @return all of {@code PARAMETERS}, or only row {@code PROCESS_SINGLE} when
     *         that constant is non-negative
     */
    @Parameters
    public static Collection<Object[]> data() {
        Object[][] selected = PROCESS_SINGLE < 0
                ? PARAMETERS
                : new Object[][]{PARAMETERS[PROCESS_SINGLE]};
        return new ArrayList<Object[]>(Arrays.asList(selected));
    }

    /** Creates a fresh wrapped decoder and output buffer and rewinds the input. */
    @Before
    public void setUp() throws Exception {
        decoder = new TestCharsetDecoder(CHARSET.newDecoder());
        outChars = CharBuffer.allocate((int)(inBytes.capacity()*decoder.maxCharsPerByte()));
        inBytes.rewind();
    }

    /** Finishes the line started by the test method, appending " > OK" on success. */
    @After
    public void tearDown() throws Exception {
        System.out.println(ok ? " > OK" : "");
    }

    /** Checks {@code decodeArrayLoop} against the row; disabled via {@code @Ignore}. */
    @Ignore
    @Test
    public void testDecodeArrayLoop() throws Exception {
        announce("decodeArrayLoop");
        CoderResult result = decoder.decodeArrayLoop(inBytes, outChars);
        verifyLoopResult("decodeArrayLoop()", result);
    }

    /** Checks {@code decodeBufferLoop} against the row; disabled via {@code @Ignore}. */
    @Ignore
    @Test
    public void testDecodeBufferLoop() throws Exception {
        announce("decodeBufferLoop");
        CoderResult result = decoder.decodeBufferLoop(inBytes, outChars);
        verifyLoopResult("decodeBufferLoop()", result);
    }

    /** Checks {@code decodeLoop} against the row; disabled via {@code @Ignore}. */
    @Ignore
    @Test
    public void testDecodeLoop() throws CharacterCodingException {
        announce("decodeLoop");
        CoderResult result = decoder.decodeLoop(inBytes, outChars);
        verifyLoopResult("decodeLoop()", result);
    }

    /**
     * Checks the public {@code decode()} method twice on the same buffers: first
     * with {@code endOfInput == false}, then with {@code endOfInput == true}.
     */
    @Test
    public void testDecoder() throws CharacterCodingException {
        announce("decode");

        // For decode() any malformed row is expected to be reported with length 1,
        // whatever length the row lists (original note: "for pessimistic estimation").
        CoderResult expectedFromDecode = expected.isMalformed() ? MAL_1 : expected;

        CoderResult result = decoder.decode(inBytes, outChars, false);
        assertEquals("position", expectedPos, inBytes.position());
        assertEquals("decode()", expectedFromDecode, result);
        if (result.isOverflow()) {
            result.throwException();
        }

        result = decoder.decode(inBytes, outChars, true);
        assertEquals("position", expectedPos, inBytes.position());
        if (expected.isUnderflow() && inBytes.hasRemaining()) {
            // At end of input the left-over bytes are expected to be reported as malformed.
            assertEquals("decode()", CoderResult.malformedForLength(inBytes.remaining()), result);
        } else {
            assertEquals("decode()", expectedFromDecode, result);
        }
        if (result.isOverflow()) {
            result.throwException();
        }
        ok = true;
    }

    /** Prints the start of the progress line for the method under test. */
    private void announce(String methodName) {
        System.out.print("NOW TEST > ["+decoder.decoderClass.getName()+"] "+methodName);
    }

    /**
     * Shared checks for the three loop tests: input position, expected result,
     * and no overflow. Marks the test as passed when all of them hold.
     */
    private void verifyLoopResult(String label, CoderResult result) throws CharacterCodingException {
        assertEquals("position", expectedPos, inBytes.position());
        assertEquals(label, expected, result);
        if (result.isOverflow()) {
            result.throwException();
        }
        ok = true;
    }
}

---------- END SOURCE ----------

Comments
This one should have been fixed in JDK 8: new byte[]{(byte)0xC0} ---> CoderResult.malformedForLength(1); new byte[]{(byte)0xE1, (byte)0x40} ---> CoderResult.malformedForLength(1); new byte[]{(byte)0xE1, (byte)0x80, (byte)0x42} ---> CoderResult.malformedForLength(2). Note: the Unicode Standard's "Maximal subpart of an ill-formed subsequence" practice requires case (3) to return 2.
09-04-2014

EVALUATION (1) & (2) are the cases where the implementation checks sl/sp first and only then performs the validation. The result would be a "malformed" CoderResult if there were bytes after the leading byte (on the assumption that we may have a 2-byte form, whose validation then fails...). It looks like the current implementation gives us better performance and, given that we have been doing this for years (read: compatibility), I would prefer to keep it as-is. new byte[]{(byte)0xC0} ---> CoderResult.malformedForLength(1) vs new byte[]{(byte)0xC0} ---> CoderResult.UNDERFLOW (3) new byte[]{(byte)0xE1, (byte)0x80, (byte)0x42} ---> CoderResult.malformedForLength(1) vs new byte[]{(byte)0xE1, (byte)0x80, (byte)0x42} ---> CoderResult.malformedForLength(2): we had a similar discussion before regarding the "length" of malformed input.
20-01-2009