JDK-8073700 : XMLStreamWriter outputs Unicode extended characters (non-BMP) incorrectly
  • Type: Bug
  • Component: xml
  • Sub-Component: javax.xml.stream
  • Affected Version: 8u31
  • Priority: P4
  • Status: Closed
  • Resolution: Duplicate
  • OS: windows_7
  • CPU: x86_64
  • Submitted: 2015-02-21
  • Updated: 2016-05-25
  • Resolved: 2016-05-25
Related Reports
Duplicate :  
Description
FULL PRODUCT VERSION :
java version "1.8.0_31"
Java(TM) SE Runtime Environment (build 1.8.0_31-b13)
Java HotSpot(TM) 64-Bit Server VM (build 25.31-b07, mixed mode)

ADDITIONAL OS VERSION INFORMATION :
Microsoft Windows [Version 6.1.7601]

A DESCRIPTION OF THE PROBLEM :
Writing character data with XMLStreamWriter produces incorrect output for Unicode characters outside the BMP, i.e. supplementary code points for which Character.charCount(int) != 1.

STEPS TO FOLLOW TO REPRODUCE THE PROBLEM :
Run the following example, which attempts to write the character U+10480 (𐒀) wrapped in an <el> element:

import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;

import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;

/**
 * Reproducer: writes a supplementary (non-BMP) character as element text
 * through {@link XMLStreamWriter} and prints the resulting document to
 * standard output so the serialized form of the character can be inspected.
 */
public class XmlStreamWriterExtendedCharactersFail {
    /** Code point outside the BMP; it needs two UTF-16 code units. */
    private static final int NON_BMP_CODE_POINT = 0x10480;

    /**
     * Prints the test string, then an XML 1.1 document containing it inside
     * an {@code <el>} element.
     *
     * @param args ignored
     * @throws XMLStreamException if the StAX writer reports a failure
     */
    public static void main(String[] args) throws XMLStreamException {
        // Same text as sbStr, spelled with a surrogate-pair escape so the
        // sanity check below holds regardless of the source file encoding.
        String inlineStr = "sbStr = \uD801\uDC80";
        // create string using StringBuilder to avoid Java file encoding confusion:
        String sbStr = new StringBuilder("sbStr = ").appendCodePoint(NON_BMP_CODE_POINT).toString();
        // Only evaluated when assertions are enabled (java -ea).
        assert sbStr.equals(inlineStr);
        System.out.println(sbStr);

        OutputStreamWriter outWriter = new OutputStreamWriter(System.out,
                StandardCharsets.UTF_8.newEncoder());
        XMLStreamWriter writer = XMLOutputFactory.newFactory()
                .createXMLStreamWriter(outWriter);
        writer.writeStartDocument("UTF-8", "1.1");
        writer.writeStartElement("el");
        writer.writeCharacters(sbStr);
        writer.writeEndElement();
        writer.writeEndDocument();
        writer.close();
    }
}

EXPECTED VERSUS ACTUAL BEHAVIOR :
EXPECTED -
The following output:

sbStr = 𐒀
<?xml version="1.1" encoding="UTF-8"?><el>sbStr = 𐒀</el>
ACTUAL -
sbStr = 𐒀
<?xml version="1.1" encoding="UTF-8"?><el>sbStr = &#xd801;&#xdc80;</el>

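Note that &#xd801;&#xdc80; are character references to the two UTF-16 surrogate code units rather than to the code point U+10480. Surrogates are not valid XML characters, so this output causes an error when it is parsed (for example with SAX).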

REPRODUCIBILITY :
This bug can be reproduced always.

---------- BEGIN SOURCE ----------
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;

import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;

/**
 * Reproducer: writes a supplementary (non-BMP) character as element text
 * through {@link XMLStreamWriter} and prints the resulting document to
 * standard output so the serialized form of the character can be inspected.
 */
public class XmlStreamWriterExtendedCharactersFail {
    /** Code point outside the BMP; it needs two UTF-16 code units. */
    private static final int NON_BMP_CODE_POINT = 0x10480;

    /**
     * Prints the test string, then an XML 1.1 document containing it inside
     * an {@code <el>} element.
     *
     * @param args ignored
     * @throws XMLStreamException if the StAX writer reports a failure
     */
    public static void main(String[] args) throws XMLStreamException {
        // Same text as sbStr, spelled with a surrogate-pair escape so the
        // sanity check below holds regardless of the source file encoding.
        String inlineStr = "sbStr = \uD801\uDC80";
        // create string using StringBuilder to avoid Java file encoding confusion:
        String sbStr = new StringBuilder("sbStr = ").appendCodePoint(NON_BMP_CODE_POINT).toString();
        // Only evaluated when assertions are enabled (java -ea).
        assert sbStr.equals(inlineStr);
        System.out.println(sbStr);

        OutputStreamWriter outWriter = new OutputStreamWriter(System.out,
                StandardCharsets.UTF_8.newEncoder());
        XMLStreamWriter writer = XMLOutputFactory.newFactory()
                .createXMLStreamWriter(outWriter);
        writer.writeStartDocument("UTF-8", "1.1");
        writer.writeStartElement("el");
        writer.writeCharacters(sbStr);
        writer.writeEndElement();
        writer.writeEndDocument();
        writer.close();
    }
}
---------- END SOURCE ----------

CUSTOMER SUBMITTED WORKAROUND :
workaround: wrapper around XMLStreamWriter:

import java.io.IOException;
import java.io.Writer;
import java.nio.CharBuffer;

import javax.xml.namespace.NamespaceContext;
import javax.xml.stream.FactoryConfigurationError;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;

import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.text.translate.CharSequenceTranslator;

/**
 * {@link XMLStreamWriter} decorator that takes over the writing of character
 * data: text passed to either {@code writeCharacters} overload is escaped with
 * {@link StringEscapeUtils#ESCAPE_XML11} and written straight to the
 * underlying {@link Writer}, bypassing the wrapped stream writer. Every other
 * operation, including attribute values, is delegated unchanged.
 *
 * <p>NOTE(review): because text goes directly to the {@code Writer}, output
 * order is only correct if the wrapped stream writer does not hold back
 * earlier output in a buffer of its own -- confirm for the implementation in
 * use.
 */
public class UnicodeXMLStreamWriter implements XMLStreamWriter {
    private final XMLStreamWriter xmlSW;
    private final Writer writer;
    private final CharSequenceTranslator characterEscaper = StringEscapeUtils.ESCAPE_XML11;

    /**
     * Creates a wrapper around a stream writer obtained from
     * {@link XMLOutputFactory#newFactory()}.
     *
     * @param writer destination for the XML output
     * @return a new wrapper writing to {@code writer}
     * @throws XMLStreamException if the factory cannot create a stream writer
     * @throws FactoryConfigurationError if no factory can be instantiated
     */
    public static UnicodeXMLStreamWriter newInstance(Writer writer) throws XMLStreamException, FactoryConfigurationError {
        return newInstance(writer, XMLOutputFactory.newFactory());
    }

    /**
     * Creates a wrapper around a stream writer obtained from the given factory.
     *
     * @param writer destination for the XML output
     * @param factory factory used to create the wrapped stream writer
     * @return a new wrapper writing to {@code writer}
     * @throws XMLStreamException if the factory cannot create a stream writer
     */
    public static UnicodeXMLStreamWriter newInstance(Writer writer, XMLOutputFactory factory) throws XMLStreamException {
        XMLStreamWriter xmlSW = factory.createXMLStreamWriter(writer);
        return new UnicodeXMLStreamWriter(writer, xmlSW);
    }

    /**
     * @param writer the writer that {@code xmlSW} writes to; escaped text is
     *        written to it directly
     * @param xmlSW the stream writer that handles everything except text
     * @throws NullPointerException if either argument is {@code null}
     */
    public UnicodeXMLStreamWriter(Writer writer, XMLStreamWriter xmlSW) {
        // Fail here rather than on the first write call.
        this.writer = java.util.Objects.requireNonNull(writer, "writer");
        this.xmlSW = java.util.Objects.requireNonNull(xmlSW, "xmlSW");
    }

    /**
     * Escapes {@code text} and writes it directly to the underlying writer.
     *
     * @param text character data to write
     * @throws XMLStreamException if the delegate or the underlying writer fails
     */
    @Override
    public void writeCharacters(String text) throws XMLStreamException {
        writeEscaped(text);
    }

    /**
     * Escapes {@code len} characters of {@code text} beginning at
     * {@code start} and writes them directly to the underlying writer.
     *
     * @param text source array
     * @param start index of the first character to write
     * @param len number of characters to write
     * @throws XMLStreamException if the delegate or the underlying writer fails
     */
    @Override
    public void writeCharacters(char[] text, int start, int len)
            throws XMLStreamException {
        writeEscaped(CharBuffer.wrap(text, start, len));
    }

    /** Shared implementation of both {@code writeCharacters} overloads. */
    private void writeEscaped(CharSequence text) throws XMLStreamException {
        // finish writing start element
        xmlSW.writeCharacters("");
        try {
            characterEscaper.translate(text, writer);
        } catch (IOException e) {
            throw new XMLStreamException(e);
        }
    }

    //////////////// REMAINING METHODS ARE DELEGATES to xmlSW ////////////////

    @Override
    public void writeStartElement(String localName) throws XMLStreamException {
        xmlSW.writeStartElement(localName);
    }

    @Override
    public void writeStartElement(String namespaceURI, String localName)
            throws XMLStreamException {
        xmlSW.writeStartElement(namespaceURI, localName);
    }

    @Override
    public void writeStartElement(String prefix, String localName,
            String namespaceURI) throws XMLStreamException {
        xmlSW.writeStartElement(prefix, localName, namespaceURI);
    }

    @Override
    public void writeEmptyElement(String namespaceURI, String localName)
            throws XMLStreamException {
        xmlSW.writeEmptyElement(namespaceURI, localName);
    }

    @Override
    public void writeEmptyElement(String prefix, String localName,
            String namespaceURI) throws XMLStreamException {
        xmlSW.writeEmptyElement(prefix, localName, namespaceURI);
    }

    @Override
    public void writeEmptyElement(String localName) throws XMLStreamException {
        xmlSW.writeEmptyElement(localName);
    }

    @Override
    public void writeEndElement() throws XMLStreamException {
        xmlSW.writeEndElement();
    }

    @Override
    public void writeEndDocument() throws XMLStreamException {
        xmlSW.writeEndDocument();
    }

    @Override
    public void close() throws XMLStreamException {
        xmlSW.close();
    }

    @Override
    public void flush() throws XMLStreamException {
        xmlSW.flush();
    }

    @Override
    public void writeAttribute(String localName, String value)
            throws XMLStreamException {
        xmlSW.writeAttribute(localName, value);
    }

    @Override
    public void writeAttribute(String prefix, String namespaceURI,
            String localName, String value) throws XMLStreamException {
        xmlSW.writeAttribute(prefix, namespaceURI, localName, value);
    }

    @Override
    public void writeAttribute(String namespaceURI, String localName,
            String value) throws XMLStreamException {
        xmlSW.writeAttribute(namespaceURI, localName, value);
    }

    @Override
    public void writeNamespace(String prefix, String namespaceURI)
            throws XMLStreamException {
        xmlSW.writeNamespace(prefix, namespaceURI);
    }

    @Override
    public void writeDefaultNamespace(String namespaceURI)
            throws XMLStreamException {
        xmlSW.writeDefaultNamespace(namespaceURI);
    }

    @Override
    public void writeComment(String data) throws XMLStreamException {
        xmlSW.writeComment(data);
    }

    @Override
    public void writeProcessingInstruction(String target)
            throws XMLStreamException {
        xmlSW.writeProcessingInstruction(target);
    }

    @Override
    public void writeProcessingInstruction(String target, String data)
            throws XMLStreamException {
        xmlSW.writeProcessingInstruction(target, data);
    }

    @Override
    public void writeCData(String data) throws XMLStreamException {
        xmlSW.writeCData(data);
    }

    @Override
    public void writeDTD(String dtd) throws XMLStreamException {
        xmlSW.writeDTD(dtd);
    }

    @Override
    public void writeEntityRef(String name) throws XMLStreamException {
        xmlSW.writeEntityRef(name);
    }

    @Override
    public void writeStartDocument() throws XMLStreamException {
        xmlSW.writeStartDocument();
    }

    @Override
    public void writeStartDocument(String version) throws XMLStreamException {
        xmlSW.writeStartDocument(version);
    }

    @Override
    public void writeStartDocument(String encoding, String version)
            throws XMLStreamException {
        xmlSW.writeStartDocument(encoding, version);
    }

    @Override
    public String getPrefix(String uri) throws XMLStreamException {
        return xmlSW.getPrefix(uri);
    }

    @Override
    public void setPrefix(String prefix, String uri) throws XMLStreamException {
        xmlSW.setPrefix(prefix, uri);
    }

    @Override
    public void setDefaultNamespace(String uri) throws XMLStreamException {
        xmlSW.setDefaultNamespace(uri);
    }

    @Override
    public void setNamespaceContext(NamespaceContext context)
            throws XMLStreamException {
        xmlSW.setNamespaceContext(context);
    }

    @Override
    public NamespaceContext getNamespaceContext() {
        return xmlSW.getNamespaceContext();
    }

    @Override
    public Object getProperty(String name) throws IllegalArgumentException {
        return xmlSW.getProperty(name);
    }
}


Comments
The reported problem was resolved by JDK-8145974. Provided reproducer now generates correct character reference for Unicode pair: &#x10480;
25-05-2016

Checked this on Windows 7 (NetBeans) with JDK 8u25, 8u31, 8u40 ea b23 and 9 ea b50; the issue is reproducible on all of them.
24-02-2015