A DESCRIPTION OF THE REQUEST : Often the high 16 bits of a Unicode code point are given as a static final constant. This method would allow the HotSpot compiler to optimize better compared to using Character.toSurrogates(int,char[],int). Keep in mind that sometimes only the high or the low surrogate is needed, so the pair highSurrogate(), lowSurrogate() would be more flexible than toSurrogates(). JUSTIFICATION : Compare the disassembly outputs (sections EXPECTED, ACTUAL) of the HotSpot compiler for the given code snippets. 1.) Character.highSurrogate(char,char) reduces register pressure (needs 1 fewer CPU register). 2.) Character.highSurrogate(char,char) would perform better (needs 2 fewer CPU instructions). Besides application programmers, many charset decoders in package sun.nio.cs would benefit from it. EXPECTED VERSUS ACTUAL BEHAVIOR : EXPECTED - char[] C2 = new char[2]; void decode(char c) { char[] _C2 = C2; _C2[0] = Character.highSurrogate((char)0x2, c); _C2[1] = Character.lowSurrogate(c); } 0x00b8c3e2: mov 0x2c(%esi),%ebx ;*getfield C2 0x00b8c3e5: mov %edi,%ebp 0x00b8c3e7: shr $0xa,%ebp 0x00b8c3ea: add $0xd840,%ebp ;*iadd ; (0x2 << 6) + MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10) 0x00b8c3f0: mov 0x8(%ebx),%ecx ; implicit exception: dispatches to 0x00b8c4c1 0x00b8c3f3: cmp $0x1,%ecx 0x00b8c3f6: jbe 0x00b8c46d 0x00b8c3f8: mov %bp,0xc(%ebx) ;*castore 0x00b8c3fc: and $0x3ff,%edi 0x00b8c402: or $0xdc00,%edi 0x00b8c408: mov %di,0xe(%ebx) ;*castore ACTUAL - char[] C2 = new char[2]; void decode(char c) { Character.toSurrogates(0x20000 + c, C2, 0); } 0x00b89562: mov 0x2c(%esi),%eax ;*getfield C2 0x00b89565: mov %edi,%ecx 0x00b89567: add $0x20000,%ecx ;*iadd 0x00b8956d: mov %ecx,%ebp 0x00b8956f: shr $0xa,%ebp 0x00b89572: add $0xd7c0,%ebp ;*iadd ; MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10) 0x00b89578: mov 0x8(%eax),%ebx ; implicit exception: dispatches to 0x00b89649 0x00b8957b: cmp $0x1,%ebx 0x00b8957e: jbe 0x00b895f5 0x00b89580: mov %bp,0xc(%eax) ;*castore
0x00b89584: and $0x3ff,%ecx 0x00b8958a: or $0xdc00,%ecx 0x00b89590: mov %cx,0xe(%eax) ;*castore
---------- BEGIN SOURCE ----------
/**
 * Converts the specified supplementary character (Unicode code point) to
 * the 2 {@code char}s of its UTF-16 representation as a surrogate pair and
 * stores them at {@code dst[dstIndex]} (high surrogate) and
 * {@code dst[dstIndex+1]} (low surrogate).
 *
 * <p><b>Note:</b> Does not check whether the code point has a surrogate
 * pair representation at all, i.e. whether it lies in the range
 * U+10000 ... U+10FFFF. For any other value an invalid result is stored.
 * To avoid this, check the argument with
 * {@link #isSupplementaryCodePoint(int)} before calling.
 *
 * @param  codePoint a supplementary Unicode code point
 * @param  dst an array of {@code char} in which the {@code codePoint}'s
 *         UTF-16 surrogate pair is stored.
 * @param  dstIndex the start index into the {@code dst} array where the
 *         converted value is stored.
 * @return the number of written chars, always 2
 * @throws IndexOutOfBoundsException if {@code dstIndex} is negative or if
 *         {@code dst} at {@code dstIndex} doesn't have enough array
 *         elements to store the 2 {@code char} values. In either case
 *         {@code dst} is left unmodified.
 */
static int toSurrogates(int codePoint, char[] dst, int dstIndex) {
    // Reject a negative index up front: with dstIndex == -1 the store to
    // dst[dstIndex+1] (i.e. dst[0]) below would succeed before the store to
    // dst[dstIndex] fails, leaving a half-written pair behind. For a
    // constant non-negative index this test folds away at compile time.
    if (dstIndex < 0) {
        throw new ArrayIndexOutOfBoundsException(dstIndex);
    }
    // We write elements "backwards" to guarantee all-or-nothing when
    // dstIndex is too large: the higher index is range-checked first.
    dst[dstIndex+1] = lowSurrogate(codePoint);
    dst[dstIndex] = highSurrogate(codePoint);
    return 2;
}

/**
 * Converts the specified supplementary character (Unicode code point) to
 * the 1st {@code char} (high surrogate) of its UTF-16 representation as a
 * surrogate pair.
 *
 * <p><b>Note:</b> Does not check whether the code point is outside the
 * range that has a surrogate pair representation, i.e. below U+10000 or
 * above U+10FFFF. In that case an invalid result is returned.
 * To avoid this, check the argument with
 * {@link #isSupplementaryCodePoint(int)} before calling.
 *
 * @param  codePoint a supplementary Unicode code point
 * @return the 1st {@code char} of {@code codePoint}'s UTF-16 representation.
 * @since  1.7
 */
public static char highSurrogate(int codePoint) {
    // The parenthesized term is a compile-time constant, so only a shift
    // and an add remain at run time.
    return (char)((codePoint >>> 10)
            + (MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10)));
}

/**
 * Converts the character (Unicode code point) given as its high and low
 * 16 bits, i.e. the code point {@code (highCPWord << 16) | lowCPWord}, to
 * the 1st {@code char} (high surrogate) of its UTF-16 representation as a
 * surrogate pair.
 * In case of having {@code highCPWord} as a {@code static final} constant,
 * the whole term depending on it can be folded into a single constant
 * once this method is inlined by the virtual machine's byte code compiler.
 *
 * <p><b>Note:</b> Does not check whether the combined code point is
 * outside the range that has a surrogate pair representation, i.e. whether
 * {@code highCPWord} is 0 or greater than 0x10. In that case an invalid
 * result is returned.
 *
 * @param  highCPWord high 16 bit of a supplementary Unicode code point
 * @param  lowCPWord low 16 bit of a supplementary Unicode code point
 * @return the 1st {@code char} of the UTF-16 representation of the code
 *         point formed by {@code highCPWord} and {@code lowCPWord}.
 * @since  1.7
 */
public static char highSurrogate(char highCPWord, char lowCPWord) {
    // ((high << 16 | low) >>> 10) == (high << 6) + (low >>> 10); everything
    // except (lowCPWord >>> 10) is constant when highCPWord is constant.
    return (char)((lowCPWord >>> 10)
            + ((highCPWord << 6) + MIN_HIGH_SURROGATE
               - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10)));
}

/**
 * Converts the specified supplementary character (Unicode code point) to
 * the 2nd {@code char} (low surrogate) of its UTF-16 representation as a
 * surrogate pair.
 *
 * <p>Only the low 10 bits of the argument contribute to the result, so
 * passing just the low 16 bits of the code point yields the same value.
 *
 * <p><b>Note:</b> Does not check whether the code point is outside the
 * range that has a surrogate pair representation, i.e. below U+10000 or
 * above U+10FFFF. In that case the returned {@code char} does not belong
 * to a valid UTF-16 representation of the argument.
 * To avoid this, check the argument with
 * {@link #isSupplementaryCodePoint(int)} before calling.
 *
 * @param  codePoint a supplementary Unicode code point
 * @return the 2nd {@code char} of {@code codePoint}'s UTF-16 representation.
 * @since  1.7
 */
public static char lowSurrogate(int codePoint) {
    return (char)((codePoint & 0x03FF) | MIN_LOW_SURROGATE);
}

ALTERNATIVE:
/**
 * Converts the character (Unicode code point) given as its high and low
 * 16 bits, i.e. the code point {@code (highCPWord << 16) | lowCPWord}, to
 * the 2 {@code char}s of its UTF-16 representation as a surrogate pair and
 * stores them at {@code dst[dstIndex]} (high surrogate) and
 * {@code dst[dstIndex+1]} (low surrogate).
 * In case of having {@code highCPWord} as a {@code static final} constant,
 * the whole term depending on it can be folded into a single constant
 * once this method is inlined by the virtual machine's byte code compiler.
 *
 * <p><b>Note:</b> Does not check whether the combined code point is
 * outside the range that has a surrogate pair representation, i.e. whether
 * {@code highCPWord} is 0 or greater than 0x10. In that case an invalid
 * result is stored.
 *
 * @param  highCPWord high 16 bit of a supplementary Unicode code point
 * @param  lowCPWord low 16 bit of a supplementary Unicode code point
 * @param  dst an array of {@code char} in which the code point's
 *         UTF-16 surrogate pair is stored.
 * @param  dstIndex the start index into the {@code dst} array where the
 *         converted value is stored.
 * @return the number of written chars, always 2
 * @throws IndexOutOfBoundsException if {@code dstIndex} is negative or if
 *         {@code dst} at {@code dstIndex} doesn't have enough array
 *         elements to store the 2 {@code char} values. In either case
 *         {@code dst} is left unmodified.
 */
static int toSurrogates(char highCPWord, char lowCPWord, char[] dst, int dstIndex) {
    // Reject a negative index up front: with dstIndex == -1 the store to
    // dst[dstIndex+1] (i.e. dst[0]) below would succeed before the store to
    // dst[dstIndex] fails, leaving a half-written pair behind. For a
    // constant non-negative index this test folds away at compile time.
    if (dstIndex < 0) {
        throw new ArrayIndexOutOfBoundsException(dstIndex);
    }
    // We write elements "backwards" to guarantee all-or-nothing when
    // dstIndex is too large: the higher index is range-checked first.
    dst[dstIndex+1] = (char)((lowCPWord & 0x03FF) | MIN_LOW_SURROGATE);
    dst[dstIndex] = (char)((lowCPWord >>> 10)
            + ((highCPWord << 6) + MIN_HIGH_SURROGATE
               - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10)));
    return 2;
}
---------- END SOURCE ----------
|