A DESCRIPTION OF THE REQUEST :
Often the high 16 bits of a Unicode code point are given as a static final constant. The proposed method Character.highSurrogate(char,char) would allow the HotSpot compiler to optimize better than when using Character.toSurrogates(int,char[],int).
Keep in mind that sometimes only the high or the low surrogate is needed, so the pair highSurrogate(), lowSurrogate() would be more flexible than toSurrogates().
JUSTIFICATION :
Compare the disassembly outputs (sections EXPECTED, ACTUAL) of HotSpot compiler for the given code snippets.
1.) Character.highSurrogate(char,char) reduces register pressure (needs one fewer CPU register).
2.) Character.highSurrogate(char,char) would perform better (needs two fewer CPU instructions).
Besides application programmers, many charset decoders in package sun.nio.cs would benefit from this.
EXPECTED VERSUS ACTUAL BEHAVIOR :
EXPECTED -
// Proposed usage: stores the surrogate pair for code point 0x20000 + c in C2,
// passing the constant high word 0x2 separately from the variable low word c.
char[] C2 = new char[2];
void decode(char c) {
// Read the field once into a local before the two array stores.
char[] _C2 = C2;
_C2[0] = Character.highSurrogate((char)0x2, c);
_C2[1] = Character.lowSurrogate(c);
}
0x00b8c3e2: mov 0x2c(%esi),%ebx ;*getfield C2
0x00b8c3e5: mov %edi,%ebp
0x00b8c3e7: shr $0xa,%ebp
0x00b8c3ea: add $0xd840,%ebp ;*iadd
; (0x2 << 6) + MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10)
0x00b8c3f0: mov 0x8(%ebx),%ecx ; implicit exception: dispatches to 0x00b8c4c1
0x00b8c3f3: cmp $0x1,%ecx
0x00b8c3f6: jbe 0x00b8c46d
0x00b8c3f8: mov %bp,0xc(%ebx) ;*castore
0x00b8c3fc: and $0x3ff,%edi
0x00b8c402: or $0xdc00,%edi
0x00b8c408: mov %di,0xe(%ebx) ;*castore
ACTUAL -
// Current usage: stores the surrogate pair for code point 0x20000 + c in C2.
char[] C2 = new char[2];
void decode(char c) {
// The constant 0x20000 is added to c first; the sum is then split into surrogates.
Character.toSurrogates(0x20000 + c, C2, 0);
}
0x00b89562: mov 0x2c(%esi),%eax ;*getfield C2
0x00b89565: mov %edi,%ecx
0x00b89567: add $0x20000,%ecx ;*iadd
0x00b8956d: mov %ecx,%ebp
0x00b8956f: shr $0xa,%ebp
0x00b89572: add $0xd7c0,%ebp ;*iadd
; MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10)
0x00b89578: mov 0x8(%eax),%ebx ; implicit exception: dispatches to 0x00b89649
0x00b8957b: cmp $0x1,%ebx
0x00b8957e: jbe 0x00b895f5
0x00b89580: mov %bp,0xc(%eax) ;*castore
0x00b89584: and $0x3ff,%ecx
0x00b8958a: or $0xdc00,%ecx
0x00b89590: mov %cx,0xe(%eax) ;*castore
---------- BEGIN SOURCE ----------
/**
 * Converts the specified character (Unicode code point) to the two
 * {@code char}s of its UTF-16 representation as a surrogate pair and
 * stores them in {@code dst[dstIndex]} (high surrogate) and
 * {@code dst[dstIndex+1]} (low surrogate).
 *
 * <p><b>Note:</b> Does not check whether the code point lies outside the
 * range that has a surrogate pair representation, i.e. &lt; U+10000 or
 * &gt; U+10FFFF; in that case an invalid result is stored.
 * To avoid this, check the code point with
 * {@link #isSupplementaryCodePoint(int)} beforehand.
 *
 * @param codePoint a Unicode code point
 * @param dst an array of {@code char} in which the {@code codePoint}'s
 *        UTF-16 surrogate pair is stored.
 * @param dstIndex the start index into the {@code dst} array where the
 *        converted value is stored.
 * @return the number of written chars, always 2
 * @throws IndexOutOfBoundsException if {@code dstIndex} is negative or if
 *         {@code dst} at {@code dstIndex} doesn't have enough array
 *         elements to store the 2 {@code char} values; {@code dst} is
 *         left unmodified in that case.
 */
static int toSurrogates(int codePoint, char[] dst, int dstIndex) {
    // Writing "backwards" alone is not all-or-nothing: for dstIndex == -1
    // the store to dst[0] would succeed before the store to dst[-1] fails.
    // Reject negative indices before touching the array.
    if (dstIndex < 0) {
        throw new ArrayIndexOutOfBoundsException(dstIndex);
    }
    // An index that is too large fails on this first store, so nothing
    // has been written yet when the exception is thrown.
    dst[dstIndex + 1] = lowSurrogate(codePoint);
    dst[dstIndex] = highSurrogate(codePoint);
    return 2;
}
/**
 * Returns the leading (high) surrogate, i.e. the first {@code char} of the
 * UTF-16 surrogate pair that represents the specified character
 * (Unicode code point).
 *
 * <p><b>Note:</b> No validation is performed. If the code point has no
 * surrogate pair representation, i.e. it is &lt; U+10000 or &gt; U+10FFFF,
 * the returned value is invalid. Callers that cannot rule this out should
 * test the code point with {@link #isSupplementaryCodePoint(int)} first.
 *
 * @param codePoint a Unicode code point
 * @return the 1st {@code char} of {@code codePoint}'s UTF-16 representation.
 * @since 1.7
 */
public static char highSurrogate(int codePoint) {
    // Compile-time constant: shifts the range of (codePoint >>> 10) so that
    // U+10000 maps onto MIN_HIGH_SURROGATE.
    final int offset = MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10);
    final int lead = (codePoint >>> 10) + offset;
    return (char) lead;
}
/**
 * Returns the leading (high) surrogate, i.e. the first {@code char} of the
 * UTF-16 surrogate pair that represents the Unicode code point
 * {@code (highCPWord << 16) | lowCPWord}.
 * If {@code highCPWord} is a {@code static final} constant, the part of
 * the computation that depends on it can be folded into a constant once
 * the virtual machine's compiler has inlined this method.
 *
 * <p><b>Note:</b> No validation is performed. If the combined code point
 * has no surrogate pair representation, i.e. it is &lt; U+10000 or
 * &gt; U+10FFFF ({@code highCPWord} outside 0x0001 ... 0x0010), the
 * returned value is invalid.
 *
 * @param highCPWord high 16 bit of a Unicode code point
 * @param lowCPWord low 16 bit of a Unicode code point
 * @return the 1st {@code char} of the code point's UTF-16 representation.
 * @since 1.7
 */
public static char highSurrogate(char highCPWord, char lowCPWord) {
    // (highCPWord << 6) + (lowCPWord >>> 10) equals (codePoint >>> 10);
    // everything except the lowCPWord term depends on highCPWord only.
    final int highPart = (highCPWord << 6)
            + MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10);
    final int lowPart = lowCPWord >>> 10;
    return (char) (lowPart + highPart);
}
/**
 * Returns the trailing (low) surrogate, i.e. the second {@code char} of the
 * UTF-16 surrogate pair that represents the specified character
 * (Unicode code point). Only the low 10 bits of {@code codePoint}
 * contribute to the result.
 *
 * <p><b>Note:</b> No validation is performed. If the code point has no
 * surrogate pair representation, i.e. it is &lt; U+10000 or &gt; U+10FFFF,
 * the returned value does not belong to a valid encoding of it. Callers
 * that cannot rule this out should test the code point with
 * {@link #isSupplementaryCodePoint(int)} first.
 *
 * @param codePoint a Unicode code point
 * @return the 2nd {@code char} of {@code codePoint}'s UTF-16 representation.
 * @since 1.7
 */
public static char lowSurrogate(int codePoint) {
    final int lowTenBits = codePoint & 0x03FF;
    final int trail = MIN_LOW_SURROGATE | lowTenBits;
    return (char) trail;
}
ALTERNATIVE:
/**
 * Converts the Unicode code point {@code (highCPWord << 16) | lowCPWord}
 * to the two {@code char}s of its UTF-16 representation as a surrogate
 * pair and stores them in {@code dst[dstIndex]} (high surrogate) and
 * {@code dst[dstIndex+1]} (low surrogate).
 * In case of having {@code highCPWord} as a {@code static final} constant,
 * the part of the computation that depends on it can be folded into a
 * constant once the virtual machine's compiler has inlined this method.
 *
 * <p><b>Note:</b> Does not check whether the combined code point lies
 * outside the range that has a surrogate pair representation, i.e.
 * &lt; U+10000 or &gt; U+10FFFF ({@code highCPWord} outside
 * 0x0001 ... 0x0010); in that case an invalid result is stored.
 *
 * @param highCPWord high 16 bit of a Unicode code point
 * @param lowCPWord low 16 bit of a Unicode code point
 * @param dst an array of {@code char} in which the code point's
 *        UTF-16 surrogate pair is stored.
 * @param dstIndex the start index into the {@code dst} array where the
 *        converted value is stored.
 * @return the number of written chars, always 2
 * @throws IndexOutOfBoundsException if {@code dstIndex} is negative or if
 *         {@code dst} at {@code dstIndex} doesn't have enough array
 *         elements to store the 2 {@code char} values; {@code dst} is
 *         left unmodified in that case.
 */
static int toSurrogates(char highCPWord, char lowCPWord, char[] dst, int dstIndex) {
    // Writing "backwards" alone is not all-or-nothing: for dstIndex == -1
    // the store to dst[0] would succeed before the store to dst[-1] fails.
    // Reject negative indices before touching the array.
    if (dstIndex < 0) {
        throw new ArrayIndexOutOfBoundsException(dstIndex);
    }
    // An index that is too large fails on this first store, so nothing
    // has been written yet when the exception is thrown.
    dst[dstIndex + 1] = (char) ((lowCPWord & 0x03FF) | MIN_LOW_SURROGATE);
    // (highCPWord << 6) + (lowCPWord >>> 10) equals (codePoint >>> 10).
    dst[dstIndex] = (char) ((lowCPWord >>> 10) + ((highCPWord << 6) +
            MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10)));
    return 2;
}
---------- END SOURCE ----------