JDK-8145322 : Code generated from unsafe loops can be slightly improved
  • Type: Enhancement
  • Component: hotspot
  • Sub-Component: compiler
  • Affected Version: 9
  • Priority: P4
  • Status: Resolved
  • Resolution: Fixed
  • Submitted: 2015-12-14
  • Updated: 2017-10-31
  • Resolved: 2016-01-12
The Version table provides details about the release in which this issue/RFE will be addressed.

Unresolved: Release in which this issue/RFE will be addressed.
Resolved: Release in which this issue/RFE has been resolved.
Fixed: Release in which this issue/RFE has been fixed. The release containing this fix may be available for download as an Early Access Release or a General Availability Release.

To download the current JDK release, click here.
JDK 9
9 b105 Fixed
Related Reports
Relates :  
Relates :  
Relates :  
Relates :  
Relates :  
Relates :  
Description
Paul spotted the following small inefficiencies:

        for (; wi < l; wi++) {
            long bi = ((long) Objects.checkIndex(wi, l, null)) << LOG2_ARRAY_LONG_INDEX_SCALE;
            long av = U.getLongUnaligned(a, aOffset + bi);
            long bv = U.getLongUnaligned(b, bOffset + bi);
            if (av != bv) {

is compiled as:

0b0   B9: #     B28 B10 <- B8 B13       Loop: B9-B13 inner main of N130 Freq: 977.661
0b0     movl    RDX, RDI        # spill
0b2     # castII of RDX
0b2     movq    RBX, [R9 + #16 + RDX << #3]     # long
0b7     movq    RAX, [RSI + #16 + RDX << #3]    # long
0bc     cmpq    RBX, RAX
0bf     jne     B28  P=0.000000 C=7836.000000
0bf
0c5   B10: #    B28 B11 <- B9  Freq: 977.66
0c5     movl    RDX, RDI        # spill
0c7     incl    RDX     # int
0c9     # castII of RDX
0c9     movq    RBX, [R9 + #16 + RDX << #3]     # long
0ce     movq    RAX, [RSI + #16 + RDX << #3]    # long
0d3     cmpq    RBX, RAX
0d6     jne     B28  P=0.000000 C=7836.000000
0d6
0dc   B11: #    B28 B12 <- B10  Freq: 977.66
0dc     movl    RDX, RDI        # spill
0de     addl    RDX, #2 # int 
0e1     # castII of RDX 
0e1     movq    RBX, [R9 + #16 + RDX << #3]     # long
0e6     movq    RAX, [RSI + #16 + RDX << #3]    # long
0eb     cmpq    RBX, RAX
0ee     jne     B28  P=0.000000 C=7836.000000
0ee
0f4   B12: #    B28 B13 <- B11  Freq: 977.659
0f4     movl    RDX, RDI        # spill
0f6     addl    RDX, #3 # int
0f9     # castII of RDX
0f9     movq    RBX, [R9 + #16 + RDX << #3]     # long
0fe     movq    RAX, [RSI + #16 + RDX << #3]    # long
103     cmpq    RBX, RAX
106     jne     B28  P=0.000000 C=7836.000000
106
10c   B13: #    B9 B14 <- B12  Freq: 977.659
10c     addl    RDI, #4 # int
10f     cmpl    RDI, RBP
111     jl,s   B9       # loop end  P=0.998980 C=7836.000000

But the intermediate increments of the induction variable should be folded into the address computations of the memory accesses.
This loop:
        for (; wi < length >> valuesPerWidth; wi++) {
            long bi = ((long) wi) << LOG2_ARRAY_LONG_INDEX_SCALE;
            long av = U.getLongUnaligned(a, aOffset + bi);
            long bv = U.getLongUnaligned(b, bOffset + bi);
            if (av != bv) {

with length being an array length, is compiled as:

0b0   B7: #     B32 B8 <- B6 B15        Loop: B7-B15 inner main of N123 Freq: 975.843
0b0     movslq  R8, RSI # i2l
0b3     movq    RAX, [RDX + #16 + R8 << #3]     # long
0b8     movq    RDI, [RBP + #16 + R8 << #3]     # long
0bd     cmpq    RAX, RDI 
0c0     jne     B32  P=0.000000 C=7836.000000
0c0
0c6   B8: #     B33 B9 <- B7  Freq: 975.842
0c6     movl    R8, RSI # spill
0c9     incl    R8      # int
0cc     movslq  RDI, R8 # i2l
0cf     movq    RAX, [RDX + #16 + RDI << #3]    # long
0d4     movq    RDI, [RBP + #16 + RDI << #3]    # long
0d9     cmpq    RAX, RDI
0dc     jne     B33  P=0.000000 C=7836.000000
0dc
0e2   B9: #     B33 B10 <- B8  Freq: 975.842
0e2     movl    R8, RSI # spill
0e5     addl    R8, #2  # int
0e9     movslq  RDI, R8 # i2l
0ec     movq    RAX, [RDX + #16 + RDI << #3]    # long
0f1     movq    RDI, [RBP + #16 + RDI << #3]    # long
0f6     cmpq    RAX, RDI
0f9     jne     B33  P=0.000000 C=7836.000000
0f9
0ff   B10: #    B33 B11 <- B9  Freq: 975.842
0ff     movl    R8, RSI # spill
102     addl    R8, #3  # int
106     movslq  RDI, R8 # i2l 
109     movq    RAX, [RDX + #16 + RDI << #3]    # long
10e     movq    RDI, [RBP + #16 + RDI << #3]    # long
113     cmpq    RAX, RDI 
116     jne     B33  P=0.000000 C=7836.000000
116
11c   B11: #    B33 B12 <- B10  Freq: 975.841
11c     movl    R8, RSI # spill
11f     addl    R8, #4  # int
123     movslq  RDI, R8 # i2l
126     movq    RAX, [RDX + #16 + RDI << #3]    # long
12b     movq    RDI, [RBP + #16 + RDI << #3]    # long 
130     cmpq    RAX, RDI
133     jne     B33  P=0.000000 C=7836.000000
133
139   B12: #    B33 B13 <- B11  Freq: 975.841
139     movl    R8, RSI # spill
13c     addl    R8, #5  # int
140     movslq  RDI, R8 # i2l 
143     movq    RAX, [RDX + #16 + RDI << #3]    # long
148     movq    RDI, [RBP + #16 + RDI << #3]    # long 
14d     cmpq    RAX, RDI
150     jne     B33  P=0.000000 C=7836.000000
150
156   B13: #    B33 B14 <- B12  Freq: 975.84
156     movl    R8, RSI # spill
159     addl    R8, #6  # int
15d     movslq  RDI, R8 # i2l 
160     movq    RAX, [RDX + #16 + RDI << #3]    # long
165     movq    RDI, [RBP + #16 + RDI << #3]    # long 
16a     cmpq    RAX, RDI
16d     jne     B33  P=0.000000 C=7836.000000
16d
173   B14: #    B33 B15 <- B13  Freq: 975.84
173     movl    R8, RSI # spill
176     addl    R8, #7  # int
17a     movslq  RDI, R8 # i2l 
17d     movq    RAX, [RDX + #16 + RDI << #3]    # long
182     movq    RDI, [RBP + #16 + RDI << #3]    # long 
187     cmpq    RAX, RDI
18a     jne     B33  P=0.000000 C=7836.000000
18a
190   B15: #    B7 B16 <- B14  Freq: 975.839
190     addl    RSI, #8 # int
193     cmpl    RSI, R11
196     jl     B7       # loop end  P=0.998980 C=7836.000000

The i2l conversions are not needed because the loop bounds guarantee the induction variable is always positive. As above, the intermediate increments of the induction variable should be folded into the address computations.