Relates :
|
|
Relates :
|
|
Relates :
|
|
Relates :
|
|
Relates :
|
|
Relates :
|
Paul spotted the following small inefficiencies: for (; wi < l; wi++) { long bi = ((long) Objects.checkIndex(wi, l, null)) << LOG2_ARRAY_LONG_INDEX_SCALE; long av = U.getLongUnaligned(a, aOffset + bi); long bv = U.getLongUnaligned(b, bOffset + bi); if (av != bv) { is compiled as: 0b0 B9: # B28 B10 <- B8 B13 Loop: B9-B13 inner main of N130 Freq: 977.661 0b0 movl RDX, RDI # spill 0b2 # castII of RDX 0b2 movq RBX, [R9 + #16 + RDX << #3] # long 0b7 movq RAX, [RSI + #16 + RDX << #3] # long 0bc cmpq RBX, RAX 0bf jne B28 P=0.000000 C=7836.000000 0bf 0c5 B10: # B28 B11 <- B9 Freq: 977.66 0c5 movl RDX, RDI # spill 0c7 incl RDX # int 0c9 # castII of RDX 0c9 movq RBX, [R9 + #16 + RDX << #3] # long 0ce movq RAX, [RSI + #16 + RDX << #3] # long 0d3 cmpq RBX, RAX 0d6 jne B28 P=0.000000 C=7836.000000 0d6 0dc B11: # B28 B12 <- B10 Freq: 977.66 0dc movl RDX, RDI # spill 0de addl RDX, #2 # int 0e1 # castII of RDX 0e1 movq RBX, [R9 + #16 + RDX << #3] # long 0e6 movq RAX, [RSI + #16 + RDX << #3] # long 0eb cmpq RBX, RAX 0ee jne B28 P=0.000000 C=7836.000000 0ee 0f4 B12: # B28 B13 <- B11 Freq: 977.659 0f4 movl RDX, RDI # spill 0f6 addl RDX, #3 # int 0f9 # castII of RDX 0f9 movq RBX, [R9 + #16 + RDX << #3] # long 0fe movq RAX, [RSI + #16 + RDX << #3] # long 103 cmpq RBX, RAX 106 jne B28 P=0.000000 C=7836.000000 106 10c B13: # B9 B14 <- B12 Freq: 977.659 10c addl RDI, #4 # int 10f cmpl RDI, RBP 111 jl,s B9 # loop end P=0.998980 C=7836.000000 But the intermediate increment of the induction variable should be folded in the address computation of the memory accesses. This loop: for (; wi < length >> valuesPerWidth; wi++) { long bi = ((long) wi) << LOG2_ARRAY_LONG_INDEX_SCALE; long av = U.getLongUnaligned(a, aOffset + bi); long bv = U.getLongUnaligned(b, bOffset + bi); if (av != bv) { with length and array length is compiled as: 0b0 B7: # B32 B8 <- B6 B15 Loop: B7-B15 inner main of N123 Freq: 975.843 0b0 movslq R8, RSI # i2l 0b3 movq RAX, [RDX + #16 + R8 << #3] # long 0b8 movq RDI, [RBP + #16 + R8 << #3] # long 0bd cmpq RAX, RDI 0c0 jne B32 P=0.000000 C=7836.000000 0c0 0c6 B8: # B33 B9 <- B7 Freq: 975.842 0c6 movl R8, RSI # spill 0c9 incl R8 # int 0cc movslq RDI, R8 # i2l 0cf movq RAX, [RDX + #16 + RDI << #3] # long 0d4 movq RDI, [RBP + #16 + RDI << #3] # long 0d9 cmpq RAX, RDI 0dc jne B33 P=0.000000 C=7836.000000 0dc 0e2 B9: # B33 B10 <- B8 Freq: 975.842 0e2 movl R8, RSI # spill 0e5 addl R8, #2 # int 0e9 movslq RDI, R8 # i2l 0ec movq RAX, [RDX + #16 + RDI << #3] # long 0f1 movq RDI, [RBP + #16 + RDI << #3] # long 0f6 cmpq RAX, RDI 0f9 jne B33 P=0.000000 C=7836.000000 0f9 0ff B10: # B33 B11 <- B9 Freq: 975.842 0ff movl R8, RSI # spill 102 addl R8, #3 # int 106 movslq RDI, R8 # i2l 109 movq RAX, [RDX + #16 + RDI << #3] # long 10e movq RDI, [RBP + #16 + RDI << #3] # long 113 cmpq RAX, RDI 116 jne B33 P=0.000000 C=7836.000000 116 11c B11: # B33 B12 <- B10 Freq: 975.841 11c movl R8, RSI # spill 11f addl R8, #4 # int 123 movslq RDI, R8 # i2l 126 movq RAX, [RDX + #16 + RDI << #3] # long 12b movq RDI, [RBP + #16 + RDI << #3] # long 130 cmpq RAX, RDI 133 jne B33 P=0.000000 C=7836.000000 133 139 B12: # B33 B13 <- B11 Freq: 975.841 139 movl R8, RSI # spill 13c addl R8, #5 # int 140 movslq RDI, R8 # i2l 143 movq RAX, [RDX + #16 + RDI << #3] # long 148 movq RDI, [RBP + #16 + RDI << #3] # long 14d cmpq RAX, RDI 150 jne B33 P=0.000000 C=7836.000000 150 156 B13: # B33 B14 <- B12 Freq: 975.84 156 movl R8, RSI # spill 159 addl R8, #6 # int 15d movslq RDI, R8 # i2l 160 movq RAX, [RDX + #16 + RDI << #3] # long 165 movq RDI, [RBP + #16 + RDI << #3] # long 16a cmpq RAX, RDI 16d jne B33 P=0.000000 C=7836.000000 16d 173 B14: # B33 B15 <- B13 Freq: 975.84 173 movl R8, RSI # spill 176 addl R8, #7 # int 17a movslq RDI, R8 # i2l 17d movq RAX, [RDX + #16 + RDI << #3] # long 182 movq RDI, [RBP + #16 + RDI << #3] # long 187 cmpq RAX, RDI 18a jne B33 P=0.000000 C=7836.000000 18a 190 B15: # B7 B16 <- B14 Freq: 975.839 190 addl RSI, #8 # int 193 cmpl RSI, R11 196 jl B7 # loop end P=0.998980 C=7836.000000 The i2l conversions are not needed because the loop bounds guarantee the induction variable is always positive. As above intermediate increment of the induction variable should be folded in the address computations.