This benchmark:
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.Warmup;
import sun.misc.Unsafe;
import java.lang.invoke.MethodHandles;
import java.lang.invoke.VarHandle;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.concurrent.TimeUnit;
import static jdk.incubator.foreign.MemoryLayouts.JAVA_INT;
/**
 * JMH benchmark that compares element-wise read/write loops over the same amount
 * of memory using three access paths:
 * <ul>
 *   <li>{@code ByteBuffer} view {@link VarHandle}s (heap and direct buffers),</li>
 *   <li>{@code Unsafe} access into a {@code byte[]},</li>
 *   <li>{@code Unsafe} access into off-heap memory.</li>
 * </ul>
 *
 * <p>{@link #setup()} exercises both the int and the float view handle against
 * both the heap and the direct buffer before any measurement runs. Judging by the
 * class name this is presumably meant to pollute the type profiles seen by the
 * VarHandle implementation -- NOTE(review): confirm intent.
 *
 * <p>The index coordinate of a handle produced by
 * {@link MethodHandles#byteBufferViewVarHandle} is a <em>byte</em> offset into the
 * buffer, not an element index. Every VarHandle access below therefore scales the
 * element index by {@link #CARRIER_SIZE}, so that the buffer benchmarks perform the
 * same aligned, 4-byte-strided walk over {@link #ALLOC_SIZE} bytes as the
 * {@code Unsafe} benchmarks do.
 */
@BenchmarkMode(Mode.AverageTime)
@Warmup(iterations = 5, time = 500, timeUnit = TimeUnit.MILLISECONDS)
@Measurement(iterations = 10, time = 500, timeUnit = TimeUnit.MILLISECONDS)
@State(org.openjdk.jmh.annotations.Scope.Thread)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@Fork(value = 3, jvmArgsAppend = { "--add-modules=jdk.incubator.foreign" })
public class LoopOverPolluted {

    /** Number of 4-byte elements touched by every benchmark loop. */
    static final int ELEM_SIZE = 1_000_000;

    /** Size in bytes of one element, taken from the {@code JAVA_INT} layout. */
    static final int CARRIER_SIZE = (int) JAVA_INT.byteSize();

    /** Total size in bytes of each backing store. */
    static final int ALLOC_SIZE = ELEM_SIZE * CARRIER_SIZE;

    static final Unsafe unsafe = Utils.unsafe;

    ByteBuffer heapBuffer = ByteBuffer.allocate(ALLOC_SIZE).order(ByteOrder.nativeOrder());
    ByteBuffer directBuffer = ByteBuffer.allocateDirect(ALLOC_SIZE).order(ByteOrder.nativeOrder());

    /** View handles over a {@code ByteBuffer}; their {@code int} coordinate is a byte offset. */
    static final VarHandle VH_int = MethodHandles.byteBufferViewVarHandle(int[].class, ByteOrder.nativeOrder());
    static final VarHandle VH_float = MethodHandles.byteBufferViewVarHandle(float[].class, ByteOrder.nativeOrder());

    /** On-heap backing store for {@link #heap_unsafe()}. */
    byte[] arr;

    /** Base address of the off-heap backing store for {@link #native_unsafe()}. */
    long addr;

    /**
     * Allocates and fills the off-heap block and the byte array, then writes through
     * both view handles into both buffers.
     */
    @Setup
    public void setup() {
        addr = unsafe.allocateMemory(ALLOC_SIZE);
        for (int i = 0; i < ELEM_SIZE; i++) {
            // Widen before multiplying so the address arithmetic is done in long.
            unsafe.putInt(addr + ((long) i * CARRIER_SIZE), i);
        }

        arr = new byte[ALLOC_SIZE];
        for (int i = 0; i < ELEM_SIZE; i++) {
            unsafe.putInt(arr, Unsafe.ARRAY_BYTE_BASE_OFFSET + (i * CARRIER_SIZE), i);
        }

        for (int i = 0; i < ELEM_SIZE; i++) {
            // Byte offset of element i; the view handles are indexed in bytes.
            int offset = i * CARRIER_SIZE;
            VH_int.set(directBuffer, offset, i);
            VH_float.set(directBuffer, offset, i);
            VH_int.set(heapBuffer, offset, i);
            VH_float.set(heapBuffer, offset, i);
        }
    }

    /** Releases the direct buffer and the off-heap block and drops the heap references. */
    @TearDown
    public void tearDown() {
        unsafe.invokeCleaner(directBuffer);
        heapBuffer = null;
        arr = null;
        unsafe.freeMemory(addr);
    }

    /**
     * Writes then reads every int element of the direct buffer through {@code VH_int}.
     *
     * @return the sum of the values read back
     */
    @Benchmark
    public int native_buffer() {
        int sum = 0;
        for (int k = 0; k < ELEM_SIZE; k++) {
            int offset = k * CARRIER_SIZE;
            VH_int.set(directBuffer, offset, k + 1);
            int v = (int) VH_int.get(directBuffer, offset);
            sum += v;
        }
        return sum;
    }

    /**
     * Writes then reads every int element of the heap buffer through {@code VH_int}.
     *
     * @return the sum of the values read back
     */
    @Benchmark
    public int heap_buffer() {
        int sum = 0;
        for (int k = 0; k < ELEM_SIZE; k++) {
            int offset = k * CARRIER_SIZE;
            VH_int.set(heapBuffer, offset, k + 1);
            int v = (int) VH_int.get(heapBuffer, offset);
            sum += v;
        }
        return sum;
    }

    /**
     * Writes then reads every float element of the heap buffer through {@code VH_float}.
     *
     * @return the sum of the values read back, each truncated to {@code int}
     */
    @Benchmark
    public int heap_buffer_floats() {
        int sum = 0;
        for (int k = 0; k < ELEM_SIZE; k++) {
            int offset = k * CARRIER_SIZE;
            VH_float.set(heapBuffer, offset, k + 1);
            float v = (float) VH_float.get(heapBuffer, offset);
            sum += (int) v;
        }
        return sum;
    }

    /**
     * Writes then reads every int slot of {@code arr} through {@code Unsafe}.
     * Here {@code k} is already a byte offset.
     *
     * @return the sum of the values read back
     */
    @Benchmark
    public int heap_unsafe() {
        int sum = 0;
        for (int k = 0; k < ALLOC_SIZE; k += CARRIER_SIZE) {
            unsafe.putInt(arr, k + Unsafe.ARRAY_BYTE_BASE_OFFSET, k + 1);
            int v = unsafe.getInt(arr, k + Unsafe.ARRAY_BYTE_BASE_OFFSET);
            sum += v;
        }
        return sum;
    }

    /**
     * Writes then reads every int slot of the off-heap block through {@code Unsafe}.
     * Here {@code k} is already a byte offset.
     *
     * @return the sum of the values read back
     */
    @Benchmark
    public int native_unsafe() {
        int sum = 0;
        for (int k = 0; k < ALLOC_SIZE; k += CARRIER_SIZE) {
            unsafe.putInt(addr + k, k + 1);
            int v = unsafe.getInt(addr + k);
            sum += v;
        }
        return sum;
    }
}
Shows that ByteBuffer VH views are significantly slower than equivalent unsafe access. This is likely caused by a lack of precise type information when passing the byte buffer base to the Unsafe call in the VH impl.