Bug ID: JDK-8257850 ByteBuffer VarHandle views are slower than direct BB access

Type: Bug
Component: core-libs
Affected Version: 15,16

Priority: P4
Status: Open
Resolution: Unresolved

Submitted: 2020-12-07
Updated: 2020-12-10

Other
tbdUnresolved

This benchmark:

import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.Warmup;
import sun.misc.Unsafe;

import java.lang.invoke.MethodHandles;
import java.lang.invoke.VarHandle;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.concurrent.TimeUnit;

import static jdk.incubator.foreign.MemoryLayouts.JAVA_INT;

/**
 * JMH micro-benchmark that compares write-then-read loops performed through
 * {@link ByteBuffer} view {@link VarHandle}s (heap and direct buffers) with the
 * same kind of loop performed through {@code sun.misc.Unsafe} (on a
 * {@code byte[]} and on natively allocated memory).
 *
 * <p>Every benchmark method stores a value, reads it back, accumulates what it
 * read and returns the accumulated sum to the harness.
 *
 * <p>NOTE(review): the class name and the mixed int/float, heap/direct accesses
 * in {@link #setup()} suggest the intent is to pollute the type profile seen by
 * the VarHandle implementation before measuring - confirm with the author.
 */
@BenchmarkMode(Mode.AverageTime)
@Warmup(iterations = 5, time = 500, timeUnit = TimeUnit.MILLISECONDS)
@Measurement(iterations = 10, time = 500, timeUnit = TimeUnit.MILLISECONDS)
@State(org.openjdk.jmh.annotations.Scope.Thread)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@Fork(value = 3, jvmArgsAppend = { "--add-modules=jdk.incubator.foreign" })
public class LoopOverPolluted {

    // Number of int-sized elements each loop is meant to process.
    static final int ELEM_SIZE = 1_000_000;
    // Size in bytes of one element, taken from the JAVA_INT memory layout.
    static final int CARRIER_SIZE = (int) JAVA_INT.byteSize();
    // Total size in bytes of every buffer / array / native allocation below.
    static final int ALLOC_SIZE = ELEM_SIZE * CARRIER_SIZE;

    // Unsafe instance supplied by a Utils helper class that is not part of this listing.
    static final Unsafe unsafe = Utils.unsafe;

    // Heap-backed and direct buffers of ALLOC_SIZE bytes, both in native byte order.
    ByteBuffer heapBuffer = ByteBuffer.allocate(ALLOC_SIZE).order(ByteOrder.nativeOrder());
    ByteBuffer directBuffer = ByteBuffer.allocateDirect(ALLOC_SIZE).order(ByteOrder.nativeOrder());

    // View VarHandles that read/write int and float values in a ByteBuffer, in native byte order.
    // NOTE(review): the int coordinate of a byteBufferViewVarHandle is, per the
    // MethodHandles javadoc, a byte index into the buffer rather than an element
    // index - confirm; if so, the VarHandle loops below advance one byte per
    // iteration while the Unsafe loops advance four.
    static final VarHandle VH_int = MethodHandles.byteBufferViewVarHandle(int[].class, ByteOrder.nativeOrder());
    static final VarHandle VH_float = MethodHandles.byteBufferViewVarHandle(float[].class, ByteOrder.nativeOrder());
    // On-heap storage accessed through Unsafe (allocated in setup()).
    byte[] arr;
    // Base address of the native allocation accessed through Unsafe (allocated in setup()).
    long addr;


    /**
     * Allocates the native memory and the byte array, fills both with the
     * values {@code 0..ELEM_SIZE-1} via Unsafe, and then writes through both
     * VarHandles into both the direct and the heap buffer.
     */
    @Setup
    public void setup() {
        addr = unsafe.allocateMemory(ALLOC_SIZE);
        for (int i = 0; i < ELEM_SIZE; i++) {
            // i * 4 is the byte offset of the i-th int in the native allocation.
            unsafe.putInt(addr + (i * 4), i);
        }
        arr = new byte[ALLOC_SIZE];
        for (int i = 0; i < ELEM_SIZE; i++) {
            // Same layout as above, offset by the byte[] header.
            unsafe.putInt(arr, Unsafe.ARRAY_BYTE_BASE_OFFSET + (i * 4), i);
        }
        for (int i = 0; i < ELEM_SIZE; i++) {
            // Both VarHandles are used on both buffer kinds here.
            // The VH_float calls receive an int value argument (i), not a float.
            VH_int.set(directBuffer, i, i);
            VH_float.set(directBuffer, i, i);
            VH_int.set(heapBuffer, i, i);
            VH_float.set(heapBuffer, i, i);
        }
    }

    /**
     * Releases what {@link #setup()} and the field initializers allocated:
     * invokes the direct buffer's cleaner, drops the heap references and frees
     * the native allocation.
     */
    @TearDown
    public void tearDown() {
        unsafe.invokeCleaner(directBuffer);
        heapBuffer = null;
        arr = null;
        unsafe.freeMemory(addr);
    }

    /**
     * Writes {@code k + 1} and reads it back through {@code VH_int} on the
     * direct buffer, for {@code k} in {@code [0, ELEM_SIZE)}.
     *
     * @return the sum of all values read back
     */
    @Benchmark
    public int native_buffer() {
        int sum = 0;
        for (int k = 0; k < ELEM_SIZE; k++) {
            VH_int.set(directBuffer, k, k + 1);
            int v = (int)VH_int.get(directBuffer, k);
            sum += v;
        }
        return sum;
    }

    /**
     * Writes {@code k + 1} and reads it back through {@code VH_int} on the
     * heap buffer, for {@code k} in {@code [0, ELEM_SIZE)}.
     *
     * @return the sum of all values read back
     */
    @Benchmark
    public int heap_buffer() {
        int sum = 0;
        for (int k = 0; k < ELEM_SIZE; k++) {
            VH_int.set(heapBuffer, k, k + 1);
            int v = (int)VH_int.get(heapBuffer, k);
            sum += v;
        }
        return sum;
    }

    /**
     * Writes {@code k + 1} and reads it back through {@code VH_float} on the
     * heap buffer, for {@code k} in {@code [0, ELEM_SIZE)}; each float read
     * back is truncated to int before being accumulated.
     *
     * @return the sum of all values read back, after truncation to int
     */
    @Benchmark
    public int heap_buffer_floats() {
        int sum = 0;
        for (int k = 0; k < ELEM_SIZE; k++) {
            // NOTE(review): k + 1 is an int, so this call site does not match the
            // float value type of VH_float. The follow-up comment on this bug
            // reports that the mismatch sends the call down a slower conversion
            // path and recommends casting the argument to float.
            VH_float.set(heapBuffer, k, k + 1);
            float v = (float)VH_float.get(heapBuffer, k);
            sum += (int)v;
        }
        return sum;
    }

    /**
     * Writes and reads back ints in {@code arr} through Unsafe. Here {@code k}
     * is a byte offset that advances by 4 up to {@code ALLOC_SIZE}, and the
     * value stored is {@code k + 1} (byte offset plus one).
     *
     * @return the sum of all values read back
     */
    @Benchmark
    public int heap_unsafe() {
        int sum = 0;
        for (int k = 0; k < ALLOC_SIZE; k += 4) {
            unsafe.putInt(arr, k + Unsafe.ARRAY_BYTE_BASE_OFFSET, k + 1);
            int v = unsafe.getInt(arr, k + Unsafe.ARRAY_BYTE_BASE_OFFSET);
            sum += v;
        }
        return sum;
    }

    /**
     * Writes and reads back ints in the native allocation at {@code addr}
     * through Unsafe. Here {@code k} is a byte offset that advances by 4 up to
     * {@code ALLOC_SIZE}, and the value stored is {@code k + 1}.
     *
     * @return the sum of all values read back
     */
    @Benchmark
    public int native_unsafe() {
        int sum = 0;
        for (int k = 0; k < ALLOC_SIZE; k += 4) {
            unsafe.putInt(addr + k, k + 1);
            int v = unsafe.getInt(addr + k);
            sum += v;
        }
        return sum;
    }
}


Shows that ByteBuffer VH views are significantly slower than equivalent Unsafe access. This is likely caused by a lack of precise type information when passing the byte buffer base to the Unsafe call in the VH implementation.

The call to VH_float.set needs to cast the value to a float, otherwise there will be a signature mismatch and the slow conversion path will be taken. This is better:

    @Benchmark
    public float heap_buffer_floats() {
        float sum = 0;
        for (int k = 0; k < ELEM_SIZE; k++) {
            VH_float.set(heapBuffer, k, (float) k + 1);
            float v = (float)VH_float.get(heapBuffer, k);
            sum += v;
        }
        return sum;
    }

(including accumulation as float to avoid odd effects.)

That will reduce the difference significantly, but there is still some difference. The hot loop (non-unrolled) is:

     0.47%   0x0000000113414ea0: mov       %rbp,%rdx
             0x0000000113414ea3: movslq    %r11d,%rsi
     0.22%   0x0000000113414ea6: vaddss    (%rdi,%rsi,1),%xmm0,%xmm0
    78.99%   0x0000000113414eab: add       %rax,%rsi
     8.15%   0x0000000113414eae: mov       %r8d,(%rdx,%rsi,1)
     0.30%   0x0000000113414eb2: inc       %r11d
     4.53%   0x0000000113414eb5: vcvtsi2ss %r11d,%xmm1,%xmm1
     0.65%   0x0000000113414eba: vaddss    -0x162(%rip),%xmm1,%xmm2
     0.30%   0x0000000113414ec2: vmovd     %xmm2,%r8d
     4.67%   0x0000000113414ec7: cmp       %ecx,%r11d
             0x0000000113414eca: jl        0x0000000113414ea0

with some additional instructions dealing with the conversion.

07-12-2020

Assigning to myself, as a placeholder for now.

07-12-2020