Other |
---|
tbdUnresolved |
Relates :
|
|
Relates :
|
|
Relates :
|
|
Relates :
|
|
Relates :
|
Running `compiler.VectorReduction2.WithSuperword.intAddSimple` I've realised that the loop is not vectorized: ``` @Benchmark public void intAddSimple(Blackhole bh) { int acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { int val = in1I[i]; acc += val; } bh.consume(acc); } ``` Here's the assembly on an x64 AVX2 machine: ``` 0x00007f4090020d5e: nop ;*aload_0 {reexecute=0 rethrow=0 return_oop=0} ; - org.openjdk.bench.vm.compiler.VectorReduction2::intAddSimple@12 (line 811) ; - org.openjdk.bench.vm.compiler.jmh_generated.VectorReduction2_WithSuperword_intAddSimple_jmhTest::intAddSimple_avgt_jmhStub@17 (line 190) ↗ 0x00007f4090020d60: addl 0x10(%r14, %rcx, 4), %edx ; add the value of the 1st element 5.55% │ 0x00007f4090020d65: addl 0x14(%r14, %rcx, 4), %edx 4.76% │ 0x00007f4090020d6a: addl 0x18(%r14, %rcx, 4), %edx 7.55% │ 0x00007f4090020d6f: addl 0x1c(%r14, %rcx, 4), %edx 6.70% │ 0x00007f4090020d74: addl 0x20(%r14, %rcx, 4), %edx 5.55% │ 0x00007f4090020d79: addl 0x24(%r14, %rcx, 4), %edx 5.21% │ 0x00007f4090020d7e: addl 0x28(%r14, %rcx, 4), %edx 6.51% │ 0x00007f4090020d83: addl 0x2c(%r14, %rcx, 4), %edx 5.51% │ 0x00007f4090020d88: addl 0x30(%r14, %rcx, 4), %edx 5.66% │ 0x00007f4090020d8d: addl 0x34(%r14, %rcx, 4), %edx 4.69% │ 0x00007f4090020d92: addl 0x38(%r14, %rcx, 4), %edx 6.51% │ 0x00007f4090020d97: addl 0x3c(%r14, %rcx, 4), %edx 5.51% │ 0x00007f4090020d9c: addl 0x40(%r14, %rcx, 4), %edx 7.22% │ 0x00007f4090020da1: addl 0x44(%r14, %rcx, 4), %edx 5.62% │ 0x00007f4090020da6: addl 0x48(%r14, %rcx, 4), %edx 5.14% │ 0x00007f4090020dab: addl 0x4c(%r14, %rcx, 4), %edx;*iadd {reexecute=0 rethrow=0 return_oop=0} │ ; - org.openjdk.bench.vm.compiler.VectorReduction2::intAddSimple@23 (line 812) │ ; - org.openjdk.bench.vm.compiler.jmh_generated.VectorReduction2_WithSuperword_intAddSimple_jmhTest::intAddSimple_avgt_jmhStub@17 (line 190) 6.07% │ 0x00007f4090020db0: addl $0x10, %ecx ;*iinc {reexecute=0 rethrow=0 return_oop=0} │ ; - 
org.openjdk.bench.vm.compiler.VectorReduction2::intAddSimple@25 (line 810) │ ; - org.openjdk.bench.vm.compiler.jmh_generated.VectorReduction2_WithSuperword_intAddSimple_jmhTest::intAddSimple_avgt_jmhStub@17 (line 190) │ 0x00007f4090020db3: cmpl %eax, %ecx 0.04% ╰ 0x00007f4090020db5: jl 0x7f4090020d60 ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0} ``` However, if you multiply the value before summing it, then vectorization kicks in. So, something like this: ``` @Benchmark public void intAddSimpleWithMultiply(Blackhole bh) { int acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { int val = 11 * in1I[i]; acc += val; } bh.consume(acc); } ``` Here's the assembly ``` 0.16% 0x00007f1190021e93: addl %r11d, %edi 0x00007f1190021e96: nopw (%rax, %rax) ;*bipush {reexecute=0 rethrow=0 return_oop=0} ; - org.openjdk.bench.vm.compiler.VectorReduction2::intAddSimpleWithMultiply@12 (line 821) ; - org.openjdk.bench.vm.compiler.jmh_generated.VectorReduction2_WithSuperword_intAddSimpleWithMultiply_jmhTest::intAddSimpleWithMultiply_avgt_jmhStub@17 (line 190) ↗ 0x00007f1190021ea0: vpmulld 0xf0(%r8, %r11, 4), %ymm5, %ymm7 │ 0x00007f1190021eaa: vpmulld 0xd0(%r8, %r11, 4), %ymm5, %ymm8 │ 0x00007f1190021eb4: vpmulld 0x10(%r8, %r11, 4), %ymm5, %ymm3 6.17% │ 0x00007f1190021ebb: vpmulld 0x30(%r8, %r11, 4), %ymm5, %ymm6 11.30% │ 0x00007f1190021ec2: vpmulld 0xb0(%r8, %r11, 4), %ymm5, %ymm9 │ ; {no_reloc} 11.63% │ 0x00007f1190021ecc: vpmulld 0x50(%r8, %r11, 4), %ymm5, %ymm12 10.64% │ 0x00007f1190021ed3: vpmulld 0x70(%r8, %r11, 4), %ymm5, %ymm11 11.69% │ 0x00007f1190021eda: vpmulld 0x90(%r8, %r11, 4), %ymm5, %ymm10 10.80% │ 0x00007f1190021ee4: vpaddd %ymm3, %ymm13, %ymm3 │ 0x00007f1190021ee8: vpaddd %ymm6, %ymm3, %ymm3 │ 0x00007f1190021eec: vpaddd %ymm12, %ymm3, %ymm3 │ 0x00007f1190021ef1: vpaddd %ymm11, %ymm3, %ymm3 │ 0x00007f1190021ef6: vpaddd %ymm10, %ymm3, %ymm3 10.71% │ 0x00007f1190021efb: vpaddd %ymm9, %ymm3, %ymm3 4.83% │ 0x00007f1190021f00: vpaddd %ymm8, %ymm3, %ymm3 7.42% │ 
0x00007f1190021f05: vpaddd %ymm7, %ymm3, %ymm13;*iadd {reexecute=0 rethrow=0 return_oop=0} │ ; - org.openjdk.bench.vm.compiler.VectorReduction2::intAddSimpleWithMultiply@26 (line 822) │ ; - org.openjdk.bench.vm.compiler.jmh_generated.VectorReduction2_WithSuperword_intAddSimpleWithMultiply_jmhTest::intAddSimpleWithMultiply_avgt_jmhStub@17 (line 190) 5.52% │ 0x00007f1190021f09: addl $0x40, %r11d ;*iinc {reexecute=0 rethrow=0 return_oop=0} │ ; - org.openjdk.bench.vm.compiler.VectorReduction2::intAddSimpleWithMultiply@28 (line 820) │ ; - org.openjdk.bench.vm.compiler.jmh_generated.VectorReduction2_WithSuperword_intAddSimpleWithMultiply_jmhTest::intAddSimpleWithMultiply_avgt_jmhStub@17 (line 190) │ 0x00007f1190021f0d: cmpl %edi, %r11d ╰ 0x00007f1190021f10: jl 0x7f1190021ea0 ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0} ; - org.openjdk.bench.vm.compiler.VectorReduction2::intAddSimpleWithMultiply@9 (line 820) ; - org.openjdk.bench.vm.compiler.jmh_generated.VectorReduction2_WithSuperword_intAddSimpleWithMultiply_jmhTest::intAddSimpleWithMultiply_avgt_jmhStub@17 (line 190) ``` Here is the performance of the two benchmarks compared: ``` Benchmark (SIZE) (seed) Mode Cnt Score Error Units VectorReduction2.WithSuperword.intAddSimple 2048 0 avgt 3 552.308 ± 1.333 ns/op VectorReduction2.WithSuperword.intAddSimpleWithMultiply 2048 0 avgt 3 141.707 ± 1.827 ns/op ``` This should work as per JDK-7192383 and JDK-8074981, but I couldn't find any existing bug reports about it. I've reproduced this on the master branch.
|