JDK-8146071 : Math.max in unrolled loops may produce less optimal code than explicit comparison
  • Type: Bug
  • Component: hotspot
  • Sub-Component: compiler
  • Affected Version: 8,9
  • Priority: P4
  • Status: Closed
  • Resolution: Duplicate
  • OS: windows_7
  • CPU: x86_64
  • Submitted: 2015-11-29
  • Updated: 2023-07-21
  • Resolved: 2023-07-21
Related Reports
Duplicate :  
Description
FULL PRODUCT VERSION :
c:\work\java\jaxenter>java -version
java version "1.8.0_65"
Java(TM) SE Runtime Environment (build 1.8.0_65-b17)
Java HotSpot(TM) 64-Bit Server VM (build 25.65-b01, mixed mode)

c:\work\java\jaxenter>javac -version
javac 1.8.0_65


ADDITIONAL OS VERSION INFORMATION :
Microsoft Windows [Version 6.1.7601]  System 7 x64

EXTRA RELEVANT SYSTEM CONFIGURATION :
c:\work\java\jaxenter>java -version
java version "1.8.0_65"
Java(TM) SE Runtime Environment (build 1.8.0_65-b17)
Java HotSpot(TM) 64-Bit Server VM (build 25.65-b01, mixed mode)

c:\work\java\jaxenter>javac -version
javac 1.8.0_65


A DESCRIPTION OF THE PROBLEM :
streamTest() runs 9X slower than streamTestA(), forTest(), and forTestA(). It appears that the optimizer is not considering that Math::max is a final method. Note that streamTestB() and streamTestC() are only 2X slower than streamTestA() even given that streamTestC() passed the Math::max method reference to the compare method whose containing lambda is passed to reduce().

This bug report is derived from an article posted at https://jaxenter.com/follow-up-how-fast-are-the-java-8-streams-122522.html.

STEPS TO FOLLOW TO REPRODUCE THE PROBLEM :
Execute the provide test case: java ForVsStream


EXPECTED VERSUS ACTUAL BEHAVIOR :
EXPECTED -
streamTest() should run at the same speed as streamTestA(), forTest() and forTestA()
ACTUAL -
streamTest() is 9X slower than streamTest(), forTest(), and forTest() while streamTestB() and streamTestC() are 4X faster than streamTest().

REPRODUCIBILITY :
This bug can be reproduced always.

---------- BEGIN SOURCE ----------
import java.lang.Math;
import java.util.Arrays;
import java.util.Random;
import static java.lang.System.currentTimeMillis;
import static java.lang.System.nanoTime;

class ForVsStream {

  private static final void print (String string) {
    System.out.print(string);
  }

  private static final void println (String string) {
    System.out.println(string);
  }

  private static volatile long time = 0;

  private static int nanoTimed = 0;

  private static final long nanoTimerOverhead () {
    final int bound = 10000;
    int total = 0;
    for (int count = 0; count < bound; ++count) {
      long start = nanoTime();
      time = nanoTime();
      long finish = nanoTime();
      total += (finish - start) / 2;
    }
    nanoTimed = total / bound; 
    return nanoTimed;
  }

  private static final long milliTimerOverhead () {
    final int bound = 10000;
    int total = 0;
    for (int count = 0; count < bound; ++count) {
      long start = nanoTime();
      time = currentTimeMillis();
      long finish = nanoTime();
      total += finish - start - nanoTimed;
    }
    return total / bound; 
  }

  static final int[] ints = new int[1000000];

  private static final long randomInts () {
    long start = currentTimeMillis();
    Random random = new Random();
    for (int i = 0; i < ints.length; i++)
      ints[i] = random.nextInt();
    long finish = currentTimeMillis();
    return finish - start;
  }

  private static final long forTest () {
    int[] a = ints;
    int e = ints.length;
    int m = Integer.MIN_VALUE;
    long start = currentTimeMillis();
    for (int i = 0; i < e; i++)
      if (a[i] > m)
        m = a[i];
    long finish = currentTimeMillis();
    println("m: " + m);
    return finish - start;
  }

  private static final long forTestA () {
    int[] a = ints;
    int e = ints.length;
    int m = Integer.MIN_VALUE;
    long start = currentTimeMillis();
    for (int i = 0; i < e; i++)
      m = Math.max(m, a[i]);
    long finish = currentTimeMillis();
    println("m: " + m);
    return finish - start;
  }

  private static final long streamTest () {
    long start = currentTimeMillis();
    int m = Arrays.stream(ints).reduce(Integer.MIN_VALUE, Math::max); 
    long finish = currentTimeMillis();
    println("m: " + m);
    return finish - start;
  }

  private static final long streamTestA () {
    long start = currentTimeMillis();
    int m = Arrays.stream(ints).reduce(Integer.MIN_VALUE, (a, b) -> a > b ? a : b); 
    long finish = currentTimeMillis();
    println("m: " + m);
    return finish - start;
  }

  private static final long streamTestB () {
    long start = currentTimeMillis();
    int m = Arrays.stream(ints).reduce(Integer.MIN_VALUE, (a, b) -> Math.max(a, b)); 
    long finish = currentTimeMillis();
    println("m: " + m);
    return finish - start;
  }

  public interface BinaryCompareFunction<T> {
    public T compare (T a1, T a2);
  }

  private static final <T> T compare (BinaryCompareFunction<T> f, T a1, T a2) {
    return f.compare(a1, a2);
  }

  private static final long streamTestC () {
    long start = currentTimeMillis();
    int m = Arrays.stream(ints).reduce(Integer.MIN_VALUE, (a, b) -> compare(Math::max, a, b)); 
    long finish = currentTimeMillis();
    println("m: " + m);
    return finish - start;
  }

  public static void main (String[] argumets) {
    println("nanoTimerOverhead: " + nanoTimerOverhead() + " nanoseconds");
    println("milliTimerOverhead: " + milliTimerOverhead() + " nanoseconds");
    println("randomInts(): " + randomInts() + " milliseconds");
    println("forTest(): " + forTest() + " milliseconds");
    println("forTestA(): " + forTestA() + " milliseconds");
    println("streamTest(): " + streamTest() + " milliseconds");
    println("streamTestA(): " + streamTestA() + " milliseconds");
    println("streamTestB(): " + streamTestB() + " milliseconds");
    println("streamTestC(): " + streamTestC() + " milliseconds");
  }

}

---------- END SOURCE ----------

CUSTOMER SUBMITTED WORKAROUND :
See streamTestA(), streamTestB(), and streamTestC(). Note that streamTestA() is 2X faster than streamTestB() and streamTestC(), but all are 4X to 9X faster than streamTest().


Comments
The attachment mathMax.perfasm.txt presents the perfasm results for forTest_MathMax and forTest_if
04-01-2016

The code measuring the timing of various functionality will produce unreliable results because of the JIT. For example, change the order in which test methods are invoked. Here is a more reliable benchmark written using JMH: import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; import org.openjdk.jmh.annotations.Fork; import org.openjdk.jmh.annotations.Measurement; import org.openjdk.jmh.annotations.Mode; import org.openjdk.jmh.annotations.OutputTimeUnit; import org.openjdk.jmh.annotations.Scope; import org.openjdk.jmh.annotations.Setup; import org.openjdk.jmh.annotations.State; import org.openjdk.jmh.annotations.Warmup; import java.util.Arrays; import java.util.Random; import java.util.concurrent.TimeUnit; @State(Scope.Benchmark) @Fork(value = 1, warmups = 0) @Warmup(iterations = 10, time = 100, timeUnit = TimeUnit.MILLISECONDS) @Measurement(iterations = 10, time = 100, timeUnit = TimeUnit.MILLISECONDS) @BenchmarkMode(Mode.AverageTime) @OutputTimeUnit(TimeUnit.MICROSECONDS) public class A { static final int[] ints = new int[1000000]; @Setup public void setUp() { randomInts(); } static void randomInts() { Random random = new Random(); for (int i = 0; i < ints.length; i++) ints[i] = random.nextInt(); } @Benchmark public int forTest_if() { int[] a = ints; int e = ints.length; int m = Integer.MIN_VALUE; for (int i = 0; i < e; i++) if (a[i] >= m) m = a[i]; return m; } @Benchmark public int forTest_MathMax() { int[] a = ints; int e = ints.length; int m = Integer.MIN_VALUE; for (int i = 0; i < e; i++) m = Math.max(m, a[i]); return m; } @Benchmark public int streamTest_lambda() { return Arrays.stream(ints).reduce(Integer.MIN_VALUE, (a, b) -> a >= b ? a : b); } @Benchmark public int streamTest_MathMax() { return Arrays.stream(ints).reduce(Integer.MIN_VALUE, Math::max); } @Benchmark public int streamTest_lambda_MathMax() { return Arrays.stream(ints).reduce(Integer.MIN_VALUE, (a, b) -> Math.max(a, b)); } } Here are the results on a MacBook Pro using a recent version of a JDK 9 build with -XX:-TieredCompilation: Benchmark Mode Cnt Score Error Units A.forTest_MathMax avgt 10 843.823 ± 15.246 us/op A.forTest_if avgt 10 354.751 ± 12.060 us/op A.streamTest_MathMax avgt 10 859.195 ± 26.893 us/op A.streamTest_lambda avgt 10 360.833 ± 9.904 us/op A.streamTest_lambda_MathMax avgt 10 844.766 ± 28.794 us/op Points of note: 1) The "forTest_if" and "streamTest_lambda" results are similar. 2) The "forTest_MathMax" and "streamTest_MathMax" are also similar but over 2x slower than 1) There is something going on with the Math.max intrinsic. Running the tests again with -XX:LoopUnrollLimit=0 -XX:-TieredCompilation i.e. switching off loop unrolling: Benchmark Mode Cnt Score Error Units A.forTest_MathMax avgt 10 912.750 ± 57.903 us/op A.forTest_if avgt 10 921.643 ± 49.769 us/op A.streamTest_MathMax avgt 10 927.324 ± 40.223 us/op A.streamTest_lambda avgt 10 939.930 ± 73.390 us/op A.streamTest_lambda_MathMax avgt 10 936.942 ± 48.921 us/op All tests now report similar times. This implies there is something about the Math.max intrinsic and the code generation, especially in unrolled loops. The generated x86 code for the hot loop of "forTest_MathMax" is: 0x00000001093dcac0: mov 0x10(%rbx,%r13,4),%r11d ;*iaload {reexecute=0 rethrow=0 return_oop=0} ; - oracle.jmh.A::forTest_MathMax@25 (line 80) 0x00000001093dcac5: cmp %r11d,%r14d 0x00000001093dcac8: cmovl %r11d,%r14d ;*invokestatic max {reexecute=0 rethrow=0 return_oop=0} ; - oracle.jmh.A::forTest_MathMax@26 (line 80) 0x00000001093dcacc: movslq %r13d,%r10 0x00000001093dcacf: mov 0x14(%rbx,%r10,4),%r11d 0x00000001093dcad4: mov 0x1c(%rbx,%r10,4),%r8d 0x00000001093dcad9: mov 0x18(%rbx,%r10,4),%edi ;*iaload {reexecute=0 rethrow=0 return_oop=0} ; - oracle.jmh.A::forTest_MathMax@25 (line 80) 0x00000001093dcade: cmp %r11d,%r14d 0x00000001093dcae1: cmovl %r11d,%r14d 0x00000001093dcae5: cmp %edi,%r14d 0x00000001093dcae8: cmovl %edi,%r14d 0x00000001093dcaec: cmp %r8d,%r14d 0x00000001093dcaef: cmovl %r8d,%r14d ;*invokestatic max {reexecute=0 rethrow=0 return_oop=0} ; - oracle.jmh.A::forTest_MathMax@26 (line 80) 0x00000001093dcaf3: add $0x4,%r13d ;*iinc {reexecute=0 rethrow=0 return_oop=0} ; - oracle.jmh.A::forTest_MathMax@30 (line 79) 0x00000001093dcaf7: cmp %r9d,%r13d 0x00000001093dcafa: jl 0x00000001093dcac0 ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0} ; - oracle.jmh.A::forTest_MathMax@18 (line 79) The loop is unrolled in a stride of 4. Probably limited as to keep all loaded values in registers for comparison and conditional moves. Where as the generated x86 code of "forTest_if" is different, essentially a load, compare, jump if >=, assign, jump back to process next element. The loop is unrolled in a stride of 8, although the difference in stride appears to make little difference. Results are similar if unrolling is limited to a stride of 4. The kind of data (random, ramp-up, ramp-down, identical) has no impact. Thus it appears to be the loads with conditional moves that makes the difference here. More analysis using perfasm is required.
04-01-2016

Test Result: **************** OS: Windows 7 - 64bit JDK: 8u0 b132 : Fail 8u65 b17 : Fail 8u72ea b05 : Fail 9ea b94 : Fail ########################## 8 fcs ******* c:\Abhijit\tools>java -version java version "1.8.0" Java(TM) SE Runtime Environment (build 1.8.0-b132) Java HotSpot(TM) 64-Bit Server VM (build 25.0-b70, mixed mode) c:\Abhijit\tools>javac ForVsStream.java c:\Abhijit\tools>java ForVsStream nanoTimerOverhead: 83 nanoseconds milliTimerOverhead: 215 nanoseconds randomInts(): 34 milliseconds m: 2147483300 forTest(): 14 milliseconds m: 2147483300 forTestA(): 22 milliseconds m: 2147483300 streamTest(): 133 milliseconds m: 2147483300 streamTestA(): 31 milliseconds m: 2147483300 streamTestB(): 25 milliseconds m: 2147483300 streamTestC(): 79 milliseconds ################################ 8u65 ****** c:\Abhijit\tools>java -version java version "1.8.0_65" Java(TM) SE Runtime Environment (build 1.8.0_65-b17) Java HotSpot(TM) 64-Bit Server VM (build 25.65-b01, mixed mode) c:\Abhijit\tools>javac ForVsStream.java c:\Abhijit\tools>java ForVsStream nanoTimerOverhead: 39 nanoseconds milliTimerOverhead: 162 nanoseconds randomInts(): 21 milliseconds m: 2147477292 forTest(): 6 milliseconds m: 2147477292 forTestA(): 8 milliseconds m: 2147477292 streamTest(): 105 milliseconds m: 2147477292 streamTestA(): 10 milliseconds m: 2147477292 streamTestB(): 14 milliseconds m: 2147477292 streamTestC(): 19 milliseconds ################################ 8u72ea b05: **************** c:\Abhijit\tools>java -version java version "1.8.0_72-ea" Java(TM) SE Runtime Environment (build 1.8.0_72-ea-b05) Java HotSpot(TM) Client VM (build 25.72-b05, mixed mode) c:\Abhijit\tools>javac ForVsStream.java c:\Abhijit\tools>java ForVsStream nanoTimerOverhead: 435 nanoseconds milliTimerOverhead: 423 nanoseconds randomInts(): 40 milliseconds m: 2147478668 forTest(): 4 milliseconds m: 2147478668 forTestA(): 5 milliseconds m: 2147478668 streamTest(): 403 milliseconds m: 2147478668 streamTestA(): 20 milliseconds m: 2147478668 streamTestB(): 34 milliseconds m: 2147478668 streamTestC(): 82 milliseconds c:\Abhijit\tools> ############################### 9ea: ****** c:\Abhijit\tools>java -version java version "1.9.0-ea" Java(TM) SE Runtime Environment (build 1.9.0-ea-b94) Java HotSpot(TM) 64-Bit Server VM (build 1.9.0-ea-b94, mixed mode) c:\Abhijit\tools>javac ForVsStream.java c:\Abhijit\tools>java ForVsStream nanoTimerOverhead: 251 nanoseconds milliTimerOverhead: 273 nanoseconds randomInts(): 45 milliseconds m: 2147479059 forTest(): 17 milliseconds m: 2147479059 forTestA(): 23 milliseconds m: 2147479059 streamTest(): 151 milliseconds m: 2147479059 streamTestA(): 22 milliseconds m: 2147479059 streamTestB(): 28 milliseconds m: 2147479059 streamTestC(): 49 milliseconds c:\Abhijit\tools>
22-12-2015