JDK-8034833 : Strange performance behaviour of cmov vs branch on x86
  • Type: Enhancement
  • Component: hotspot
  • Sub-Component: compiler
  • Affected Version: 9,10
  • Priority: P4
  • Status: Open
  • Resolution: Unresolved
  • OS: generic
  • CPU: x86
  • Submitted: 2014-02-13
  • Updated: 2018-10-05
Description
On 2/7/14 11:35 AM, Martin Grajcar wrote:
> I wrote a simple benchmark showing much better performance (on Core
> i5) for a branching probability of about 50% than for 15%. The branch is
> unpredictable. The better performance comes from HotSpot replacing Jcc
> with CMOVcc; the worse performance comes from it not doing so in cases
> where it seemingly should.
>
> The linked picture shows the duration as measured with Caliper:
> http://i.stack.imgur.com/TstzH.png
> The attached JMH benchmark confirms it:
>
> PERCENTAGE:      MEAN    MIN    MAX   UNIT
> branchless:     7.237  6.977  7.283 ops/ms
>           5:     7.848  7.355  8.306 ops/ms
>          10:     5.522  5.359  5.665 ops/ms
>          15:     4.205  4.027  4.372 ops/ms
>          16:     3.964  3.677  4.255 ops/ms
> *        17:     3.779  3.478  4.048 ops/ms*
> *        18:     4.459  3.458  7.983 ops/ms*
> *        19:     7.922  7.168  8.188 ops/ms*
>          20:     8.008  7.697  8.328 ops/ms
>          30:     7.938  5.410  8.075 ops/ms
>          40:     8.004  7.651  8.256 ops/ms
>          50:     7.995  7.440  8.055 ops/ms
>
> It looks like the JIT switches to CMOVcc somewhere around an 18% branching
> probability, but by then the branch-misprediction penalty has already reduced
> the speed to about half. The break-even point lies somewhere around 5%, and
> always using CMOVcc would also be better than the current state.
>
> Is this a performance bug or is there an explanation?
>
> I might have forgotten some details; you can find them in my SO question
> or ask me:
> http://stackoverflow.com/questions/19689214/strange-branching-performance
>
> Regards, Martin.
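
For orientation, a minimal JMH sketch of this kind of measurement (illustrative only; class, field and parameter names are not taken from the attached benchmark):

import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.openjdk.jmh.annotations.*;

@State(Scope.Thread)
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public class BranchProbabilitySketch {
    // Probability (in percent) that the branch below is taken.
    @Param({"5", "15", "17", "19", "50"})
    int percentage;

    int[] data;

    @Setup
    public void setup() {
        Random r = new Random(42);
        data = new int[64 * 1024];
        for (int i = 0; i < data.length; i++) {
            // Random order makes the branch unpredictable at roughly percentage%.
            data[i] = (r.nextInt(100) < percentage) ? 0 : 1;
        }
    }

    @Benchmark
    public int count() {
        int result = 0;
        for (int v : data) {
            if (v == 0) {   // candidate for Jcc vs. CMOVcc
                result++;
            }
        }
        return result;
    }
}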

Comments
On 2/13/14 8:12 PM, Martin Grajcar wrote:
> My manually written assembly runs in 430 (it looks like we're using the
> same units and my computer is slightly slower) and it looks like this:
>
>   "movl %edi, %r15d\n"     // i+0
>   "andl %esi, %r15d\n"     // (i+0) & mask
>   "addl $-1, %r15d\n"      // carry = ((i+0) & mask) ? 1 : 0
>   "adcl $0, %eax\n"        // result += carry
>
>   "leal 1(%edi), %r15d\n"  // (i+1)
>   "andl %esi, %r15d\n"     // (i+1) & mask
>   "addl $-1, %r15d\n"      // carry = ((i+1) & mask) ? 1 : 0
>   "adcl $0, %eax\n"        // result += carry
>
> Unfortunately, the AND before TEST removing and the ADC optimizations
> are mutually exclusive.
14-02-2014

Here is code for second optimization:

diff -r 194e8b7fe9ca src/cpu/x86/vm/x86_64.ad
--- a/src/cpu/x86/vm/x86_64.ad  Thu Jan 30 14:30:01 2014 +0100
+++ b/src/cpu/x86/vm/x86_64.ad  Thu Feb 13 17:04:59 2014 -0800
@@ -9610,6 +9385,73 @@
 %}
 
+instruct cmpEQMask(rRegI dst, rRegI p, rRegI q, rFlagsReg cr)
+%{
+  match(Set dst (CmpEQMask p q));
+  effect(KILL cr);
+
+  ins_cost(200);
+  format %{ "cmpl    $p, $q\t# cmpEQMask\n\t"
+            "seteq   $dst\n\t"
+            "movzbl  $dst, $dst\n\t"
+            "negl    $dst" %}
+  ins_encode %{
+    Register Rp = $p$$Register;
+    Register Rq = $q$$Register;
+    Register Rdst = $dst$$Register;
+    __ cmpl(Rp, Rq);
+    __ setb(Assembler::equal, Rdst);
+    __ movzbl(Rdst, Rdst);
+    __ negl(Rdst);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct cmpEQMask0(rRegI dst, immI0 zero, rFlagsReg cr)
+%{
+  match(Set dst (CmpEQMask dst zero));
+  effect(KILL cr);
+
+  ins_cost(200);
+  format %{ "testl   $dst, $dst\t# cmpEQMask\n\t"
+            "seteq   $dst\n\t"
+            "movzbl  $dst, $dst\n\t"
+            "negl    $dst" %}
+  ins_encode %{
+    Register Rdst = $dst$$Register;
+    Label done;
+    __ testl(Rdst, Rdst);
+    __ setb(Assembler::equal, Rdst);
+    __ movzbl(Rdst, Rdst);
+    __ negl(Rdst);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cadd_cmpEQMask(rRegI dst, rdx_RegI p, rRegI q, immI1 one, rFlagsReg cr)
+%{
+  match(Set dst (AddI (AndI (CmpEQMask p q) one) dst));
+  effect(KILL p, KILL cr);
+
+  ins_cost(200);
+
+  format %{ "cmpl    $p, $q\t# cadd_cmpEQMask\n\t"
+            "seteq   $p\n\t"
+            "movzbl  $p, $p\n\t"
+            "addl    $dst, $p" %}
+  ins_encode %{
+    Register Rp = $p$$Register;
+    Register Rq = $q$$Register;
+    Register Rdst = $dst$$Register;
+    __ cmpl(Rp, Rq);
+    __ setb(Assembler::equal, Rp);
+    __ movzbl(Rp, Rp);
+    __ addl(Rdst, Rp);
+  %}
+  ins_pipe(pipe_cmplt);
+%}
+
+
 //---------- FP Instructions------------------------------------------------
 
 instruct cmpF_cc_reg(rFlagsRegU cr, regF src1, regF src2)
diff -r 194e8b7fe9ca src/share/vm/opto/cfgnode.cpp
--- a/src/share/vm/opto/cfgnode.cpp  Thu Jan 30 14:30:01 2014 +0100
+++ b/src/share/vm/opto/cfgnode.cpp  Thu Feb 13 17:04:59 2014 -0800
@@ -1289,14 +1289,20 @@
   if (region->in(2)->outcnt() != 1) return NULL;
 
   // Check for "(P < Q)" of type signed int
-  if (b->_test._test != BoolTest::lt) return NULL;
+  if (b->_test._test != BoolTest::lt &&
+      b->_test._test != BoolTest::ne) return NULL;
   if (cmp->Opcode() != Op_CmpI) return NULL;
 
   Node *p = cmp->in(1);
   Node *q = cmp->in(2);
   Node *n1 = phi->in( true_path);
   Node *n2 = phi->in(3-true_path);
-
+  if (b->_test._test == BoolTest::ne) {
+    // swap
+    Node* n = n2;
+    n2 = n1;
+    n1 = n;
+  }
   int op = n1->Opcode();
   if( op != Op_AddI           // Need zero as additive identity
       /*&&op != Op_SubI &&
@@ -1317,7 +1323,12 @@
   if( q->is_Con() && phase->type(q) != TypeInt::ZERO && y->is_Con() )
     return NULL;
 
-  Node *cmplt = phase->transform( new (phase->C) CmpLTMaskNode(p,q) );
+  Node *cmplt = NULL;
+  if (b->_test._test == BoolTest::lt) {
+    cmplt = phase->transform( new (phase->C) CmpLTMaskNode(p,q) );
+  } else {
+    cmplt = phase->transform( new (phase->C) CmpEQMaskNode(p,q) );
+  }
   Node *j_and = phase->transform( new (phase->C) AndINode(cmplt,y) );
   return new (phase->C) AddINode(j_and,x);
 }
diff -r 194e8b7fe9ca src/share/vm/opto/classes.hpp
--- a/src/share/vm/opto/classes.hpp  Thu Jan 30 14:30:01 2014 +0100
+++ b/src/share/vm/opto/classes.hpp  Thu Feb 13 17:04:59 2014 -0800
@@ -79,6 +79,7 @@
 macro(CmpL)
 macro(CmpL3)
 macro(CmpLTMask)
+macro(CmpEQMask)
 macro(CmpP)
 macro(CmpU)
 macro(CompareAndSwapI)
diff -r 194e8b7fe9ca src/share/vm/opto/subnode.hpp
--- a/src/share/vm/opto/subnode.hpp  Thu Jan 30 14:30:01 2014 +0100
+++ b/src/share/vm/opto/subnode.hpp  Thu Feb 13 17:04:59 2014 -0800
@@ -362,6 +362,16 @@
   virtual uint ideal_reg() const { return Op_RegI; }
 };
 
+//------------------------------CmpEQMaskNode----------------------------------
+// If p == q, return -1 else return 0.  Nice for flow-free idioms.
+class CmpEQMaskNode : public Node {
+public:
+  CmpEQMaskNode( Node *p, Node *q ) : Node(0, p, q) {}
+  virtual int Opcode() const;
+  const Type *bottom_type() const { return TypeInt::INT; }
+  virtual uint ideal_reg() const { return Op_RegI; }
+};
+
 //------------------------------NegNode----------------------------------------
 class NegNode : public Node {
diff -r 194e8b7fe9ca src/share/vm/runtime/vmStructs.cpp
--- a/src/share/vm/runtime/vmStructs.cpp  Thu Jan 30 14:30:01 2014 +0100
+++ b/src/share/vm/runtime/vmStructs.cpp  Thu Feb 13 17:04:59 2014 -0800
@@ -1961,6 +1961,7 @@
   declare_c2_type(AbsFNode, AbsNode)      \
   declare_c2_type(AbsDNode, AbsNode)      \
   declare_c2_type(CmpLTMaskNode, Node)    \
+  declare_c2_type(CmpEQMaskNode, Node)    \
   declare_c2_type(NegNode, Node)          \
   declare_c2_type(NegFNode, NegNode)      \
   declare_c2_type(NegDNode, NegNode)      \
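
At the Java level, the shape this patch is aimed at is a conditional add guarded by an equality test, roughly like the sketch below (a schematic example, not code from the patch):

static int countEqual(int[] values, int key) {
    int result = 0;
    for (int v : values) {
        // "(P == Q) ? X+1 : X" - with the patch this can become
        // CmpEQMask + AndI + AddI (cmp/seteq/movzbl/addl) instead of a branch.
        if (v == key) {
            result++;
        }
    }
    return result;
}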
14-02-2014

The first optimization, which replaced (CmpI (AndI src mask) zero) with (TestI src mask), gave a slight improvement in my test. The second optimization converts if (P == Q) { X+Y } to data flow only:

  cmp    RDX, R9   # cadd_cmpEQMask
  seteq  RDX
  movzb  RDX, RDX
  add    RAX, RDX

It gave an improvement for the JmhBranchingBenchmark test even above the cmov code (cmov is still generated after 19% - that is a separate problem):

PERCENTAGE:      MEAN    MIN    MAX   UNIT
branchless:     8.511  8.475  8.547 ops/ms
          5:     9.756  9.709  9.804 ops/ms
         10:     9.709  9.709  9.709 ops/ms
         15:     9.756  9.709  9.804 ops/ms
         16:     9.709  9.709  9.709 ops/ms
         17:     9.756  9.709  9.804 ops/ms
         18:     9.756  9.709  9.804 ops/ms
         19:     9.133  9.091  9.174 ops/ms
         20:     9.133  9.091  9.174 ops/ms
         30:     9.133  9.091  9.174 ops/ms
         40:     9.133  9.091  9.174 ops/ms
         50:     9.133  9.091  9.174 ops/ms

vs. branches:

PERCENTAGE:      MEAN    MIN    MAX   UNIT
branchless:     8.511  8.475  8.547 ops/ms
          5:     8.889  8.850  8.929 ops/ms
         10:     5.716  5.618  5.814 ops/ms
         15:     4.320  4.310  4.329 ops/ms
         16:     4.175  4.167  4.184 ops/ms
         17:     3.929  3.922  3.937 ops/ms
         18:     9.133  9.091  9.174 ops/ms
         19:     9.133  9.091  9.174 ops/ms
         20:     9.133  9.091  9.174 ops/ms
         30:     9.133  9.091  9.174 ops/ms
         40:     9.133  9.091  9.174 ops/ms
         50:     9.133  9.091  9.174 ops/ms

Unfortunately, for my test it gave a regression, though a smaller one than when using cmov:

  testi time: 687
vs. base:
  testi time: 402
vs. cmov:
  testi time: 785
14-02-2014

On 2/13/14 12:32 AM, Martin Grajcar wrote:
> I think you could save one register and two instructions. The generated
> code for the conditional increment seems to use BlockLayoutByFrequency
> and looks like
>
>   mov  %ebp,%r8d
>   and  %ebx,%r8d
>   test %r8d,%r8d   # not used anymore
>   je   0x00007fafdf77d907
>
> while simply
>
>   test %ebp,%ebx
>   je   0x00007fafdf77d907

We have such matching, but only if a constant or memory operand is used instead of a register (%ebx) in that position:

  match(Set cr (CmpI (AndI src con) zero));

It is an oversight and very easy to fix:

+instruct testI_reg_reg(rFlagsReg cr, rRegI src, rRegI mask, immI0 zero)
+%{
+  match(Set cr (CmpI (AndI src mask) zero));
+
+  format %{ "testl   $src, $mask" %}
+  ins_encode %{
+    __ testl($src$$Register, $mask$$Register);
+  %}
+  ins_pipe(ialu_cr_reg_reg);
+%}
+
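
The Java-level pattern where this matters is a test of (src & mask) against zero with the mask held in a register rather than being a constant; a hypothetical example:

static boolean anyBitsSet(int src, int mask) {
    // Ideal graph: (CmpI (AndI src mask) zero); with the new rule this
    // matches a single "testl src, mask" instead of andl + testl.
    return (src & mask) != 0;
}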
13-02-2014

> After switching BlockLayoutByFrequency off, the loop body is
> essentially this chunk of code repeated 16 times (with old_result and
> new_result switching between iterations):
>
>   mov    %i, %tmp
>   add    $increment, %tmp
>   and    %mask, %tmp
>   mov    %old_result, %new_result
>   inc    %new_result
>   test   %tmp,%tmp
>   cmovne %old_result,%new_result
>
> Even when looking at the xxx_result only, each chunk seems to need two
> cycles, because of data dependencies (assuming mov+inc get fused into a
> single 3-operand microinstruction).
>
> This dependency could be eliminated, but there are still 7 instructions,
> which is a lot. Now I can see how branching out helps.
>
> By using LEA the instruction count can be reduced to 5, but without any
> visible speedup in my test.
>
> Can add with carry be used?
>
>   mov %i, %tmp
>   add $increment, %tmp
>   and %mask, %tmp
>   add $-1, %tmp
>   adc $0, %result
>
> This works only for counting non-zeros (or zeros after a simple
> transform or with SBB), but counting is pretty common. In my stupid
> hand-made assembly there's a nice speedup (from 0.7 ns/iteration to 0.4).

A very interesting suggestion. We are already doing something similar in is_cond_add() in opto/cfgnode.cpp:

  // Check for simple conditional add pattern:  "(P < Q) ? X+Y : X;"
  // To be profitable the control flow has to disappear; there can be no
  // other values merging here.  We replace the test-and-branch with:
  // "(sgn(P-Q))&Y) + X".  Basically, convert "(P < Q)" into 0 or -1 by
  // moving the carry bit from (P-Q) into a register with 'sbb EAX,EAX'.
  // Then convert Y to 0-or-Y and finally add.

We can add the (P != 0) case.
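
As a worked illustration of that existing transform (my own sketch; it assumes p - q does not overflow int):

// Branchy form:        r = (p < q) ? x + y : x;
// Branch-free rewrite: convert (p < q) into an all-ones/zero mask, then add.
static int caddLessThan(int p, int q, int x, int y) {
    int mask = (p - q) >> 31;   // -1 iff p < q (assuming no overflow), else 0
    return (mask & y) + x;
}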
13-02-2014

Yes, I tried. The problem is that the test itself sets it:

  .jvmArgs("-Dperc=" + perc)

This overwrites any --jvmargs setting given on the command line for the forked VMs. You have to modify the test itself to include the flags in .jvmArgs(...).
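
A sketch of that modification using the JMH OptionsBuilder (the extra -XX flag is only an example of something one might want to pass; it is not from the original test):

import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;

public class ForkedRunner {
    public static void main(String[] args) throws RunnerException {
        int perc = 17; // example value
        Options opt = new OptionsBuilder()
                .include("JmhBranchingBenchmark")
                // jvmArgs() defines the forked VM's argument list, so any extra
                // flags have to be listed here next to -Dperc.
                .jvmArgs("-Dperc=" + perc, "-XX:-BlockLayoutByFrequency")
                .build();
        new Runner(opt).run();
    }
}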
13-02-2014

VladimirK, regarding passing additional flags to the forked VM in JMH, have you tried --jvmargs?
13-02-2014

I modified the test (BranchingBenchmarkV.java) to not use JMH because I can't pass flags to the forked VM. Methods are cloned (copy/paste) to have separate profile info for the different probabilities. The result is the same - the test performance is high when cmov is used:

PERCENTAGE:      MEAN    MIN    MAX   UNIT
branchless:     8.811  8.772  8.850 ops/ms
          5:     7.220  7.194  7.246 ops/ms
         10:     5.183  5.102  5.263 ops/ms
         15:     4.320  4.310  4.329 ops/ms
         16:     4.158  4.149  4.167 ops/ms
         17:     4.040  4.032  4.049 ops/ms
         18:     3.876  3.861  3.891 ops/ms
         19:     9.434  9.434  9.434 ops/ms
         20:     9.479  9.434  9.524 ops/ms
         30:     9.390  9.346  9.434 ops/ms
         40:     9.390  9.346  9.434 ops/ms
         50:     9.434  9.434  9.434 ops/ms
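
The cloning trick looks roughly like this (a sketch, not the actual BranchingBenchmarkV.java): each copy has its own MethodData, so the JIT profiles each probability separately instead of merging everything into one profile.

static int count17(int[] data) {   // called only with data where ~17% of elements are zero
    int r = 0;
    for (int v : data) if (v == 0) r++;
    return r;
}

static int count50(int[] data) {   // identical body, but a separate branch profile
    int r = 0;
    for (int v : data) if (v == 0) r++;
    return r;
}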
13-02-2014

Some changes to do experiments. First to keep simple blocks (one instruction) inside loops. Second to switch on generation of cmov instructions for probabilities > 0.1% instead of > 20% (BlockLayoutMinDiamondPercentage):

--- a/src/share/vm/opto/block.hpp  Thu Jan 30 14:30:01 2014 +0100
+++ b/src/share/vm/opto/block.hpp  Wed Feb 12 17:01:38 2014 -0800
@@ -745,7 +745,7 @@
   CFGEdge(Block *from, Block *to, float freq, int from_pct, int to_pct) :
     _from(from), _to(to), _freq(freq),
     _from_pct(from_pct), _to_pct(to_pct), _state(open) {
-    _infrequent = from_infrequent() || to_infrequent();
+    _infrequent = (from_infrequent() || to_infrequent()) && (!UseNewCode2 || to->number_of_nodes() > 4);
   }
 
   float freq() const { return _freq; }
diff -r 194e8b7fe9ca src/share/vm/opto/loopopts.cpp
--- a/src/share/vm/opto/loopopts.cpp  Thu Jan 30 14:30:01 2014 +0100
+++ b/src/share/vm/opto/loopopts.cpp  Wed Feb 12 17:01:38 2014 -0800
@@ -588,7 +588,7 @@
     // BlockLayoutByFrequency optimization moves infrequent branch
     // from hot path. No point in CMOV'ing in such case (110 is used
     // instead of 100 to take into account not exactness of float value).
-    if (BlockLayoutByFrequency) {
+    if (!UseNewCode && BlockLayoutByFrequency) {
       infrequent_prob = MAX2(infrequent_prob, (float)BlockLayoutMinDiamondPercentage/110.0f);
     }
   }
13-02-2014

The issue is more complicated than I thought. The code I pointed to before was added by me about 3 years ago for:

7097546: Optimize use of CMOVE instructions
https://bugs.openjdk.java.net/browse/JDK-7097546

The changes were done to avoid a 2x performance hit with cmov for code like the following (from Test_cmove.java attached to 7097546):

  public static int test(int result, int limit, int mask) { // mask = 15
    for (int i = 0; i < limit; i++) {
      if ((i&mask) == 0) result++; // Non frequent
    }
    return result;
  }

The cmov instruction has a big flaw - it requires an additional register. If a loop's body is complex, using cmov will result in register spilling - additional instructions. The performance hit could be higher than branch misprediction. I am not sure how to proceed from here. I may do some benchmark testing to see the effects if cmov is used in more cases.

Regards,
Vladimir
13-02-2014

The code which affects this test was added for 7097546. See the attached test in that bug report.
13-02-2014