G1 performs much slower than the parallel GC in various simple System.arraycopy microbenchmarks with Object[] targets[1]. Although some of this might be unavoidable due to having heavier write barriers, profiling indicates that the code in G1SATBCardTableModRefBS::write_ref_array_pre_work may be insufficiently optimized/inlined.
A quick experiment that simply hoists the calls to SATBMarkQueue::enqueue directly into write_ref_array_pre_work improves G1 performance on these microbenchmarks by 3x, which may be a straightforward way to narrow the gap in these corner cases:
diff -r b01c519b715e src/share/vm/gc/g1/g1SATBCardTableModRefBS.cpp
--- a/src/share/vm/gc/g1/g1SATBCardTableModRefBS.cpp Thu Mar 16 12:09:14 2017 -0700
+++ b/src/share/vm/gc/g1/g1SATBCardTableModRefBS.cpp Sun Mar 19 20:09:31 2017 +0100
@@ -60,10 +60,22 @@
G1SATBCardTableModRefBS::write_ref_array_pre_work(T* dst, int count) {
if (!JavaThread::satb_mark_queue_set().is_active()) return;
T* elem_ptr = dst;
- for (int i = 0; i < count; i++, elem_ptr++) {
- T heap_oop = oopDesc::load_heap_oop(elem_ptr);
- if (!oopDesc::is_null(heap_oop)) {
- enqueue(oopDesc::decode_heap_oop_not_null(heap_oop));
+ Thread* thr = Thread::current();
+ if (thr->is_Java_thread()) {
+ JavaThread* jt = (JavaThread*)thr;
+ SATBMarkQueue& smq = jt->satb_mark_queue();
+ for (int i = 0; i < count; i++, elem_ptr++) {
+ T heap_oop = oopDesc::load_heap_oop(elem_ptr);
+ if (!oopDesc::is_null(heap_oop)) {
+ smq.enqueue(oopDesc::decode_heap_oop_not_null(heap_oop));
+ }
+ }
+ } else {
+ for (int i = 0; i < count; i++, elem_ptr++) {
+ T heap_oop = oopDesc::load_heap_oop(elem_ptr);
+ if (!oopDesc::is_null(heap_oop)) {
+ enqueue(oopDesc::decode_heap_oop_not_null(heap_oop));
+ }
}
}
}
[1]
// Fixed 200-element source array of (null) references, allocated once per class load.
private static final Object[] TEST_OBJECTS = new Object[200];
// Destination array, same length as the source; kept as an instance field so each
// benchmark invocation copies into live heap storage.
public Object[] dummyObjectArray = new Object[TEST_OBJECTS.length];
@Benchmark
public void arrayCopyObject() {
// Copies all 200 object references; under G1 this drives the SATB pre-barrier path
// (write_ref_array_pre_work) discussed above, once per element copied.
System.arraycopy(TEST_OBJECTS, 0, dummyObjectArray, 0, dummyObjectArray.length);
}