United StatesChange Country, Oracle Worldwide Web Sites Communities I am a... I want to...
Bug ID: JDK-6810474 par compact - crash in summary_phase with very full heap
JDK-6810474 : par compact - crash in summary_phase with very full heap

Details
Type:
Bug
Submit Date:
2009-02-26
Status:
Closed
Updated Date:
2011-03-08
Project Name:
JDK
Resolved Date:
2011-03-08
Component:
hotspot
OS:
generic
Sub-Component:
gc
CPU:
generic
Priority:
P3
Resolution:
Fixed
Affected Versions:
hs14
Fixed Versions:
hs15 (b05)

Related Reports
Backport:
Backport:
Relates:

Sub Tasks

Description
The following tests caused jvm crash with solaris-amd64 binaries during b47 promotion testing in "-XX:+UseParallelOldGC" configuration:

   gc/gctests/FinalizeTest01
   gc/gctests/FinalizeTest02

#
# A fatal error has been detected by the Java Runtime Environment:
#
#  SIGSEGV (0xb) at pc=0xfffffd7ffedb736f, pid=13170, tid=5
#
# JRE version: 7.0-b47
# Java VM: Java HotSpot(TM) 64-Bit Server VM (15.0-b01 mixed mode solaris-amd64 )
# Problematic frame:
# V  [libjvm.so+0xbb736f]
#
# If you would like to submit a bug report, please visit:
#   http://java.sun.com/webapps/bugreport/crash.jsp
#

Native frames: (J=compiled Java code, j=interpreted, Vv=VM code, C=native code)
V  [libjvm.so+0xbb736f];;  void PSParallelCompact::summary_phase(ParCompactionManager*,bool)+0x41f
V  [libjvm.so+0xbb914d];;  void PSParallelCompact::invoke_no_policy(bool)+0x70d
V  [libjvm.so+0xbc7cc0];;  void PSScavenge::invoke()+0x130
V  [libjvm.so+0xb93aba];;  HeapWord*ParallelScavengeHeap::failed_mem_allocate(unsigned long,bool)+0x9a
V  [libjvm.so+0x422d86];;  void VM_ParallelGCFailedAllocation::doit()+0x96
V  [libjvm.so+0x422817];;  void VM_Operation::evaluate()+0x77
V  [libjvm.so+0x5b4ef1];;  void VMThread::loop()+0x4c1
V  [libjvm.so+0x5b3e7a];;  void VMThread::run()+0x7a
V  [libjvm.so+0xb74883];;  java_start+0x4c3
C  [libc.so.1+0xd504b]  _thr_slot_offset+0x31b;;  _thr_setup+0x5b
C  [libc.so.1+0xd5280]  _thr_slot_offset+0x550;;  _lwp_start+0x0

VM_Operation (0xfffffd7ffdfce4e0): ParallelGCFailedAllocation, mode: safepoint, requested by thread 0x000000000041c000

Running the test with fastdebug binaries triggers assertion failure:

#
# A fatal error has been detected by the Java Runtime Environment:
#
#  Internal Error (/BUILD_AREA/jdk7/hotspot/src/share/vm/gc_implementation/parallelScavenge/psParallelCompact.hpp:581), pid=25862, tid=5
#  Error: assert(addr<= _region_end,"bad addr")
#
# JRE version: 7.0-b47
# Java VM: Java HotSpot(TM) 64-Bit Server VM (15.0-b01-fastdebug mixed mode solaris-amd64 )
# If you would like to submit a bug report, please visit:
#   http://java.sun.com/webapps/bugreport/crash.jsp
#

---------------  T H R E A D  ---------------

Current thread (0x0000000000660800):  VMThread [stack: 0xfffffd7fd18ff000,0xfffffd7fd19ff000] [id=5]

Stack: [0xfffffd7fd18ff000,0xfffffd7fd19ff000],  sp=0xfffffd7fd19fd860,  free space=1018k
Native frames: (J=compiled Java code, j=interpreted, Vv=VM code, C=native code)
V  [libjvm.so+0x1d647df];;  void VMError::report(outputStream*)+0x68f
V  [libjvm.so+0x1d6569d];;  void VMError::report_and_die()+0x4fd
V  [libjvm.so+0xa5879b];;  void report_assertion_failure(const char*,int,const char*)+0x5ab
V  [libjvm.so+0x1937d45];;  bool ParallelCompactData::summarize(SplitInfo&,HeapWord*,HeapWord*,HeapWord**,HeapWord*,HeapWord*,HeapWord**)+0x12b5
V  [libjvm.so+0x194a9c7];;  void PSParallelCompact::summary_phase(ParCompactionManager*,bool)+0x597
V  [libjvm.so+0x194bbbb];;  void PSParallelCompact::invoke_no_policy(bool)+0x68b
V  [libjvm.so+0x1977e2e];;  void PSScavenge::invoke()+0x21e
V  [libjvm.so+0x187cdd1];;  HeapWord*ParallelScavengeHeap::failed_mem_allocate(unsigned long,bool)+0x161
V  [libjvm.so+0x1d67539];;  void VM_ParallelGCFailedAllocation::doit()+0xd9
V  [libjvm.so+0x1d92f1b];;  void VM_Operation::evaluate()+0xfb
V  [libjvm.so+0x1d91033];;  void VMThread::evaluate_operation(VM_Operation*)+0x113
V  [libjvm.so+0x1d9190d];;  void VMThread::loop()+0x72d
V  [libjvm.so+0x1d90c7f];;  void VMThread::run()+0x9f
V  [libjvm.so+0x182762a];;  java_start+0x66a
C  [libc.so.1+0xd504b]  _thr_slot_offset+0x31b;;  _thr_setup+0x5b
C  [libc.so.1+0xd5280]  _thr_slot_offset+0x550;;  _lwp_start+0x0

VM_Operation (0xfffffd7ffc44e340): ParallelGCFailedAllocation, mode: safepoint, requested by thread 0x0000000000440000

To reproduce, do the following:

ssh vmsqe-v20z-01.russia
cd /net/vmsqe.russia/export/execution/results/JDK7/PROMOTION/VM/b47/ParallelOldGC/vm/solaris-amd64/server/mixed/solaris-amd64_server_mixed_vm.gc.testlist/ResultDir/FinalizeTest01
sh rerun.sh # may need to run this a couple of times

You can also do:
 bash /net/vmsqe.russia/export/bin/reproduce_bug.sh rerun.sh

The failure is reproducible with jdk7 b46 (hs14 b10), but not with jdk7 b42 (hs14 b09). Perhaps this failure mode became exposed by a number of parallel gc fixes that went into hs14 b10 (6786188, 6784849).

                                    

Comments
EVALUATION

With a core file and assembly listing, identified the failure point in summarize_split_space(), which was added as part of 6765745.  Failures are in the loop which clears the source_region field for regions that contain part of an object which does not fit completely into the destination space:

    const RegionData* const sr = region(split_region); 
    const size_t beg_idx = 
      addr_to_region_idx(region_align_up(sr->destination() + 
                                         sr->partial_obj_size())); 
    const size_t end_idx = 
1---> addr_to_region_idx(region_align_up(destination + partial_obj_size)); 
 
    if (TraceParallelOldGCSummaryPhase) { 
        gclog_or_tty->print_cr("split:  clearing source_region field in [" 
                               SIZE_FORMAT ", " SIZE_FORMAT ")", 
                               beg_idx, end_idx); 
    } 
    for (size_t idx = beg_idx; idx < end_idx; ++idx) { 
2---> _region_data[idx].set_source_region(0); 
    } 

1 is the fastdebug failure (assert in region_align_up()), 2 is the product failure (SEGV).  The address 'destination + partial_obj_size' used to compute the upper bound on the loop (end_idx) is outside the heap.
                                     
2009-03-03
EVALUATION

The code snippet in the above entry is inside this if block:

  if (destination + partial_obj_size > target_end) { 
    ...
    const size_t end_idx = 
      addr_to_region_idx(region_align_up(destination + partial_obj_size)); 
    ...
  }

Here target_end is the end of the target (i.e., destination) space and destination + partial_obj_size extends beyond it.  It's never safe to clear a region beyond the end of the target space; the failure occurs when we try to clear beyond the end of the very last space in the heap (one of the survivor spaces).

Need to use target_end to compute the last region to clear (end_idx) instead of destination + partial_obj_size.
                                     
2009-03-04
EVALUATION

Generally very difficult to reproduce.  However, running the test

   gc.gctests.Steal.steal002.steal002

with a fastdebug build (32- or 64-bit) on a single-cpu sparc (a rare thing) or using a single cpu processor set causes the test to fail reliably.  No failures are seen with a build containing the suggested fix.
                                     
2009-04-03
SUGGESTED FIX

# HG changeset patch
# User jcoomes
# Date 1236118998 28800
# Node ID 9efca00da61ed9d5a01fa1bf2cd4dea1747742eb
# Parent  cea947c8a9881f8236e22957caf4a3339881a6fe
6810474: par compact - crash in summary_phase with very full heap
Reviewed-by: tonyp

diff --git a/src/share/vm/gc_implementation/parallelScavenge/psParallelCompact.cpp b/src/share/vm/gc_implementation/parallelScavenge/psParallelCompact.cpp
--- a/src/share/vm/gc_implementation/parallelScavenge/psParallelCompact.cpp
+++ b/src/share/vm/gc_implementation/parallelScavenge/psParallelCompact.cpp
@@ -508,6 +508,7 @@
   assert(destination <= target_end, "sanity");
   assert(destination + _region_data[src_region].data_size() > target_end,
     "region should not fit into target space");
+  assert(is_region_aligned(target_end), "sanity");
 
   size_t split_region = src_region;
   HeapWord* split_destination = destination;
@@ -538,14 +539,12 @@
     //         max(top, max(new_top, clear_top))
     //
     // where clear_top is a new field in SpaceInfo.  Would have to set clear_top
-    // to destination + partial_obj_size, where both have the values passed to
-    // this routine.
+    // to target_end.
     const RegionData* const sr = region(split_region);
     const size_t beg_idx =
       addr_to_region_idx(region_align_up(sr->destination() +
                                          sr->partial_obj_size()));
-    const size_t end_idx =
-      addr_to_region_idx(region_align_up(destination + partial_obj_size));
+    const size_t end_idx = addr_to_region_idx(target_end);
 
     if (TraceParallelOldGCSummaryPhase) {
         gclog_or_tty->print_cr("split:  clearing source_region field in ["
                                     
2009-04-03
EVALUATION

http://hg.openjdk.java.net/jdk7/hotspot-gc/hotspot/rev/f18338cf04b0
                                     
2009-04-03
EVALUATION

http://hg.openjdk.java.net/jdk7/hotspot/hotspot/rev/f18338cf04b0
                                     
2009-04-04



Hardware and Software, Engineered to Work Together