JDK-7081350 : Simple Regex matcher throws StackOverflowError
  • Type: Bug
  • Component: core-libs
  • Sub-Component: java.util.regex
  • Affected Version: 7
  • Priority: P4
  • Status: Closed
  • Resolution: Duplicate
  • OS: windows_xp
  • CPU: x86
  • Submitted: 2011-08-20
  • Updated: 2012-03-20
  • Resolved: 2011-08-22
Related Reports
Duplicate :  
Description
FULL PRODUCT VERSION :
java version "1.7.0"
Java(TM) SE Runtime Environment (build 1.7.0-b147)
Java HotSpot(TM) Client VM (build 21.0-b17, mixed mode, sharing)

java version "1.6.0_25"
Java(TM) SE Runtime Environment (build 1.6.0_25-b06)
Java HotSpot(TM) Client VM (build 20.0-b11, mixed mode, sharing)



ADDITIONAL OS VERSION INFORMATION :
Microsoft Windows XP Professional Version 2002 Service Pack 3

Also occur under Linux, likely not a OS dependent bug

A DESCRIPTION OF THE PROBLEM :
The pattern (<[^>]+> *)+$ searching for white-space separated HTML tags at the end of a document throws a StackOverFlow error if the input document has too many matches (eg, 500 tags in a row).

Same bug can be triggered with a pattern anchored at the beginning ^(<[^>]+> *)+

STEPS TO FOLLOW TO REPRODUCE THE PROBLEM :
Compile and run the following code:

import java.util.regex.Pattern;
public class RegexBug3 {
	public static final Pattern trailingTags = Pattern.compile("(<[^>]+> *)+$");
	public static final Pattern leadingTags = Pattern.compile("^( *<[^>]+>)+");
	public static void main(String[] args) {
		StringBuilder longDocument = new StringBuilder("<b>");
		for (int i = 0; i < 16; i++) {
			if (trailingTags.matcher(longDocument).find()) {
				// if (leadingTags.matcher(longDocument).find()) {
				System.err.println("OK for document with "
						+ (long) (Math.pow(2, i) + 0.5) + " tags");
			}
			longDocument.append(longDocument);
		}
	}
}

EXPECTED VERSUS ACTUAL BEHAVIOR :
EXPECTED -
OK for document with 1 tags: ""
OK for document with 2 tags: ""
OK for document with 4 tags: ""
OK for document with 8 tags: ""
OK for document with 16 tags: ""
OK for document with 32 tags: ""
OK for document with 64 tags: ""
OK for document with 128 tags: ""
OK for document with 256 tags: ""
OK for document with 512 tags: ""
OK for document with 1024 tags: ""
OK for document with 2048 tags: ""
OK for document with 4096 tags: ""
OK for document with 8192 tags: ""
OK for document with 16384 tags: ""
OK for document with 32768 tags: ""

ACTUAL -
OK for document with 1 tags: ""
OK for document with 2 tags: ""
OK for document with 4 tags: ""
OK for document with 8 tags: ""
OK for document with 16 tags: ""
OK for document with 32 tags: ""
OK for document with 64 tags: ""
OK for document with 128 tags: ""
OK for document with 256 tags: ""
Exception in thread "main" java.lang.StackOverflowError
        at java.util.regex.Pattern$CharProperty.match(Pattern.java:3693)
        at java.util.regex.Pattern$Curly.match(Pattern.java:4125)
    ...

ERROR MESSAGES/STACK TRACES THAT OCCUR :
Exception in thread "main" java.lang.StackOverflowError
        at java.util.regex.Pattern$GroupHead.match(Pattern.java:4554)
        at java.util.regex.Pattern$Loop.match(Pattern.java:4683)
        at java.util.regex.Pattern$GroupTail.match(Pattern.java:4615)
        at java.util.regex.Pattern$Curly.match0(Pattern.java:4177)
        at java.util.regex.Pattern$Curly.match(Pattern.java:4132)
        at java.util.regex.Pattern$BmpCharProperty.match(Pattern.java:3715)
        at java.util.regex.Pattern$Curly.match0(Pattern.java:4177)
        at java.util.regex.Pattern$Curly.match(Pattern.java:4132)
        at java.util.regex.Pattern$BmpCharProperty.match(Pattern.java:3715)
        at java.util.regex.Pattern$GroupHead.match(Pattern.java:4556)
        at java.util.regex.Pattern$Loop.match(Pattern.java:4683)
        at java.util.regex.Pattern$GroupTail.match(Pattern.java:4615)
        at java.util.regex.Pattern$Curly.match0(Pattern.java:4177)
        at java.util.regex.Pattern$Curly.match(Pattern.java:4132)
        at java.util.regex.Pattern$BmpCharProperty.match(Pattern.java:3715)

etc, a total of about 1000 lines, ending on:

       at java.util.regex.Pattern$BmpCharProperty.match(Pattern.java:3715)
        at java.util.regex.Pattern$GroupHead.match(Pattern.java:4556)
        at java.util.regex.Pattern$Loop.match(Pattern.java:4683)
        at java.util.regex.Pattern$GroupTail.match(Pattern.java:4615)
        at java.util.regex.Pattern$Curly.match0(Pattern.java:4177)
        at java.util.regex.Pattern$Curly.match(Pattern.java:4132)
        at java.util.regex.Pattern$BmpCharProperty.match(Pattern.java:3715)
        at java.util.regex.Pattern$Curly.match0(Pattern.java:4177)


REPRODUCIBILITY :
This bug can be reproduced always.

---------- BEGIN SOURCE ----------
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RegexBug3 {
	public static final Pattern trailingTags = Pattern.compile("(<[^>]+> *)+$");

	public static void main(String[] args) {
		StringBuilder longDocument = new StringBuilder("<b>");
		Matcher m;
		for (int i = 0; i < 16; i++) {
			// next line triggers the bug
			if ((m = trailingTags.matcher(longDocument)).find()) {
				String prunedString = longDocument.toString().substring(0,
						m.start());
				System.err.println("OK for document with "
						+ (long) (Math.pow(2, i) + 0.5) + " tags: \""
						+ prunedString + "\"");
			}
			longDocument.append(longDocument);
		}
	}
}
---------- END SOURCE ----------

CUSTOMER SUBMITTED WORKAROUND :
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RegexBug4 {

	// approach 1: apply pattern in a loop, limiting the number of matches
	// very slow for long inputs, but does not crash
	public static final Pattern trailingTags = Pattern
			.compile("(<[^>]+> *){1,100}$");

	public static void slow() {
		StringBuilder longDocument = new StringBuilder("<b>");
		Matcher m;
		for (int i = 0; i < 16; i++) {
			String prunedString = longDocument.toString();
			while ((m = trailingTags.matcher(prunedString)).find()) {
				prunedString = prunedString.substring(0, m.start());
			}
			if (prunedString.length() < longDocument.length()) {
				System.err.println("OK for document with "
						+ (long) (Math.pow(2, i) + 0.5) + " tags: \""
						+ prunedString + "\"");
			}
			longDocument.append(longDocument);
		}
	}

	// approach 2: reverse pattern and match from the beginning instead of the
	// end and apply pattern in a loop, limiting the number of matches:
	// does not crash, and much faster than matching from the end
	public static final Pattern trailingTagsReversed = Pattern
			.compile("^( *>[^<]+<){1,100}");

	public static void fast() {
		StringBuilder longDocument = new StringBuilder("<b>");
		Matcher m;
		for (int i = 0; i < 16; i++) {
			String prunedStringReversed = (new StringBuilder(longDocument)).reverse().toString();
			while ((m = trailingTagsReversed.matcher(prunedStringReversed))
					.find()) {
				prunedStringReversed = prunedStringReversed.substring(m.end());
			}
			if (prunedStringReversed.length() < longDocument.length()) {
				System.err.println("OK for document with "
						+ (long) (Math.pow(2, i) + 0.5) + " tags: \""
						+ (new StringBuilder(prunedStringReversed)).reverse()
						+ "\"");
			}
			longDocument.append(longDocument);
		}
	}

	// run with optional argument "fast", everything else runs the slow version
	public static void main(String[] args) {
		if (args.length > 0 && "fast".equals(args[0])) {
			fast();
		} else {
			slow();
		}
	}

}