From e525552d73396aac9c842a76d5f495f88653207a Mon Sep 17 00:00:00 2001 From: Dawid Weiss Date: Mon, 28 May 2012 12:00:24 +0000 Subject: [PATCH] LUCENE-4078: PatternReplaceCharFilter assertion error caused by malformed utf-16. This is most likely a bug in the JDK: when a pattern that matches the empty string is applied, e.g. Pattern.compile("").matcher(s).replaceAll("x"), the replacement is inserted in between the two halves of a surrogate pair, which corrupts the string. A temporary fix is to check for this in the random pattern generator and pick again if detected. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1343214 13f79535-47bb-0310-9956-ffa450edef68 --- .../pattern/TestPatternReplaceCharFilter.java | 13 +---------- .../apache/lucene/analysis/MockTokenizer.java | 1 - .../org/apache/lucene/util/_TestUtil.java | 22 +++++++++++++++++++ 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java index 9341fafb052..c316e1ff702 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java @@ -23,7 +23,6 @@ import java.io.StringReader; import java.util.Random; import java.util.regex.Matcher; import java.util.regex.Pattern; -import java.util.regex.PatternSyntaxException; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; @@ -306,7 +305,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase { long maxTime = 1000 * 2; Random random = new Random(random().nextLong()); for (int i = 0; i < numPatterns && start + maxTime > System.currentTimeMillis(); i++) { - final Pattern p = randomPattern(); + final Pattern p = _TestUtil.randomPattern(random()); final String replacement = _TestUtil.randomSimpleString(random); Analyzer a = new 
Analyzer() { @Override @@ -325,14 +324,4 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase { * time for certain patterns. */ 40, true); // only ascii } } - - public Pattern randomPattern() { - while (true) { - try { - return Pattern.compile(_TestUtil.randomRegexpishString(random())); - } catch (PatternSyntaxException ignored) { - // if at first you don't succeed... - } - } - } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java index 849479851a9..262c1f9a5d1 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java @@ -22,7 +22,6 @@ import java.io.Reader; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.util.AttributeSource.AttributeFactory; import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.apache.lucene.util.automaton.RegExp; diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java index a3ea5692e10..f724b278ae8 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java @@ -30,6 +30,8 @@ import java.nio.CharBuffer; import java.util.*; import java.util.concurrent.ExecutorService; import java.util.concurrent.TimeUnit; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; @@ -900,4 +902,24 @@ public class _TestUtil { } } } + + /** + * Returns a valid (compiling) Pattern instance with random stuff inside. 
Be careful + * when applying random patterns to longer strings as certain types of patterns + * may explode into exponential times in backtracking implementations (such as Java's). + */ + public static Pattern randomPattern(Random random) { + final String nonBmpString = "AB\uD840\uDC00C"; + while (true) { + try { + Pattern p = Pattern.compile(_TestUtil.randomRegexpishString(random)); + // Make sure the result of applying the pattern to a string with extended + // unicode characters is a valid utf16 string. See LUCENE-4078 for discussion. + if (UnicodeUtil.validUTF16String(p.matcher(nonBmpString).replaceAll("_"))) + return p; + } catch (PatternSyntaxException ignored) { + // Loop trying until we hit something that compiles. + } + } + } }