LUCENE-4078: PatternReplaceCharFilter assertion error caused by malformed

UTF-16. This is most likely a bug in the JDK, because the replacement from
Pattern("").replaceAll("x") is inserted in between the two characters of a
surrogate pair, which corrupts the string.

A temporary fix is to check for this in the random pattern generator and pick
again if it is detected.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1343214 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Dawid Weiss 2012-05-28 12:00:24 +00:00
parent 6a4a717220
commit e525552d73
3 changed files with 23 additions and 13 deletions

View File

@ -23,7 +23,6 @@ import java.io.StringReader;
import java.util.Random; import java.util.Random;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@ -306,7 +305,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
long maxTime = 1000 * 2; long maxTime = 1000 * 2;
Random random = new Random(random().nextLong()); Random random = new Random(random().nextLong());
for (int i = 0; i < numPatterns && start + maxTime > System.currentTimeMillis(); i++) { for (int i = 0; i < numPatterns && start + maxTime > System.currentTimeMillis(); i++) {
final Pattern p = randomPattern(); final Pattern p = _TestUtil.randomPattern(random());
final String replacement = _TestUtil.randomSimpleString(random); final String replacement = _TestUtil.randomSimpleString(random);
Analyzer a = new Analyzer() { Analyzer a = new Analyzer() {
@Override @Override
@ -325,14 +324,4 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
* time for certain patterns. */ 40, true); // only ascii * time for certain patterns. */ 40, true); // only ascii
} }
} }
/**
 * Returns a random {@link Pattern} that compiles. Random regexp-like strings
 * are drawn from {@code _TestUtil.randomRegexpishString} until one of them is
 * accepted by {@link Pattern#compile(String)}.
 */
public Pattern randomPattern() {
  for (;;) {
    try {
      final String candidate = _TestUtil.randomRegexpishString(random());
      return Pattern.compile(candidate);
    } catch (PatternSyntaxException ignored) {
      // Not a valid regular expression; draw another candidate.
    }
  }
}
} }

View File

@ -22,7 +22,6 @@ import java.io.Reader;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp; import org.apache.lucene.util.automaton.RegExp;

View File

@ -30,6 +30,8 @@ import java.nio.CharBuffer;
import java.util.*; import java.util.*;
import java.util.concurrent.ExecutorService; import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import java.util.zip.ZipEntry; import java.util.zip.ZipEntry;
import java.util.zip.ZipFile; import java.util.zip.ZipFile;
@ -900,4 +902,24 @@ public class _TestUtil {
} }
} }
} }
/**
 * Builds a random {@link Pattern} that is guaranteed to compile.
 *
 * <p>Candidates whose {@code replaceAll("_")} result on a probe string
 * containing a surrogate pair is not a valid UTF-16 string are rejected and a
 * new candidate is drawn (see LUCENE-4078).
 *
 * <p>Take care when running the returned pattern against longer inputs: some
 * patterns can take exponential time in backtracking regular expression
 * engines such as Java's.
 *
 * @param random source of randomness used to generate candidate expressions
 * @return a pattern that compiled and passed the UTF-16 validity probe
 */
public static Pattern randomPattern(Random random) {
  // Probe text with one surrogate pair (\uD840\uDC00) between plain ASCII letters.
  final String probe = "AB\uD840\uDC00C";
  for (;;) {
    try {
      final Pattern candidate = Pattern.compile(_TestUtil.randomRegexpishString(random));
      final String replaced = candidate.matcher(probe).replaceAll("_");
      // Accept only patterns whose replacement output is still valid UTF-16;
      // see LUCENE-4078 for the discussion.
      final boolean wellFormed = UnicodeUtil.validUTF16String(replaced);
      if (wellFormed) {
        return candidate;
      }
    } catch (PatternSyntaxException ignored) {
      // The candidate did not compile; keep looping until one does.
    }
  }
}
} }