mirror of https://github.com/apache/lucene.git
LUCENE-4078: PatternReplaceCharFilter assertion error caused by malformed
UTF-16. This is most likely a bug in the JDK: Pattern("").replaceAll("x") inserts the replacement in between the two characters of a surrogate pair and corrupts the string. A temporary fix is to check for this in the random pattern generator and pick again if detected. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1343214 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
6a4a717220
commit
e525552d73
|
@ -23,7 +23,6 @@ import java.io.StringReader;
|
|||
import java.util.Random;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.PatternSyntaxException;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
|
@ -306,7 +305,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
|
|||
long maxTime = 1000 * 2;
|
||||
Random random = new Random(random().nextLong());
|
||||
for (int i = 0; i < numPatterns && start + maxTime > System.currentTimeMillis(); i++) {
|
||||
final Pattern p = randomPattern();
|
||||
final Pattern p = _TestUtil.randomPattern(random());
|
||||
final String replacement = _TestUtil.randomSimpleString(random);
|
||||
Analyzer a = new Analyzer() {
|
||||
@Override
|
||||
|
@ -325,14 +324,4 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
|
|||
* time for certain patterns. */ 40, true); // only ascii
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Returns a compiled Pattern built from a random regexp-like string.
 * Candidate strings come from {@code _TestUtil.randomRegexpishString(random())};
 * a candidate that fails to compile is discarded and a new one is drawn, so this
 * method only returns once a candidate compiles.
 *
 * @return the first randomly generated pattern that compiles
 */
public Pattern randomPattern() {
|
||||
while (true) {
|
||||
try {
|
||||
return Pattern.compile(_TestUtil.randomRegexpishString(random()));
|
||||
} catch (PatternSyntaxException ignored) {
|
||||
// The random string was not a valid regex; deliberately ignore and try another one.
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,7 +22,6 @@ import java.io.Reader;
|
|||
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.util.AttributeSource.AttributeFactory;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
import org.apache.lucene.util.automaton.RegExp;
|
||||
|
||||
|
|
|
@ -30,6 +30,8 @@ import java.nio.CharBuffer;
|
|||
import java.util.*;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.PatternSyntaxException;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipFile;
|
||||
|
||||
|
@ -900,4 +902,24 @@ public class _TestUtil {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a valid (compiling) Pattern instance with random stuff inside. Be careful
|
||||
* when applying random patterns to longer strings as certain types of patterns
|
||||
* may explode into exponential times in backtracking implementations (such as Java's).
|
||||
* <p>
* Candidates are drawn repeatedly until one both compiles and passes a probe check:
* every match in a fixed string containing a surrogate pair is replaced with "_", and
* the result must be accepted by {@code UnicodeUtil.validUTF16String}. Patterns that
* fail the probe are discarded. See LUCENE-4078 for discussion.
*
* @param random source of randomness handed to {@code _TestUtil.randomRegexpishString}
* @return a compiled pattern that passed the probe check
*/
|
||||
public static Pattern randomPattern(Random random) {
|
||||
// Probe input: "AB", then one surrogate pair (high D840, low DC00), then "C".
final String nonBmpString = "AB\uD840\uDC00C";
|
||||
while (true) {
|
||||
try {
|
||||
Pattern p = Pattern.compile(_TestUtil.randomRegexpishString(random));
|
||||
// Make sure the result of applying the pattern to a string with extended
|
||||
// unicode characters is a valid UTF-16 string. See LUCENE-4078 for discussion.
// If the check fails, fall through and let the loop draw a new candidate.
|
||||
if (UnicodeUtil.validUTF16String(p.matcher(nonBmpString).replaceAll("_")))
|
||||
return p;
|
||||
} catch (PatternSyntaxException ignored) {
|
||||
// The candidate did not compile; keep looping until we hit something that does.
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue