mirror of https://github.com/apache/lucene.git
LUCENE-4078: PatternReplaceCharFilter assertion error caused by malformed
UTF-16. This is most likely a bug in the JDK: with an empty pattern, Pattern.compile("").matcher(s).replaceAll("x") inserts the replacement between the two code units of a surrogate pair and corrupts the string. As a temporary fix, the random pattern generator checks for this and picks a different pattern if it is detected. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1343214 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
6a4a717220
commit
e525552d73
|
@ -23,7 +23,6 @@ import java.io.StringReader;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
import java.util.regex.PatternSyntaxException;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
@ -306,7 +305,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
|
||||||
long maxTime = 1000 * 2;
|
long maxTime = 1000 * 2;
|
||||||
Random random = new Random(random().nextLong());
|
Random random = new Random(random().nextLong());
|
||||||
for (int i = 0; i < numPatterns && start + maxTime > System.currentTimeMillis(); i++) {
|
for (int i = 0; i < numPatterns && start + maxTime > System.currentTimeMillis(); i++) {
|
||||||
final Pattern p = randomPattern();
|
final Pattern p = _TestUtil.randomPattern(random());
|
||||||
final String replacement = _TestUtil.randomSimpleString(random);
|
final String replacement = _TestUtil.randomSimpleString(random);
|
||||||
Analyzer a = new Analyzer() {
|
Analyzer a = new Analyzer() {
|
||||||
@Override
|
@Override
|
||||||
|
@ -325,14 +324,4 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
|
||||||
* time for certain patterns. */ 40, true); // only ascii
|
* time for certain patterns. */ 40, true); // only ascii
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public Pattern randomPattern() {
|
|
||||||
while (true) {
|
|
||||||
try {
|
|
||||||
return Pattern.compile(_TestUtil.randomRegexpishString(random()));
|
|
||||||
} catch (PatternSyntaxException ignored) {
|
|
||||||
// if at first you don't succeed...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,7 +22,6 @@ import java.io.Reader;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
import org.apache.lucene.util.AttributeSource.AttributeFactory;
|
|
||||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||||
import org.apache.lucene.util.automaton.RegExp;
|
import org.apache.lucene.util.automaton.RegExp;
|
||||||
|
|
||||||
|
|
|
@ -30,6 +30,8 @@ import java.nio.CharBuffer;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.concurrent.ExecutorService;
|
import java.util.concurrent.ExecutorService;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
import java.util.regex.PatternSyntaxException;
|
||||||
import java.util.zip.ZipEntry;
|
import java.util.zip.ZipEntry;
|
||||||
import java.util.zip.ZipFile;
|
import java.util.zip.ZipFile;
|
||||||
|
|
||||||
|
@ -900,4 +902,24 @@ public class _TestUtil {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a valid (compiling) Pattern instance with random stuff inside. Be careful
|
||||||
|
* when applying random patterns to longer strings as certain types of patterns
|
||||||
|
* may explode into exponential times in backtracking implementations (such as Java's).
|
||||||
|
*/
|
||||||
|
public static Pattern randomPattern(Random random) {
|
||||||
|
final String nonBmpString = "AB\uD840\uDC00C";
|
||||||
|
while (true) {
|
||||||
|
try {
|
||||||
|
Pattern p = Pattern.compile(_TestUtil.randomRegexpishString(random));
|
||||||
|
// Make sure the result of applying the pattern to a string with extended
|
||||||
|
// unicode characters is a valid utf16 string. See LUCENE-4078 for discussion.
|
||||||
|
if (UnicodeUtil.validUTF16String(p.matcher(nonBmpString).replaceAll("_")))
|
||||||
|
return p;
|
||||||
|
} catch (PatternSyntaxException ignored) {
|
||||||
|
// Loop trying until we hit something that compiles.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue