mirror of https://github.com/apache/lucene.git
LUCENE-3820: Wrong trailing index calculation in PatternReplaceCharFilter.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1294141 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
092e348d81
commit
7be5533989
|
@ -246,15 +246,22 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
assertAnalyzesToReuse(a, input, new String[]{expected});
|
||||
}
|
||||
|
||||
// simple utility method for blasting tokenstreams with data to make sure they don't do anything crazy
|
||||
// TODO: add a MockCharStream, and use it here too, to ensure that correctOffset etc is being done by tokenizers.
|
||||
/** utility method for blasting tokenstreams with data to make sure they don't do anything crazy */
|
||||
public static void checkRandomData(Random random, Analyzer a, int iterations) throws IOException {
|
||||
checkRandomData(random, a, iterations, 20);
|
||||
checkRandomData(random, a, iterations, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* utility method for blasting tokenstreams with data to make sure they don't do anything crazy
|
||||
* @param simple true if only ascii strings will be used (try to avoid)
|
||||
*/
|
||||
public static void checkRandomData(Random random, Analyzer a, int iterations, boolean simple) throws IOException {
|
||||
checkRandomData(random, a, iterations, 20, simple);
|
||||
// now test with multiple threads
|
||||
int numThreads = _TestUtil.nextInt(random, 4, 8);
|
||||
Thread threads[] = new Thread[numThreads];
|
||||
for (int i = 0; i < threads.length; i++) {
|
||||
threads[i] = new AnalysisThread(new Random(random.nextLong()), a, iterations);
|
||||
threads[i] = new AnalysisThread(new Random(random.nextLong()), a, iterations, simple);
|
||||
}
|
||||
for (int i = 0; i < threads.length; i++) {
|
||||
threads[i].start();
|
||||
|
@ -272,11 +279,13 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
final int iterations;
|
||||
final Random random;
|
||||
final Analyzer a;
|
||||
final boolean simple;
|
||||
|
||||
AnalysisThread(Random random, Analyzer a, int iterations) {
|
||||
AnalysisThread(Random random, Analyzer a, int iterations, boolean simple) {
|
||||
this.random = random;
|
||||
this.a = a;
|
||||
this.iterations = iterations;
|
||||
this.simple = simple;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -284,32 +293,36 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
try {
|
||||
// see the part in checkRandomData where it replays the same text again
|
||||
// to verify reproducability/reuse: hopefully this would catch thread hazards.
|
||||
checkRandomData(random, a, iterations, 20);
|
||||
checkRandomData(random, a, iterations, 20, simple);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
|
||||
checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean());
|
||||
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple) throws IOException {
|
||||
checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean(), simple);
|
||||
}
|
||||
|
||||
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter) throws IOException {
|
||||
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple) throws IOException {
|
||||
for (int i = 0; i < iterations; i++) {
|
||||
String text;
|
||||
switch(_TestUtil.nextInt(random, 0, 4)) {
|
||||
case 0:
|
||||
text = _TestUtil.randomSimpleString(random);
|
||||
break;
|
||||
case 1:
|
||||
text = _TestUtil.randomRealisticUnicodeString(random, maxWordLength);
|
||||
break;
|
||||
case 2:
|
||||
text = _TestUtil.randomHtmlishString(random, maxWordLength);
|
||||
break;
|
||||
default:
|
||||
text = _TestUtil.randomUnicodeString(random, maxWordLength);
|
||||
if (simple) {
|
||||
text = random.nextBoolean() ? _TestUtil.randomSimpleString(random) : _TestUtil.randomHtmlishString(random, maxWordLength);
|
||||
} else {
|
||||
switch(_TestUtil.nextInt(random, 0, 4)) {
|
||||
case 0:
|
||||
text = _TestUtil.randomSimpleString(random);
|
||||
break;
|
||||
case 1:
|
||||
text = _TestUtil.randomRealisticUnicodeString(random, maxWordLength);
|
||||
break;
|
||||
case 2:
|
||||
text = _TestUtil.randomHtmlishString(random, maxWordLength);
|
||||
break;
|
||||
default:
|
||||
text = _TestUtil.randomUnicodeString(random, maxWordLength);
|
||||
}
|
||||
}
|
||||
|
||||
if (VERBOSE) {
|
||||
|
|
|
@ -250,6 +250,36 @@ public class _TestUtil {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a String thats "regexpish" (contains lots of operators typically found in regular expressions)
|
||||
* If you call this enough times, you might get a valid regex!
|
||||
*/
|
||||
public static String randomRegexpishString(Random r) {
|
||||
final int end = r.nextInt(20);
|
||||
if (end == 0) {
|
||||
// allow 0 length
|
||||
return "";
|
||||
}
|
||||
final char[] buffer = new char[end];
|
||||
for (int i = 0; i < end; i++) {
|
||||
int t = r.nextInt(11);
|
||||
if (t == 0) {
|
||||
buffer[i] = (char) _TestUtil.nextInt(r, 97, 102);
|
||||
}
|
||||
else if (1 == t) buffer[i] = '.';
|
||||
else if (2 == t) buffer[i] = '?';
|
||||
else if (3 == t) buffer[i] = '*';
|
||||
else if (4 == t) buffer[i] = '+';
|
||||
else if (5 == t) buffer[i] = '(';
|
||||
else if (6 == t) buffer[i] = ')';
|
||||
else if (7 == t) buffer[i] = '-';
|
||||
else if (8 == t) buffer[i] = '[';
|
||||
else if (9 == t) buffer[i] = ']';
|
||||
else if (10 == t) buffer[i] = '|';
|
||||
}
|
||||
return new String(buffer, 0, end);
|
||||
}
|
||||
|
||||
private static final String[] HTML_CHAR_ENTITIES = {
|
||||
"AElig", "Aacute", "Acirc", "Agrave", "Alpha", "AMP", "Aring", "Atilde",
|
||||
"Auml", "Beta", "COPY", "Ccedil", "Chi", "Dagger", "Delta", "ETH",
|
||||
|
|
|
@ -7,6 +7,9 @@ http://s.apache.org/luceneversions
|
|||
|
||||
API Changes
|
||||
|
||||
* LUCENE-3820: Deprecated constructors accepting pattern matching bounds. The input
|
||||
is buffered and matched in one pass. (Dawid Weiss)
|
||||
|
||||
* LUCENE-2413: Deprecated PatternAnalyzer in common/miscellaneous, in favor
|
||||
of the pattern package (CharFilter, Tokenizer, TokenFilter). (Robert Muir)
|
||||
|
||||
|
@ -34,6 +37,11 @@ API Changes
|
|||
and sometimes different depending on the type of set, and ultimately a CharArraySet
|
||||
or CharArrayMap was always used anyway. (Robert Muir)
|
||||
|
||||
Bug fixes
|
||||
|
||||
* LUCENE-3820: PatternReplaceCharFilter could return invalid token positions.
|
||||
(Dawid Weiss)
|
||||
|
||||
New Features
|
||||
|
||||
* LUCENE-2341: A new analyzer/ filter: Morfologik - a dictionary-driven lemmatizer
|
||||
|
|
|
@ -18,12 +18,13 @@
|
|||
package org.apache.lucene.analysis.pattern;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.LinkedList;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.lucene.analysis.charfilter.BaseCharFilter;
|
||||
import org.apache.lucene.analysis.CharStream;
|
||||
import org.apache.lucene.analysis.charfilter.BaseCharFilter;
|
||||
|
||||
/**
|
||||
* CharFilter that uses a regular expression for the target of replace string.
|
||||
|
@ -48,147 +49,88 @@ import org.apache.lucene.analysis.CharStream;
|
|||
* @since Solr 1.5
|
||||
*/
|
||||
public class PatternReplaceCharFilter extends BaseCharFilter {
|
||||
@Deprecated
|
||||
public static final int DEFAULT_MAX_BLOCK_CHARS = 10000;
|
||||
|
||||
private final Pattern pattern;
|
||||
private final String replacement;
|
||||
private final int maxBlockChars;
|
||||
private final String blockDelimiters;
|
||||
public static final int DEFAULT_MAX_BLOCK_CHARS = 10000;
|
||||
private Reader transformedInput;
|
||||
|
||||
private LinkedList<Character> buffer;
|
||||
private int nextCharCounter;
|
||||
private char[] blockBuffer;
|
||||
private int blockBufferLength;
|
||||
private String replaceBlockBuffer;
|
||||
private int replaceBlockBufferOffset;
|
||||
|
||||
public PatternReplaceCharFilter( Pattern pattern, String replacement, CharStream in ){
|
||||
this( pattern, replacement, DEFAULT_MAX_BLOCK_CHARS, null, in );
|
||||
}
|
||||
|
||||
public PatternReplaceCharFilter( Pattern pattern, String replacement,
|
||||
int maxBlockChars, CharStream in ){
|
||||
this( pattern, replacement, maxBlockChars, null, in );
|
||||
}
|
||||
|
||||
public PatternReplaceCharFilter( Pattern pattern, String replacement,
|
||||
String blockDelimiters, CharStream in ){
|
||||
this( pattern, replacement, DEFAULT_MAX_BLOCK_CHARS, blockDelimiters, in );
|
||||
}
|
||||
|
||||
public PatternReplaceCharFilter( Pattern pattern, String replacement,
|
||||
int maxBlockChars, String blockDelimiters, CharStream in ){
|
||||
super( in );
|
||||
public PatternReplaceCharFilter(Pattern pattern, String replacement, CharStream in) {
|
||||
super(in);
|
||||
this.pattern = pattern;
|
||||
this.replacement = replacement;
|
||||
if( maxBlockChars < 1 )
|
||||
throw new IllegalArgumentException( "maxBlockChars should be greater than 0, but it is " + maxBlockChars );
|
||||
this.maxBlockChars = maxBlockChars;
|
||||
this.blockDelimiters = blockDelimiters;
|
||||
blockBuffer = new char[maxBlockChars];
|
||||
}
|
||||
|
||||
private boolean prepareReplaceBlock() throws IOException {
|
||||
while( true ){
|
||||
if( replaceBlockBuffer != null && replaceBlockBuffer.length() > replaceBlockBufferOffset )
|
||||
return true;
|
||||
// prepare block buffer
|
||||
blockBufferLength = 0;
|
||||
while( true ){
|
||||
int c = nextChar();
|
||||
if( c == -1 ) break;
|
||||
blockBuffer[blockBufferLength++] = (char)c;
|
||||
// end of block?
|
||||
boolean foundDelimiter =
|
||||
( blockDelimiters != null ) &&
|
||||
( blockDelimiters.length() > 0 ) &&
|
||||
blockDelimiters.indexOf( c ) >= 0;
|
||||
if( foundDelimiter ||
|
||||
blockBufferLength >= maxBlockChars ) break;
|
||||
}
|
||||
// block buffer available?
|
||||
if( blockBufferLength == 0 ) return false;
|
||||
replaceBlockBuffer = getReplaceBlock( blockBuffer, 0, blockBufferLength );
|
||||
replaceBlockBufferOffset = 0;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read() throws IOException {
|
||||
while( prepareReplaceBlock() ){
|
||||
return replaceBlockBuffer.charAt( replaceBlockBufferOffset++ );
|
||||
}
|
||||
return -1;
|
||||
@Deprecated
|
||||
public PatternReplaceCharFilter(Pattern pattern, String replacement,
|
||||
int maxBlockChars, String blockDelimiter, CharStream in) {
|
||||
this(pattern, replacement, in);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||
char[] tmp = new char[len];
|
||||
int l = input.read(tmp, 0, len);
|
||||
if (l != -1) {
|
||||
for(int i = 0; i < l; i++)
|
||||
pushLastChar(tmp[i]);
|
||||
// Buffer all input on the first call.
|
||||
if (transformedInput == null) {
|
||||
StringBuilder buffered = new StringBuilder();
|
||||
char [] temp = new char [1024];
|
||||
for (int cnt = input.read(temp); cnt > 0; cnt = input.read(temp)) {
|
||||
buffered.append(temp, 0, cnt);
|
||||
}
|
||||
transformedInput = new StringReader(processPattern(buffered).toString());
|
||||
}
|
||||
l = 0;
|
||||
for(int i = off; i < off + len; i++) {
|
||||
int c = read();
|
||||
if (c == -1) break;
|
||||
cbuf[i] = (char) c;
|
||||
l++;
|
||||
}
|
||||
return l == 0 ? -1 : l;
|
||||
|
||||
return transformedInput.read(cbuf, off, len);
|
||||
}
|
||||
|
||||
private int nextChar() throws IOException {
|
||||
if (buffer != null && !buffer.isEmpty()) {
|
||||
nextCharCounter++;
|
||||
return buffer.removeFirst().charValue();
|
||||
}
|
||||
int c = input.read();
|
||||
if( c != -1 )
|
||||
nextCharCounter++;
|
||||
return c;
|
||||
@Override
|
||||
protected int correct(int currentOff) {
|
||||
return Math.max(0, super.correct(currentOff));
|
||||
}
|
||||
|
||||
private void pushLastChar(int c) {
|
||||
if (buffer == null) {
|
||||
buffer = new LinkedList<Character>();
|
||||
}
|
||||
buffer.addLast(new Character((char) c));
|
||||
}
|
||||
|
||||
String getReplaceBlock( String block ){
|
||||
char[] blockChars = block.toCharArray();
|
||||
return getReplaceBlock( blockChars, 0, blockChars.length );
|
||||
}
|
||||
|
||||
String getReplaceBlock( char block[], int offset, int length ){
|
||||
StringBuffer replaceBlock = new StringBuffer();
|
||||
String sourceBlock = new String( block, offset, length );
|
||||
Matcher m = pattern.matcher( sourceBlock );
|
||||
int lastMatchOffset = 0, lastDiff = 0;
|
||||
while( m.find() ){
|
||||
m.appendReplacement( replaceBlock, replacement );
|
||||
// record cumulative diff for the offset correction
|
||||
int diff = replaceBlock.length() - lastMatchOffset - lastDiff - ( m.end( 0 ) - lastMatchOffset );
|
||||
if (diff != 0) {
|
||||
int prevCumulativeDiff = getLastCumulativeDiff();
|
||||
if (diff > 0) {
|
||||
for(int i = 0; i < diff; i++){
|
||||
addOffCorrectMap(nextCharCounter - length + m.end( 0 ) + i - prevCumulativeDiff,
|
||||
prevCumulativeDiff - 1 - i);
|
||||
}
|
||||
/**
|
||||
* Replace pattern in input and mark correction offsets.
|
||||
*/
|
||||
CharSequence processPattern(CharSequence input) {
|
||||
final Matcher m = pattern.matcher(input);
|
||||
|
||||
final StringBuffer cumulativeOutput = new StringBuffer();
|
||||
int cumulative = 0;
|
||||
int lastMatchEnd = 0;
|
||||
while (m.find()) {
|
||||
final int groupSize = m.end() - m.start();
|
||||
final int skippedSize = m.start() - lastMatchEnd;
|
||||
lastMatchEnd = m.end();
|
||||
|
||||
final int lengthBeforeReplacement = cumulativeOutput.length() + skippedSize;
|
||||
m.appendReplacement(cumulativeOutput, replacement);
|
||||
// Matcher doesn't tell us how many characters have been appended before the replacement.
|
||||
// So we need to calculate it. Skipped characters have been added as part of appendReplacement.
|
||||
final int replacementSize = cumulativeOutput.length() - lengthBeforeReplacement;
|
||||
|
||||
if (groupSize != replacementSize) {
|
||||
if (replacementSize < groupSize) {
|
||||
// The replacement is smaller.
|
||||
// Add the 'backskip' to the next index after the replacement (this is possibly
|
||||
// after the end of string, but it's fine -- it just means the last character
|
||||
// of the replaced block doesn't reach the end of the original string.
|
||||
cumulative += groupSize - replacementSize;
|
||||
int atIndex = lengthBeforeReplacement + replacementSize;
|
||||
// System.err.println(atIndex + "!" + cumulative);
|
||||
addOffCorrectMap(atIndex, cumulative);
|
||||
} else {
|
||||
addOffCorrectMap(nextCharCounter - length + m.end( 0 ) + diff - prevCumulativeDiff,
|
||||
prevCumulativeDiff - diff);
|
||||
// The replacement is larger. Every new index needs to point to the last
|
||||
// element of the original group (if any).
|
||||
for (int i = groupSize; i < replacementSize; i++) {
|
||||
addOffCorrectMap(lengthBeforeReplacement + i, --cumulative);
|
||||
// System.err.println((lengthBeforeReplacement + i) + " " + cumulative);
|
||||
}
|
||||
}
|
||||
}
|
||||
// save last offsets
|
||||
lastMatchOffset = m.end( 0 );
|
||||
lastDiff = diff;
|
||||
}
|
||||
// copy remaining of the part of source block
|
||||
m.appendTail( replaceBlock );
|
||||
return replaceBlock.toString();
|
||||
|
||||
// Append the remaining output, no further changes to indices.
|
||||
m.appendTail(cumulativeOutput);
|
||||
return cumulativeOutput;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.io.IOException;
|
|||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.PatternSyntaxException;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
|
@ -29,12 +30,107 @@ import org.apache.lucene.analysis.CharStream;
|
|||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
||||
/**
|
||||
* Tests {@link PatternReplaceCharFilter}
|
||||
*/
|
||||
public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testFailingDot() throws IOException {
|
||||
checkOutput(
|
||||
"A. .B.", "\\.[\\s]*", ".",
|
||||
"A..B.",
|
||||
"A..B.");
|
||||
}
|
||||
|
||||
public void testLongerReplacement() throws IOException {
|
||||
checkOutput(
|
||||
"XXabcZZabcYY", "abc", "abcde",
|
||||
"XXabcdeZZabcdeYY",
|
||||
"XXabcccZZabcccYY");
|
||||
checkOutput(
|
||||
"XXabcabcYY", "abc", "abcde",
|
||||
"XXabcdeabcdeYY",
|
||||
"XXabcccabcccYY");
|
||||
checkOutput(
|
||||
"abcabcYY", "abc", "abcde",
|
||||
"abcdeabcdeYY",
|
||||
"abcccabcccYY");
|
||||
checkOutput(
|
||||
"YY", "^", "abcde",
|
||||
"abcdeYY",
|
||||
// Should be: "-----YY" but we're enforcing non-negative offsets.
|
||||
"YYYYYYY");
|
||||
checkOutput(
|
||||
"YY", "$", "abcde",
|
||||
"YYabcde",
|
||||
"YYYYYYY");
|
||||
checkOutput(
|
||||
"XYZ", ".", "abc",
|
||||
"abcabcabc",
|
||||
"XXXYYYZZZ");
|
||||
checkOutput(
|
||||
"XYZ", ".", "$0abc",
|
||||
"XabcYabcZabc",
|
||||
"XXXXYYYYZZZZ");
|
||||
}
|
||||
|
||||
public void testShorterReplacement() throws IOException {
|
||||
checkOutput(
|
||||
"XXabcZZabcYY", "abc", "xy",
|
||||
"XXxyZZxyYY",
|
||||
"XXabZZabYY");
|
||||
checkOutput(
|
||||
"XXabcabcYY", "abc", "xy",
|
||||
"XXxyxyYY",
|
||||
"XXababYY");
|
||||
checkOutput(
|
||||
"abcabcYY", "abc", "xy",
|
||||
"xyxyYY",
|
||||
"ababYY");
|
||||
checkOutput(
|
||||
"abcabcYY", "abc", "",
|
||||
"YY",
|
||||
"YY");
|
||||
checkOutput(
|
||||
"YYabcabc", "abc", "",
|
||||
"YY",
|
||||
"YY");
|
||||
}
|
||||
|
||||
private void checkOutput(String input, String pattern, String replacement,
|
||||
String expectedOutput, String expectedIndexMatchedOutput) throws IOException {
|
||||
CharStream cs = new PatternReplaceCharFilter(pattern(pattern), replacement,
|
||||
CharReader.get(new StringReader(input)));
|
||||
|
||||
StringBuilder output = new StringBuilder();
|
||||
for (int chr = cs.read(); chr > 0; chr = cs.read()) {
|
||||
output.append((char) chr);
|
||||
}
|
||||
|
||||
StringBuilder indexMatched = new StringBuilder();
|
||||
for (int i = 0; i < output.length(); i++) {
|
||||
indexMatched.append((cs.correctOffset(i) < 0 ? "-" : input.charAt(cs.correctOffset(i))));
|
||||
}
|
||||
|
||||
boolean outputGood = expectedOutput.equals(output.toString());
|
||||
boolean indexMatchedGood = expectedIndexMatchedOutput.equals(indexMatched.toString());
|
||||
|
||||
if (!outputGood || !indexMatchedGood || false) {
|
||||
System.out.println("Pattern : " + pattern);
|
||||
System.out.println("Replac. : " + replacement);
|
||||
System.out.println("Input : " + input);
|
||||
System.out.println("Output : " + output);
|
||||
System.out.println("Expected: " + expectedOutput);
|
||||
System.out.println("Output/i: " + indexMatched);
|
||||
System.out.println("Expected: " + expectedIndexMatchedOutput);
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
assertTrue("Output doesn't match.", outputGood);
|
||||
assertTrue("Index-matched output doesn't match.", indexMatchedGood);
|
||||
}
|
||||
|
||||
// 1111
|
||||
// 01234567890123
|
||||
// this is test.
|
||||
|
@ -142,9 +238,13 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
|
|||
// 012345678901234567890123456789012345678
|
||||
// aa bb cc --- aa bb aa. bb aa bb cc
|
||||
// aa##bb cc --- aa##bb aa. bb aa##bb cc
|
||||
|
||||
// aa bb cc --- aa bbbaa. bb aa b cc
|
||||
|
||||
public void test2blocksMultiMatches() throws IOException {
|
||||
final String BLOCK = " aa bb cc --- aa bb aa. bb aa bb cc";
|
||||
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)"), "$1##$2", ".",
|
||||
|
||||
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)"), "$1##$2",
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
|
||||
assertTokenStreamContents(ts,
|
||||
|
@ -160,10 +260,10 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
|
|||
// aa b - c . --- b aa . c c b
|
||||
public void testChain() throws IOException {
|
||||
final String BLOCK = " a bb - ccc . --- bb a . ccc ccc bb";
|
||||
CharStream cs = new PatternReplaceCharFilter( pattern("a"), "aa", ".",
|
||||
CharStream cs = new PatternReplaceCharFilter( pattern("a"), "aa",
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
cs = new PatternReplaceCharFilter( pattern("bb"), "b", ".", cs );
|
||||
cs = new PatternReplaceCharFilter( pattern("ccc"), "c", ".", cs );
|
||||
cs = new PatternReplaceCharFilter( pattern("bb"), "b", cs );
|
||||
cs = new PatternReplaceCharFilter( pattern("ccc"), "c", cs );
|
||||
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" },
|
||||
|
@ -178,18 +278,33 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
|
|||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
Analyzer a = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||
}
|
||||
int numPatterns = atLeast(100);
|
||||
for (int i = 0; i < numPatterns; i++) {
|
||||
final Pattern p = randomPattern();
|
||||
final String replacement = _TestUtil.randomSimpleString(random);
|
||||
Analyzer a = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReader(Reader reader) {
|
||||
return new PatternReplaceCharFilter(Pattern.compile("a"), "b", CharReader.get(reader));
|
||||
}
|
||||
};
|
||||
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
|
||||
@Override
|
||||
protected Reader initReader(Reader reader) {
|
||||
return new PatternReplaceCharFilter(p, replacement, CharReader.get(reader));
|
||||
}
|
||||
};
|
||||
checkRandomData(random, a, 1000*RANDOM_MULTIPLIER, true); // only ascii
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static Pattern randomPattern() {
|
||||
while (true) {
|
||||
try {
|
||||
return Pattern.compile(_TestUtil.randomRegexpishString(random));
|
||||
} catch (PatternSyntaxException ignored) {
|
||||
// if at first you don't succeed...
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue