LUCENE-3820: Wrong trailing index calculation in PatternReplaceCharFilter.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1294141 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Dawid Weiss 2012-02-27 13:13:10 +00:00
parent 092e348d81
commit 7be5533989
5 changed files with 270 additions and 162 deletions

View File

@ -246,15 +246,22 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
assertAnalyzesToReuse(a, input, new String[]{expected});
}
// simple utility method for blasting tokenstreams with data to make sure they don't do anything crazy
// TODO: add a MockCharStream, and use it here too, to ensure that correctOffset etc is being done by tokenizers.
/** utility method for blasting tokenstreams with data to make sure they don't do anything crazy */
public static void checkRandomData(Random random, Analyzer a, int iterations) throws IOException {
checkRandomData(random, a, iterations, 20);
checkRandomData(random, a, iterations, false);
}
/**
* utility method for blasting tokenstreams with data to make sure they don't do anything crazy
* @param simple true if only ascii strings will be used (try to avoid)
*/
public static void checkRandomData(Random random, Analyzer a, int iterations, boolean simple) throws IOException {
checkRandomData(random, a, iterations, 20, simple);
// now test with multiple threads
int numThreads = _TestUtil.nextInt(random, 4, 8);
Thread threads[] = new Thread[numThreads];
for (int i = 0; i < threads.length; i++) {
threads[i] = new AnalysisThread(new Random(random.nextLong()), a, iterations);
threads[i] = new AnalysisThread(new Random(random.nextLong()), a, iterations, simple);
}
for (int i = 0; i < threads.length; i++) {
threads[i].start();
@ -272,11 +279,13 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
final int iterations;
final Random random;
final Analyzer a;
final boolean simple;
AnalysisThread(Random random, Analyzer a, int iterations) {
AnalysisThread(Random random, Analyzer a, int iterations, boolean simple) {
this.random = random;
this.a = a;
this.iterations = iterations;
this.simple = simple;
}
@Override
@ -284,32 +293,36 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
try {
// see the part in checkRandomData where it replays the same text again
// to verify reproducability/reuse: hopefully this would catch thread hazards.
checkRandomData(random, a, iterations, 20);
checkRandomData(random, a, iterations, 20, simple);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
};
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean());
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple) throws IOException {
checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean(), simple);
}
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter) throws IOException {
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple) throws IOException {
for (int i = 0; i < iterations; i++) {
String text;
switch(_TestUtil.nextInt(random, 0, 4)) {
case 0:
text = _TestUtil.randomSimpleString(random);
break;
case 1:
text = _TestUtil.randomRealisticUnicodeString(random, maxWordLength);
break;
case 2:
text = _TestUtil.randomHtmlishString(random, maxWordLength);
break;
default:
text = _TestUtil.randomUnicodeString(random, maxWordLength);
if (simple) {
text = random.nextBoolean() ? _TestUtil.randomSimpleString(random) : _TestUtil.randomHtmlishString(random, maxWordLength);
} else {
switch(_TestUtil.nextInt(random, 0, 4)) {
case 0:
text = _TestUtil.randomSimpleString(random);
break;
case 1:
text = _TestUtil.randomRealisticUnicodeString(random, maxWordLength);
break;
case 2:
text = _TestUtil.randomHtmlishString(random, maxWordLength);
break;
default:
text = _TestUtil.randomUnicodeString(random, maxWordLength);
}
}
if (VERBOSE) {

View File

@ -250,6 +250,36 @@ public class _TestUtil {
}
}
/**
* Returns a String thats "regexpish" (contains lots of operators typically found in regular expressions)
* If you call this enough times, you might get a valid regex!
*/
public static String randomRegexpishString(Random r) {
final int end = r.nextInt(20);
if (end == 0) {
// allow 0 length
return "";
}
final char[] buffer = new char[end];
for (int i = 0; i < end; i++) {
int t = r.nextInt(11);
if (t == 0) {
buffer[i] = (char) _TestUtil.nextInt(r, 97, 102);
}
else if (1 == t) buffer[i] = '.';
else if (2 == t) buffer[i] = '?';
else if (3 == t) buffer[i] = '*';
else if (4 == t) buffer[i] = '+';
else if (5 == t) buffer[i] = '(';
else if (6 == t) buffer[i] = ')';
else if (7 == t) buffer[i] = '-';
else if (8 == t) buffer[i] = '[';
else if (9 == t) buffer[i] = ']';
else if (10 == t) buffer[i] = '|';
}
return new String(buffer, 0, end);
}
private static final String[] HTML_CHAR_ENTITIES = {
"AElig", "Aacute", "Acirc", "Agrave", "Alpha", "AMP", "Aring", "Atilde",
"Auml", "Beta", "COPY", "Ccedil", "Chi", "Dagger", "Delta", "ETH",

View File

@ -7,6 +7,9 @@ http://s.apache.org/luceneversions
API Changes
* LUCENE-3820: Deprecated constructors accepting pattern matching bounds. The input
is buffered and matched in one pass. (Dawid Weiss)
* LUCENE-2413: Deprecated PatternAnalyzer in common/miscellaneous, in favor
of the pattern package (CharFilter, Tokenizer, TokenFilter). (Robert Muir)
@ -34,6 +37,11 @@ API Changes
and sometimes different depending on the type of set, and ultimately a CharArraySet
or CharArrayMap was always used anyway. (Robert Muir)
Bug fixes
* LUCENE-3820: PatternReplaceCharFilter could return invalid token positions.
(Dawid Weiss)
New Features
* LUCENE-2341: A new analyzer/ filter: Morfologik - a dictionary-driven lemmatizer

View File

@ -18,12 +18,13 @@
package org.apache.lucene.analysis.pattern;
import java.io.IOException;
import java.util.LinkedList;
import java.io.Reader;
import java.io.StringReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.charfilter.BaseCharFilter;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.charfilter.BaseCharFilter;
/**
* CharFilter that uses a regular expression for the target of replace string.
@ -48,147 +49,88 @@ import org.apache.lucene.analysis.CharStream;
* @since Solr 1.5
*/
public class PatternReplaceCharFilter extends BaseCharFilter {
@Deprecated
public static final int DEFAULT_MAX_BLOCK_CHARS = 10000;
private final Pattern pattern;
private final String replacement;
private final int maxBlockChars;
private final String blockDelimiters;
public static final int DEFAULT_MAX_BLOCK_CHARS = 10000;
private Reader transformedInput;
private LinkedList<Character> buffer;
private int nextCharCounter;
private char[] blockBuffer;
private int blockBufferLength;
private String replaceBlockBuffer;
private int replaceBlockBufferOffset;
public PatternReplaceCharFilter( Pattern pattern, String replacement, CharStream in ){
this( pattern, replacement, DEFAULT_MAX_BLOCK_CHARS, null, in );
}
public PatternReplaceCharFilter( Pattern pattern, String replacement,
int maxBlockChars, CharStream in ){
this( pattern, replacement, maxBlockChars, null, in );
}
public PatternReplaceCharFilter( Pattern pattern, String replacement,
String blockDelimiters, CharStream in ){
this( pattern, replacement, DEFAULT_MAX_BLOCK_CHARS, blockDelimiters, in );
}
public PatternReplaceCharFilter( Pattern pattern, String replacement,
int maxBlockChars, String blockDelimiters, CharStream in ){
super( in );
public PatternReplaceCharFilter(Pattern pattern, String replacement, CharStream in) {
super(in);
this.pattern = pattern;
this.replacement = replacement;
if( maxBlockChars < 1 )
throw new IllegalArgumentException( "maxBlockChars should be greater than 0, but it is " + maxBlockChars );
this.maxBlockChars = maxBlockChars;
this.blockDelimiters = blockDelimiters;
blockBuffer = new char[maxBlockChars];
}
private boolean prepareReplaceBlock() throws IOException {
while( true ){
if( replaceBlockBuffer != null && replaceBlockBuffer.length() > replaceBlockBufferOffset )
return true;
// prepare block buffer
blockBufferLength = 0;
while( true ){
int c = nextChar();
if( c == -1 ) break;
blockBuffer[blockBufferLength++] = (char)c;
// end of block?
boolean foundDelimiter =
( blockDelimiters != null ) &&
( blockDelimiters.length() > 0 ) &&
blockDelimiters.indexOf( c ) >= 0;
if( foundDelimiter ||
blockBufferLength >= maxBlockChars ) break;
}
// block buffer available?
if( blockBufferLength == 0 ) return false;
replaceBlockBuffer = getReplaceBlock( blockBuffer, 0, blockBufferLength );
replaceBlockBufferOffset = 0;
}
}
@Override
public int read() throws IOException {
while( prepareReplaceBlock() ){
return replaceBlockBuffer.charAt( replaceBlockBufferOffset++ );
}
return -1;
@Deprecated
public PatternReplaceCharFilter(Pattern pattern, String replacement,
int maxBlockChars, String blockDelimiter, CharStream in) {
this(pattern, replacement, in);
}
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
char[] tmp = new char[len];
int l = input.read(tmp, 0, len);
if (l != -1) {
for(int i = 0; i < l; i++)
pushLastChar(tmp[i]);
// Buffer all input on the first call.
if (transformedInput == null) {
StringBuilder buffered = new StringBuilder();
char [] temp = new char [1024];
for (int cnt = input.read(temp); cnt > 0; cnt = input.read(temp)) {
buffered.append(temp, 0, cnt);
}
transformedInput = new StringReader(processPattern(buffered).toString());
}
l = 0;
for(int i = off; i < off + len; i++) {
int c = read();
if (c == -1) break;
cbuf[i] = (char) c;
l++;
}
return l == 0 ? -1 : l;
return transformedInput.read(cbuf, off, len);
}
private int nextChar() throws IOException {
if (buffer != null && !buffer.isEmpty()) {
nextCharCounter++;
return buffer.removeFirst().charValue();
}
int c = input.read();
if( c != -1 )
nextCharCounter++;
return c;
@Override
protected int correct(int currentOff) {
return Math.max(0, super.correct(currentOff));
}
private void pushLastChar(int c) {
if (buffer == null) {
buffer = new LinkedList<Character>();
}
buffer.addLast(new Character((char) c));
}
String getReplaceBlock( String block ){
char[] blockChars = block.toCharArray();
return getReplaceBlock( blockChars, 0, blockChars.length );
}
String getReplaceBlock( char block[], int offset, int length ){
StringBuffer replaceBlock = new StringBuffer();
String sourceBlock = new String( block, offset, length );
Matcher m = pattern.matcher( sourceBlock );
int lastMatchOffset = 0, lastDiff = 0;
while( m.find() ){
m.appendReplacement( replaceBlock, replacement );
// record cumulative diff for the offset correction
int diff = replaceBlock.length() - lastMatchOffset - lastDiff - ( m.end( 0 ) - lastMatchOffset );
if (diff != 0) {
int prevCumulativeDiff = getLastCumulativeDiff();
if (diff > 0) {
for(int i = 0; i < diff; i++){
addOffCorrectMap(nextCharCounter - length + m.end( 0 ) + i - prevCumulativeDiff,
prevCumulativeDiff - 1 - i);
}
/**
* Replace pattern in input and mark correction offsets.
*/
CharSequence processPattern(CharSequence input) {
final Matcher m = pattern.matcher(input);
final StringBuffer cumulativeOutput = new StringBuffer();
int cumulative = 0;
int lastMatchEnd = 0;
while (m.find()) {
final int groupSize = m.end() - m.start();
final int skippedSize = m.start() - lastMatchEnd;
lastMatchEnd = m.end();
final int lengthBeforeReplacement = cumulativeOutput.length() + skippedSize;
m.appendReplacement(cumulativeOutput, replacement);
// Matcher doesn't tell us how many characters have been appended before the replacement.
// So we need to calculate it. Skipped characters have been added as part of appendReplacement.
final int replacementSize = cumulativeOutput.length() - lengthBeforeReplacement;
if (groupSize != replacementSize) {
if (replacementSize < groupSize) {
// The replacement is smaller.
// Add the 'backskip' to the next index after the replacement (this is possibly
// after the end of string, but it's fine -- it just means the last character
// of the replaced block doesn't reach the end of the original string.
cumulative += groupSize - replacementSize;
int atIndex = lengthBeforeReplacement + replacementSize;
// System.err.println(atIndex + "!" + cumulative);
addOffCorrectMap(atIndex, cumulative);
} else {
addOffCorrectMap(nextCharCounter - length + m.end( 0 ) + diff - prevCumulativeDiff,
prevCumulativeDiff - diff);
// The replacement is larger. Every new index needs to point to the last
// element of the original group (if any).
for (int i = groupSize; i < replacementSize; i++) {
addOffCorrectMap(lengthBeforeReplacement + i, --cumulative);
// System.err.println((lengthBeforeReplacement + i) + " " + cumulative);
}
}
}
// save last offsets
lastMatchOffset = m.end( 0 );
lastDiff = diff;
}
// copy remaining of the part of source block
m.appendTail( replaceBlock );
return replaceBlock.toString();
// Append the remaining output, no further changes to indices.
m.appendTail(cumulativeOutput);
return cumulativeOutput;
}
}

View File

@ -21,6 +21,7 @@ import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@ -29,12 +30,107 @@ import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util._TestUtil;
/**
* Tests {@link PatternReplaceCharFilter}
*/
public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
public void testFailingDot() throws IOException {
checkOutput(
"A. .B.", "\\.[\\s]*", ".",
"A..B.",
"A..B.");
}
public void testLongerReplacement() throws IOException {
checkOutput(
"XXabcZZabcYY", "abc", "abcde",
"XXabcdeZZabcdeYY",
"XXabcccZZabcccYY");
checkOutput(
"XXabcabcYY", "abc", "abcde",
"XXabcdeabcdeYY",
"XXabcccabcccYY");
checkOutput(
"abcabcYY", "abc", "abcde",
"abcdeabcdeYY",
"abcccabcccYY");
checkOutput(
"YY", "^", "abcde",
"abcdeYY",
// Should be: "-----YY" but we're enforcing non-negative offsets.
"YYYYYYY");
checkOutput(
"YY", "$", "abcde",
"YYabcde",
"YYYYYYY");
checkOutput(
"XYZ", ".", "abc",
"abcabcabc",
"XXXYYYZZZ");
checkOutput(
"XYZ", ".", "$0abc",
"XabcYabcZabc",
"XXXXYYYYZZZZ");
}
public void testShorterReplacement() throws IOException {
checkOutput(
"XXabcZZabcYY", "abc", "xy",
"XXxyZZxyYY",
"XXabZZabYY");
checkOutput(
"XXabcabcYY", "abc", "xy",
"XXxyxyYY",
"XXababYY");
checkOutput(
"abcabcYY", "abc", "xy",
"xyxyYY",
"ababYY");
checkOutput(
"abcabcYY", "abc", "",
"YY",
"YY");
checkOutput(
"YYabcabc", "abc", "",
"YY",
"YY");
}
private void checkOutput(String input, String pattern, String replacement,
String expectedOutput, String expectedIndexMatchedOutput) throws IOException {
CharStream cs = new PatternReplaceCharFilter(pattern(pattern), replacement,
CharReader.get(new StringReader(input)));
StringBuilder output = new StringBuilder();
for (int chr = cs.read(); chr > 0; chr = cs.read()) {
output.append((char) chr);
}
StringBuilder indexMatched = new StringBuilder();
for (int i = 0; i < output.length(); i++) {
indexMatched.append((cs.correctOffset(i) < 0 ? "-" : input.charAt(cs.correctOffset(i))));
}
boolean outputGood = expectedOutput.equals(output.toString());
boolean indexMatchedGood = expectedIndexMatchedOutput.equals(indexMatched.toString());
if (!outputGood || !indexMatchedGood || false) {
System.out.println("Pattern : " + pattern);
System.out.println("Replac. : " + replacement);
System.out.println("Input : " + input);
System.out.println("Output : " + output);
System.out.println("Expected: " + expectedOutput);
System.out.println("Output/i: " + indexMatched);
System.out.println("Expected: " + expectedIndexMatchedOutput);
System.out.println();
}
assertTrue("Output doesn't match.", outputGood);
assertTrue("Index-matched output doesn't match.", indexMatchedGood);
}
// 1111
// 01234567890123
// this is test.
@ -142,9 +238,13 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
// 012345678901234567890123456789012345678
// aa bb cc --- aa bb aa. bb aa bb cc
// aa##bb cc --- aa##bb aa. bb aa##bb cc
// aa bb cc --- aa bbbaa. bb aa b cc
public void test2blocksMultiMatches() throws IOException {
final String BLOCK = " aa bb cc --- aa bb aa. bb aa bb cc";
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)"), "$1##$2", ".",
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)"), "$1##$2",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
@ -160,10 +260,10 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
// aa b - c . --- b aa . c c b
public void testChain() throws IOException {
final String BLOCK = " a bb - ccc . --- bb a . ccc ccc bb";
CharStream cs = new PatternReplaceCharFilter( pattern("a"), "aa", ".",
CharStream cs = new PatternReplaceCharFilter( pattern("a"), "aa",
CharReader.get( new StringReader( BLOCK ) ) );
cs = new PatternReplaceCharFilter( pattern("bb"), "b", ".", cs );
cs = new PatternReplaceCharFilter( pattern("ccc"), "c", ".", cs );
cs = new PatternReplaceCharFilter( pattern("bb"), "b", cs );
cs = new PatternReplaceCharFilter( pattern("ccc"), "c", cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" },
@ -178,18 +278,33 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
int numPatterns = atLeast(100);
for (int i = 0; i < numPatterns; i++) {
final Pattern p = randomPattern();
final String replacement = _TestUtil.randomSimpleString(random);
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@Override
protected Reader initReader(Reader reader) {
return new PatternReplaceCharFilter(Pattern.compile("a"), "b", CharReader.get(reader));
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
@Override
protected Reader initReader(Reader reader) {
return new PatternReplaceCharFilter(p, replacement, CharReader.get(reader));
}
};
checkRandomData(random, a, 1000*RANDOM_MULTIPLIER, true); // only ascii
}
}
}
public static Pattern randomPattern() {
while (true) {
try {
return Pattern.compile(_TestUtil.randomRegexpishString(random));
} catch (PatternSyntaxException ignored) {
// if at first you don't succeed...
}
}
}
}