LUCENE-8273: TestRandomChains found some more end() handling problems

This commit is contained in:
Alan Woodward 2018-05-21 15:12:32 +01:00
parent 63e213916c
commit 0c0fce3e98
2 changed files with 74 additions and 19 deletions

View File

@@ -80,10 +80,10 @@ public abstract class ConditionalTokenFilter extends TokenFilter {
public void end() throws IOException {
// imitate Tokenizer.end() call - endAttributes, set final offset
if (exhausted) {
if (endCalled == false) {
if (endState == null) {
input.end();
endState = captureState();
}
endCalled = true;
endOffset = offsetAtt.endOffset();
}
endAttributes();
@@ -96,7 +96,7 @@ public abstract class ConditionalTokenFilter extends TokenFilter {
private boolean lastTokenFiltered;
private State bufferedState = null;
private boolean exhausted;
private boolean endCalled;
private State endState = null;
private int endOffset;
private PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
@@ -125,18 +125,22 @@ public abstract class ConditionalTokenFilter extends TokenFilter {
this.bufferedState = null;
this.exhausted = false;
this.endOffset = -1;
this.endCalled = false;
this.endState = null;
}
@Override
public void end() throws IOException {
if (endCalled == false) {
if (endState == null) {
super.end();
endCalled = true;
endState = captureState();
}
else {
restoreState(endState);
}
endOffset = getAttribute(OffsetAttribute.class).endOffset();
if (lastTokenFiltered) {
this.delegate.end();
endState = captureState();
}
}

View File

@@ -37,7 +37,10 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ValidatingTokenFilter;
import org.apache.lucene.analysis.core.TypeTokenFilter;
import org.apache.lucene.analysis.de.GermanStemFilter;
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.shingle.FixedShingleFilter;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.standard.ClassicTokenizer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
@@ -308,19 +311,7 @@ public class TestConditionalTokenFilter extends BaseTokenStreamTestCase {
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new NGramTokenizer();
TokenStream sink = new KeywordRepeatFilter(source);
sink = new ConditionalTokenFilter(sink, in -> new TypeTokenFilter(in, Collections.singleton("word"))) {
Random random = new Random(seed);
@Override
protected boolean shouldFilter() throws IOException {
return random.nextBoolean();
}
@Override
public void reset() throws IOException {
super.reset();
random = new Random(seed);
}
};
sink = new RandomSkippingFilter(sink, seed, in -> new TypeTokenFilter(in, Collections.singleton("word")));
sink = new ValidatingTokenFilter(sink, "last stage");
return new TokenStreamComponents(source, sink);
}
@@ -330,4 +321,64 @@ public class TestConditionalTokenFilter extends BaseTokenStreamTestCase {
}
// Regression test for LUCENE-8273: end() must propagate correctly through a
// chain of conditional filters wrapping shingle/normalization stages.
public void testEndWithShingles() throws IOException {
  TokenStream stream = whitespaceMockTokenizer("cyk jvboq \u092e\u0962\u093f");
  stream = new GermanStemFilter(stream);
  stream = new NonRandomSkippingFilter(stream, in -> new FixedShingleFilter(in, 2), true, false, true);
  stream = new NonRandomSkippingFilter(stream, IndicNormalizationFilter::new, true);
  assertTokenStreamContents(stream, new String[]{"jvboq"});
}
/**
 * A {@link ConditionalTokenFilter} that decides pseudo-randomly, per token,
 * whether to route the token through its wrapped filter.  The decision
 * sequence is reproducible: {@link #reset()} re-seeds the generator so a
 * replayed stream sees the same filter/bypass choices.
 */
private static class RandomSkippingFilter extends ConditionalTokenFilter {

  final long seed;
  Random random;

  protected RandomSkippingFilter(TokenStream input, long seed, Function<TokenStream, TokenStream> inputFactory) {
    super(input, inputFactory);
    this.seed = seed;
    this.random = new Random(seed);
  }

  /** Coin flip: {@code true} applies the wrapped filter to the current token. */
  @Override
  protected boolean shouldFilter() throws IOException {
    return random.nextBoolean();
  }

  /** Re-seed alongside the stream reset so decisions replay identically. */
  @Override
  public void reset() throws IOException {
    super.reset();
    random = new Random(seed);
  }
}
/** A ConditionalTokenFilter that applies or bypasses its wrapped filter according to a fixed, repeating pattern of decisions. */
private static class NonRandomSkippingFilter extends ConditionalTokenFilter {
// Cyclic decision pattern: true = route the token through the wrapped filter.
final boolean[] shouldFilters;
// Index of the next decision in shouldFilters; wraps via modulo.
int pos;
/**
* Create a new NonRandomSkippingFilter
*
* @param input the input TokenStream
* @param inputFactory a factory function to create a new instance of the TokenFilter to wrap
* @param shouldFilters the repeating pattern of filter/bypass decisions, consumed one entry per token
*/
protected NonRandomSkippingFilter(TokenStream input, Function<TokenStream, TokenStream> inputFactory, boolean... shouldFilters) {
super(input, inputFactory);
this.shouldFilters = shouldFilters;
}
// Return the next decision from the pattern, cycling back to the start when exhausted.
@Override
protected boolean shouldFilter() throws IOException {
return shouldFilters[pos++ % shouldFilters.length];
}
// Restart the decision pattern together with the underlying stream.
@Override
public void reset() throws IOException {
super.reset();
pos = 0;
}
}
}