mirror of https://github.com/apache/lucene.git
LUCENE-8273: TestRandomChains found some more end() handling problems
This commit is contained in:
parent
63e213916c
commit
0c0fce3e98
|
@ -80,10 +80,10 @@ public abstract class ConditionalTokenFilter extends TokenFilter {
|
||||||
public void end() throws IOException {
|
public void end() throws IOException {
|
||||||
// imitate Tokenizer.end() call - endAttributes, set final offset
|
// imitate Tokenizer.end() call - endAttributes, set final offset
|
||||||
if (exhausted) {
|
if (exhausted) {
|
||||||
if (endCalled == false) {
|
if (endState == null) {
|
||||||
input.end();
|
input.end();
|
||||||
|
endState = captureState();
|
||||||
}
|
}
|
||||||
endCalled = true;
|
|
||||||
endOffset = offsetAtt.endOffset();
|
endOffset = offsetAtt.endOffset();
|
||||||
}
|
}
|
||||||
endAttributes();
|
endAttributes();
|
||||||
|
@ -96,7 +96,7 @@ public abstract class ConditionalTokenFilter extends TokenFilter {
|
||||||
private boolean lastTokenFiltered;
|
private boolean lastTokenFiltered;
|
||||||
private State bufferedState = null;
|
private State bufferedState = null;
|
||||||
private boolean exhausted;
|
private boolean exhausted;
|
||||||
private boolean endCalled;
|
private State endState = null;
|
||||||
private int endOffset;
|
private int endOffset;
|
||||||
|
|
||||||
private PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
|
private PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
|
||||||
|
@ -125,18 +125,22 @@ public abstract class ConditionalTokenFilter extends TokenFilter {
|
||||||
this.bufferedState = null;
|
this.bufferedState = null;
|
||||||
this.exhausted = false;
|
this.exhausted = false;
|
||||||
this.endOffset = -1;
|
this.endOffset = -1;
|
||||||
this.endCalled = false;
|
this.endState = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void end() throws IOException {
|
public void end() throws IOException {
|
||||||
if (endCalled == false) {
|
if (endState == null) {
|
||||||
super.end();
|
super.end();
|
||||||
endCalled = true;
|
endState = captureState();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
restoreState(endState);
|
||||||
}
|
}
|
||||||
endOffset = getAttribute(OffsetAttribute.class).endOffset();
|
endOffset = getAttribute(OffsetAttribute.class).endOffset();
|
||||||
if (lastTokenFiltered) {
|
if (lastTokenFiltered) {
|
||||||
this.delegate.end();
|
this.delegate.end();
|
||||||
|
endState = captureState();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -37,7 +37,10 @@ import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.ValidatingTokenFilter;
|
import org.apache.lucene.analysis.ValidatingTokenFilter;
|
||||||
import org.apache.lucene.analysis.core.TypeTokenFilter;
|
import org.apache.lucene.analysis.core.TypeTokenFilter;
|
||||||
|
import org.apache.lucene.analysis.de.GermanStemFilter;
|
||||||
|
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
|
||||||
import org.apache.lucene.analysis.ngram.NGramTokenizer;
|
import org.apache.lucene.analysis.ngram.NGramTokenizer;
|
||||||
|
import org.apache.lucene.analysis.shingle.FixedShingleFilter;
|
||||||
import org.apache.lucene.analysis.shingle.ShingleFilter;
|
import org.apache.lucene.analysis.shingle.ShingleFilter;
|
||||||
import org.apache.lucene.analysis.standard.ClassicTokenizer;
|
import org.apache.lucene.analysis.standard.ClassicTokenizer;
|
||||||
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
|
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
|
||||||
|
@ -308,8 +311,36 @@ public class TestConditionalTokenFilter extends BaseTokenStreamTestCase {
|
||||||
protected TokenStreamComponents createComponents(String fieldName) {
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
Tokenizer source = new NGramTokenizer();
|
Tokenizer source = new NGramTokenizer();
|
||||||
TokenStream sink = new KeywordRepeatFilter(source);
|
TokenStream sink = new KeywordRepeatFilter(source);
|
||||||
sink = new ConditionalTokenFilter(sink, in -> new TypeTokenFilter(in, Collections.singleton("word"))) {
|
sink = new RandomSkippingFilter(sink, seed, in -> new TypeTokenFilter(in, Collections.singleton("word")));
|
||||||
Random random = new Random(seed);
|
sink = new ValidatingTokenFilter(sink, "last stage");
|
||||||
|
return new TokenStreamComponents(source, sink);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
checkRandomData(random(), analyzer, 1);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testEndWithShingles() throws IOException {
|
||||||
|
TokenStream ts = whitespaceMockTokenizer("cyk jvboq \u092e\u0962\u093f");
|
||||||
|
ts = new GermanStemFilter(ts);
|
||||||
|
ts = new NonRandomSkippingFilter(ts, in -> new FixedShingleFilter(in, 2), true, false, true);
|
||||||
|
ts = new NonRandomSkippingFilter(ts, IndicNormalizationFilter::new, true);
|
||||||
|
|
||||||
|
assertTokenStreamContents(ts, new String[]{"jvboq"});
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class RandomSkippingFilter extends ConditionalTokenFilter {
|
||||||
|
|
||||||
|
Random random;
|
||||||
|
final long seed;
|
||||||
|
|
||||||
|
protected RandomSkippingFilter(TokenStream input, long seed, Function<TokenStream, TokenStream> inputFactory) {
|
||||||
|
super(input, inputFactory);
|
||||||
|
this.seed = seed;
|
||||||
|
this.random = new Random(seed);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected boolean shouldFilter() throws IOException {
|
protected boolean shouldFilter() throws IOException {
|
||||||
return random.nextBoolean();
|
return random.nextBoolean();
|
||||||
|
@ -320,14 +351,34 @@ public class TestConditionalTokenFilter extends BaseTokenStreamTestCase {
|
||||||
super.reset();
|
super.reset();
|
||||||
random = new Random(seed);
|
random = new Random(seed);
|
||||||
}
|
}
|
||||||
};
|
|
||||||
sink = new ValidatingTokenFilter(sink, "last stage");
|
|
||||||
return new TokenStreamComponents(source, sink);
|
|
||||||
}
|
}
|
||||||
};
|
|
||||||
|
|
||||||
checkRandomData(random(), analyzer, 1);
|
private static class NonRandomSkippingFilter extends ConditionalTokenFilter {
|
||||||
|
|
||||||
|
final boolean[] shouldFilters;
|
||||||
|
int pos;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new NonRandomSkippingFilter
|
||||||
|
*
|
||||||
|
* @param input the input TokenStream
|
||||||
|
* @param inputFactory a factory function to create a new instance of the TokenFilter to wrap
|
||||||
|
*/
|
||||||
|
protected NonRandomSkippingFilter(TokenStream input, Function<TokenStream, TokenStream> inputFactory, boolean... shouldFilters) {
|
||||||
|
super(input, inputFactory);
|
||||||
|
this.shouldFilters = shouldFilters;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected boolean shouldFilter() throws IOException {
|
||||||
|
return shouldFilters[pos++ % shouldFilters.length];
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void reset() throws IOException {
|
||||||
|
super.reset();
|
||||||
|
pos = 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue