LUCENE-8237: Correct handling of position increments in sub-tokenstreams

commit ab2fec1642 (parent d1dc874d90)
Alan Woodward, 2018-06-18 09:29:58 +01:00
5 changed files with 103 additions and 47 deletions
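For context, a minimal sketch (not part of the commit) of the situation this change addresses: a ConditionalTokenFilter chops its input into snippets and shows only the tokens that pass the condition to the wrapped filter, so a delegated snippet can begin on a stacked token (position increment 0), which is illegal as the first token of a TokenStream. The analyzer below is an illustrative assumption built from classes that appear in this diff plus KeywordRepeatFilter and WhitespaceTokenizer; the class name is hypothetical.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
import org.apache.lucene.analysis.miscellaneous.ProtectedTermFilter;
import org.apache.lucene.analysis.shingle.ShingleFilter;

public class StackedTokenConditionSketch {
  public static Analyzer build() {
    CharArraySet protectedTerms = new CharArraySet(2, true);
    protectedTerms.add("two");
    return new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new WhitespaceTokenizer();
        // KeywordRepeatFilter emits each token twice at the same position:
        // the copy carries a position increment of 0 (a "stacked" token).
        TokenStream sink = new KeywordRepeatFilter(source);
        // ProtectedTermFilter (a ConditionalTokenFilter) shows only
        // non-protected tokens to the wrapped ShingleFilter, so a delegated
        // snippet can start on one of the zero-increment copies.
        sink = new ProtectedTermFilter(protectedTerms, sink, in -> new ShingleFilter(in, 2));
        return new TokenStreamComponents(source, sink);
      }
    };
  }
}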

ConditionalTokenFilter.java

@@ -43,24 +43,29 @@ public abstract class ConditionalTokenFilter extends TokenFilter {
   private final class OneTimeWrapper extends TokenStream {
 
     private final OffsetAttribute offsetAtt;
+    private final PositionIncrementAttribute posIncAtt;
 
     public OneTimeWrapper(AttributeSource attributeSource) {
       super(attributeSource);
       this.offsetAtt = attributeSource.addAttribute(OffsetAttribute.class);
+      this.posIncAtt = attributeSource.addAttribute(PositionIncrementAttribute.class);
     }
 
     @Override
     public boolean incrementToken() throws IOException {
       if (state == TokenState.PREBUFFERING) {
+        if (posIncAtt.getPositionIncrement() == 0) {
+          adjustPosition = true;
+          posIncAtt.setPositionIncrement(1);
+        }
         state = TokenState.DELEGATING;
         return true;
       }
       assert state == TokenState.DELEGATING;
-      boolean more = input.incrementToken();
-      if (more && shouldFilter()) {
-        return true;
-      }
-      if (more) {
+      if (input.incrementToken()) {
+        if (shouldFilter()) {
+          return true;
+        }
         endOffset = offsetAtt.endOffset();
         bufferedState = captureState();
       }
@@ -96,10 +101,11 @@ public abstract class ConditionalTokenFilter extends TokenFilter {
   private boolean lastTokenFiltered;
   private State bufferedState = null;
   private boolean exhausted;
+  private boolean adjustPosition;
   private State endState = null;
   private int endOffset;
 
-  private PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
 
   /**
    * Create a new ConditionalTokenFilter
@@ -124,6 +130,7 @@ public abstract class ConditionalTokenFilter extends TokenFilter {
     this.lastTokenFiltered = false;
     this.bufferedState = null;
     this.exhausted = false;
+    this.adjustPosition = false;
    this.endOffset = -1;
    this.endState = null;
  }
@@ -168,16 +175,6 @@ public abstract class ConditionalTokenFilter extends TokenFilter {
       return false;
     }
     if (shouldFilter()) {
-      // we're chopping the underlying Tokenstream up into fragments, and presenting
-      // only those parts of it that pass the filter to the delegate, so the delegate is
-      // in effect seeing multiple tokenstream snippets. Tokenstreams can't have an initial
-      // position increment of 0, so if the snippet starts on a stacked token we need to
-      // offset it here and then correct the increment back again after delegation
-      boolean adjustPosition = false;
-      if (posIncAtt.getPositionIncrement() == 0) {
-        posIncAtt.setPositionIncrement(1);
-        adjustPosition = true;
-      }
       lastTokenFiltered = true;
       state = TokenState.PREBUFFERING;
       // we determine that the delegate has emitted all the tokens it can at the current
@@ -192,20 +189,14 @@ public abstract class ConditionalTokenFilter extends TokenFilter {
             int posInc = posIncAtt.getPositionIncrement();
             posIncAtt.setPositionIncrement(posInc - 1);
           }
-          adjustPosition = false;
         }
         else {
           lastTokenFiltered = false;
           state = TokenState.READING;
-          if (bufferedState != null) {
-            delegate.end();
-            int posInc = posIncAtt.getPositionIncrement();
-            restoreState(bufferedState);
-            posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + posInc);
-            bufferedState = null;
-            return true;
-          }
+          return endDelegating();
         }
-        return more || bufferedState != null;
+        return true;
       }
       lastTokenFiltered = false;
       return true;
@@ -216,8 +207,27 @@ public abstract class ConditionalTokenFilter extends TokenFilter {
       }
       // no more cached tokens
       state = TokenState.READING;
+      return endDelegating();
     }
   }
+
+  private boolean endDelegating() throws IOException {
+    if (bufferedState == null) {
+      assert exhausted == true;
+      return false;
+    }
+    delegate.end();
+    int posInc = posIncAtt.getPositionIncrement();
+    restoreState(bufferedState);
+    // System.out.println("Buffered posInc: " + posIncAtt.getPositionIncrement() + " Delegated posInc: " + posInc);
+    posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + posInc);
+    if (adjustPosition) {
+      posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() - 1);
+      adjustPosition = false;
+    }
+    bufferedState = null;
+    return true;
+  }
 
 }
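The arithmetic in endDelegating() is easiest to see with numbers. A hedged walk-through, not code from the commit: delegate.end() leaves the position increment the delegate accumulated past its last emitted token on posIncAtt; adding that to the increment captured with the buffered (first unfiltered) token preserves the total position count, and the artificial increment added for a snippet that started on a stacked token is then subtracted back out.

// Illustrative values only; not code from the commit.
int delegatedPosInc = 2;       // left on posIncAtt by delegate.end()
int bufferedPosInc = 1;        // increment captured with the buffered token
boolean adjustPosition = true; // snippet started on a stacked token (0 -> 1 bump)

int emitted = bufferedPosInc + delegatedPosInc; // as in endDelegating()
if (adjustPosition) {
  emitted -= 1; // undo the artificial 0 -> 1 bump made before delegating
}
// emitted == 2: the stream advances by the same number of positions
// that the unwrapped token stream would have advanced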

TestRandomChains.java

@@ -97,7 +97,6 @@ import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.util.AttributeFactory;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.CharsRef;
-import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.Rethrow;
 import org.apache.lucene.util.TestUtil;
 import org.apache.lucene.util.Version;
@@ -130,6 +129,9 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
     // expose inconsistent offsets
     // https://issues.apache.org/jira/browse/LUCENE-4170
     avoidConditionals.add(ShingleFilter.class);
+    // FlattenGraphFilter changes the output graph entirely, so wrapping it in a condition
+    // can break position lengths
+    avoidConditionals.add(FlattenGraphFilter.class);
   }
 
   private static final Map<Constructor<?>,Predicate<Object[]>> brokenConstructors = new HashMap<>();
@@ -626,7 +628,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
     return sb.toString();
   }
 
-  private <T> T createComponent(Constructor<T> ctor, Object[] args, StringBuilder descr) {
+  private <T> T createComponent(Constructor<T> ctor, Object[] args, StringBuilder descr, boolean isConditional) {
     try {
       final T instance = ctor.newInstance(args);
       /*
@@ -635,6 +637,9 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       }
       */
       descr.append("\n  ");
+      if (isConditional) {
+        descr.append("Conditional:");
+      }
       descr.append(ctor.getDeclaringClass().getName());
       String params = Arrays.deepToString(args);
       params = params.substring(1, params.length()-1);
@@ -673,7 +678,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
         if (broken(ctor, args)) {
           continue;
         }
-        spec.tokenizer = createComponent(ctor, args, descr);
+        spec.tokenizer = createComponent(ctor, args, descr, false);
         if (spec.tokenizer != null) {
           spec.toString = descr.toString();
         }
@@ -693,7 +698,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
         if (broken(ctor, args)) {
           continue;
         }
-        reader = createComponent(ctor, args, descr);
+        reader = createComponent(ctor, args, descr, false);
         if (reader != null) {
           spec.reader = reader;
           break;
@@ -725,8 +730,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       if (broken(ctor, args)) {
         return in;
       }
-      descr.append("ConditionalTokenFilter: ");
-      TokenStream ts = createComponent(ctor, args, descr);
+      TokenStream ts = createComponent(ctor, args, descr, true);
       if (ts == null) {
         return in;
       }
@@ -752,7 +756,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
         if (broken(ctor, args)) {
           continue;
         }
-        final TokenFilter flt = createComponent(ctor, args, descr);
+        final TokenFilter flt = createComponent(ctor, args, descr, false);
         if (flt != null) {
           spec.stream = flt;
           break;
@@ -849,7 +853,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
     String toString;
   }
 
-  @LuceneTestCase.BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 12-Jun-2018
   public void testRandomChains() throws Throwable {
     int numIterations = TEST_NIGHTLY ? atLeast(20) : 3;
     Random random = random();
@@ -878,7 +881,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
   }
 
   // we might regret this decision...
-  @LuceneTestCase.BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 12-Jun-2018
   public void testRandomChainsWithLargeStrings() throws Throwable {
     int numIterations = TEST_NIGHTLY ? atLeast(20) : 3;
     Random random = random();

TestConditionalTokenFilter.java

@@ -43,6 +43,7 @@ import org.apache.lucene.analysis.ngram.NGramTokenizer;
 import org.apache.lucene.analysis.shingle.FixedShingleFilter;
 import org.apache.lucene.analysis.shingle.ShingleFilter;
 import org.apache.lucene.analysis.standard.ClassicTokenizer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.synonym.SolrSynonymParser;
 import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
 import org.apache.lucene.analysis.synonym.SynonymMap;
@@ -260,6 +261,7 @@ public class TestConditionalTokenFilter extends BaseTokenStreamTestCase {
       protected TokenStreamComponents createComponents(String fieldName) {
         Tokenizer source = new ClassicTokenizer();
         TokenStream sink = new ProtectedTermFilter(protectedTerms, source, in -> new ShingleFilter(in, 2));
+        sink = new ValidatingTokenFilter(sink, "1");
         return new TokenStreamComponents(source, sink);
       }
     };
@@ -268,14 +270,53 @@ public class TestConditionalTokenFilter extends BaseTokenStreamTestCase {
     try (TokenStream ts = analyzer.tokenStream("", input)) {
       assertTokenStreamContents(ts, new String[]{
-          "one", "one two",
-          "two",
-          "three",
-          "four"
-      });
+          "one", "one two", "two", "three", "four"
+      }, new int[]{
+          0, 0, 4, 8, 14
+      }, new int[]{
+          3, 7, 7, 13, 18
+      }, new int[]{
+          1, 0, 1, 1, 1
+      }, new int[]{
+          1, 2, 1, 1, 1
+      }, 18);
     }
   }
 
+  public void testFilteringWithReadahead() throws IOException {
+    CharArraySet protectedTerms = new CharArraySet(2, true);
+    protectedTerms.add("two");
+    protectedTerms.add("two three");
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer source = new StandardTokenizer();
+        TokenStream sink = new ShingleFilter(source, 3);
+        sink = new ProtectedTermFilter(protectedTerms, sink, in -> new TypeTokenFilter(in, Collections.singleton("ALL"), true));
+        return new TokenStreamComponents(source, sink);
+      }
+    };
+
+    String input = "one two three four";
+    try (TokenStream ts = analyzer.tokenStream("", input)) {
+      assertTokenStreamContents(ts, new String[]{
+          "two", "two three"
+      }, new int[]{
+          4, 4
+      }, new int[]{
+          7, 13
+      }, new int[]{
+          2, 0
+      }, new int[]{
+          1, 2
+      }, 18);
+    }
+  }
+
   public void testMultipleConditionalFilters() throws IOException {
     TokenStream stream = whitespaceMockTokenizer("Alice Bob Clara David");
     TokenStream t = new SkipMatchingFilter(stream, in -> {
@@ -311,7 +352,8 @@ public class TestConditionalTokenFilter extends BaseTokenStreamTestCase {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
         Tokenizer source = new NGramTokenizer();
-        TokenStream sink = new KeywordRepeatFilter(source);
+        TokenStream sink = new ValidatingTokenFilter(new KeywordRepeatFilter(source), "stage 0");
+        sink = new ValidatingTokenFilter(sink, "stage 1");
         sink = new RandomSkippingFilter(sink, seed, in -> new TypeTokenFilter(in, Collections.singleton("word")));
         sink = new ValidatingTokenFilter(sink, "last stage");
         return new TokenStreamComponents(source, sink);
@@ -361,11 +403,12 @@ public class TestConditionalTokenFilter extends BaseTokenStreamTestCase {
     @Override
     public boolean incrementToken() throws IOException {
-      if (reset) {
+      boolean more = input.incrementToken();
+      if (more && reset) {
         assertEquals(1, posIncAtt.getPositionIncrement());
       }
       reset = false;
-      return input.incrementToken();
+      return more;
     }
   }
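The last hunk fixes the test's checking filter rather than production code: attribute values are only defined after input.incrementToken() has returned true, so the stream must be advanced before the increment is inspected. A minimal sketch of the corrected pattern, with a hypothetical class name (not from the commit):

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

// Hypothetical filter illustrating the corrected pattern: advance the wrapped
// stream first, then read attributes only if a token was actually produced.
final class FirstPosIncCheckFilter extends TokenFilter {

  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private boolean first = true;

  FirstPosIncCheckFilter(TokenStream in) {
    super(in);
  }

  @Override
  public boolean incrementToken() throws IOException {
    boolean more = input.incrementToken(); // advance before touching attributes
    if (more && first) {
      // the first token of any stream must advance by at least one position
      assert posIncAtt.getPositionIncrement() >= 1 : "first posInc must be > 0";
    }
    first = false;
    return more;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    first = true;
  }
}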

(file name not captured; package org.apache.lucene.analysis)

@@ -19,8 +19,6 @@ package org.apache.lucene.analysis;
 import java.io.IOException;
 
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 
 /**

ValidatingTokenFilter.java

@@ -62,6 +62,9 @@ public final class ValidatingTokenFilter extends TokenFilter {
   @Override
   public boolean incrementToken() throws IOException {
+
+    // System.out.println(name + ": incrementToken()");
+
     if (!input.incrementToken()) {
       return false;
     }
@@ -69,15 +72,15 @@ public final class ValidatingTokenFilter extends TokenFilter {
     int startOffset = 0;
     int endOffset = 0;
     int posLen = 0;
+
+    // System.out.println(name + ": " + this);
 
     if (posIncAtt != null) {
       pos += posIncAtt.getPositionIncrement();
       if (pos == -1) {
-        throw new IllegalStateException("first posInc must be > 0");
+        throw new IllegalStateException(name + ": first posInc must be > 0");
       }
     }
 
-    // System.out.println("  got token=" + termAtt + " pos=" + pos);
-
     if (offsetAtt != null) {
       startOffset = offsetAtt.startOffset();