LUCENE-8273: Adjust position increments when filtering stacked tokens

This commit is contained in:
Alan Woodward 2018-05-29 15:57:03 +01:00
parent 34741a863a
commit 4ea9d2ea8c
2 changed files with 53 additions and 0 deletions

View File

@ -168,6 +168,16 @@ public abstract class ConditionalTokenFilter extends TokenFilter {
return false;
}
if (shouldFilter()) {
// we're chopping the underlying TokenStream up into fragments, and presenting
// only those parts of it that pass the filter to the delegate, so the delegate is
// in effect seeing multiple TokenStream snippets. A TokenStream can't have an initial
// position increment of 0, so if the snippet starts on a stacked token we need to
// offset it here and then correct the increment back again after delegation.
boolean adjustPosition = false;
if (posIncAtt.getPositionIncrement() == 0) {
posIncAtt.setPositionIncrement(1);
adjustPosition = true;
}
lastTokenFiltered = true;
state = TokenState.PREBUFFERING;
// we determine that the delegate has emitted all the tokens it can at the current
@ -178,6 +188,10 @@ public abstract class ConditionalTokenFilter extends TokenFilter {
boolean more = delegate.incrementToken();
if (more) {
state = TokenState.DELEGATING;
if (adjustPosition) {
int posInc = posIncAtt.getPositionIncrement();
posIncAtt.setPositionIncrement(posInc - 1);
}
}
else {
lastTokenFiltered = false;

View File

@ -48,6 +48,7 @@ import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
public class TestConditionalTokenFilter extends BaseTokenStreamTestCase {
@ -330,6 +331,44 @@ public class TestConditionalTokenFilter extends BaseTokenStreamTestCase {
assertTokenStreamContents(ts, new String[]{"jvboq"});
}
/**
 * Verifies that each partial TokenStream handed to the conditionally-applied filter starts
 * with a position increment of 1, even where the input stream has an increment of 0 at that
 * point, and that the stream seen by the consumer still carries the expected increments.
 */
public void testInternalPositionAdjustment() throws IOException {
  // Build the analysis chain step by step so each stage is named.
  TokenStream tokenizer = whitespaceMockTokenizer("one two three");
  TokenStream repeated = new KeywordRepeatFilter(tokenizer);
  TokenStream conditional = new NonRandomSkippingFilter(
      repeated, PositionAssertingTokenFilter::new, false, true, true, true, true, false);

  // The wrapped PositionAssertingTokenFilter performs the "starts with posInc 1" check itself;
  // here we only pin down what comes out of the outer stream.
  String[] expectedTerms = { "one", "one", "two", "two", "three", "three" };
  int[] expectedPosIncs = { 1, 0, 1, 0, 1, 0 };
  assertTokenStreamContents(conditional, expectedTerms, expectedPosIncs);
}
/**
 * Pass-through filter used only for verification: on the first {@link #incrementToken()}
 * call following a {@link #reset()}, it asserts that the position increment currently held
 * by the attribute source is 1. All tokens are forwarded from the input unchanged.
 */
private static final class PositionAssertingTokenFilter extends TokenFilter {

  /** True between a call to {@link #reset()} and the next {@link #incrementToken()}. */
  boolean reset = false;

  final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

  protected PositionAssertingTokenFilter(TokenStream input) {
    super(input);
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    // arm the check for the next incrementToken() call
    reset = true;
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (reset) {
      // The check runs before pulling from the input, so it inspects whatever increment
      // the attribute source holds at the moment this filter is first asked for a token.
      assertEquals(1, posIncAtt.getPositionIncrement());
      reset = false;
    }
    return input.incrementToken();
  }
}
private static class RandomSkippingFilter extends ConditionalTokenFilter {
Random random;