diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.java index 7de4fbde645..e41ce8268aa 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.java @@ -168,6 +168,16 @@ public abstract class ConditionalTokenFilter extends TokenFilter { return false; } if (shouldFilter()) { + // we're chopping the underlying Tokenstream up into fragments, and presenting + // only those parts of it that pass the filter to the delegate, so the delegate is + // in effect seeing multiple tokenstream snippets. Tokenstreams can't have an initial + // position increment of 0, so if the snippet starts on a stacked token we need to + // offset it here and then correct the increment back again after delegation + boolean adjustPosition = false; + if (posIncAtt.getPositionIncrement() == 0) { + posIncAtt.setPositionIncrement(1); + adjustPosition = true; + } lastTokenFiltered = true; state = TokenState.PREBUFFERING; // we determine that the delegate has emitted all the tokens it can at the current @@ -178,6 +188,10 @@ public abstract class ConditionalTokenFilter extends TokenFilter { boolean more = delegate.incrementToken(); if (more) { state = TokenState.DELEGATING; + if (adjustPosition) { + int posInc = posIncAtt.getPositionIncrement(); + posIncAtt.setPositionIncrement(posInc - 1); + } } else { lastTokenFiltered = false; diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConditionalTokenFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConditionalTokenFilter.java index 511c725e6be..e0bbac44794 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConditionalTokenFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConditionalTokenFilter.java @@ -48,6 +48,7 @@ import org.apache.lucene.analysis.synonym.SynonymGraphFilter; import org.apache.lucene.analysis.synonym.SynonymMap; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; public class TestConditionalTokenFilter extends BaseTokenStreamTestCase { @@ -330,6 +331,44 @@ public class TestConditionalTokenFilter extends BaseTokenStreamTestCase { assertTokenStreamContents(ts, new String[]{"jvboq"}); } + public void testInternalPositionAdjustment() throws IOException { + // check that the partial TokenStream sent to the condition filter begins with a posInc of 1, + // even if the input stream has a posInc of 0 at that position, and that the filtered stream + // has the correct posInc afterwards + TokenStream ts = whitespaceMockTokenizer("one two three"); + ts = new KeywordRepeatFilter(ts); + ts = new NonRandomSkippingFilter(ts, PositionAssertingTokenFilter::new, false, true, true, true, true, false); + + assertTokenStreamContents(ts, + new String[]{ "one", "one", "two", "two", "three", "three" }, + new int[]{ 1, 0, 1, 0, 1, 0}); + } + + private static final class PositionAssertingTokenFilter extends TokenFilter { + + boolean reset = false; + final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); + + protected PositionAssertingTokenFilter(TokenStream input) { + super(input); + } + + @Override + public void reset() throws IOException { + super.reset(); + this.reset = true; + } + + @Override + public boolean incrementToken() throws IOException { + if (reset) { + assertEquals(1, posIncAtt.getPositionIncrement()); + } + reset = false; + return input.incrementToken(); + } + } + private static class RandomSkippingFilter extends ConditionalTokenFilter { Random random;