LUCENE-8273: Adjust position increments when filtering stacked tokens

2018-05-29 15:57:03 +01:00 · 2018-05-29 15:57:03 +01:00 · 4ea9d2ea8c
parent 34741a863a
commit 4ea9d2ea8c
2 changed files with 53 additions and 0 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.java
@ -168,6 +168,16 @@ public abstract class ConditionalTokenFilter extends TokenFilter {
          return false;
        }
        if (shouldFilter()) {
+          // We're chopping the underlying TokenStream up into fragments, and presenting
+          // only those parts of it that pass the filter to the delegate, so the delegate is
+          // in effect seeing multiple TokenStream snippets.  A TokenStream can't have an initial
+          // position increment of 0, so if the snippet starts on a stacked token we need to
+          // bump its increment to 1 here and then subtract 1 again after delegation.
+          boolean adjustPosition = false;
+          if (posIncAtt.getPositionIncrement() == 0) {
+            posIncAtt.setPositionIncrement(1);
+            adjustPosition = true;
+          }
          lastTokenFiltered = true;
          state = TokenState.PREBUFFERING;
          // we determine that the delegate has emitted all the tokens it can at the current
@ -178,6 +188,10 @@ public abstract class ConditionalTokenFilter extends TokenFilter {
          boolean more = delegate.incrementToken();
          if (more) {
            state = TokenState.DELEGATING;
+            if (adjustPosition) {
+              int posInc = posIncAtt.getPositionIncrement();
+              posIncAtt.setPositionIncrement(posInc - 1);
+            }
          }
          else {
            lastTokenFiltered = false;
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConditionalTokenFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConditionalTokenFilter.java
@ -48,6 +48,7 @@ import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
 import org.apache.lucene.analysis.synonym.SynonymMap;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

 public class TestConditionalTokenFilter extends BaseTokenStreamTestCase {

@ -330,6 +331,44 @@ public class TestConditionalTokenFilter extends BaseTokenStreamTestCase {
    assertTokenStreamContents(ts, new String[]{"jvboq"});
  }

+  public void testInternalPositionAdjustment() throws IOException {
+    // Check that the partial TokenStream sent to the conditionally applied filter begins with
+    // a posInc of 1, even if the input stream has a posInc of 0 at that position, and that the
+    // filtered stream has the correct posInc afterwards.
+    TokenStream ts = whitespaceMockTokenizer("one two three");
+    ts = new KeywordRepeatFilter(ts);
+    ts = new NonRandomSkippingFilter(ts, PositionAssertingTokenFilter::new, false, true, true, true, true, false);
+
+    assertTokenStreamContents(ts,
+        new String[]{ "one", "one", "two", "two", "three", "three" },
+        new int[]{    1,      0,    1,      0,    1,        0});
+  }
+
+  private static final class PositionAssertingTokenFilter extends TokenFilter {
+
+    boolean reset = false;
+    final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+
+    protected PositionAssertingTokenFilter(TokenStream input) {
+      super(input);
+    }
+
+    @Override
+    public void reset() throws IOException {
+      super.reset();
+      this.reset = true;
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (reset) {
+        assertEquals(1, posIncAtt.getPositionIncrement());
+      }
+      reset = false;
+      return input.incrementToken();
+    }
+  }
+
  private static class RandomSkippingFilter extends ConditionalTokenFilter {

    Random random;