LUCENE-8202: Add FixedShingleFilter

This commit is contained in:
Alan Woodward 2018-03-21 10:35:28 +00:00
parent d4e69c5cd8
commit fac84c01c8
7 changed files with 561 additions and 0 deletions

View File

@ -106,6 +106,9 @@ New Features
* LUCENE-8197: A new FeatureField makes it easy and efficient to integrate
static relevance signals into the final score. (Adrien Grand, Robert Muir)
* LUCENE-8202: Add a FixedShingleFilter (Alan Woodward, Adrien Grand, Jim
Ferenczi)
Other
* LUCENE-8214: Improve selection of testPoint for GeoComplexPolygon.

View File

@ -0,0 +1,294 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.shingle;
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Deque;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/**
 * A FixedShingleFilter constructs shingles (token n-grams) from a token stream.
 * In other words, it creates combinations of tokens as a single token.
 *
 * Unlike the {@link ShingleFilter}, FixedShingleFilter only emits shingles of a
 * fixed size, and never emits unigrams, even at the end of a TokenStream. In
 * addition, if the filter encounters stacked tokens (eg synonyms), then it will
 * output stacked shingles.
 *
 * For example, the sentence "please divide this sentence into shingles"
 * might be tokenized into shingles "please divide", "divide this",
 * "this sentence", "sentence into", and "into shingles".
 *
 * This filter handles position increments > 1 by inserting filler tokens
 * (tokens with termtext "_").
 *
 * @lucene.experimental
 */
public final class FixedShingleFilter extends TokenFilter {

  // Pool of recycled Token instances, so that consuming the stream does not
  // allocate a new Token per input token (see newToken()/recycleToken()).
  private final Deque<Token> tokenPool = new ArrayDeque<>();

  // Number of input tokens combined into each emitted shingle.
  private final int shingleSize;
  // Text inserted between the terms of a shingle (default " ").
  private final String tokenSeparator;

  // Sentinel token standing in for a stopword gap (posInc > 1); its term text
  // is the filler token supplied at construction.
  private final Token gapToken = new Token(new AttributeSource());
  // Sentinel token marking exhaustion of the wrapped stream; also records the
  // stream's trailing position increment and final offsets (finishInnerStream()).
  private final Token endToken = new Token(new AttributeSource());

  private final PositionIncrementAttribute incAtt = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

  // The shingleSize tokens making up the shingle currently being emitted;
  // currentShingleTokens[0] is the shingle root. null root => not yet started.
  private Token[] currentShingleTokens;
  private boolean inputStreamExhausted = false;

  /**
   * Creates a FixedShingleFilter with a space separator and "_" filler token.
   *
   * @param input       the underlying TokenStream
   * @param shingleSize the number of tokens per shingle
   */
  public FixedShingleFilter(TokenStream input, int shingleSize) {
    this(input, shingleSize, " ", "_");
  }

  /**
   * Creates a FixedShingleFilter.
   *
   * @param input          the underlying TokenStream
   * @param shingleSize    the number of tokens per shingle
   * @param tokenSeparator text joining the tokens of a shingle
   * @param fillerToken    term text emitted in place of stopword gaps
   */
  public FixedShingleFilter(TokenStream input, int shingleSize, String tokenSeparator, String fillerToken) {
    super(input);
    this.shingleSize = shingleSize;
    this.tokenSeparator = tokenSeparator;
    this.gapToken.termAtt.setEmpty().append(fillerToken);
    this.currentShingleTokens = new Token[shingleSize];
  }

  @Override
  public boolean incrementToken() throws IOException {
    int posInc = 0;
    // First try to emit another stacked shingle at the current position;
    // if none remain, advance the shingle root to the next stream position.
    if (nextShingle() == false) {
      Token nextRoot = nextTokenInStream(currentShingleTokens[0]);
      if (nextRoot == endToken)
        return false;
      recycleToken(currentShingleTokens[0]);
      if (resetShingleRoot(nextRoot) == false) {
        return false;
      }
      posInc = currentShingleTokens[0].posInc();
    }
    // Build the output token: concatenated terms, offsets spanning root to the
    // last real (non-gap) token, posLen covering the whole shingle.
    clearAttributes();
    incAtt.setPositionIncrement(posInc);
    offsetAtt.setOffset(currentShingleTokens[0].startOffset(), lastTokenInShingle().endOffset());
    termAtt.setEmpty();
    termAtt.append(currentShingleTokens[0].term());
    typeAtt.setType("shingle");
    posLenAtt.setPositionLength(shingleSize);
    for (int i = 1; i < shingleSize; i++) {
      termAtt.append(tokenSeparator).append(currentShingleTokens[i].term());
    }
    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    this.tokenPool.clear();
    this.currentShingleTokens[0] = null;
    this.inputStreamExhausted = false;
  }

  @Override
  public void end() throws IOException {
    // If incrementToken() never reached the end of the input (e.g. the stream
    // was shorter than shingleSize), the inner stream still needs ending.
    if (inputStreamExhausted == false) {
      finishInnerStream();
    }
    clearAttributes();
    this.offsetAtt.setOffset(0, endToken.endOffset());
  }

  /**
   * Ends the wrapped stream and captures its trailing state (final position
   * increment and offsets) on the endToken sentinel.
   */
  private void finishInnerStream() throws IOException {
    input.end();
    inputStreamExhausted = true;
    // check for gaps at the end of the tokenstream
    endToken.posIncAtt.setPositionIncrement(this.incAtt.getPositionIncrement());
    OffsetAttribute inputOffsets = input.getAttribute(OffsetAttribute.class);
    endToken.offsetAtt.setOffset(inputOffsets.startOffset(), inputOffsets.endOffset());
  }

  /**
   * Returns the last non-gap token of the current shingle, so that the output
   * end offset never comes from a filler token (which has no real offsets).
   */
  private Token lastTokenInShingle() {
    int lastTokenIndex = shingleSize - 1;
    while (currentShingleTokens[lastTokenIndex] == gapToken) {
      lastTokenIndex--;
    }
    return currentShingleTokens[lastTokenIndex];
  }

  /**
   * Rebuilds currentShingleTokens starting from the given root token, pulling
   * shingleSize - 1 follow-on tokens from the graph and inserting gapToken for
   * position increments greater than 1.
   *
   * @return false if the stream ends before a full shingle can be built and
   *         the trailing gap is too small to pad it out with filler tokens
   */
  private boolean resetShingleRoot(Token token) throws IOException {
    this.currentShingleTokens[0] = token;
    for (int i = 1; i < shingleSize; i++) {
      Token current = nextTokenInGraph(this.currentShingleTokens[i - 1]);
      if (current == endToken) {
        if (endToken.posInc() + i >= shingleSize) {
          // end tokens are a special case, because their posIncs are always
          // due to stopwords. Therefore, we can happily append gap tokens
          // to the end of the current shingle
          // NOTE: i and j advance in lockstep here; j only bounds the loop
          // while i remains the write index into currentShingleTokens.
          for (int j = i; j < shingleSize; j++) {
            this.currentShingleTokens[i] = gapToken;
            i++;
          }
          return true;
        }
        return false;
      }
      if (current.posInc() > 1) {
        // insert gaps into the shingle list
        for (int j = 1; j < current.posInc(); j++) {
          this.currentShingleTokens[i] = gapToken;
          i++;
          if (i >= shingleSize)
            return true;
        }
      }
      this.currentShingleTokens[i] = current;
    }
    return true;
  }

  /**
   * Advances to the next stacked shingle at the current root position,
   * returning false if the shingle has not started yet or no stack remains.
   */
  private boolean nextShingle() throws IOException {
    return currentShingleTokens[0] != null && advanceStack();
  }

  // check if the next token in the tokenstream is at the same position as this one
  private boolean lastInStack(Token token) throws IOException {
    Token next = nextTokenInStream(token);
    return next == endToken || next.posInc() != 0;
  }

  /**
   * Moves the right-most advanceable (non-gap, still-stacked) slot of the
   * shingle to its next stacked token, then re-derives every slot after it.
   *
   * @return true if another stacked shingle was produced
   */
  private boolean advanceStack() throws IOException {
    for (int i = shingleSize - 1; i >= 1; i--) {
      if (currentShingleTokens[i] != gapToken && lastInStack(currentShingleTokens[i]) == false) {
        currentShingleTokens[i] = nextTokenInStream(currentShingleTokens[i]);
        for (int j = i + 1; j < shingleSize; j++) {
          currentShingleTokens[j] = nextTokenInGraph(currentShingleTokens[j - 1]);
        }
        return true;
      }
    }
    return false;
  }

  /**
   * Obtains a Token holding the current input attributes, reusing a pooled
   * instance when available.
   */
  private Token newToken() {
    Token token = tokenPool.size() == 0 ? new Token(this.cloneAttributes()) : tokenPool.removeFirst();
    token.reset(this);
    return token;
  }

  /** Returns a token no longer referenced by the shingle window to the pool. */
  private void recycleToken(Token token) {
    if (token == null)
      return;
    token.nextToken = null;
    tokenPool.add(token);
  }

  // for testing
  int instantiatedTokenCount() {
    int tokenCount = tokenPool.size() + 1;
    if (currentShingleTokens[0] == endToken || currentShingleTokens[0] == null)
      return tokenCount;
    for (Token t = currentShingleTokens[0]; t != endToken && t != null; t = t.nextToken) {
      tokenCount++;
    }
    return tokenCount;
  }

  /**
   * Returns the next token at a strictly later position than the given one,
   * skipping over stacked tokens (posInc == 0).
   */
  private Token nextTokenInGraph(Token token) throws IOException {
    do {
      token = nextTokenInStream(token);
      if (token == endToken) {
        return endToken;
      }
    } while (token.posInc() == 0);
    return token;
  }

  /**
   * Returns the token following the given one in the stream, reading from the
   * cached linked list first and only pulling from the input when needed.
   * Passing null returns the first token of the stream.
   */
  private Token nextTokenInStream(Token token) throws IOException {
    if (token != null && token.nextToken != null) {
      return token.nextToken;
    }
    if (input.incrementToken() == false) {
      finishInnerStream();
      if (token == null) {
        return endToken;
      }
      else {
        token.nextToken = endToken;
        return endToken;
      }
    }
    if (token == null) {
      return newToken();
    }
    token.nextToken = newToken();
    return token.nextToken;
  }

  /**
   * A captured input token: a private AttributeSource snapshot plus a link to
   * the token that follows it in the stream (filled in lazily).
   */
  private static class Token {
    final AttributeSource attSource;
    final PositionIncrementAttribute posIncAtt;
    final CharTermAttribute termAtt;
    final OffsetAttribute offsetAtt;
    // Next token in stream order; null if not yet read, endToken at EOS.
    Token nextToken;

    Token(AttributeSource attSource) {
      this.attSource = attSource;
      this.posIncAtt = attSource.addAttribute(PositionIncrementAttribute.class);
      this.termAtt = attSource.addAttribute(CharTermAttribute.class);
      this.offsetAtt = attSource.addAttribute(OffsetAttribute.class);
    }

    int posInc() {
      return this.posIncAtt.getPositionIncrement();
    }

    CharSequence term() {
      return this.termAtt;
    }

    int startOffset() {
      return this.offsetAtt.startOffset();
    }

    int endOffset() {
      return this.offsetAtt.endOffset();
    }

    /** Re-populates this token from the given source and clears the link. */
    void reset(AttributeSource attSource) {
      attSource.copyTo(this.attSource);
      this.nextToken = null;
    }

    @Override
    public String toString() {
      return term() + "(" + startOffset() + "," + endOffset() + ") " + posInc();
    }
  }
}

View File

@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.shingle;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
 * Factory for {@link FixedShingleFilter}
 *
 * Parameters are:
 * <ul>
 * <li>shingleSize - how many tokens should be combined into each shingle (default: 2)
 * <li>tokenSeparator - how tokens should be joined together in the shingle (default: space)
 * <li>fillerToken - what should be added in place of stop words (default: _ )
 * </ul>
 */
public class FixedShingleFilterFactory extends TokenFilterFactory {

  private final int shingleSize;
  private final String tokenSeparator;
  private final String fillerToken;

  /**
   * Creates a new FixedShingleFilterFactory.
   *
   * @param args factory configuration; recognised keys are consumed
   * @throws IllegalArgumentException if unrecognised parameters remain
   */
  public FixedShingleFilterFactory(Map<String, String> args) {
    super(args);
    this.shingleSize = getInt(args, "shingleSize", 2);
    this.tokenSeparator = get(args, "tokenSeparator", " ");
    this.fillerToken = get(args, "fillerToken", "_");
    // Standard TokenFilterFactory convention: fail fast on misspelled or
    // unsupported parameters instead of silently ignoring them.
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public TokenStream create(TokenStream input) {
    return new FixedShingleFilter(input, shingleSize, tokenSeparator, fillerToken);
  }
}

View File

@ -101,6 +101,7 @@ org.apache.lucene.analysis.pt.PortugueseStemFilterFactory
org.apache.lucene.analysis.reverse.ReverseStringFilterFactory
org.apache.lucene.analysis.ru.RussianLightStemFilterFactory
org.apache.lucene.analysis.shingle.ShingleFilterFactory
org.apache.lucene.analysis.shingle.FixedShingleFilterFactory
org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory
org.apache.lucene.analysis.sr.SerbianNormalizationFilterFactory
org.apache.lucene.analysis.standard.ClassicFilterFactory

View File

@ -0,0 +1,200 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.shingle;
import java.io.IOException;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
/**
 * Tests for {@link FixedShingleFilter} covering plain n-grams, custom
 * separators, stopword gaps (filler tokens), trailing gaps, and token graphs
 * (stacked tokens such as synonyms).
 */
public class FixedShingleFilterTest extends BaseTokenStreamTestCase {

  // Basic bigram shingling: terms, offsets, types, posIncs and posLengths.
  public void testBiGramFilter() throws IOException {
    TokenStream ts = new CannedTokenStream(
        new Token("please", 0, 6),
        new Token("divide", 7, 13),
        new Token("this", 14, 18),
        new Token("sentence", 19, 27),
        new Token("into", 28, 32),
        new Token("shingles", 33, 41)
    );
    assertTokenStreamContents(new FixedShingleFilter(ts, 2),
        new String[]{"please divide", "divide this", "this sentence", "sentence into", "into shingles"},
        new int[]{0, 7, 14, 19, 28,},
        new int[]{13, 18, 27, 32, 41,},
        new String[]{"shingle", "shingle", "shingle", "shingle", "shingle",},
        new int[]{1, 1, 1, 1, 1,},
        new int[]{2, 2, 2, 2, 2});
  }

  // The tokenSeparator parameter replaces the default single space.
  public void testBiGramFilterWithAltSeparator() throws IOException {
    TokenStream ts = new CannedTokenStream(
        new Token("please", 0, 6),
        new Token("divide", 7, 13),
        new Token("this", 14, 18),
        new Token("sentence", 19, 27),
        new Token("into", 28, 32),
        new Token("shingles", 33, 41)
    );
    assertTokenStreamContents(new FixedShingleFilter(ts, 2, "<SEP>", "_"),
        new String[]{"please<SEP>divide", "divide<SEP>this", "this<SEP>sentence", "sentence<SEP>into", "into<SEP>shingles"},
        new int[]{0, 7, 14, 19, 28},
        new int[]{13, 18, 27, 32, 41},
        new String[]{"shingle", "shingle", "shingle", "shingle", "shingle"},
        new int[]{1, 1, 1, 1, 1});
  }

  public void testTriGramFilter() throws IOException {
    TokenStream ts = new CannedTokenStream(
        new Token("please", 0, 6),
        new Token("divide", 7, 13),
        new Token("this", 14, 18),
        new Token("sentence", 19, 27),
        new Token("into", 28, 32),
        new Token("shingles", 33, 41)
    );
    assertTokenStreamContents(new FixedShingleFilter(ts, 3),
        new String[]{"please divide this", "divide this sentence", "this sentence into", "sentence into shingles"});
  }

  // Unlike ShingleFilter, no unigrams or shorter shingles are emitted when the
  // stream is shorter than the shingle size: the output is empty.
  public void testShingleSizeGreaterThanTokenstreamLength() throws IOException {
    TokenStream ts = new FixedShingleFilter(new CannedTokenStream(
        new Token("please", 0, 6),
        new Token("divide", 7, 13)
    ), 3);
    ts.reset();
    assertFalse(ts.incrementToken());
  }

  // posInc == 2 marks a removed stopword; the filter fills the hole with "_".
  public void testWithStopwords() throws IOException {
    TokenStream ts = new CannedTokenStream(
        new Token("please", 0, 6),
        new Token("divide", 7, 13),
        new Token("sentence", 2, 19, 27),
        new Token("shingles", 2, 33, 41)
    );
    assertTokenStreamContents(new FixedShingleFilter(ts, 3),
        new String[]{"please divide _", "divide _ sentence", "sentence _ shingles"},
        new int[]{0, 7, 19,},
        new int[]{13, 27, 41,},
        new String[]{"shingle", "shingle", "shingle",},
        new int[]{1, 1, 2,});
  }

  // A posInc of 3 yields two consecutive filler tokens inside the shingles.
  public void testConsecutiveStopwords() throws IOException {
    TokenStream ts = new CannedTokenStream(
        new Token("b", 2, 2, 3),
        new Token("c", 4, 5),
        new Token("d", 6, 7),
        new Token("b", 3, 12, 13),
        new Token("c", 14, 15)
    );
    assertTokenStreamContents(new FixedShingleFilter(ts, 4),
        new String[]{"b c d _", "c d _ _", "d _ _ b"},
        new int[]{2, 4, 6,},
        new int[]{7, 7, 13,},
        new int[]{2, 1, 1,});
  }

  // CannedTokenStream(finalPosInc, finalOffset, ...) models a stopword removed
  // at the very end of the stream; the shingle is padded with a filler token.
  public void testTrailingStopwords() throws IOException {
    TokenStream ts = new CannedTokenStream(1, 7,
        new Token("b", 0, 1),
        new Token("c", 2, 3),
        new Token("d", 4, 5)
    );
    assertTokenStreamContents(new FixedShingleFilter(ts, 3),
        new String[] { "b c d", "c d _" },
        new int[] { 0, 2, },
        new int[] { 5, 5, },
        new int[] { 1, 1, });
  }

  // Two trailing stopwords allow two filler-padded shingles past the last token.
  public void testMultipleTrailingStopwords() throws IOException {
    TokenStream ts = new CannedTokenStream(2, 9,
        new Token("b", 0, 1),
        new Token("c", 2, 3),
        new Token("d", 4, 5)
    );
    assertTokenStreamContents(new FixedShingleFilter(ts, 3),
        new String[] { "b c d", "c d _", "d _ _" },
        new int[] { 0, 2, 4 },
        new int[] { 5, 5, 5 },
        new int[] { 1, 1, 1 });
  }

  // Stacked tokens (posInc == 0, e.g. synonyms) produce stacked shingles:
  // one shingle per path through the graph, stacked via posInc == 0.
  public void testIncomingGraphs() throws IOException {
    TokenStream ts = new CannedTokenStream(
        new Token("b", 0, 1),
        new Token("a", 0, 0, 1),
        new Token("c", 2, 3),
        new Token("b", 4, 5),
        new Token("a", 0, 4, 5),
        new Token("d", 6, 7)
    );
    assertTokenStreamContents(new FixedShingleFilter(ts, 2),
        new String[] { "b c", "a c", "c b", "c a", "b d", "a d" },
        new int[] { 0, 0, 2, 2, 4, 4 },
        new int[] { 3, 3, 5, 5, 7, 7 },
        new int[] { 1, 0, 1, 0, 1, 0 });
  }

  // Trigram shingles spanning a stacked position enumerate every combination.
  public void testShinglesSpanningGraphs() throws IOException {
    TokenStream ts = new CannedTokenStream(
        new Token("b", 0, 1),
        new Token("a", 0, 0, 1),
        new Token("c", 2, 3),
        new Token("b", 4, 5),
        new Token("a", 0, 4, 5),
        new Token("d", 6, 7)
    );
    assertTokenStreamContents(new FixedShingleFilter(ts, 3),
        new String[] { "b c b", "b c a", "a c b", "a c a", "c b d", "c a d" },
        new int[] { 0, 0, 0, 0, 2, 2, },
        new int[] { 5, 5, 5, 5, 7, 7, },
        new int[] { 1, 0, 0, 0, 1, 0, });
  }
}

View File

@ -341,6 +341,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, null);
}
/**
 * Asserts that the stream produces the expected terms, offsets, types,
 * position increments and position lengths, delegating to the most general
 * overload with no expected final offset.
 */
public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] startOffsets, int[] endOffsets, String[] types, int[] posIncrements, int[] posLengths) throws IOException {
  assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, null);
}
/**
 * Asserts that the stream produces the expected terms, checking term text only
 * (offsets, types, increments and lengths are not verified).
 */
public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException {
  assertTokenStreamContents(ts, output, null, null, null, null, null, null);
}

View File

@ -75,6 +75,13 @@ public class Token extends PackedTokenAttributeImpl implements FlagsAttribute, P
setOffset(start, end);
}
/**
 * Constructs a Token with the given term text, position increment, start and end offsets.
 *
 * @param text   term text, appended into this token's term buffer
 * @param posInc position increment relative to the preceding token
 *               (0 = stacked at the same position, &gt;1 = gap)
 * @param start  start offset
 * @param end    end offset
 */
public Token(CharSequence text, int posInc, int start, int end) {
  append(text);
  setOffset(start, end);
  setPositionIncrement(posInc);
}
/**
* {@inheritDoc}
* @see FlagsAttribute