LUCENE-8202: Add FixedShingleFilter

This commit is contained in:
Alan Woodward 2018-03-21 10:35:28 +00:00
parent d4e69c5cd8
commit fac84c01c8
7 changed files with 561 additions and 0 deletions

View File

@ -106,6 +106,9 @@ New Features
* LUCENE-8197: A new FeatureField makes it easy and efficient to integrate
  static relevance signals into the final score. (Adrien Grand, Robert Muir)
* LUCENE-8202: Add a FixedShingleFilter (Alan Woodward, Adrien Grand, Jim
  Ferenczi)
Other
* LUCENE-8214: Improve selection of testPoint for GeoComplexPolygon.

View File

@ -0,0 +1,294 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.shingle;
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Deque;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/**
 * A FixedShingleFilter constructs shingles (token n-grams) from a token stream.
 * In other words, it creates combinations of tokens as a single token.
 *
 * <p>Unlike the {@link ShingleFilter}, FixedShingleFilter only emits shingles of a
 * fixed size, and never emits unigrams, even at the end of a TokenStream. In
 * addition, if the filter encounters stacked tokens (eg synonyms), then it will
 * output stacked shingles.
 *
 * <p>For example, the sentence "please divide this sentence into shingles"
 * might be tokenized into shingles "please divide", "divide this",
 * "this sentence", "sentence into", and "into shingles".
 *
 * <p>This filter handles position increments &gt; 1 by inserting filler tokens
 * (tokens with termtext "_").
 *
 * @lucene.experimental
 */
public final class FixedShingleFilter extends TokenFilter {

  // Pool of recycled Token objects, so we don't allocate a new capture
  // of the attribute source for every input token
  private final Deque<Token> tokenPool = new ArrayDeque<>();

  private final int shingleSize;
  private final String tokenSeparator;

  // Sentinel token inserted in place of position gaps (eg removed stopwords)
  private final Token gapToken = new Token(new AttributeSource());
  // Sentinel token marking the end of the wrapped stream
  private final Token endToken = new Token(new AttributeSource());

  private final PositionIncrementAttribute incAtt = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

  // The tokens making up the shingle currently being emitted; index 0 is the root
  private Token[] currentShingleTokens;
  private boolean inputStreamExhausted = false;

  /**
   * Creates a FixedShingleFilter over an input token stream, using " " as the
   * token separator and "_" as the filler token.
   *
   * @param input       the input stream
   * @param shingleSize the shingle size; must be at least 2
   * @throws IllegalArgumentException if shingleSize is less than 2
   */
  public FixedShingleFilter(TokenStream input, int shingleSize) {
    this(input, shingleSize, " ", "_");
  }

  /**
   * Creates a FixedShingleFilter over an input token stream.
   *
   * @param input          the input stream
   * @param shingleSize    the shingle size; must be at least 2
   * @param tokenSeparator a String to use as a token separator
   * @param fillerToken    a String to represent gaps in the input stream (due to eg stopwords)
   * @throws IllegalArgumentException if shingleSize is less than 2
   */
  public FixedShingleFilter(TokenStream input, int shingleSize, String tokenSeparator, String fillerToken) {
    super(input);
    // A shingle of size < 2 is either meaningless (the contract of this filter
    // is to never emit unigrams) or would break indexing into currentShingleTokens
    if (shingleSize < 2) {
      throw new IllegalArgumentException("Shingle size must be at least 2, got " + shingleSize);
    }
    this.shingleSize = shingleSize;
    this.tokenSeparator = tokenSeparator;
    this.gapToken.termAtt.setEmpty().append(fillerToken);
    this.currentShingleTokens = new Token[shingleSize];
  }

  @Override
  public boolean incrementToken() throws IOException {
    // A posInc of 0 means we are emitting another shingle stacked on the same
    // root position (eg because of synonyms in the input)
    int posInc = 0;
    if (nextShingle() == false) {
      // No more stacked shingles at this position; advance to the next root
      Token nextRoot = nextTokenInStream(currentShingleTokens[0]);
      if (nextRoot == endToken) {
        return false;
      }
      recycleToken(currentShingleTokens[0]);
      if (resetShingleRoot(nextRoot) == false) {
        return false;
      }
      posInc = currentShingleTokens[0].posInc();
    }
    clearAttributes();
    incAtt.setPositionIncrement(posInc);
    // End offset comes from the last real (non-gap) token in the shingle
    offsetAtt.setOffset(currentShingleTokens[0].startOffset(), lastTokenInShingle().endOffset());
    termAtt.setEmpty();
    termAtt.append(currentShingleTokens[0].term());
    typeAtt.setType("shingle");
    posLenAtt.setPositionLength(shingleSize);
    for (int i = 1; i < shingleSize; i++) {
      termAtt.append(tokenSeparator).append(currentShingleTokens[i].term());
    }
    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    this.tokenPool.clear();
    this.currentShingleTokens[0] = null;
    this.inputStreamExhausted = false;
  }

  @Override
  public void end() throws IOException {
    if (inputStreamExhausted == false) {
      finishInnerStream();
    }
    clearAttributes();
    // endToken's offsets were captured from the wrapped stream in finishInnerStream()
    this.offsetAtt.setOffset(0, endToken.endOffset());
  }

  private void finishInnerStream() throws IOException {
    input.end();
    inputStreamExhausted = true;
    // check for gaps at the end of the tokenstream
    endToken.posIncAtt.setPositionIncrement(this.incAtt.getPositionIncrement());
    OffsetAttribute inputOffsets = input.getAttribute(OffsetAttribute.class);
    endToken.offsetAtt.setOffset(inputOffsets.startOffset(), inputOffsets.endOffset());
  }

  /** Returns the last non-gap token of the current shingle */
  private Token lastTokenInShingle() {
    int lastTokenIndex = shingleSize - 1;
    while (currentShingleTokens[lastTokenIndex] == gapToken) {
      lastTokenIndex--;
    }
    return currentShingleTokens[lastTokenIndex];
  }

  /**
   * Rebuilds the shingle starting at the given root token, pulling subsequent
   * tokens from the graph and inserting gap tokens for position holes.
   *
   * @return false if the stream ends before a full shingle can be built
   */
  private boolean resetShingleRoot(Token token) throws IOException {
    this.currentShingleTokens[0] = token;
    for (int i = 1; i < shingleSize; i++) {
      Token current = nextTokenInGraph(this.currentShingleTokens[i - 1]);
      if (current == endToken) {
        if (endToken.posInc() + i >= shingleSize) {
          // end tokens are a special case, because their posIncs are always
          // due to stopwords.  Therefore, we can happily append gap tokens
          // to the end of the current shingle
          for (int j = i; j < shingleSize; j++) {
            this.currentShingleTokens[j] = gapToken;
          }
          return true;
        }
        return false;
      }
      if (current.posInc() > 1) {
        // insert gaps into the shingle list
        for (int j = 1; j < current.posInc(); j++) {
          this.currentShingleTokens[i] = gapToken;
          i++;
          if (i >= shingleSize) {
            return true;
          }
        }
      }
      this.currentShingleTokens[i] = current;
    }
    return true;
  }

  private boolean nextShingle() throws IOException {
    return currentShingleTokens[0] != null && advanceStack();
  }

  // check if the next token in the tokenstream is at the same position as this one
  private boolean lastInStack(Token token) throws IOException {
    Token next = nextTokenInStream(token);
    return next == endToken || next.posInc() != 0;
  }

  /**
   * Advances through stacked (posInc == 0) tokens to produce the next shingle
   * rooted at the current position, if any.
   */
  private boolean advanceStack() throws IOException {
    for (int i = shingleSize - 1; i >= 1; i--) {
      if (currentShingleTokens[i] != gapToken && lastInStack(currentShingleTokens[i]) == false) {
        currentShingleTokens[i] = nextTokenInStream(currentShingleTokens[i]);
        for (int j = i + 1; j < shingleSize; j++) {
          currentShingleTokens[j] = nextTokenInGraph(currentShingleTokens[j - 1]);
        }
        return true;
      }
    }
    return false;
  }

  /** Returns a pooled Token if available, otherwise captures a new one */
  private Token newToken() {
    Token token = tokenPool.size() == 0 ? new Token(this.cloneAttributes()) : tokenPool.removeFirst();
    token.reset(this);
    return token;
  }

  private void recycleToken(Token token) {
    if (token == null) {
      return;
    }
    token.nextToken = null;
    tokenPool.add(token);
  }

  // for testing
  int instantiatedTokenCount() {
    int tokenCount = tokenPool.size() + 1;
    if (currentShingleTokens[0] == endToken || currentShingleTokens[0] == null) {
      return tokenCount;
    }
    for (Token t = currentShingleTokens[0]; t != endToken && t != null; t = t.nextToken) {
      tokenCount++;
    }
    return tokenCount;
  }

  /** Returns the next token at a strictly later position, skipping stacked tokens */
  private Token nextTokenInGraph(Token token) throws IOException {
    do {
      token = nextTokenInStream(token);
      if (token == endToken) {
        return endToken;
      }
    } while (token.posInc() == 0);
    return token;
  }

  /**
   * Returns the token following the given one in the stream, reading ahead from
   * the wrapped input if necessary.  Tokens already read are linked via
   * {@link Token#nextToken} so they can be revisited when emitting stacked shingles.
   */
  private Token nextTokenInStream(Token token) throws IOException {
    if (token != null && token.nextToken != null) {
      return token.nextToken;
    }
    if (input.incrementToken() == false) {
      finishInnerStream();
      if (token == null) {
        return endToken;
      }
      else {
        token.nextToken = endToken;
        return endToken;
      }
    }
    if (token == null) {
      return newToken();
    }
    token.nextToken = newToken();
    return token.nextToken;
  }

  /** A captured snapshot of an input token's attributes, linkable into a lookahead list */
  private static class Token {
    final AttributeSource attSource;
    final PositionIncrementAttribute posIncAtt;
    final CharTermAttribute termAtt;
    final OffsetAttribute offsetAtt;
    // The token following this one in the input stream, or null if not yet read
    Token nextToken;

    Token(AttributeSource attSource) {
      this.attSource = attSource;
      this.posIncAtt = attSource.addAttribute(PositionIncrementAttribute.class);
      this.termAtt = attSource.addAttribute(CharTermAttribute.class);
      this.offsetAtt = attSource.addAttribute(OffsetAttribute.class);
    }

    int posInc() {
      return this.posIncAtt.getPositionIncrement();
    }

    CharSequence term() {
      return this.termAtt;
    }

    int startOffset() {
      return this.offsetAtt.startOffset();
    }

    int endOffset() {
      return this.offsetAtt.endOffset();
    }

    /** Re-captures this Token's attribute values from the given source */
    void reset(AttributeSource attSource) {
      attSource.copyTo(this.attSource);
      this.nextToken = null;
    }

    @Override
    public String toString() {
      return term() + "(" + startOffset() + "," + endOffset() + ") " + posInc();
    }
  }
}

View File

@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.shingle;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
 * Factory for {@link FixedShingleFilter}.
 *
 * <p>Supported parameters:
 * <ul>
 * <li>shingleSize - how many tokens should be combined into each shingle (default: 2)
 * <li>tokenSeparator - how tokens should be joined together in the shingle (default: space)
 * <li>fillerToken - what should be added in place of stop words (default: _ )
 * </ul>
 */
public class FixedShingleFilterFactory extends TokenFilterFactory {

  private final int size;
  private final String separator;
  private final String filler;

  /** Creates a new FixedShingleFilterFactory from the given argument map */
  public FixedShingleFilterFactory(Map<String, String> args) {
    super(args);
    this.size = getInt(args, "shingleSize", 2);
    this.separator = get(args, "tokenSeparator", " ");
    this.filler = get(args, "fillerToken", "_");
  }

  @Override
  public TokenStream create(TokenStream input) {
    return new FixedShingleFilter(input, size, separator, filler);
  }
}

View File

@ -101,6 +101,7 @@ org.apache.lucene.analysis.pt.PortugueseStemFilterFactory
org.apache.lucene.analysis.reverse.ReverseStringFilterFactory
org.apache.lucene.analysis.ru.RussianLightStemFilterFactory
org.apache.lucene.analysis.shingle.ShingleFilterFactory
org.apache.lucene.analysis.shingle.FixedShingleFilterFactory
org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory
org.apache.lucene.analysis.sr.SerbianNormalizationFilterFactory
org.apache.lucene.analysis.standard.ClassicFilterFactory

View File

@ -0,0 +1,200 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.shingle;
import java.io.IOException;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
public class FixedShingleFilterTest extends BaseTokenStreamTestCase {

  /** Returns a fresh canned stream for "please divide this sentence into shingles" */
  private static TokenStream sentenceStream() {
    return new CannedTokenStream(
        new Token("please", 0, 6),
        new Token("divide", 7, 13),
        new Token("this", 14, 18),
        new Token("sentence", 19, 27),
        new Token("into", 28, 32),
        new Token("shingles", 33, 41));
  }

  /** Returns a fresh canned stream for the graph "(b|a) c (b|a) d" with stacked synonyms */
  private static TokenStream graphStream() {
    return new CannedTokenStream(
        new Token("b", 0, 1),
        new Token("a", 0, 0, 1),
        new Token("c", 2, 3),
        new Token("b", 4, 5),
        new Token("a", 0, 4, 5),
        new Token("d", 6, 7));
  }

  public void testBiGramFilter() throws IOException {
    assertTokenStreamContents(new FixedShingleFilter(sentenceStream(), 2),
        new String[]{"please divide", "divide this", "this sentence", "sentence into", "into shingles"},
        new int[]{0, 7, 14, 19, 28},
        new int[]{13, 18, 27, 32, 41},
        new String[]{"shingle", "shingle", "shingle", "shingle", "shingle"},
        new int[]{1, 1, 1, 1, 1},
        new int[]{2, 2, 2, 2, 2});
  }

  public void testBiGramFilterWithAltSeparator() throws IOException {
    assertTokenStreamContents(new FixedShingleFilter(sentenceStream(), 2, "<SEP>", "_"),
        new String[]{"please<SEP>divide", "divide<SEP>this", "this<SEP>sentence", "sentence<SEP>into", "into<SEP>shingles"},
        new int[]{0, 7, 14, 19, 28},
        new int[]{13, 18, 27, 32, 41},
        new String[]{"shingle", "shingle", "shingle", "shingle", "shingle"},
        new int[]{1, 1, 1, 1, 1});
  }

  public void testTriGramFilter() throws IOException {
    assertTokenStreamContents(new FixedShingleFilter(sentenceStream(), 3),
        new String[]{"please divide this", "divide this sentence", "this sentence into", "sentence into shingles"});
  }

  public void testShingleSizeGreaterThanTokenstreamLength() throws IOException {
    TokenStream shingles = new FixedShingleFilter(new CannedTokenStream(
        new Token("please", 0, 6),
        new Token("divide", 7, 13)), 3);
    shingles.reset();
    assertFalse(shingles.incrementToken());
  }

  public void testWithStopwords() throws IOException {
    TokenStream stream = new CannedTokenStream(
        new Token("please", 0, 6),
        new Token("divide", 7, 13),
        new Token("sentence", 2, 19, 27),
        new Token("shingles", 2, 33, 41));
    assertTokenStreamContents(new FixedShingleFilter(stream, 3),
        new String[]{"please divide _", "divide _ sentence", "sentence _ shingles"},
        new int[]{0, 7, 19},
        new int[]{13, 27, 41},
        new String[]{"shingle", "shingle", "shingle"},
        new int[]{1, 1, 2});
  }

  public void testConsecutiveStopwords() throws IOException {
    TokenStream stream = new CannedTokenStream(
        new Token("b", 2, 2, 3),
        new Token("c", 4, 5),
        new Token("d", 6, 7),
        new Token("b", 3, 12, 13),
        new Token("c", 14, 15));
    assertTokenStreamContents(new FixedShingleFilter(stream, 4),
        new String[]{"b c d _", "c d _ _", "d _ _ b"},
        new int[]{2, 4, 6},
        new int[]{7, 7, 13},
        new int[]{2, 1, 1});
  }

  public void testTrailingStopwords() throws IOException {
    // finalPosInc of 1 and finalOffset of 7 simulate a single trailing stopword
    TokenStream stream = new CannedTokenStream(1, 7,
        new Token("b", 0, 1),
        new Token("c", 2, 3),
        new Token("d", 4, 5));
    assertTokenStreamContents(new FixedShingleFilter(stream, 3),
        new String[]{"b c d", "c d _"},
        new int[]{0, 2},
        new int[]{5, 5},
        new int[]{1, 1});
  }

  public void testMultipleTrailingStopwords() throws IOException {
    // finalPosInc of 2 and finalOffset of 9 simulate two trailing stopwords
    TokenStream stream = new CannedTokenStream(2, 9,
        new Token("b", 0, 1),
        new Token("c", 2, 3),
        new Token("d", 4, 5));
    assertTokenStreamContents(new FixedShingleFilter(stream, 3),
        new String[]{"b c d", "c d _", "d _ _"},
        new int[]{0, 2, 4},
        new int[]{5, 5, 5},
        new int[]{1, 1, 1});
  }

  public void testIncomingGraphs() throws IOException {
    assertTokenStreamContents(new FixedShingleFilter(graphStream(), 2),
        new String[]{"b c", "a c", "c b", "c a", "b d", "a d"},
        new int[]{0, 0, 2, 2, 4, 4},
        new int[]{3, 3, 5, 5, 7, 7},
        new int[]{1, 0, 1, 0, 1, 0});
  }

  public void testShinglesSpanningGraphs() throws IOException {
    assertTokenStreamContents(new FixedShingleFilter(graphStream(), 3),
        new String[]{"b c b", "b c a", "a c b", "a c a", "c b d", "c a d"},
        new int[]{0, 0, 0, 0, 2, 2},
        new int[]{5, 5, 5, 5, 7, 7},
        new int[]{1, 0, 0, 0, 1, 0});
  }
}

View File

@ -341,6 +341,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, null); assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, null);
} }
/**
 * Asserts that the given TokenStream produces the expected terms, offsets, types,
 * position increments and position lengths.  Delegates to the 8-argument overload,
 * passing null for the remaining argument.
 */
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int[] posLengths) throws IOException {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, null);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException { public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException {
assertTokenStreamContents(ts, output, null, null, null, null, null, null); assertTokenStreamContents(ts, output, null, null, null, null, null, null);
} }

View File

@ -75,6 +75,13 @@ public class Token extends PackedTokenAttributeImpl implements FlagsAttribute, P
setOffset(start, end); setOffset(start, end);
} }
/**
 * Constructs a Token holding the given term text, position increment, and
 * start and end offsets.
 *
 * @param text   the term text to append to this token
 * @param posInc the position increment
 * @param start  the start offset
 * @param end    the end offset
 */
public Token(CharSequence text, int posInc, int start, int end) {
  setPositionIncrement(posInc);
  append(text);
  setOffset(start, end);
}
/**
 * {@inheritDoc}
 * @see FlagsAttribute