Add Option to Set Subtoken Position Increment for Dictionary Decompounder

This pull request adds an option to Lucene's DictionaryCompoundWordTokenFilter (the dictionary decompounder) to set the position increment of emitted subtokens to one. This is needed for AND searches that involve subtokens.

Currently the filter always emits subtokens with a position increment of zero, i.e. at the same position as the original compound token. With this change, users can set subtokenPositionIncrement to one so that each subtoken occupies its own position. For example, an Elasticsearch match clause using the AND operator to search for 'orangenschokolade', with 'orangen' and 'schokolade' in the dictionary, then effectively searches for 'orangen AND schokolade'.

By default the filter emits the original compound token along with its subtokens; this behavior is unchanged when subtokenPositionIncrement is zero. When it is set to one, only the individual subtokens are emitted and the original compound token is dropped.
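
To make the new behavior concrete, here is a minimal sketch of driving the filter directly with the new parameter. It assumes this patch is applied (the seven-argument constructor only exists with this change); the class name SubtokenIncrementDemo and the sample dictionary are just for illustration.

import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class SubtokenIncrementDemo {
  public static void main(String[] args) throws Exception {
    CharArraySet dict = new CharArraySet(Arrays.asList("orangen", "schokolade"), true);

    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("orangenschokolade"));

    // subtokenPositionIncrement = 1: emit only the subtokens, each at its own position
    TokenStream ts =
        new DictionaryCompoundWordTokenFilter(
            tokenizer,
            dict,
            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
            false,
            1);

    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term + " (posInc=" + posInc.getPositionIncrement() + ")");
    }
    ts.end();
    ts.close();
  }
}

With subtokenPositionIncrement set to 1 this prints 'orangen' and 'schokolade', each with a position increment of 1; with the default of 0 it would print 'orangenschokolade' followed by the two subtokens at increment 0.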
Matthias Osswald 2023-07-31 14:47:26 +02:00
parent 5e725964a0
commit 3af0c6872a
6 changed files with 106 additions and 33 deletions

CompoundWordTokenFilterBase.java

@@ -42,6 +42,7 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
protected final int minSubwordSize;
protected final int maxSubwordSize;
protected final boolean onlyLongestMatch;
protected final int subtokenPositionIncrement;
protected final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
protected final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@@ -58,7 +59,8 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
DEFAULT_MIN_WORD_SIZE,
DEFAULT_MIN_SUBWORD_SIZE,
DEFAULT_MAX_SUBWORD_SIZE,
onlyLongestMatch);
onlyLongestMatch,
0);
}
protected CompoundWordTokenFilterBase(TokenStream input, CharArraySet dictionary) {
@@ -68,7 +70,20 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
DEFAULT_MIN_WORD_SIZE,
DEFAULT_MIN_SUBWORD_SIZE,
DEFAULT_MAX_SUBWORD_SIZE,
false);
false,
0);
}
protected CompoundWordTokenFilterBase(
TokenStream input, CharArraySet dictionary, int subtokenPositionIncrement) {
this(
input,
dictionary,
DEFAULT_MIN_WORD_SIZE,
DEFAULT_MIN_SUBWORD_SIZE,
DEFAULT_MAX_SUBWORD_SIZE,
false,
subtokenPositionIncrement);
}
protected CompoundWordTokenFilterBase(
@@ -77,7 +92,8 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
int minWordSize,
int minSubwordSize,
int maxSubwordSize,
boolean onlyLongestMatch) {
boolean onlyLongestMatch,
int subtokenPositionIncrement) {
super(input);
this.tokens = new LinkedList<>();
if (minWordSize < 0) {
@@ -91,6 +107,10 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
if (maxSubwordSize < 0) {
throw new IllegalArgumentException("maxSubwordSize cannot be negative");
}
if (subtokenPositionIncrement != 0 && subtokenPositionIncrement != 1) {
throw new IllegalArgumentException("subtokenPositionIncrement must either be 0 or 1");
}
this.subtokenPositionIncrement = subtokenPositionIncrement;
this.maxSubwordSize = maxSubwordSize;
this.onlyLongestMatch = onlyLongestMatch;
this.dictionary = dictionary;
@@ -98,31 +118,40 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
@Override
public final boolean incrementToken() throws IOException {
if (!tokens.isEmpty()) {
return processSubtokens();
}
current = null; // For safety
if (!input.incrementToken()) {
return false;
}
if (termAtt.length() >= minWordSize) {
decompose();
if (!tokens.isEmpty()) {
current = captureState();
if (subtokenPositionIncrement == 1) {
// provided that we have sub-tokens with increment one,
// we don't want to write the original token into the output
return processSubtokens();
}
} else if (subtokenPositionIncrement == 1) {
current = captureState();
}
}
return true; // Return original token
}
private boolean processSubtokens() {
if (!tokens.isEmpty()) {
assert current != null;
CompoundToken token = tokens.removeFirst();
restoreState(current); // keep all other attributes untouched
termAtt.setEmpty().append(token.txt);
offsetAtt.setOffset(token.startOffset, token.endOffset);
posIncAtt.setPositionIncrement(0);
posIncAtt.setPositionIncrement(this.subtokenPositionIncrement);
return true;
}
current = null; // not really needed, but for safety
if (input.incrementToken()) {
// Only words longer than minWordSize get processed
if (termAtt.length() >= this.minWordSize) {
decompose();
// only capture the state if we really need it for producing new tokens
if (!tokens.isEmpty()) {
current = captureState();
}
}
// return original token:
return true;
} else {
return false;
}
return false;
}
/**

DictionaryCompoundWordTokenFilter.java

@@ -50,6 +50,7 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase
* @param minSubwordSize only subwords longer than this get to the output stream
* @param maxSubwordSize only subwords shorter than this get to the output stream
* @param onlyLongestMatch Add only the longest matching subword to the stream
* @param subtokenPositionIncrement position increment applied to emitted subtokens; must be 0 or 1
*/
public DictionaryCompoundWordTokenFilter(
TokenStream input,
@@ -57,11 +58,16 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase
int minWordSize,
int minSubwordSize,
int maxSubwordSize,
boolean onlyLongestMatch) {
super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
if (dictionary == null) {
throw new IllegalArgumentException("dictionary must not be null");
}
boolean onlyLongestMatch,
int subtokenPositionIncrement) {
super(
input,
dictionary,
minWordSize,
minSubwordSize,
maxSubwordSize,
onlyLongestMatch,
subtokenPositionIncrement);
}
@Override

DictionaryCompoundWordTokenFilterFactory.java

@@ -51,7 +51,7 @@ public class DictionaryCompoundWordTokenFilterFactory extends TokenFilterFactory
private final int minSubwordSize;
private final int maxSubwordSize;
private final boolean onlyLongestMatch;
private final int subtokenPositionIncrement;
/** Creates a new DictionaryCompoundWordTokenFilterFactory */
public DictionaryCompoundWordTokenFilterFactory(Map<String, String> args) {
super(args);
@@ -62,6 +62,7 @@ public class DictionaryCompoundWordTokenFilterFactory extends TokenFilterFactory
maxSubwordSize =
getInt(args, "maxSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
onlyLongestMatch = getBoolean(args, "onlyLongestMatch", true);
subtokenPositionIncrement = getInt(args, "subtokenPositionIncrement", 0);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -84,6 +85,12 @@ public class DictionaryCompoundWordTokenFilterFactory extends TokenFilterFactory
return input;
}
return new DictionaryCompoundWordTokenFilter(
input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
input,
dictionary,
minWordSize,
minSubwordSize,
maxSubwordSize,
onlyLongestMatch,
subtokenPositionIncrement);
}
}
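
As a usage note, the factory change above means subtokenPositionIncrement can be passed like any other factory argument. Below is a hypothetical sketch via Lucene's CustomAnalyzer; it assumes this patch is applied, that the factory is registered under the SPI name dictionaryCompoundWord, and the config directory and compound-dict.txt dictionary file are placeholders.

import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class DecompounderFactorySketch {
  public static void main(String[] args) throws Exception {
    // Placeholder config dir and dictionary file; the dictionary must be resolvable
    // by the resource loader so that inform() can load it before create() is called.
    Analyzer analyzer =
        CustomAnalyzer.builder(Paths.get("conf"))
            .withTokenizer("whitespace")
            .addTokenFilter(
                "dictionaryCompoundWord",
                "dictionary", "compound-dict.txt",
                "subtokenPositionIncrement", "1")
            .build();

    try (TokenStream ts = analyzer.tokenStream("field", "orangenschokolade")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term); // with increment 1, only the subtokens are emitted
      }
      ts.end();
    }
  }
}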

HyphenationCompoundWordTokenFilter.java

@@ -113,7 +113,7 @@ public class HyphenationCompoundWordTokenFilter extends CompoundWordTokenFilterBase
boolean onlyLongestMatch,
boolean noSubMatches,
boolean noOverlappingMatches) {
super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch, 0);
this.hyphenator = Objects.requireNonNull(hyphenator, "hyphenator");
this.noSubMatches = noSubMatches;

TestCompoundWordTokenFilter.java

@@ -251,7 +251,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
true);
true,
0);
assertTokenStreamContents(
tf,
@@ -275,7 +276,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
false);
false,
0);
assertTokenStreamContents(
tf,
@@ -297,7 +299,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
false);
false,
0);
// since "d" is shorter than the minimum subword size, it should not be added to the token
// stream
@@ -323,7 +326,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
false);
false,
0);
CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class);
tf.reset();
@@ -351,7 +355,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
false);
false,
0);
MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
stream.reset();
while (stream.incrementToken()) {

TestDictionaryCompoundWordTokenFilterFactory.java

@@ -18,6 +18,8 @@ package org.apache.lucene.analysis.compound;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.tests.analysis.BaseTokenStreamFactoryTestCase;
@@ -25,6 +27,11 @@ import org.apache.lucene.tests.analysis.MockTokenizer;
/** Simple tests to ensure the Dictionary compound filter factory is working. */
public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenStreamFactoryTestCase {
private static CharArraySet makeDictionary(String... dictionary) {
return new CharArraySet(Arrays.asList(dictionary), true);
}
/** Ensure the filter actually decompounds text. */
public void testDecompounding() throws Exception {
Reader reader = new StringReader("I like to play softball");
@@ -37,6 +44,25 @@ public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenStreamFactoryTestCase
stream, new String[] {"I", "like", "to", "play", "softball", "soft", "ball"});
}
/** Ensure subtokens can be emitted with a position increment of 1 */
public void testDecompounderWithSubtokenIncrement() throws Exception {
CharArraySet dict = makeDictionary("læse", "hest");
DictionaryCompoundWordTokenFilter tf =
new DictionaryCompoundWordTokenFilter(
whitespaceMockTokenizer("min veninde som er lidt af en læsehest"),
dict,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
false,
1);
assertTokenStreamContents(
tf,
new String[] {"min", "veninde", "som", "er", "lidt", "af", "en", "læse", "hest"},
new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1});
}
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
IllegalArgumentException expected =