mirror of https://github.com/apache/lucene.git
Fix onlyLongestMatch in DictionaryCompoundWordTokenFilter
This commit addresses an issue with the onlyLongestMatch flag in DictionaryCompoundWordTokenFilter. Prior to this fix, the flag only suppressed shorter matches sharing the longest match's start position: with "ora" and "orangen" both in the dictionary, the filter correctly returned "orangen" but not "ora", yet a submatch starting elsewhere inside the longest match, such as "ngen" inside "orangen", was still emitted. With this fix, the filter also suppresses submatches that do not share the longest match's start position.
parent 5e725964a0
commit b7cad98c6a
DictionaryCompoundWordTokenFilter.java

@@ -88,8 +88,17 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase
        }
      }
      if (this.onlyLongestMatch && longestMatchToken != null) {
        boolean contained = false;
        for (CompoundToken addedToken : tokens) {
          if (addedToken.txt.toString().contains(longestMatchToken.txt)) {
            contained = true;
            break;
          }
        }
        if (!contained) {
          tokens.add(longestMatchToken);
        }
      }
    }
  }
}
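In plain terms, the new block defers adding longestMatchToken until the inner matching loop has finished, and drops it if any token already collected for the current compound contains its text. A standalone sketch of that containment check, using plain strings instead of Lucene's CompoundToken (class and method names here are hypothetical):

import java.util.ArrayList;
import java.util.List;

class ContainmentCheckSketch { // hypothetical, illustrates the fix's logic only
  // Add the candidate only if no previously emitted token already contains it.
  static void addIfNotContained(List<String> emitted, String candidate) {
    for (String token : emitted) {
      if (token.contains(candidate)) {
        return; // candidate is a submatch of an emitted token; drop it
      }
    }
    emitted.add(candidate);
  }

  public static void main(String[] args) {
    List<String> emitted = new ArrayList<>();
    addIfNotContained(emitted, "orangen");
    addIfNotContained(emitted, "ngen"); // dropped: contained in "orangen"
    System.out.println(emitted); // prints [orangen]
  }
}

Note that the suppression is purely textual: because the filter compares token texts with String.contains, a candidate is dropped whenever its characters appear anywhere inside an already-emitted token, regardless of offsets.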
TestDictionaryCompoundWordTokenFilterFactory.java

@@ -18,6 +18,8 @@ package org.apache.lucene.analysis.compound;

import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.tests.analysis.BaseTokenStreamFactoryTestCase;
@@ -25,6 +27,9 @@ import org.apache.lucene.tests.analysis.MockTokenizer;

/** Simple tests to ensure the Dictionary compound filter factory is working. */
public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenStreamFactoryTestCase {

  private static CharArraySet makeDictionary(String... dictionary) {
    return new CharArraySet(Arrays.asList(dictionary), true);
  }

  /** Ensure the filter actually decompounds text. */
  public void testDecompounding() throws Exception {
    Reader reader = new StringReader("I like to play softball");
@@ -37,6 +42,62 @@ public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenStreamFactoryTestCase {
        stream, new String[] {"I", "like", "to", "play", "softball", "soft", "ball"});
  }

  /** Ensure subtokens are found in the token and indexed at position increment zero. */
  public void testDecompounderSubmatches() throws Exception {
    CharArraySet dict = makeDictionary("ora", "orangen", "schoko", "schokolade");

    DictionaryCompoundWordTokenFilter tf =
        new DictionaryCompoundWordTokenFilter(
            whitespaceMockTokenizer("ich will orangenschokolade haben"),
            dict,
            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
            false);
    assertTokenStreamContents(
        tf,
        new String[] {
          "ich", "will", "orangenschokolade", "ora", "orangen", "schoko", "schokolade", "haben"
        },
        new int[] {1, 1, 1, 0, 0, 0, 0, 1});
  }

  /** Ensure only the longest of several matches sharing a start position is returned. */
  public void testDecompounderSubmatchesOnlyLongestMatch() throws Exception {
    CharArraySet dict = makeDictionary("ora", "orangen", "schoko", "schokolade");

    DictionaryCompoundWordTokenFilter tf =
        new DictionaryCompoundWordTokenFilter(
            whitespaceMockTokenizer("ich will orangenschokolade haben"),
            dict,
            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
            true);
    assertTokenStreamContents(
        tf,
        new String[] {"ich", "will", "orangenschokolade", "orangen", "schokolade", "haben"},
        new int[] {1, 1, 1, 0, 0, 1});
  }

  /** Ensure only the longest match is returned even when a submatch does not share its start position. */
  public void testDecompounderPostSubmatchesOnlyLongestMatch() throws Exception {
    CharArraySet dict = makeDictionary("ngen", "orangen", "schoko", "schokolade");

    DictionaryCompoundWordTokenFilter tf =
        new DictionaryCompoundWordTokenFilter(
            whitespaceMockTokenizer("ich will orangenschokolade haben"),
            dict,
            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
            true);
    assertTokenStreamContents(
        tf,
        new String[] {"ich", "will", "orangenschokolade", "orangen", "schokolade", "haben"},
        new int[] {1, 1, 1, 0, 0, 1});
  }

  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    IllegalArgumentException expected =
        …