From b7cad98c6afdb87ccd31d141b06582bfc0c7a098 Mon Sep 17 00:00:00 2001 From: Matthias Osswald Date: Wed, 2 Aug 2023 16:40:02 +0200 Subject: [PATCH] Fix onlyLongestMatch in DictionaryCompoundWordTokenFilter The commit addresses an issue with the onlyLongestMatch flag in the DictionaryCompoundWordTokenFilter. Prior to this fix, when onlyLongestMatch was set to true, the filter would return a match for "orangen" but not for "oran" when both were present in the dictionary. With this fix, the filter now also correctly handles cases where the submatch is not at the start of the match, such as "orangen" and "angen". --- .../DictionaryCompoundWordTokenFilter.java | 11 +++- ...tionaryCompoundWordTokenFilterFactory.java | 61 +++++++++++++++++++ 2 files changed, 71 insertions(+), 1 deletion(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java index c6278a80a1f..1923cd933b6 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java @@ -88,7 +88,16 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa } } if (this.onlyLongestMatch && longestMatchToken != null) { - tokens.add(longestMatchToken); + boolean contained = false; + for (CompoundToken addedToken : tokens) { + if (addedToken.txt.toString().contains(longestMatchToken.txt)) { + contained = true; + break; + } + } + if (!contained) { + tokens.add(longestMatchToken); + } } } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilterFactory.java index 
cfe2f432b65..95340a3d892 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilterFactory.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilterFactory.java @@ -18,6 +18,8 @@ package org.apache.lucene.analysis.compound; import java.io.Reader; import java.io.StringReader; +import java.util.Arrays; +import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.tests.analysis.BaseTokenStreamFactoryTestCase; @@ -25,6 +27,9 @@ import org.apache.lucene.tests.analysis.MockTokenizer; /** Simple tests to ensure the Dictionary compound filter factory is working. */ public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenStreamFactoryTestCase { + private static CharArraySet makeDictionary(String... dictionary) { + return new CharArraySet(Arrays.asList(dictionary), true); + } /** Ensure the filter actually decompounds text. 
*/ public void testDecompounding() throws Exception { Reader reader = new StringReader("I like to play softball"); @@ -37,6 +42,62 @@ public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenStrea stream, new String[] {"I", "like", "to", "play", "softball", "soft", "ball"}); } + /** Ensure subtokens are found in token and indexed zero * */ + public void testDecompounderSubmatches() throws Exception { + CharArraySet dict = makeDictionary("ora", "orangen", "schoko", "schokolade"); + + DictionaryCompoundWordTokenFilter tf = + new DictionaryCompoundWordTokenFilter( + whitespaceMockTokenizer("ich will orangenschokolade haben"), + dict, + CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, + false); + assertTokenStreamContents( + tf, + new String[] { + "ich", "will", "orangenschokolade", "ora", "orangen", "schoko", "schokolade", "haben" + }, + new int[] {1, 1, 1, 0, 0, 0, 0, 1}); + } + + /** Ensure subtokens are found in token and only longest match is returned with same start * */ + public void testDecompounderSubmatchesOnlyLongestMatch() throws Exception { + CharArraySet dict = makeDictionary("ora", "orangen", "schoko", "schokolade"); + + DictionaryCompoundWordTokenFilter tf = + new DictionaryCompoundWordTokenFilter( + whitespaceMockTokenizer("ich will orangenschokolade haben"), + dict, + CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, + true); + assertTokenStreamContents( + tf, + new String[] {"ich", "will", "orangenschokolade", "orangen", "schokolade", "haben"}, + new int[] {1, 1, 1, 0, 0, 1}); + } + + /** Ensure subtokens are found in token and only longest match is returned without same start * */ + public void testDecompounderPostSubmatchesOnlyLongestMatch() throws Exception { + CharArraySet dict = makeDictionary("ngen",
"orangen", "schoko", "schokolade"); + + DictionaryCompoundWordTokenFilter tf = + new DictionaryCompoundWordTokenFilter( + whitespaceMockTokenizer("ich will orangenschokolade haben"), + dict, + CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, + true); + assertTokenStreamContents( + tf, + new String[] {"ich", "will", "orangenschokolade", "orangen", "schokolade", "haben"}, + new int[] {1, 1, 1, 0, 0, 1}); + } + /** Test that bogus arguments result in exception */ public void testBogusArguments() throws Exception { IllegalArgumentException expected =