Fix onlyLongestMatch in DictionaryCompoundWordTokenFilter

This commit addresses an issue with the onlyLongestMatch flag in DictionaryCompoundWordTokenFilter. Prior to this fix, the flag only suppressed shorter dictionary matches that start at the same position as a longer one: with both "oran" and "orangen" in the dictionary, the filter would return "orangen" but not "oran".

With this fix, the filter also suppresses submatches that do not start at the beginning of the longer match, such as "angen" inside "orangen".
Matthias Osswald 2023-08-02 16:40:02 +02:00
parent 5e725964a0
commit b7cad98c6a
2 changed files with 71 additions and 1 deletion
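
To make the new behavior concrete, here is a minimal, self-contained usage sketch. The dictionary and constants mirror the new tests below; the demo class name, the WhitespaceTokenizer, and the printing loop are illustrative additions and not part of the commit.

import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class OnlyLongestMatchDemo {
  public static void main(String[] args) throws Exception {
    // Dictionary mirrors the new test: "ngen" is a submatch of "orangen"
    // that does not share its start offset.
    CharArraySet dict =
        new CharArraySet(Arrays.asList("ngen", "orangen", "schoko", "schokolade"), true);

    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("orangenschokolade"));

    // onlyLongestMatch = true; with this fix, "ngen" is suppressed because
    // it is contained in the already-emitted "orangen".
    DictionaryCompoundWordTokenFilter filter =
        new DictionaryCompoundWordTokenFilter(
            tokenizer,
            dict,
            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
            true);

    CharTermAttribute term = filter.addAttribute(CharTermAttribute.class);
    filter.reset();
    while (filter.incrementToken()) {
      System.out.println(term); // orangenschokolade, orangen, schokolade
    }
    filter.end();
    filter.close();
  }
}

Before this fix, the same program would also have printed "ngen", since that submatch does not share a start position with "orangen".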

org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java

@@ -88,7 +88,16 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase
         }
       }
       if (this.onlyLongestMatch && longestMatchToken != null) {
-        tokens.add(longestMatchToken);
+        boolean contained = false;
+        for (CompoundToken addedToken : tokens) {
+          if (addedToken.txt.toString().contains(longestMatchToken.txt)) {
+            contained = true;
+            break;
+          }
+        }
+        if (!contained) {
+          tokens.add(longestMatchToken);
+        }
       }
     }
   }
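
In essence, the patched block keeps the longest match only if no token already collected for the current word textually contains it. Below is a standalone restatement of that check; the helper name and the use of plain Strings are mine, while the real code iterates CompoundToken objects and compares their txt contents.

import java.util.ArrayList;
import java.util.List;

class ContainmentCheckSketch {
  // Mirrors the patched block: add the longest match only if no
  // previously collected token already contains its text.
  static void addIfNotContained(List<String> tokens, String longestMatch) {
    for (String added : tokens) {
      if (added.contains(longestMatch)) {
        return; // submatch of an earlier token: suppressed
      }
    }
    tokens.add(longestMatch);
  }

  public static void main(String[] args) {
    List<String> tokens = new ArrayList<>();
    addIfNotContained(tokens, "orangen");
    addIfNotContained(tokens, "ngen");       // dropped: contained in "orangen"
    addIfNotContained(tokens, "schokolade");
    System.out.println(tokens);              // [orangen, schokolade]
  }
}

Note that the comparison is purely textual rather than offset-based, so any later match whose text occurs inside an earlier token's text is suppressed.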

org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilterFactory.java

@@ -18,6 +18,8 @@ package org.apache.lucene.analysis.compound;
 import java.io.Reader;
 import java.io.StringReader;
+import java.util.Arrays;
+import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.tests.analysis.BaseTokenStreamFactoryTestCase;
 import org.apache.lucene.tests.analysis.MockTokenizer;
@@ -25,6 +27,9 @@ import org.apache.lucene.tests.analysis.MockTokenizer;
 /** Simple tests to ensure the Dictionary compound filter factory is working. */
 public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenStreamFactoryTestCase {
+  private static CharArraySet makeDictionary(String... dictionary) {
+    return new CharArraySet(Arrays.asList(dictionary), true);
+  }

   /** Ensure the filter actually decompounds text. */
   public void testDecompounding() throws Exception {
     Reader reader = new StringReader("I like to play softball");
@@ -37,6 +42,62 @@ public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenStreamFactoryTestCase
         stream, new String[] {"I", "like", "to", "play", "softball", "soft", "ball"});
   }
+
+  /** Ensure subtokens are found in the token and indexed at position increment zero. */
+  public void testDecompounderSubmatches() throws Exception {
+    CharArraySet dict = makeDictionary("ora", "orangen", "schoko", "schokolade");
+    DictionaryCompoundWordTokenFilter tf =
+        new DictionaryCompoundWordTokenFilter(
+            whitespaceMockTokenizer("ich will orangenschokolade haben"),
+            dict,
+            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+            CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+            CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
+            false);
+    assertTokenStreamContents(
+        tf,
+        new String[] {
+          "ich", "will", "orangenschokolade", "ora", "orangen", "schoko", "schokolade", "haben"
+        },
+        new int[] {1, 1, 1, 0, 0, 0, 0, 1});
+  }
+
+  /** Ensure subtokens are found in the token and only the longest match sharing the same start is returned. */
+  public void testDecompounderSubmatchesOnlyLongestMatch() throws Exception {
+    CharArraySet dict = makeDictionary("ora", "orangen", "schoko", "schokolade");
+    DictionaryCompoundWordTokenFilter tf =
+        new DictionaryCompoundWordTokenFilter(
+            whitespaceMockTokenizer("ich will orangenschokolade haben"),
+            dict,
+            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+            CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+            CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
+            true);
+    assertTokenStreamContents(
+        tf,
+        new String[] {"ich", "will", "orangenschokolade", "orangen", "schokolade", "haben"},
+        new int[] {1, 1, 1, 0, 0, 1});
+  }
+
+  /** Ensure subtokens are found in the token and only the longest match is returned, even when the submatch does not share its start. */
+  public void testDecompounderPostSubmatchesOnlyLongestMatch() throws Exception {
+    CharArraySet dict = makeDictionary("ngen", "orangen", "schoko", "schokolade");
+    DictionaryCompoundWordTokenFilter tf =
+        new DictionaryCompoundWordTokenFilter(
+            whitespaceMockTokenizer("ich will orangenschokolade haben"),
+            dict,
+            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+            CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+            CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
+            true);
+    assertTokenStreamContents(
+        tf,
+        new String[] {"ich", "will", "orangenschokolade", "orangen", "schokolade", "haben"},
+        new int[] {1, 1, 1, 0, 0, 1});
+  }

   /** Test that bogus arguments result in exception */
   public void testBogusArguments() throws Exception {
     IllegalArgumentException expected =