mirror of https://github.com/apache/lucene.git
Fix onlyLongestMatch in DictionaryCompoundWordTokenFilter
This commit addresses an issue with the onlyLongestMatch flag in DictionaryCompoundWordTokenFilter. Prior to this fix, the flag only suppressed shorter matches sharing the longest match's start position: with "ora" and "orangen" both in the dictionary, the filter correctly returned "orangen" but not "ora", yet a submatch starting elsewhere inside the longest match, such as "ngen" inside "orangen", was still emitted. With this fix, the filter also suppresses submatches that do not share the longest match's start position.
parent 5e725964a0
commit b7cad98c6a
DictionaryCompoundWordTokenFilter.java

@@ -88,8 +88,17 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase
        }
      }
      if (this.onlyLongestMatch && longestMatchToken != null) {
        boolean contained = false;
        for (CompoundToken addedToken : tokens) {
          if (addedToken.txt.toString().contains(longestMatchToken.txt)) {
            contained = true;
            break;
          }
        }
        if (!contained) {
          tokens.add(longestMatchToken);
        }
      }
    }
  }
}
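In plain terms, the new block defers adding longestMatchToken until the inner matching loop has finished, and drops it if any token already collected for the current compound contains its text. A standalone sketch of that containment check, using plain strings instead of Lucene's CompoundToken (class and method names here are hypothetical):

import java.util.ArrayList;
import java.util.List;

class ContainmentCheckSketch { // hypothetical, illustrates the fix's logic only
  // Add the candidate only if no previously emitted token already contains it.
  static void addIfNotContained(List<String> emitted, String candidate) {
    for (String token : emitted) {
      if (token.contains(candidate)) {
        return; // candidate is a submatch of an emitted token; drop it
      }
    }
    emitted.add(candidate);
  }

  public static void main(String[] args) {
    List<String> emitted = new ArrayList<>();
    addIfNotContained(emitted, "orangen");
    addIfNotContained(emitted, "ngen"); // dropped: contained in "orangen"
    System.out.println(emitted); // prints [orangen]
  }
}

Note that the suppression is purely textual: because the filter compares token texts with String.contains, a candidate is dropped whenever its characters appear anywhere inside an already-emitted token, regardless of offsets.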
TestDictionaryCompoundWordTokenFilterFactory.java

@@ -18,6 +18,8 @@ package org.apache.lucene.analysis.compound;

import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.tests.analysis.BaseTokenStreamFactoryTestCase;
@@ -25,6 +27,9 @@ import org.apache.lucene.tests.analysis.MockTokenizer;

/** Simple tests to ensure the Dictionary compound filter factory is working. */
public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenStreamFactoryTestCase {

  private static CharArraySet makeDictionary(String... dictionary) {
    return new CharArraySet(Arrays.asList(dictionary), true);
  }

  /** Ensure the filter actually decompounds text. */
  public void testDecompounding() throws Exception {
    Reader reader = new StringReader("I like to play softball");
@@ -37,6 +42,62 @@ public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenStreamFactoryTestCase {
        stream, new String[] {"I", "like", "to", "play", "softball", "soft", "ball"});
  }

  /** Ensure subtokens are found in the token and indexed at position increment zero. */
  public void testDecompounderSubmatches() throws Exception {
    CharArraySet dict = makeDictionary("ora", "orangen", "schoko", "schokolade");

    DictionaryCompoundWordTokenFilter tf =
        new DictionaryCompoundWordTokenFilter(
            whitespaceMockTokenizer("ich will orangenschokolade haben"),
            dict,
            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
            false);
    assertTokenStreamContents(
        tf,
        new String[] {
          "ich", "will", "orangenschokolade", "ora", "orangen", "schoko", "schokolade", "haben"
        },
        new int[] {1, 1, 1, 0, 0, 0, 0, 1});
  }

  /** Ensure only the longest of several matches sharing a start position is returned. */
  public void testDecompounderSubmatchesOnlyLongestMatch() throws Exception {
    CharArraySet dict = makeDictionary("ora", "orangen", "schoko", "schokolade");

    DictionaryCompoundWordTokenFilter tf =
        new DictionaryCompoundWordTokenFilter(
            whitespaceMockTokenizer("ich will orangenschokolade haben"),
            dict,
            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
            true);
    assertTokenStreamContents(
        tf,
        new String[] {"ich", "will", "orangenschokolade", "orangen", "schokolade", "haben"},
        new int[] {1, 1, 1, 0, 0, 1});
  }

  /** Ensure only the longest match is returned even when a submatch does not share its start position. */
  public void testDecompounderPostSubmatchesOnlyLongestMatch() throws Exception {
    CharArraySet dict = makeDictionary("ngen", "orangen", "schoko", "schokolade");

    DictionaryCompoundWordTokenFilter tf =
        new DictionaryCompoundWordTokenFilter(
            whitespaceMockTokenizer("ich will orangenschokolade haben"),
            dict,
            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
            true);
    assertTokenStreamContents(
        tf,
        new String[] {"ich", "will", "orangenschokolade", "orangen", "schokolade", "haben"},
        new int[] {1, 1, 1, 0, 0, 1});
  }

  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    IllegalArgumentException expected =
        …