diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java
index c6278a80a1f..1923cd933b6 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java
@@ -88,7 +88,17 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa
         }
       }
       if (this.onlyLongestMatch && longestMatchToken != null) {
-        tokens.add(longestMatchToken);
+        // Only emit the longest match if no already-added subtoken contains its text,
+        // e.g. skip "ngen" when "orangen" has already been emitted.
+        boolean contained = false;
+        for (CompoundToken addedToken : tokens) {
+          if (addedToken.txt.toString().contains(longestMatchToken.txt)) {
+            contained = true;
+            break;
+          }
+        }
+        if (!contained) {
+          tokens.add(longestMatchToken);
+        }
       }
     }
   }
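A minimal standalone sketch (not part of the patch) of how the changed
onlyLongestMatch path is exercised; the demo class name and hard-coded
dictionary are illustrative only, while the Lucene analysis-common APIs
used are real:

import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class DecompoundDemo {
  public static void main(String[] args) throws Exception {
    CharArraySet dict =
        new CharArraySet(Arrays.asList("ora", "orangen", "schoko", "schokolade"), true);
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("ich will orangenschokolade haben"));
    try (TokenStream ts =
        new DictionaryCompoundWordTokenFilter(
            tokenizer,
            dict,
            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
            true)) { // onlyLongestMatch = true
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // Expected output per the tests below:
        // ich, will, orangenschokolade, orangen, schokolade, haben
        System.out.println(term);
      }
      ts.end();
    }
  }
}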
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilterFactory.java
index cfe2f432b65..95340a3d892 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilterFactory.java
@@ -18,6 +18,8 @@ package org.apache.lucene.analysis.compound;
 
 import java.io.Reader;
 import java.io.StringReader;
+import java.util.Arrays;
+import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.tests.analysis.BaseTokenStreamFactoryTestCase;
@@ -25,6 +27,9 @@ import org.apache.lucene.tests.analysis.MockTokenizer;
 
 /** Simple tests to ensure the Dictionary compound filter factory is working. */
 public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenStreamFactoryTestCase {
+  private static CharArraySet makeDictionary(String... dictionary) {
+    return new CharArraySet(Arrays.asList(dictionary), true);
+  }
   /** Ensure the filter actually decompounds text. */
   public void testDecompounding() throws Exception {
     Reader reader = new StringReader("I like to play softball");
@@ -37,6 +42,62 @@ public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenStrea
         stream, new String[] {"I", "like", "to", "play", "softball", "soft", "ball"});
   }
 
+  /** Ensure subtokens are found in the compound token and indexed at position increment zero. */
+  public void testDecompounderSubmatches() throws Exception {
+    CharArraySet dict = makeDictionary("ora", "orangen", "schoko", "schokolade");
+
+    DictionaryCompoundWordTokenFilter tf =
+        new DictionaryCompoundWordTokenFilter(
+            whitespaceMockTokenizer("ich will orangenschokolade haben"),
+            dict,
+            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+            CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+            CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
+            false);
+    assertTokenStreamContents(
+        tf,
+        new String[] {
+          "ich", "will", "orangenschokolade", "ora", "orangen", "schoko", "schokolade", "haben"
+        },
+        new int[] {1, 1, 1, 0, 0, 0, 0, 1});
+  }
+
+  /** Ensure that with onlyLongestMatch only the longest of the matches sharing a start is returned. */
+  public void testDecompounderSubmatchesOnlyLongestMatch() throws Exception {
+    CharArraySet dict = makeDictionary("ora", "orangen", "schoko", "schokolade");
+
+    DictionaryCompoundWordTokenFilter tf =
+        new DictionaryCompoundWordTokenFilter(
+            whitespaceMockTokenizer("ich will orangenschokolade haben"),
+            dict,
+            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+            CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+            CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
+            true);
+    assertTokenStreamContents(
+        tf,
+        new String[] {"ich", "will", "orangenschokolade", "orangen", "schokolade", "haben"},
+        new int[] {1, 1, 1, 0, 0, 1});
+  }
+
+  /** Ensure a shorter match is suppressed even when it does not share the longest match's start. */
+  public void testDecompounderPostSubmatchesOnlyLongestMatch() throws Exception {
+    CharArraySet dict = makeDictionary("ngen", "orangen", "schoko", "schokolade");
+
+    DictionaryCompoundWordTokenFilter tf =
+        new DictionaryCompoundWordTokenFilter(
+            whitespaceMockTokenizer("ich will orangenschokolade haben"),
+            dict,
+            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+            CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+            CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
+            true);
+    assertTokenStreamContents(
+        tf,
+        new String[] {"ich", "will", "orangenschokolade", "orangen", "schokolade", "haben"},
+        new int[] {1, 1, 1, 0, 0, 1});
+  }
+
   /** Test that bogus arguments result in exception */
   public void testBogusArguments() throws Exception {
     IllegalArgumentException expected =