From b265d499f243b67b5447514f2daf7062a12c8002 Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Thu, 8 Sep 2011 14:59:15 +0000
Subject: [PATCH] LUCENE-3417: DictionaryCompoundWordTokenFilter did not properly add tokens from the end of the compound word

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1166728 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/contrib/CHANGES.txt                     |  5 +++
 .../DictionaryCompoundWordTokenFilter.java     |  4 +-
 .../compound/TestCompoundWordTokenFilter.java  | 43 +++++++++++++++++++
 3 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt
index fb83365c062..31080ef6c57 100644
--- a/lucene/contrib/CHANGES.txt
+++ b/lucene/contrib/CHANGES.txt
@@ -83,6 +83,11 @@ New Features
   SimpleBoundaryScanner and BreakIteratorBoundaryScanner, so that FVH's
   FragmentsBuilder can find "natural" boundary to make snippets.
   (Robert Muir, Koji Sekiguchi)
 
+Bug Fixes
+
+ * LUCENE-3417: DictionaryCompoundWordTokenFilter did not properly add tokens
+   from the end of the compound word.  (Njal Karevoll via Robert Muir)
+
 ======================= Lucene 3.4.0 ================
 New Features

diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java
index 405c1aff665..4805918f406 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java
@@ -136,9 +136,9 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa
 
     char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer());
 
-    for (int i=0;i<token.length()-this.minSubwordSize;i++) {
+    for (int i=0;i<=token.length()-this.minSubwordSize;i++) {
         CompoundToken longestMatchToken=null;
-        for (int j=this.minSubwordSize;j<this.maxSubwordSize;j++) {
+        for (int j=this.minSubwordSize;j<=this.maxSubwordSize;j++) {
           if(i+j>token.length()) {
             break;
           }

diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
index 20c9435ca9b..03e50b2ea75 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
@@ -161,6 +161,49 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
         14, 20 }, new int[] { 26, 3, 14, 14, 20, 26 }, new int[] { 1, 0, 0, 0,
         0, 0 });
   }
+
+  public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception {
+    String[] dict = {"ab", "cd", "ef"};
+
+    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
+      new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+        new StringReader(
+          "abcdef")
+        ),
+      dict,
+      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+
+    assertTokenStreamContents(tf,
+      new String[] { "abcdef", "ab", "cd", "ef" },
+      new int[] { 0, 0, 2, 4},
+      new int[] { 6, 2, 4, 6},
+      new int[] { 1, 0, 0, 0}
+      );
+  }
+
+  public void testWordComponentWithLessThanMinimumLength() throws Exception {
+    String[] dict = {"abc", "d", "efg"};
+
+    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
+      new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+        new StringReader(
+          "abcdefg")
+        ),
+      dict,
+      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+
+    // since "d" is shorter than the minimum subword size, it should not be added to the token stream
+    assertTokenStreamContents(tf,
+      new String[] { "abcdefg", "abc", "efg" },
+      new int[] { 0, 0, 4},
+      new int[] { 7, 3, 7},
+      new int[] { 1, 0, 0}
+      );
+  }
 
   public void testReset() throws Exception {
     String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",