From b265d499f243b67b5447514f2daf7062a12c8002 Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Thu, 8 Sep 2011 14:59:15 +0000
Subject: [PATCH] LUCENE-3417: DictionaryCompoundWordTokenFilter did not properly add tokens from the end of the compound word

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1166728 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/contrib/CHANGES.txt                     |  5 +++
 .../DictionaryCompoundWordTokenFilter.java     |  4 +-
 .../compound/TestCompoundWordTokenFilter.java  | 43 +++++++++++++++++++
 3 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt
index fb83365c062..31080ef6c57 100644
--- a/lucene/contrib/CHANGES.txt
+++ b/lucene/contrib/CHANGES.txt
@@ -83,6 +83,11 @@ New Features
   SimpleBoundaryScanner and BreakIteratorBoundaryScanner, so that FVH's
   FragmentsBuilder can find "natural" boundary to make snippets.
   (Robert Muir, Koji Sekiguchi)
 
+Bug Fixes
+
+ * LUCENE-3417: DictionaryCompoundWordTokenFilter did not properly add tokens
+   from the end of the compound word.  (Njal Karevoll via Robert Muir)
+
 ======================= Lucene 3.4.0 ================
 New Features

diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java
index 405c1aff665..4805918f406 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java
@@ -136,9 +136,9 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa
 
     char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer());
 
-    for (int i=0;i<token.length()-this.minSubwordSize;i++) {
+    for (int i=0;i<=token.length()-this.minSubwordSize;i++) {
         CompoundToken longestMatchToken=null;
-        for (int j=this.minSubwordSize;j<this.maxSubwordSize;j++) {
+        for (int j=this.minSubwordSize;j<=this.maxSubwordSize;j++) {
           if(i+j>token.length()) {
             break;
           }

diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
index 20c9435ca9b..03e50b2ea75 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
@@ -161,6 +161,49 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
         14, 20 }, new int[] { 26, 3, 14, 14, 20, 26 }, new int[] { 1, 0, 0, 0,
         0, 0 });
   }
+
+  public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception {
+    String[] dict = {"ab", "cd", "ef"};
+
+    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
+      new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+        new StringReader(
+          "abcdef")
+        ),
+      dict,
+      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+
+    assertTokenStreamContents(tf,
+      new String[] { "abcdef", "ab", "cd", "ef" },
+      new int[] { 0, 0, 2, 4},
+      new int[] { 6, 2, 4, 6},
+      new int[] { 1, 0, 0, 0}
+      );
+  }
+
+  public void testWordComponentWithLessThanMinimumLength() throws Exception {
+    String[] dict = {"abc", "d", "efg"};
+
+    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
+      new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+        new StringReader(
+          "abcdefg")
+        ),
+      dict,
+      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+
+    // since "d" is shorter than the minimum subword size, it should not be added to the token stream
+    assertTokenStreamContents(tf,
+      new String[] { "abcdefg", "abc", "efg" },
+      new int[] { 0, 0, 4},
+      new int[] { 7, 3, 7},
+      new int[] { 1, 0, 0}
+      );
+  }
 
   public void testReset() throws Exception {
     String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",