LUCENE-3417: DictionaryCompoundWordFilter did not properly add tokens from the end of the compound word

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1166728 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2011-09-08 14:59:15 +00:00
parent 2227f3bfaa
commit b265d499f2
3 changed files with 50 additions and 2 deletions

lucene/CHANGES.txt

@@ -83,6 +83,11 @@ New Features
   SimpleBoundaryScanner and BreakIteratorBoundaryScanner, so that FVH's FragmentsBuilder
   can find "natural" boundary to make snippets.  (Robert Muir, Koji Sekiguchi)
 
+Bug Fixes
+
+* LUCENE-3417: DictionaryCompoundWordFilter did not properly add tokens from the
+  end of the compound word.  (Njal Karevoll via Robert Muir)
+
 ======================= Lucene 3.4.0 =======================
 
 New Features

DictionaryCompoundWordTokenFilter.java

@@ -136,9 +136,9 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase
     char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer());
 
-    for (int i=0;i<token.length()-this.minSubwordSize;++i) {
+    for (int i=0;i<=token.length()-this.minSubwordSize;++i) {
         Token longestMatchToken=null;
-        for (int j=this.minSubwordSize-1;j<this.maxSubwordSize;++j) {
+        for (int j=this.minSubwordSize;j<=this.maxSubwordSize;++j) {
             if(i+j>token.length()) {
                 break;
             }
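The fix is an off-by-one on both loop bounds. With the old exclusive bounds, the outer loop never started a candidate subword at the last offset that still leaves minSubwordSize characters, so a dictionary word sitting flush against the end of the token ("ef" in "abcdef") was never emitted; likewise the inner loop never tried a match of exactly maxSubwordSize characters. The following is a minimal standalone sketch of the corrected bounds, not the actual Lucene implementation: it uses String/HashSet stand-ins (the real filter works on a char[] buffer and a CharArraySet) and models the onlyLongestMatch=false behavior of emitting every match.

import java.util.*;

// Hypothetical illustration of the fixed loop bounds; simplified relative
// to the real filter, which operates on char[] buffers and a CharArraySet.
public class DecompoundSketch {
  static List<String> decompose(String token, Set<String> dict,
                                int minSubwordSize, int maxSubwordSize) {
    List<String> subwords = new ArrayList<String>();
    String lower = token.toLowerCase(Locale.ROOT);
    // "<=" lets a subword start at the last offset that still leaves
    // minSubwordSize characters; the old "<" skipped that final offset.
    for (int i = 0; i <= token.length() - minSubwordSize; ++i) {
      // j is now the candidate length itself, and "<=" allows a match of
      // exactly maxSubwordSize characters (the old bounds never tried it).
      for (int j = minSubwordSize; j <= maxSubwordSize; ++j) {
        if (i + j > token.length()) {
          break; // candidate would run past the end of the token
        }
        if (dict.contains(lower.substring(i, i + j))) {
          subwords.add(token.substring(i, i + j));
        }
      }
    }
    return subwords;
  }

  public static void main(String[] args) {
    Set<String> dict = new HashSet<String>(Arrays.asList("ab", "cd", "ef"));
    // Prints [ab, cd, ef]; with the old "<" bound on i, "ef" was missed.
    System.out.println(decompose("abcdef", dict, 2, 15));
  }
}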

TestCompoundWordTokenFilter.java

@@ -162,6 +162,49 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
         0, 0 });
   }
 
+  public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception {
+    String[] dict = {"ab", "cd", "ef"};
+
+    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
+        new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+            new StringReader(
+                "abcdef")
+            ),
+        dict,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+
+    assertTokenStreamContents(tf,
+        new String[] { "abcdef", "ab", "cd", "ef" },
+        new int[] { 0, 0, 2, 4},
+        new int[] { 6, 2, 4, 6},
+        new int[] { 1, 0, 0, 0}
+        );
+  }
+
+  public void testWordComponentWithLessThanMinimumLength() throws Exception {
+    String[] dict = {"abc", "d", "efg"};
+
+    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
+        new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+            new StringReader(
+                "abcdefg")
+            ),
+        dict,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+
+    // since "d" is shorter than the minimum subword size, it should not be added to the token stream
+    assertTokenStreamContents(tf,
+        new String[] { "abcdefg", "abc", "efg" },
+        new int[] { 0, 0, 4},
+        new int[] { 7, 3, 7},
+        new int[] { 1, 0, 0}
+        );
+  }
+
   public void testReset() throws Exception {
     String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
         "Aufgabe", "Überwachung" };