mirror of https://github.com/apache/lucene.git
LUCENE-3417: DictionaryCompoundWordFilter did not properly add tokens from the end compound word
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1166728 13f79535-47bb-0310-9956-ffa450edef68
parent 2227f3bfaa
commit b265d499f2
CHANGES.txt
@@ -83,6 +83,11 @@ New Features
   SimpleBoundaryScanner and BreakIteratorBoundaryScanner, so that FVH's FragmentsBuilder
   can find "natural" boundary to make snippets. (Robert Muir, Koji Sekiguchi)
 
+Bug Fixes
+
+* LUCENE-3417: DictionaryCompoundWordFilter did not properly add tokens from the
+  end compound word. (Njal Karevoll via Robert Muir)
+
 ======================= Lucene 3.4.0 =======================
 
 New Features
DictionaryCompoundWordTokenFilter.java
@@ -136,9 +136,9 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase
 
     char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer());
 
-    for (int i=0;i<token.length()-this.minSubwordSize;++i) {
+    for (int i=0;i<=token.length()-this.minSubwordSize;++i) {
         Token longestMatchToken=null;
-        for (int j=this.minSubwordSize-1;j<this.maxSubwordSize;++j) {
+        for (int j=this.minSubwordSize;j<=this.maxSubwordSize;++j) {
             if(i+j>token.length()) {
                 break;
             }
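Why the old bounds dropped the trailing component: for the token "abcdef" with minSubwordSize=2, the old outer loop stops at i < 6-2 = 4, so start position 4, where "ef" begins, is never tried; the fixed bound i <= 4 reaches it. The inner loop change likewise makes j the candidate subword length, running over minSubwordSize..maxSubwordSize inclusive. Below is a minimal standalone sketch of the two outer bounds (the OffByOneDemo class and its simplified dictionary lookup are illustrative, not the actual Lucene code):

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;

    public class OffByOneDemo {
      // Simplified decomposition loop; 'fixed' toggles between the old and the
      // patched outer bound. The inner loop already uses the corrected range
      // (j is the candidate subword length), so the output isolates the outer fix.
      static List<String> decompose(String token, Set<String> dict,
                                    int minSubword, int maxSubword, boolean fixed) {
        List<String> parts = new ArrayList<String>();
        int lastStart = token.length() - minSubword;
        for (int i = 0; fixed ? i <= lastStart : i < lastStart; ++i) {
          for (int j = minSubword; j <= maxSubword && i + j <= token.length(); ++j) {
            String candidate = token.substring(i, i + j);
            if (dict.contains(candidate)) {
              parts.add(candidate);
            }
          }
        }
        return parts;
      }

      public static void main(String[] args) {
        Set<String> dict = new HashSet<String>(Arrays.asList("ab", "cd", "ef"));
        System.out.println(decompose("abcdef", dict, 2, 15, false)); // [ab, cd]
        System.out.println(decompose("abcdef", dict, 2, 15, true));  // [ab, cd, ef]
      }
    }

With the dictionary {"ab", "cd", "ef"}, the old bound yields [ab, cd] while the fixed bound yields [ab, cd, ef], which is exactly the difference the new test below asserts.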
TestCompoundWordTokenFilter.java
@@ -162,6 +162,49 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
         0, 0 });
   }
 
+  public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception {
+    String[] dict = {"ab", "cd", "ef"};
+
+    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
+      new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+        new StringReader(
+          "abcdef")
+        ),
+      dict,
+      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+
+    assertTokenStreamContents(tf,
+      new String[] { "abcdef", "ab", "cd", "ef" },
+      new int[] { 0, 0, 2, 4},
+      new int[] { 6, 2, 4, 6},
+      new int[] { 1, 0, 0, 0}
+      );
+  }
+
+  public void testWordComponentWithLessThanMinimumLength() throws Exception {
+    String[] dict = {"abc", "d", "efg"};
+
+    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
+      new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+        new StringReader(
+          "abcdefg")
+        ),
+      dict,
+      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+
+    // since "d" is shorter than the minimum subword size, it should not be added to the token stream
+    assertTokenStreamContents(tf,
+      new String[] { "abcdefg", "abc", "efg" },
+      new int[] { 0, 0, 4},
+      new int[] { 7, 3, 7},
+      new int[] { 1, 0, 0}
+      );
+  }
+
   public void testReset() throws Exception {
     String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
       "Aufgabe", "Überwachung" };
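For reference, the token stream asserted by testTokenEndingWithWordComponentOfMinimumLength can also be printed with a small driver. This is a hedged sketch against the Lucene 3.x contrib API; the CompoundDemo class name and the Version.LUCENE_34 constant are illustrative assumptions, not part of this commit:

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
    import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.util.Version;

    public class CompoundDemo {
      public static void main(String[] args) throws Exception {
        String[] dict = {"ab", "cd", "ef"};
        // Same construction as the test, but outside the test framework.
        TokenStream ts = new DictionaryCompoundWordTokenFilter(Version.LUCENE_34,
            new WhitespaceTokenizer(Version.LUCENE_34, new StringReader("abcdef")),
            dict,
            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);

        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          // Prints each token with its start/end offsets.
          System.out.println(term.toString() + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
        }
        ts.close();
      }
    }

Expected output after the fix, matching the test's assertions: abcdef [0,6], ab [0,2], cd [2,4], ef [4,6]; before the fix the trailing "ef" line is missing.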