LUCENE-1491 - EdgeNGramTokenFilter no longer stops on tokens shorter than minimum n-gram size.

- line, and those below, will be ignored--

M    CHANGES.txt
M    analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
M    analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
M    analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
M    analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java


git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@794034 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Otis Gospodnetic 2009-07-14 19:44:52 +00:00
parent 65494af827
commit b393e4d0af
5 changed files with 59 additions and 20 deletions

View File

@@ -36,6 +36,9 @@ Bug fixes
StandardTokenizer so that stop words with mixed case are filtered
out. (Rafael Cunha de Almeida, Douglas Campos via Mike McCandless)
8. LUCENE-1491: EdgeNGramTokenFilter no longer stops on tokens shorter than minimum n-gram size.
(Todd Feak via Otis Gospodnetic)
New features
1. LUCENE-1531: Added support for BoostingTermQuery to XML query parser. (Karl Wettin)

View File

@@ -117,19 +117,25 @@ public class EdgeNGramTokenFilter extends TokenFilter {
/** Returns the next token in the stream, or null at EOS. */
public final Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (ngrams.size() > 0) {
return (Token) ngrams.removeFirst();
if (!ngrams.isEmpty()) {
return (Token)ngrams.removeFirst();
}
Token nextToken = input.next(reusableToken);
if (nextToken == null)
return null;
Token token = null;
ngram(nextToken);
if (ngrams.size() > 0)
return (Token) ngrams.removeFirst();
else
return null;
while (ngrams.isEmpty() && (token = input.next()) != null) {
ngram(token);
}
if (token == null) {
return null;
}
if (!ngrams.isEmpty()) {
return (Token)ngrams.removeFirst();
} else {
return null;
}
}
private void ngram(final Token token) {

View File

@@ -64,19 +64,25 @@ public class NGramTokenFilter extends TokenFilter {
/** Returns the next token in the stream, or null at EOS. */
public final Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (ngrams.size() > 0) {
return (Token) ngrams.removeFirst();
if (!ngrams.isEmpty()) {
return (Token)ngrams.removeFirst();
}
Token nextToken = input.next(reusableToken);
if (nextToken == null)
return null;
Token token = null;
ngram(nextToken);
if (ngrams.size() > 0)
return (Token) ngrams.removeFirst();
else
return null;
while (ngrams.isEmpty() && (token = input.next()) != null) {
ngram(token);
}
if (token == null) {
return null;
}
if (!ngrams.isEmpty()) {
return (Token)ngrams.removeFirst();
} else {
return null;
}
}
private void ngram(Token token) {

View File

@@ -109,4 +109,16 @@ public class EdgeNGramTokenFilterTest extends TestCase {
assertEquals("(cde,2,5)", nextToken.toString());
assertNull(tokenizer.next(reusableToken));
}
public void testSmallTokenInStream() throws Exception {
input = new WhitespaceTokenizer(new StringReader("abc de fgh"));
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 3, 3);
final Token reusableToken = new Token();
Token nextToken = tokenizer.next(reusableToken);
assertEquals("(abc,0,3)", nextToken.toString());
nextToken = tokenizer.next(reusableToken);
assertNotNull(nextToken);
assertEquals("(fgh,0,3)", nextToken.toString());
assertNull(tokenizer.next(reusableToken));
}
}

View File

@@ -120,4 +120,16 @@ public class NGramTokenFilterTest extends TestCase {
assertTrue(tokens.isEmpty());
}
public void testSmallTokenInStream() throws Exception {
input = new WhitespaceTokenizer(new StringReader("abc de fgh"));
NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
final Token reusableToken = new Token();
Token nextToken = filter.next(reusableToken);
assertEquals("(abc,0,3)", nextToken.toString());
nextToken = filter.next(reusableToken);
assertNotNull(nextToken);
assertEquals("(fgh,0,3)", nextToken.toString());
assertNull(filter.next(reusableToken));
}
}