LUCENE-1491 - EdgeNGramTokenFilter no longer stops on tokens shorter than minimum n-gram size.
--This line, and those below, will be ignored--

M    CHANGES.txt
M    analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
M    analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
M    analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
M    analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@794034 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 65494af827
commit b393e4d0af
CHANGES.txt
@@ -36,6 +36,9 @@ Bug fixes
     StandardTokenizer so that stop words with mixed case are filtered
     out. (Rafael Cunha de Almeida, Douglas Campos via Mike McCandless)

+ 8. LUCENE-1491: EdgeNGramTokenFilter no longer stops on tokens shorter than minimum n-gram size.
+    (Todd Teak via Otis Gospodnetic)
+
 New features

 1. LUCENE-1531: Added support for BoostingTermQuery to XML query parser. (Karl Wettin)
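The behavior change is easiest to see end to end. Below is a minimal, hypothetical driver (class name and loop are mine, not part of the patch) against the 2.9-era TokenStream API, using the same WhitespaceTokenizer/EdgeNGramTokenFilter calls the new tests exercise; with min/max gram size 3, the short token "de" is now skipped instead of terminating the stream:

import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;

public class EdgeNGramDemo {
  public static void main(String[] args) throws Exception {
    EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(
        new WhitespaceTokenizer(new StringReader("abc de fgh")),
        EdgeNGramTokenFilter.Side.FRONT, 3, 3);
    final Token reusableToken = new Token();
    // Before the fix: prints (abc,0,3), then "de" yields no grams and
    // the filter returns null, silently dropping "fgh".
    // After the fix: prints (abc,0,3) then (fgh,0,3).
    for (Token t = filter.next(reusableToken); t != null; t = filter.next(reusableToken)) {
      System.out.println(t);
    }
  }
}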
analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
@@ -117,19 +117,25 @@ public class EdgeNGramTokenFilter extends TokenFilter {
   /** Returns the next token in the stream, or null at EOS. */
   public final Token next(final Token reusableToken) throws IOException {
     assert reusableToken != null;
-    if (ngrams.size() > 0) {
-      return (Token) ngrams.removeFirst();
+    if (!ngrams.isEmpty()) {
+      return (Token)ngrams.removeFirst();
     }

-    Token nextToken = input.next(reusableToken);
-    if (nextToken == null)
-      return null;
+    Token token = null;

-    ngram(nextToken);
-    if (ngrams.size() > 0)
-      return (Token) ngrams.removeFirst();
-    else
-      return null;
+    while (ngrams.isEmpty() && (token = input.next()) != null) {
+      ngram(token);
+    }
+
+    if (token == null) {
+      return null;
+    }
+
+    if (!ngrams.isEmpty()) {
+      return (Token)ngrams.removeFirst();
+    } else {
+      return null;
+    }
   }

   private void ngram(final Token token) {
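The fix replaces "generate n-grams from one token, then return null if none came out" with a loop that keeps pulling upstream tokens until the n-gram queue is non-empty or the input is exhausted. A simplified, hypothetical sketch of that control flow (plain strings instead of Lucene Tokens, so it runs standalone; class and method names are illustrative, not from the patch):

import java.util.Iterator;
import java.util.LinkedList;

class NGramQueueSketch {
  private final LinkedList<String> ngrams = new LinkedList<String>();
  private final Iterator<String> input;
  private final int min, max;

  NGramQueueSketch(Iterator<String> input, int min, int max) {
    this.input = input; this.min = min; this.max = max;
  }

  /** Returns the next n-gram, or null at end of stream. */
  String next() {
    if (!ngrams.isEmpty()) {
      return ngrams.removeFirst();
    }
    // Key change: a token too short to produce any n-gram no longer
    // ends the stream; we just move on to the next upstream token.
    while (ngrams.isEmpty() && input.hasNext()) {
      ngram(input.next());
    }
    return ngrams.isEmpty() ? null : ngrams.removeFirst();
  }

  private void ngram(String token) {
    // Queue every substring of length min..max (a token shorter than
    // min contributes nothing, which is what triggered the old bug).
    for (int n = min; n <= max; n++) {
      for (int i = 0; i + n <= token.length(); i++) {
        ngrams.addLast(token.substring(i, i + n));
      }
    }
  }
}

The same reworked next() appears in NGramTokenFilter below; the two filters share the queue-draining loop and differ only in how ngram() slices each token.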
analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
@@ -64,19 +64,25 @@ public class NGramTokenFilter extends TokenFilter {
   /** Returns the next token in the stream, or null at EOS. */
   public final Token next(final Token reusableToken) throws IOException {
     assert reusableToken != null;
-    if (ngrams.size() > 0) {
-      return (Token) ngrams.removeFirst();
+    if (!ngrams.isEmpty()) {
+      return (Token)ngrams.removeFirst();
     }

-    Token nextToken = input.next(reusableToken);
-    if (nextToken == null)
-      return null;
+    Token token = null;

-    ngram(nextToken);
-    if (ngrams.size() > 0)
-      return (Token) ngrams.removeFirst();
-    else
-      return null;
+    while (ngrams.isEmpty() && (token = input.next()) != null) {
+      ngram(token);
+    }
+
+    if (token == null) {
+      return null;
+    }
+
+    if (!ngrams.isEmpty()) {
+      return (Token)ngrams.removeFirst();
+    } else {
+      return null;
+    }
   }

   private void ngram(Token token) {
analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
@@ -109,4 +109,16 @@ public class EdgeNGramTokenFilterTest extends TestCase {
     assertEquals("(cde,2,5)", nextToken.toString());
     assertNull(tokenizer.next(reusableToken));
   }
+
+  public void testSmallTokenInStream() throws Exception {
+    input = new WhitespaceTokenizer(new StringReader("abc de fgh"));
+    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 3, 3);
+    final Token reusableToken = new Token();
+    Token nextToken = tokenizer.next(reusableToken);
+    assertEquals("(abc,0,3)", nextToken.toString());
+    nextToken = tokenizer.next(reusableToken);
+    assertNotNull(nextToken);
+    assertEquals("(fgh,0,3)", nextToken.toString());
+    assertNull(tokenizer.next(reusableToken));
+  }
 }
analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
@@ -120,4 +120,16 @@ public class NGramTokenFilterTest extends TestCase {

     assertTrue(tokens.isEmpty());
   }
+
+  public void testSmallTokenInStream() throws Exception {
+    input = new WhitespaceTokenizer(new StringReader("abc de fgh"));
+    NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
+    final Token reusableToken = new Token();
+    Token nextToken = filter.next(reusableToken);
+    assertEquals("(abc,0,3)", nextToken.toString());
+    nextToken = filter.next(reusableToken);
+    assertNotNull(nextToken);
+    assertEquals("(fgh,0,3)", nextToken.toString());
+    assertNull(filter.next(reusableToken));
+  }
 }