mirror of https://github.com/apache/lucene.git
LUCENE-1101: TokenStream.next(Token) reuse 'policy': calling Token.clear() should be responsibility of token producer.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@607521 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
efbd1260a9
commit
b367e863e6
|
@ -85,6 +85,10 @@ API Changes
|
|||
12. LUCENE-1089: Added PriorityQueue.insertWithOverflow, which returns
|
||||
the Object (if any) that was bumped from the queue to allow
|
||||
re-use. (Shai Erera via Mike McCandless)
|
||||
|
||||
13. LUCENE-1101: Token reuse 'contract' (defined LUCENE-969)
|
||||
modified so it is the token producer's responsibility
|
||||
to call Token.clear(). (Doron Cohen)
|
||||
|
||||
|
||||
Bug fixes
|
||||
|
|
|
@ -45,6 +45,7 @@ public abstract class CharTokenizer extends Tokenizer {
|
|||
}
|
||||
|
||||
public final Token next(Token token) throws IOException {
|
||||
token.clear();
|
||||
int length = 0;
|
||||
int start = bufferIndex;
|
||||
char[] buffer = token.termBuffer();
|
||||
|
|
|
@ -42,6 +42,7 @@ public class KeywordTokenizer extends Tokenizer {
|
|||
if (!done) {
|
||||
done = true;
|
||||
int upto = 0;
|
||||
result.clear();
|
||||
char[] buffer = result.termBuffer();
|
||||
while (true) {
|
||||
final int length = input.read(buffer, upto, buffer.length-upto);
|
||||
|
|
|
@ -58,14 +58,23 @@ public abstract class TokenStream {
|
|||
* When possible, the input Token should be used as the
|
||||
* returned Token (this gives fastest tokenization
|
||||
* performance), but this is not required and a new Token
|
||||
* may be returned. Callers may re-use a single Token
|
||||
* instance for successive calls to this method and must
|
||||
* therefore fully consume the previously returned Token
|
||||
* before calling this method again.
|
||||
* @param result a Token that may or may not be used to
|
||||
* return
|
||||
* @return next token in the stream or null if
|
||||
* end-of-stream was hit*/
|
||||
* may be returned. Callers may re-use a single Token
|
||||
* instance for successive calls to this method.
|
||||
* <p>
|
||||
* This implicitly defines a "contract" between
|
||||
* consumers (callers of this method) and
|
||||
* producers (implementations of this method
|
||||
* that are the source for tokens):
|
||||
* <ul>
|
||||
* <li>A consumer must fully consume the previously
|
||||
* returned Token before calling this method again.</li>
|
||||
* <li>A producer must call {@link Token#clear()}
|
||||
* before setting the fields in it and returning it.</li>
|
||||
* </ul>
|
||||
* Note that a {@link TokenFilter} is considered a consumer.
|
||||
* @param result a Token that may or may not be used as the returned Token
|
||||
* @return next token in the stream or null if end-of-stream was hit
|
||||
*/
|
||||
public Token next(Token result) throws IOException {
|
||||
return next();
|
||||
}
|
||||
|
|
|
@ -23,8 +23,12 @@ import java.io.IOException;
|
|||
/** A Tokenizer is a TokenStream whose input is a Reader.
|
||||
<p>
|
||||
This is an abstract class.
|
||||
<p>
|
||||
NOTE: subclasses must override at least one of {@link
|
||||
#next()} or {@link #next(Token)}.
|
||||
<p>
|
||||
NOTE: subclasses overriding {@link #next(Token)} must
|
||||
call {@link Token#clear()}.
|
||||
*/
|
||||
|
||||
public abstract class Tokenizer extends TokenStream {
|
||||
|
|
|
@ -92,6 +92,7 @@ public class StandardTokenizer extends Tokenizer {
|
|||
return null;
|
||||
}
|
||||
|
||||
result.clear();
|
||||
scanner.getText(result);
|
||||
final int start = scanner.yychar();
|
||||
result.setStartOffset(start);
|
||||
|
|
|
@ -1373,7 +1373,6 @@ final class DocumentsWriter {
|
|||
offsetEnd = offset-1;
|
||||
Token token;
|
||||
for(;;) {
|
||||
localToken.clear();
|
||||
token = stream.next(localToken);
|
||||
if (token == null) break;
|
||||
position += (token.getPositionIncrement() - 1);
|
||||
|
|
Loading…
Reference in New Issue