LUCENE-1101: TokenStream.next(Token) reuse 'policy': calling Token.clear() should be responsibility of token producer.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@607521 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Doron Cohen 2007-12-30 07:34:30 +00:00
parent efbd1260a9
commit b367e863e6
7 changed files with 28 additions and 9 deletions

View File

@ -85,6 +85,10 @@ API Changes
12. LUCENE-1089: Added PriorityQueue.insertWithOverflow, which returns
the Object (if any) that was bumped from the queue to allow
re-use. (Shai Erera via Mike McCandless)
13. LUCENE-1101: Token reuse 'contract' (defined LUCENE-969)
modified so it is token producer's responsibility
to call Token.clear(). (Doron Cohen)
Bug fixes

View File

@ -45,6 +45,7 @@ public abstract class CharTokenizer extends Tokenizer {
}
public final Token next(Token token) throws IOException {
token.clear();
int length = 0;
int start = bufferIndex;
char[] buffer = token.termBuffer();

View File

@ -42,6 +42,7 @@ public class KeywordTokenizer extends Tokenizer {
if (!done) {
done = true;
int upto = 0;
result.clear();
char[] buffer = result.termBuffer();
while (true) {
final int length = input.read(buffer, upto, buffer.length-upto);

View File

@ -58,14 +58,23 @@ public abstract class TokenStream {
* When possible, the input Token should be used as the
* returned Token (this gives fastest tokenization
* performance), but this is not required and a new Token
* may be returned. Callers may re-use a single Token
* instance for successive calls to this method and must
* therefore fully consume the previously returned Token
* before calling this method again.
* @param result a Token that may or may not be used to
* return
* @return next token in the stream or null if
* end-of-stream was hit*/
* may be returned. Callers may re-use a single Token
* instance for successive calls to this method.
* <p>
* This implicitly defines a "contract" between
* consumers (callers of this method) and
* producers (implementations of this method
* that are the source for tokens):
* <ul>
* <li>A consumer must fully consume the previously
* returned Token before calling this method again.</li>
* <li>A producer must call {@link Token#clear()}
* before setting the fields in it & returning it</li>
* </ul>
* Note that a {@link TokenFilter} is considered a consumer.
* @param result a Token that may or may not be used to return
* @return next token in the stream or null if end-of-stream was hit
*/
public Token next(Token result) throws IOException {
return next();
}

View File

@ -23,8 +23,12 @@ import java.io.IOException;
/** A Tokenizer is a TokenStream whose input is a Reader.
<p>
This is an abstract class.
<p>
NOTE: subclasses must override at least one of {@link
#next()} or {@link #next(Token)}.
<p>
NOTE: subclasses overriding {@link #next(Token)} must
call {@link Token#clear()}.
*/
public abstract class Tokenizer extends TokenStream {

View File

@ -92,6 +92,7 @@ public class StandardTokenizer extends Tokenizer {
return null;
}
result.clear();
scanner.getText(result);
final int start = scanner.yychar();
result.setStartOffset(start);

View File

@ -1373,7 +1373,6 @@ final class DocumentsWriter {
offsetEnd = offset-1;
Token token;
for(;;) {
localToken.clear();
token = stream.next(localToken);
if (token == null) break;
position += (token.getPositionIncrement() - 1);