mirror of https://github.com/apache/lucene.git
LUCENE-1101: TokenStream.next(Token) reuse 'policy': calling Token.clear() should be responsibility of token producer.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@607521 13f79535-47bb-0310-9956-ffa450edef68
commit b367e863e6
parent efbd1260a9
@@ -85,6 +85,10 @@ API Changes
 12. LUCENE-1089: Added PriorityQueue.insertWithOverflow, which returns
     the Object (if any) that was bumped from the queue to allow
     re-use. (Shai Erera via Mike McCandless)
 
+13. LUCENE-1101: Token reuse 'contract' (defined LUCENE-969)
+    modified so it is token producer's responsibility
+    to call Token.clear(). (Doron Cohen)
+
 Bug fixes
 
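In practical terms, the new contract lets a consumer reuse a single Token instance across calls without clearing it itself, since clearing is now the producer's job (the DocumentsWriter hunk below drops exactly that call). A minimal consumer-side sketch, assuming the 2.3-era reuse API (Token.termBuffer()/termLength(), the no-arg Token constructor, WhitespaceTokenizer); class and variable names are illustrative only:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

public class ReuseConsumerDemo {
  public static void main(String[] args) throws IOException {
    TokenStream stream = new WhitespaceTokenizer(new StringReader("reuse one token instance"));
    Token reusable = new Token();   // single instance reused for every call
    Token token;
    while ((token = stream.next(reusable)) != null) {
      // Fully consume this token before the next call: the producer may
      // overwrite the reused instance, and it clears it itself now, so
      // the consumer no longer calls token.clear() between calls.
      System.out.println(new String(token.termBuffer(), 0, token.termLength())
          + " [" + token.startOffset() + "-" + token.endOffset() + "]");
    }
    stream.close();
  }
}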
@@ -45,6 +45,7 @@ public abstract class CharTokenizer extends Tokenizer {
   }
 
   public final Token next(Token token) throws IOException {
+    token.clear();
     int length = 0;
     int start = bufferIndex;
     char[] buffer = token.termBuffer();
@@ -42,6 +42,7 @@ public class KeywordTokenizer extends Tokenizer {
     if (!done) {
       done = true;
       int upto = 0;
+      result.clear();
       char[] buffer = result.termBuffer();
       while (true) {
         final int length = input.read(buffer, upto, buffer.length-upto);
@@ -58,14 +58,23 @@ public abstract class TokenStream {
    * When possible, the input Token should be used as the
    * returned Token (this gives fastest tokenization
    * performance), but this is not required and a new Token
    * may be returned. Callers may re-use a single Token
-   * instance for successive calls to this method and must
-   * therefore fully consume the previously returned Token
-   * before calling this method again.
-   * @param result a Token that may or may not be used to
-   * return
-   * @return next token in the stream or null if
-   * end-of-stream was hit*/
+   * instance for successive calls to this method.
+   * <p>
+   * This implicitly defines a "contract" between
+   * consumers (callers of this method) and
+   * producers (implementations of this method
+   * that are the source for tokens):
+   * <ul>
+   * <li>A consumer must fully consume the previously
+   * returned Token before calling this method again.</li>
+   * <li>A producer must call {@link Token#clear()}
+   * before setting the fields in it & returning it</li>
+   * </ul>
+   * Note that a {@link TokenFilter} is considered a consumer.
+   * @param result a Token that may or may not be used to return
+   * @return next token in the stream or null if end-of-stream was hit
+   */
   public Token next(Token result) throws IOException {
     return next();
   }
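As the new javadoc notes, a TokenFilter is a consumer: it asks its input for the next token and may edit the returned (possibly reused) Token in place, without calling Token.clear() itself. A sketch of such a filter under that assumption; the class name is hypothetical, and the delegation pattern mirrors the in-place filters in this release:

import java.io.IOException;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

// Hypothetical filter: lowercases each term in place.
public final class LowerCasingFilter extends TokenFilter {
  public LowerCasingFilter(TokenStream input) {
    super(input);
  }

  public final Token next(Token result) throws IOException {
    // As a consumer, the filter does not clear the token; the producer
    // upstream already did. It just asks its input for the next token...
    Token t = input.next(result);
    if (t == null)
      return null;
    // ...and edits the (possibly reused) term buffer in place.
    char[] buffer = t.termBuffer();
    int length = t.termLength();
    for (int i = 0; i < length; i++)
      buffer[i] = Character.toLowerCase(buffer[i]);
    return t;
  }
}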
@@ -23,8 +23,12 @@ import java.io.IOException;
 /** A Tokenizer is a TokenStream whose input is a Reader.
   <p>
   This is an abstract class.
+  <p>
   NOTE: subclasses must override at least one of {@link
   #next()} or {@link #next(Token)}.
+  <p>
+  NOTE: subclasses overriding {@link #next(Token)} must
+  call {@link Token#clear()}.
  */
 
 public abstract class Tokenizer extends TokenStream {
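Per the NOTE added above, a Tokenizer subclass that overrides next(Token) is a producer and must call Token.clear() before filling the reused instance. A minimal hypothetical sketch that emits the Reader's whole content as one token, much like KeywordTokenizer, assuming Token.setTermBuffer(char[], int, int) from the 2.3 reuse API:

import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;

// Hypothetical tokenizer: returns the entire input as a single token.
public class WholeInputTokenizer extends Tokenizer {
  private boolean done = false;

  public WholeInputTokenizer(Reader input) {
    super(input);
  }

  public Token next(Token result) throws IOException {
    if (done)
      return null;
    done = true;
    result.clear();                 // producer's responsibility under the new contract
    StringBuffer sb = new StringBuffer();
    char[] chunk = new char[256];
    int n;
    while ((n = input.read(chunk)) != -1)
      sb.append(chunk, 0, n);
    result.setTermBuffer(sb.toString().toCharArray(), 0, sb.length());
    result.setStartOffset(0);
    result.setEndOffset(sb.length());
    return result;
  }
}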
@@ -92,6 +92,7 @@ public class StandardTokenizer extends Tokenizer {
       return null;
     }
 
+    result.clear();
     scanner.getText(result);
     final int start = scanner.yychar();
     result.setStartOffset(start);
@@ -1373,7 +1373,6 @@ final class DocumentsWriter {
       offsetEnd = offset-1;
       Token token;
       for(;;) {
-        localToken.clear();
         token = stream.next(localToken);
         if (token == null) break;
         position += (token.getPositionIncrement() - 1);