mirror of https://github.com/apache/lucene.git
LUCENE-1057: call clear when reusing token, change clear to only reset essential fields, re-add Token.clone()
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@596398 13f79535-47bb-0310-9956-ffa450edef68
parent 31f50ad41d
commit e9a5866777
@@ -361,14 +361,29 @@ public class Token implements Cloneable {
     return sb.toString();
   }
 
-  /** Reset all state for this token back to defaults. */
+  /** Resets the term text, payload, and positionIncrement to default.
+   * Other fields such as startOffset, endOffset and the token type are
+   * not reset since they are normally overwritten by the tokenizer. */
   public void clear() {
     payload = null;
     // Leave termBuffer to allow re-use
     termLength = 0;
     termText = null;
     positionIncrement = 1;
-    startOffset = endOffset = 0;
-    type = DEFAULT_TYPE;
+    // startOffset = endOffset = 0;
+    // type = DEFAULT_TYPE;
   }
+
+  public Object clone() {
+    try {
+      Token t = (Token)super.clone();
+      if (termBuffer != null) {
+        t.termBuffer = null;
+        t.setTermBuffer(termBuffer, 0, termLength);
+      }
+      return t;
+    } catch (CloneNotSupportedException e) {
+      throw new RuntimeException(e); // shouldn't happen
+    }
+  }
 }
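The re-added clone() deep-copies termBuffer, so a filter can hold a private copy of a token across next(Token) calls even though the reusable token's buffer is overwritten on each call. A minimal standalone sketch of that contract (not part of the commit; the values are made up):

    Token t = new Token();
    t.setTermBuffer("wifi".toCharArray(), 0, 4);

    Token copy = (Token) t.clone();              // deep copy of the term characters
    t.setTermBuffer("web".toCharArray(), 0, 3);  // overwrite the original buffer

    // copy still reads "wifi"; a shallow clone would see the overwritten characters
    String term = new String(copy.termBuffer(), 0, copy.termLength());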
@@ -1281,6 +1281,7 @@ final class DocumentsWriter {
         if (!field.isTokenized()) {          // un-tokenized field
           String stringValue = field.stringValue();
           Token token = localToken;
+          token.clear();
           token.setTermText(stringValue);
           token.setStartOffset(offset);
           token.setEndOffset(offset + stringValue.length());
@@ -1319,7 +1320,10 @@ final class DocumentsWriter {
           try {
             offsetEnd = offset-1;
             Token token;
-            while((token = stream.next(localToken)) != null) {
+            for(;;) {
+              localToken.clear();
+              token = stream.next(localToken);
+              if (token == null) break;
               position += (token.getPositionIncrement() - 1);
               addPosition(token);
               if (++length >= maxFieldLength) {
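These two DocumentsWriter changes implement the consumer side of the reuse contract: the caller owns a single Token, clears it before every next(Token) call, and uses the returned token only until the following call. A minimal sketch of the loop shape, assuming some already-open TokenStream named stream (illustrative names, not DocumentsWriter's actual surrounding code):

    Token localToken = new Token();        // one reusable instance per field
    Token token;
    for (;;) {
      localToken.clear();                  // reset payload, term and positionIncrement
      token = stream.next(localToken);     // producer may fill localToken or return its own token
      if (token == null) break;            // end of stream
      // contents are only guaranteed valid until the next call
      String term = new String(token.termBuffer(), 0, token.termLength());
    }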
@@ -17,22 +17,17 @@ package org.apache.lucene.index;
  * limitations under the License.
  */
 
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.SimpleAnalyzer;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.WhitespaceAnalyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.Field.TermVector;
-import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.document.*;
+import org.apache.lucene.analysis.*;
 import org.apache.lucene.search.Similarity;
 import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.LuceneTestCase;
 
-import java.io.Reader;
 import java.io.IOException;
+
+import java.util.Arrays;
+import java.io.Reader;
 
 public class TestDocumentWriter extends LuceneTestCase {
   private RAMDirectory dir;
@@ -131,6 +126,70 @@ public class TestDocumentWriter extends LuceneTestCase {
     assertEquals(502, termPositions.nextPosition());
   }
 
+  public void testTokenReuse() throws IOException {
+    Analyzer analyzer = new Analyzer() {
+      public TokenStream tokenStream(String fieldName, Reader reader) {
+        return new TokenFilter(new WhitespaceTokenizer(reader)) {
+          boolean first=true;
+          Token buffered;
+
+          public Token next() throws IOException {
+            return input.next();
+          }
+
+          public Token next(Token result) throws IOException {
+            if (buffered != null) {
+              Token t = buffered;
+              buffered=null;
+              return t;
+            }
+            Token t = input.next(result);
+            if (t==null) return null;
+            if (Character.isDigit(t.termBuffer()[0])) {
+              t.setPositionIncrement(t.termBuffer()[0] - '0');
+            }
+            if (first) {
+              // set payload on first position only
+              t.setPayload(new Payload(new byte[]{100}));
+              first = false;
+            }
+
+            // index a "synonym" for every token
+            buffered = (Token)t.clone();
+            buffered.setPayload(null);
+            buffered.setPositionIncrement(0);
+            buffered.setTermBuffer(new char[]{'b'}, 0, 1);
+
+            return t;
+          }
+        };
+      }
+    };
+
+    IndexWriter writer = new IndexWriter(dir, analyzer, true);
+
+    Document doc = new Document();
+    doc.add(new Field("f1", "a 5 a a", Field.Store.YES, Field.Index.TOKENIZED));
+
+    writer.addDocument(doc);
+    writer.flush();
+    SegmentInfo info = writer.newestSegment();
+    writer.close();
+    SegmentReader reader = SegmentReader.get(info);
+
+    TermPositions termPositions = reader.termPositions(new Term("f1", "a"));
+    assertTrue(termPositions.next());
+    int freq = termPositions.freq();
+    assertEquals(3, freq);
+    assertEquals(0, termPositions.nextPosition());
+    assertEquals(true, termPositions.isPayloadAvailable());
+    assertEquals(6, termPositions.nextPosition());
+    assertEquals(false, termPositions.isPayloadAvailable());
+    assertEquals(7, termPositions.nextPosition());
+    assertEquals(false, termPositions.isPayloadAvailable());
+  }
+
+
   public void testPreAnalyzedField() throws IOException {
     Similarity similarity = Similarity.getDefault();
     IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
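For reference, the asserted positions follow from the test data "a 5 a a"; this trace is implied by the test rather than spelled out in it:

    "a"  position 0; first token, so it carries the payload {100}; synonym "b" buffered at increment 0
    "5"  digit, so its increment becomes '5' - '0' = 5, landing at position 5 (with synonym "b")
    "a"  increment 1, position 6, no payload
    "a"  increment 1, position 7, no payload

Hence "a" occurs with freq 3 at positions 0, 6 and 7, and a payload is available only at position 0.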