mirror of https://github.com/apache/lucene.git
LUCENE-1057: call clear when reusing token, change clear to only reset essential fields, re-add Token.clone()
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@596398 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 31f50ad41d
commit e9a5866777
@@ -361,14 +361,29 @@ public class Token implements Cloneable {
     return sb.toString();
   }
 
-  /** Reset all state for this token back to defaults. */
+  /** Resets the term text, payload, and positionIncrement to default.
+   * Other fields such as startOffset, endOffset and the token type are
+   * not reset since they are normally overwritten by the tokenizer. */
   public void clear() {
     payload = null;
     // Leave termBuffer to allow re-use
     termLength = 0;
     termText = null;
     positionIncrement = 1;
-    startOffset = endOffset = 0;
-    type = DEFAULT_TYPE;
+    // startOffset = endOffset = 0;
+    // type = DEFAULT_TYPE;
+  }
+
+  public Object clone() {
+    try {
+      Token t = (Token)super.clone();
+      if (termBuffer != null) {
+        t.termBuffer = null;
+        t.setTermBuffer(termBuffer, 0, termLength);
+      }
+      return t;
+    } catch (CloneNotSupportedException e) {
+      throw new RuntimeException(e); // shouldn't happen
+    }
   }
 }
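For reference, the contract the hunk above establishes: clear() now resets only the fields a tokenizer would not overwrite anyway (term text/length, payload, positionIncrement), while the re-added clone() deep-copies the termBuffer so a cloned token survives later reuse of the original. A minimal usage sketch, hypothetical and not part of this commit, assuming the 2.3-era Token API used elsewhere in this diff (setTermBuffer, setStartOffset, clear, clone):

    import org.apache.lucene.analysis.Token;

    public class TokenClearCloneSketch {
      public static void main(String[] args) {
        Token t = new Token();  // assumes the no-arg reuse constructor
        t.setTermBuffer(new char[] {'a', 'b', 'c'}, 0, 3);
        t.setStartOffset(10);
        t.setEndOffset(13);
        t.setPositionIncrement(2);

        // clone() deep-copies the term buffer, so the copy is unaffected
        // when the original is recycled below.
        Token copy = (Token) t.clone();

        t.clear();
        System.out.println(t.getPositionIncrement()); // 1: reset to default
        System.out.println(t.startOffset());          // 10: offsets are NOT reset
        System.out.println(new String(copy.termBuffer(), 0, copy.termLength())); // "abc"
      }
    }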
@@ -1281,6 +1281,7 @@ final class DocumentsWriter {
         if (!field.isTokenized()) { // un-tokenized field
           String stringValue = field.stringValue();
           Token token = localToken;
+          token.clear();
           token.setTermText(stringValue);
           token.setStartOffset(offset);
           token.setEndOffset(offset + stringValue.length());
@@ -1319,7 +1320,10 @@ final class DocumentsWriter {
           try {
             offsetEnd = offset-1;
             Token token;
-            while((token = stream.next(localToken)) != null) {
+            for(;;) {
+              localToken.clear();
+              token = stream.next(localToken);
+              if (token == null) break;
               position += (token.getPositionIncrement() - 1);
               addPosition(token);
               if (++length >= maxFieldLength) {
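The two DocumentsWriter hunks above fix the calling convention for the token-reuse API: the consumer clears the shared Token before each next(Token) call, so producers can rely on a token with default positionIncrement and no stale payload. A sketch of the same pattern for an arbitrary TokenStream consumer, hypothetical and not part of this commit:

    import java.io.IOException;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;

    public class ReusingConsumer {
      public static int countTokens(TokenStream stream) throws IOException {
        final Token localToken = new Token();  // single instance, recycled every call
        int count = 0;
        for (;;) {
          localToken.clear();                  // the consumer's job as of this commit
          Token token = stream.next(localToken);
          if (token == null) break;            // end of stream
          // 'token' may be localToken itself or a different instance the
          // producer chose to return; only read it before the next call.
          count++;
        }
        return count;
      }
    }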
@@ -17,22 +17,17 @@ package org.apache.lucene.index;
  * limitations under the License.
  */
 
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.SimpleAnalyzer;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.WhitespaceAnalyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.document.*;
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
 import org.apache.lucene.document.Field.TermVector;
+import org.apache.lucene.document.Fieldable;
 import org.apache.lucene.search.Similarity;
 import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.LuceneTestCase;
 
-import java.io.Reader;
 import java.io.IOException;
-import java.util.Arrays;
+import java.io.Reader;
 
 public class TestDocumentWriter extends LuceneTestCase {
   private RAMDirectory dir;
@@ -131,6 +126,70 @@ public class TestDocumentWriter extends LuceneTestCase {
     assertEquals(502, termPositions.nextPosition());
   }
 
+  public void testTokenReuse() throws IOException {
+    Analyzer analyzer = new Analyzer() {
+      public TokenStream tokenStream(String fieldName, Reader reader) {
+        return new TokenFilter(new WhitespaceTokenizer(reader)) {
+          boolean first=true;
+          Token buffered;
+
+          public Token next() throws IOException {
+            return input.next();
+          }
+
+          public Token next(Token result) throws IOException {
+            if (buffered != null) {
+              Token t = buffered;
+              buffered=null;
+              return t;
+            }
+            Token t = input.next(result);
+            if (t==null) return null;
+            if (Character.isDigit(t.termBuffer()[0])) {
+              t.setPositionIncrement(t.termBuffer()[0] - '0');
+            }
+            if (first) {
+              // set payload on first position only
+              t.setPayload(new Payload(new byte[]{100}));
+              first = false;
+            }
+
+            // index a "synonym" for every token
+            buffered = (Token)t.clone();
+            buffered.setPayload(null);
+            buffered.setPositionIncrement(0);
+            buffered.setTermBuffer(new char[]{'b'}, 0, 1);
+
+            return t;
+          }
+        };
+      }
+    };
+
+    IndexWriter writer = new IndexWriter(dir, analyzer, true);
+
+    Document doc = new Document();
+    doc.add(new Field("f1", "a 5 a a", Field.Store.YES, Field.Index.TOKENIZED));
+
+    writer.addDocument(doc);
+    writer.flush();
+    SegmentInfo info = writer.newestSegment();
+    writer.close();
+    SegmentReader reader = SegmentReader.get(info);
+
+    TermPositions termPositions = reader.termPositions(new Term("f1", "a"));
+    assertTrue(termPositions.next());
+    int freq = termPositions.freq();
+    assertEquals(3, freq);
+    assertEquals(0, termPositions.nextPosition());
+    assertEquals(true, termPositions.isPayloadAvailable());
+    assertEquals(6, termPositions.nextPosition());
+    assertEquals(false, termPositions.isPayloadAvailable());
+    assertEquals(7, termPositions.nextPosition());
+    assertEquals(false, termPositions.isPayloadAvailable());
+  }
+
+
   public void testPreAnalyzedField() throws IOException {
     Similarity similarity = Similarity.getDefault();
     IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
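Tracing the test's expected values: WhitespaceTokenizer splits "a 5 a a" into "a", "5", "a", "a"; the filter gives the digit token "5" a position increment of '5' - '0' = 5 and buffers a zero-increment synonym "b" after every token. The positions therefore come out as "a" at 0 (carrying the payload set on the first token only), "5" at 0 + 5 = 5, then "a" at 6 and "a" at 7. Hence freq("a") == 3, positions 0/6/7, and a payload available only at position 0. The synonym is a clone() of the current token precisely so that recycling the shared result instance on the next next(Token) call cannot corrupt the buffered token, which is what this test exercises.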