LUCENE-1448: Add TokenStream.end() to perform end-of-stream operations. Fixes offset problems when multiple fields with the same name are added to a document.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@797715 13f79535-47bb-0310-9956-ffa450edef68
Michael Busch 2009-07-25 04:11:33 +00:00
parent e4b34462ee
commit c311d5730c
11 changed files with 347 additions and 12 deletions

CHANGES.txt View File

@@ -609,6 +609,11 @@ New features
CONSTANT_SCORE_AUTO_REWRITE tries to pick the most performant
constant-score rewrite method. (Mike McCandless)
34. LUCENE-1448: Added TokenStream.end(), to perform end-of-stream
operations. This is currently used to fix offset problems when
multiple fields with the same name are added to a document.
(Mike McCandless, Mark Miller, Michael Busch)
Optimizations
1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing

src/java/org/apache/lucene/analysis/Analyzer.java View File

@@ -24,6 +24,8 @@ import java.lang.reflect.Method;
import org.apache.lucene.util.CloseableThreadLocal;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.document.Fieldable;
/** An Analyzer builds TokenStreams, which analyze text. It thus represents a
* policy for extracting index terms from text.
* <p>
@@ -123,6 +125,24 @@ public abstract class Analyzer {
return 0;
}
/**
* Just like {@link #getPositionIncrementGap}, except for
* Token offsets instead. By default this returns 1 for
* tokenized fields, as if the fields were joined by
* an extra space character, and 0 for un-tokenized
* fields. This method is only called if the field
* produced at least one token for indexing.
*
@param field the field just indexed
* @return offset gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
*/
public int getOffsetGap(Fieldable field) {
if (field.isTokenized())
return 1;
else
return 0;
}
/** Frees persistent resources used by this Analyzer */
public void close() {
tokenStreams.close();
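
Example (not part of this commit): a minimal sketch of overriding the new hook in a custom Analyzer. The class name and the 10-character gap are illustrative assumptions only.

import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.document.Fieldable;

public class GappedAnalyzer extends Analyzer {
  public TokenStream tokenStream(String fieldName, Reader reader) {
    return new WhitespaceTokenizer(reader);
  }

  // Offsets of the next instance of a same-named field start 10
  // characters past the final offset of the previous instance,
  // instead of the default 1.
  public int getOffsetGap(Fieldable field) {
    return field.isTokenized() ? 10 : 0;
  }
}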

src/java/org/apache/lucene/analysis/CachingTokenFilter.java View File

@@ -36,6 +36,7 @@ import org.apache.lucene.util.AttributeSource;
public class CachingTokenFilter extends TokenFilter {
private List cache = null;
private Iterator iterator = null;
private AttributeSource.State finalState;
public CachingTokenFilter(TokenStream input) {
super(input);
@@ -69,6 +70,12 @@ public class CachingTokenFilter extends TokenFilter {
restoreState((AttributeSource.State) iterator.next());
return true;
}
public final void end() throws IOException {
if (finalState != null) {
restoreState(finalState);
}
}
public void reset() throws IOException {
if(cache != null) {
@@ -80,6 +87,9 @@ public class CachingTokenFilter extends TokenFilter {
while(input.incrementToken()) {
cache.add(captureState());
}
// capture final state
input.end();
finalState = captureState();
}
}
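
A minimal consumption sketch (not part of this commit) showing why the captured final state matters: the first pass fills the cache and records the state after input.end(); end() on the filter then restores it, so a stream consumed from the cache still reports the correct final offset.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class CachingEndExample {
  public static void main(String[] args) throws IOException {
    TokenStream stream = new CachingTokenFilter(
        new WhitespaceAnalyzer().tokenStream("f", new StringReader("abcd   ")));
    OffsetAttribute offsetAtt = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class);
    while (stream.incrementToken()) {
      // single token "abcd" at offsets [0,4)
    }
    stream.end();  // restores the captured final state
    System.out.println(offsetAtt.endOffset());  // 7: past the trailing spaces, not 4
  }
}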

src/java/org/apache/lucene/analysis/CharTokenizer.java View File

@@ -63,6 +63,7 @@ public abstract class CharTokenizer extends Tokenizer {
offset += dataLen;
dataLen = input.read(ioBuffer);
if (dataLen == -1) {
dataLen = 0; // so next offset += dataLen won't decrement offset
if (length > 0)
break;
else
@@ -93,6 +94,12 @@ public abstract class CharTokenizer extends Tokenizer {
offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length));
return true;
}
public final void end() {
// set final offset
int finalOffset = input.correctOffset(offset);
offsetAtt.setOffset(finalOffset, finalOffset);
}
/** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
* not be overridden. Delegates to the backwards compatibility layer. */
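
Side note on the dataLen = 0 fix above: input.read returns -1 at end-of-stream, and without the reset a subsequent call would run offset += dataLen with dataLen == -1, silently decrementing the accumulated offset, so end() would report a final offset one character short.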

src/java/org/apache/lucene/analysis/KeywordTokenizer.java View File

@@ -31,6 +31,7 @@ public class KeywordTokenizer extends Tokenizer {
private static final int DEFAULT_BUFFER_SIZE = 256;
private boolean done;
private int finalOffset;
private TermAttribute termAtt;
private OffsetAttribute offsetAtt;
@@ -59,11 +60,17 @@ public class KeywordTokenizer extends Tokenizer {
buffer = termAtt.resizeTermBuffer(1+buffer.length);
}
termAtt.setTermLength(upto);
offsetAtt.setOffset(input.correctOffset(0), input.correctOffset(upto));
finalOffset = input.correctOffset(upto);
offsetAtt.setOffset(input.correctOffset(0), finalOffset);
return true;
}
return false;
}
public final void end() {
// set final offset
offsetAtt.setOffset(finalOffset, finalOffset);
}
/** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
* not be overridden. Delegates to the backwards compatibility layer. */

src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java View File

@@ -149,6 +149,17 @@ public final class TeeSinkTokenFilter extends TokenFilter {
return false;
}
public final void end() throws IOException {
super.end();
AttributeSource.State finalState = captureState();
for (Iterator it = sinks.iterator(); it.hasNext(); ) {
final SinkTokenStream sink = (SinkTokenStream) ((WeakReference) it.next()).get();
if (sink != null) {
sink.setFinalState(finalState);
}
}
}
/**
* A filter that decides which {@link AttributeSource} states to store in the sink.
*/
@@ -162,6 +173,7 @@ public final class TeeSinkTokenFilter extends TokenFilter {
public static final class SinkTokenStream extends TokenStream {
private final List cachedStates = new LinkedList();
private AttributeSource.State finalState;
private Iterator it = null;
private SinkFilter filter;
@@ -181,6 +193,10 @@ public final class TeeSinkTokenFilter extends TokenFilter {
cachedStates.add(state);
}
private void setFinalState(AttributeSource.State finalState) {
this.finalState = finalState;
}
public final boolean incrementToken() throws IOException {
// lazy init the iterator
if (it == null) {
@@ -195,12 +211,18 @@ public final class TeeSinkTokenFilter extends TokenFilter {
restoreState(state);
return true;
}
public final void end() throws IOException {
if (finalState != null) {
restoreState(finalState);
}
}
public final void reset() {
it = cachedStates.iterator();
}
}
private static final SinkFilter ACCEPT_ALL_FILTER = new SinkFilter() {
public boolean accept(AttributeSource source) {
return true;

src/java/org/apache/lucene/analysis/TokenFilter.java View File

@@ -39,6 +39,13 @@ public abstract class TokenFilter extends TokenStream {
this.input = input;
}
/** Performs end-of-stream operations, if any, and then calls <code>end()</code> on the
* input TokenStream.<p/>
* <b>NOTE:</b> Be sure to call <code>super.end()</code> first when overriding this method.*/
public void end() throws IOException {
input.end();
}
/** Close the input TokenStream. */
public void close() throws IOException {
input.close();
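
A sketch (not part of this commit) of a filter doing work in end(); super.end() runs first so the input's end-of-stream state, e.g. the final offset, is already in place when the filter reads it. The class name is a hypothetical example.

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class FinalOffsetFilter extends TokenFilter {
  private final OffsetAttribute offsetAtt;
  private int finalOffset;

  public FinalOffsetFilter(TokenStream input) {
    super(input);
    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
  }

  public boolean incrementToken() throws IOException {
    return input.incrementToken();
  }

  public void end() throws IOException {
    super.end();  // let the input set its final offset first
    finalOffset = offsetAtt.endOffset();
  }

  public int getFinalOffset() {
    return finalOffset;
  }
}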

src/java/org/apache/lucene/analysis/TokenStream.java View File

@@ -268,6 +268,22 @@ public abstract class TokenStream extends AttributeSource {
tokenWrapper.delegate = token;
return true;
}
/**
* This method is called by the consumer after the last token has been consumed,
* i.e. after {@link #incrementToken()} returned <code>false</code> (using the new TokenStream API)
* or after {@link #next(Token)} or {@link #next()} returned <code>null</code> (old TokenStream API).
* <p/>
* This method can be used to perform any end-of-stream operations, such as setting the final
* offset of a stream. The final offset of a stream might differ from the offset of the last token,
* e.g. when one or more trailing whitespace characters follow the last token and a
* {@link WhitespaceTokenizer} was used.
*
* @throws IOException
*/
public void end() throws IOException {
// do nothing by default
}
/** Returns the next token in the stream, or null at EOS.
* When possible, the input Token should be used as the
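
With end() in place, the full consumer workflow under the new API looks like this (a sketch, not part of this commit):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class ConsumerWorkflow {
  public static void main(String[] args) throws IOException {
    TokenStream stream = new WhitespaceAnalyzer()
        .tokenStream("body", new StringReader("abcd "));
    TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      System.out.println(termAtt.term() + " [" + offsetAtt.startOffset()
          + "," + offsetAtt.endOffset() + ")");  // abcd [0,4)
    }
    stream.end();  // final offset is 5, past the trailing space
    System.out.println("final offset: " + offsetAtt.endOffset());
    stream.close();
  }
}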

src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java View File

@@ -183,6 +183,12 @@ public class StandardTokenizer extends Tokenizer {
posIncr++;
}
}
public final void end() {
// set final offset
int finalOffset = input.correctOffset(scanner.yychar() + scanner.yylength());
offsetAtt.setOffset(finalOffset, finalOffset);
}
/** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
* not be overridden. Delegates to the backwards compatibility layer. */

src/java/org/apache/lucene/index/DocInverterPerField.java View File

@@ -74,6 +74,8 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
// tokenized.
if (field.isIndexed() && doInvert) {
final boolean anyToken;
if (fieldState.length > 0)
fieldState.position += docState.analyzer.getPositionIncrementGap(fieldInfo.name);
@@ -95,6 +97,7 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
fieldState.offset += valueLength;
fieldState.length++;
fieldState.position++;
anyToken = valueLength > 0;
} else { // tokenized field
final TokenStream stream;
final TokenStream streamValue = field.tokenStreamValue();
@@ -124,6 +127,8 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
// reset the TokenStream to the first token
stream.reset();
final int startLength = fieldState.length;
// deprecated
final boolean allowMinus1Position = docState.allowMinus1Position;
@@ -183,12 +188,18 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
hasMoreTokens = stream.incrementToken();
}
fieldState.offset = offsetEnd+1;
// trigger streams to perform end-of-stream operations
stream.end();
fieldState.offset += offsetAttribute.endOffset();
anyToken = fieldState.length > startLength;
} finally {
stream.close();
}
}
if (anyToken)
fieldState.offset += docState.analyzer.getOffsetGap(field);
fieldState.boost *= field.getBoost();
}
}
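
Worked example of the two added lines: with WhitespaceAnalyzer and two instances of the value "abcd   " (four letters plus three trailing spaces, as in testEndOffsetPositionCharAnalyzer below), the first instance emits "abcd" at [0,4); stream.end() then sets the final offset to 7, fieldState.offset becomes 0 + 7 = 7, and getOffsetGap adds 1, so the second instance's "abcd" is recorded at [8,12) instead of overlapping the first.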

src/test/org/apache/lucene/index/TestIndexWriter.java View File

@@ -17,17 +17,26 @@ package org.apache.lucene.index;
* limitations under the License.
*/
import java.io.*;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
import java.util.Map;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.SinkTokenizer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TeeSinkTokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@@ -35,8 +44,6 @@ import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.IndexSearcher;
@@ -55,7 +62,6 @@ import org.apache.lucene.store.LockFactory;
import org.apache.lucene.store.MockRAMDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.SingleInstanceLockFactory;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;
@@ -4150,6 +4156,41 @@ public class TestIndexWriter extends LuceneTestCase
Field f = new Field("field", "abcd", Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
doc.add(f);
doc.add(f);
Field f2 = new Field("field", "", Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
doc.add(f2);
doc.add(f);
w.addDocument(doc);
w.close();
IndexReader r = IndexReader.open(dir);
TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
// Token "" occurred once
assertEquals(1, termOffsets.length);
assertEquals(8, termOffsets[0].getStartOffset());
assertEquals(8, termOffsets[0].getEndOffset());
// Token "abcd" occurred three times
termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(1);
assertEquals(3, termOffsets.length);
assertEquals(0, termOffsets[0].getStartOffset());
assertEquals(4, termOffsets[0].getEndOffset());
assertEquals(4, termOffsets[1].getStartOffset());
assertEquals(8, termOffsets[1].getEndOffset());
assertEquals(8, termOffsets[2].getStartOffset());
assertEquals(12, termOffsets[2].getEndOffset());
r.close();
dir.close();
}
// LUCENE-1442
public void testDoubleOffsetCounting2() throws Exception {
MockRAMDirectory dir = new MockRAMDirectory();
IndexWriter w = new IndexWriter(dir, new SimpleAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
Document doc = new Document();
Field f = new Field("field", "abcd", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
doc.add(f);
doc.add(f);
w.addDocument(doc);
w.close();
@@ -4158,12 +4199,195 @@ public class TestIndexWriter extends LuceneTestCase
assertEquals(2, termOffsets.length);
assertEquals(0, termOffsets[0].getStartOffset());
assertEquals(4, termOffsets[0].getEndOffset());
assertEquals(4, termOffsets[1].getStartOffset());
assertEquals(8, termOffsets[1].getEndOffset());
assertEquals(5, termOffsets[1].getStartOffset());
assertEquals(9, termOffsets[1].getEndOffset());
r.close();
dir.close();
}
// LUCENE-1448
public void testEndOffsetPositionCharAnalyzer() throws Exception {
MockRAMDirectory dir = new MockRAMDirectory();
IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
Document doc = new Document();
Field f = new Field("field", "abcd ", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
doc.add(f);
doc.add(f);
w.addDocument(doc);
w.close();
IndexReader r = IndexReader.open(dir);
TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
assertEquals(2, termOffsets.length);
assertEquals(0, termOffsets[0].getStartOffset());
assertEquals(4, termOffsets[0].getEndOffset());
assertEquals(8, termOffsets[1].getStartOffset());
assertEquals(12, termOffsets[1].getEndOffset());
r.close();
dir.close();
}
// LUCENE-1448
public void testEndOffsetPositionWithCachingTokenFilter() throws Exception {
MockRAMDirectory dir = new MockRAMDirectory();
Analyzer analyzer = new WhitespaceAnalyzer();
IndexWriter w = new IndexWriter(dir, analyzer, IndexWriter.MaxFieldLength.LIMITED);
Document doc = new Document();
TokenStream stream = new CachingTokenFilter(analyzer.tokenStream("field", new StringReader("abcd   ")));
Field f = new Field("field", stream, Field.TermVector.WITH_POSITIONS_OFFSETS);
doc.add(f);
doc.add(f);
w.addDocument(doc);
w.close();
IndexReader r = IndexReader.open(dir);
TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
assertEquals(2, termOffsets.length);
assertEquals(0, termOffsets[0].getStartOffset());
assertEquals(4, termOffsets[0].getEndOffset());
assertEquals(8, termOffsets[1].getStartOffset());
assertEquals(12, termOffsets[1].getEndOffset());
r.close();
dir.close();
}
// LUCENE-1448
public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
MockRAMDirectory dir = new MockRAMDirectory();
Analyzer analyzer = new WhitespaceAnalyzer();
IndexWriter w = new IndexWriter(dir, analyzer, IndexWriter.MaxFieldLength.LIMITED);
Document doc = new Document();
TeeSinkTokenFilter tee = new TeeSinkTokenFilter(analyzer.tokenStream("field", new StringReader("abcd   ")));
TokenStream sink = tee.newSinkTokenStream();
Field f1 = new Field("field", tee, Field.TermVector.WITH_POSITIONS_OFFSETS);
Field f2 = new Field("field", sink, Field.TermVector.WITH_POSITIONS_OFFSETS);
doc.add(f1);
doc.add(f2);
w.addDocument(doc);
w.close();
IndexReader r = IndexReader.open(dir);
TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
assertEquals(2, termOffsets.length);
assertEquals(0, termOffsets[0].getStartOffset());
assertEquals(4, termOffsets[0].getEndOffset());
assertEquals(8, termOffsets[1].getStartOffset());
assertEquals(12, termOffsets[1].getEndOffset());
r.close();
dir.close();
}
// LUCENE-1448
public void testEndOffsetPositionStopFilter() throws Exception {
MockRAMDirectory dir = new MockRAMDirectory();
IndexWriter w = new IndexWriter(dir, new StopAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
Document doc = new Document();
Field f = new Field("field", "abcd the", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
doc.add(f);
doc.add(f);
w.addDocument(doc);
w.close();
IndexReader r = IndexReader.open(dir);
TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
assertEquals(2, termOffsets.length);
assertEquals(0, termOffsets[0].getStartOffset());
assertEquals(4, termOffsets[0].getEndOffset());
assertEquals(9, termOffsets[1].getStartOffset());
assertEquals(13, termOffsets[1].getEndOffset());
r.close();
dir.close();
}
// LUCENE-1448
public void testEndOffsetPositionStandard() throws Exception {
MockRAMDirectory dir = new MockRAMDirectory();
IndexWriter w = new IndexWriter(dir, new StandardAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
Document doc = new Document();
Field f = new Field("field", "abcd the ", Field.Store.NO,
Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
Field f2 = new Field("field", "crunch man", Field.Store.NO,
Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
doc.add(f);
doc.add(f2);
w.addDocument(doc);
w.close();
IndexReader r = IndexReader.open(dir);
TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
assertEquals(1, termOffsets.length);
assertEquals(0, termOffsets[0].getStartOffset());
assertEquals(4, termOffsets[0].getEndOffset());
termOffsets = tpv.getOffsets(1);
assertEquals(11, termOffsets[0].getStartOffset());
assertEquals(17, termOffsets[0].getEndOffset());
termOffsets = tpv.getOffsets(2);
assertEquals(18, termOffsets[0].getStartOffset());
assertEquals(21, termOffsets[0].getEndOffset());
r.close();
dir.close();
}
// LUCENE-1448
public void testEndOffsetPositionStandardEmptyField() throws Exception {
MockRAMDirectory dir = new MockRAMDirectory();
IndexWriter w = new IndexWriter(dir, new StandardAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
Document doc = new Document();
Field f = new Field("field", "", Field.Store.NO,
Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
Field f2 = new Field("field", "crunch man", Field.Store.NO,
Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
doc.add(f);
doc.add(f2);
w.addDocument(doc);
w.close();
IndexReader r = IndexReader.open(dir);
TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
assertEquals(1, termOffsets.length);
assertEquals(0, termOffsets[0].getStartOffset());
assertEquals(6, termOffsets[0].getEndOffset());
termOffsets = tpv.getOffsets(1);
assertEquals(7, termOffsets[0].getStartOffset());
assertEquals(10, termOffsets[0].getEndOffset());
r.close();
dir.close();
}
// LUCENE-1448
public void testEndOffsetPositionStandardEmptyField2() throws Exception {
MockRAMDirectory dir = new MockRAMDirectory();
IndexWriter w = new IndexWriter(dir, new StandardAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
Document doc = new Document();
Field f = new Field("field", "abcd", Field.Store.NO,
Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
doc.add(f);
doc.add(new Field("field", "", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
Field f2 = new Field("field", "crunch", Field.Store.NO,
Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
doc.add(f2);
w.addDocument(doc);
w.close();
IndexReader r = IndexReader.open(dir);
TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
assertEquals(1, termOffsets.length);
assertEquals(0, termOffsets[0].getStartOffset());
assertEquals(4, termOffsets[0].getEndOffset());
termOffsets = tpv.getOffsets(1);
assertEquals(5, termOffsets[0].getStartOffset());
assertEquals(11, termOffsets[0].getEndOffset());
r.close();
dir.close();
}
// LUCENE-1468 -- make sure opening an IndexWriter with
// create=true does not remove non-index files