mirror of https://github.com/apache/lucene.git
LUCENE-1448: Add TokenStream.end() to perform end-of-stream operations. Fixes offset problems when multiple fields with the same name are added to a document.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@797715 13f79535-47bb-0310-9956-ffa450edef68
parent e4b34462ee
commit c311d5730c

CHANGES.txt
@@ -609,6 +609,11 @@ New features
    CONSTANT_SCORE_AUTO_REWRITE tries to pick the most performant
    constant-score rewrite method.  (Mike McCandless)

34. LUCENE-1448: Added TokenStream.end(), to perform end-of-stream
    operations.  This is currently used to fix offset problems when
    multiple fields with the same name are added to a document.
    (Mike McCandless, Mark Miller, Michael Busch)

Optimizations

 1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing

src/java/org/apache/lucene/analysis/Analyzer.java
@@ -24,6 +24,8 @@ import java.lang.reflect.Method;
import org.apache.lucene.util.CloseableThreadLocal;
import org.apache.lucene.store.AlreadyClosedException;

import org.apache.lucene.document.Fieldable;

/** An Analyzer builds TokenStreams, which analyze text.  It thus represents a
 *  policy for extracting index terms from text.
 *  <p>
@@ -123,6 +125,24 @@ public abstract class Analyzer {
    return 0;
  }

  /**
   * Just like {@link #getPositionIncrementGap}, except for
   * Token offsets instead.  By default this returns 1 for
   * tokenized fields, as if the fields were joined with an
   * extra space character, and 0 for un-tokenized fields.
   * This method is only called if the field produced at
   * least one token for indexing.
   *
   * @param field the field just indexed
   * @return offset gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
   */
  public int getOffsetGap(Fieldable field) {
    if (field.isTokenized())
      return 1;
    else
      return 0;
  }

  /** Frees persistent resources used by this Analyzer */
  public void close() {
    tokenStreams.close();
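The new hook is overridable. Below, a minimal sketch of an analyzer that widens the gap; the class name and the gap of 10 are hypothetical, not part of this commit:

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.document.Fieldable;

// Hypothetical analyzer: repeated values of the same field are kept
// ten characters apart in offset space instead of the default one.
public class WideGapAnalyzer extends Analyzer {
  public TokenStream tokenStream(String fieldName, Reader reader) {
    return new WhitespaceTokenizer(reader);
  }

  public int getOffsetGap(Fieldable field) {
    return field.isTokenized() ? 10 : 0; // mirror the tokenized/un-tokenized split above
  }
}

DocInverterPerField (further down in this commit) adds this value to fieldState.offset after each field instance, so it only matters when several Fieldables share a name.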

src/java/org/apache/lucene/analysis/CachingTokenFilter.java
@@ -36,6 +36,7 @@ import org.apache.lucene.util.AttributeSource;
public class CachingTokenFilter extends TokenFilter {
  private List cache = null;
  private Iterator iterator = null;
  private AttributeSource.State finalState;

  public CachingTokenFilter(TokenStream input) {
    super(input);
@@ -69,6 +70,12 @@ public class CachingTokenFilter extends TokenFilter {
    restoreState((AttributeSource.State) iterator.next());
    return true;
  }

  public final void end() throws IOException {
    if (finalState != null) {
      restoreState(finalState);
    }
  }

  public void reset() throws IOException {
    if(cache != null) {
@@ -80,6 +87,9 @@ public class CachingTokenFilter extends TokenFilter {
    while(input.incrementToken()) {
      cache.add(captureState());
    }
    // capture final state
    input.end();
    finalState = captureState();
  }

}
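The finalState plumbing above means the final offset survives a replay from the cache. A minimal consumer sketch, assuming the 2.9-era attribute API; the token source is arbitrary:

import java.io.StringReader;

import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class CachingEndDemo {
  public static void main(String[] args) throws Exception {
    // "abcd  " has two trailing spaces that never become a token.
    TokenStream ts = new WhitespaceAnalyzer()
        .tokenStream("f", new StringReader("abcd  "));
    CachingTokenFilter cached = new CachingTokenFilter(ts);
    OffsetAttribute off = (OffsetAttribute) cached.addAttribute(OffsetAttribute.class);

    // First pass fills the cache; fillCache() calls input.end() and captures finalState.
    while (cached.incrementToken()) {}
    cached.end();

    // Second pass replays from the cache; end() restores the captured state,
    // so the final offset (6, not 4) survives the replay.
    cached.reset();
    while (cached.incrementToken()) {}
    cached.end();
    System.out.println(off.endOffset()); // expected: 6
  }
}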

src/java/org/apache/lucene/analysis/CharTokenizer.java
@@ -63,6 +63,7 @@ public abstract class CharTokenizer extends Tokenizer {
        offset += dataLen;
        dataLen = input.read(ioBuffer);
        if (dataLen == -1) {
          dataLen = 0; // so next offset += dataLen won't decrement offset
          if (length > 0)
            break;
          else
@@ -93,6 +94,12 @@ public abstract class CharTokenizer extends Tokenizer {
    offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length));
    return true;
  }

  public final void end() {
    // set final offset
    int finalOffset = input.correctOffset(offset);
    offsetAtt.setOffset(finalOffset, finalOffset);
  }

  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
   *  not be overridden. Delegates to the backwards compatibility layer. */
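The effect of the new method on this tokenizer family, as a small runnable sketch (assumes the 2.9 attribute API; WhitespaceTokenizer is a CharTokenizer subclass):

import java.io.StringReader;

import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class FinalOffsetDemo {
  public static void main(String[] args) throws Exception {
    // "abcd " has one trailing space that never becomes a token.
    WhitespaceTokenizer tok = new WhitespaceTokenizer(new StringReader("abcd "));
    OffsetAttribute off = (OffsetAttribute) tok.addAttribute(OffsetAttribute.class);

    while (tok.incrementToken()) {}
    System.out.println(off.endOffset()); // 4: end of the last token, "abcd"

    tok.end();
    System.out.println(off.endOffset()); // 5: the trailing space is now accounted for
  }
}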

src/java/org/apache/lucene/analysis/KeywordTokenizer.java
@@ -31,6 +31,7 @@ public class KeywordTokenizer extends Tokenizer {
  private static final int DEFAULT_BUFFER_SIZE = 256;

  private boolean done;
  private int finalOffset;
  private TermAttribute termAtt;
  private OffsetAttribute offsetAtt;

@@ -59,11 +60,17 @@ public class KeywordTokenizer extends Tokenizer {
        buffer = termAtt.resizeTermBuffer(1+buffer.length);
      }
      termAtt.setTermLength(upto);
      offsetAtt.setOffset(input.correctOffset(0), input.correctOffset(upto));
      finalOffset = input.correctOffset(upto);
      offsetAtt.setOffset(input.correctOffset(0), finalOffset);
      return true;
    }
    return false;
  }

  public final void end() {
    // set final offset
    offsetAtt.setOffset(finalOffset, finalOffset);
  }

  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
   *  not be overridden. Delegates to the backwards compatibility layer. */

src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java
@@ -149,6 +149,17 @@ public final class TeeSinkTokenFilter extends TokenFilter {
    return false;
  }

  public final void end() throws IOException {
    super.end();
    AttributeSource.State finalState = captureState();
    for (Iterator it = sinks.iterator(); it.hasNext(); ) {
      final SinkTokenStream sink = (SinkTokenStream) ((WeakReference) it.next()).get();
      if (sink != null) {
        sink.setFinalState(finalState);
      }
    }
  }

  /**
   * A filter that decides which {@link AttributeSource} states to store in the sink.
   */
@@ -162,6 +173,7 @@ public final class TeeSinkTokenFilter extends TokenFilter {

  public static final class SinkTokenStream extends TokenStream {
    private final List cachedStates = new LinkedList();
    private AttributeSource.State finalState;
    private Iterator it = null;
    private SinkFilter filter;

@@ -181,6 +193,10 @@ public final class TeeSinkTokenFilter extends TokenFilter {
      cachedStates.add(state);
    }

    private void setFinalState(AttributeSource.State finalState) {
      this.finalState = finalState;
    }

    public final boolean incrementToken() throws IOException {
      // lazy init the iterator
      if (it == null) {
@@ -195,12 +211,18 @@ public final class TeeSinkTokenFilter extends TokenFilter {
      restoreState(state);
      return true;
    }

    public final void end() throws IOException {
      if (finalState != null) {
        restoreState(finalState);
      }
    }

    public final void reset() {
      it = cachedStates.iterator();
    }
  }

  private static final SinkFilter ACCEPT_ALL_FILTER = new SinkFilter() {
    public boolean accept(AttributeSource source) {
      return true;
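ACCEPT_ALL_FILTER is only the default; a sink can be selective while final-state propagation stays intact, since setFinalState() is called on every sink regardless of what it accepted. A hypothetical filter sketch (class name and length threshold are assumptions, not from this commit):

import org.apache.lucene.analysis.TeeSinkTokenFilter;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;

// Hypothetical SinkFilter: only terms longer than three characters are
// cached in the sink; sink.end() still restores the shared final state.
public class LongTermSinkFilter extends TeeSinkTokenFilter.SinkFilter {
  public boolean accept(AttributeSource source) {
    TermAttribute term = (TermAttribute) source.getAttribute(TermAttribute.class);
    return term.termLength() > 3;
  }
}

It would be wired in via tee.newSinkTokenStream(new LongTermSinkFilter()), assuming the filter-accepting overload of newSinkTokenStream.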

src/java/org/apache/lucene/analysis/TokenFilter.java
@@ -39,6 +39,13 @@ public abstract class TokenFilter extends TokenStream {
    this.input = input;
  }

  /** Performs end-of-stream operations, if any, and then calls <code>end()</code> on the
   * input TokenStream.<p/>
   * <b>NOTE:</b> Be sure to call <code>super.end()</code> first when overriding this method.*/
  public void end() throws IOException {
    input.end();
  }

  /** Close the input TokenStream. */
  public void close() throws IOException {
    input.close();
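The NOTE above is the key contract: a filter that touches end state must chain to the input first. A sketch of a filter that would need to override end() — the filter itself is hypothetical, not part of this commit:

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

// Hypothetical filter that re-bases offsets, e.g. for text extracted from
// a larger document starting at a known position.
public class OffsetShiftFilter extends TokenFilter {
  private final int shift;
  private final OffsetAttribute offsetAtt;

  public OffsetShiftFilter(TokenStream in, int shift) {
    super(in);
    this.shift = shift;
    this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
  }

  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) return false;
    offsetAtt.setOffset(offsetAtt.startOffset() + shift, offsetAtt.endOffset() + shift);
    return true;
  }

  public void end() throws IOException {
    super.end(); // let the input set its final offset first...
    // ...then shift that final offset the same way as regular tokens.
    offsetAtt.setOffset(offsetAtt.startOffset() + shift, offsetAtt.endOffset() + shift);
  }
}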

src/java/org/apache/lucene/analysis/TokenStream.java
@@ -268,6 +268,22 @@ public abstract class TokenStream extends AttributeSource {
    tokenWrapper.delegate = token;
    return true;
  }

  /**
   * This method is called by the consumer after the last token has been consumed,
   * i.e. after {@link #incrementToken()} returned <code>false</code> (using the new TokenStream API)
   * or after {@link #next(Token)} or {@link #next()} returned <code>null</code> (old TokenStream API).
   * <p/>
   * This method can be used to perform any end-of-stream operations, such as setting the final
   * offset of a stream. The final offset of a stream might differ from the offset of the last token,
   * e.g. when one or more trailing whitespace characters followed the last token and a
   * {@link WhitespaceTokenizer} was used.
   *
   * @throws IOException
   */
  public void end() throws IOException {
    // do nothing by default
  }
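On the consumer side the contract reads as a fixed sequence: exhaust the stream, then call end(), then close(). A minimal sketch (field name and analyzer are arbitrary choices, not mandated by the API):

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class ConsumerContractDemo {
  public static void main(String[] args) throws Exception {
    // "hello world  " is 13 characters, two of them trailing spaces.
    TokenStream stream = new WhitespaceAnalyzer()
        .tokenStream("body", new StringReader("hello world  "));
    OffsetAttribute off = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class);

    stream.reset();                      // 1. position before the first token
    while (stream.incrementToken()) {    // 2. consume every token
      System.out.println(off.startOffset() + "-" + off.endOffset());
    }
    stream.end();                        // 3. new in this commit: final offset is 13
    System.out.println("final: " + off.endOffset());
    stream.close();                      // 4. release resources
  }
}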

  /** Returns the next token in the stream, or null at EOS.
   *  When possible, the input Token should be used as the

src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
@@ -183,6 +183,12 @@ public class StandardTokenizer extends Tokenizer {
      posIncr++;
    }
  }

  public final void end() {
    // set final offset
    int finalOffset = input.correctOffset(scanner.yychar() + scanner.yylength());
    offsetAtt.setOffset(finalOffset, finalOffset);
  }

  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
   *  not be overridden. Delegates to the backwards compatibility layer. */

src/java/org/apache/lucene/index/DocInverterPerField.java
@@ -74,6 +74,8 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
    // tokenized.
    if (field.isIndexed() && doInvert) {

      final boolean anyToken;

      if (fieldState.length > 0)
        fieldState.position += docState.analyzer.getPositionIncrementGap(fieldInfo.name);

@@ -95,6 +97,7 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
        fieldState.offset += valueLength;
        fieldState.length++;
        fieldState.position++;
        anyToken = valueLength > 0;
      } else { // tokenized field
        final TokenStream stream;
        final TokenStream streamValue = field.tokenStreamValue();
@@ -124,6 +127,8 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
        // reset the TokenStream to the first token
        stream.reset();

        final int startLength = fieldState.length;

        // deprecated
        final boolean allowMinus1Position = docState.allowMinus1Position;

@@ -183,12 +188,18 @@ final class DocInverterPerField extends DocFieldConsumerPerField {

            hasMoreTokens = stream.incrementToken();
          }
          fieldState.offset = offsetEnd+1;
          // trigger streams to perform end-of-stream operations
          stream.end();

          fieldState.offset += offsetAttribute.endOffset();
          anyToken = fieldState.length > startLength;
        } finally {
          stream.close();
        }
      }

      if (anyToken)
        fieldState.offset += docState.analyzer.getOffsetGap(field);
      fieldState.boost *= field.getBoost();
    }
  }
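This hunk is the consumer-side core of the fix: the base offset for the next value of a same-named field is now the previous stream's final offset plus the analyzer's offset gap, instead of the last token's end plus one. A worked sketch of the arithmetic, assuming WhitespaceAnalyzer defaults and the three-trailing-space test value used in the tests below:

// Worked example: field value "abcd   " (three trailing spaces) added twice.
public class OffsetBookkeepingSketch {
  public static void main(String[] args) {
    int offset = 0;      // fieldState.offset before the second value
    offset += 7;         // stream.end(): final offset of "abcd   " is 7, not 4
    offset += 1;         // getOffsetGap(field) == 1 for a tokenized field
    System.out.println(offset); // 8 -- the second "abcd" is indexed at [8,12)
  }
}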

src/test/org/apache/lucene/index/TestIndexWriter.java
@@ -17,17 +17,26 @@ package org.apache.lucene.index;
 * limitations under the License.
 */

import java.io.*;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.SinkTokenizer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TeeSinkTokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@@ -35,8 +44,6 @@ import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.IndexSearcher;
@@ -55,7 +62,6 @@ import org.apache.lucene.store.LockFactory;
import org.apache.lucene.store.MockRAMDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.SingleInstanceLockFactory;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;
@@ -4150,6 +4156,41 @@ public class TestIndexWriter extends LuceneTestCase
    Field f = new Field("field", "abcd", Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f);
    Field f2 = new Field("field", "", Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f2);
    doc.add(f);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir);
    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);

    // Token "" occurred once
    assertEquals(1, termOffsets.length);
    assertEquals(8, termOffsets[0].getStartOffset());
    assertEquals(8, termOffsets[0].getEndOffset());

    // Token "abcd" occurred three times
    termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(1);
    assertEquals(3, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(4, termOffsets[1].getStartOffset());
    assertEquals(8, termOffsets[1].getEndOffset());
    assertEquals(8, termOffsets[2].getStartOffset());
    assertEquals(12, termOffsets[2].getEndOffset());
    r.close();
    dir.close();
  }

  // LUCENE-1442
  public void testDoubleOffsetCounting2() throws Exception {
    MockRAMDirectory dir = new MockRAMDirectory();
    IndexWriter w = new IndexWriter(dir, new SimpleAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
    Document doc = new Document();
    Field f = new Field("field", "abcd", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f);
    w.addDocument(doc);
    w.close();

@@ -4158,12 +4199,195 @@ public class TestIndexWriter extends LuceneTestCase
    assertEquals(2, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(4, termOffsets[1].getStartOffset());
    assertEquals(8, termOffsets[1].getEndOffset());
    assertEquals(5, termOffsets[1].getStartOffset());
    assertEquals(9, termOffsets[1].getEndOffset());
    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionCharAnalyzer() throws Exception {
    MockRAMDirectory dir = new MockRAMDirectory();
    IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
    Document doc = new Document();
    // value has three trailing spaces, so the expected offsets below work out
    Field f = new Field("field", "abcd   ", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir);
    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
    assertEquals(2, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(8, termOffsets[1].getStartOffset());
    assertEquals(12, termOffsets[1].getEndOffset());
    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionWithCachingTokenFilter() throws Exception {
    MockRAMDirectory dir = new MockRAMDirectory();
    Analyzer analyzer = new WhitespaceAnalyzer();
    IndexWriter w = new IndexWriter(dir, analyzer, IndexWriter.MaxFieldLength.LIMITED);
    Document doc = new Document();
    // value has three trailing spaces
    TokenStream stream = new CachingTokenFilter(analyzer.tokenStream("field", new StringReader("abcd   ")));
    Field f = new Field("field", stream, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir);
    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
    assertEquals(2, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(8, termOffsets[1].getStartOffset());
    assertEquals(12, termOffsets[1].getEndOffset());
    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
    MockRAMDirectory dir = new MockRAMDirectory();
    Analyzer analyzer = new WhitespaceAnalyzer();
    IndexWriter w = new IndexWriter(dir, analyzer, IndexWriter.MaxFieldLength.LIMITED);
    Document doc = new Document();
    // value has three trailing spaces
    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(analyzer.tokenStream("field", new StringReader("abcd   ")));
    TokenStream sink = tee.newSinkTokenStream();
    Field f1 = new Field("field", tee, Field.TermVector.WITH_POSITIONS_OFFSETS);
    Field f2 = new Field("field", sink, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f1);
    doc.add(f2);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir);
    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
    assertEquals(2, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(8, termOffsets[1].getStartOffset());
    assertEquals(12, termOffsets[1].getEndOffset());
    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionStopFilter() throws Exception {
    MockRAMDirectory dir = new MockRAMDirectory();
    IndexWriter w = new IndexWriter(dir, new StopAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
    Document doc = new Document();
    Field f = new Field("field", "abcd the", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir);
    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
    assertEquals(2, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(9, termOffsets[1].getStartOffset());
    assertEquals(13, termOffsets[1].getEndOffset());
    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionStandard() throws Exception {
    MockRAMDirectory dir = new MockRAMDirectory();
    IndexWriter w = new IndexWriter(dir, new StandardAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
    Document doc = new Document();
    // first value has two trailing spaces
    Field f = new Field("field", "abcd the  ", Field.Store.NO,
                        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    Field f2 = new Field("field", "crunch man", Field.Store.NO,
                         Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f2);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir);
    TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
    TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
    assertEquals(1, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    termOffsets = tpv.getOffsets(1);
    assertEquals(11, termOffsets[0].getStartOffset());
    assertEquals(17, termOffsets[0].getEndOffset());
    termOffsets = tpv.getOffsets(2);
    assertEquals(18, termOffsets[0].getStartOffset());
    assertEquals(21, termOffsets[0].getEndOffset());
    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionStandardEmptyField() throws Exception {
    MockRAMDirectory dir = new MockRAMDirectory();
    IndexWriter w = new IndexWriter(dir, new StandardAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
    Document doc = new Document();
    Field f = new Field("field", "", Field.Store.NO,
                        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    Field f2 = new Field("field", "crunch man", Field.Store.NO,
                         Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f2);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir);
    TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
    TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
    assertEquals(1, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(6, termOffsets[0].getEndOffset());
    termOffsets = tpv.getOffsets(1);
    assertEquals(7, termOffsets[0].getStartOffset());
    assertEquals(10, termOffsets[0].getEndOffset());
    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionStandardEmptyField2() throws Exception {
    MockRAMDirectory dir = new MockRAMDirectory();
    IndexWriter w = new IndexWriter(dir, new StandardAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
    Document doc = new Document();

    Field f = new Field("field", "abcd", Field.Store.NO,
                        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(new Field("field", "", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));

    Field f2 = new Field("field", "crunch", Field.Store.NO,
                         Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f2);

    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir);
    TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
    TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
    assertEquals(1, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    termOffsets = tpv.getOffsets(1);
    assertEquals(5, termOffsets[0].getStartOffset());
    assertEquals(11, termOffsets[0].getEndOffset());
    r.close();
    dir.close();
  }

  // LUCENE-1468 -- make sure opening an IndexWriter with
  // create=true does not remove non-index files