LUCENE-3684: add offsets to postings APIs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1231794 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2012-01-15 23:17:45 +00:00
parent 1e09d9eff6
commit 11f33ee521
91 changed files with 1610 additions and 703 deletions

View File

@ -226,6 +226,10 @@ Changes in backwards compatibility policy
* LUCENE-3640: Removed IndexSearcher.close(), because IndexSearcher no longer
takes a Directory and no longer "manages" IndexReaders, it is a no-op.
(Robert Muir)
* LUCENE-3684: Add offsets into DocsAndPositionsEnum, and a new
FieldInfo.IndexOptions: DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS. (Robert
Muir, Mike McCandless)
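
As an aside, a minimal sketch of consuming the new offsets API (illustration only, not code from this patch; it assumes a TermsEnum positioned on a term, plus imports for TermsEnum, DocsAndPositionsEnum, DocIdSetIterator and IOException):

    // Pass needsOffsets=true; the method returns null if offsets (or positions) were not indexed.
    static void dumpPositionsAndOffsets(TermsEnum termsEnum) throws IOException {
      DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, true);
      if (dpEnum == null) {
        return;
      }
      while (dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        final int freq = dpEnum.freq();
        for (int i = 0; i < freq; i++) {
          final int pos = dpEnum.nextPosition();
          // startOffset()/endOffset() are the new accessors; they return -1 when offsets are unavailable.
          System.out.println("doc=" + dpEnum.docID() + " pos=" + pos +
              " offset=" + dpEnum.startOffset() + "-" + dpEnum.endOffset());
        }
      }
    }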
Changes in Runtime Behavior

View File

@ -126,7 +126,7 @@ public class TokenSources {
private static boolean hasPositions(Terms vector) throws IOException {
final TermsEnum termsEnum = vector.iterator(null);
if (termsEnum.next() != null) {
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, false);
if (dpEnum != null) {
int pos = dpEnum.nextPosition();
if (pos >= 0) {
@ -219,22 +219,21 @@ public class TokenSources {
DocsAndPositionsEnum dpEnum = null;
while ((text = termsEnum.next()) != null) {
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
if (dpEnum == null || (!dpEnum.attributes().hasAttribute(OffsetAttribute.class))) {
dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
if (dpEnum == null) {
throw new IllegalArgumentException(
"Required TermVector Offset information was not found");
}
final String term = text.utf8ToString();
final OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
dpEnum.nextDoc();
final int freq = dpEnum.freq();
for(int posUpto=0;posUpto<freq;posUpto++) {
final int pos = dpEnum.nextPosition();
final Token token = new Token(term,
offsetAtt.startOffset(),
offsetAtt.endOffset());
dpEnum.startOffset(),
dpEnum.endOffset());
if (tokenPositionsGuaranteedContiguous && pos != -1) {
// We have positions stored and a guarantee that the token position
// information is contiguous

View File

@ -60,22 +60,23 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
BytesRef text;
DocsAndPositionsEnum dpEnum = null;
while((text = termsEnum.next()) != null) {
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
final boolean hasOffsets;
if (dpEnum == null) {
hasOffsets = false;
dpEnum = termsEnum.docsAndPositions(null, dpEnum, false);
} else {
hasOffsets = true;
}
dpEnum.nextDoc();
final int freq = dpEnum.freq();
final OffsetAttribute offsetAtt;
if (dpEnum.attributes().hasAttribute(OffsetAttribute.class)) {
offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
} else {
offsetAtt = null;
}
for (int j = 0; j < freq; j++) {
int pos = dpEnum.nextPosition();
Token token;
if (offsetAtt != null) {
if (hasOffsets) {
token = new Token(text.utf8ToString(),
offsetAtt.startOffset(),
offsetAtt.endOffset());
dpEnum.startOffset(),
dpEnum.endOffset());
} else {
token = new Token();
token.setEmpty().append(text.utf8ToString());

View File

@ -21,7 +21,6 @@ import java.util.Collections;
import java.util.LinkedList;
import java.util.Set;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
@ -101,29 +100,19 @@ public class FieldTermStack {
if (!termSet.contains(term)) {
continue;
}
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
if (dpEnum == null) {
// null snippet
return;
}
if (!dpEnum.attributes().hasAttribute(OffsetAttribute.class)) {
// null snippet
return;
}
dpEnum.nextDoc();
final OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
final int freq = dpEnum.freq();
for(int i = 0;i < freq;i++) {
final int pos = dpEnum.nextPosition();
if (pos == -1) {
// null snippet
return;
}
termList.add(new TermInfo(term, offsetAtt.startOffset(), offsetAtt.endOffset(), pos));
int pos = dpEnum.nextPosition();
termList.add(new TermInfo(term, dpEnum.startOffset(), dpEnum.endOffset(), pos));
}
}

View File

@ -953,7 +953,10 @@ public class MemoryIndex {
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) {
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) {
if (needsOffsets) {
return null;
}
if (reuse == null || !(reuse instanceof MemoryDocsAndPositionsEnum)) {
reuse = new MemoryDocsAndPositionsEnum();
}
@ -1065,6 +1068,16 @@ public class MemoryIndex {
return positions.get(posUpto++);
}
@Override
public int startOffset() {
return -1;
}
@Override
public int endOffset() {
return -1;
}
@Override
public boolean hasPayload() {
return false;

View File

@ -206,7 +206,7 @@ public class MemoryIndexTest extends BaseTokenStreamTestCase {
MemoryIndex memory = new MemoryIndex();
memory.addField("foo", "bar", analyzer);
IndexReader reader = memory.createSearcher().getIndexReader();
DocsAndPositionsEnum disi = reader.termPositionsEnum(null, "foo", new BytesRef("bar"));
DocsAndPositionsEnum disi = reader.termPositionsEnum(null, "foo", new BytesRef("bar"), false);
int docid = disi.docID();
assertTrue(docid == -1 || docid == DocIdSetIterator.NO_MORE_DOCS);
assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
@ -214,7 +214,7 @@ public class MemoryIndexTest extends BaseTokenStreamTestCase {
// now reuse and check again
TermsEnum te = reader.terms("foo").iterator(null);
assertTrue(te.seekExact(new BytesRef("bar"), true));
disi = te.docsAndPositions(null, disi);
disi = te.docsAndPositions(null, disi, false);
docid = disi.docID();
assertTrue(docid == -1 || docid == DocIdSetIterator.NO_MORE_DOCS);
assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);

View File

@ -697,16 +697,20 @@ public class BlockTermsReader extends FieldsProducer {
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
//System.out.println("BTR.d&p this=" + this);
decodeMetaData();
if (fieldInfo.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
if (fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
// Positions were not indexed:
return null;
} else {
DocsAndPositionsEnum dpe = postingsReader.docsAndPositions(fieldInfo, state, liveDocs, reuse);
//System.out.println(" return d&pe=" + dpe);
return dpe;
}
if (needsOffsets &&
fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) < 0) {
// Offsets were not indexed:
return null;
}
decodeMetaData();
return postingsReader.docsAndPositions(fieldInfo, state, liveDocs, reuse, needsOffsets);
}
@Override

View File

@ -881,13 +881,20 @@ public class BlockTreeTermsReader extends FieldsProducer {
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
if (fieldInfo.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
if (fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
// Positions were not indexed:
return null;
} else {
currentFrame.decodeMetaData();
return postingsReader.docsAndPositions(fieldInfo, currentFrame.termState, skipDocs, reuse);
}
if (needsOffsets &&
fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) < 0) {
// Offsets were not indexed:
return null;
}
currentFrame.decodeMetaData();
return postingsReader.docsAndPositions(fieldInfo, currentFrame.termState, skipDocs, reuse, needsOffsets);
}
private int getState() {
@ -2096,17 +2103,21 @@ public class BlockTreeTermsReader extends FieldsProducer {
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
assert !eof;
//System.out.println("BTR.d&p this=" + this);
if (fieldInfo.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
if (fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
// Positions were not indexed:
return null;
} else {
currentFrame.decodeMetaData();
DocsAndPositionsEnum dpe = postingsReader.docsAndPositions(fieldInfo, currentFrame.state, skipDocs, reuse);
//System.out.println(" return d&pe=" + dpe);
return dpe;
}
if (needsOffsets &&
fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) < 0) {
// Offsets were not indexed:
return null;
}
assert !eof;
currentFrame.decodeMetaData();
return postingsReader.docsAndPositions(fieldInfo, currentFrame.state, skipDocs, reuse, needsOffsets);
}
@Override

View File

@ -102,6 +102,16 @@ public final class MappingMultiDocsAndPositionsEnum extends DocsAndPositionsEnum
public int nextPosition() throws IOException {
return current.nextPosition();
}
@Override
public int startOffset() throws IOException {
return current.startOffset();
}
@Override
public int endOffset() throws IOException {
return current.endOffset();
}
@Override
public BytesRef getPayload() throws IOException {

View File

@ -44,12 +44,12 @@ public abstract class PostingsConsumer {
int docBase;
}
/** Add a new position & payload. A null payload means no
* payload; a non-null payload with zero length also
* means no payload. Caller may reuse the {@link
* BytesRef} for the payload between calls (method must
* fully consume the payload). */
public abstract void addPosition(int position, BytesRef payload) throws IOException;
/** Add a new position & payload, and start/end offset. A
* null payload means no payload; a non-null payload with
* zero length also means no payload. Caller may reuse
* the {@link BytesRef} for the payload between calls
* (method must fully consume the payload). */
public abstract void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException;
/** Called when we are done adding positions & payloads
* for each doc. Not called when the field omits term
@ -88,7 +88,7 @@ public abstract class PostingsConsumer {
df++;
totTF += freq;
}
} else {
} else if (mergeState.fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
final DocsAndPositionsEnum postingsEnum = (DocsAndPositionsEnum) postings;
while(true) {
final int doc = postingsEnum.nextDoc();
@ -107,7 +107,32 @@ public abstract class PostingsConsumer {
} else {
payload = null;
}
this.addPosition(position, payload);
this.addPosition(position, payload, -1, -1);
}
this.finishDoc();
df++;
}
} else {
assert mergeState.fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
final DocsAndPositionsEnum postingsEnum = (DocsAndPositionsEnum) postings;
while(true) {
final int doc = postingsEnum.nextDoc();
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
visitedDocs.set(doc);
final int freq = postingsEnum.freq();
this.startDoc(doc, freq);
totTF += freq;
for(int i=0;i<freq;i++) {
final int position = postingsEnum.nextPosition();
final BytesRef payload;
if (postingsEnum.hasPayload()) {
payload = postingsEnum.getPayload();
} else {
payload = null;
}
this.addPosition(position, payload, postingsEnum.startOffset(), postingsEnum.endOffset());
}
this.finishDoc();
df++;
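
A hedged illustration of the widened codec-side contract (hypothetical class, not part of this patch): addPosition now receives the offsets with each position, and they arrive as -1 when the field's IndexOptions do not include DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.

    // Hypothetical sketch; assumes org.apache.lucene.codecs.PostingsConsumer,
    // org.apache.lucene.util.BytesRef and java.io.IOException are imported.
    class OffsetCountingConsumer extends PostingsConsumer {
      long positions;
      long positionsWithOffsets;

      @Override
      public void startDoc(int docID, int termDocFreq) throws IOException {
        // per-document setup would go here
      }

      @Override
      public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
        positions++;
        // -1/-1 is passed when offsets were not indexed for this field
        if (startOffset != -1) {
          positionsWithOffsets++;
        }
      }

      @Override
      public void finishDoc() throws IOException {
        // per-document cleanup would go here
      }
    }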

View File

@ -55,7 +55,8 @@ public abstract class PostingsReaderBase implements Closeable {
/** Must fully consume state, since after this call that
* TermState may be reused. */
public abstract DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState state, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException;
public abstract DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState state, Bits skipDocs, DocsAndPositionsEnum reuse,
boolean needsOffsets) throws IOException;
public abstract void close() throws IOException;

View File

@ -20,7 +20,6 @@ package org.apache.lucene.codecs;
import java.io.Closeable;
import java.io.IOException;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
@ -185,7 +184,6 @@ public abstract class TermVectorsWriter implements Closeable {
String lastFieldName = null;
while((fieldName = fieldsEnum.next()) != null) {
final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldName);
assert lastFieldName == null || fieldName.compareTo(lastFieldName) > 0: "lastFieldName=" + lastFieldName + " fieldName=" + fieldName;
@ -200,79 +198,79 @@ public abstract class TermVectorsWriter implements Closeable {
if (numTerms == -1) {
throw new IllegalStateException("vector.getUniqueTermCount() must be implemented (it returned -1)");
}
final boolean positions;
OffsetAttribute offsetAtt;
final TermsEnum termsEnum = terms.iterator(null);
DocsAndPositionsEnum docsAndPositionsEnum = null;
if (termsEnum.next() != null) {
assert numTerms > 0;
docsAndPositionsEnum = termsEnum.docsAndPositions(null, null);
if (docsAndPositionsEnum != null) {
// has positions
positions = true;
if (docsAndPositionsEnum.attributes().hasAttribute(OffsetAttribute.class)) {
offsetAtt = docsAndPositionsEnum.attributes().getAttribute(OffsetAttribute.class);
} else {
offsetAtt = null;
}
} else {
positions = false;
offsetAtt = null;
}
} else {
// no terms in this field (hmm why is field present
// then...?)
assert numTerms == 0;
positions = false;
offsetAtt = null;
}
startField(fieldInfo, numTerms, positions, offsetAtt != null);
boolean startedField = false;
int termCount = 1;
// NOTE: this is tricky, because TermVectors allow
// indexing offsets but NOT positions. So we must
// lazily init the field by checking whether first
// position we see is -1 or not.
int termCount = 0;
while(termsEnum.next() != null) {
termCount++;
// NOTE: we already .next()'d the TermsEnum above, to
// peek @ first term to see if positions/offsets are
// present
while(true) {
final int freq = (int) termsEnum.totalTermFreq();
startTerm(termsEnum.term(), freq);
if (positions || offsetAtt != null) {
DocsAndPositionsEnum dp = termsEnum.docsAndPositions(null, docsAndPositionsEnum);
// TODO: add startOffset()/endOffset() to d&pEnum... this is insanity
if (dp != docsAndPositionsEnum) {
// producer didnt reuse, must re-pull attributes
if (offsetAtt != null) {
assert dp.attributes().hasAttribute(OffsetAttribute.class);
offsetAtt = dp.attributes().getAttribute(OffsetAttribute.class);
}
}
docsAndPositionsEnum = dp;
if (startedField) {
startTerm(termsEnum.term(), freq);
}
// TODO: we need a "query" API where we can ask (via
// flex API) what this term was indexed with...
// Both positions & offsets:
docsAndPositionsEnum = termsEnum.docsAndPositions(null, null, true);
final boolean hasOffsets;
boolean hasPositions = false;
if (docsAndPositionsEnum == null) {
// Fallback: no offsets
docsAndPositionsEnum = termsEnum.docsAndPositions(null, null, false);
hasOffsets = false;
} else {
hasOffsets = true;
}
if (docsAndPositionsEnum != null) {
final int docID = docsAndPositionsEnum.nextDoc();
assert docID != DocsEnum.NO_MORE_DOCS;
assert docsAndPositionsEnum.freq() == freq;
for(int posUpto=0; posUpto<freq; posUpto++) {
final int pos = docsAndPositionsEnum.nextPosition();
final int startOffset = offsetAtt == null ? -1 : offsetAtt.startOffset();
final int endOffset = offsetAtt == null ? -1 : offsetAtt.endOffset();
if (!startedField) {
assert numTerms > 0;
hasPositions = pos != -1;
startField(fieldInfo, numTerms, hasPositions, hasOffsets);
startTerm(termsEnum.term(), freq);
startedField = true;
}
final int startOffset;
final int endOffset;
if (hasOffsets) {
startOffset = docsAndPositionsEnum.startOffset();
endOffset = docsAndPositionsEnum.endOffset();
assert startOffset != -1;
assert endOffset != -1;
} else {
startOffset = -1;
endOffset = -1;
}
assert !hasPositions || pos >= 0;
addPosition(pos, startOffset, endOffset);
}
} else {
if (!startedField) {
assert numTerms > 0;
startField(fieldInfo, numTerms, hasPositions, hasOffsets);
startTerm(termsEnum.term(), freq);
startedField = true;
}
}
if (termsEnum.next() == null) {
assert termCount == numTerms;
break;
}
termCount++;
}
assert termCount == numTerms;
}
}
}

View File

@ -119,8 +119,7 @@ public abstract class TermsConsumer {
}
}
}
} else {
assert mergeState.fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
} else if (mergeState.fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
if (postingsEnum == null) {
postingsEnum = new MappingMultiDocsAndPositionsEnum();
}
@ -129,7 +128,41 @@ public abstract class TermsConsumer {
while((term = termsEnum.next()) != null) {
// We can pass null for liveDocs, because the
// mapping enum will skip the non-live docs:
postingsEnumIn = (MultiDocsAndPositionsEnum) termsEnum.docsAndPositions(null, postingsEnumIn);
postingsEnumIn = (MultiDocsAndPositionsEnum) termsEnum.docsAndPositions(null, postingsEnumIn, false);
assert postingsEnumIn != null;
postingsEnum.reset(postingsEnumIn);
// set PayloadProcessor
if (mergeState.payloadProcessorProvider != null) {
for (int i = 0; i < mergeState.readers.size(); i++) {
if (mergeState.dirPayloadProcessor[i] != null) {
mergeState.currentPayloadProcessor[i] = mergeState.dirPayloadProcessor[i].getProcessor(mergeState.fieldInfo.name, term);
}
}
}
final PostingsConsumer postingsConsumer = startTerm(term);
final TermStats stats = postingsConsumer.merge(mergeState, postingsEnum, visitedDocs);
if (stats.docFreq > 0) {
finishTerm(term, stats);
sumTotalTermFreq += stats.totalTermFreq;
sumDFsinceLastAbortCheck += stats.docFreq;
sumDocFreq += stats.docFreq;
if (sumDFsinceLastAbortCheck > 60000) {
mergeState.checkAbort.work(sumDFsinceLastAbortCheck/5.0);
sumDFsinceLastAbortCheck = 0;
}
}
}
} else {
assert mergeState.fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
if (postingsEnum == null) {
postingsEnum = new MappingMultiDocsAndPositionsEnum();
}
postingsEnum.setMergeState(mergeState);
MultiDocsAndPositionsEnum postingsEnumIn = null;
while((term = termsEnum.next()) != null) {
// We can pass null for liveDocs, because the
// mapping enum will skip the non-live docs:
postingsEnumIn = (MultiDocsAndPositionsEnum) termsEnum.docsAndPositions(null, postingsEnumIn, true);
assert postingsEnumIn != null;
postingsEnum.reset(postingsEnumIn);
// set PayloadProcessor
@ -154,7 +187,6 @@ public abstract class TermsConsumer {
}
}
}
finish(sumTotalTermFreq, sumDocFreq, visitedDocs.cardinality());
}
}

View File

@ -966,7 +966,12 @@ public class Lucene3xFields extends FieldsProducer {
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
if (needsOffsets) {
// Pre-4.0 indices never have offsets:
return null;
}
PreDocsAndPositionsEnum docsPosEnum;
if (fieldInfo.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
return null;
@ -1081,6 +1086,16 @@ public class Lucene3xFields extends FieldsProducer {
return pos.nextPosition();
}
@Override
public int startOffset() throws IOException {
return -1;
}
@Override
public int endOffset() throws IOException {
return -1;
}
@Override
public boolean hasPayload() {
assert docID != NO_MORE_DOCS;

View File

@ -215,7 +215,9 @@ public final class TermInfosReader {
TermInfo seekEnum(SegmentTermEnum enumerator, Term term, boolean useCache) throws IOException {
if (useCache) {
return seekEnum(enumerator, term, termsCache.get(new CloneableTerm(term)), useCache);
return seekEnum(enumerator, term,
termsCache.get(new CloneableTerm(term.deepCopyOf())),
useCache);
} else {
return seekEnum(enumerator, term, null, useCache);
}
@ -247,7 +249,8 @@ public final class TermInfosReader {
// of terms in order
if (tiOrd == null) {
if (useCache) {
termsCache.put(new CloneableTerm(term), new TermInfoAndOrd(ti, enumerator.position));
termsCache.put(new CloneableTerm(term.deepCopyOf()),
new TermInfoAndOrd(ti, enumerator.position));
}
} else {
assert sameTermInfo(ti, tiOrd, enumerator);
@ -279,7 +282,8 @@ public final class TermInfosReader {
ti = enumerator.termInfo;
if (tiOrd == null) {
if (useCache) {
termsCache.put(new CloneableTerm(term), new TermInfoAndOrd(ti, enumerator.position));
termsCache.put(new CloneableTerm(term.deepCopyOf()),
new TermInfoAndOrd(ti, enumerator.position));
}
} else {
assert sameTermInfo(ti, tiOrd, enumerator);

View File

@ -80,6 +80,8 @@ public class Lucene40FieldInfosReader extends FieldInfosReader {
} else {
throw new CorruptIndexException("Corrupt fieldinfos, OMIT_POSITIONS set but format=" + format + " (resource: " + input + ")");
}
} else if (format <= Lucene40FieldInfosWriter.FORMAT_FLEX && (bits & Lucene40FieldInfosWriter.STORE_OFFSETS_IN_POSTINGS) != 0) {
indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
} else {
indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
}

View File

@ -47,6 +47,7 @@ public class Lucene40FieldInfosWriter extends FieldInfosWriter {
static final byte IS_INDEXED = 0x1;
static final byte STORE_TERMVECTOR = 0x2;
static final byte STORE_OFFSETS_IN_POSTINGS = 0x4;
static final byte OMIT_NORMS = 0x10;
static final byte STORE_PAYLOADS = 0x20;
static final byte OMIT_TERM_FREQ_AND_POSITIONS = 0x40;
@ -68,6 +69,8 @@ public class Lucene40FieldInfosWriter extends FieldInfosWriter {
if (fi.storePayloads) bits |= STORE_PAYLOADS;
if (fi.indexOptions == IndexOptions.DOCS_ONLY) {
bits |= OMIT_TERM_FREQ_AND_POSITIONS;
} else if (fi.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) {
bits |= STORE_OFFSETS_IN_POSTINGS;
} else if (fi.indexOptions == IndexOptions.DOCS_AND_FREQS) {
bits |= OMIT_POSITIONS;
}

View File

@ -241,11 +241,15 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
}
@Override
public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState termState, Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
if (fieldInfo.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState termState, Bits liveDocs,
DocsAndPositionsEnum reuse, boolean needsOffsets)
throws IOException {
if (needsOffsets) {
// TODO: once we index offsets into postings fix this!
return null;
}
// TODO: refactor
if (fieldInfo.storePayloads) {
SegmentDocsAndPositionsAndPayloadsEnum docsEnum;
@ -366,7 +370,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
start = count; // buffer is consumed
return doc = skipTo(target, liveDocs);
return doc = skipTo(target);
}
private final int binarySearch(int hi, int low, int target, int[] docs) {
@ -448,7 +452,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
}
private final int skipTo(int target, Bits liveDocs) throws IOException {
private final int skipTo(int target) throws IOException {
if ((target - skipInterval) >= accum && limit >= skipMinimum) {
// There are enough docs in the posting to have
@ -841,6 +845,16 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
return position;
}
@Override
public int startOffset() throws IOException {
return -1;
}
@Override
public int endOffset() throws IOException {
return -1;
}
/** Returns the payload at this position, or null if no
* payload was indexed. */
@Override
@ -1074,6 +1088,16 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
return position;
}
@Override
public int startOffset() throws IOException {
return -1;
}
@Override
public int endOffset() throws IOException {
return -1;
}
/** Returns the payload at this position, or null if no
* payload was indexed. */
@Override

View File

@ -155,6 +155,10 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
*/
this.fieldInfo = fieldInfo;
indexOptions = fieldInfo.indexOptions;
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
throw new IllegalArgumentException("this codec cannot index offsets");
}
storePayloads = fieldInfo.storePayloads;
//System.out.println(" set init blockFreqStart=" + freqStart);
//System.out.println(" set init blockProxStart=" + proxStart);
@ -197,11 +201,19 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
/** Add a new position & payload */
@Override
public void addPosition(int position, BytesRef payload) throws IOException {
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
//if (DEBUG) System.out.println("SPW: addPos pos=" + position + " payload=" + (payload == null ? "null" : (payload.length + " bytes")) + " proxFP=" + proxOut.getFilePointer());
assert indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS: "invalid indexOptions: " + indexOptions;
assert proxOut != null;
// TODO: when we add offsets... often
// endOffset-startOffset will be constant or near
// constant for all docs (eg if the term wasn't stemmed
// then this will usually be the utf16 length of the
// term); would be nice to write that length once up
// front and then not encode endOffset for each
// position..
final int delta = position - lastPosition;
assert delta >= 0: "position=" + position + " lastPosition=" + lastPosition; // not quite right (if pos=0 is repeated twice we don't catch it)

View File

@ -24,7 +24,6 @@ import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocsAndPositionsEnum;
@ -518,21 +517,20 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
if (needsOffsets && !storeOffsets) {
return null;
}
if (!storePositions && !storeOffsets) {
return null;
}
TVDocsAndPositionsEnum docsAndPositionsEnum;
if (reuse != null) {
if (reuse != null && reuse instanceof TVDocsAndPositionsEnum) {
docsAndPositionsEnum = (TVDocsAndPositionsEnum) reuse;
if (docsAndPositionsEnum.canReuse(storeOffsets)) {
docsAndPositionsEnum = (TVDocsAndPositionsEnum) reuse;
} else {
docsAndPositionsEnum = new TVDocsAndPositionsEnum(storeOffsets);
}
} else {
docsAndPositionsEnum = new TVDocsAndPositionsEnum(storeOffsets);
docsAndPositionsEnum = new TVDocsAndPositionsEnum();
}
docsAndPositionsEnum.reset(liveDocs, positions, startOffsets, endOffsets);
return docsAndPositionsEnum;
@ -592,7 +590,6 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
}
private static class TVDocsAndPositionsEnum extends DocsAndPositionsEnum {
private final OffsetAttribute offsetAtt;
private boolean didNext;
private int doc = -1;
private int nextPos;
@ -601,18 +598,6 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
private int[] startOffsets;
private int[] endOffsets;
public TVDocsAndPositionsEnum(boolean storeOffsets) {
if (storeOffsets) {
offsetAtt = attributes().addAttribute(OffsetAttribute.class);
} else {
offsetAtt = null;
}
}
public boolean canReuse(boolean storeOffsets) {
return storeOffsets == (offsetAtt != null);
}
@Override
public int freq() {
if (positions != null) {
@ -651,7 +636,6 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
this.liveDocs = liveDocs;
this.positions = positions;
this.startOffsets = startOffsets;
assert (offsetAtt != null) == (startOffsets != null);
this.endOffsets = endOffsets;
this.doc = -1;
didNext = false;
@ -673,10 +657,6 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
assert (positions != null && nextPos < positions.length) ||
startOffsets != null && nextPos < startOffsets.length;
if (startOffsets != null) {
offsetAtt.setOffset(startOffsets[nextPos],
endOffsets[nextPos]);
}
if (positions != null) {
return positions[nextPos++];
} else {
@ -684,6 +664,18 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
return -1;
}
}
@Override
public int startOffset() {
assert startOffsets != null;
return startOffsets[nextPos-1];
}
@Override
public int endOffset() {
assert endOffsets != null;
return endOffsets[nextPos-1];
}
}
@Override

View File

@ -131,7 +131,7 @@ public class MemoryPostingsFormat extends PostingsFormat {
}
@Override
public void addPosition(int pos, BytesRef payload) throws IOException {
public void addPosition(int pos, BytesRef payload, int startOffset, int endOffset) throws IOException {
assert payload == null || field.storePayloads;
if (VERBOSE) System.out.println(" addPos pos=" + pos + " payload=" + payload);
@ -249,6 +249,9 @@ public class MemoryPostingsFormat extends PostingsFormat {
return new FieldsConsumer() {
@Override
public TermsConsumer addField(FieldInfo field) {
if (field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
throw new IllegalArgumentException("this codec cannot index offsets");
}
if (VERBOSE) System.out.println("\naddField field=" + field.name);
return new TermsWriter(out, field);
}
@ -328,7 +331,7 @@ public class MemoryPostingsFormat extends PostingsFormat {
assert freq > 0;
}
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
// Skip positions
for(int posUpto=0;posUpto<freq;posUpto++) {
if (!storePayloads) {
@ -500,6 +503,16 @@ public class MemoryPostingsFormat extends PostingsFormat {
return pos;
}
@Override
public int startOffset() {
return -1;
}
@Override
public int endOffset() {
return -1;
}
@Override
public BytesRef getPayload() {
payloadRetrieved = true;
@ -618,8 +631,14 @@ public class MemoryPostingsFormat extends PostingsFormat {
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
if (field.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
if (needsOffsets) {
// Not until we can index offsets...
return null;
}
if (field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
return null;
}
decodeMetaData();

View File

@ -215,10 +215,8 @@ public class PulsingPostingsReader extends PostingsReaderBase {
}
@Override
public DocsAndPositionsEnum docsAndPositions(FieldInfo field, BlockTermState _termState, Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
if (field.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
return null;
}
public DocsAndPositionsEnum docsAndPositions(FieldInfo field, BlockTermState _termState, Bits liveDocs, DocsAndPositionsEnum reuse,
boolean needsOffsets) throws IOException {
//System.out.println("D&P: field=" + field.name);
final PulsingTermState termState = (PulsingTermState) _termState;
@ -245,11 +243,12 @@ public class PulsingPostingsReader extends PostingsReaderBase {
return postings.reset(liveDocs, termState);
} else {
if (reuse instanceof PulsingDocsAndPositionsEnum) {
DocsAndPositionsEnum wrapped = wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, liveDocs, (DocsAndPositionsEnum) getOther(reuse));
DocsAndPositionsEnum wrapped = wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, liveDocs, (DocsAndPositionsEnum) getOther(reuse),
needsOffsets);
setOther(wrapped, reuse); // wrapped.other = reuse
return wrapped;
} else {
return wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, liveDocs, reuse);
return wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, liveDocs, reuse, needsOffsets);
}
}
}
@ -486,6 +485,16 @@ public class PulsingPostingsReader extends PostingsReaderBase {
return position;
}
@Override
public int startOffset() {
return -1;
}
@Override
public int endOffset() {
return -1;
}
private void skipPositions() throws IOException {
while(posPending != 0) {
nextPosition();

View File

@ -115,6 +115,9 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
@Override
public void setField(FieldInfo fieldInfo) {
this.indexOptions = fieldInfo.indexOptions;
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
throw new IllegalArgumentException("this codec cannot index offsets: " + indexOptions);
}
if (DEBUG) System.out.println("PW field=" + fieldInfo.name + " indexOptions=" + indexOptions);
storePayloads = fieldInfo.storePayloads;
wrappedPostingsWriter.setField(fieldInfo);
@ -165,7 +168,7 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
}
@Override
public void addPosition(int position, BytesRef payload) throws IOException {
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
if (DEBUG) System.out.println("PW pos=" + position + " payload=" + (payload == null ? "null" : payload.length + " bytes"));
if (pendingCount == pending.length) {
@ -175,7 +178,7 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
if (pendingCount == -1) {
// We've already seen too many docs for this term --
// just forward to our fallback writer
wrappedPostingsWriter.addPosition(position, payload);
wrappedPostingsWriter.addPosition(position, payload, -1, -1);
} else {
// buffer up
final Position pos = pending[pendingCount++];
@ -360,7 +363,7 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
wrappedPostingsWriter.startTerm();
// Flush all buffered docs
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
Position doc = null;
for(Position pos : pending) {
if (doc == null) {
@ -376,7 +379,7 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
wrappedPostingsWriter.startDoc(doc.docID, doc.termFreq);
}
if (DEBUG) System.out.println("PW: wrapped.addPos pos=" + pos.pos);
wrappedPostingsWriter.addPosition(pos.pos, pos.payload);
wrappedPostingsWriter.addPosition(pos.pos, pos.payload, -1, -1);
}
//wrappedPostingsWriter.finishDoc();
} else {

View File

@ -294,7 +294,18 @@ public class SepPostingsReader extends PostingsReaderBase {
}
@Override
public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState _termState, Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState _termState, Bits liveDocs,
DocsAndPositionsEnum reuse, boolean needsOffsets)
throws IOException {
if (fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
return null;
}
if (needsOffsets) {
return null;
}
assert fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
final SepTermState termState = (SepTermState) _termState;
SepDocsAndPositionsEnum postingsEnum;
@ -713,6 +724,16 @@ public class SepPostingsReader extends PostingsReaderBase {
return position;
}
@Override
public int startOffset() {
return -1;
}
@Override
public int endOffset() {
return -1;
}
private BytesRef payload;
@Override

View File

@ -188,6 +188,9 @@ public final class SepPostingsWriter extends PostingsWriterBase {
public void setField(FieldInfo fieldInfo) {
this.fieldInfo = fieldInfo;
this.indexOptions = fieldInfo.indexOptions;
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
throw new IllegalArgumentException("this codec cannot index offsets");
}
skipListWriter.setIndexOptions(indexOptions);
storePayloads = indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS && fieldInfo.storePayloads;
}
@ -222,7 +225,7 @@ public final class SepPostingsWriter extends PostingsWriterBase {
/** Add a new position & payload */
@Override
public void addPosition(int position, BytesRef payload) throws IOException {
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
assert indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
final int delta = position - lastPosition;

View File

@ -103,7 +103,7 @@ public class SimpleTextFieldInfosReader extends FieldInfosReader {
IndexOptions indexOptions = IndexOptions.valueOf(readString(INDEXOPTIONS.length, scratch));
hasVectors |= storeTermVector;
hasProx |= isIndexed && indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
hasProx |= isIndexed && indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
hasFreq |= isIndexed && indexOptions != IndexOptions.DOCS_ONLY;
infos[i] = new FieldInfo(name, isIndexed, fieldNumber, storeTermVector,

View File

@ -62,7 +62,7 @@ public class SimpleTextFieldInfosWriter extends FieldInfosWriter {
SimpleTextUtil.writeNewline(out);
for (FieldInfo fi : infos) {
assert fi.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS || !fi.storePayloads;
assert fi.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !fi.storePayloads;
SimpleTextUtil.write(out, NAME);
SimpleTextUtil.write(out, fi.name, scratch);

View File

@ -50,13 +50,15 @@ class SimpleTextFieldsReader extends FieldsProducer {
private final IndexInput in;
private final FieldInfos fieldInfos;
final static BytesRef END = SimpleTextFieldsWriter.END;
final static BytesRef FIELD = SimpleTextFieldsWriter.FIELD;
final static BytesRef TERM = SimpleTextFieldsWriter.TERM;
final static BytesRef DOC = SimpleTextFieldsWriter.DOC;
final static BytesRef FREQ = SimpleTextFieldsWriter.FREQ;
final static BytesRef POS = SimpleTextFieldsWriter.POS;
final static BytesRef PAYLOAD = SimpleTextFieldsWriter.PAYLOAD;
final static BytesRef END = SimpleTextFieldsWriter.END;
final static BytesRef FIELD = SimpleTextFieldsWriter.FIELD;
final static BytesRef TERM = SimpleTextFieldsWriter.TERM;
final static BytesRef DOC = SimpleTextFieldsWriter.DOC;
final static BytesRef FREQ = SimpleTextFieldsWriter.FREQ;
final static BytesRef POS = SimpleTextFieldsWriter.POS;
final static BytesRef START_OFFSET = SimpleTextFieldsWriter.START_OFFSET;
final static BytesRef END_OFFSET = SimpleTextFieldsWriter.END_OFFSET;
final static BytesRef PAYLOAD = SimpleTextFieldsWriter.PAYLOAD;
public SimpleTextFieldsReader(SegmentReadState state) throws IOException {
in = state.dir.openInput(SimpleTextPostingsFormat.getPostingsFileName(state.segmentInfo.name, state.segmentSuffix), state.context);
@ -204,8 +206,16 @@ class SimpleTextFieldsReader extends FieldsProducer {
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
if (indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
// Positions were not indexed
return null;
}
if (needsOffsets &&
indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) < 0) {
// Offsets were not indexed
return null;
}
@ -215,7 +225,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
} else {
docsAndPositionsEnum = new SimpleTextDocsAndPositionsEnum();
}
return docsAndPositionsEnum.reset(docsStart, liveDocs);
return docsAndPositionsEnum.reset(docsStart, liveDocs, indexOptions);
}
@Override
@ -289,6 +299,10 @@ class SimpleTextFieldsReader extends FieldsProducer {
termFreq = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
} else if (StringHelper.startsWith(scratch, POS)) {
// skip termFreq++;
} else if (StringHelper.startsWith(scratch, START_OFFSET)) {
// skip
} else if (StringHelper.startsWith(scratch, END_OFFSET)) {
// skip
} else if (StringHelper.startsWith(scratch, PAYLOAD)) {
// skip
} else {
@ -325,6 +339,10 @@ class SimpleTextFieldsReader extends FieldsProducer {
private final CharsRef scratchUTF16_2 = new CharsRef(10);
private BytesRef payload;
private long nextDocStart;
private boolean readOffsets;
private boolean readPositions;
private int startOffset = -1;
private int endOffset = -1;
public SimpleTextDocsAndPositionsEnum() {
this.inStart = SimpleTextFieldsReader.this.in;
@ -335,10 +353,12 @@ class SimpleTextFieldsReader extends FieldsProducer {
return in == inStart;
}
public SimpleTextDocsAndPositionsEnum reset(long fp, Bits liveDocs) {
public SimpleTextDocsAndPositionsEnum reset(long fp, Bits liveDocs, IndexOptions indexOptions) {
this.liveDocs = liveDocs;
nextDocStart = fp;
docID = -1;
readPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
readOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
return this;
}
@ -360,6 +380,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
while(true) {
final long lineStart = in.getFilePointer();
SimpleTextUtil.readLine(in, scratch);
//System.out.println("NEXT DOC: " + scratch.utf8ToString());
if (StringHelper.startsWith(scratch, DOC)) {
if (!first && (liveDocs == null || liveDocs.get(docID))) {
nextDocStart = lineStart;
@ -376,6 +397,10 @@ class SimpleTextFieldsReader extends FieldsProducer {
posStart = in.getFilePointer();
} else if (StringHelper.startsWith(scratch, POS)) {
// skip
} else if (StringHelper.startsWith(scratch, START_OFFSET)) {
// skip
} else if (StringHelper.startsWith(scratch, END_OFFSET)) {
// skip
} else if (StringHelper.startsWith(scratch, PAYLOAD)) {
// skip
} else {
@ -399,10 +424,27 @@ class SimpleTextFieldsReader extends FieldsProducer {
@Override
public int nextPosition() throws IOException {
SimpleTextUtil.readLine(in, scratch);
assert StringHelper.startsWith(scratch, POS): "got line=" + scratch.utf8ToString();
UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+POS.length, scratch.length-POS.length, scratchUTF16_2);
final int pos = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
final int pos;
if (readPositions) {
SimpleTextUtil.readLine(in, scratch);
assert StringHelper.startsWith(scratch, POS): "got line=" + scratch.utf8ToString();
UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+POS.length, scratch.length-POS.length, scratchUTF16_2);
pos = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
} else {
pos = -1;
}
if (readOffsets) {
SimpleTextUtil.readLine(in, scratch);
assert StringHelper.startsWith(scratch, START_OFFSET): "got line=" + scratch.utf8ToString();
UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+START_OFFSET.length, scratch.length-START_OFFSET.length, scratchUTF16_2);
startOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
SimpleTextUtil.readLine(in, scratch);
assert StringHelper.startsWith(scratch, END_OFFSET): "got line=" + scratch.utf8ToString();
UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+END_OFFSET.length, scratch.length-END_OFFSET.length, scratchUTF16_2);
endOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
}
final long fp = in.getFilePointer();
SimpleTextUtil.readLine(in, scratch);
if (StringHelper.startsWith(scratch, PAYLOAD)) {
@ -420,6 +462,16 @@ class SimpleTextFieldsReader extends FieldsProducer {
return pos;
}
@Override
public int startOffset() throws IOException {
return startOffset;
}
@Override
public int endOffset() throws IOException {
return endOffset;
}
@Override
public BytesRef getPayload() {
// Some tests rely on only being able to retrieve the

View File

@ -35,13 +35,15 @@ class SimpleTextFieldsWriter extends FieldsConsumer {
private final IndexOutput out;
private final BytesRef scratch = new BytesRef(10);
final static BytesRef END = new BytesRef("END");
final static BytesRef FIELD = new BytesRef("field ");
final static BytesRef TERM = new BytesRef(" term ");
final static BytesRef DOC = new BytesRef(" doc ");
final static BytesRef FREQ = new BytesRef(" freq ");
final static BytesRef POS = new BytesRef(" pos ");
final static BytesRef PAYLOAD = new BytesRef(" payload ");
final static BytesRef END = new BytesRef("END");
final static BytesRef FIELD = new BytesRef("field ");
final static BytesRef TERM = new BytesRef(" term ");
final static BytesRef DOC = new BytesRef(" doc ");
final static BytesRef FREQ = new BytesRef(" freq ");
final static BytesRef POS = new BytesRef(" pos ");
final static BytesRef START_OFFSET = new BytesRef(" startOffset ");
final static BytesRef END_OFFSET = new BytesRef(" endOffset ");
final static BytesRef PAYLOAD = new BytesRef(" payload ");
public SimpleTextFieldsWriter(SegmentWriteState state) throws IOException {
final String fileName = SimpleTextPostingsFormat.getPostingsFileName(state.segmentName, state.segmentSuffix);
@ -97,10 +99,19 @@ class SimpleTextFieldsWriter extends FieldsConsumer {
private class SimpleTextPostingsWriter extends PostingsConsumer {
private BytesRef term;
private boolean wroteTerm;
private IndexOptions indexOptions;
private final IndexOptions indexOptions;
private final boolean writePositions;
private final boolean writeOffsets;
// for assert:
private int lastEndOffset = -1;
public SimpleTextPostingsWriter(FieldInfo field) {
this.indexOptions = field.indexOptions;
writePositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
writeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
//System.out.println("writeOffsets=" + writeOffsets);
//System.out.println("writePos=" + writePositions);
}
@Override
@ -121,10 +132,10 @@ class SimpleTextFieldsWriter extends FieldsConsumer {
write(Integer.toString(termDocFreq));
newline();
}
lastEndOffset = -1;
}
public PostingsConsumer reset(BytesRef term) {
this.term = term;
wroteTerm = false;
@ -132,10 +143,25 @@ class SimpleTextFieldsWriter extends FieldsConsumer {
}
@Override
public void addPosition(int position, BytesRef payload) throws IOException {
write(POS);
write(Integer.toString(position));
newline();
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
if (writePositions) {
write(POS);
write(Integer.toString(position));
newline();
}
if (writeOffsets) {
assert endOffset >= startOffset;
assert startOffset >= lastEndOffset: "startOffset=" + startOffset + " lastEndOffset=" + lastEndOffset;
lastEndOffset = endOffset;
write(START_OFFSET);
write(Integer.toString(startOffset));
newline();
write(END_OFFSET);
write(Integer.toString(endOffset));
newline();
}
if (payload != null && payload.length > 0) {
assert payload.length != 0;
write(PAYLOAD);
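
For illustration (approximate whitespace, not an excerpt from the patch), a posting written by this text format for a field indexed with offsets now carries startOffset/endOffset lines after each position:

    field body
      term foo
        doc 0
          freq 2
          pos 3
          startOffset 14
          endOffset 17
          pos 11
          startOffset 52
          endOffset 55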

View File

@ -38,7 +38,7 @@ public class SimpleTextTermVectorsFormat extends TermVectorsFormat {
@Override
public TermVectorsReader vectorsReader(Directory directory, SegmentInfo segmentInfo, FieldInfos fieldInfos, IOContext context) throws IOException {
return new SimpleTextTermVectorsReader(directory, segmentInfo, fieldInfos, context);
return new SimpleTextTermVectorsReader(directory, segmentInfo, context);
}
@Override

View File

@ -26,11 +26,9 @@ import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames;
@ -63,7 +61,7 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
private BytesRef scratch = new BytesRef();
private CharsRef scratchUTF16 = new CharsRef();
public SimpleTextTermVectorsReader(Directory directory, SegmentInfo si, FieldInfos fieldInfos, IOContext context) throws IOException {
public SimpleTextTermVectorsReader(Directory directory, SegmentInfo si, IOContext context) throws IOException {
boolean success = false;
try {
in = directory.openInput(IndexFileNames.segmentFileName(si.name, "", VECTORS_EXTENSION), context);
@ -114,7 +112,8 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
for (int i = 0; i < numFields; i++) {
readLine();
assert StringHelper.startsWith(scratch, FIELD);
int fieldNumber = parseIntAt(FIELD.length);
// skip fieldNumber:
parseIntAt(FIELD.length);
readLine();
assert StringHelper.startsWith(scratch, FIELDNAME);
@ -373,13 +372,16 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
SimpleTVPostings postings = current.getValue();
if (postings.positions == null && postings.startOffsets == null) {
return null;
}
if (needsOffsets && (postings.startOffsets == null || postings.endOffsets == null)) {
return null;
}
// TODO: reuse
SimpleTVDocsAndPositionsEnum e = new SimpleTVDocsAndPositionsEnum(postings.startOffsets != null);
SimpleTVDocsAndPositionsEnum e = new SimpleTVDocsAndPositionsEnum();
e.reset(liveDocs, postings.positions, postings.startOffsets, postings.endOffsets);
return e;
}
@ -436,7 +438,6 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
}
private static class SimpleTVDocsAndPositionsEnum extends DocsAndPositionsEnum {
private final OffsetAttribute offsetAtt;
private boolean didNext;
private int doc = -1;
private int nextPos;
@ -445,18 +446,6 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
private int[] startOffsets;
private int[] endOffsets;
public SimpleTVDocsAndPositionsEnum(boolean storeOffsets) {
if (storeOffsets) {
offsetAtt = attributes().addAttribute(OffsetAttribute.class);
} else {
offsetAtt = null;
}
}
public boolean canReuse(boolean storeOffsets) {
return storeOffsets == (offsetAtt != null);
}
@Override
public int freq() {
if (positions != null) {
@ -495,7 +484,6 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
this.liveDocs = liveDocs;
this.positions = positions;
this.startOffsets = startOffsets;
assert (offsetAtt != null) == (startOffsets != null);
this.endOffsets = endOffsets;
this.doc = -1;
didNext = false;
@ -516,11 +504,6 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
public int nextPosition() {
assert (positions != null && nextPos < positions.length) ||
startOffsets != null && nextPos < startOffsets.length;
if (startOffsets != null) {
offsetAtt.setOffset(startOffsets[nextPos],
endOffsets[nextPos]);
}
if (positions != null) {
return positions[nextPos++];
} else {
@ -528,5 +511,15 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
return -1;
}
}
@Override
public int startOffset() {
return startOffsets[nextPos-1];
}
@Override
public int endOffset() {
return endOffsets[nextPos-1];
}
}
}

View File

@ -289,6 +289,20 @@ public class CheckIndex {
infoStream = null;
}
private boolean crossCheckTermVectors;
/** If true, term vectors are compared against postings to
* make sure they are the same. This will likely
* drastically increase time it takes to run CheckIndex! */
public void setCrossCheckTermVectors(boolean v) {
crossCheckTermVectors = v;
}
/** See {@link #setCrossCheckTermVectors}. */
public boolean getCrossCheckTermVectors() {
return crossCheckTermVectors;
}
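
A brief usage sketch for the new option (illustration only, assuming the existing CheckIndex(Directory) constructor and checkIndex() entry point):

    CheckIndex checker = new CheckIndex(dir);
    checker.setCrossCheckTermVectors(true);
    CheckIndex.Status status = checker.checkIndex();
    if (!status.clean) {
      // some segment failed; with cross-checking enabled this also covers
      // term vectors that disagree with the postings
    }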
private boolean verbose;
/** Set infoStream where messages should go. If null, no
@ -563,7 +577,7 @@ public class CheckIndex {
segInfoStat.fieldNormStatus = testFieldNorms(fieldInfos, reader);
// Test the Term Index
segInfoStat.termIndexStatus = testTermIndex(reader);
segInfoStat.termIndexStatus = testPostings(reader);
// Test Stored Fields
segInfoStat.storedFieldStatus = testStoredFields(info, reader, nf);
@ -678,7 +692,11 @@ public class CheckIndex {
/**
* Test the term index.
*/
private Status.TermIndexStatus testTermIndex(SegmentReader reader) {
private Status.TermIndexStatus testPostings(SegmentReader reader) {
// TODO: we should go and verify term vectors match, if
// crossCheckTermVectors is on...
final Status.TermIndexStatus status = new Status.TermIndexStatus();
final int maxDoc = reader.maxDoc();
@ -760,7 +778,7 @@ public class CheckIndex {
docs = termsEnum.docs(liveDocs, docs, false);
docsAndFreqs = termsEnum.docs(liveDocs, docsAndFreqs, true);
postings = termsEnum.docsAndPositions(liveDocs, postings);
postings = termsEnum.docsAndPositions(liveDocs, postings, false);
if (hasOrd) {
long ord = -1;
@ -890,7 +908,7 @@ public class CheckIndex {
if (hasPositions) {
for(int idx=0;idx<7;idx++) {
final int skipDocID = (int) (((idx+1)*(long) maxDoc)/8);
postings = termsEnum.docsAndPositions(liveDocs, postings);
postings = termsEnum.docsAndPositions(liveDocs, postings, false);
final int docID = postings.advance(skipDocID);
if (docID == DocsEnum.NO_MORE_DOCS) {
break;
@ -1256,7 +1274,10 @@ public class CheckIndex {
private Status.TermVectorStatus testTermVectors(SegmentInfo info, SegmentReader reader, NumberFormat format) {
final Status.TermVectorStatus status = new Status.TermVectorStatus();
TermsEnum termsEnum = null;
// TODO: in theory we could test that term vectors have
// same terms/pos/offsets as the postings, but it'd be
// very slow...
try {
if (infoStream != null) {
infoStream.print(" test: term vectors........");
@ -1264,9 +1285,25 @@ public class CheckIndex {
// TODO: maybe we can factor out testTermIndex and reuse here?
DocsEnum docs = null;
DocsEnum docsAndFreqs = null;
DocsAndPositionsEnum postings = null;
// Only used if crossCheckTermVectors is true:
DocsEnum postingsDocs = null;
DocsAndPositionsEnum postingsPostings = null;
final Bits liveDocs = reader.getLiveDocs();
final Fields postingsFields;
// TODO: testTermsIndex
if (crossCheckTermVectors) {
postingsFields = reader.fields();
} else {
postingsFields = null;
}
TermsEnum termsEnum = null;
TermsEnum postingsTermsEnum = null;
for (int j = 0; j < info.docCount; ++j) {
if (liveDocs == null || liveDocs.get(j)) {
status.docCount++;
@ -1290,6 +1327,16 @@ public class CheckIndex {
Terms terms = tfv.terms(field);
termsEnum = terms.iterator(termsEnum);
if (crossCheckTermVectors) {
Terms postingsTerms = postingsFields.terms(field);
if (postingsTerms == null) {
throw new RuntimeException("vector field=" + field + " does not exist in postings; doc=" + j);
}
postingsTermsEnum = postingsTerms.iterator(postingsTermsEnum);
} else {
postingsTermsEnum = null;
}
long tfvComputedTermCountForField = 0;
long tfvComputedSumTotalTermFreq = 0;
@ -1308,52 +1355,171 @@ public class CheckIndex {
throw new RuntimeException("totalTermFreq: " + totalTermFreq + " is out of bounds");
}
postings = termsEnum.docsAndPositions(null, postings);
final boolean hasPositions;
final boolean hasOffsets;
final boolean hasFreqs;
// TODO: really we need a reflection/query
// API so we can just ask what was indexed
// instead of "probing"...
// Try offsets:
postings = termsEnum.docsAndPositions(null, postings, true);
if (postings == null) {
docsAndFreqs = termsEnum.docs(null, docsAndFreqs, true);
if (docsAndFreqs == null) {
docs = termsEnum.docs(null, docs, false);
hasOffsets = false;
// Try only positions:
postings = termsEnum.docsAndPositions(null, postings, false);
if (postings == null) {
hasPositions = false;
// Try docIDs & freqs:
docs = termsEnum.docs(null, docs, true);
if (docs == null) {
// OK, only docIDs:
hasFreqs = false;
docs = termsEnum.docs(null, docs, false);
} else {
hasFreqs = true;
}
} else {
docs = docsAndFreqs;
hasPositions = true;
hasFreqs = true;
}
} else {
docs = docsAndFreqs = postings;
hasOffsets = true;
// NOTE: may be a lie... but we accept -1 below
hasPositions = true;
hasFreqs = true;
}
final int doc = docs.nextDoc();
final DocsEnum docs2;
if (hasPositions || hasOffsets) {
assert postings != null;
docs2 = postings;
} else {
assert docs != null;
docs2 = docs;
}
final DocsEnum postingsDocs2;
final boolean postingsHasFreq;
if (crossCheckTermVectors) {
if (!postingsTermsEnum.seekExact(term, true)) {
throw new RuntimeException("vector term=" + term + " field=" + field + " does not exist in postings; doc=" + j);
}
postingsPostings = postingsTermsEnum.docsAndPositions(null, postingsPostings, true);
if (postingsPostings == null) {
// Term vectors were indexed w/ offsets but postings were not
postingsPostings = postingsTermsEnum.docsAndPositions(null, postingsPostings, false);
if (postingsPostings == null) {
postingsDocs = postingsTermsEnum.docs(null, postingsDocs, true);
if (postingsDocs == null) {
postingsHasFreq = false;
postingsDocs = postingsTermsEnum.docs(null, postingsDocs, false);
if (postingsDocs == null) {
throw new RuntimeException("vector term=" + term + " field=" + field + " does not exist in postings; doc=" + j);
}
} else {
postingsHasFreq = true;
}
} else {
postingsHasFreq = true;
}
} else {
postingsHasFreq = true;
}
if (postingsPostings != null) {
postingsDocs2 = postingsPostings;
} else {
postingsDocs2 = postingsDocs;
}
final int advanceDoc = postingsDocs2.advance(j);
if (advanceDoc != j) {
throw new RuntimeException("vector term=" + term + " field=" + field + ": doc=" + j + " was not found in postings (got: " + advanceDoc + ")");
}
} else {
postingsDocs2 = null;
postingsHasFreq = false;
}
final int doc = docs2.nextDoc();
if (doc != 0) {
throw new RuntimeException("vector for doc " + j + " didn't return docID=0: got docID=" + doc);
}
if (docsAndFreqs != null) {
final int tf = docsAndFreqs.freq();
if (hasFreqs) {
final int tf = docs2.freq();
if (tf <= 0) {
throw new RuntimeException("vector freq " + tf + " is out of bounds");
}
if (totalTermFreq != -1 && totalTermFreq != tf) {
throw new RuntimeException("vector totalTermFreq " + totalTermFreq + " != tf " + tf);
}
if (crossCheckTermVectors && postingsHasFreq) {
if (postingsDocs2.freq() != tf) {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": freq=" + tf + " differs from postings freq=" + postingsDocs2.freq());
}
}
tfvComputedSumTotalTermFreq += tf;
if (postings != null) {
if (hasPositions || hasOffsets) {
int lastPosition = -1;
//int lastStartOffset = -1;
for (int i = 0; i < tf; i++) {
int pos = postings.nextPosition();
if (pos != -1 && pos < 0) {
throw new RuntimeException("vector position " + pos + " is out of bounds");
}
if (hasPositions) {
if (pos != -1 && pos < 0) {
throw new RuntimeException("vector position " + pos + " is out of bounds");
}
if (pos < lastPosition) {
throw new RuntimeException("vector position " + pos + " < lastPos " + lastPosition);
}
if (pos < lastPosition) {
throw new RuntimeException("vector position " + pos + " < lastPos " + lastPosition);
lastPosition = pos;
}
if (crossCheckTermVectors && postingsPostings != null) {
int postingsPos = postingsPostings.nextPosition();
if (pos != -1 && postingsPos != -1 && pos != postingsPos) {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": pos=" + pos + " differs from postings pos=" + postingsPos);
}
}
if (hasOffsets) {
// Call the methods to at least make
// sure they don't throw exc:
final int startOffset = postings.startOffset();
final int endOffset = postings.endOffset();
// TODO: these are too anal...?
/*
if (endOffset < startOffset) {
throw new RuntimeException("vector startOffset=" + startOffset + " is > endOffset=" + endOffset);
}
if (startOffset < lastStartOffset) {
throw new RuntimeException("vector startOffset=" + startOffset + " is < prior startOffset=" + lastStartOffset);
}
lastStartOffset = startOffset;
*/
if (crossCheckTermVectors && postingsPostings != null) {
final int postingsStartOffset = postingsPostings.startOffset();
final int postingsEndOffset = postingsPostings.endOffset();
if (startOffset != -1 && postingsStartOffset != -1 && startOffset != postingsStartOffset) {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": startOffset=" + startOffset + " differs from postings startOffset=" + postingsStartOffset);
}
if (endOffset != -1 && postingsEndOffset != -1 && endOffset != postingsEndOffset) {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": endOffset=" + endOffset + " differs from postings endOffset=" + postingsEndOffset);
}
}
}
lastPosition = pos;
}
}
}
if (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
if (docs2.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
throw new RuntimeException("vector for doc " + j + " references multiple documents!");
}
}
@ -1474,6 +1640,7 @@ public class CheckIndex {
public static void main(String[] args) throws IOException, InterruptedException {
boolean doFix = false;
boolean doCrossCheckTermVectors = false;
Codec codec = Codec.getDefault(); // only used when fixing
boolean verbose = false;
List<String> onlySegments = new ArrayList<String>();
@ -1484,6 +1651,8 @@ public class CheckIndex {
String arg = args[i];
if ("-fix".equals(arg)) {
doFix = true;
} else if ("-crossCheckTermVectors".equals(arg)) {
doCrossCheckTermVectors = true;
} else if ("-codec".equals(arg)) {
if (i == args.length-1) {
System.out.println("ERROR: missing name for -codec option");
@ -1519,9 +1688,10 @@ public class CheckIndex {
if (indexPath == null) {
System.out.println("\nERROR: index path not specified");
System.out.println("\nUsage: java org.apache.lucene.index.CheckIndex pathToIndex [-fix] [-segment X] [-segment Y] [-dir-impl X]\n" +
System.out.println("\nUsage: java org.apache.lucene.index.CheckIndex pathToIndex [-fix] [-crossCheckTermVectors] [-segment X] [-segment Y] [-dir-impl X]\n" +
"\n" +
" -fix: actually write a new segments_N file, removing any problematic segments\n" +
" -crossCheckTermVectors: verifies that term vectors match postings; THIS IS VERY SLOW!\n" +
" -codec X: when fixing, codec to write the new segments_N file with\n" +
" -verbose: print additional details\n" +
" -segment X: only check the specified segments. This can be specified multiple\n" +
@ -1570,6 +1740,7 @@ public class CheckIndex {
}
CheckIndex checker = new CheckIndex(dir);
checker.setCrossCheckTermVectors(doCrossCheckTermVectors);
checker.setInfoStream(System.out, verbose);
Status result = checker.checkIndex(onlySegments);
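The same cross-check can also be driven programmatically rather than from the command line; a minimal sketch, assuming an existing index directory (the path is illustrative):

    import java.io.File;
    import java.io.IOException;
    import org.apache.lucene.index.CheckIndex;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    public class CrossCheckExample {
      public static void main(String[] args) throws IOException {
        Directory dir = FSDirectory.open(new File("/path/to/index")); // illustrative path
        CheckIndex checker = new CheckIndex(dir);
        // Verify term vectors against postings -- very slow, as the usage text warns.
        checker.setCrossCheckTermVectors(true);
        checker.setInfoStream(System.out, false);
        CheckIndex.Status status = checker.checkIndex(null); // null = check all segments
        System.out.println(status.clean ? "index is clean" : "index has problems");
        dir.close();
      }
    }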

View File

@ -73,8 +73,9 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
// tokenized.
if (field.fieldType().indexed() && doInvert) {
if (i > 0)
if (i > 0) {
fieldState.position += docState.analyzer == null ? 0 : docState.analyzer.getPositionIncrementGap(fieldInfo.name);
}
final TokenStream stream = field.tokenStream(docState.analyzer);
// reset the TokenStream to the first token

View File

@ -655,8 +655,8 @@ public class DocTermOrds {
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
return termsEnum.docsAndPositions(liveDocs, reuse);
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
return termsEnum.docsAndPositions(liveDocs, reuse, needsOffsets);
}
@Override

View File

@ -26,9 +26,20 @@ public abstract class DocsAndPositionsEnum extends DocsEnum {
/** Returns the next position. You should only call this
* up to {@link DocsEnum#freq()} times else
* the behavior is not defined. */
* the behavior is not defined. If positions were not
* indexed this will return -1; this only happens if
 * offsets were indexed and you passed needsOffsets=true
* when pulling the enum. */
public abstract int nextPosition() throws IOException;
/** Returns start offset for the current position, or -1
* if offsets were not indexed. */
public abstract int startOffset() throws IOException;
/** Returns end offset for the current position, or -1 if
* offsets were not indexed. */
public abstract int endOffset() throws IOException;
/** Returns the payload at this position, or null if no
* payload was indexed. Only call this once per
* position. */

View File

@ -38,13 +38,18 @@ public final class FieldInfo {
* @lucene.experimental
*/
public static enum IndexOptions {
// NOTE: order is important here; FieldInfo uses this
// order to merge two conflicting IndexOptions (always
// "downgrades" by picking the lowest).
/** only documents are indexed: term frequencies and positions are omitted */
// TODO: maybe rename to just DOCS?
DOCS_ONLY,
/** only documents and term frequencies are indexed: positions are omitted */
DOCS_AND_FREQS,
/** full postings: documents, frequencies, and positions */
DOCS_AND_FREQS_AND_POSITIONS
/** documents, frequencies and positions */
DOCS_AND_FREQS_AND_POSITIONS,
/** documents, frequencies, positions and offsets */
DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS,
};
/**
@ -67,7 +72,7 @@ public final class FieldInfo {
this.omitNorms = false;
this.indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
}
assert indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS || !storePayloads;
assert indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !storePayloads;
}
@Override
@ -95,10 +100,13 @@ public final class FieldInfo {
if (this.indexOptions != indexOptions) {
// downgrade
this.indexOptions = this.indexOptions.compareTo(indexOptions) < 0 ? this.indexOptions : indexOptions;
this.storePayloads = false;
if (this.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
// cannot store payloads if we don't store positions:
this.storePayloads = false;
}
}
}
assert this.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS || !this.storePayloads;
assert this.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !this.storePayloads;
}
void setDocValuesType(DocValues.Type v) {
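Because the constants are declared in increasing order of what they index, capability checks and conflict merging both reduce to compareTo, which is the pattern used throughout this change; a small sketch (the helper names are illustrative):

    import org.apache.lucene.index.FieldInfo.IndexOptions;

    final class IndexOptionsHints {
      // Positions (and therefore payloads) are present at DOCS_AND_FREQS_AND_POSITIONS and above.
      static boolean hasPositions(IndexOptions opts) {
        return opts.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
      }

      // Character offsets are only present at the new top-most option.
      static boolean hasOffsets(IndexOptions opts) {
        return opts.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
      }

      // Merging two conflicting options "downgrades" by picking the lower one.
      static IndexOptions merge(IndexOptions a, IndexOptions b) {
        return a.compareTo(b) < 0 ? a : b;
      }
    }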

View File

@ -185,7 +185,7 @@ public final class FieldInfos implements Iterable<FieldInfo> {
}
// mutable FIs must check!
for (FieldInfo fi : this) {
if (fi.isIndexed && fi.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
if (fi.isIndexed && fi.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
return true;
}
}
@ -430,7 +430,7 @@ public final class FieldInfos implements Iterable<FieldInfo> {
FieldInfo clone = (FieldInfo) (fieldInfo).clone();
roFis.putInternal(clone);
roFis.hasVectors |= clone.storeTermVector;
roFis.hasProx |= clone.isIndexed && clone.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
roFis.hasProx |= clone.isIndexed && clone.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
roFis.hasFreq |= clone.isIndexed && clone.indexOptions != IndexOptions.DOCS_ONLY;
}
return roFis;

View File

@ -176,8 +176,8 @@ public class FilterIndexReader extends IndexReader {
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
return in.docsAndPositions(liveDocs, reuse);
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
return in.docsAndPositions(liveDocs, reuse, needsOffsets);
}
@Override
@ -258,6 +258,16 @@ public class FilterIndexReader extends IndexReader {
return in.nextPosition();
}
@Override
public int startOffset() throws IOException {
return in.startOffset();
}
@Override
public int endOffset() throws IOException {
return in.endOffset();
}
@Override
public BytesRef getPayload() throws IOException {
return in.getPayload();

View File

@ -171,8 +171,8 @@ public abstract class FilteredTermsEnum extends TermsEnum {
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits bits, DocsAndPositionsEnum reuse) throws IOException {
return tenum.docsAndPositions(bits, reuse);
public DocsAndPositionsEnum docsAndPositions(Bits bits, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
return tenum.docsAndPositions(bits, reuse, needsOffsets);
}
/** This enum does not support seeking!

View File

@ -83,7 +83,7 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
// Aggregate the storePayload as seen by the same
// field across multiple threads
if (fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
if (fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
fieldInfo.storePayloads |= fieldWriter.hasPayloads;
}

View File

@ -21,6 +21,7 @@ import java.io.IOException;
import java.util.Comparator;
import java.util.Map;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsConsumer;
@ -43,7 +44,11 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
final DocumentsWriterPerThread.DocState docState;
final FieldInvertState fieldState;
IndexOptions indexOptions;
private boolean writeFreq;
private boolean writeProx;
private boolean writeOffsets;
PayloadAttribute payloadAttribute;
OffsetAttribute offsetAttribute;
public FreqProxTermsWriterPerField(TermsHashPerField termsHashPerField, FreqProxTermsWriter parent, FieldInfo fieldInfo) {
this.termsHashPerField = termsHashPerField;
@ -51,15 +56,16 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
this.fieldInfo = fieldInfo;
docState = termsHashPerField.docState;
fieldState = termsHashPerField.fieldState;
indexOptions = fieldInfo.indexOptions;
setIndexOptions(fieldInfo.indexOptions);
}
@Override
int getStreamCount() {
if (fieldInfo.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
if (!writeProx) {
return 1;
else
} else {
return 2;
}
}
@Override
@ -74,13 +80,21 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
return fieldInfo.name.compareTo(other.fieldInfo.name);
}
// Called after flush
void reset() {
// Record, up front, whether our in-RAM format will be
// with or without term freqs:
indexOptions = fieldInfo.indexOptions;
setIndexOptions(fieldInfo.indexOptions);
payloadAttribute = null;
}
private void setIndexOptions(IndexOptions indexOptions) {
this.indexOptions = indexOptions;
writeFreq = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
writeProx = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
writeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
}
@Override
boolean start(IndexableField[] fields, int count) {
for(int i=0;i<count;i++) {
@ -98,9 +112,16 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
} else {
payloadAttribute = null;
}
if (writeOffsets) {
offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
} else {
offsetAttribute = null;
}
}
void writeProx(final int termID, int proxCode) {
//System.out.println("writeProx termID=" + termID + " proxCode=" + proxCode);
assert writeProx;
final Payload payload;
if (payloadAttribute == null) {
payload = null;
@ -113,12 +134,24 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
termsHashPerField.writeVInt(1, payload.length);
termsHashPerField.writeBytes(1, payload.data, payload.offset, payload.length);
hasPayloads = true;
} else
} else {
termsHashPerField.writeVInt(1, proxCode<<1);
}
FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
postings.lastPositions[termID] = fieldState.position;
}
void writeOffsets(final int termID, int prevOffset) {
assert writeOffsets;
final int startOffset = offsetAttribute.startOffset();
final int endOffset = offsetAttribute.endOffset();
//System.out.println("writeOffsets termID=" + termID + " prevOffset=" + prevOffset + " startOff=" + startOffset + " endOff=" + endOffset);
termsHashPerField.writeVInt(1, startOffset - prevOffset);
termsHashPerField.writeVInt(1, endOffset - startOffset);
FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
postings.lastOffsets[termID] = startOffset;
}
@Override
@ -129,13 +162,18 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
postings.lastDocIDs[termID] = docState.docID;
if (indexOptions == IndexOptions.DOCS_ONLY) {
if (!writeFreq) {
postings.lastDocCodes[termID] = docState.docID;
} else {
postings.lastDocCodes[termID] = docState.docID << 1;
postings.docFreqs[termID] = 1;
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
if (writeProx) {
writeProx(termID, fieldState.position);
if (writeOffsets) {
writeOffsets(termID, fieldState.offset);
}
} else {
assert !writeOffsets;
}
}
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
@ -149,9 +187,10 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
assert indexOptions == IndexOptions.DOCS_ONLY || postings.docFreqs[termID] > 0;
assert !writeFreq || postings.docFreqs[termID] > 0;
if (indexOptions == IndexOptions.DOCS_ONLY) {
if (!writeFreq) {
assert postings.docFreqs == null;
if (docState.docID != postings.lastDocIDs[termID]) {
assert docState.docID > postings.lastDocIDs[termID];
termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]);
@ -159,59 +198,76 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
postings.lastDocIDs[termID] = docState.docID;
fieldState.uniqueTermCount++;
}
} else {
if (docState.docID != postings.lastDocIDs[termID]) {
assert docState.docID > postings.lastDocIDs[termID]:"id: "+docState.docID + " postings ID: "+ postings.lastDocIDs[termID] + " termID: "+termID;
// Term not yet seen in the current doc but previously
// seen in other doc(s) since the last flush
} else if (docState.docID != postings.lastDocIDs[termID]) {
assert docState.docID > postings.lastDocIDs[termID]:"id: "+docState.docID + " postings ID: "+ postings.lastDocIDs[termID] + " termID: "+termID;
// Term not yet seen in the current doc but previously
// seen in other doc(s) since the last flush
// Now that we know doc freq for previous doc,
// write it & lastDocCode
if (1 == postings.docFreqs[termID])
termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]|1);
else {
termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]);
termsHashPerField.writeVInt(0, postings.docFreqs[termID]);
}
postings.docFreqs[termID] = 1;
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
postings.lastDocIDs[termID] = docState.docID;
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
writeProx(termID, fieldState.position);
}
fieldState.uniqueTermCount++;
// Now that we know doc freq for previous doc,
// write it & lastDocCode
if (1 == postings.docFreqs[termID]) {
termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]|1);
} else {
fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.docFreqs[termID]);
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
writeProx(termID, fieldState.position-postings.lastPositions[termID]);
termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]);
termsHashPerField.writeVInt(0, postings.docFreqs[termID]);
}
postings.docFreqs[termID] = 1;
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
postings.lastDocIDs[termID] = docState.docID;
if (writeProx) {
writeProx(termID, fieldState.position);
if (writeOffsets) {
writeOffsets(termID, fieldState.offset);
}
} else {
assert !writeOffsets;
}
fieldState.uniqueTermCount++;
} else {
fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.docFreqs[termID]);
if (writeProx) {
writeProx(termID, fieldState.position-postings.lastPositions[termID]);
}
if (writeOffsets) {
writeOffsets(termID, postings.lastOffsets[termID]);
}
}
}
@Override
ParallelPostingsArray createPostingsArray(int size) {
return new FreqProxPostingsArray(size);
return new FreqProxPostingsArray(size, writeFreq, writeProx, writeOffsets);
}
static final class FreqProxPostingsArray extends ParallelPostingsArray {
public FreqProxPostingsArray(int size) {
public FreqProxPostingsArray(int size, boolean writeFreqs, boolean writeProx, boolean writeOffsets) {
super(size);
docFreqs = new int[size];
if (writeFreqs) {
docFreqs = new int[size];
}
lastDocIDs = new int[size];
lastDocCodes = new int[size];
lastPositions = new int[size];
if (writeProx) {
lastPositions = new int[size];
if (writeOffsets) {
lastOffsets = new int[size];
}
} else {
assert !writeOffsets;
}
//System.out.println("PA init freqs=" + writeFreqs + " pos=" + writeProx + " offs=" + writeOffsets);
}
int docFreqs[]; // # times this term occurs in the current doc
int lastDocIDs[]; // Last docID where this term occurred
int lastDocCodes[]; // Code for prior doc
int lastPositions[]; // Last position where this term occurred
int lastOffsets[]; // Last endOffset where this term occurred
@Override
ParallelPostingsArray newInstance(int size) {
return new FreqProxPostingsArray(size);
return new FreqProxPostingsArray(size, docFreqs != null, lastPositions != null, lastOffsets != null);
}
@Override
@ -221,15 +277,36 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
super.copyTo(toArray, numToCopy);
System.arraycopy(docFreqs, 0, to.docFreqs, 0, numToCopy);
System.arraycopy(lastDocIDs, 0, to.lastDocIDs, 0, numToCopy);
System.arraycopy(lastDocCodes, 0, to.lastDocCodes, 0, numToCopy);
System.arraycopy(lastPositions, 0, to.lastPositions, 0, numToCopy);
if (lastPositions != null) {
assert to.lastPositions != null;
System.arraycopy(lastPositions, 0, to.lastPositions, 0, numToCopy);
}
if (lastOffsets != null) {
assert to.lastOffsets != null;
System.arraycopy(lastOffsets, 0, to.lastOffsets, 0, numToCopy);
}
if (docFreqs != null) {
assert to.docFreqs != null;
System.arraycopy(docFreqs, 0, to.docFreqs, 0, numToCopy);
}
}
@Override
int bytesPerPosting() {
return ParallelPostingsArray.BYTES_PER_POSTING + 4 * RamUsageEstimator.NUM_BYTES_INT;
int bytes = ParallelPostingsArray.BYTES_PER_POSTING + 2 * RamUsageEstimator.NUM_BYTES_INT;
if (lastPositions != null) {
bytes += RamUsageEstimator.NUM_BYTES_INT;
}
if (lastOffsets != null) {
bytes += RamUsageEstimator.NUM_BYTES_INT;
}
if (docFreqs != null) {
bytes += RamUsageEstimator.NUM_BYTES_INT;
}
return bytes;
}
}
@ -246,8 +323,33 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
final TermsConsumer termsConsumer = consumer.addField(fieldInfo);
final Comparator<BytesRef> termComp = termsConsumer.getComparator();
// CONFUSING: this.indexOptions holds the index options
// that were current when we first saw this field. But
// it's possible this has changed, eg when other
// documents are indexed that cause a "downgrade" of the
// IndexOptions. So we must decode the in-RAM buffer
// according to this.indexOptions, but then write the
// new segment to the directory according to
// currentFieldIndexOptions:
final IndexOptions currentFieldIndexOptions = fieldInfo.indexOptions;
final boolean writeTermFreq = currentFieldIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
final boolean writePositions = currentFieldIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
final boolean writeOffsets = currentFieldIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
final boolean readTermFreq = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
final boolean readPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
final boolean readOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
//System.out.println("flush readTF=" + readTermFreq + " readPos=" + readPositions + " readOffs=" + readOffsets);
// Make sure FieldInfo.update is working correctly!:
assert !writeTermFreq || readTermFreq;
assert !writePositions || readPositions;
assert !writeOffsets || readOffsets;
assert !writeOffsets || writePositions;
final Map<Term,Integer> segDeletes;
if (state.segDeletes != null && state.segDeletes.terms.size() > 0) {
segDeletes = state.segDeletes.terms;
@ -268,12 +370,13 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
for (int i = 0; i < numTerms; i++) {
final int termID = termIDs[i];
//System.out.println("term=" + termID);
// Get BytesRef
final int textStart = postings.textStarts[termID];
termsHashPerField.bytePool.setBytesRef(text, textStart);
termsHashPerField.initReader(freq, termID, 0);
if (fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
if (readPositions || readOffsets) {
termsHashPerField.initReader(prox, termID, 1);
}
@ -303,15 +406,18 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
int numDocs = 0;
long totTF = 0;
int docID = 0;
int termFreq = 0;
while(true) {
//System.out.println(" cycle");
final int termDocFreq;
if (freq.eof()) {
if (postings.lastDocCodes[termID] != -1) {
// Return last doc
docID = postings.lastDocIDs[termID];
if (indexOptions != IndexOptions.DOCS_ONLY) {
termFreq = postings.docFreqs[termID];
if (readTermFreq) {
termDocFreq = postings.docFreqs[termID];
} else {
termDocFreq = 0;
}
postings.lastDocCodes[termID] = -1;
} else {
@ -320,14 +426,15 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
}
} else {
final int code = freq.readVInt();
if (indexOptions == IndexOptions.DOCS_ONLY) {
if (!readTermFreq) {
docID += code;
termDocFreq = 0;
} else {
docID += code >>> 1;
if ((code & 1) != 0) {
termFreq = 1;
termDocFreq = 1;
} else {
termFreq = freq.readVInt();
termDocFreq = freq.readVInt();
}
}
@ -336,7 +443,6 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
numDocs++;
assert docID < state.numDocs: "doc=" + docID + " maxDoc=" + state.numDocs;
final int termDocFreq = termFreq;
// NOTE: we could check here if the docID was
// deleted, and skip it. However, this is somewhat
@ -362,45 +468,54 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
state.liveDocs.clear(docID);
}
if (currentFieldIndexOptions != IndexOptions.DOCS_ONLY) {
totTF += termDocFreq;
}
totTF += termDocFreq;
// Carefully copy over the prox + payload info,
// changing the format to match Lucene's segment
// format.
if (currentFieldIndexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
// we do write positions & payload
if (readPositions || readOffsets) {
// we did record positions (& maybe payload) and/or offsets
int position = 0;
int offset = 0;
for(int j=0;j<termDocFreq;j++) {
final int code = prox.readVInt();
position += code >> 1;
final int payloadLength;
final BytesRef thisPayload;
if ((code & 1) != 0) {
// This position has a payload
payloadLength = prox.readVInt();
if (readPositions) {
final int code = prox.readVInt();
position += code >> 1;
if (payload == null) {
payload = new BytesRef();
payload.bytes = new byte[payloadLength];
} else if (payload.bytes.length < payloadLength) {
payload.grow(payloadLength);
if ((code & 1) != 0) {
// This position has a payload
final int payloadLength = prox.readVInt();
if (payload == null) {
payload = new BytesRef();
payload.bytes = new byte[payloadLength];
} else if (payload.bytes.length < payloadLength) {
payload.grow(payloadLength);
}
prox.readBytes(payload.bytes, 0, payloadLength);
payload.length = payloadLength;
thisPayload = payload;
} else {
thisPayload = null;
}
prox.readBytes(payload.bytes, 0, payloadLength);
payload.length = payloadLength;
thisPayload = payload;
} else {
payloadLength = 0;
thisPayload = null;
if (readOffsets) {
final int startOffset = offset + prox.readVInt();
final int endOffset = startOffset + prox.readVInt();
offset = startOffset;
if (writePositions) {
postingsConsumer.addPosition(position, thisPayload, startOffset, endOffset);
}
} else if (writePositions) {
postingsConsumer.addPosition(position, thisPayload, -1, -1);
}
}
postingsConsumer.addPosition(position, thisPayload);
}
postingsConsumer.finishDoc();
@ -413,6 +528,4 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
termsConsumer.finish(sumTotalTermFreq, sumDocFreq, visitedDocs.cardinality());
}
}
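The stream-1 buffer written by writeProx/writeOffsets and decoded in flush() is plain delta coding: the position delta is shifted left one bit to flag an inline payload, and each offset pair is stored as startOffset minus the previous startOffset followed by the token length. A standalone sketch of that arithmetic (plain ints stand in for the actual VInt byte stream; values are made up):

    import java.util.ArrayDeque;

    final class ProxOffsetDeltaSketch {
      public static void main(String[] args) {
        // Two occurrences of one term in a doc: positions 3 and 7, offsets [10,14) and [20,24).
        int[][] occurrences = { {3, 10, 14}, {7, 20, 24} };
        ArrayDeque<Integer> stream = new ArrayDeque<Integer>();
        int lastPosition = 0, lastStartOffset = 0;
        for (int[] occ : occurrences) {
          stream.add((occ[0] - lastPosition) << 1);  // position delta; low bit 0 = no payload
          stream.add(occ[1] - lastStartOffset);      // startOffset delta
          stream.add(occ[2] - occ[1]);               // token length
          lastPosition = occ[0];
          lastStartOffset = occ[1];
        }

        // Decoding mirrors the loop in flush():
        int position = 0, offset = 0;
        while (!stream.isEmpty()) {
          int code = stream.remove();
          position += code >> 1;                     // a set low bit would mean a payload follows
          int startOffset = offset + stream.remove();
          int endOffset = startOffset + stream.remove();
          offset = startOffset;
          System.out.println("pos=" + position + " offsets=" + startOffset + "-" + endOffset);
        }
      }
    }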

View File

@ -788,9 +788,9 @@ public abstract class IndexReader implements Closeable {
/** Returns {@link DocsAndPositionsEnum} for the specified
* field & term. This may return null, if either the
* field or term does not exist, or, positions were not
* indexed for this field. */
public final DocsAndPositionsEnum termPositionsEnum(Bits liveDocs, String field, BytesRef term) throws IOException {
* field or term does not exist, or needsOffsets is
* true but offsets were not indexed for this field. */
public final DocsAndPositionsEnum termPositionsEnum(Bits liveDocs, String field, BytesRef term, boolean needsOffsets) throws IOException {
assert field != null;
assert term != null;
final Fields fields = fields();
@ -799,7 +799,7 @@ public abstract class IndexReader implements Closeable {
if (terms != null) {
final TermsEnum termsEnum = terms.iterator(null);
if (termsEnum.seekExact(term, true)) {
return termsEnum.docsAndPositions(liveDocs, null);
return termsEnum.docsAndPositions(liveDocs, null, needsOffsets);
}
}
}
@ -830,8 +830,9 @@ public abstract class IndexReader implements Closeable {
* Returns {@link DocsAndPositionsEnum} for the specified field and
* {@link TermState}. This may return null, if either the field or the term
* does not exists, the {@link TermState} is invalid for the underlying
* implementation, or positions were not indexed for this field. */
public final DocsAndPositionsEnum termPositionsEnum(Bits liveDocs, String field, BytesRef term, TermState state) throws IOException {
* implementation, or needsOffsets is true but offsets
* were not indexed for this field. */
public final DocsAndPositionsEnum termPositionsEnum(Bits liveDocs, String field, BytesRef term, TermState state, boolean needsOffsets) throws IOException {
assert state != null;
assert field != null;
final Fields fields = fields();
@ -840,7 +841,7 @@ public abstract class IndexReader implements Closeable {
if (terms != null) {
final TermsEnum termsEnum = terms.iterator(null);
termsEnum.seekExact(term, state);
return termsEnum.docsAndPositions(liveDocs, null);
return termsEnum.docsAndPositions(liveDocs, null, needsOffsets);
}
}
return null;
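A typical caller of the updated helper, asking for offsets only when it needs them (the field and term values are illustrative):

    import java.io.IOException;
    import org.apache.lucene.index.DocsAndPositionsEnum;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.MultiFields;
    import org.apache.lucene.util.BytesRef;

    final class TermPositionsExample {
      // Returns null if the term is missing, or if needsOffsets is true but offsets were not indexed.
      static DocsAndPositionsEnum positionsFor(IndexReader reader, String field, String text,
                                               boolean needsOffsets) throws IOException {
        return reader.termPositionsEnum(MultiFields.getLiveDocs(reader), field,
                                        new BytesRef(text), needsOffsets);
      }
    }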

View File

@ -125,6 +125,16 @@ public final class MultiDocsAndPositionsEnum extends DocsAndPositionsEnum {
return current.nextPosition();
}
@Override
public int startOffset() throws IOException {
return current.startOffset();
}
@Override
public int endOffset() throws IOException {
return current.endOffset();
}
@Override
public boolean hasPayload() {
return current.hasPayload();

View File

@ -167,14 +167,14 @@ public final class MultiFields extends Fields {
/** Returns {@link DocsAndPositionsEnum} for the specified
* field & term. This may return null if the term does
* not exist or positions were not indexed. */
public static DocsAndPositionsEnum getTermPositionsEnum(IndexReader r, Bits liveDocs, String field, BytesRef term) throws IOException {
public static DocsAndPositionsEnum getTermPositionsEnum(IndexReader r, Bits liveDocs, String field, BytesRef term, boolean needsOffsets) throws IOException {
assert field != null;
assert term != null;
final Terms terms = getTerms(r, field);
if (terms != null) {
final TermsEnum termsEnum = terms.iterator(null);
if (termsEnum.seekExact(term, true)) {
return termsEnum.docsAndPositions(liveDocs, null);
return termsEnum.docsAndPositions(liveDocs, null, needsOffsets);
}
}
return null;

View File

@ -418,7 +418,7 @@ public final class MultiTermsEnum extends TermsEnum {
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
MultiDocsAndPositionsEnum docsAndPositionsEnum;
// Can only reuse if incoming enum is also a MultiDocsAndPositionsEnum
if (reuse != null && reuse instanceof MultiDocsAndPositionsEnum) {
@ -469,7 +469,7 @@ public final class MultiTermsEnum extends TermsEnum {
}
assert entry.index < docsAndPositionsEnum.subDocsAndPositionsEnum.length: entry.index + " vs " + docsAndPositionsEnum.subDocsAndPositionsEnum.length + "; " + subs.length;
final DocsAndPositionsEnum subPostings = entry.terms.docsAndPositions(b, docsAndPositionsEnum.subDocsAndPositionsEnum[entry.index]);
final DocsAndPositionsEnum subPostings = entry.terms.docsAndPositions(b, docsAndPositionsEnum.subDocsAndPositionsEnum[entry.index], needsOffsets);
if (subPostings != null) {
docsAndPositionsEnum.subDocsAndPositionsEnum[entry.index] = subPostings;
@ -479,8 +479,8 @@ public final class MultiTermsEnum extends TermsEnum {
} else {
if (entry.terms.docs(b, null, false) != null) {
// At least one of our subs does not store
// positions -- we can't correctly produce a
// MultiDocsAndPositions enum
// offsets or positions -- we can't correctly
// produce a MultiDocsAndPositions enum
return null;
}
}

View File

@ -44,7 +44,7 @@ public final class Term implements Comparable<Term> {
field = fld;
this.bytes = bytes;
}
/** Constructs a Term with the given field and text.
* <p>Note that a null field or null text value results in undefined
* behavior for most Lucene APIs that accept a Term parameter. */
@ -132,4 +132,8 @@ public final class Term implements Comparable<Term> {
@Override
public final String toString() { return field + ":" + bytes.utf8ToString(); }
public Term deepCopyOf() {
return new Term(field, BytesRef.deepCopyOf(bytes));
}
}

View File

@ -38,7 +38,7 @@ final class TermVectorsConsumerPerField extends TermsHashConsumerPerField {
boolean doVectorOffsets;
int maxNumPostings;
OffsetAttribute offsetAttribute = null;
OffsetAttribute offsetAttribute;
public TermVectorsConsumerPerField(TermsHashPerField termsHashPerField, TermVectorsConsumer termsWriter, FieldInfo fieldInfo) {
this.termsHashPerField = termsHashPerField;

View File

@ -160,12 +160,13 @@ public abstract class TermsEnum {
/** Get {@link DocsAndPositionsEnum} for the current term.
* Do not call this when the enum is unpositioned.
* This method will only return null if positions were
* not indexed into the postings by this codec.
* This method will only return null if needsOffsets is
* true but offsets were not indexed.
* @param liveDocs unset bits are documents that should not
* be returned
* @param reuse pass a prior DocsAndPositionsEnum for possible reuse */
public abstract DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException;
* @param reuse pass a prior DocsAndPositionsEnum for possible reuse
* @param needsOffsets true if offsets are required */
public abstract DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException;
/**
* Expert: Returns the TermsEnums internal state to position the TermsEnum
@ -238,7 +239,7 @@ public abstract class TermsEnum {
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) {
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) {
throw new IllegalStateException("this method should never be called");
}

View File

@ -293,7 +293,7 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
@Override
public int[] init() {
if(perField.postingsArray == null) {
if (perField.postingsArray == null) {
perField.postingsArray = perField.consumer.createPostingsArray(2);
bytesUsed.addAndGet(perField.postingsArray.size * perField.postingsArray.bytesPerPosting());
}
@ -305,8 +305,7 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
ParallelPostingsArray postingsArray = perField.postingsArray;
final int oldSize = perField.postingsArray.size;
postingsArray = perField.postingsArray = postingsArray.grow();
bytesUsed
.addAndGet((postingsArray.bytesPerPosting() * (postingsArray.size - oldSize)));
bytesUsed.addAndGet((postingsArray.bytesPerPosting() * (postingsArray.size - oldSize)));
return postingsArray.textStarts;
}

View File

@ -1043,7 +1043,7 @@ class FieldCacheImpl implements FieldCache {
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
throw new UnsupportedOperationException();
}

View File

@ -272,8 +272,8 @@ public final class FuzzyTermsEnum extends TermsEnum {
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs,
DocsAndPositionsEnum reuse) throws IOException {
return actualEnum.docsAndPositions(liveDocs, reuse);
DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
return actualEnum.docsAndPositions(liveDocs, reuse, needsOffsets);
}
@Override

View File

@ -225,7 +225,7 @@ public class MultiPhraseQuery extends Query {
return null;
}
termsEnum.seekExact(term.bytes(), termState);
postingsEnum = termsEnum.docsAndPositions(liveDocs, null);
postingsEnum = termsEnum.docsAndPositions(liveDocs, null, false);
if (postingsEnum == null) {
// term does exist, but has no positions
@ -475,7 +475,7 @@ class UnionDocsAndPositionsEnum extends DocsAndPositionsEnum {
continue;
}
termsEnum.seekExact(term.bytes(), termState);
DocsAndPositionsEnum postings = termsEnum.docsAndPositions(liveDocs, null);
DocsAndPositionsEnum postings = termsEnum.docsAndPositions(liveDocs, null, false);
if (postings == null) {
// term does exist, but has no positions
throw new IllegalStateException("field \"" + term.field() + "\" was indexed without position data; cannot run PhraseQuery (term=" + term.text() + ")");
@ -527,6 +527,16 @@ class UnionDocsAndPositionsEnum extends DocsAndPositionsEnum {
return _posList.next();
}
@Override
public int startOffset() {
return -1;
}
@Override
public int endOffset() {
return -1;
}
@Override
public BytesRef getPayload() {
throw new UnsupportedOperationException();

View File

@ -239,7 +239,7 @@ public class PhraseQuery extends Query {
return null;
}
te.seekExact(t.bytes(), state);
DocsAndPositionsEnum postingsEnum = te.docsAndPositions(liveDocs, null);
DocsAndPositionsEnum postingsEnum = te.docsAndPositions(liveDocs, null, false);
// PhraseQuery on a field that did not index
// positions.

View File

@ -120,7 +120,7 @@ public class SpanTermQuery extends SpanQuery {
final TermsEnum termsEnum = context.reader.terms(term.field()).iterator(null);
termsEnum.seekExact(term.bytes(), state);
final DocsAndPositionsEnum postings = termsEnum.docsAndPositions(acceptDocs, null);
final DocsAndPositionsEnum postings = termsEnum.docsAndPositions(acceptDocs, null, false);
if (postings != null) {
return new TermSpans(postings, term);

View File

@ -0,0 +1,73 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.IOException;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
public class CannedAnalyzer extends Analyzer {
private final Token[] tokens;
public CannedAnalyzer(Token[] tokens) {
this.tokens = tokens;
}
@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
return new TokenStreamComponents(new CannedTokenizer(tokens));
}
public static class CannedTokenizer extends Tokenizer {
private final Token[] tokens;
private int upto = 0;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
public CannedTokenizer(Token[] tokens) {
this.tokens = tokens;
}
@Override
public final boolean incrementToken() throws IOException {
if (upto < tokens.length) {
final Token token = tokens[upto++];
// TODO: can we just capture/restoreState so
// we get all attrs...?
clearAttributes();
termAtt.setEmpty();
termAtt.append(token.toString());
posIncrAtt.setPositionIncrement(token.getPositionIncrement());
offsetAtt.setOffset(token.startOffset(), token.endOffset());
return true;
} else {
return false;
}
}
@Override
public void reset() throws IOException {
super.reset();
this.upto = 0;
}
}
}
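A hypothetical use of the new test helper, handing pre-built tokens with explicit character offsets to an analyzer; the token values are made up for illustration, and the resulting analyzer can then be passed to the new RandomIndexWriter.addDocument(doc, analyzer) overload:

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.CannedAnalyzer;
    import org.apache.lucene.analysis.Token;

    final class CannedAnalyzerExample {
      static Analyzer twoTokenAnalyzer() {
        // Each Token carries its own start/end character offsets.
        return new CannedAnalyzer(new Token[] {
            new Token("quick", 0, 5),
            new Token("fox", 6, 9)
        });
      }
    }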

View File

@ -88,6 +88,9 @@ class PreFlexFieldsWriter extends FieldsConsumer {
@Override
public TermsConsumer addField(FieldInfo field) throws IOException {
assert field.number != -1;
if (field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
throw new IllegalArgumentException("this codec cannot index offsets");
}
//System.out.println("w field=" + field.name + " storePayload=" + field.storePayloads + " number=" + field.number);
return new PreFlexTermsWriter(field);
}
@ -157,8 +160,10 @@ class PreFlexFieldsWriter extends FieldsConsumer {
}
@Override
public void addPosition(int position, BytesRef payload) throws IOException {
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
assert proxOut != null;
assert startOffset == -1;
assert endOffset == -1;
//System.out.println(" w pos=" + position + " payl=" + payload);
final int delta = position - lastPosition;

View File

@ -37,6 +37,7 @@ import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames;
@ -197,6 +198,9 @@ public class RAMOnlyPostingsFormat extends PostingsFormat {
@Override
public TermsConsumer addField(FieldInfo field) {
if (field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
throw new IllegalArgumentException("this codec cannot index offsets");
}
RAMField ramField = new RAMField(field.name);
postings.fieldToTerms.put(field.name, ramField);
termsConsumer.reset(ramField);
@ -265,7 +269,9 @@ public class RAMOnlyPostingsFormat extends PostingsFormat {
}
@Override
public void addPosition(int position, BytesRef payload) {
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) {
assert startOffset == -1;
assert endOffset == -1;
current.positions[posUpto] = position;
if (payload != null && payload.length > 0) {
if (current.payloads == null) {
@ -388,7 +394,10 @@ public class RAMOnlyPostingsFormat extends PostingsFormat {
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) {
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) {
if (needsOffsets) {
return null;
}
return new RAMDocsAndPositionsEnum(ramField.termToDocs.get(current), liveDocs);
}
}
@ -493,6 +502,16 @@ public class RAMOnlyPostingsFormat extends PostingsFormat {
return current.positions[posUpto++];
}
@Override
public int startOffset() {
return -1;
}
@Override
public int endOffset() {
return -1;
}
@Override
public boolean hasPayload() {
return current.payloads != null && current.payloads[posUpto-1] != null;

View File

@ -122,6 +122,10 @@ public class RandomIndexWriter implements Closeable {
* @see IndexWriter#addDocument(Iterable)
*/
public <T extends IndexableField> void addDocument(final Iterable<T> doc) throws IOException {
addDocument(doc, w.getAnalyzer());
}
public <T extends IndexableField> void addDocument(final Iterable<T> doc, Analyzer a) throws IOException {
if (doDocValues && doc instanceof Document) {
randomPerDocFieldValues(r, (Document) doc);
}
@ -157,9 +161,9 @@ public class RandomIndexWriter implements Closeable {
}
};
}
});
}, a);
} else {
w.addDocument(doc);
w.addDocument(doc, a);
}
maybeCommit();

View File

@ -1106,6 +1106,10 @@ public abstract class LuceneTestCase extends Assert {
return new Field(name, value, type);
}
// TODO: once all core & test codecs can index
// offsets, sometimes randomly turn on offsets if we are
// already indexing positions...
FieldType newType = new FieldType(type);
if (!newType.stored() && random.nextBoolean()) {
newType.setStored(true); // randomly store it

View File

@ -157,6 +157,7 @@ public class _TestUtil {
public static CheckIndex.Status checkIndex(Directory dir) throws IOException {
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
CheckIndex checker = new CheckIndex(dir);
checker.setCrossCheckTermVectors(true);
checker.setInfoStream(new PrintStream(bos), false);
CheckIndex.Status indexStatus = checker.checkIndex(null);
if (indexStatus == null || indexStatus.clean == false) {
@ -567,7 +568,10 @@ public class _TestUtil {
if (random.nextBoolean()) {
if (random.nextBoolean()) {
// TODO: cast re-use to D&PE if we can...?
final DocsAndPositionsEnum docsAndPositions = termsEnum.docsAndPositions(liveDocs, null);
DocsAndPositionsEnum docsAndPositions = termsEnum.docsAndPositions(liveDocs, null, true);
if (docsAndPositions == null) {
docsAndPositions = termsEnum.docsAndPositions(liveDocs, null, false);
}
if (docsAndPositions != null) {
return docsAndPositions;
}
@ -586,7 +590,10 @@ public class _TestUtil {
if (random.nextBoolean()) {
if (random.nextBoolean()) {
// TODO: cast re-use to D&PE if we can...?
final DocsAndPositionsEnum docsAndPositions = termsEnum.docsAndPositions(liveDocs, null);
DocsAndPositionsEnum docsAndPositions = termsEnum.docsAndPositions(liveDocs, null, true);
if (docsAndPositions == null) {
docsAndPositions = termsEnum.docsAndPositions(liveDocs, null, false);
}
if (docsAndPositions != null) {
return docsAndPositions;
}

View File

@ -74,7 +74,8 @@ public class TestCachingTokenFilter extends BaseTokenStreamTestCase {
DocsAndPositionsEnum termPositions = MultiFields.getTermPositionsEnum(reader,
MultiFields.getLiveDocs(reader),
"preanalyzed",
new BytesRef("term1"));
new BytesRef("term1"),
false);
assertTrue(termPositions.nextDoc() != termPositions.NO_MORE_DOCS);
assertEquals(1, termPositions.freq());
assertEquals(0, termPositions.nextPosition());
@ -82,7 +83,8 @@ public class TestCachingTokenFilter extends BaseTokenStreamTestCase {
termPositions = MultiFields.getTermPositionsEnum(reader,
MultiFields.getLiveDocs(reader),
"preanalyzed",
new BytesRef("term2"));
new BytesRef("term2"),
false);
assertTrue(termPositions.nextDoc() != termPositions.NO_MORE_DOCS);
assertEquals(2, termPositions.freq());
assertEquals(1, termPositions.nextPosition());
@ -91,7 +93,8 @@ public class TestCachingTokenFilter extends BaseTokenStreamTestCase {
termPositions = MultiFields.getTermPositionsEnum(reader,
MultiFields.getLiveDocs(reader),
"preanalyzed",
new BytesRef("term3"));
new BytesRef("term3"),
false);
assertTrue(termPositions.nextDoc() != termPositions.NO_MORE_DOCS);
assertEquals(1, termPositions.freq());
assertEquals(2, termPositions.nextPosition());

View File

@ -23,7 +23,6 @@ import java.util.Map;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.nestedpulsing.NestedPulsingPostingsFormat;
import org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
@ -70,7 +69,7 @@ public class TestPulsingReuse extends LuceneTestCase {
DocsAndPositionsEnum posReuse = null;
te = segment.terms("foo").iterator(null);
while (te.next() != null) {
posReuse = te.docsAndPositions(null, posReuse);
posReuse = te.docsAndPositions(null, posReuse, false);
allEnums.put(posReuse, true);
}
@ -112,7 +111,7 @@ public class TestPulsingReuse extends LuceneTestCase {
DocsAndPositionsEnum posReuse = null;
te = segment.terms("foo").iterator(null);
while (te.next() != null) {
posReuse = te.docsAndPositions(null, posReuse);
posReuse = te.docsAndPositions(null, posReuse, false);
allEnums.put(posReuse, true);
}

View File

@ -347,7 +347,7 @@ public class TestDocument extends LuceneTestCase {
assertEquals(2, tvs.getUniqueTermCount());
TermsEnum tvsEnum = tvs.iterator(null);
assertEquals(new BytesRef("abc"), tvsEnum.next());
final DocsAndPositionsEnum dpEnum = tvsEnum.docsAndPositions(null, null);
final DocsAndPositionsEnum dpEnum = tvsEnum.docsAndPositions(null, null, false);
if (field.equals("tv")) {
assertNull(dpEnum);
} else {

View File

@ -166,7 +166,7 @@ public class TestCodecs extends LuceneTestCase {
totTF += positions[i].length;
for(int j=0;j<positions[i].length;j++) {
final PositionData pos = positions[i][j];
postingsConsumer.addPosition(pos.pos, pos.payload);
postingsConsumer.addPosition(pos.pos, pos.payload, -1, -1);
}
postingsConsumer.finishDoc();
}
@ -480,7 +480,7 @@ public class TestCodecs extends LuceneTestCase {
if (field.omitTF) {
this.verifyDocs(term.docs, term.positions, _TestUtil.docs(random, termsEnum, null, null, false), false);
} else {
this.verifyDocs(term.docs, term.positions, termsEnum.docsAndPositions(null, null), true);
this.verifyDocs(term.docs, term.positions, termsEnum.docsAndPositions(null, null, false), true);
}
// Test random seek by ord:
@ -500,7 +500,7 @@ public class TestCodecs extends LuceneTestCase {
if (field.omitTF) {
this.verifyDocs(term.docs, term.positions, _TestUtil.docs(random, termsEnum, null, null, false), false);
} else {
this.verifyDocs(term.docs, term.positions, termsEnum.docsAndPositions(null, null), true);
this.verifyDocs(term.docs, term.positions, termsEnum.docsAndPositions(null, null, false), true);
}
}
@ -552,7 +552,7 @@ public class TestCodecs extends LuceneTestCase {
final DocsEnum docsAndFreqs;
final DocsAndPositionsEnum postings;
if (!field.omitTF) {
postings = termsEnum.docsAndPositions(null, null);
postings = termsEnum.docsAndPositions(null, null, false);
if (postings != null) {
docs = docsAndFreqs = postings;
} else {

View File

@ -234,7 +234,7 @@ public class TestDoc extends LuceneTestCase {
out.print(" term=" + field + ":" + tis.term());
out.println(" DF=" + tis.docFreq());
DocsAndPositionsEnum positions = tis.docsAndPositions(reader.getLiveDocs(), null);
DocsAndPositionsEnum positions = tis.docsAndPositions(reader.getLiveDocs(), null, false);
while (positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
out.print(" doc=" + positions.docID());

View File

@ -96,7 +96,7 @@ public class TestDocsAndPositions extends LuceneTestCase {
public DocsAndPositionsEnum getDocsAndPositions(IndexReader reader,
BytesRef bytes, Bits liveDocs) throws IOException {
return reader.termPositionsEnum(null, fieldName, bytes);
return reader.termPositionsEnum(null, fieldName, bytes, false);
}
/**
@ -358,7 +358,7 @@ public class TestDocsAndPositions extends LuceneTestCase {
writer.addDocument(doc);
IndexReader reader = writer.getReader();
IndexReader r = getOnlySegmentReader(reader);
DocsAndPositionsEnum disi = r.termPositionsEnum(null, "foo", new BytesRef("bar"));
DocsAndPositionsEnum disi = r.termPositionsEnum(null, "foo", new BytesRef("bar"), false);
int docid = disi.docID();
assertTrue(docid == -1 || docid == DocIdSetIterator.NO_MORE_DOCS);
assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
@ -366,7 +366,7 @@ public class TestDocsAndPositions extends LuceneTestCase {
// now reuse and check again
TermsEnum te = r.terms("foo").iterator(null);
assertTrue(te.seekExact(new BytesRef("bar"), true));
disi = te.docsAndPositions(null, disi);
disi = te.docsAndPositions(null, disi, false);
docid = disi.docID();
assertTrue(docid == -1 || docid == DocIdSetIterator.NO_MORE_DOCS);
assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);

View File

@ -128,7 +128,7 @@ public class TestDocumentWriter extends LuceneTestCase {
SegmentReader reader = new SegmentReader(info, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR, newIOContext(random));
DocsAndPositionsEnum termPositions = MultiFields.getTermPositionsEnum(reader, MultiFields.getLiveDocs(reader),
"repeated", new BytesRef("repeated"));
"repeated", new BytesRef("repeated"), false);
assertTrue(termPositions.nextDoc() != termPositions.NO_MORE_DOCS);
int freq = termPositions.freq();
assertEquals(2, freq);
@ -199,7 +199,7 @@ public class TestDocumentWriter extends LuceneTestCase {
writer.close();
SegmentReader reader = new SegmentReader(info, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR, newIOContext(random));
DocsAndPositionsEnum termPositions = MultiFields.getTermPositionsEnum(reader, reader.getLiveDocs(), "f1", new BytesRef("a"));
DocsAndPositionsEnum termPositions = MultiFields.getTermPositionsEnum(reader, reader.getLiveDocs(), "f1", new BytesRef("a"), false);
assertTrue(termPositions.nextDoc() != termPositions.NO_MORE_DOCS);
int freq = termPositions.freq();
assertEquals(3, freq);
@ -243,18 +243,18 @@ public class TestDocumentWriter extends LuceneTestCase {
writer.close();
SegmentReader reader = new SegmentReader(info, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR, newIOContext(random));
DocsAndPositionsEnum termPositions = reader.termPositionsEnum(reader.getLiveDocs(), "preanalyzed", new BytesRef("term1"));
DocsAndPositionsEnum termPositions = reader.termPositionsEnum(reader.getLiveDocs(), "preanalyzed", new BytesRef("term1"), false);
assertTrue(termPositions.nextDoc() != termPositions.NO_MORE_DOCS);
assertEquals(1, termPositions.freq());
assertEquals(0, termPositions.nextPosition());
termPositions = reader.termPositionsEnum(reader.getLiveDocs(), "preanalyzed", new BytesRef("term2"));
termPositions = reader.termPositionsEnum(reader.getLiveDocs(), "preanalyzed", new BytesRef("term2"), false);
assertTrue(termPositions.nextDoc() != termPositions.NO_MORE_DOCS);
assertEquals(2, termPositions.freq());
assertEquals(1, termPositions.nextPosition());
assertEquals(3, termPositions.nextPosition());
termPositions = reader.termPositionsEnum(reader.getLiveDocs(), "preanalyzed", new BytesRef("term3"));
termPositions = reader.termPositionsEnum(reader.getLiveDocs(), "preanalyzed", new BytesRef("term3"), false);
assertTrue(termPositions.nextDoc() != termPositions.NO_MORE_DOCS);
assertEquals(1, termPositions.freq());
assertEquals(2, termPositions.nextPosition());

View File

@ -260,17 +260,17 @@ public class TestDuelingCodecs extends LuceneTestCase {
assertEquals(info, term, rightTermsEnum.next());
assertTermStats(leftTermsEnum, rightTermsEnum);
if (deep) {
assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions),
rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions));
assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions),
rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions));
assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions, false),
rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions, false));
assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions, false),
rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions, false));
assertPositionsSkipping(leftTermsEnum.docFreq(),
leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions),
rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions));
leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions, false),
rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions, false));
assertPositionsSkipping(leftTermsEnum.docFreq(),
leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions),
rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions));
leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions, false),
rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions, false));
// with freqs:
assertDocsEnum(leftDocs = leftTermsEnum.docs(null, leftDocs, true),
@ -341,6 +341,8 @@ public class TestDuelingCodecs extends LuceneTestCase {
for (int i = 0; i < freq; i++) {
assertEquals(info, leftDocs.nextPosition(), rightDocs.nextPosition());
assertEquals(info, leftDocs.hasPayload(), rightDocs.hasPayload());
assertEquals(info, leftDocs.startOffset(), rightDocs.startOffset());
assertEquals(info, leftDocs.endOffset(), rightDocs.endOffset());
if (leftDocs.hasPayload()) {
assertEquals(info, leftDocs.getPayload(), rightDocs.getPayload());
}

View File

@ -90,8 +90,8 @@ public class TestFilterIndexReader extends LuceneTestCase {
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
return new TestPositions(super.docsAndPositions(liveDocs, reuse == null ? null : ((FilterDocsAndPositionsEnum) reuse).in));
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
return new TestPositions(super.docsAndPositions(liveDocs, reuse == null ? null : ((FilterDocsAndPositionsEnum) reuse).in, needsOffsets));
}
}
@ -166,7 +166,7 @@ public class TestFilterIndexReader extends LuceneTestCase {
assertEquals(TermsEnum.SeekStatus.FOUND, terms.seekCeil(new BytesRef("one")));
DocsAndPositionsEnum positions = terms.docsAndPositions(MultiFields.getLiveDocs(reader),
null);
null, false);
while (positions.nextDoc() != DocsEnum.NO_MORE_DOCS) {
assertTrue((positions.docID() % 2) == 1);
}

View File

@ -603,8 +603,8 @@ public class TestIndexReader extends LuceneTestCase {
while(enum1.next() != null) {
assertEquals("Different terms", enum1.term(), enum2.next());
DocsAndPositionsEnum tp1 = enum1.docsAndPositions(liveDocs, null);
DocsAndPositionsEnum tp2 = enum2.docsAndPositions(liveDocs, null);
DocsAndPositionsEnum tp1 = enum1.docsAndPositions(liveDocs, null, false);
DocsAndPositionsEnum tp2 = enum2.docsAndPositions(liveDocs, null, false);
while(tp1.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
assertTrue(tp2.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);

View File

@ -39,6 +39,7 @@ import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.IndexSearcher;
@ -899,7 +900,8 @@ public class TestIndexWriter extends LuceneTestCase {
DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(s.getIndexReader(),
MultiFields.getLiveDocs(s.getIndexReader()),
"field",
new BytesRef("a"));
new BytesRef("a"),
false);
assertTrue(tps.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
assertEquals(1, tps.freq());
@ -964,14 +966,14 @@ public class TestIndexWriter extends LuceneTestCase {
Terms tpv = r.getTermVectors(0).terms("field");
TermsEnum termsEnum = tpv.iterator(null);
assertNotNull(termsEnum.next());
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, false);
assertNotNull(dpEnum);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
assertEquals(1, dpEnum.freq());
assertEquals(100, dpEnum.nextPosition());
assertNotNull(termsEnum.next());
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, false);
assertNotNull(dpEnum);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
assertEquals(1, dpEnum.freq());
@ -1634,7 +1636,7 @@ public class TestIndexWriter extends LuceneTestCase {
// Make sure position is still incremented when
// massive term is skipped:
DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(reader, null, "content", new BytesRef("another"));
DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(reader, null, "content", new BytesRef("another"), false);
assertEquals(0, tps.nextDoc());
assertEquals(1, tps.freq());
assertEquals(3, tps.nextPosition());
@ -1761,4 +1763,27 @@ public class TestIndexWriter extends LuceneTestCase {
w1.close();
d.close();
}
public void testChangeIndexOptions() throws Exception {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir,
new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)));
FieldType docsAndFreqs = new FieldType(TextField.TYPE_UNSTORED);
docsAndFreqs.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
FieldType docsOnly = new FieldType(TextField.TYPE_UNSTORED);
docsOnly.setIndexOptions(IndexOptions.DOCS_ONLY);
Document doc = new Document();
doc.add(new Field("field", "a b c", docsAndFreqs));
w.addDocument(doc);
w.addDocument(doc);
doc = new Document();
doc.add(new Field("field", "a b c", docsOnly));
w.addDocument(doc);
w.close();
dir.close();
}
}
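testChangeIndexOptions drives the writer with different per-field IndexOptions. A small sketch of the FieldType setup this change enables, including the new offsets option; field names are placeholders and an open IndexWriter named writer is assumed:

  // DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS is the IndexOption added by this commit
  FieldType withOffsets = new FieldType(TextField.TYPE_UNSTORED);
  withOffsets.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);

  FieldType docsOnly = new FieldType(TextField.TYPE_UNSTORED);
  docsOnly.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY);

  Document doc = new Document();
  doc.add(new Field("body", "a b c", withOffsets));   // full postings with offsets
  doc.add(new Field("tag", "a b c", docsOnly));        // doc IDs only
  writer.addDocument(doc);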

View File

@ -263,14 +263,14 @@ public class TestIndexableField extends LuceneTestCase {
TermsEnum termsEnum = tfv.iterator(null);
assertEquals(new BytesRef(""+counter), termsEnum.next());
assertEquals(1, termsEnum.totalTermFreq());
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, false);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
assertEquals(1, dpEnum.freq());
assertEquals(1, dpEnum.nextPosition());
assertEquals(new BytesRef("text"), termsEnum.next());
assertEquals(1, termsEnum.totalTermFreq());
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, false);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
assertEquals(1, dpEnum.freq());
assertEquals(0, dpEnum.nextPosition());

View File

@ -156,7 +156,8 @@ public class TestLazyProxSkipping extends LuceneTestCase {
DocsAndPositionsEnum tp = MultiFields.getTermPositionsEnum(reader,
MultiFields.getLiveDocs(reader),
this.field,
new BytesRef("b"));
new BytesRef("b"),
false);
for (int i = 0; i < 10; i++) {
tp.nextDoc();
@ -167,7 +168,8 @@ public class TestLazyProxSkipping extends LuceneTestCase {
tp = MultiFields.getTermPositionsEnum(reader,
MultiFields.getLiveDocs(reader),
this.field,
new BytesRef("a"));
new BytesRef("a"),
false);
for (int i = 0; i < 10; i++) {
tp.nextDoc();

View File

@ -173,7 +173,7 @@ public class TestLongPostings extends LuceneTestCase {
System.out.println("\nTEST: iter=" + iter + " doS1=" + doS1);
}
final DocsAndPositionsEnum postings = MultiFields.getTermPositionsEnum(r, null, "field", new BytesRef(term));
final DocsAndPositionsEnum postings = MultiFields.getTermPositionsEnum(r, null, "field", new BytesRef(term), false);
int docID = -1;
while(docID < DocsEnum.NO_MORE_DOCS) {

View File

@ -86,7 +86,8 @@ public class TestMultiLevelSkipList extends LuceneTestCase {
counter = 0;
DocsAndPositionsEnum tp = reader.termPositionsEnum(reader.getLiveDocs(),
term.field(),
new BytesRef(term.text()));
new BytesRef(term.text()),
false);
checkSkipTo(tp, 14, 185); // no skips
checkSkipTo(tp, 17, 190); // one skip on level 0

View File

@ -51,7 +51,7 @@ public class TestOmitPositions extends LuceneTestCase {
IndexReader reader = w.getReader();
w.close();
assertNull(MultiFields.getTermPositionsEnum(reader, null, "foo", new BytesRef("test")));
assertNull(MultiFields.getTermPositionsEnum(reader, null, "foo", new BytesRef("test"), false));
DocsEnum de = _TestUtil.docs(random, reader, "foo", new BytesRef("test"), null, null, true);
while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {

View File

@ -155,7 +155,7 @@ public class TestPayloadProcessorProvider extends LuceneTestCase {
IndexReader reader = IndexReader.open(dir);
try {
int numPayloads = 0;
DocsAndPositionsEnum tpe = MultiFields.getTermPositionsEnum(reader, null, field, text);
DocsAndPositionsEnum tpe = MultiFields.getTermPositionsEnum(reader, null, field, text, false);
while (tpe.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
tpe.nextPosition();
if (tpe.hasPayload()) {

View File

@ -222,7 +222,8 @@ public class TestPayloads extends LuceneTestCase {
tps[i] = MultiFields.getTermPositionsEnum(reader,
MultiFields.getLiveDocs(reader),
terms[i].field(),
new BytesRef(terms[i].text()));
new BytesRef(terms[i].text()),
false);
}
while (tps[0].nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
@ -259,7 +260,8 @@ public class TestPayloads extends LuceneTestCase {
DocsAndPositionsEnum tp = MultiFields.getTermPositionsEnum(reader,
MultiFields.getLiveDocs(reader),
terms[0].field(),
new BytesRef(terms[0].text()));
new BytesRef(terms[0].text()),
false);
tp.nextDoc();
tp.nextPosition();
// NOTE: prior rev of this test was failing to first
@ -287,7 +289,8 @@ public class TestPayloads extends LuceneTestCase {
tp = MultiFields.getTermPositionsEnum(reader,
MultiFields.getLiveDocs(reader),
terms[1].field(),
new BytesRef(terms[1].text()));
new BytesRef(terms[1].text()),
false);
tp.nextDoc();
tp.nextPosition();
assertEquals("Wrong payload length.", 1, tp.getPayload().length);
@ -330,7 +333,8 @@ public class TestPayloads extends LuceneTestCase {
tp = MultiFields.getTermPositionsEnum(reader,
MultiFields.getLiveDocs(reader),
fieldName,
new BytesRef(singleTerm));
new BytesRef(singleTerm),
false);
tp.nextDoc();
tp.nextPosition();
@ -532,7 +536,7 @@ public class TestPayloads extends LuceneTestCase {
DocsAndPositionsEnum tp = null;
while (terms.next() != null) {
String termText = terms.term().utf8ToString();
tp = terms.docsAndPositions(liveDocs, tp);
tp = terms.docsAndPositions(liveDocs, tp, false);
while(tp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
int freq = tp.freq();
for (int i = 0; i < freq; i++) {

View File

@ -0,0 +1,240 @@
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.CannedAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Assume;
public class TestPostingsOffsets extends LuceneTestCase {
public void testBasic() throws Exception {
// Currently only SimpleText can index offsets into postings:
Assume.assumeTrue(Codec.getDefault().getName().equals("SimpleText"));
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random, dir);
Document doc = new Document();
FieldType ft = new FieldType(TextField.TYPE_UNSTORED);
ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Token[] tokens = new Token[] {
makeToken("a", 1, 0, 6),
makeToken("b", 1, 8, 9),
makeToken("a", 1, 9, 17),
makeToken("c", 1, 19, 50),
};
doc.add(new Field("content", new CannedAnalyzer.CannedTokenizer(tokens), ft));
w.addDocument(doc, new CannedAnalyzer(tokens));
IndexReader r = w.getReader();
w.close();
DocsAndPositionsEnum dp = MultiFields.getTermPositionsEnum(r, null, "content", new BytesRef("a"), true);
assertNotNull(dp);
assertEquals(0, dp.nextDoc());
assertEquals(2, dp.freq());
assertEquals(0, dp.nextPosition());
assertEquals(0, dp.startOffset());
assertEquals(6, dp.endOffset());
assertEquals(2, dp.nextPosition());
assertEquals(9, dp.startOffset());
assertEquals(17, dp.endOffset());
assertEquals(DocsEnum.NO_MORE_DOCS, dp.nextDoc());
dp = MultiFields.getTermPositionsEnum(r, null, "content", new BytesRef("b"), true);
assertNotNull(dp);
assertEquals(0, dp.nextDoc());
assertEquals(1, dp.freq());
assertEquals(1, dp.nextPosition());
assertEquals(8, dp.startOffset());
assertEquals(9, dp.endOffset());
assertEquals(DocsEnum.NO_MORE_DOCS, dp.nextDoc());
dp = MultiFields.getTermPositionsEnum(r, null, "content", new BytesRef("c"), true);
assertNotNull(dp);
assertEquals(0, dp.nextDoc());
assertEquals(1, dp.freq());
assertEquals(3, dp.nextPosition());
assertEquals(19, dp.startOffset());
assertEquals(50, dp.endOffset());
assertEquals(DocsEnum.NO_MORE_DOCS, dp.nextDoc());
r.close();
dir.close();
}
public void testRandom() throws Exception {
// Currently only SimpleText can index offsets into postings:
Assume.assumeTrue(Codec.getDefault().getName().equals("SimpleText"));
// token -> docID -> tokens
final Map<String,Map<Integer,List<Token>>> actualTokens = new HashMap<String,Map<Integer,List<Token>>>();
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random, dir);
final int numDocs = atLeast(20);
//final int numDocs = atLeast(5);
FieldType ft = new FieldType(TextField.TYPE_UNSTORED);
// TODO: randomize what IndexOptions we use; also test
// changing this up in one IW buffered segment...:
ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
if (random.nextBoolean()) {
ft.setStoreTermVectors(true);
ft.setStoreTermVectorOffsets(random.nextBoolean());
ft.setStoreTermVectorPositions(random.nextBoolean());
}
for(int docCount=0;docCount<numDocs;docCount++) {
Document doc = new Document();
doc.add(new NumericField("id", docCount));
List<Token> tokens = new ArrayList<Token>();
final int numTokens = atLeast(100);
//final int numTokens = atLeast(20);
int pos = -1;
int offset = 0;
//System.out.println("doc id=" + docCount);
for(int tokenCount=0;tokenCount<numTokens;tokenCount++) {
final String text;
if (random.nextBoolean()) {
text = "a";
} else if (random.nextBoolean()) {
text = "b";
} else if (random.nextBoolean()) {
text = "c";
} else {
text = "d";
}
int posIncr = random.nextBoolean() ? 1 : random.nextInt(5);
if (tokenCount == 0 && posIncr == 0) {
posIncr = 1;
}
final int offIncr = random.nextBoolean() ? 0 : random.nextInt(5);
final int tokenOffset = random.nextInt(5);
final Token token = makeToken(text, posIncr, offset+offIncr, offset+offIncr+tokenOffset);
if (!actualTokens.containsKey(text)) {
actualTokens.put(text, new HashMap<Integer,List<Token>>());
}
final Map<Integer,List<Token>> postingsByDoc = actualTokens.get(text);
if (!postingsByDoc.containsKey(docCount)) {
postingsByDoc.put(docCount, new ArrayList<Token>());
}
postingsByDoc.get(docCount).add(token);
tokens.add(token);
pos += posIncr;
// stuff abs position into type:
token.setType(""+pos);
offset += offIncr + tokenOffset;
//System.out.println(" " + token + " posIncr=" + token.getPositionIncrement() + " pos=" + pos + " off=" + token.startOffset() + "/" + token.endOffset() + " (freq=" + postingsByDoc.get(docCount).size() + ")");
}
doc.add(new Field("content", new CannedAnalyzer.CannedTokenizer(tokens.toArray(new Token[tokens.size()])), ft));
w.addDocument(doc);
}
final IndexReader r = w.getReader();
w.close();
final String[] terms = new String[] {"a", "b", "c", "d"};
for(IndexReader sub : r.getSequentialSubReaders()) {
//System.out.println("\nsub=" + sub);
final TermsEnum termsEnum = sub.fields().terms("content").iterator(null);
DocsEnum docs = null;
DocsAndPositionsEnum docsAndPositions = null;
DocsAndPositionsEnum docsAndPositionsAndOffsets = null;
final int docIDToID[] = FieldCache.DEFAULT.getInts(sub, "id", false);
for(String term : terms) {
//System.out.println(" term=" + term);
if (termsEnum.seekExact(new BytesRef(term), random.nextBoolean())) {
docs = termsEnum.docs(null, docs, true);
assertNotNull(docs);
int doc;
//System.out.println(" doc/freq");
while((doc = docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
//System.out.println(" doc=" + docIDToID[doc] + " docID=" + doc + " " + expected.size() + " freq");
assertNotNull(expected);
assertEquals(expected.size(), docs.freq());
}
docsAndPositions = termsEnum.docsAndPositions(null, docsAndPositions, false);
assertNotNull(docsAndPositions);
//System.out.println(" doc/freq/pos");
while((doc = docsAndPositions.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
//System.out.println(" doc=" + docIDToID[doc] + " " + expected.size() + " freq");
assertNotNull(expected);
assertEquals(expected.size(), docsAndPositions.freq());
for(Token token : expected) {
int pos = Integer.parseInt(token.type());
//System.out.println(" pos=" + pos);
assertEquals(pos, docsAndPositions.nextPosition());
}
}
docsAndPositionsAndOffsets = termsEnum.docsAndPositions(null, docsAndPositions, true);
assertNotNull(docsAndPositionsAndOffsets);
//System.out.println(" doc/freq/pos/offs");
while((doc = docsAndPositionsAndOffsets.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
//System.out.println(" doc=" + docIDToID[doc] + " " + expected.size() + " freq");
assertNotNull(expected);
assertEquals(expected.size(), docsAndPositionsAndOffsets.freq());
for(Token token : expected) {
int pos = Integer.parseInt(token.type());
//System.out.println(" pos=" + pos);
assertEquals(pos, docsAndPositionsAndOffsets.nextPosition());
assertEquals(token.startOffset(), docsAndPositionsAndOffsets.startOffset());
assertEquals(token.endOffset(), docsAndPositionsAndOffsets.endOffset());
}
}
}
}
// TODO: test advance:
}
r.close();
dir.close();
}
private Token makeToken(String text, int posIncr, int startOffset, int endOffset) {
final Token t = new Token();
t.append(text);
t.setPositionIncrement(posIncr);
t.setOffset(startOffset, endOffset);
return t;
}
}
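TestPostingsOffsets above exercises the new offsets-in-postings read path end to end. For reference, the core read pattern it relies on, reduced to a sketch (reader, field, and term names are placeholders; per the assumption in the tests, only SimpleText indexes offsets at this point):

  DocsAndPositionsEnum dp = MultiFields.getTermPositionsEnum(reader, null, "content", new BytesRef("a"), true);
  if (dp != null) {   // callers elsewhere in this commit treat null as "offsets unavailable"
    while (dp.nextDoc() != DocsEnum.NO_MORE_DOCS) {
      final int freq = dp.freq();
      for (int i = 0; i < freq; i++) {
        int pos = dp.nextPosition();
        // character offsets recorded at index time, read straight from the enum
        int start = dp.startOffset();
        int end = dp.endOffset();
      }
    }
  }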

View File

@ -148,7 +148,8 @@ public class TestSegmentReader extends LuceneTestCase {
DocsAndPositionsEnum positions = MultiFields.getTermPositionsEnum(reader,
MultiFields.getLiveDocs(reader),
DocHelper.TEXT_FIELD_1_KEY,
new BytesRef("field"));
new BytesRef("field"),
false);
// NOTE: prior rev of this test was failing to first
// call next here:
assertTrue(positions.nextDoc() != DocsEnum.NO_MORE_DOCS);

View File

@ -406,7 +406,7 @@ public class TestStressIndexing2 extends LuceneTestCase {
BytesRef term2;
while((term2 = termsEnum3.next()) != null) {
System.out.println(" " + term2.utf8ToString() + ": freq=" + termsEnum3.totalTermFreq());
dpEnum = termsEnum3.docsAndPositions(null, dpEnum);
dpEnum = termsEnum3.docsAndPositions(null, dpEnum, false);
if (dpEnum != null) {
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
final int freq = dpEnum.freq();
@ -440,7 +440,7 @@ public class TestStressIndexing2 extends LuceneTestCase {
BytesRef term2;
while((term2 = termsEnum3.next()) != null) {
System.out.println(" " + term2.utf8ToString() + ": freq=" + termsEnum3.totalTermFreq());
dpEnum = termsEnum3.docsAndPositions(null, dpEnum);
dpEnum = termsEnum3.docsAndPositions(null, dpEnum, false);
if (dpEnum != null) {
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
final int freq = dpEnum.freq();
@ -630,8 +630,8 @@ public class TestStressIndexing2 extends LuceneTestCase {
assertEquals(termsEnum1.totalTermFreq(),
termsEnum2.totalTermFreq());
dpEnum1 = termsEnum1.docsAndPositions(null, dpEnum1);
dpEnum2 = termsEnum2.docsAndPositions(null, dpEnum2);
dpEnum1 = termsEnum1.docsAndPositions(null, dpEnum1, false);
dpEnum2 = termsEnum2.docsAndPositions(null, dpEnum2, false);
if (dpEnum1 != null) {
assertNotNull(dpEnum2);
int docID1 = dpEnum1.nextDoc();

View File

@ -259,7 +259,7 @@ public class TestTermVectorsReader extends LuceneTestCase {
//System.out.println("Term: " + term);
assertEquals(testTerms[i], term);
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, false);
assertNotNull(dpEnum);
int doc = dpEnum.docID();
assertTrue(doc == -1 || doc == DocIdSetIterator.NO_MORE_DOCS);
@ -270,18 +270,16 @@ public class TestTermVectorsReader extends LuceneTestCase {
}
assertEquals(DocsEnum.NO_MORE_DOCS, dpEnum.nextDoc());
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
doc = dpEnum.docID();
assertTrue(doc == -1 || doc == DocIdSetIterator.NO_MORE_DOCS);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
assertNotNull(dpEnum);
final OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
assertNotNull(offsetAtt);
assertEquals(dpEnum.freq(), positions[i].length);
for (int j = 0; j < positions[i].length; j++) {
assertEquals(positions[i][j], dpEnum.nextPosition());
assertEquals(j*10, offsetAtt.startOffset());
assertEquals(j*10 + testTerms[i].length(), offsetAtt.endOffset());
assertEquals(j*10, dpEnum.startOffset());
assertEquals(j*10 + testTerms[i].length(), dpEnum.endOffset());
}
assertEquals(DocsEnum.NO_MORE_DOCS, dpEnum.nextDoc());
}
@ -315,7 +313,7 @@ public class TestTermVectorsReader extends LuceneTestCase {
String term = text.utf8ToString();
assertEquals(testTerms[i], term);
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, false);
assertNotNull(dpEnum);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
assertEquals(dpEnum.freq(), positions[i].length);
@ -324,16 +322,14 @@ public class TestTermVectorsReader extends LuceneTestCase {
}
assertEquals(DocsEnum.NO_MORE_DOCS, dpEnum.nextDoc());
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
final OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
assertNotNull(offsetAtt);
assertNotNull(dpEnum);
assertEquals(dpEnum.freq(), positions[i].length);
for (int j = 0; j < positions[i].length; j++) {
assertEquals(positions[i][j], dpEnum.nextPosition());
assertEquals(j*10, offsetAtt.startOffset());
assertEquals(j*10 + testTerms[i].length(), offsetAtt.endOffset());
assertEquals(j*10, dpEnum.startOffset());
assertEquals(j*10 + testTerms[i].length(), dpEnum.endOffset());
}
assertEquals(DocsEnum.NO_MORE_DOCS, dpEnum.nextDoc());
}
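The changes above drop the OffsetAttribute detour: offsets stored in a term vector are now read directly from the enum. A compact sketch of that access pattern (docID and field name are placeholders; reader is an open IndexReader):

  Terms vector = reader.getTermVectors(docID).terms("field");
  TermsEnum termsEnum = vector.iterator(null);
  DocsAndPositionsEnum dpEnum = null;
  BytesRef text;
  while ((text = termsEnum.next()) != null) {
    // true = ask for offsets; null means this vector stored no offsets
    dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
    if (dpEnum == null) {
      continue;
    }
    dpEnum.nextDoc();   // a term vector enum covers a single document
    final int freq = dpEnum.freq();
    for (int i = 0; i < freq; i++) {
      int pos = dpEnum.nextPosition();
      System.out.println(text.utf8ToString() + " pos=" + pos
          + " [" + dpEnum.startOffset() + "," + dpEnum.endOffset() + ")");
    }
  }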

View File

@ -26,7 +26,6 @@ import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
@ -69,34 +68,30 @@ public class TestTermVectorsWriter extends LuceneTestCase {
// Token "" occurred once
assertEquals(1, termsEnum.totalTermFreq());
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
assertNotNull(offsetAtt);
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, true);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
dpEnum.nextPosition();
assertEquals(8, offsetAtt.startOffset());
assertEquals(8, offsetAtt.endOffset());
assertEquals(8, dpEnum.startOffset());
assertEquals(8, dpEnum.endOffset());
assertEquals(DocsEnum.NO_MORE_DOCS, dpEnum.nextDoc());
// Token "abcd" occurred three times
assertEquals(new BytesRef("abcd"), termsEnum.next());
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
assertNotNull(offsetAtt);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
assertEquals(3, termsEnum.totalTermFreq());
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
dpEnum.nextPosition();
assertEquals(0, offsetAtt.startOffset());
assertEquals(4, offsetAtt.endOffset());
assertEquals(0, dpEnum.startOffset());
assertEquals(4, dpEnum.endOffset());
dpEnum.nextPosition();
assertEquals(4, offsetAtt.startOffset());
assertEquals(8, offsetAtt.endOffset());
assertEquals(4, dpEnum.startOffset());
assertEquals(8, dpEnum.endOffset());
dpEnum.nextPosition();
assertEquals(8, offsetAtt.startOffset());
assertEquals(12, offsetAtt.endOffset());
assertEquals(8, dpEnum.startOffset());
assertEquals(12, dpEnum.endOffset());
assertEquals(DocsEnum.NO_MORE_DOCS, dpEnum.nextDoc());
assertNull(termsEnum.next());
@ -122,19 +117,17 @@ public class TestTermVectorsWriter extends LuceneTestCase {
IndexReader r = IndexReader.open(dir);
TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null);
assertNotNull(termsEnum.next());
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
assertNotNull(offsetAtt);
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, true);
assertEquals(2, termsEnum.totalTermFreq());
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
dpEnum.nextPosition();
assertEquals(0, offsetAtt.startOffset());
assertEquals(4, offsetAtt.endOffset());
assertEquals(0, dpEnum.startOffset());
assertEquals(4, dpEnum.endOffset());
dpEnum.nextPosition();
assertEquals(5, offsetAtt.startOffset());
assertEquals(9, offsetAtt.endOffset());
assertEquals(5, dpEnum.startOffset());
assertEquals(9, dpEnum.endOffset());
assertEquals(DocsEnum.NO_MORE_DOCS, dpEnum.nextDoc());
r.close();
@ -159,19 +152,17 @@ public class TestTermVectorsWriter extends LuceneTestCase {
IndexReader r = IndexReader.open(dir);
TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null);
assertNotNull(termsEnum.next());
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
assertNotNull(offsetAtt);
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, true);
assertEquals(2, termsEnum.totalTermFreq());
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
dpEnum.nextPosition();
assertEquals(0, offsetAtt.startOffset());
assertEquals(4, offsetAtt.endOffset());
assertEquals(0, dpEnum.startOffset());
assertEquals(4, dpEnum.endOffset());
dpEnum.nextPosition();
assertEquals(8, offsetAtt.startOffset());
assertEquals(12, offsetAtt.endOffset());
assertEquals(8, dpEnum.startOffset());
assertEquals(12, dpEnum.endOffset());
assertEquals(DocsEnum.NO_MORE_DOCS, dpEnum.nextDoc());
r.close();
@ -200,19 +191,17 @@ public class TestTermVectorsWriter extends LuceneTestCase {
IndexReader r = IndexReader.open(dir);
TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null);
assertNotNull(termsEnum.next());
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
assertNotNull(offsetAtt);
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, true);
assertEquals(2, termsEnum.totalTermFreq());
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
dpEnum.nextPosition();
assertEquals(0, offsetAtt.startOffset());
assertEquals(4, offsetAtt.endOffset());
assertEquals(0, dpEnum.startOffset());
assertEquals(4, dpEnum.endOffset());
dpEnum.nextPosition();
assertEquals(8, offsetAtt.startOffset());
assertEquals(12, offsetAtt.endOffset());
assertEquals(8, dpEnum.startOffset());
assertEquals(12, dpEnum.endOffset());
assertEquals(DocsEnum.NO_MORE_DOCS, dpEnum.nextDoc());
r.close();
@ -238,19 +227,17 @@ public class TestTermVectorsWriter extends LuceneTestCase {
IndexReader r = IndexReader.open(dir);
TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null);
assertNotNull(termsEnum.next());
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
assertNotNull(offsetAtt);
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, true);
assertEquals(2, termsEnum.totalTermFreq());
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
dpEnum.nextPosition();
assertEquals(0, offsetAtt.startOffset());
assertEquals(4, offsetAtt.endOffset());
assertEquals(0, dpEnum.startOffset());
assertEquals(4, dpEnum.endOffset());
dpEnum.nextPosition();
assertEquals(9, offsetAtt.startOffset());
assertEquals(13, offsetAtt.endOffset());
assertEquals(9, dpEnum.startOffset());
assertEquals(13, dpEnum.endOffset());
assertEquals(DocsEnum.NO_MORE_DOCS, dpEnum.nextDoc());
r.close();
@ -277,32 +264,26 @@ public class TestTermVectorsWriter extends LuceneTestCase {
IndexReader r = IndexReader.open(dir);
TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null);
assertNotNull(termsEnum.next());
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
assertNotNull(offsetAtt);
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, true);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
dpEnum.nextPosition();
assertEquals(0, offsetAtt.startOffset());
assertEquals(4, offsetAtt.endOffset());
assertEquals(0, dpEnum.startOffset());
assertEquals(4, dpEnum.endOffset());
assertNotNull(termsEnum.next());
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
assertNotNull(offsetAtt);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
dpEnum.nextPosition();
assertEquals(11, offsetAtt.startOffset());
assertEquals(17, offsetAtt.endOffset());
assertEquals(11, dpEnum.startOffset());
assertEquals(17, dpEnum.endOffset());
assertNotNull(termsEnum.next());
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
assertNotNull(offsetAtt);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
dpEnum.nextPosition();
assertEquals(18, offsetAtt.startOffset());
assertEquals(21, offsetAtt.endOffset());
assertEquals(18, dpEnum.startOffset());
assertEquals(21, dpEnum.endOffset());
r.close();
dir.close();
@ -328,24 +309,20 @@ public class TestTermVectorsWriter extends LuceneTestCase {
IndexReader r = IndexReader.open(dir);
TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null);
assertNotNull(termsEnum.next());
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
assertNotNull(offsetAtt);
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, true);
assertEquals(1, (int) termsEnum.totalTermFreq());
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
dpEnum.nextPosition();
assertEquals(1, offsetAtt.startOffset());
assertEquals(7, offsetAtt.endOffset());
assertEquals(1, dpEnum.startOffset());
assertEquals(7, dpEnum.endOffset());
assertNotNull(termsEnum.next());
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
assertNotNull(offsetAtt);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
dpEnum.nextPosition();
assertEquals(8, offsetAtt.startOffset());
assertEquals(11, offsetAtt.endOffset());
assertEquals(8, dpEnum.startOffset());
assertEquals(11, dpEnum.endOffset());
r.close();
dir.close();
@ -375,24 +352,20 @@ public class TestTermVectorsWriter extends LuceneTestCase {
IndexReader r = IndexReader.open(dir);
TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null);
assertNotNull(termsEnum.next());
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
assertNotNull(offsetAtt);
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, true);
assertEquals(1, (int) termsEnum.totalTermFreq());
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
dpEnum.nextPosition();
assertEquals(0, offsetAtt.startOffset());
assertEquals(4, offsetAtt.endOffset());
assertEquals(0, dpEnum.startOffset());
assertEquals(4, dpEnum.endOffset());
assertNotNull(termsEnum.next());
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
assertNotNull(offsetAtt);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
dpEnum.nextPosition();
assertEquals(6, offsetAtt.startOffset());
assertEquals(12, offsetAtt.endOffset());
assertEquals(6, dpEnum.startOffset());
assertEquals(12, dpEnum.endOffset());
r.close();

View File

@ -17,36 +17,38 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TermContext;
import java.io.IOException;
import java.io.Reader;
import java.util.Collection;
import java.util.LinkedList;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CannedAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import java.io.IOException;
import java.util.Collection;
import java.util.LinkedList;
import java.io.Reader;
import org.apache.lucene.util.TermContext;
/**
* This class tests the MultiPhraseQuery class.
@ -329,68 +331,18 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
indexStore.close();
}
private static class TokenAndPos {
public final String token;
public final int pos;
public TokenAndPos(String token, int pos) {
this.token = token;
this.pos = pos;
}
}
private static class CannedAnalyzer extends Analyzer {
private final TokenAndPos[] tokens;
public CannedAnalyzer(TokenAndPos[] tokens) {
this.tokens = tokens;
}
@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
return new TokenStreamComponents(new CannedTokenizer(tokens));
}
}
private static class CannedTokenizer extends Tokenizer {
private final TokenAndPos[] tokens;
private int upto = 0;
private int lastPos = 0;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
public CannedTokenizer(TokenAndPos[] tokens) {
this.tokens = tokens;
}
@Override
public final boolean incrementToken() throws IOException {
clearAttributes();
if (upto < tokens.length) {
final TokenAndPos token = tokens[upto++];
termAtt.setEmpty();
termAtt.append(token.token);
posIncrAtt.setPositionIncrement(token.pos - lastPos);
lastPos = token.pos;
return true;
} else {
return false;
}
}
@Override
public void reset() throws IOException {
super.reset();
this.upto = 0;
this.lastPos = 0;
}
}
public void testZeroPosIncr() throws IOException {
Directory dir = new RAMDirectory();
final TokenAndPos[] tokens = new TokenAndPos[3];
tokens[0] = new TokenAndPos("a", 0);
tokens[1] = new TokenAndPos("b", 0);
tokens[2] = new TokenAndPos("c", 0);
final Token[] tokens = new Token[3];
tokens[0] = new Token();
tokens[0].append("a");
tokens[0].setPositionIncrement(1);
tokens[1] = new Token();
tokens[1].append("b");
tokens[1].setPositionIncrement(0);
tokens[2] = new Token();
tokens[2].append("c");
tokens[2].setPositionIncrement(0);
RandomIndexWriter writer = new RandomIndexWriter(random, dir, new CannedAnalyzer(tokens));
Document doc = new Document();
@ -429,40 +381,47 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
dir.close();
}
private final static TokenAndPos[] INCR_0_DOC_TOKENS = new TokenAndPos[] {
new TokenAndPos("x", 0),
new TokenAndPos("a", 1),
new TokenAndPos("1", 1),
new TokenAndPos("m", 2), // not existing, relying on slop=2
new TokenAndPos("b", 3),
new TokenAndPos("1", 3),
new TokenAndPos("n", 4), // not existing, relying on slop=2
new TokenAndPos("c", 5),
new TokenAndPos("y", 6)
private static Token makeToken(String text, int posIncr) {
final Token t = new Token();
t.append(text);
t.setPositionIncrement(posIncr);
return t;
}
private final static Token[] INCR_0_DOC_TOKENS = new Token[] {
makeToken("x", 1),
makeToken("a", 1),
makeToken("1", 0),
makeToken("m", 1), // not existing, relying on slop=2
makeToken("b", 1),
makeToken("1", 0),
makeToken("n", 1), // not existing, relying on slop=2
makeToken("c", 1),
makeToken("y", 1)
};
private final static TokenAndPos[] INCR_0_QUERY_TOKENS_AND = new TokenAndPos[] {
new TokenAndPos("a", 0),
new TokenAndPos("1", 0),
new TokenAndPos("b", 1),
new TokenAndPos("1", 1),
new TokenAndPos("c", 2)
private final static Token[] INCR_0_QUERY_TOKENS_AND = new Token[] {
makeToken("a", 1),
makeToken("1", 0),
makeToken("b", 1),
makeToken("1", 0),
makeToken("c", 1)
};
private final static TokenAndPos[][] INCR_0_QUERY_TOKENS_AND_OR_MATCH = new TokenAndPos[][] {
{ new TokenAndPos("a", 0) },
{ new TokenAndPos("x", 0), new TokenAndPos("1", 0) },
{ new TokenAndPos("b", 1) },
{ new TokenAndPos("x", 1), new TokenAndPos("1", 1) },
{ new TokenAndPos("c", 2) }
private final static Token[][] INCR_0_QUERY_TOKENS_AND_OR_MATCH = new Token[][] {
{ makeToken("a", 1) },
{ makeToken("x", 1), makeToken("1", 0) },
{ makeToken("b", 2) },
{ makeToken("x", 2), makeToken("1", 0) },
{ makeToken("c", 3) }
};
private final static TokenAndPos[][] INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN = new TokenAndPos[][] {
{ new TokenAndPos("x", 0) },
{ new TokenAndPos("a", 0), new TokenAndPos("1", 0) },
{ new TokenAndPos("x", 1) },
{ new TokenAndPos("b", 1), new TokenAndPos("1", 1) },
{ new TokenAndPos("c", 2) }
private final static Token[][] INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN = new Token[][] {
{ makeToken("x", 1) },
{ makeToken("a", 1), makeToken("1", 0) },
{ makeToken("x", 2) },
{ makeToken("b", 2), makeToken("1", 0) },
{ makeToken("c", 3) }
};
/**
@ -515,8 +474,10 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
*/
public void testZeroPosIncrSloppyPqAnd() throws IOException {
final PhraseQuery pq = new PhraseQuery();
for (TokenAndPos tap : INCR_0_QUERY_TOKENS_AND) {
pq.add(new Term("field",tap.token), tap.pos);
int pos = -1;
for (Token tap : INCR_0_QUERY_TOKENS_AND) {
pos += tap.getPositionIncrement();
pq.add(new Term("field",tap.toString()), pos);
}
doTestZeroPosIncrSloppy(pq, 0);
pq.setSlop(1);
@ -530,8 +491,10 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
*/
public void testZeroPosIncrSloppyMpqAnd() throws IOException {
final MultiPhraseQuery mpq = new MultiPhraseQuery();
for (TokenAndPos tap : INCR_0_QUERY_TOKENS_AND) {
mpq.add(new Term[]{new Term("field",tap.token)}, tap.pos); //AND logic
int pos = -1;
for (Token tap : INCR_0_QUERY_TOKENS_AND) {
pos += tap.getPositionIncrement();
mpq.add(new Term[]{new Term("field",tap.toString())}, pos); //AND logic
}
doTestZeroPosIncrSloppy(mpq, 0);
mpq.setSlop(1);
@ -545,9 +508,9 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
*/
public void testZeroPosIncrSloppyMpqAndOrMatch() throws IOException {
final MultiPhraseQuery mpq = new MultiPhraseQuery();
for (TokenAndPos tap[] : INCR_0_QUERY_TOKENS_AND_OR_MATCH) {
for (Token tap[] : INCR_0_QUERY_TOKENS_AND_OR_MATCH) {
Term[] terms = tapTerms(tap);
final int pos = tap[0].pos;
final int pos = tap[0].getPositionIncrement()-1;
mpq.add(terms, pos); //AND logic in pos, OR across lines
}
doTestZeroPosIncrSloppy(mpq, 0);
@ -562,9 +525,9 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
*/
public void testZeroPosIncrSloppyMpqAndOrNoMatch() throws IOException {
final MultiPhraseQuery mpq = new MultiPhraseQuery();
for (TokenAndPos tap[] : INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN) {
for (Token tap[] : INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN) {
Term[] terms = tapTerms(tap);
final int pos = tap[0].pos;
final int pos = tap[0].getPositionIncrement()-1;
mpq.add(terms, pos); //AND logic in pos, OR across lines
}
doTestZeroPosIncrSloppy(mpq, 0);
@ -572,10 +535,10 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
doTestZeroPosIncrSloppy(mpq, 0);
}
private Term[] tapTerms(TokenAndPos[] tap) {
private Term[] tapTerms(Token[] tap) {
Term[] terms = new Term[tap.length];
for (int i=0; i<terms.length; i++) {
terms[i] = new Term("field",tap[i].token);
terms[i] = new Term("field",tap[i].toString());
}
return terms;
}
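The query-building loops above recover absolute positions from Token position increments instead of the removed TokenAndPos.pos field. The conversion, pulled out as a sketch of a helper (illustrative only, not a method of this test):

  // Accumulate absolute positions from increments, starting before position 0,
  // and add each term at its resolved position.
  static void addToPhrase(PhraseQuery pq, String field, Token[] tokens) {
    int pos = -1;
    for (Token token : tokens) {
      pos += token.getPositionIncrement();
      pq.add(new Term(field, token.toString()), pos);
    }
  }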

View File

@ -42,8 +42,6 @@ import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.BytesRef;
/**
@ -102,7 +100,8 @@ public class TestPositionIncrement extends LuceneTestCase {
DocsAndPositionsEnum pos = MultiFields.getTermPositionsEnum(searcher.getIndexReader(),
MultiFields.getLiveDocs(searcher.getIndexReader()),
"field",
new BytesRef("1"));
new BytesRef("1"),
false);
pos.nextDoc();
// first token should be at position 0
assertEquals(0, pos.nextPosition());
@ -110,7 +109,8 @@ public class TestPositionIncrement extends LuceneTestCase {
pos = MultiFields.getTermPositionsEnum(searcher.getIndexReader(),
MultiFields.getLiveDocs(searcher.getIndexReader()),
"field",
new BytesRef("2"));
new BytesRef("2"),
false);
pos.nextDoc();
// second token should be at position 2
assertEquals(2, pos.nextPosition());
@ -200,10 +200,6 @@ public class TestPositionIncrement extends LuceneTestCase {
store.close();
}
// stoplist that accepts case-insensitive "stop"
private static final CharacterRunAutomaton stopStopList =
new CharacterRunAutomaton(new RegExp("[sS][tT][oO][pP]").toAutomaton());
public void testPayloadsPos0() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random, dir, new MockPayloadAnalyzer());
@ -217,7 +213,8 @@ public class TestPositionIncrement extends LuceneTestCase {
DocsAndPositionsEnum tp = r.termPositionsEnum(r.getLiveDocs(),
"content",
new BytesRef("a"));
new BytesRef("a"),
false);
int count = 0;
assertTrue(tp.nextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS);

View File

@ -23,7 +23,6 @@ import java.util.Map;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
@ -135,19 +134,19 @@ public class TestTermVectors extends LuceneTestCase {
assertNotNull(terms);
TermsEnum termsEnum = terms.iterator(null);
assertEquals("content", termsEnum.next().utf8ToString());
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, false);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
assertEquals(1, dpEnum.freq());
assertEquals(expectedPositions[0], dpEnum.nextPosition());
assertEquals("here", termsEnum.next().utf8ToString());
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, false);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
assertEquals(1, dpEnum.freq());
assertEquals(expectedPositions[1], dpEnum.nextPosition());
assertEquals("some", termsEnum.next().utf8ToString());
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, false);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
assertEquals(1, dpEnum.freq());
assertEquals(expectedPositions[2], dpEnum.nextPosition());
@ -171,31 +170,21 @@ public class TestTermVectors extends LuceneTestCase {
TermsEnum termsEnum = vectors.terms("field").iterator(null);
assertNotNull(termsEnum.next());
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
OffsetAttribute offsetAtt = dpEnum == null ? null : dpEnum.attributes().getAttribute(OffsetAttribute.class);
boolean shouldBePosVector = hits[i].doc % 2 == 0;
assertTrue(!shouldBePosVector
|| (shouldBePosVector && dpEnum != null));
boolean shouldBeOffVector = hits[i].doc % 3 == 0;
assertTrue(!shouldBeOffVector
|| (shouldBeOffVector && offsetAtt != null));
if (shouldBePosVector || shouldBeOffVector) {
while(true) {
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, shouldBeOffVector);
assertNotNull(dpEnum);
offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
dpEnum.nextPosition();
if (shouldBePosVector) {
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
}
if (shouldBeOffVector) {
assertNotNull(offsetAtt);
} else {
assertNull(offsetAtt);
assertTrue(dpEnum.startOffset() != -1);
assertTrue(dpEnum.endOffset() != -1);
}
if (termsEnum.next() == null) {
@ -437,7 +426,7 @@ public class TestTermVectors extends LuceneTestCase {
assertNotNull(termsEnum.next());
assertEquals("one", termsEnum.term().utf8ToString());
assertEquals(5, termsEnum.totalTermFreq());
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, false);
assertNotNull(dpEnum);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
assertEquals(5, dpEnum.freq());
@ -445,16 +434,14 @@ public class TestTermVectors extends LuceneTestCase {
assertEquals(i, dpEnum.nextPosition());
}
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
assertNotNull(dpEnum);
OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
assertNotNull(offsetAtt);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
assertEquals(5, dpEnum.freq());
for(int i=0;i<5;i++) {
dpEnum.nextPosition();
assertEquals(4*i, offsetAtt.startOffset());
assertEquals(4*i+3, offsetAtt.endOffset());
assertEquals(4*i, dpEnum.startOffset());
assertEquals(4*i+3, dpEnum.endOffset());
}
reader.close();
}

View File

@ -283,7 +283,8 @@ public class TestClassicAnalyzer extends BaseTokenStreamTestCase {
DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(reader,
MultiFields.getLiveDocs(reader),
"content",
new BytesRef("another"));
new BytesRef("another"),
false);
assertTrue(tps.nextDoc() != DocsEnum.NO_MORE_DOCS);
assertEquals(1, tps.freq());
assertEquals(3, tps.nextPosition());

View File

@ -24,7 +24,6 @@ import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@ -110,16 +109,15 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
TermsEnum termsEnum = vector.iterator(null);
termsEnum.next();
assertEquals(2, termsEnum.totalTermFreq());
DocsAndPositionsEnum positions = termsEnum.docsAndPositions(null, null);
OffsetAttribute offsetAtt = positions.attributes().getAttribute(OffsetAttribute.class);
DocsAndPositionsEnum positions = termsEnum.docsAndPositions(null, null, true);
assertTrue(positions.nextDoc() != DocsEnum.NO_MORE_DOCS);
assertEquals(2, positions.freq());
positions.nextPosition();
assertEquals(0, offsetAtt.startOffset());
assertEquals(4, offsetAtt.endOffset());
assertEquals(0, positions.startOffset());
assertEquals(4, positions.endOffset());
positions.nextPosition();
assertEquals(8, offsetAtt.startOffset());
assertEquals(12, offsetAtt.endOffset());
assertEquals(8, positions.startOffset());
assertEquals(12, positions.endOffset());
assertEquals(DocsEnum.NO_MORE_DOCS, positions.nextDoc());
r.close();
dir.close();

View File

@ -53,7 +53,7 @@ public class PayloadIterator {
this.buffer = buffer;
// TODO (Facet): avoid Multi*?
Bits liveDocs = MultiFields.getLiveDocs(indexReader);
this.tp = MultiFields.getTermPositionsEnum(indexReader, liveDocs, term.field(), term.bytes());
this.tp = MultiFields.getTermPositionsEnum(indexReader, liveDocs, term.field(), term.bytes(), false);
}
/**

View File

@ -104,7 +104,8 @@ class ParentArray {
// TODO (Facet): avoid Multi*?
Bits liveDocs = MultiFields.getLiveDocs(indexReader);
DocsAndPositionsEnum positions = MultiFields.getTermPositionsEnum(indexReader, liveDocs,
Consts.FIELD_PAYLOADS, new BytesRef(Consts.PAYLOAD_PARENT));
Consts.FIELD_PAYLOADS, new BytesRef(Consts.PAYLOAD_PARENT),
false);
if ((positions == null || positions.advance(first) == DocsAndPositionsEnum.NO_MORE_DOCS) && first < num) {
throw new CorruptIndexException("Missing parent data for category " + first);
}
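ParentArray only needs to skip to a starting document and then read payloads; a reduced sketch of that access pattern, with variable names taken from the hunk above and the payload decoding elided:

  DocsAndPositionsEnum positions = MultiFields.getTermPositionsEnum(indexReader, liveDocs,
      Consts.FIELD_PAYLOADS, new BytesRef(Consts.PAYLOAD_PARENT), false);
  if (positions == null || positions.advance(first) == DocsAndPositionsEnum.NO_MORE_DOCS) {
    // no parent data at or beyond 'first'
  } else {
    do {
      positions.nextPosition();            // payloads hang off positions
      if (positions.hasPayload()) {
        // positions.getPayload() holds the encoded parent ordinal
      }
    } while (positions.nextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS);
  }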

View File

@ -8,7 +8,6 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Fields;
@ -283,18 +282,17 @@ public class TermVectorComponent extends SearchComponent implements SolrCoreAwar
termInfo.add("tf", freq);
}
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, fieldOptions.offsets);
boolean useOffsets = fieldOptions.offsets;
if (dpEnum == null) {
useOffsets = false;
dpEnum = termsEnum.docsAndPositions(null, dpEnum, false);
}
boolean usePositions = false;
boolean useOffsets = false;
OffsetAttribute offsetAtt = null;
if (dpEnum != null) {
dpEnum.nextDoc();
usePositions = fieldOptions.positions;
if (fieldOptions.offsets && dpEnum.attributes().hasAttribute(OffsetAttribute.class)) {
useOffsets = true;
offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
}
}
NamedList<Number> theOffsets = null;
@ -317,8 +315,8 @@ public class TermVectorComponent extends SearchComponent implements SolrCoreAwar
}
if (theOffsets != null) {
theOffsets.add("start", offsetAtt.startOffset());
theOffsets.add("end", offsetAtt.endOffset());
theOffsets.add("start", dpEnum.startOffset());
theOffsets.add("end", dpEnum.endOffset());
}
}
}
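The component above first asks for offsets and quietly downgrades when the term vector does not carry them. That fallback, isolated as a sketch of a reusable helper (the method name is illustrative):

  // Ask for offsets if the caller wants them; fall back to positions-only when
  // the term vector (or postings) was not indexed with offsets.
  static DocsAndPositionsEnum positionsEnum(TermsEnum termsEnum, DocsAndPositionsEnum reuse, boolean wantOffsets) throws IOException {
    DocsAndPositionsEnum dpEnum = null;
    if (wantOffsets) {
      dpEnum = termsEnum.docsAndPositions(null, reuse, true);
    }
    if (dpEnum == null) {
      dpEnum = termsEnum.docsAndPositions(null, reuse, false);   // may still be null if no positions at all
    }
    return dpEnum;
  }

The component also remembers whether the first request succeeded, since startOffset() and endOffset() are only meaningful in that case.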