mirror of https://github.com/apache/lucene.git

LUCENE-3684: add offsets to postings APIs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1231794 13f79535-47bb-0310-9956-ffa450edef68

parent 1e09d9eff6
commit 11f33ee521
@@ -226,6 +226,10 @@ Changes in backwards compatibility policy
 * LUCENE-3640: Removed IndexSearcher.close(), because IndexSearcher no longer
   takes a Directory and no longer "manages" IndexReaders, it is a no-op.
   (Robert Muir)
 
+* LUCENE-3684: Add offsets into DocsAndPositionsEnum, and a few
+  FieldInfo.IndexOption: DOCS_AND_POSITIONS_AND_OFFSETS. (Robert
+  Muir, Mike McCandless)
+
 Changes in Runtime Behavior
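The entry above is the whole user-facing summary; the hunks below thread the new needsOffsets flag and the startOffset()/endOffset() accessors through the postings producers and their callers. As a minimal consumer-side sketch of the resulting API (not part of the patch; the class and method names here are invented, and it assumes the trunk classes exactly as changed in this commit):

import java.io.IOException;

import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

// Illustrative helper: print position and character offsets for every term.
class OffsetsDumper {
  static void dump(Terms terms) throws IOException {
    final TermsEnum termsEnum = terms.iterator(null);
    DocsAndPositionsEnum dpEnum = null;
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
      // needsOffsets=true: returns null if offsets (or positions) were not indexed.
      dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
      if (dpEnum == null) {
        continue;
      }
      while (dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        final int freq = dpEnum.freq();
        for (int i = 0; i < freq; i++) {
          final int pos = dpEnum.nextPosition();
          System.out.println(term.utf8ToString() + " doc=" + dpEnum.docID()
              + " pos=" + pos + " offset=" + dpEnum.startOffset() + "-" + dpEnum.endOffset());
        }
      }
    }
  }
}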
@@ -126,7 +126,7 @@ public class TokenSources {
   private static boolean hasPositions(Terms vector) throws IOException {
     final TermsEnum termsEnum = vector.iterator(null);
     if (termsEnum.next() != null) {
-      DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
+      DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, false);
       if (dpEnum != null) {
         int pos = dpEnum.nextPosition();
         if (pos >= 0) {
@@ -219,22 +219,21 @@ public class TokenSources {
     DocsAndPositionsEnum dpEnum = null;
     while ((text = termsEnum.next()) != null) {
 
-      dpEnum = termsEnum.docsAndPositions(null, dpEnum);
-      if (dpEnum == null || (!dpEnum.attributes().hasAttribute(OffsetAttribute.class))) {
+      dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
+      if (dpEnum == null) {
         throw new IllegalArgumentException(
             "Required TermVector Offset information was not found");
       }
 
       final String term = text.utf8ToString();
 
-      final OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
       dpEnum.nextDoc();
       final int freq = dpEnum.freq();
       for(int posUpto=0;posUpto<freq;posUpto++) {
         final int pos = dpEnum.nextPosition();
         final Token token = new Token(term,
-                                      offsetAtt.startOffset(),
-                                      offsetAtt.endOffset());
+                                      dpEnum.startOffset(),
+                                      dpEnum.endOffset());
         if (tokenPositionsGuaranteedContiguous && pos != -1) {
           // We have positions stored and a guarantee that the token position
           // information is contiguous
@@ -60,22 +60,23 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
     BytesRef text;
     DocsAndPositionsEnum dpEnum = null;
     while((text = termsEnum.next()) != null) {
-      dpEnum = termsEnum.docsAndPositions(null, dpEnum);
+      dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
+      final boolean hasOffsets;
+      if (dpEnum == null) {
+        hasOffsets = false;
+        dpEnum = termsEnum.docsAndPositions(null, dpEnum, false);
+      } else {
+        hasOffsets = true;
+      }
       dpEnum.nextDoc();
       final int freq = dpEnum.freq();
-      final OffsetAttribute offsetAtt;
-      if (dpEnum.attributes().hasAttribute(OffsetAttribute.class)) {
-        offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
-      } else {
-        offsetAtt = null;
-      }
       for (int j = 0; j < freq; j++) {
         int pos = dpEnum.nextPosition();
         Token token;
-        if (offsetAtt != null) {
+        if (hasOffsets) {
           token = new Token(text.utf8ToString(),
-                            offsetAtt.startOffset(),
-                            offsetAtt.endOffset());
+                            dpEnum.startOffset(),
+                            dpEnum.endOffset());
         } else {
           token = new Token();
           token.setEmpty().append(text.utf8ToString());
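The hunk above asks for offsets first and silently falls back to a positions-only enum when the field has none. The same two-step request can be factored into a small helper; the following is only a sketch of that pattern (the class name is invented), not code from the patch:

import java.io.IOException;

import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.TermsEnum;

// Illustrative wrapper around the "try offsets, then fall back" request pattern.
final class PostingsWithOffsets {
  final DocsAndPositionsEnum postings;
  final boolean hasOffsets;

  private PostingsWithOffsets(DocsAndPositionsEnum postings, boolean hasOffsets) {
    this.postings = postings;
    this.hasOffsets = hasOffsets;
  }

  // Returns null only if the field has no positions at all.
  static PostingsWithOffsets get(TermsEnum termsEnum, DocsAndPositionsEnum reuse) throws IOException {
    DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, reuse, true);
    if (dpEnum != null) {
      return new PostingsWithOffsets(dpEnum, true);
    }
    dpEnum = termsEnum.docsAndPositions(null, reuse, false);
    return dpEnum == null ? null : new PostingsWithOffsets(dpEnum, false);
  }
}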
@@ -21,7 +21,6 @@ import java.util.Collections;
 import java.util.LinkedList;
 import java.util.Set;
 
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.IndexReader;
@@ -101,29 +100,19 @@ public class FieldTermStack {
       if (!termSet.contains(term)) {
         continue;
       }
-      dpEnum = termsEnum.docsAndPositions(null, dpEnum);
+      dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
       if (dpEnum == null) {
         // null snippet
         return;
       }
 
-      if (!dpEnum.attributes().hasAttribute(OffsetAttribute.class)) {
-        // null snippet
-        return;
-      }
       dpEnum.nextDoc();
 
-      final OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
-
       final int freq = dpEnum.freq();
 
       for(int i = 0;i < freq;i++) {
-        final int pos = dpEnum.nextPosition();
-        if (pos == -1) {
-          // null snippet
-          return;
-        }
-        termList.add(new TermInfo(term, offsetAtt.startOffset(), offsetAtt.endOffset(), pos));
+        int pos = dpEnum.nextPosition();
+        termList.add(new TermInfo(term, dpEnum.startOffset(), dpEnum.endOffset(), pos));
       }
     }
@@ -953,7 +953,10 @@ public class MemoryIndex {
       }
 
       @Override
-      public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) {
+      public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) {
+        if (needsOffsets) {
+          return null;
+        }
         if (reuse == null || !(reuse instanceof MemoryDocsAndPositionsEnum)) {
           reuse = new MemoryDocsAndPositionsEnum();
         }
@@ -1065,6 +1068,16 @@ public class MemoryIndex {
         return positions.get(posUpto++);
       }
 
+      @Override
+      public int startOffset() {
+        return -1;
+      }
+
+      @Override
+      public int endOffset() {
+        return -1;
+      }
+
       @Override
       public boolean hasPayload() {
         return false;
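MemoryIndex stores no offsets, so its enum both refuses needsOffsets=true requests and reports the -1 sentinel from startOffset()/endOffset(). Callers that did not ask for offsets should treat -1 as "no offset available"; a one-method sketch of that check (the helper name is invented, not part of the patch):

import java.io.IOException;

import org.apache.lucene.index.DocsAndPositionsEnum;

final class OffsetSentinel {
  // True when the current position carries a real offset rather than the -1 sentinel.
  static boolean hasOffset(DocsAndPositionsEnum dpEnum) throws IOException {
    // Implementations in this commit return -1 from both accessors when offsets are unavailable.
    return dpEnum.startOffset() != -1 && dpEnum.endOffset() != -1;
  }
}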
@@ -206,7 +206,7 @@ public class MemoryIndexTest extends BaseTokenStreamTestCase {
     MemoryIndex memory = new MemoryIndex();
     memory.addField("foo", "bar", analyzer);
     IndexReader reader = memory.createSearcher().getIndexReader();
-    DocsAndPositionsEnum disi = reader.termPositionsEnum(null, "foo", new BytesRef("bar"));
+    DocsAndPositionsEnum disi = reader.termPositionsEnum(null, "foo", new BytesRef("bar"), false);
     int docid = disi.docID();
     assertTrue(docid == -1 || docid == DocIdSetIterator.NO_MORE_DOCS);
     assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
@@ -214,7 +214,7 @@ public class MemoryIndexTest extends BaseTokenStreamTestCase {
     // now reuse and check again
     TermsEnum te = reader.terms("foo").iterator(null);
     assertTrue(te.seekExact(new BytesRef("bar"), true));
-    disi = te.docsAndPositions(null, disi);
+    disi = te.docsAndPositions(null, disi, false);
     docid = disi.docID();
     assertTrue(docid == -1 || docid == DocIdSetIterator.NO_MORE_DOCS);
     assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
@@ -697,16 +697,20 @@ public class BlockTermsReader extends FieldsProducer {
     }
 
     @Override
-    public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
-      //System.out.println("BTR.d&p this=" + this);
-      decodeMetaData();
-      if (fieldInfo.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
+    public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
+      if (fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
+        // Positions were not indexed:
         return null;
-      } else {
-        DocsAndPositionsEnum dpe = postingsReader.docsAndPositions(fieldInfo, state, liveDocs, reuse);
-        //System.out.println("  return d&pe=" + dpe);
-        return dpe;
       }
+
+      if (needsOffsets &&
+          fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) < 0) {
+        // Offsets were not indexed:
+        return null;
+      }
+
+      decodeMetaData();
+      return postingsReader.docsAndPositions(fieldInfo, state, liveDocs, reuse, needsOffsets);
     }
 
     @Override
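Note the switch from an equality test to IndexOptions.compareTo: the constants are declared in order of increasing indexed detail (DOCS_ONLY, DOCS_AND_FREQS, DOCS_AND_FREQS_AND_POSITIONS, DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS), so ">= DOCS_AND_FREQS_AND_POSITIONS" reads as "positions or better". A standalone sketch of the two guards, assuming the FieldInfo.IndexOptions nesting of this revision (the helper class is invented for illustration):

import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;

final class IndexOptionsGuards {
  // Field was indexed with positions (and possibly offsets).
  static boolean hasPositions(FieldInfo fieldInfo) {
    return fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
  }

  // Field was indexed with character offsets in the postings.
  static boolean hasOffsets(FieldInfo fieldInfo) {
    return fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
  }
}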
@@ -881,13 +881,20 @@ public class BlockTreeTermsReader extends FieldsProducer {
       }
 
       @Override
-      public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
-        if (fieldInfo.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
+      public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
+        if (fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
+          // Positions were not indexed:
           return null;
-        } else {
-          currentFrame.decodeMetaData();
-          return postingsReader.docsAndPositions(fieldInfo, currentFrame.termState, skipDocs, reuse);
         }
+
+        if (needsOffsets &&
+            fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) < 0) {
+          // Offsets were not indexed:
+          return null;
+        }
+
+        currentFrame.decodeMetaData();
+        return postingsReader.docsAndPositions(fieldInfo, currentFrame.termState, skipDocs, reuse, needsOffsets);
       }
 
       private int getState() {
@@ -2096,17 +2103,21 @@ public class BlockTreeTermsReader extends FieldsProducer {
       }
 
      @Override
-      public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
-        assert !eof;
-        //System.out.println("BTR.d&p this=" + this);
-        if (fieldInfo.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
+      public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
+        if (fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
+          // Positions were not indexed:
           return null;
-        } else {
-          currentFrame.decodeMetaData();
-          DocsAndPositionsEnum dpe = postingsReader.docsAndPositions(fieldInfo, currentFrame.state, skipDocs, reuse);
-          //System.out.println("  return d&pe=" + dpe);
-          return dpe;
         }
+
+        if (needsOffsets &&
+            fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) < 0) {
+          // Offsets were not indexed:
+          return null;
+        }
+
+        assert !eof;
+        currentFrame.decodeMetaData();
+        return postingsReader.docsAndPositions(fieldInfo, currentFrame.state, skipDocs, reuse, needsOffsets);
       }
 
       @Override
@@ -102,6 +102,16 @@ public final class MappingMultiDocsAndPositionsEnum extends DocsAndPositionsEnum
   public int nextPosition() throws IOException {
     return current.nextPosition();
   }
 
+  @Override
+  public int startOffset() throws IOException {
+    return current.startOffset();
+  }
+
+  @Override
+  public int endOffset() throws IOException {
+    return current.endOffset();
+  }
+
   @Override
   public BytesRef getPayload() throws IOException {
@@ -44,12 +44,12 @@ public abstract class PostingsConsumer {
     int docBase;
   }
 
-  /** Add a new position & payload.  A null payload means no
-   *  payload; a non-null payload with zero length also
-   *  means no payload.  Caller may reuse the {@link
-   *  BytesRef} for the payload between calls (method must
-   *  fully consume the payload). */
-  public abstract void addPosition(int position, BytesRef payload) throws IOException;
+  /** Add a new position & payload, and start/end offset.  A
+   *  null payload means no payload; a non-null payload with
+   *  zero length also means no payload.  Caller may reuse
+   *  the {@link BytesRef} for the payload between calls
+   *  (method must fully consume the payload). */
+  public abstract void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException;
 
   /** Called when we are done adding positions & payloads
    *  for each doc.  Not called when the field omits term
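Every PostingsConsumer implementation now receives a start and end offset with each position; codecs that cannot store offsets are expected to accept the arguments and drop them, and the merge code passes -1/-1 when the source has no offsets. A minimal sketch of such an implementation, assuming the org.apache.lucene.codecs package layout of this revision (the class and the writePosition hook are invented, not part of the patch):

import java.io.IOException;

import org.apache.lucene.codecs.PostingsConsumer;
import org.apache.lucene.util.BytesRef;

// Illustrative consumer that accepts the new 4-argument signature but does not store offsets.
abstract class PositionsOnlyConsumer extends PostingsConsumer {
  @Override
  public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
    // startOffset/endOffset may be -1 when the source had no offsets; this
    // consumer ignores them either way and only persists position + payload.
    writePosition(position, payload);
  }

  /** Hook for the actual position/payload encoding. */
  protected abstract void writePosition(int position, BytesRef payload) throws IOException;
}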
@ -88,7 +88,7 @@ public abstract class PostingsConsumer {
|
|||
df++;
|
||||
totTF += freq;
|
||||
}
|
||||
} else {
|
||||
} else if (mergeState.fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
|
||||
final DocsAndPositionsEnum postingsEnum = (DocsAndPositionsEnum) postings;
|
||||
while(true) {
|
||||
final int doc = postingsEnum.nextDoc();
|
||||
|
@ -107,7 +107,32 @@ public abstract class PostingsConsumer {
|
|||
} else {
|
||||
payload = null;
|
||||
}
|
||||
this.addPosition(position, payload);
|
||||
this.addPosition(position, payload, -1, -1);
|
||||
}
|
||||
this.finishDoc();
|
||||
df++;
|
||||
}
|
||||
} else {
|
||||
assert mergeState.fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
|
||||
final DocsAndPositionsEnum postingsEnum = (DocsAndPositionsEnum) postings;
|
||||
while(true) {
|
||||
final int doc = postingsEnum.nextDoc();
|
||||
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
|
||||
break;
|
||||
}
|
||||
visitedDocs.set(doc);
|
||||
final int freq = postingsEnum.freq();
|
||||
this.startDoc(doc, freq);
|
||||
totTF += freq;
|
||||
for(int i=0;i<freq;i++) {
|
||||
final int position = postingsEnum.nextPosition();
|
||||
final BytesRef payload;
|
||||
if (postingsEnum.hasPayload()) {
|
||||
payload = postingsEnum.getPayload();
|
||||
} else {
|
||||
payload = null;
|
||||
}
|
||||
this.addPosition(position, payload, postingsEnum.startOffset(), postingsEnum.endOffset());
|
||||
}
|
||||
this.finishDoc();
|
||||
df++;
|
||||
|
|
|
@ -55,7 +55,8 @@ public abstract class PostingsReaderBase implements Closeable {
|
|||
|
||||
/** Must fully consume state, since after this call that
|
||||
* TermState may be reused. */
|
||||
public abstract DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState state, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException;
|
||||
public abstract DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState state, Bits skipDocs, DocsAndPositionsEnum reuse,
|
||||
boolean needsOffsets) throws IOException;
|
||||
|
||||
public abstract void close() throws IOException;
|
||||
|
||||
|
|
|
@ -20,7 +20,6 @@ package org.apache.lucene.codecs;
|
|||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
|
@ -185,7 +184,6 @@ public abstract class TermVectorsWriter implements Closeable {
|
|||
String lastFieldName = null;
|
||||
|
||||
while((fieldName = fieldsEnum.next()) != null) {
|
||||
|
||||
final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldName);
|
||||
|
||||
assert lastFieldName == null || fieldName.compareTo(lastFieldName) > 0: "lastFieldName=" + lastFieldName + " fieldName=" + fieldName;
|
||||
|
@ -200,79 +198,79 @@ public abstract class TermVectorsWriter implements Closeable {
|
|||
if (numTerms == -1) {
|
||||
throw new IllegalStateException("vector.getUniqueTermCount() must be implemented (it returned -1)");
|
||||
}
|
||||
|
||||
final boolean positions;
|
||||
|
||||
OffsetAttribute offsetAtt;
|
||||
|
||||
final TermsEnum termsEnum = terms.iterator(null);
|
||||
|
||||
DocsAndPositionsEnum docsAndPositionsEnum = null;
|
||||
|
||||
if (termsEnum.next() != null) {
|
||||
assert numTerms > 0;
|
||||
docsAndPositionsEnum = termsEnum.docsAndPositions(null, null);
|
||||
if (docsAndPositionsEnum != null) {
|
||||
// has positions
|
||||
positions = true;
|
||||
if (docsAndPositionsEnum.attributes().hasAttribute(OffsetAttribute.class)) {
|
||||
offsetAtt = docsAndPositionsEnum.attributes().getAttribute(OffsetAttribute.class);
|
||||
} else {
|
||||
offsetAtt = null;
|
||||
}
|
||||
} else {
|
||||
positions = false;
|
||||
offsetAtt = null;
|
||||
}
|
||||
} else {
|
||||
// no terms in this field (hmm why is field present
|
||||
// then...?)
|
||||
assert numTerms == 0;
|
||||
positions = false;
|
||||
offsetAtt = null;
|
||||
}
|
||||
|
||||
startField(fieldInfo, numTerms, positions, offsetAtt != null);
|
||||
boolean startedField = false;
|
||||
|
||||
int termCount = 1;
|
||||
// NOTE: this is tricky, because TermVectors allow
|
||||
// indexing offsets but NOT positions. So we must
|
||||
// lazily init the field by checking whether first
|
||||
// position we see is -1 or not.
|
||||
|
||||
int termCount = 0;
|
||||
while(termsEnum.next() != null) {
|
||||
termCount++;
|
||||
|
||||
// NOTE: we already .next()'d the TermsEnum above, to
|
||||
// peek @ first term to see if positions/offsets are
|
||||
// present
|
||||
while(true) {
|
||||
final int freq = (int) termsEnum.totalTermFreq();
|
||||
startTerm(termsEnum.term(), freq);
|
||||
|
||||
if (positions || offsetAtt != null) {
|
||||
DocsAndPositionsEnum dp = termsEnum.docsAndPositions(null, docsAndPositionsEnum);
|
||||
// TODO: add startOffset()/endOffset() to d&pEnum... this is insanity
|
||||
if (dp != docsAndPositionsEnum) {
|
||||
// producer didnt reuse, must re-pull attributes
|
||||
if (offsetAtt != null) {
|
||||
assert dp.attributes().hasAttribute(OffsetAttribute.class);
|
||||
offsetAtt = dp.attributes().getAttribute(OffsetAttribute.class);
|
||||
}
|
||||
}
|
||||
docsAndPositionsEnum = dp;
|
||||
if (startedField) {
|
||||
startTerm(termsEnum.term(), freq);
|
||||
}
|
||||
|
||||
// TODO: we need a "query" API where we can ask (via
|
||||
// flex API) what this term was indexed with...
|
||||
// Both positions & offsets:
|
||||
docsAndPositionsEnum = termsEnum.docsAndPositions(null, null, true);
|
||||
final boolean hasOffsets;
|
||||
boolean hasPositions = false;
|
||||
if (docsAndPositionsEnum == null) {
|
||||
// Fallback: no offsets
|
||||
docsAndPositionsEnum = termsEnum.docsAndPositions(null, null, false);
|
||||
hasOffsets = false;
|
||||
} else {
|
||||
hasOffsets = true;
|
||||
}
|
||||
|
||||
if (docsAndPositionsEnum != null) {
|
||||
final int docID = docsAndPositionsEnum.nextDoc();
|
||||
assert docID != DocsEnum.NO_MORE_DOCS;
|
||||
assert docsAndPositionsEnum.freq() == freq;
|
||||
|
||||
for(int posUpto=0; posUpto<freq; posUpto++) {
|
||||
final int pos = docsAndPositionsEnum.nextPosition();
|
||||
final int startOffset = offsetAtt == null ? -1 : offsetAtt.startOffset();
|
||||
final int endOffset = offsetAtt == null ? -1 : offsetAtt.endOffset();
|
||||
|
||||
if (!startedField) {
|
||||
assert numTerms > 0;
|
||||
hasPositions = pos != -1;
|
||||
startField(fieldInfo, numTerms, hasPositions, hasOffsets);
|
||||
startTerm(termsEnum.term(), freq);
|
||||
startedField = true;
|
||||
}
|
||||
final int startOffset;
|
||||
final int endOffset;
|
||||
if (hasOffsets) {
|
||||
startOffset = docsAndPositionsEnum.startOffset();
|
||||
endOffset = docsAndPositionsEnum.endOffset();
|
||||
assert startOffset != -1;
|
||||
assert endOffset != -1;
|
||||
} else {
|
||||
startOffset = -1;
|
||||
endOffset = -1;
|
||||
}
|
||||
assert !hasPositions || pos >= 0;
|
||||
addPosition(pos, startOffset, endOffset);
|
||||
}
|
||||
} else {
|
||||
if (!startedField) {
|
||||
assert numTerms > 0;
|
||||
startField(fieldInfo, numTerms, hasPositions, hasOffsets);
|
||||
startTerm(termsEnum.term(), freq);
|
||||
startedField = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (termsEnum.next() == null) {
|
||||
assert termCount == numTerms;
|
||||
break;
|
||||
}
|
||||
termCount++;
|
||||
}
|
||||
assert termCount == numTerms;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -119,8 +119,7 @@ public abstract class TermsConsumer {
|
|||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
assert mergeState.fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
|
||||
} else if (mergeState.fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
|
||||
if (postingsEnum == null) {
|
||||
postingsEnum = new MappingMultiDocsAndPositionsEnum();
|
||||
}
|
||||
|
@ -129,7 +128,41 @@ public abstract class TermsConsumer {
|
|||
while((term = termsEnum.next()) != null) {
|
||||
// We can pass null for liveDocs, because the
|
||||
// mapping enum will skip the non-live docs:
|
||||
postingsEnumIn = (MultiDocsAndPositionsEnum) termsEnum.docsAndPositions(null, postingsEnumIn);
|
||||
postingsEnumIn = (MultiDocsAndPositionsEnum) termsEnum.docsAndPositions(null, postingsEnumIn, false);
|
||||
assert postingsEnumIn != null;
|
||||
postingsEnum.reset(postingsEnumIn);
|
||||
// set PayloadProcessor
|
||||
if (mergeState.payloadProcessorProvider != null) {
|
||||
for (int i = 0; i < mergeState.readers.size(); i++) {
|
||||
if (mergeState.dirPayloadProcessor[i] != null) {
|
||||
mergeState.currentPayloadProcessor[i] = mergeState.dirPayloadProcessor[i].getProcessor(mergeState.fieldInfo.name, term);
|
||||
}
|
||||
}
|
||||
}
|
||||
final PostingsConsumer postingsConsumer = startTerm(term);
|
||||
final TermStats stats = postingsConsumer.merge(mergeState, postingsEnum, visitedDocs);
|
||||
if (stats.docFreq > 0) {
|
||||
finishTerm(term, stats);
|
||||
sumTotalTermFreq += stats.totalTermFreq;
|
||||
sumDFsinceLastAbortCheck += stats.docFreq;
|
||||
sumDocFreq += stats.docFreq;
|
||||
if (sumDFsinceLastAbortCheck > 60000) {
|
||||
mergeState.checkAbort.work(sumDFsinceLastAbortCheck/5.0);
|
||||
sumDFsinceLastAbortCheck = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
assert mergeState.fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
|
||||
if (postingsEnum == null) {
|
||||
postingsEnum = new MappingMultiDocsAndPositionsEnum();
|
||||
}
|
||||
postingsEnum.setMergeState(mergeState);
|
||||
MultiDocsAndPositionsEnum postingsEnumIn = null;
|
||||
while((term = termsEnum.next()) != null) {
|
||||
// We can pass null for liveDocs, because the
|
||||
// mapping enum will skip the non-live docs:
|
||||
postingsEnumIn = (MultiDocsAndPositionsEnum) termsEnum.docsAndPositions(null, postingsEnumIn, true);
|
||||
assert postingsEnumIn != null;
|
||||
postingsEnum.reset(postingsEnumIn);
|
||||
// set PayloadProcessor
|
||||
|
@ -154,7 +187,6 @@ public abstract class TermsConsumer {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
finish(sumTotalTermFreq, sumDocFreq, visitedDocs.cardinality());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -966,7 +966,12 @@ public class Lucene3xFields extends FieldsProducer {
|
|||
}
|
||||
|
||||
@Override
|
||||
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
|
||||
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
|
||||
if (needsOffsets) {
|
||||
// Pre-4.0 indices never have offsets:
|
||||
return null;
|
||||
}
|
||||
|
||||
PreDocsAndPositionsEnum docsPosEnum;
|
||||
if (fieldInfo.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
|
||||
return null;
|
||||
|
@ -1081,6 +1086,16 @@ public class Lucene3xFields extends FieldsProducer {
|
|||
return pos.nextPosition();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startOffset() throws IOException {
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() throws IOException {
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPayload() {
|
||||
assert docID != NO_MORE_DOCS;
|
||||
|
|
|
@ -215,7 +215,9 @@ public final class TermInfosReader {
|
|||
|
||||
TermInfo seekEnum(SegmentTermEnum enumerator, Term term, boolean useCache) throws IOException {
|
||||
if (useCache) {
|
||||
return seekEnum(enumerator, term, termsCache.get(new CloneableTerm(term)), useCache);
|
||||
return seekEnum(enumerator, term,
|
||||
termsCache.get(new CloneableTerm(term.deepCopyOf())),
|
||||
useCache);
|
||||
} else {
|
||||
return seekEnum(enumerator, term, null, useCache);
|
||||
}
|
||||
|
@ -247,7 +249,8 @@ public final class TermInfosReader {
|
|||
// of terms in order
|
||||
if (tiOrd == null) {
|
||||
if (useCache) {
|
||||
termsCache.put(new CloneableTerm(term), new TermInfoAndOrd(ti, enumerator.position));
|
||||
termsCache.put(new CloneableTerm(term.deepCopyOf()),
|
||||
new TermInfoAndOrd(ti, enumerator.position));
|
||||
}
|
||||
} else {
|
||||
assert sameTermInfo(ti, tiOrd, enumerator);
|
||||
|
@ -279,7 +282,8 @@ public final class TermInfosReader {
|
|||
ti = enumerator.termInfo;
|
||||
if (tiOrd == null) {
|
||||
if (useCache) {
|
||||
termsCache.put(new CloneableTerm(term), new TermInfoAndOrd(ti, enumerator.position));
|
||||
termsCache.put(new CloneableTerm(term.deepCopyOf()),
|
||||
new TermInfoAndOrd(ti, enumerator.position));
|
||||
}
|
||||
} else {
|
||||
assert sameTermInfo(ti, tiOrd, enumerator);
|
||||
|
|
|
@ -80,6 +80,8 @@ public class Lucene40FieldInfosReader extends FieldInfosReader {
|
|||
} else {
|
||||
throw new CorruptIndexException("Corrupt fieldinfos, OMIT_POSITIONS set but format=" + format + " (resource: " + input + ")");
|
||||
}
|
||||
} else if (format <= Lucene40FieldInfosWriter.FORMAT_FLEX && (bits & Lucene40FieldInfosWriter.STORE_OFFSETS_IN_POSTINGS) != 0) {
|
||||
indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
|
||||
} else {
|
||||
indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
|
||||
}
|
||||
|
|
|
@ -47,6 +47,7 @@ public class Lucene40FieldInfosWriter extends FieldInfosWriter {
|
|||
|
||||
static final byte IS_INDEXED = 0x1;
|
||||
static final byte STORE_TERMVECTOR = 0x2;
|
||||
static final byte STORE_OFFSETS_IN_POSTINGS = 0x4;
|
||||
static final byte OMIT_NORMS = 0x10;
|
||||
static final byte STORE_PAYLOADS = 0x20;
|
||||
static final byte OMIT_TERM_FREQ_AND_POSITIONS = 0x40;
|
||||
|
@ -68,6 +69,8 @@ public class Lucene40FieldInfosWriter extends FieldInfosWriter {
|
|||
if (fi.storePayloads) bits |= STORE_PAYLOADS;
|
||||
if (fi.indexOptions == IndexOptions.DOCS_ONLY) {
|
||||
bits |= OMIT_TERM_FREQ_AND_POSITIONS;
|
||||
} else if (fi.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) {
|
||||
bits |= STORE_OFFSETS_IN_POSTINGS;
|
||||
} else if (fi.indexOptions == IndexOptions.DOCS_AND_FREQS) {
|
||||
bits |= OMIT_POSITIONS;
|
||||
}
|
||||
|
|
|
@ -241,11 +241,15 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
|
||||
@Override
|
||||
public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState termState, Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
|
||||
if (fieldInfo.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
|
||||
public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState termState, Bits liveDocs,
|
||||
DocsAndPositionsEnum reuse, boolean needsOffsets)
|
||||
throws IOException {
|
||||
|
||||
if (needsOffsets) {
|
||||
// TODO: once we index offsets into postings fix this!
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
// TODO: refactor
|
||||
if (fieldInfo.storePayloads) {
|
||||
SegmentDocsAndPositionsAndPayloadsEnum docsEnum;
|
||||
|
@ -366,7 +370,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
|
||||
start = count; // buffer is consumed
|
||||
|
||||
return doc = skipTo(target, liveDocs);
|
||||
return doc = skipTo(target);
|
||||
}
|
||||
|
||||
private final int binarySearch(int hi, int low, int target, int[] docs) {
|
||||
|
@ -448,7 +452,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
|
||||
}
|
||||
|
||||
private final int skipTo(int target, Bits liveDocs) throws IOException {
|
||||
private final int skipTo(int target) throws IOException {
|
||||
if ((target - skipInterval) >= accum && limit >= skipMinimum) {
|
||||
|
||||
// There are enough docs in the posting to have
|
||||
|
@ -841,6 +845,16 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
return position;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startOffset() throws IOException {
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() throws IOException {
|
||||
return -1;
|
||||
}
|
||||
|
||||
/** Returns the payload at this position, or null if no
|
||||
* payload was indexed. */
|
||||
@Override
|
||||
|
@ -1074,6 +1088,16 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
return position;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startOffset() throws IOException {
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() throws IOException {
|
||||
return -1;
|
||||
}
|
||||
|
||||
/** Returns the payload at this position, or null if no
|
||||
* payload was indexed. */
|
||||
@Override
|
||||
|
|
|
@ -155,6 +155,10 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
|
|||
*/
|
||||
this.fieldInfo = fieldInfo;
|
||||
indexOptions = fieldInfo.indexOptions;
|
||||
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
|
||||
throw new IllegalArgumentException("this codec cannot index offsets");
|
||||
}
|
||||
|
||||
storePayloads = fieldInfo.storePayloads;
|
||||
//System.out.println(" set init blockFreqStart=" + freqStart);
|
||||
//System.out.println(" set init blockProxStart=" + proxStart);
|
||||
|
@ -197,11 +201,19 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
|
|||
|
||||
/** Add a new position & payload */
|
||||
@Override
|
||||
public void addPosition(int position, BytesRef payload) throws IOException {
|
||||
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
|
||||
//if (DEBUG) System.out.println("SPW: addPos pos=" + position + " payload=" + (payload == null ? "null" : (payload.length + " bytes")) + " proxFP=" + proxOut.getFilePointer());
|
||||
assert indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS: "invalid indexOptions: " + indexOptions;
|
||||
assert proxOut != null;
|
||||
|
||||
// TODO: when we add offsets... often
|
||||
// endOffset-startOffset will be constant or near
|
||||
// constant for all docs (eg if the term wasn't stemmed
|
||||
// then this will usually be the utf16 length of the
|
||||
// term); would be nice to write that length once up
|
||||
// front and then not encode endOffset for each
|
||||
// position..
|
||||
|
||||
final int delta = position - lastPosition;
|
||||
|
||||
assert delta >= 0: "position=" + position + " lastPosition=" + lastPosition; // not quite right (if pos=0 is repeated twice we don't catch it)
|
||||
|
|
|
@ -24,7 +24,6 @@ import java.util.HashMap;
|
|||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.codecs.TermVectorsReader;
|
||||
import org.apache.lucene.index.CorruptIndexException;
|
||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
|
@ -518,21 +517,20 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
|
|||
}
|
||||
|
||||
@Override
|
||||
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
|
||||
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
|
||||
if (needsOffsets && !storeOffsets) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (!storePositions && !storeOffsets) {
|
||||
return null;
|
||||
}
|
||||
|
||||
TVDocsAndPositionsEnum docsAndPositionsEnum;
|
||||
if (reuse != null) {
|
||||
if (reuse != null && reuse instanceof TVDocsAndPositionsEnum) {
|
||||
docsAndPositionsEnum = (TVDocsAndPositionsEnum) reuse;
|
||||
if (docsAndPositionsEnum.canReuse(storeOffsets)) {
|
||||
docsAndPositionsEnum = (TVDocsAndPositionsEnum) reuse;
|
||||
} else {
|
||||
docsAndPositionsEnum = new TVDocsAndPositionsEnum(storeOffsets);
|
||||
}
|
||||
} else {
|
||||
docsAndPositionsEnum = new TVDocsAndPositionsEnum(storeOffsets);
|
||||
docsAndPositionsEnum = new TVDocsAndPositionsEnum();
|
||||
}
|
||||
docsAndPositionsEnum.reset(liveDocs, positions, startOffsets, endOffsets);
|
||||
return docsAndPositionsEnum;
|
||||
|
@ -592,7 +590,6 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
|
|||
}
|
||||
|
||||
private static class TVDocsAndPositionsEnum extends DocsAndPositionsEnum {
|
||||
private final OffsetAttribute offsetAtt;
|
||||
private boolean didNext;
|
||||
private int doc = -1;
|
||||
private int nextPos;
|
||||
|
@ -601,18 +598,6 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
|
|||
private int[] startOffsets;
|
||||
private int[] endOffsets;
|
||||
|
||||
public TVDocsAndPositionsEnum(boolean storeOffsets) {
|
||||
if (storeOffsets) {
|
||||
offsetAtt = attributes().addAttribute(OffsetAttribute.class);
|
||||
} else {
|
||||
offsetAtt = null;
|
||||
}
|
||||
}
|
||||
|
||||
public boolean canReuse(boolean storeOffsets) {
|
||||
return storeOffsets == (offsetAtt != null);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int freq() {
|
||||
if (positions != null) {
|
||||
|
@ -651,7 +636,6 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
|
|||
this.liveDocs = liveDocs;
|
||||
this.positions = positions;
|
||||
this.startOffsets = startOffsets;
|
||||
assert (offsetAtt != null) == (startOffsets != null);
|
||||
this.endOffsets = endOffsets;
|
||||
this.doc = -1;
|
||||
didNext = false;
|
||||
|
@ -673,10 +657,6 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
|
|||
assert (positions != null && nextPos < positions.length) ||
|
||||
startOffsets != null && nextPos < startOffsets.length;
|
||||
|
||||
if (startOffsets != null) {
|
||||
offsetAtt.setOffset(startOffsets[nextPos],
|
||||
endOffsets[nextPos]);
|
||||
}
|
||||
if (positions != null) {
|
||||
return positions[nextPos++];
|
||||
} else {
|
||||
|
@ -684,6 +664,18 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
|
|||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startOffset() {
|
||||
assert startOffsets != null;
|
||||
return startOffsets[nextPos-1];
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() {
|
||||
assert endOffsets != null;
|
||||
return endOffsets[nextPos-1];
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -131,7 +131,7 @@ public class MemoryPostingsFormat extends PostingsFormat {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void addPosition(int pos, BytesRef payload) throws IOException {
|
||||
public void addPosition(int pos, BytesRef payload, int startOffset, int endOffset) throws IOException {
|
||||
assert payload == null || field.storePayloads;
|
||||
|
||||
if (VERBOSE) System.out.println(" addPos pos=" + pos + " payload=" + payload);
|
||||
|
@ -249,6 +249,9 @@ public class MemoryPostingsFormat extends PostingsFormat {
|
|||
return new FieldsConsumer() {
|
||||
@Override
|
||||
public TermsConsumer addField(FieldInfo field) {
|
||||
if (field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
|
||||
throw new IllegalArgumentException("this codec cannot index offsets");
|
||||
}
|
||||
if (VERBOSE) System.out.println("\naddField field=" + field.name);
|
||||
return new TermsWriter(out, field);
|
||||
}
|
||||
|
@ -328,7 +331,7 @@ public class MemoryPostingsFormat extends PostingsFormat {
|
|||
assert freq > 0;
|
||||
}
|
||||
|
||||
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
|
||||
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
|
||||
// Skip positions
|
||||
for(int posUpto=0;posUpto<freq;posUpto++) {
|
||||
if (!storePayloads) {
|
||||
|
@ -500,6 +503,16 @@ public class MemoryPostingsFormat extends PostingsFormat {
|
|||
return pos;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startOffset() {
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() {
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef getPayload() {
|
||||
payloadRetrieved = true;
|
||||
|
@ -618,8 +631,14 @@ public class MemoryPostingsFormat extends PostingsFormat {
|
|||
}
|
||||
|
||||
@Override
|
||||
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
|
||||
if (field.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
|
||||
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
|
||||
|
||||
if (needsOffsets) {
|
||||
// Not until we can index offsets...
|
||||
return null;
|
||||
}
|
||||
|
||||
if (field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
|
||||
return null;
|
||||
}
|
||||
decodeMetaData();
|
||||
|
|
|
@ -215,10 +215,8 @@ public class PulsingPostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
|
||||
@Override
|
||||
public DocsAndPositionsEnum docsAndPositions(FieldInfo field, BlockTermState _termState, Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
|
||||
if (field.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
|
||||
return null;
|
||||
}
|
||||
public DocsAndPositionsEnum docsAndPositions(FieldInfo field, BlockTermState _termState, Bits liveDocs, DocsAndPositionsEnum reuse,
|
||||
boolean needsOffsets) throws IOException {
|
||||
//System.out.println("D&P: field=" + field.name);
|
||||
|
||||
final PulsingTermState termState = (PulsingTermState) _termState;
|
||||
|
@ -245,11 +243,12 @@ public class PulsingPostingsReader extends PostingsReaderBase {
|
|||
return postings.reset(liveDocs, termState);
|
||||
} else {
|
||||
if (reuse instanceof PulsingDocsAndPositionsEnum) {
|
||||
DocsAndPositionsEnum wrapped = wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, liveDocs, (DocsAndPositionsEnum) getOther(reuse));
|
||||
DocsAndPositionsEnum wrapped = wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, liveDocs, (DocsAndPositionsEnum) getOther(reuse),
|
||||
needsOffsets);
|
||||
setOther(wrapped, reuse); // wrapped.other = reuse
|
||||
return wrapped;
|
||||
} else {
|
||||
return wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, liveDocs, reuse);
|
||||
return wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, liveDocs, reuse, needsOffsets);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -486,6 +485,16 @@ public class PulsingPostingsReader extends PostingsReaderBase {
|
|||
return position;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startOffset() {
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() {
|
||||
return -1;
|
||||
}
|
||||
|
||||
private void skipPositions() throws IOException {
|
||||
while(posPending != 0) {
|
||||
nextPosition();
|
||||
|
|
|
@ -115,6 +115,9 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
|
|||
@Override
|
||||
public void setField(FieldInfo fieldInfo) {
|
||||
this.indexOptions = fieldInfo.indexOptions;
|
||||
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
|
||||
throw new IllegalArgumentException("this codec cannot index offsets: " + indexOptions);
|
||||
}
|
||||
if (DEBUG) System.out.println("PW field=" + fieldInfo.name + " indexOptions=" + indexOptions);
|
||||
storePayloads = fieldInfo.storePayloads;
|
||||
wrappedPostingsWriter.setField(fieldInfo);
|
||||
|
@ -165,7 +168,7 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void addPosition(int position, BytesRef payload) throws IOException {
|
||||
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
|
||||
|
||||
if (DEBUG) System.out.println("PW pos=" + position + " payload=" + (payload == null ? "null" : payload.length + " bytes"));
|
||||
if (pendingCount == pending.length) {
|
||||
|
@ -175,7 +178,7 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
|
|||
if (pendingCount == -1) {
|
||||
// We've already seen too many docs for this term --
|
||||
// just forward to our fallback writer
|
||||
wrappedPostingsWriter.addPosition(position, payload);
|
||||
wrappedPostingsWriter.addPosition(position, payload, -1, -1);
|
||||
} else {
|
||||
// buffer up
|
||||
final Position pos = pending[pendingCount++];
|
||||
|
@ -360,7 +363,7 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
|
|||
wrappedPostingsWriter.startTerm();
|
||||
|
||||
// Flush all buffered docs
|
||||
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
|
||||
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
|
||||
Position doc = null;
|
||||
for(Position pos : pending) {
|
||||
if (doc == null) {
|
||||
|
@ -376,7 +379,7 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
|
|||
wrappedPostingsWriter.startDoc(doc.docID, doc.termFreq);
|
||||
}
|
||||
if (DEBUG) System.out.println("PW: wrapped.addPos pos=" + pos.pos);
|
||||
wrappedPostingsWriter.addPosition(pos.pos, pos.payload);
|
||||
wrappedPostingsWriter.addPosition(pos.pos, pos.payload, -1, -1);
|
||||
}
|
||||
//wrappedPostingsWriter.finishDoc();
|
||||
} else {
|
||||
|
|
|
@ -294,7 +294,18 @@ public class SepPostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
|
||||
@Override
|
||||
public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState _termState, Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
|
||||
public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState _termState, Bits liveDocs,
|
||||
DocsAndPositionsEnum reuse, boolean needsOffsets)
|
||||
throws IOException {
|
||||
|
||||
if (fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (needsOffsets) {
|
||||
return null;
|
||||
}
|
||||
|
||||
assert fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
|
||||
final SepTermState termState = (SepTermState) _termState;
|
||||
SepDocsAndPositionsEnum postingsEnum;
|
||||
|
@ -713,6 +724,16 @@ public class SepPostingsReader extends PostingsReaderBase {
|
|||
return position;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startOffset() {
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() {
|
||||
return -1;
|
||||
}
|
||||
|
||||
private BytesRef payload;
|
||||
|
||||
@Override
|
||||
|
|
|
@ -188,6 +188,9 @@ public final class SepPostingsWriter extends PostingsWriterBase {
|
|||
public void setField(FieldInfo fieldInfo) {
|
||||
this.fieldInfo = fieldInfo;
|
||||
this.indexOptions = fieldInfo.indexOptions;
|
||||
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
|
||||
throw new IllegalArgumentException("this codec cannot index offsets");
|
||||
}
|
||||
skipListWriter.setIndexOptions(indexOptions);
|
||||
storePayloads = indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS && fieldInfo.storePayloads;
|
||||
}
|
||||
|
@ -222,7 +225,7 @@ public final class SepPostingsWriter extends PostingsWriterBase {
|
|||
|
||||
/** Add a new position & payload */
|
||||
@Override
|
||||
public void addPosition(int position, BytesRef payload) throws IOException {
|
||||
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
|
||||
assert indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
|
||||
|
||||
final int delta = position - lastPosition;
|
||||
|
|
|
@ -103,7 +103,7 @@ public class SimpleTextFieldInfosReader extends FieldInfosReader {
|
|||
IndexOptions indexOptions = IndexOptions.valueOf(readString(INDEXOPTIONS.length, scratch));
|
||||
|
||||
hasVectors |= storeTermVector;
|
||||
hasProx |= isIndexed && indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
|
||||
hasProx |= isIndexed && indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||
hasFreq |= isIndexed && indexOptions != IndexOptions.DOCS_ONLY;
|
||||
|
||||
infos[i] = new FieldInfo(name, isIndexed, fieldNumber, storeTermVector,
|
||||
|
|
|
@ -62,7 +62,7 @@ public class SimpleTextFieldInfosWriter extends FieldInfosWriter {
|
|||
SimpleTextUtil.writeNewline(out);
|
||||
|
||||
for (FieldInfo fi : infos) {
|
||||
assert fi.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS || !fi.storePayloads;
|
||||
assert fi.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !fi.storePayloads;
|
||||
|
||||
SimpleTextUtil.write(out, NAME);
|
||||
SimpleTextUtil.write(out, fi.name, scratch);
|
||||
|
|
|
@ -50,13 +50,15 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
private final IndexInput in;
|
||||
private final FieldInfos fieldInfos;
|
||||
|
||||
final static BytesRef END = SimpleTextFieldsWriter.END;
|
||||
final static BytesRef FIELD = SimpleTextFieldsWriter.FIELD;
|
||||
final static BytesRef TERM = SimpleTextFieldsWriter.TERM;
|
||||
final static BytesRef DOC = SimpleTextFieldsWriter.DOC;
|
||||
final static BytesRef FREQ = SimpleTextFieldsWriter.FREQ;
|
||||
final static BytesRef POS = SimpleTextFieldsWriter.POS;
|
||||
final static BytesRef PAYLOAD = SimpleTextFieldsWriter.PAYLOAD;
|
||||
final static BytesRef END = SimpleTextFieldsWriter.END;
|
||||
final static BytesRef FIELD = SimpleTextFieldsWriter.FIELD;
|
||||
final static BytesRef TERM = SimpleTextFieldsWriter.TERM;
|
||||
final static BytesRef DOC = SimpleTextFieldsWriter.DOC;
|
||||
final static BytesRef FREQ = SimpleTextFieldsWriter.FREQ;
|
||||
final static BytesRef POS = SimpleTextFieldsWriter.POS;
|
||||
final static BytesRef START_OFFSET = SimpleTextFieldsWriter.START_OFFSET;
|
||||
final static BytesRef END_OFFSET = SimpleTextFieldsWriter.END_OFFSET;
|
||||
final static BytesRef PAYLOAD = SimpleTextFieldsWriter.PAYLOAD;
|
||||
|
||||
public SimpleTextFieldsReader(SegmentReadState state) throws IOException {
|
||||
in = state.dir.openInput(SimpleTextPostingsFormat.getPostingsFileName(state.segmentInfo.name, state.segmentSuffix), state.context);
|
||||
|
@ -204,8 +206,16 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
}
|
||||
|
||||
@Override
|
||||
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
|
||||
if (indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
|
||||
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
|
||||
|
||||
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
|
||||
// Positions were not indexed
|
||||
return null;
|
||||
}
|
||||
|
||||
if (needsOffsets &&
|
||||
indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) < 0) {
|
||||
// Offsets were not indexed
|
||||
return null;
|
||||
}
|
||||
|
||||
|
@ -215,7 +225,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
} else {
|
||||
docsAndPositionsEnum = new SimpleTextDocsAndPositionsEnum();
|
||||
}
|
||||
return docsAndPositionsEnum.reset(docsStart, liveDocs);
|
||||
return docsAndPositionsEnum.reset(docsStart, liveDocs, indexOptions);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -289,6 +299,10 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
termFreq = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
|
||||
} else if (StringHelper.startsWith(scratch, POS)) {
|
||||
// skip termFreq++;
|
||||
} else if (StringHelper.startsWith(scratch, START_OFFSET)) {
|
||||
// skip
|
||||
} else if (StringHelper.startsWith(scratch, END_OFFSET)) {
|
||||
// skip
|
||||
} else if (StringHelper.startsWith(scratch, PAYLOAD)) {
|
||||
// skip
|
||||
} else {
|
||||
|
@ -325,6 +339,10 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
private final CharsRef scratchUTF16_2 = new CharsRef(10);
|
||||
private BytesRef payload;
|
||||
private long nextDocStart;
|
||||
private boolean readOffsets;
|
||||
private boolean readPositions;
|
||||
private int startOffset = -1;
|
||||
private int endOffset = -1;
|
||||
|
||||
public SimpleTextDocsAndPositionsEnum() {
|
||||
this.inStart = SimpleTextFieldsReader.this.in;
|
||||
|
@ -335,10 +353,12 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
return in == inStart;
|
||||
}
|
||||
|
||||
public SimpleTextDocsAndPositionsEnum reset(long fp, Bits liveDocs) {
|
||||
public SimpleTextDocsAndPositionsEnum reset(long fp, Bits liveDocs, IndexOptions indexOptions) {
|
||||
this.liveDocs = liveDocs;
|
||||
nextDocStart = fp;
|
||||
docID = -1;
|
||||
readPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||
readOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@ -360,6 +380,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
while(true) {
|
||||
final long lineStart = in.getFilePointer();
|
||||
SimpleTextUtil.readLine(in, scratch);
|
||||
//System.out.println("NEXT DOC: " + scratch.utf8ToString());
|
||||
if (StringHelper.startsWith(scratch, DOC)) {
|
||||
if (!first && (liveDocs == null || liveDocs.get(docID))) {
|
||||
nextDocStart = lineStart;
|
||||
|
@ -376,6 +397,10 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
posStart = in.getFilePointer();
|
||||
} else if (StringHelper.startsWith(scratch, POS)) {
|
||||
// skip
|
||||
} else if (StringHelper.startsWith(scratch, START_OFFSET)) {
|
||||
// skip
|
||||
} else if (StringHelper.startsWith(scratch, END_OFFSET)) {
|
||||
// skip
|
||||
} else if (StringHelper.startsWith(scratch, PAYLOAD)) {
|
||||
// skip
|
||||
} else {
|
||||
|
@ -399,10 +424,27 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
|
||||
@Override
|
||||
public int nextPosition() throws IOException {
|
||||
SimpleTextUtil.readLine(in, scratch);
|
||||
assert StringHelper.startsWith(scratch, POS): "got line=" + scratch.utf8ToString();
|
||||
UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+POS.length, scratch.length-POS.length, scratchUTF16_2);
|
||||
final int pos = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
|
||||
final int pos;
|
||||
if (readPositions) {
|
||||
SimpleTextUtil.readLine(in, scratch);
|
||||
assert StringHelper.startsWith(scratch, POS): "got line=" + scratch.utf8ToString();
|
||||
UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+POS.length, scratch.length-POS.length, scratchUTF16_2);
|
||||
pos = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
|
||||
} else {
|
||||
pos = -1;
|
||||
}
|
||||
|
||||
if (readOffsets) {
|
||||
SimpleTextUtil.readLine(in, scratch);
|
||||
assert StringHelper.startsWith(scratch, START_OFFSET): "got line=" + scratch.utf8ToString();
|
||||
UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+START_OFFSET.length, scratch.length-START_OFFSET.length, scratchUTF16_2);
|
||||
startOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
|
||||
SimpleTextUtil.readLine(in, scratch);
|
||||
assert StringHelper.startsWith(scratch, END_OFFSET): "got line=" + scratch.utf8ToString();
|
||||
UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+END_OFFSET.length, scratch.length-END_OFFSET.length, scratchUTF16_2);
|
||||
endOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
|
||||
}
|
||||
|
||||
final long fp = in.getFilePointer();
|
||||
SimpleTextUtil.readLine(in, scratch);
|
||||
if (StringHelper.startsWith(scratch, PAYLOAD)) {
|
||||
|
@ -420,6 +462,16 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
return pos;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startOffset() throws IOException {
|
||||
return startOffset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() throws IOException {
|
||||
return endOffset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef getPayload() {
|
||||
// Some tests rely on only being able to retrieve the
|
||||
|
|
|
@ -35,13 +35,15 @@ class SimpleTextFieldsWriter extends FieldsConsumer {
|
|||
private final IndexOutput out;
|
||||
private final BytesRef scratch = new BytesRef(10);
|
||||
|
||||
final static BytesRef END = new BytesRef("END");
|
||||
final static BytesRef FIELD = new BytesRef("field ");
|
||||
final static BytesRef TERM = new BytesRef(" term ");
|
||||
final static BytesRef DOC = new BytesRef(" doc ");
|
||||
final static BytesRef FREQ = new BytesRef(" freq ");
|
||||
final static BytesRef POS = new BytesRef(" pos ");
|
||||
final static BytesRef PAYLOAD = new BytesRef(" payload ");
|
||||
final static BytesRef END = new BytesRef("END");
|
||||
final static BytesRef FIELD = new BytesRef("field ");
|
||||
final static BytesRef TERM = new BytesRef(" term ");
|
||||
final static BytesRef DOC = new BytesRef(" doc ");
|
||||
final static BytesRef FREQ = new BytesRef(" freq ");
|
||||
final static BytesRef POS = new BytesRef(" pos ");
|
||||
final static BytesRef START_OFFSET = new BytesRef(" startOffset ");
|
||||
final static BytesRef END_OFFSET = new BytesRef(" endOffset ");
|
||||
final static BytesRef PAYLOAD = new BytesRef(" payload ");
|
||||
|
||||
public SimpleTextFieldsWriter(SegmentWriteState state) throws IOException {
|
||||
final String fileName = SimpleTextPostingsFormat.getPostingsFileName(state.segmentName, state.segmentSuffix);
|
||||
|
@ -97,10 +99,19 @@ class SimpleTextFieldsWriter extends FieldsConsumer {
|
|||
private class SimpleTextPostingsWriter extends PostingsConsumer {
|
||||
private BytesRef term;
|
||||
private boolean wroteTerm;
|
||||
private IndexOptions indexOptions;
|
||||
private final IndexOptions indexOptions;
|
||||
private final boolean writePositions;
|
||||
private final boolean writeOffsets;
|
||||
|
||||
// for assert:
|
||||
private int lastEndOffset = -1;
|
||||
|
||||
public SimpleTextPostingsWriter(FieldInfo field) {
|
||||
this.indexOptions = field.indexOptions;
|
||||
writePositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||
writeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||
//System.out.println("writeOffsets=" + writeOffsets);
|
||||
//System.out.println("writePos=" + writePositions);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -121,10 +132,10 @@ class SimpleTextFieldsWriter extends FieldsConsumer {
|
|||
write(Integer.toString(termDocFreq));
|
||||
newline();
|
||||
}
|
||||
|
||||
lastEndOffset = -1;
|
||||
}
|
||||
|
||||
|
||||
|
||||
public PostingsConsumer reset(BytesRef term) {
|
||||
this.term = term;
|
||||
wroteTerm = false;
|
||||
|
@ -132,10 +143,25 @@ class SimpleTextFieldsWriter extends FieldsConsumer {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void addPosition(int position, BytesRef payload) throws IOException {
|
||||
write(POS);
|
||||
write(Integer.toString(position));
|
||||
newline();
|
||||
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
|
||||
if (writePositions) {
|
||||
write(POS);
|
||||
write(Integer.toString(position));
|
||||
newline();
|
||||
}
|
||||
|
||||
if (writeOffsets) {
|
||||
assert endOffset >= startOffset;
|
||||
assert startOffset >= lastEndOffset: "startOffset=" + startOffset + " lastEndOffset=" + lastEndOffset;
|
||||
lastEndOffset = endOffset;
|
||||
write(START_OFFSET);
|
||||
write(Integer.toString(startOffset));
|
||||
newline();
|
||||
write(END_OFFSET);
|
||||
write(Integer.toString(endOffset));
|
||||
newline();
|
||||
}
|
||||
|
||||
if (payload != null && payload.length > 0) {
|
||||
assert payload.length != 0;
|
||||
write(PAYLOAD);
|
||||
|
|
|
@ -38,7 +38,7 @@ public class SimpleTextTermVectorsFormat extends TermVectorsFormat {
|
|||
|
||||
@Override
|
||||
public TermVectorsReader vectorsReader(Directory directory, SegmentInfo segmentInfo, FieldInfos fieldInfos, IOContext context) throws IOException {
|
||||
return new SimpleTextTermVectorsReader(directory, segmentInfo, fieldInfos, context);
|
||||
return new SimpleTextTermVectorsReader(directory, segmentInfo, context);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -26,11 +26,9 @@ import java.util.Set;
|
|||
import java.util.SortedMap;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.codecs.TermVectorsReader;
|
||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.FieldInfos;
|
||||
import org.apache.lucene.index.Fields;
|
||||
import org.apache.lucene.index.FieldsEnum;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
|
@ -63,7 +61,7 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
|
|||
private BytesRef scratch = new BytesRef();
|
||||
private CharsRef scratchUTF16 = new CharsRef();
|
||||
|
||||
public SimpleTextTermVectorsReader(Directory directory, SegmentInfo si, FieldInfos fieldInfos, IOContext context) throws IOException {
|
||||
public SimpleTextTermVectorsReader(Directory directory, SegmentInfo si, IOContext context) throws IOException {
|
||||
boolean success = false;
|
||||
try {
|
||||
in = directory.openInput(IndexFileNames.segmentFileName(si.name, "", VECTORS_EXTENSION), context);
|
||||
|
@ -114,7 +112,8 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
|
|||
for (int i = 0; i < numFields; i++) {
|
||||
readLine();
|
||||
assert StringHelper.startsWith(scratch, FIELD);
|
||||
int fieldNumber = parseIntAt(FIELD.length);
|
||||
// skip fieldNumber:
|
||||
parseIntAt(FIELD.length);
|
||||
|
||||
readLine();
|
||||
assert StringHelper.startsWith(scratch, FIELDNAME);
|
||||
|
@ -373,13 +372,16 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
    }

    @Override
    public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
    public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
      SimpleTVPostings postings = current.getValue();
      if (postings.positions == null && postings.startOffsets == null) {
        return null;
      }
      if (needsOffsets && (postings.startOffsets == null || postings.endOffsets == null)) {
        return null;
      }
      // TODO: reuse
      SimpleTVDocsAndPositionsEnum e = new SimpleTVDocsAndPositionsEnum(postings.startOffsets != null);
      SimpleTVDocsAndPositionsEnum e = new SimpleTVDocsAndPositionsEnum();
      e.reset(liveDocs, postings.positions, postings.startOffsets, postings.endOffsets);
      return e;
    }
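The needsOffsets contract above means a caller that merely prefers offsets has to probe and fall back. A minimal caller-side sketch of that pattern, mirroring what CheckIndex does further down in this patch (the termsEnum and liveDocs variables are assumed to be in scope):

  DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(liveDocs, null, true);
  final boolean hasOffsets = dpEnum != null;
  if (dpEnum == null) {
    // offsets were not indexed for this field; retry asking only for positions
    dpEnum = termsEnum.docsAndPositions(liveDocs, null, false);
  }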
@ -436,7 +438,6 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
|
|||
}
|
||||
|
||||
private static class SimpleTVDocsAndPositionsEnum extends DocsAndPositionsEnum {
|
||||
private final OffsetAttribute offsetAtt;
|
||||
private boolean didNext;
|
||||
private int doc = -1;
|
||||
private int nextPos;
|
||||
|
@ -445,18 +446,6 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
|
|||
private int[] startOffsets;
|
||||
private int[] endOffsets;
|
||||
|
||||
public SimpleTVDocsAndPositionsEnum(boolean storeOffsets) {
|
||||
if (storeOffsets) {
|
||||
offsetAtt = attributes().addAttribute(OffsetAttribute.class);
|
||||
} else {
|
||||
offsetAtt = null;
|
||||
}
|
||||
}
|
||||
|
||||
public boolean canReuse(boolean storeOffsets) {
|
||||
return storeOffsets == (offsetAtt != null);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int freq() {
|
||||
if (positions != null) {
|
||||
|
@ -495,7 +484,6 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
|
|||
this.liveDocs = liveDocs;
|
||||
this.positions = positions;
|
||||
this.startOffsets = startOffsets;
|
||||
assert (offsetAtt != null) == (startOffsets != null);
|
||||
this.endOffsets = endOffsets;
|
||||
this.doc = -1;
|
||||
didNext = false;
|
||||
|
@ -516,11 +504,6 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
|
|||
public int nextPosition() {
|
||||
assert (positions != null && nextPos < positions.length) ||
|
||||
startOffsets != null && nextPos < startOffsets.length;
|
||||
|
||||
if (startOffsets != null) {
|
||||
offsetAtt.setOffset(startOffsets[nextPos],
|
||||
endOffsets[nextPos]);
|
||||
}
|
||||
if (positions != null) {
|
||||
return positions[nextPos++];
|
||||
} else {
|
||||
|
@ -528,5 +511,15 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
        return -1;
      }
    }

    @Override
    public int startOffset() {
      return startOffsets[nextPos-1];
    }

    @Override
    public int endOffset() {
      return endOffsets[nextPos-1];
    }
  }
}
@ -289,6 +289,20 @@ public class CheckIndex {
    infoStream = null;
  }

  private boolean crossCheckTermVectors;

  /** If true, term vectors are compared against postings to
   *  make sure they are the same.  This will likely
   *  drastically increase time it takes to run CheckIndex! */
  public void setCrossCheckTermVectors(boolean v) {
    crossCheckTermVectors = v;
  }

  /** See {@link #setCrossCheckTermVectors}. */
  public boolean getCrossCheckTermVectors() {
    return crossCheckTermVectors;
  }

  private boolean verbose;

  /** Set infoStream where messages should go.  If null, no
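A short sketch of turning the new check on programmatically; the index path is illustrative, imports are omitted, and the same switch is exposed on the command line as -crossCheckTermVectors (see the usage string later in this patch):

  Directory dir = FSDirectory.open(new File("/path/to/index"));  // illustrative path
  CheckIndex checker = new CheckIndex(dir);
  checker.setCrossCheckTermVectors(true);       // compare term vectors against postings
  checker.setInfoStream(System.out, false);
  CheckIndex.Status status = checker.checkIndex();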
@ -563,7 +577,7 @@ public class CheckIndex {
|
|||
segInfoStat.fieldNormStatus = testFieldNorms(fieldInfos, reader);
|
||||
|
||||
// Test the Term Index
|
||||
segInfoStat.termIndexStatus = testTermIndex(reader);
|
||||
segInfoStat.termIndexStatus = testPostings(reader);
|
||||
|
||||
// Test Stored Fields
|
||||
segInfoStat.storedFieldStatus = testStoredFields(info, reader, nf);
|
||||
|
@ -678,7 +692,11 @@ public class CheckIndex {
|
|||
/**
|
||||
* Test the term index.
|
||||
*/
|
||||
private Status.TermIndexStatus testTermIndex(SegmentReader reader) {
|
||||
private Status.TermIndexStatus testPostings(SegmentReader reader) {
|
||||
|
||||
// TODO: we should go and verify term vectors match, if
|
||||
// crossCheckTermVectors is on...
|
||||
|
||||
final Status.TermIndexStatus status = new Status.TermIndexStatus();
|
||||
|
||||
final int maxDoc = reader.maxDoc();
|
||||
|
@ -760,7 +778,7 @@ public class CheckIndex {
|
|||
|
||||
docs = termsEnum.docs(liveDocs, docs, false);
|
||||
docsAndFreqs = termsEnum.docs(liveDocs, docsAndFreqs, true);
|
||||
postings = termsEnum.docsAndPositions(liveDocs, postings);
|
||||
postings = termsEnum.docsAndPositions(liveDocs, postings, false);
|
||||
|
||||
if (hasOrd) {
|
||||
long ord = -1;
|
||||
|
@ -890,7 +908,7 @@ public class CheckIndex {
|
|||
if (hasPositions) {
|
||||
for(int idx=0;idx<7;idx++) {
|
||||
final int skipDocID = (int) (((idx+1)*(long) maxDoc)/8);
|
||||
postings = termsEnum.docsAndPositions(liveDocs, postings);
|
||||
postings = termsEnum.docsAndPositions(liveDocs, postings, false);
|
||||
final int docID = postings.advance(skipDocID);
|
||||
if (docID == DocsEnum.NO_MORE_DOCS) {
|
||||
break;
|
||||
|
@ -1256,7 +1274,10 @@ public class CheckIndex {
|
|||
private Status.TermVectorStatus testTermVectors(SegmentInfo info, SegmentReader reader, NumberFormat format) {
|
||||
final Status.TermVectorStatus status = new Status.TermVectorStatus();
|
||||
|
||||
TermsEnum termsEnum = null;
|
||||
// TODO: in theory we could test that term vectors have
|
||||
// same terms/pos/offsets as the postings, but it'd be
|
||||
// very slow...
|
||||
|
||||
try {
|
||||
if (infoStream != null) {
|
||||
infoStream.print(" test: term vectors........");
|
||||
|
@ -1264,9 +1285,25 @@ public class CheckIndex {
|
|||
|
||||
// TODO: maybe we can factor out testTermIndex and reuse here?
|
||||
DocsEnum docs = null;
|
||||
DocsEnum docsAndFreqs = null;
|
||||
DocsAndPositionsEnum postings = null;
|
||||
|
||||
// Only used if crossCheckTermVectors is true:
|
||||
DocsEnum postingsDocs = null;
|
||||
DocsAndPositionsEnum postingsPostings = null;
|
||||
|
||||
final Bits liveDocs = reader.getLiveDocs();
|
||||
|
||||
final Fields postingsFields;
|
||||
// TODO: testTermsIndex
|
||||
if (crossCheckTermVectors) {
|
||||
postingsFields = reader.fields();
|
||||
} else {
|
||||
postingsFields = null;
|
||||
}
|
||||
|
||||
TermsEnum termsEnum = null;
|
||||
TermsEnum postingsTermsEnum = null;
|
||||
|
||||
for (int j = 0; j < info.docCount; ++j) {
|
||||
if (liveDocs == null || liveDocs.get(j)) {
|
||||
status.docCount++;
|
||||
|
@ -1290,6 +1327,16 @@ public class CheckIndex {
|
|||
|
||||
Terms terms = tfv.terms(field);
|
||||
termsEnum = terms.iterator(termsEnum);
|
||||
|
||||
if (crossCheckTermVectors) {
|
||||
Terms postingsTerms = postingsFields.terms(field);
|
||||
if (postingsTerms == null) {
|
||||
throw new RuntimeException("vector field=" + field + " does not exist in postings; doc=" + j);
|
||||
}
|
||||
postingsTermsEnum = postingsTerms.iterator(postingsTermsEnum);
|
||||
} else {
|
||||
postingsTermsEnum = null;
|
||||
}
|
||||
|
||||
long tfvComputedTermCountForField = 0;
|
||||
long tfvComputedSumTotalTermFreq = 0;
|
||||
|
@ -1308,52 +1355,171 @@ public class CheckIndex {
|
|||
throw new RuntimeException("totalTermFreq: " + totalTermFreq + " is out of bounds");
|
||||
}
|
||||
|
||||
postings = termsEnum.docsAndPositions(null, postings);
|
||||
final boolean hasPositions;
|
||||
final boolean hasOffsets;
|
||||
final boolean hasFreqs;
|
||||
|
||||
// TODO: really we need a reflection/query
|
||||
// API so we can just ask what was indexed
|
||||
// instead of "probing"...
|
||||
|
||||
// Try offsets:
|
||||
postings = termsEnum.docsAndPositions(null, postings, true);
|
||||
if (postings == null) {
|
||||
docsAndFreqs = termsEnum.docs(null, docsAndFreqs, true);
|
||||
if (docsAndFreqs == null) {
|
||||
docs = termsEnum.docs(null, docs, false);
|
||||
hasOffsets = false;
|
||||
// Try only positions:
|
||||
postings = termsEnum.docsAndPositions(null, postings, false);
|
||||
if (postings == null) {
|
||||
hasPositions = false;
|
||||
// Try docIDs & freqs:
|
||||
docs = termsEnum.docs(null, docs, true);
|
||||
if (docs == null) {
|
||||
// OK, only docIDs:
|
||||
hasFreqs = false;
|
||||
docs = termsEnum.docs(null, docs, false);
|
||||
} else {
|
||||
hasFreqs = true;
|
||||
}
|
||||
} else {
|
||||
docs = docsAndFreqs;
|
||||
hasPositions = true;
|
||||
hasFreqs = true;
|
||||
}
|
||||
} else {
|
||||
docs = docsAndFreqs = postings;
|
||||
hasOffsets = true;
|
||||
// NOTE: may be a lie... but we accept -1 below
|
||||
hasPositions = true;
|
||||
hasFreqs = true;
|
||||
}
|
||||
|
||||
final int doc = docs.nextDoc();
|
||||
final DocsEnum docs2;
|
||||
if (hasPositions || hasOffsets) {
|
||||
assert postings != null;
|
||||
docs2 = postings;
|
||||
} else {
|
||||
assert docs != null;
|
||||
docs2 = docs;
|
||||
}
|
||||
|
||||
final DocsEnum postingsDocs2;
|
||||
final boolean postingsHasFreq;
|
||||
if (crossCheckTermVectors) {
|
||||
if (!postingsTermsEnum.seekExact(term, true)) {
|
||||
throw new RuntimeException("vector term=" + term + " field=" + field + " does not exist in postings; doc=" + j);
|
||||
}
|
||||
postingsPostings = postingsTermsEnum.docsAndPositions(null, postingsPostings, true);
|
||||
if (postingsPostings == null) {
|
||||
// Term vectors were indexed w/ offsets but postings were not
|
||||
postingsPostings = postingsTermsEnum.docsAndPositions(null, postingsPostings, false);
|
||||
if (postingsPostings == null) {
|
||||
postingsDocs = postingsTermsEnum.docs(null, postingsDocs, true);
|
||||
if (postingsDocs == null) {
|
||||
postingsHasFreq = false;
|
||||
postingsDocs = postingsTermsEnum.docs(null, postingsDocs, false);
|
||||
if (postingsDocs == null) {
|
||||
throw new RuntimeException("vector term=" + term + " field=" + field + " does not exist in postings; doc=" + j);
|
||||
}
|
||||
} else {
|
||||
postingsHasFreq = true;
|
||||
}
|
||||
} else {
|
||||
postingsHasFreq = true;
|
||||
}
|
||||
} else {
|
||||
postingsHasFreq = true;
|
||||
}
|
||||
|
||||
if (postingsPostings != null) {
|
||||
postingsDocs2 = postingsPostings;
|
||||
} else {
|
||||
postingsDocs2 = postingsDocs;
|
||||
}
|
||||
|
||||
final int advanceDoc = postingsDocs2.advance(j);
|
||||
if (advanceDoc != j) {
|
||||
throw new RuntimeException("vector term=" + term + " field=" + field + ": doc=" + j + " was not found in postings (got: " + advanceDoc + ")");
|
||||
}
|
||||
} else {
|
||||
postingsDocs2 = null;
|
||||
postingsHasFreq = false;
|
||||
}
|
||||
|
||||
final int doc = docs2.nextDoc();
|
||||
|
||||
if (doc != 0) {
|
||||
throw new RuntimeException("vector for doc " + j + " didn't return docID=0: got docID=" + doc);
|
||||
}
|
||||
|
||||
if (docsAndFreqs != null) {
|
||||
final int tf = docsAndFreqs.freq();
|
||||
if (hasFreqs) {
|
||||
final int tf = docs2.freq();
|
||||
if (tf <= 0) {
|
||||
throw new RuntimeException("vector freq " + tf + " is out of bounds");
|
||||
}
|
||||
if (totalTermFreq != -1 && totalTermFreq != tf) {
|
||||
throw new RuntimeException("vector totalTermFreq " + totalTermFreq + " != tf " + tf);
|
||||
}
|
||||
if (crossCheckTermVectors && postingsHasFreq) {
|
||||
if (postingsDocs2.freq() != tf) {
|
||||
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": freq=" + tf + " differs from postings freq=" + postingsDocs2.freq());
|
||||
}
|
||||
}
|
||||
tfvComputedSumTotalTermFreq += tf;
|
||||
|
||||
if (postings != null) {
|
||||
if (hasPositions || hasOffsets) {
|
||||
int lastPosition = -1;
|
||||
//int lastStartOffset = -1;
|
||||
for (int i = 0; i < tf; i++) {
|
||||
int pos = postings.nextPosition();
|
||||
if (pos != -1 && pos < 0) {
|
||||
throw new RuntimeException("vector position " + pos + " is out of bounds");
|
||||
}
|
||||
if (hasPositions) {
|
||||
if (pos != -1 && pos < 0) {
|
||||
throw new RuntimeException("vector position " + pos + " is out of bounds");
|
||||
}
|
||||
if (pos < lastPosition) {
|
||||
throw new RuntimeException("vector position " + pos + " < lastPos " + lastPosition);
|
||||
}
|
||||
|
||||
if (pos < lastPosition) {
|
||||
throw new RuntimeException("vector position " + pos + " < lastPos " + lastPosition);
|
||||
lastPosition = pos;
|
||||
}
|
||||
|
||||
if (crossCheckTermVectors && postingsPostings != null) {
|
||||
int postingsPos = postingsPostings.nextPosition();
|
||||
if (pos != -1 && postingsPos != -1 && pos != postingsPos) {
|
||||
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": pos=" + pos + " differs from postings pos=" + postingsPos);
|
||||
}
|
||||
}
|
||||
|
||||
if (hasOffsets) {
|
||||
// Call the methods to at least make
|
||||
// sure they don't throw exc:
|
||||
final int startOffset = postings.startOffset();
|
||||
final int endOffset = postings.endOffset();
|
||||
// TODO: these are too anal...?
|
||||
/*
|
||||
if (endOffset < startOffset) {
|
||||
throw new RuntimeException("vector startOffset=" + startOffset + " is > endOffset=" + endOffset);
|
||||
}
|
||||
if (startOffset < lastStartOffset) {
|
||||
throw new RuntimeException("vector startOffset=" + startOffset + " is < prior startOffset=" + lastStartOffset);
|
||||
}
|
||||
lastStartOffset = startOffset;
|
||||
*/
|
||||
|
||||
if (crossCheckTermVectors && postingsPostings != null) {
|
||||
final int postingsStartOffset = postingsPostings.startOffset();
|
||||
|
||||
final int postingsEndOffset = postingsPostings.endOffset();
|
||||
if (startOffset != -1 && postingsStartOffset != -1 && startOffset != postingsStartOffset) {
|
||||
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": startOffset=" + startOffset + " differs from postings startOffset=" + postingsStartOffset);
|
||||
}
|
||||
if (endOffset != -1 && postingsEndOffset != -1 && endOffset != postingsEndOffset) {
|
||||
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": endOffset=" + endOffset + " differs from postings endOffset=" + postingsEndOffset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
lastPosition = pos;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
if (docs2.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
throw new RuntimeException("vector for doc " + j + " references multiple documents!");
|
||||
}
|
||||
}
|
||||
|
@ -1474,6 +1640,7 @@ public class CheckIndex {
|
|||
public static void main(String[] args) throws IOException, InterruptedException {
|
||||
|
||||
boolean doFix = false;
|
||||
boolean doCrossCheckTermVectors = false;
|
||||
Codec codec = Codec.getDefault(); // only used when fixing
|
||||
boolean verbose = false;
|
||||
List<String> onlySegments = new ArrayList<String>();
|
||||
|
@ -1484,6 +1651,8 @@ public class CheckIndex {
|
|||
String arg = args[i];
|
||||
if ("-fix".equals(arg)) {
|
||||
doFix = true;
|
||||
} else if ("-crossCheckTermVectors".equals(arg)) {
|
||||
doCrossCheckTermVectors = true;
|
||||
} else if ("-codec".equals(arg)) {
|
||||
if (i == args.length-1) {
|
||||
System.out.println("ERROR: missing name for -codec option");
|
||||
|
@ -1519,9 +1688,10 @@ public class CheckIndex {
|
|||
|
||||
if (indexPath == null) {
|
||||
System.out.println("\nERROR: index path not specified");
|
||||
System.out.println("\nUsage: java org.apache.lucene.index.CheckIndex pathToIndex [-fix] [-segment X] [-segment Y] [-dir-impl X]\n" +
|
||||
System.out.println("\nUsage: java org.apache.lucene.index.CheckIndex pathToIndex [-fix] [-crossCheckTermVectors] [-segment X] [-segment Y] [-dir-impl X]\n" +
|
||||
"\n" +
|
||||
" -fix: actually write a new segments_N file, removing any problematic segments\n" +
|
||||
" -crossCheckTermVectors: verifies that term vectors match postings; THIS IS VERY SLOW!\n" +
|
||||
" -codec X: when fixing, codec to write the new segments_N file with\n" +
|
||||
" -verbose: print additional details\n" +
|
||||
" -segment X: only check the specified segments. This can be specified multiple\n" +
|
||||
|
@ -1570,6 +1740,7 @@ public class CheckIndex {
|
|||
}
|
||||
|
||||
CheckIndex checker = new CheckIndex(dir);
|
||||
checker.setCrossCheckTermVectors(doCrossCheckTermVectors);
|
||||
checker.setInfoStream(System.out, verbose);
|
||||
|
||||
Status result = checker.checkIndex(onlySegments);
|
||||
|
|
|
@ -73,8 +73,9 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
|
|||
// tokenized.
|
||||
if (field.fieldType().indexed() && doInvert) {
|
||||
|
||||
if (i > 0)
|
||||
if (i > 0) {
|
||||
fieldState.position += docState.analyzer == null ? 0 : docState.analyzer.getPositionIncrementGap(fieldInfo.name);
|
||||
}
|
||||
|
||||
final TokenStream stream = field.tokenStream(docState.analyzer);
|
||||
// reset the TokenStream to the first token
|
||||
|
|
|
@ -655,8 +655,8 @@ public class DocTermOrds {
|
|||
}
|
||||
|
||||
@Override
|
||||
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
|
||||
return termsEnum.docsAndPositions(liveDocs, reuse);
|
||||
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
|
||||
return termsEnum.docsAndPositions(liveDocs, reuse, needsOffsets);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -26,9 +26,20 @@ public abstract class DocsAndPositionsEnum extends DocsEnum {

  /** Returns the next position.  You should only call this
   *  up to {@link DocsEnum#freq()} times else
   *  the behavior is not defined. */
   *  the behavior is not defined.  If positions were not
   *  indexed this will return -1; this only happens if
   *  offsets were indexed and you passed needsOffsets=true
   *  when pulling the enum. */
  public abstract int nextPosition() throws IOException;

  /** Returns start offset for the current position, or -1
   *  if offsets were not indexed. */
  public abstract int startOffset() throws IOException;

  /** Returns end offset for the current position, or -1 if
   *  offsets were not indexed. */
  public abstract int endOffset() throws IOException;

  /** Returns the payload at this position, or null if no
   *  payload was indexed.  Only call this once per
   *  position. */
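To make the new contract concrete, here is a hedged consumption sketch; the reader, field name and term are placeholders and imports are omitted, none of it is part of this change:

  // Walk one term's postings, reading offsets where they were indexed.
  DocsAndPositionsEnum dpe = MultiFields.getTermPositionsEnum(
      reader, MultiFields.getLiveDocs(reader), "body", new BytesRef("lucene"), true);
  if (dpe != null) {                                 // null => offsets were not indexed
    while (dpe.nextDoc() != DocsEnum.NO_MORE_DOCS) {
      final int freq = dpe.freq();
      for (int i = 0; i < freq; i++) {
        final int pos = dpe.nextPosition();          // -1 if only offsets were indexed
        final int start = dpe.startOffset();         // -1 if offsets were not indexed
        final int end = dpe.endOffset();
      }
    }
  }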
@ -38,13 +38,18 @@ public final class FieldInfo {
   * @lucene.experimental
   */
  public static enum IndexOptions {
    // NOTE: order is important here; FieldInfo uses this
    // order to merge two conflicting IndexOptions (always
    // "downgrades" by picking the lowest).
    /** only documents are indexed: term frequencies and positions are omitted */
    // TODO: maybe rename to just DOCS?
    DOCS_ONLY,
    /** only documents and term frequencies are indexed: positions are omitted */
    DOCS_AND_FREQS,
    /** full postings: documents, frequencies, and positions */
    DOCS_AND_FREQS_AND_POSITIONS
    /** documents, frequencies and positions */
    DOCS_AND_FREQS_AND_POSITIONS,
    /** documents, frequencies, positions and offsets */
    DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS,
  };

  /**
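Because the constants are declared in increasing order of what they index, the writers in this patch derive capability flags with compareTo rather than equality checks. A minimal sketch of that pattern, with illustrative local names:

  IndexOptions opts = fieldInfo.indexOptions;
  boolean indexesFreqs     = opts.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
  boolean indexesPositions = opts.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
  boolean indexesOffsets   = opts.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;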
@ -67,7 +72,7 @@ public final class FieldInfo {
      this.omitNorms = false;
      this.indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
    }
    assert indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS || !storePayloads;
    assert indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !storePayloads;
  }

  @Override

@ -95,10 +100,13 @@ public final class FieldInfo {
      if (this.indexOptions != indexOptions) {
        // downgrade
        this.indexOptions = this.indexOptions.compareTo(indexOptions) < 0 ? this.indexOptions : indexOptions;
        this.storePayloads = false;
        if (this.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
          // cannot store payloads if we don't store positions:
          this.storePayloads = false;
        }
      }
    }
    assert this.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS || !this.storePayloads;
    assert this.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !this.storePayloads;
  }

  void setDocValuesType(DocValues.Type v) {
@ -185,7 +185,7 @@ public final class FieldInfos implements Iterable<FieldInfo> {
|
|||
}
|
||||
// mutable FIs must check!
|
||||
for (FieldInfo fi : this) {
|
||||
if (fi.isIndexed && fi.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
|
||||
if (fi.isIndexed && fi.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -430,7 +430,7 @@ public final class FieldInfos implements Iterable<FieldInfo> {
|
|||
FieldInfo clone = (FieldInfo) (fieldInfo).clone();
|
||||
roFis.putInternal(clone);
|
||||
roFis.hasVectors |= clone.storeTermVector;
|
||||
roFis.hasProx |= clone.isIndexed && clone.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
|
||||
roFis.hasProx |= clone.isIndexed && clone.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||
roFis.hasFreq |= clone.isIndexed && clone.indexOptions != IndexOptions.DOCS_ONLY;
|
||||
}
|
||||
return roFis;
|
||||
|
|
|
@ -176,8 +176,8 @@ public class FilterIndexReader extends IndexReader {
|
|||
}
|
||||
|
||||
@Override
|
||||
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
|
||||
return in.docsAndPositions(liveDocs, reuse);
|
||||
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
|
||||
return in.docsAndPositions(liveDocs, reuse, needsOffsets);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -258,6 +258,16 @@ public class FilterIndexReader extends IndexReader {
|
|||
return in.nextPosition();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startOffset() throws IOException {
|
||||
return in.startOffset();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() throws IOException {
|
||||
return in.endOffset();
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef getPayload() throws IOException {
|
||||
return in.getPayload();
|
||||
|
|
|
@ -171,8 +171,8 @@ public abstract class FilteredTermsEnum extends TermsEnum {
|
|||
}
|
||||
|
||||
@Override
|
||||
public DocsAndPositionsEnum docsAndPositions(Bits bits, DocsAndPositionsEnum reuse) throws IOException {
|
||||
return tenum.docsAndPositions(bits, reuse);
|
||||
public DocsAndPositionsEnum docsAndPositions(Bits bits, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
|
||||
return tenum.docsAndPositions(bits, reuse, needsOffsets);
|
||||
}
|
||||
|
||||
/** This enum does not support seeking!
|
||||
|
|
|
@ -83,7 +83,7 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
|
|||
|
||||
// Aggregate the storePayload as seen by the same
|
||||
// field across multiple threads
|
||||
if (fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
|
||||
if (fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
|
||||
fieldInfo.storePayloads |= fieldWriter.hasPayloads;
|
||||
}
|
||||
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.io.IOException;
|
|||
import java.util.Comparator;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.codecs.FieldsConsumer;
|
||||
import org.apache.lucene.codecs.PostingsConsumer;
|
||||
|
@ -43,7 +44,11 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
|||
final DocumentsWriterPerThread.DocState docState;
|
||||
final FieldInvertState fieldState;
|
||||
IndexOptions indexOptions;
|
||||
private boolean writeFreq;
|
||||
private boolean writeProx;
|
||||
private boolean writeOffsets;
|
||||
PayloadAttribute payloadAttribute;
|
||||
OffsetAttribute offsetAttribute;
|
||||
|
||||
public FreqProxTermsWriterPerField(TermsHashPerField termsHashPerField, FreqProxTermsWriter parent, FieldInfo fieldInfo) {
|
||||
this.termsHashPerField = termsHashPerField;
|
||||
|
@ -51,15 +56,16 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
|||
this.fieldInfo = fieldInfo;
|
||||
docState = termsHashPerField.docState;
|
||||
fieldState = termsHashPerField.fieldState;
|
||||
indexOptions = fieldInfo.indexOptions;
|
||||
setIndexOptions(fieldInfo.indexOptions);
|
||||
}
|
||||
|
||||
@Override
|
||||
int getStreamCount() {
|
||||
if (fieldInfo.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
|
||||
if (!writeProx) {
|
||||
return 1;
|
||||
else
|
||||
} else {
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -74,13 +80,21 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
|||
return fieldInfo.name.compareTo(other.fieldInfo.name);
|
||||
}
|
||||
|
||||
// Called after flush
|
||||
void reset() {
|
||||
// Record, up front, whether our in-RAM format will be
|
||||
// with or without term freqs:
|
||||
indexOptions = fieldInfo.indexOptions;
|
||||
setIndexOptions(fieldInfo.indexOptions);
|
||||
payloadAttribute = null;
|
||||
}
|
||||
|
||||
private void setIndexOptions(IndexOptions indexOptions) {
|
||||
this.indexOptions = indexOptions;
|
||||
writeFreq = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
|
||||
writeProx = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||
writeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
boolean start(IndexableField[] fields, int count) {
|
||||
for(int i=0;i<count;i++) {
|
||||
|
@ -98,9 +112,16 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
|||
} else {
|
||||
payloadAttribute = null;
|
||||
}
|
||||
if (writeOffsets) {
|
||||
offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
|
||||
} else {
|
||||
offsetAttribute = null;
|
||||
}
|
||||
}
|
||||
|
||||
void writeProx(final int termID, int proxCode) {
|
||||
//System.out.println("writeProx termID=" + termID + " proxCode=" + proxCode);
|
||||
assert writeProx;
|
||||
final Payload payload;
|
||||
if (payloadAttribute == null) {
|
||||
payload = null;
|
||||
|
@ -113,12 +134,24 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
      termsHashPerField.writeVInt(1, payload.length);
      termsHashPerField.writeBytes(1, payload.data, payload.offset, payload.length);
      hasPayloads = true;
    } else
    } else {
      termsHashPerField.writeVInt(1, proxCode<<1);
    }

    FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
    postings.lastPositions[termID] = fieldState.position;
  }

  void writeOffsets(final int termID, int prevOffset) {
    assert writeOffsets;
    final int startOffset = offsetAttribute.startOffset();
    final int endOffset = offsetAttribute.endOffset();
    //System.out.println("writeOffsets termID=" + termID + " prevOffset=" + prevOffset + " startOff=" + startOffset + " endOff=" + endOffset);
    termsHashPerField.writeVInt(1, startOffset - prevOffset);
    termsHashPerField.writeVInt(1, endOffset - startOffset);

    FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
    postings.lastOffsets[termID] = startOffset;
  }

  @Override
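As a worked example of the delta encoding above: if a term's previous occurrence in the current document started at offset 10 and the next occurrence spans offsets 25..30, writeOffsets emits the VInts 15 (25 - 10) and 5 (30 - 25), then records 25 in lastOffsets[termID] so the following occurrence is encoded against it.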
@ -129,13 +162,18 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
|||
|
||||
FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
|
||||
postings.lastDocIDs[termID] = docState.docID;
|
||||
if (indexOptions == IndexOptions.DOCS_ONLY) {
|
||||
if (!writeFreq) {
|
||||
postings.lastDocCodes[termID] = docState.docID;
|
||||
} else {
|
||||
postings.lastDocCodes[termID] = docState.docID << 1;
|
||||
postings.docFreqs[termID] = 1;
|
||||
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
|
||||
if (writeProx) {
|
||||
writeProx(termID, fieldState.position);
|
||||
if (writeOffsets) {
|
||||
writeOffsets(termID, fieldState.offset);
|
||||
}
|
||||
} else {
|
||||
assert !writeOffsets;
|
||||
}
|
||||
}
|
||||
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
|
||||
|
@ -149,9 +187,10 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
|||
|
||||
FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
|
||||
|
||||
assert indexOptions == IndexOptions.DOCS_ONLY || postings.docFreqs[termID] > 0;
|
||||
assert !writeFreq || postings.docFreqs[termID] > 0;
|
||||
|
||||
if (indexOptions == IndexOptions.DOCS_ONLY) {
|
||||
if (!writeFreq) {
|
||||
assert postings.docFreqs == null;
|
||||
if (docState.docID != postings.lastDocIDs[termID]) {
|
||||
assert docState.docID > postings.lastDocIDs[termID];
|
||||
termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]);
|
||||
|
@ -159,59 +198,76 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
|||
postings.lastDocIDs[termID] = docState.docID;
|
||||
fieldState.uniqueTermCount++;
|
||||
}
|
||||
} else {
|
||||
if (docState.docID != postings.lastDocIDs[termID]) {
|
||||
assert docState.docID > postings.lastDocIDs[termID]:"id: "+docState.docID + " postings ID: "+ postings.lastDocIDs[termID] + " termID: "+termID;
|
||||
// Term not yet seen in the current doc but previously
|
||||
// seen in other doc(s) since the last flush
|
||||
} else if (docState.docID != postings.lastDocIDs[termID]) {
|
||||
assert docState.docID > postings.lastDocIDs[termID]:"id: "+docState.docID + " postings ID: "+ postings.lastDocIDs[termID] + " termID: "+termID;
|
||||
// Term not yet seen in the current doc but previously
|
||||
// seen in other doc(s) since the last flush
|
||||
|
||||
// Now that we know doc freq for previous doc,
|
||||
// write it & lastDocCode
|
||||
if (1 == postings.docFreqs[termID])
|
||||
termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]|1);
|
||||
else {
|
||||
termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]);
|
||||
termsHashPerField.writeVInt(0, postings.docFreqs[termID]);
|
||||
}
|
||||
postings.docFreqs[termID] = 1;
|
||||
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
|
||||
postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
|
||||
postings.lastDocIDs[termID] = docState.docID;
|
||||
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
|
||||
writeProx(termID, fieldState.position);
|
||||
}
|
||||
fieldState.uniqueTermCount++;
|
||||
// Now that we know doc freq for previous doc,
|
||||
// write it & lastDocCode
|
||||
if (1 == postings.docFreqs[termID]) {
|
||||
termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]|1);
|
||||
} else {
|
||||
fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.docFreqs[termID]);
|
||||
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
|
||||
writeProx(termID, fieldState.position-postings.lastPositions[termID]);
|
||||
termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]);
|
||||
termsHashPerField.writeVInt(0, postings.docFreqs[termID]);
|
||||
}
|
||||
postings.docFreqs[termID] = 1;
|
||||
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
|
||||
postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
|
||||
postings.lastDocIDs[termID] = docState.docID;
|
||||
if (writeProx) {
|
||||
writeProx(termID, fieldState.position);
|
||||
if (writeOffsets) {
|
||||
writeOffsets(termID, fieldState.offset);
|
||||
}
|
||||
} else {
|
||||
assert !writeOffsets;
|
||||
}
|
||||
fieldState.uniqueTermCount++;
|
||||
} else {
|
||||
fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.docFreqs[termID]);
|
||||
if (writeProx) {
|
||||
writeProx(termID, fieldState.position-postings.lastPositions[termID]);
|
||||
}
|
||||
if (writeOffsets) {
|
||||
writeOffsets(termID, postings.lastOffsets[termID]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
ParallelPostingsArray createPostingsArray(int size) {
|
||||
return new FreqProxPostingsArray(size);
|
||||
return new FreqProxPostingsArray(size, writeFreq, writeProx, writeOffsets);
|
||||
}
|
||||
|
||||
static final class FreqProxPostingsArray extends ParallelPostingsArray {
|
||||
public FreqProxPostingsArray(int size) {
|
||||
public FreqProxPostingsArray(int size, boolean writeFreqs, boolean writeProx, boolean writeOffsets) {
|
||||
super(size);
|
||||
docFreqs = new int[size];
|
||||
if (writeFreqs) {
|
||||
docFreqs = new int[size];
|
||||
}
|
||||
lastDocIDs = new int[size];
|
||||
lastDocCodes = new int[size];
|
||||
lastPositions = new int[size];
|
||||
if (writeProx) {
|
||||
lastPositions = new int[size];
|
||||
if (writeOffsets) {
|
||||
lastOffsets = new int[size];
|
||||
}
|
||||
} else {
|
||||
assert !writeOffsets;
|
||||
}
|
||||
//System.out.println("PA init freqs=" + writeFreqs + " pos=" + writeProx + " offs=" + writeOffsets);
|
||||
}
|
||||
|
||||
int docFreqs[]; // # times this term occurs in the current doc
|
||||
int lastDocIDs[]; // Last docID where this term occurred
|
||||
int lastDocCodes[]; // Code for prior doc
|
||||
int lastPositions[]; // Last position where this term occurred
|
||||
int lastOffsets[]; // Last endOffset where this term occurred
|
||||
|
||||
@Override
|
||||
ParallelPostingsArray newInstance(int size) {
|
||||
return new FreqProxPostingsArray(size);
|
||||
return new FreqProxPostingsArray(size, docFreqs != null, lastPositions != null, lastOffsets != null);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -221,15 +277,36 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
|||
|
||||
super.copyTo(toArray, numToCopy);
|
||||
|
||||
System.arraycopy(docFreqs, 0, to.docFreqs, 0, numToCopy);
|
||||
System.arraycopy(lastDocIDs, 0, to.lastDocIDs, 0, numToCopy);
|
||||
System.arraycopy(lastDocCodes, 0, to.lastDocCodes, 0, numToCopy);
|
||||
System.arraycopy(lastPositions, 0, to.lastPositions, 0, numToCopy);
|
||||
if (lastPositions != null) {
|
||||
assert to.lastPositions != null;
|
||||
System.arraycopy(lastPositions, 0, to.lastPositions, 0, numToCopy);
|
||||
}
|
||||
if (lastOffsets != null) {
|
||||
assert to.lastOffsets != null;
|
||||
System.arraycopy(lastOffsets, 0, to.lastOffsets, 0, numToCopy);
|
||||
}
|
||||
if (docFreqs != null) {
|
||||
assert to.docFreqs != null;
|
||||
System.arraycopy(docFreqs, 0, to.docFreqs, 0, numToCopy);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
int bytesPerPosting() {
|
||||
return ParallelPostingsArray.BYTES_PER_POSTING + 4 * RamUsageEstimator.NUM_BYTES_INT;
|
||||
int bytes = ParallelPostingsArray.BYTES_PER_POSTING + 2 * RamUsageEstimator.NUM_BYTES_INT;
|
||||
if (lastPositions != null) {
|
||||
bytes += RamUsageEstimator.NUM_BYTES_INT;
|
||||
}
|
||||
if (lastOffsets != null) {
|
||||
bytes += RamUsageEstimator.NUM_BYTES_INT;
|
||||
}
|
||||
if (docFreqs != null) {
|
||||
bytes += RamUsageEstimator.NUM_BYTES_INT;
|
||||
}
|
||||
|
||||
return bytes;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -246,8 +323,33 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
|||
final TermsConsumer termsConsumer = consumer.addField(fieldInfo);
|
||||
final Comparator<BytesRef> termComp = termsConsumer.getComparator();
|
||||
|
||||
// CONFUSING: this.indexOptions holds the index options
|
||||
// that were current when we first saw this field. But
|
||||
// it's possible this has changed, eg when other
|
||||
// documents are indexed that cause a "downgrade" of the
|
||||
// IndexOptions. So we must decode the in-RAM buffer
|
||||
// according to this.indexOptions, but then write the
|
||||
// new segment to the directory according to
|
||||
// currentFieldIndexOptions:
|
||||
final IndexOptions currentFieldIndexOptions = fieldInfo.indexOptions;
|
||||
|
||||
final boolean writeTermFreq = currentFieldIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
|
||||
final boolean writePositions = currentFieldIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||
final boolean writeOffsets = currentFieldIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||
|
||||
final boolean readTermFreq = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
|
||||
final boolean readPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||
final boolean readOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||
|
||||
//System.out.println("flush readTF=" + readTermFreq + " readPos=" + readPositions + " readOffs=" + readOffsets);
|
||||
|
||||
// Make sure FieldInfo.update is working correctly!:
|
||||
assert !writeTermFreq || readTermFreq;
|
||||
assert !writePositions || readPositions;
|
||||
assert !writeOffsets || readOffsets;
|
||||
|
||||
assert !writeOffsets || writePositions;
|
||||
|
||||
final Map<Term,Integer> segDeletes;
|
||||
if (state.segDeletes != null && state.segDeletes.terms.size() > 0) {
|
||||
segDeletes = state.segDeletes.terms;
|
||||
|
@ -268,12 +370,13 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
|||
|
||||
for (int i = 0; i < numTerms; i++) {
|
||||
final int termID = termIDs[i];
|
||||
//System.out.println("term=" + termID);
|
||||
// Get BytesRef
|
||||
final int textStart = postings.textStarts[termID];
|
||||
termsHashPerField.bytePool.setBytesRef(text, textStart);
|
||||
|
||||
termsHashPerField.initReader(freq, termID, 0);
|
||||
if (fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
|
||||
if (readPositions || readOffsets) {
|
||||
termsHashPerField.initReader(prox, termID, 1);
|
||||
}
|
||||
|
||||
|
@ -303,15 +406,18 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
|||
int numDocs = 0;
|
||||
long totTF = 0;
|
||||
int docID = 0;
|
||||
int termFreq = 0;
|
||||
|
||||
while(true) {
|
||||
//System.out.println(" cycle");
|
||||
final int termDocFreq;
|
||||
if (freq.eof()) {
|
||||
if (postings.lastDocCodes[termID] != -1) {
|
||||
// Return last doc
|
||||
docID = postings.lastDocIDs[termID];
|
||||
if (indexOptions != IndexOptions.DOCS_ONLY) {
|
||||
termFreq = postings.docFreqs[termID];
|
||||
if (readTermFreq) {
|
||||
termDocFreq = postings.docFreqs[termID];
|
||||
} else {
|
||||
termDocFreq = 0;
|
||||
}
|
||||
postings.lastDocCodes[termID] = -1;
|
||||
} else {
|
||||
|
@ -320,14 +426,15 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
|||
}
|
||||
} else {
|
||||
final int code = freq.readVInt();
|
||||
if (indexOptions == IndexOptions.DOCS_ONLY) {
|
||||
if (!readTermFreq) {
|
||||
docID += code;
|
||||
termDocFreq = 0;
|
||||
} else {
|
||||
docID += code >>> 1;
|
||||
if ((code & 1) != 0) {
|
||||
termFreq = 1;
|
||||
termDocFreq = 1;
|
||||
} else {
|
||||
termFreq = freq.readVInt();
|
||||
termDocFreq = freq.readVInt();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -336,7 +443,6 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
|||
|
||||
numDocs++;
|
||||
assert docID < state.numDocs: "doc=" + docID + " maxDoc=" + state.numDocs;
|
||||
final int termDocFreq = termFreq;
|
||||
|
||||
// NOTE: we could check here if the docID was
|
||||
// deleted, and skip it. However, this is somewhat
|
||||
|
@ -362,45 +468,54 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
|||
state.liveDocs.clear(docID);
|
||||
}
|
||||
|
||||
if (currentFieldIndexOptions != IndexOptions.DOCS_ONLY) {
|
||||
totTF += termDocFreq;
|
||||
}
|
||||
totTF += termDocFreq;
|
||||
|
||||
// Carefully copy over the prox + payload info,
|
||||
// changing the format to match Lucene's segment
|
||||
// format.
|
||||
|
||||
if (currentFieldIndexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
|
||||
// we do write positions & payload
|
||||
if (readPositions || readOffsets) {
|
||||
// we did record positions (& maybe payload) and/or offsets
|
||||
int position = 0;
|
||||
int offset = 0;
|
||||
for(int j=0;j<termDocFreq;j++) {
|
||||
final int code = prox.readVInt();
|
||||
position += code >> 1;
|
||||
|
||||
final int payloadLength;
|
||||
final BytesRef thisPayload;
|
||||
|
||||
if ((code & 1) != 0) {
|
||||
// This position has a payload
|
||||
payloadLength = prox.readVInt();
|
||||
if (readPositions) {
|
||||
final int code = prox.readVInt();
|
||||
position += code >> 1;
|
||||
|
||||
if (payload == null) {
|
||||
payload = new BytesRef();
|
||||
payload.bytes = new byte[payloadLength];
|
||||
} else if (payload.bytes.length < payloadLength) {
|
||||
payload.grow(payloadLength);
|
||||
if ((code & 1) != 0) {
|
||||
|
||||
// This position has a payload
|
||||
final int payloadLength = prox.readVInt();
|
||||
|
||||
if (payload == null) {
|
||||
payload = new BytesRef();
|
||||
payload.bytes = new byte[payloadLength];
|
||||
} else if (payload.bytes.length < payloadLength) {
|
||||
payload.grow(payloadLength);
|
||||
}
|
||||
|
||||
prox.readBytes(payload.bytes, 0, payloadLength);
|
||||
payload.length = payloadLength;
|
||||
thisPayload = payload;
|
||||
|
||||
} else {
|
||||
thisPayload = null;
|
||||
}
|
||||
|
||||
prox.readBytes(payload.bytes, 0, payloadLength);
|
||||
payload.length = payloadLength;
|
||||
thisPayload = payload;
|
||||
|
||||
} else {
|
||||
payloadLength = 0;
|
||||
thisPayload = null;
|
||||
if (readOffsets) {
|
||||
final int startOffset = offset + prox.readVInt();
|
||||
final int endOffset = startOffset + prox.readVInt();
|
||||
offset = startOffset;
|
||||
if (writePositions) {
|
||||
postingsConsumer.addPosition(position, thisPayload, startOffset, endOffset);
|
||||
}
|
||||
} else if (writePositions) {
|
||||
postingsConsumer.addPosition(position, thisPayload, -1, -1);
|
||||
}
|
||||
}
|
||||
|
||||
postingsConsumer.addPosition(position, thisPayload);
|
||||
}
|
||||
|
||||
postingsConsumer.finishDoc();
|
||||
|
@ -413,6 +528,4 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
|||
|
||||
termsConsumer.finish(sumTotalTermFreq, sumDocFreq, visitedDocs.cardinality());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -788,9 +788,9 @@ public abstract class IndexReader implements Closeable {

  /** Returns {@link DocsAndPositionsEnum} for the specified
   *  field & term.  This may return null, if either the
   *  field or term does not exist, or, positions were not
   *  indexed for this field. */
  public final DocsAndPositionsEnum termPositionsEnum(Bits liveDocs, String field, BytesRef term) throws IOException {
   *  field or term does not exist, or needsOffsets is
   *  true but offsets were not indexed for this field. */
  public final DocsAndPositionsEnum termPositionsEnum(Bits liveDocs, String field, BytesRef term, boolean needsOffsets) throws IOException {
    assert field != null;
    assert term != null;
    final Fields fields = fields();

@ -799,7 +799,7 @@ public abstract class IndexReader implements Closeable {
      if (terms != null) {
        final TermsEnum termsEnum = terms.iterator(null);
        if (termsEnum.seekExact(term, true)) {
          return termsEnum.docsAndPositions(liveDocs, null);
          return termsEnum.docsAndPositions(liveDocs, null, needsOffsets);
        }
      }
    }
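For reference, the convenience overload is then called as sketched below; the field and term are placeholders, and passing null liveDocs simply means no deletions are filtered:

  DocsAndPositionsEnum dpe = reader.termPositionsEnum(null, "body", new BytesRef("lucene"), true);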
@ -830,8 +830,9 @@ public abstract class IndexReader implements Closeable {
|
|||
* Returns {@link DocsAndPositionsEnum} for the specified field and
|
||||
* {@link TermState}. This may return null, if either the field or the term
|
||||
* does not exists, the {@link TermState} is invalid for the underlying
|
||||
* implementation, or positions were not indexed for this field. */
|
||||
public final DocsAndPositionsEnum termPositionsEnum(Bits liveDocs, String field, BytesRef term, TermState state) throws IOException {
|
||||
* implementation, or needsOffsets is true but offsets
|
||||
* were not indexed for this field. */
|
||||
public final DocsAndPositionsEnum termPositionsEnum(Bits liveDocs, String field, BytesRef term, TermState state, boolean needsOffsets) throws IOException {
|
||||
assert state != null;
|
||||
assert field != null;
|
||||
final Fields fields = fields();
|
||||
|
@ -840,7 +841,7 @@ public abstract class IndexReader implements Closeable {
|
|||
if (terms != null) {
|
||||
final TermsEnum termsEnum = terms.iterator(null);
|
||||
termsEnum.seekExact(term, state);
|
||||
return termsEnum.docsAndPositions(liveDocs, null);
|
||||
return termsEnum.docsAndPositions(liveDocs, null, needsOffsets);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
|
|
|
@ -125,6 +125,16 @@ public final class MultiDocsAndPositionsEnum extends DocsAndPositionsEnum {
|
|||
return current.nextPosition();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startOffset() throws IOException {
|
||||
return current.startOffset();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() throws IOException {
|
||||
return current.endOffset();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPayload() {
|
||||
return current.hasPayload();
|
||||
|
|
|
@ -167,14 +167,14 @@ public final class MultiFields extends Fields {
|
|||
/** Returns {@link DocsAndPositionsEnum} for the specified
|
||||
* field & term. This may return null if the term does
|
||||
* not exist or positions were not indexed. */
|
||||
public static DocsAndPositionsEnum getTermPositionsEnum(IndexReader r, Bits liveDocs, String field, BytesRef term) throws IOException {
|
||||
public static DocsAndPositionsEnum getTermPositionsEnum(IndexReader r, Bits liveDocs, String field, BytesRef term, boolean needsOffsets) throws IOException {
|
||||
assert field != null;
|
||||
assert term != null;
|
||||
final Terms terms = getTerms(r, field);
|
||||
if (terms != null) {
|
||||
final TermsEnum termsEnum = terms.iterator(null);
|
||||
if (termsEnum.seekExact(term, true)) {
|
||||
return termsEnum.docsAndPositions(liveDocs, null);
|
||||
return termsEnum.docsAndPositions(liveDocs, null, needsOffsets);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
|
|
|
@ -418,7 +418,7 @@ public final class MultiTermsEnum extends TermsEnum {
|
|||
}
|
||||
|
||||
@Override
|
||||
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
|
||||
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
|
||||
MultiDocsAndPositionsEnum docsAndPositionsEnum;
|
||||
// Can only reuse if incoming enum is also a MultiDocsAndPositionsEnum
|
||||
if (reuse != null && reuse instanceof MultiDocsAndPositionsEnum) {
|
||||
|
@ -469,7 +469,7 @@ public final class MultiTermsEnum extends TermsEnum {
|
|||
}
|
||||
|
||||
assert entry.index < docsAndPositionsEnum.subDocsAndPositionsEnum.length: entry.index + " vs " + docsAndPositionsEnum.subDocsAndPositionsEnum.length + "; " + subs.length;
|
||||
final DocsAndPositionsEnum subPostings = entry.terms.docsAndPositions(b, docsAndPositionsEnum.subDocsAndPositionsEnum[entry.index]);
|
||||
final DocsAndPositionsEnum subPostings = entry.terms.docsAndPositions(b, docsAndPositionsEnum.subDocsAndPositionsEnum[entry.index], needsOffsets);
|
||||
|
||||
if (subPostings != null) {
|
||||
docsAndPositionsEnum.subDocsAndPositionsEnum[entry.index] = subPostings;
|
||||
|
@ -479,8 +479,8 @@ public final class MultiTermsEnum extends TermsEnum {
|
|||
} else {
|
||||
if (entry.terms.docs(b, null, false) != null) {
|
||||
// At least one of our subs does not store
|
||||
// positions -- we can't correctly produce a
|
||||
// MultiDocsAndPositions enum
|
||||
// offsets or positions -- we can't correctly
|
||||
// produce a MultiDocsAndPositions enum
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -44,7 +44,7 @@ public final class Term implements Comparable<Term> {
|
|||
field = fld;
|
||||
this.bytes = bytes;
|
||||
}
|
||||
|
||||
|
||||
/** Constructs a Term with the given field and text.
|
||||
* <p>Note that a null field or null text value results in undefined
|
||||
* behavior for most Lucene APIs that accept a Term parameter. */
|
||||
|
@ -132,4 +132,8 @@ public final class Term implements Comparable<Term> {
|
|||
|
||||
@Override
|
||||
public final String toString() { return field + ":" + bytes.utf8ToString(); }
|
||||
|
||||
public Term deepCopyOf() {
|
||||
return new Term(field, BytesRef.deepCopyOf(bytes));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -38,7 +38,7 @@ final class TermVectorsConsumerPerField extends TermsHashConsumerPerField {
|
|||
boolean doVectorOffsets;
|
||||
|
||||
int maxNumPostings;
|
||||
OffsetAttribute offsetAttribute = null;
|
||||
OffsetAttribute offsetAttribute;
|
||||
|
||||
public TermVectorsConsumerPerField(TermsHashPerField termsHashPerField, TermVectorsConsumer termsWriter, FieldInfo fieldInfo) {
|
||||
this.termsHashPerField = termsHashPerField;
|
||||
|
|
|
@ -160,12 +160,13 @@ public abstract class TermsEnum {

  /** Get {@link DocsAndPositionsEnum} for the current term.
   *  Do not call this when the enum is unpositioned.
   *  This method will only return null if positions were
   *  not indexed into the postings by this codec.
   *  This method will only return null if needsOffsets is
   *  true but offsets were not indexed.
   *  @param liveDocs unset bits are documents that should not
   *  be returned
   *  @param reuse pass a prior DocsAndPositionsEnum for possible reuse */
  public abstract DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException;
   *  @param reuse pass a prior DocsAndPositionsEnum for possible reuse
   *  @param needsOffsets true if offsets are required */
  public abstract DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException;

  /**
   * Expert: Returns the TermsEnums internal state to position the TermsEnum

@ -238,7 +239,7 @@ public abstract class TermsEnum {
    }

    @Override
    public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) {
    public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) {
      throw new IllegalStateException("this method should never be called");
    }
@ -293,7 +293,7 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
|
|||
|
||||
@Override
|
||||
public int[] init() {
|
||||
if(perField.postingsArray == null) {
|
||||
if (perField.postingsArray == null) {
|
||||
perField.postingsArray = perField.consumer.createPostingsArray(2);
|
||||
bytesUsed.addAndGet(perField.postingsArray.size * perField.postingsArray.bytesPerPosting());
|
||||
}
|
||||
|
@ -305,8 +305,7 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
|
|||
ParallelPostingsArray postingsArray = perField.postingsArray;
|
||||
final int oldSize = perField.postingsArray.size;
|
||||
postingsArray = perField.postingsArray = postingsArray.grow();
|
||||
bytesUsed
|
||||
.addAndGet((postingsArray.bytesPerPosting() * (postingsArray.size - oldSize)));
|
||||
bytesUsed.addAndGet((postingsArray.bytesPerPosting() * (postingsArray.size - oldSize)));
|
||||
return postingsArray.textStarts;
|
||||
}
|
||||
|
||||
|
|
|
@ -1043,7 +1043,7 @@ class FieldCacheImpl implements FieldCache {
|
|||
}
|
||||
|
||||
@Override
|
||||
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
|
||||
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
|
|
|
@ -272,8 +272,8 @@ public final class FuzzyTermsEnum extends TermsEnum {
|
|||
|
||||
@Override
|
||||
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs,
|
||||
DocsAndPositionsEnum reuse) throws IOException {
|
||||
return actualEnum.docsAndPositions(liveDocs, reuse);
|
||||
DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
|
||||
return actualEnum.docsAndPositions(liveDocs, reuse, needsOffsets);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -225,7 +225,7 @@ public class MultiPhraseQuery extends Query {
|
|||
return null;
|
||||
}
|
||||
termsEnum.seekExact(term.bytes(), termState);
|
||||
postingsEnum = termsEnum.docsAndPositions(liveDocs, null);
|
||||
postingsEnum = termsEnum.docsAndPositions(liveDocs, null, false);
|
||||
|
||||
if (postingsEnum == null) {
|
||||
// term does exist, but has no positions
|
||||
|
@ -475,7 +475,7 @@ class UnionDocsAndPositionsEnum extends DocsAndPositionsEnum {
|
|||
continue;
|
||||
}
|
||||
termsEnum.seekExact(term.bytes(), termState);
|
||||
DocsAndPositionsEnum postings = termsEnum.docsAndPositions(liveDocs, null);
|
||||
DocsAndPositionsEnum postings = termsEnum.docsAndPositions(liveDocs, null, false);
|
||||
if (postings == null) {
|
||||
// term does exist, but has no positions
|
||||
throw new IllegalStateException("field \"" + term.field() + "\" was indexed without position data; cannot run PhraseQuery (term=" + term.text() + ")");
|
||||
|
@ -527,6 +527,16 @@ class UnionDocsAndPositionsEnum extends DocsAndPositionsEnum {
|
|||
return _posList.next();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startOffset() {
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() {
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef getPayload() {
|
||||
throw new UnsupportedOperationException();
|
||||
|
|
|
@ -239,7 +239,7 @@ public class PhraseQuery extends Query {
|
|||
return null;
|
||||
}
|
||||
te.seekExact(t.bytes(), state);
|
||||
DocsAndPositionsEnum postingsEnum = te.docsAndPositions(liveDocs, null);
|
||||
DocsAndPositionsEnum postingsEnum = te.docsAndPositions(liveDocs, null, false);
|
||||
|
||||
// PhraseQuery on a field that did not index
|
||||
// positions.
|
||||
|
|
|
@ -120,7 +120,7 @@ public class SpanTermQuery extends SpanQuery {
|
|||
final TermsEnum termsEnum = context.reader.terms(term.field()).iterator(null);
|
||||
termsEnum.seekExact(term.bytes(), state);
|
||||
|
||||
final DocsAndPositionsEnum postings = termsEnum.docsAndPositions(acceptDocs, null);
|
||||
final DocsAndPositionsEnum postings = termsEnum.docsAndPositions(acceptDocs, null, false);
|
||||
|
||||
if (postings != null) {
|
||||
return new TermSpans(postings, term);
|
||||
|
|
|
@ -0,0 +1,73 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
|
||||
public class CannedAnalyzer extends Analyzer {
|
||||
private final Token[] tokens;
|
||||
|
||||
public CannedAnalyzer(Token[] tokens) {
|
||||
this.tokens = tokens;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
return new TokenStreamComponents(new CannedTokenizer(tokens));
|
||||
}
|
||||
|
||||
public static class CannedTokenizer extends Tokenizer {
|
||||
private final Token[] tokens;
|
||||
private int upto = 0;
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
||||
public CannedTokenizer(Token[] tokens) {
|
||||
this.tokens = tokens;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
if (upto < tokens.length) {
|
||||
final Token token = tokens[upto++];
|
||||
// TODO: can we just capture/restoreState so
|
||||
// we get all attrs...?
|
||||
clearAttributes();
|
||||
termAtt.setEmpty();
|
||||
termAtt.append(token.toString());
|
||||
posIncrAtt.setPositionIncrement(token.getPositionIncrement());
|
||||
offsetAtt.setOffset(token.startOffset(), token.endOffset());
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
this.upto = 0;
|
||||
}
|
||||
}
|
||||
}
|
|
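The test analyzer above is consumed the same way as in TestPostingsOffsets later in this patch; a brief usage sketch follows. The field name, token text, and offsets are illustrative only, and writer is an assumed RandomIndexWriter in scope.

// Sketch: feeding pre-built tokens through CannedAnalyzer / CannedTokenizer.
Token[] tokens = new Token[] { new Token("hello", 0, 5), new Token("world", 6, 11) };
FieldType ft = new FieldType(TextField.TYPE_UNSTORED);
Document doc = new Document();
doc.add(new Field("content", new CannedAnalyzer.CannedTokenizer(tokens), ft));
writer.addDocument(doc, new CannedAnalyzer(tokens));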
@ -88,6 +88,9 @@ class PreFlexFieldsWriter extends FieldsConsumer {
|
|||
@Override
|
||||
public TermsConsumer addField(FieldInfo field) throws IOException {
|
||||
assert field.number != -1;
|
||||
if (field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
|
||||
throw new IllegalArgumentException("this codec cannot index offsets");
|
||||
}
|
||||
//System.out.println("w field=" + field.name + " storePayload=" + field.storePayloads + " number=" + field.number);
|
||||
return new PreFlexTermsWriter(field);
|
||||
}
|
||||
|
@ -157,8 +160,10 @@ class PreFlexFieldsWriter extends FieldsConsumer {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void addPosition(int position, BytesRef payload) throws IOException {
|
||||
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
|
||||
assert proxOut != null;
|
||||
assert startOffset == -1;
|
||||
assert endOffset == -1;
|
||||
|
||||
//System.out.println(" w pos=" + position + " payl=" + payload);
|
||||
final int delta = position - lastPosition;
|
||||
|
|
|
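On the write side, addPosition now carries offsets, and formats that cannot store them (PreFlex above, RAMOnly below) reject the offsets IndexOptions in addField and assert -1 per position. A condensed sketch of that guard inside a hypothetical PostingsConsumer subclass; writePositionDelta is an invented placeholder for the format's own encoding.

// Fragment of a hypothetical PostingsConsumer that does not store offsets.
@Override
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
  // addField already rejected DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS,
  // so offsets can never legally reach this format:
  assert startOffset == -1;
  assert endOffset == -1;
  writePositionDelta(position, payload); // placeholder for the real position/payload encoding
}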
@ -37,6 +37,7 @@ import org.apache.lucene.codecs.TermStats;
|
|||
import org.apache.lucene.codecs.TermsConsumer;
|
||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldsEnum;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
|
@ -197,6 +198,9 @@ public class RAMOnlyPostingsFormat extends PostingsFormat {
|
|||
|
||||
@Override
|
||||
public TermsConsumer addField(FieldInfo field) {
|
||||
if (field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
|
||||
throw new IllegalArgumentException("this codec cannot index offsets");
|
||||
}
|
||||
RAMField ramField = new RAMField(field.name);
|
||||
postings.fieldToTerms.put(field.name, ramField);
|
||||
termsConsumer.reset(ramField);
|
||||
|
@ -265,7 +269,9 @@ public class RAMOnlyPostingsFormat extends PostingsFormat {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void addPosition(int position, BytesRef payload) {
|
||||
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) {
|
||||
assert startOffset == -1;
|
||||
assert endOffset == -1;
|
||||
current.positions[posUpto] = position;
|
||||
if (payload != null && payload.length > 0) {
|
||||
if (current.payloads == null) {
|
||||
|
@ -388,7 +394,10 @@ public class RAMOnlyPostingsFormat extends PostingsFormat {
|
|||
}
|
||||
|
||||
@Override
|
||||
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) {
|
||||
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) {
|
||||
if (needsOffsets) {
|
||||
return null;
|
||||
}
|
||||
return new RAMDocsAndPositionsEnum(ramField.termToDocs.get(current), liveDocs);
|
||||
}
|
||||
}
|
||||
|
@ -493,6 +502,16 @@ public class RAMOnlyPostingsFormat extends PostingsFormat {
|
|||
return current.positions[posUpto++];
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startOffset() {
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() {
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPayload() {
|
||||
return current.payloads != null && current.payloads[posUpto-1] != null;
|
||||
|
|
|
@ -122,6 +122,10 @@ public class RandomIndexWriter implements Closeable {
|
|||
* @see IndexWriter#addDocument(Iterable)
|
||||
*/
|
||||
public <T extends IndexableField> void addDocument(final Iterable<T> doc) throws IOException {
|
||||
addDocument(doc, w.getAnalyzer());
|
||||
}
|
||||
|
||||
public <T extends IndexableField> void addDocument(final Iterable<T> doc, Analyzer a) throws IOException {
|
||||
if (doDocValues && doc instanceof Document) {
|
||||
randomPerDocFieldValues(r, (Document) doc);
|
||||
}
|
||||
|
@ -157,9 +161,9 @@ public class RandomIndexWriter implements Closeable {
|
|||
}
|
||||
};
|
||||
}
|
||||
});
|
||||
}, a);
|
||||
} else {
|
||||
w.addDocument(doc);
|
||||
w.addDocument(doc, a);
|
||||
}
|
||||
|
||||
maybeCommit();
|
||||
|
|
|
@ -1106,6 +1106,10 @@ public abstract class LuceneTestCase extends Assert {
|
|||
return new Field(name, value, type);
|
||||
}
|
||||
|
||||
// TODO: once all core & test codecs can index
|
||||
// offsets, sometimes randomly turn on offsets if we are
|
||||
// already indexing positions...
|
||||
|
||||
FieldType newType = new FieldType(type);
|
||||
if (!newType.stored() && random.nextBoolean()) {
|
||||
newType.setStored(true); // randomly store it
|
||||
|
|
|
@ -157,6 +157,7 @@ public class _TestUtil {
|
|||
public static CheckIndex.Status checkIndex(Directory dir) throws IOException {
|
||||
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
|
||||
CheckIndex checker = new CheckIndex(dir);
|
||||
checker.setCrossCheckTermVectors(true);
|
||||
checker.setInfoStream(new PrintStream(bos), false);
|
||||
CheckIndex.Status indexStatus = checker.checkIndex(null);
|
||||
if (indexStatus == null || indexStatus.clean == false) {
|
||||
|
@ -567,7 +568,10 @@ public class _TestUtil {
|
|||
if (random.nextBoolean()) {
|
||||
if (random.nextBoolean()) {
|
||||
// TODO: cast re-use to D&PE if we can...?
|
||||
final DocsAndPositionsEnum docsAndPositions = termsEnum.docsAndPositions(liveDocs, null);
|
||||
DocsAndPositionsEnum docsAndPositions = termsEnum.docsAndPositions(liveDocs, null, true);
|
||||
if (docsAndPositions == null) {
|
||||
docsAndPositions = termsEnum.docsAndPositions(liveDocs, null, false);
|
||||
}
|
||||
if (docsAndPositions != null) {
|
||||
return docsAndPositions;
|
||||
}
|
||||
|
@ -586,7 +590,10 @@ public class _TestUtil {
|
|||
if (random.nextBoolean()) {
|
||||
if (random.nextBoolean()) {
|
||||
// TODO: cast re-use to D&PE if we can...?
|
||||
final DocsAndPositionsEnum docsAndPositions = termsEnum.docsAndPositions(liveDocs, null);
|
||||
DocsAndPositionsEnum docsAndPositions = termsEnum.docsAndPositions(liveDocs, null, true);
|
||||
if (docsAndPositions == null) {
|
||||
docsAndPositions = termsEnum.docsAndPositions(liveDocs, null, false);
|
||||
}
|
||||
if (docsAndPositions != null) {
|
||||
return docsAndPositions;
|
||||
}
|
||||
|
|
|
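The pattern above is how test code copes with codecs that cannot supply offsets: ask for them first, then retry without. Roughly:

// Sketch of the fallback used above: request offsets, retry without them if unsupported.
DocsAndPositionsEnum dpe = termsEnum.docsAndPositions(liveDocs, null, true);
if (dpe == null) {
  // either no positions at all, or this codec cannot surface offsets
  dpe = termsEnum.docsAndPositions(liveDocs, null, false);
}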
@ -74,7 +74,8 @@ public class TestCachingTokenFilter extends BaseTokenStreamTestCase {
|
|||
DocsAndPositionsEnum termPositions = MultiFields.getTermPositionsEnum(reader,
|
||||
MultiFields.getLiveDocs(reader),
|
||||
"preanalyzed",
|
||||
new BytesRef("term1"));
|
||||
new BytesRef("term1"),
|
||||
false);
|
||||
assertTrue(termPositions.nextDoc() != termPositions.NO_MORE_DOCS);
|
||||
assertEquals(1, termPositions.freq());
|
||||
assertEquals(0, termPositions.nextPosition());
|
||||
|
@ -82,7 +83,8 @@ public class TestCachingTokenFilter extends BaseTokenStreamTestCase {
|
|||
termPositions = MultiFields.getTermPositionsEnum(reader,
|
||||
MultiFields.getLiveDocs(reader),
|
||||
"preanalyzed",
|
||||
new BytesRef("term2"));
|
||||
new BytesRef("term2"),
|
||||
false);
|
||||
assertTrue(termPositions.nextDoc() != termPositions.NO_MORE_DOCS);
|
||||
assertEquals(2, termPositions.freq());
|
||||
assertEquals(1, termPositions.nextPosition());
|
||||
|
@ -91,7 +93,8 @@ public class TestCachingTokenFilter extends BaseTokenStreamTestCase {
|
|||
termPositions = MultiFields.getTermPositionsEnum(reader,
|
||||
MultiFields.getLiveDocs(reader),
|
||||
"preanalyzed",
|
||||
new BytesRef("term3"));
|
||||
new BytesRef("term3"),
|
||||
false);
|
||||
assertTrue(termPositions.nextDoc() != termPositions.NO_MORE_DOCS);
|
||||
assertEquals(1, termPositions.freq());
|
||||
assertEquals(2, termPositions.nextPosition());
|
||||
|
|
|
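MultiFields.getTermPositionsEnum picks up the same flag, as the test changes above show. A one-line sketch of the common positions-only call; reader, field name, and term are assumed values.

// Sketch: the MultiFields helper now also takes needsOffsets (false = positions only).
DocsAndPositionsEnum tp = MultiFields.getTermPositionsEnum(reader, MultiFields.getLiveDocs(reader),
    "field", new BytesRef("term"), false);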
@ -23,7 +23,6 @@ import java.util.Map;
|
|||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.nestedpulsing.NestedPulsingPostingsFormat;
|
||||
import org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.TextField;
|
||||
|
@ -70,7 +69,7 @@ public class TestPulsingReuse extends LuceneTestCase {
|
|||
DocsAndPositionsEnum posReuse = null;
|
||||
te = segment.terms("foo").iterator(null);
|
||||
while (te.next() != null) {
|
||||
posReuse = te.docsAndPositions(null, posReuse);
|
||||
posReuse = te.docsAndPositions(null, posReuse, false);
|
||||
allEnums.put(posReuse, true);
|
||||
}
|
||||
|
||||
|
@ -112,7 +111,7 @@ public class TestPulsingReuse extends LuceneTestCase {
|
|||
DocsAndPositionsEnum posReuse = null;
|
||||
te = segment.terms("foo").iterator(null);
|
||||
while (te.next() != null) {
|
||||
posReuse = te.docsAndPositions(null, posReuse);
|
||||
posReuse = te.docsAndPositions(null, posReuse, false);
|
||||
allEnums.put(posReuse, true);
|
||||
}
|
||||
|
||||
|
|
|
@ -347,7 +347,7 @@ public class TestDocument extends LuceneTestCase {
|
|||
assertEquals(2, tvs.getUniqueTermCount());
|
||||
TermsEnum tvsEnum = tvs.iterator(null);
|
||||
assertEquals(new BytesRef("abc"), tvsEnum.next());
|
||||
final DocsAndPositionsEnum dpEnum = tvsEnum.docsAndPositions(null, null);
|
||||
final DocsAndPositionsEnum dpEnum = tvsEnum.docsAndPositions(null, null, false);
|
||||
if (field.equals("tv")) {
|
||||
assertNull(dpEnum);
|
||||
} else {
|
||||
|
|
|
@ -166,7 +166,7 @@ public class TestCodecs extends LuceneTestCase {
|
|||
totTF += positions[i].length;
|
||||
for(int j=0;j<positions[i].length;j++) {
|
||||
final PositionData pos = positions[i][j];
|
||||
postingsConsumer.addPosition(pos.pos, pos.payload);
|
||||
postingsConsumer.addPosition(pos.pos, pos.payload, -1, -1);
|
||||
}
|
||||
postingsConsumer.finishDoc();
|
||||
}
|
||||
|
@ -480,7 +480,7 @@ public class TestCodecs extends LuceneTestCase {
|
|||
if (field.omitTF) {
|
||||
this.verifyDocs(term.docs, term.positions, _TestUtil.docs(random, termsEnum, null, null, false), false);
|
||||
} else {
|
||||
this.verifyDocs(term.docs, term.positions, termsEnum.docsAndPositions(null, null), true);
|
||||
this.verifyDocs(term.docs, term.positions, termsEnum.docsAndPositions(null, null, false), true);
|
||||
}
|
||||
|
||||
// Test random seek by ord:
|
||||
|
@ -500,7 +500,7 @@ public class TestCodecs extends LuceneTestCase {
|
|||
if (field.omitTF) {
|
||||
this.verifyDocs(term.docs, term.positions, _TestUtil.docs(random, termsEnum, null, null, false), false);
|
||||
} else {
|
||||
this.verifyDocs(term.docs, term.positions, termsEnum.docsAndPositions(null, null), true);
|
||||
this.verifyDocs(term.docs, term.positions, termsEnum.docsAndPositions(null, null, false), true);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -552,7 +552,7 @@ public class TestCodecs extends LuceneTestCase {
|
|||
final DocsEnum docsAndFreqs;
|
||||
final DocsAndPositionsEnum postings;
|
||||
if (!field.omitTF) {
|
||||
postings = termsEnum.docsAndPositions(null, null);
|
||||
postings = termsEnum.docsAndPositions(null, null, false);
|
||||
if (postings != null) {
|
||||
docs = docsAndFreqs = postings;
|
||||
} else {
|
||||
|
|
|
@ -234,7 +234,7 @@ public class TestDoc extends LuceneTestCase {
|
|||
out.print(" term=" + field + ":" + tis.term());
|
||||
out.println(" DF=" + tis.docFreq());
|
||||
|
||||
DocsAndPositionsEnum positions = tis.docsAndPositions(reader.getLiveDocs(), null);
|
||||
DocsAndPositionsEnum positions = tis.docsAndPositions(reader.getLiveDocs(), null, false);
|
||||
|
||||
while (positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
out.print(" doc=" + positions.docID());
|
||||
|
|
|
@ -96,7 +96,7 @@ public class TestDocsAndPositions extends LuceneTestCase {
|
|||
|
||||
public DocsAndPositionsEnum getDocsAndPositions(IndexReader reader,
|
||||
BytesRef bytes, Bits liveDocs) throws IOException {
|
||||
return reader.termPositionsEnum(null, fieldName, bytes);
|
||||
return reader.termPositionsEnum(null, fieldName, bytes, false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -358,7 +358,7 @@ public class TestDocsAndPositions extends LuceneTestCase {
|
|||
writer.addDocument(doc);
|
||||
IndexReader reader = writer.getReader();
|
||||
IndexReader r = getOnlySegmentReader(reader);
|
||||
DocsAndPositionsEnum disi = r.termPositionsEnum(null, "foo", new BytesRef("bar"));
|
||||
DocsAndPositionsEnum disi = r.termPositionsEnum(null, "foo", new BytesRef("bar"), false);
|
||||
int docid = disi.docID();
|
||||
assertTrue(docid == -1 || docid == DocIdSetIterator.NO_MORE_DOCS);
|
||||
assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
|
||||
|
@ -366,7 +366,7 @@ public class TestDocsAndPositions extends LuceneTestCase {
|
|||
// now reuse and check again
|
||||
TermsEnum te = r.terms("foo").iterator(null);
|
||||
assertTrue(te.seekExact(new BytesRef("bar"), true));
|
||||
disi = te.docsAndPositions(null, disi);
|
||||
disi = te.docsAndPositions(null, disi, false);
|
||||
docid = disi.docID();
|
||||
assertTrue(docid == -1 || docid == DocIdSetIterator.NO_MORE_DOCS);
|
||||
assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
|
||||
|
|
|
@ -128,7 +128,7 @@ public class TestDocumentWriter extends LuceneTestCase {
|
|||
SegmentReader reader = new SegmentReader(info, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR, newIOContext(random));
|
||||
|
||||
DocsAndPositionsEnum termPositions = MultiFields.getTermPositionsEnum(reader, MultiFields.getLiveDocs(reader),
|
||||
"repeated", new BytesRef("repeated"));
|
||||
"repeated", new BytesRef("repeated"), false);
|
||||
assertTrue(termPositions.nextDoc() != termPositions.NO_MORE_DOCS);
|
||||
int freq = termPositions.freq();
|
||||
assertEquals(2, freq);
|
||||
|
@ -199,7 +199,7 @@ public class TestDocumentWriter extends LuceneTestCase {
|
|||
writer.close();
|
||||
SegmentReader reader = new SegmentReader(info, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR, newIOContext(random));
|
||||
|
||||
DocsAndPositionsEnum termPositions = MultiFields.getTermPositionsEnum(reader, reader.getLiveDocs(), "f1", new BytesRef("a"));
|
||||
DocsAndPositionsEnum termPositions = MultiFields.getTermPositionsEnum(reader, reader.getLiveDocs(), "f1", new BytesRef("a"), false);
|
||||
assertTrue(termPositions.nextDoc() != termPositions.NO_MORE_DOCS);
|
||||
int freq = termPositions.freq();
|
||||
assertEquals(3, freq);
|
||||
|
@ -243,18 +243,18 @@ public class TestDocumentWriter extends LuceneTestCase {
|
|||
writer.close();
|
||||
SegmentReader reader = new SegmentReader(info, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR, newIOContext(random));
|
||||
|
||||
DocsAndPositionsEnum termPositions = reader.termPositionsEnum(reader.getLiveDocs(), "preanalyzed", new BytesRef("term1"));
|
||||
DocsAndPositionsEnum termPositions = reader.termPositionsEnum(reader.getLiveDocs(), "preanalyzed", new BytesRef("term1"), false);
|
||||
assertTrue(termPositions.nextDoc() != termPositions.NO_MORE_DOCS);
|
||||
assertEquals(1, termPositions.freq());
|
||||
assertEquals(0, termPositions.nextPosition());
|
||||
|
||||
termPositions = reader.termPositionsEnum(reader.getLiveDocs(), "preanalyzed", new BytesRef("term2"));
|
||||
termPositions = reader.termPositionsEnum(reader.getLiveDocs(), "preanalyzed", new BytesRef("term2"), false);
|
||||
assertTrue(termPositions.nextDoc() != termPositions.NO_MORE_DOCS);
|
||||
assertEquals(2, termPositions.freq());
|
||||
assertEquals(1, termPositions.nextPosition());
|
||||
assertEquals(3, termPositions.nextPosition());
|
||||
|
||||
termPositions = reader.termPositionsEnum(reader.getLiveDocs(), "preanalyzed", new BytesRef("term3"));
|
||||
termPositions = reader.termPositionsEnum(reader.getLiveDocs(), "preanalyzed", new BytesRef("term3"), false);
|
||||
assertTrue(termPositions.nextDoc() != termPositions.NO_MORE_DOCS);
|
||||
assertEquals(1, termPositions.freq());
|
||||
assertEquals(2, termPositions.nextPosition());
|
||||
|
|
|
@ -260,17 +260,17 @@ public class TestDuelingCodecs extends LuceneTestCase {
|
|||
assertEquals(info, term, rightTermsEnum.next());
|
||||
assertTermStats(leftTermsEnum, rightTermsEnum);
|
||||
if (deep) {
|
||||
assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions),
|
||||
rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions));
|
||||
assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions),
|
||||
rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions));
|
||||
assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions, false),
|
||||
rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions, false));
|
||||
assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions, false),
|
||||
rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions, false));
|
||||
|
||||
assertPositionsSkipping(leftTermsEnum.docFreq(),
|
||||
leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions),
|
||||
rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions));
|
||||
leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions, false),
|
||||
rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions, false));
|
||||
assertPositionsSkipping(leftTermsEnum.docFreq(),
|
||||
leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions),
|
||||
rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions));
|
||||
leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions, false),
|
||||
rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions, false));
|
||||
|
||||
// with freqs:
|
||||
assertDocsEnum(leftDocs = leftTermsEnum.docs(null, leftDocs, true),
|
||||
|
@ -341,6 +341,8 @@ public class TestDuelingCodecs extends LuceneTestCase {
|
|||
for (int i = 0; i < freq; i++) {
|
||||
assertEquals(info, leftDocs.nextPosition(), rightDocs.nextPosition());
|
||||
assertEquals(info, leftDocs.hasPayload(), rightDocs.hasPayload());
|
||||
assertEquals(info, leftDocs.startOffset(), rightDocs.startOffset());
|
||||
assertEquals(info, leftDocs.endOffset(), rightDocs.endOffset());
|
||||
if (leftDocs.hasPayload()) {
|
||||
assertEquals(info, leftDocs.getPayload(), rightDocs.getPayload());
|
||||
}
|
||||
|
|
|
@ -90,8 +90,8 @@ public class TestFilterIndexReader extends LuceneTestCase {
|
|||
}
|
||||
|
||||
@Override
|
||||
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
|
||||
return new TestPositions(super.docsAndPositions(liveDocs, reuse == null ? null : ((FilterDocsAndPositionsEnum) reuse).in));
|
||||
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
|
||||
return new TestPositions(super.docsAndPositions(liveDocs, reuse == null ? null : ((FilterDocsAndPositionsEnum) reuse).in, needsOffsets));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -166,7 +166,7 @@ public class TestFilterIndexReader extends LuceneTestCase {
|
|||
assertEquals(TermsEnum.SeekStatus.FOUND, terms.seekCeil(new BytesRef("one")));
|
||||
|
||||
DocsAndPositionsEnum positions = terms.docsAndPositions(MultiFields.getLiveDocs(reader),
|
||||
null);
|
||||
null, false);
|
||||
while (positions.nextDoc() != DocsEnum.NO_MORE_DOCS) {
|
||||
assertTrue((positions.docID() % 2) == 1);
|
||||
}
|
||||
|
|
|
@ -603,8 +603,8 @@ public class TestIndexReader extends LuceneTestCase {
|
|||
|
||||
while(enum1.next() != null) {
|
||||
assertEquals("Different terms", enum1.term(), enum2.next());
|
||||
DocsAndPositionsEnum tp1 = enum1.docsAndPositions(liveDocs, null);
|
||||
DocsAndPositionsEnum tp2 = enum2.docsAndPositions(liveDocs, null);
|
||||
DocsAndPositionsEnum tp1 = enum1.docsAndPositions(liveDocs, null, false);
|
||||
DocsAndPositionsEnum tp2 = enum2.docsAndPositions(liveDocs, null, false);
|
||||
|
||||
while(tp1.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
assertTrue(tp2.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
|
||||
|
|
|
@ -39,6 +39,7 @@ import org.apache.lucene.document.StoredField;
|
|||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.FieldCache;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
|
@ -899,7 +900,8 @@ public class TestIndexWriter extends LuceneTestCase {
|
|||
DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(s.getIndexReader(),
|
||||
MultiFields.getLiveDocs(s.getIndexReader()),
|
||||
"field",
|
||||
new BytesRef("a"));
|
||||
new BytesRef("a"),
|
||||
false);
|
||||
|
||||
assertTrue(tps.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
|
||||
assertEquals(1, tps.freq());
|
||||
|
@ -964,14 +966,14 @@ public class TestIndexWriter extends LuceneTestCase {
|
|||
Terms tpv = r.getTermVectors(0).terms("field");
|
||||
TermsEnum termsEnum = tpv.iterator(null);
|
||||
assertNotNull(termsEnum.next());
|
||||
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
|
||||
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, false);
|
||||
assertNotNull(dpEnum);
|
||||
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
|
||||
assertEquals(1, dpEnum.freq());
|
||||
assertEquals(100, dpEnum.nextPosition());
|
||||
|
||||
assertNotNull(termsEnum.next());
|
||||
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
|
||||
dpEnum = termsEnum.docsAndPositions(null, dpEnum, false);
|
||||
assertNotNull(dpEnum);
|
||||
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
|
||||
assertEquals(1, dpEnum.freq());
|
||||
|
@ -1634,7 +1636,7 @@ public class TestIndexWriter extends LuceneTestCase {
|
|||
|
||||
// Make sure position is still incremented when
|
||||
// massive term is skipped:
|
||||
DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(reader, null, "content", new BytesRef("another"));
|
||||
DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(reader, null, "content", new BytesRef("another"), false);
|
||||
assertEquals(0, tps.nextDoc());
|
||||
assertEquals(1, tps.freq());
|
||||
assertEquals(3, tps.nextPosition());
|
||||
|
@ -1761,4 +1763,27 @@ public class TestIndexWriter extends LuceneTestCase {
|
|||
w1.close();
|
||||
d.close();
|
||||
}
|
||||
|
||||
public void testChangeIndexOptions() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
IndexWriter w = new IndexWriter(dir,
|
||||
new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)));
|
||||
|
||||
FieldType docsAndFreqs = new FieldType(TextField.TYPE_UNSTORED);
|
||||
docsAndFreqs.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
|
||||
|
||||
FieldType docsOnly = new FieldType(TextField.TYPE_UNSTORED);
|
||||
docsOnly.setIndexOptions(IndexOptions.DOCS_ONLY);
|
||||
|
||||
Document doc = new Document();
|
||||
doc.add(new Field("field", "a b c", docsAndFreqs));
|
||||
w.addDocument(doc);
|
||||
w.addDocument(doc);
|
||||
|
||||
doc = new Document();
|
||||
doc.add(new Field("field", "a b c", docsOnly));
|
||||
w.addDocument(doc);
|
||||
w.close();
|
||||
dir.close();
|
||||
}
|
||||
}
@ -263,14 +263,14 @@ public class TestIndexableField extends LuceneTestCase {
|
|||
TermsEnum termsEnum = tfv.iterator(null);
|
||||
assertEquals(new BytesRef(""+counter), termsEnum.next());
|
||||
assertEquals(1, termsEnum.totalTermFreq());
|
||||
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
|
||||
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, false);
|
||||
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
|
||||
assertEquals(1, dpEnum.freq());
|
||||
assertEquals(1, dpEnum.nextPosition());
|
||||
|
||||
assertEquals(new BytesRef("text"), termsEnum.next());
|
||||
assertEquals(1, termsEnum.totalTermFreq());
|
||||
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
|
||||
dpEnum = termsEnum.docsAndPositions(null, dpEnum, false);
|
||||
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
|
||||
assertEquals(1, dpEnum.freq());
|
||||
assertEquals(0, dpEnum.nextPosition());
|
||||
|
|
|
@ -156,7 +156,8 @@ public class TestLazyProxSkipping extends LuceneTestCase {
|
|||
DocsAndPositionsEnum tp = MultiFields.getTermPositionsEnum(reader,
|
||||
MultiFields.getLiveDocs(reader),
|
||||
this.field,
|
||||
new BytesRef("b"));
|
||||
new BytesRef("b"),
|
||||
false);
|
||||
|
||||
for (int i = 0; i < 10; i++) {
|
||||
tp.nextDoc();
|
||||
|
@ -167,7 +168,8 @@ public class TestLazyProxSkipping extends LuceneTestCase {
|
|||
tp = MultiFields.getTermPositionsEnum(reader,
|
||||
MultiFields.getLiveDocs(reader),
|
||||
this.field,
|
||||
new BytesRef("a"));
|
||||
new BytesRef("a"),
|
||||
false);
|
||||
|
||||
for (int i = 0; i < 10; i++) {
|
||||
tp.nextDoc();
|
||||
|
|
|
@ -173,7 +173,7 @@ public class TestLongPostings extends LuceneTestCase {
|
|||
System.out.println("\nTEST: iter=" + iter + " doS1=" + doS1);
|
||||
}
|
||||
|
||||
final DocsAndPositionsEnum postings = MultiFields.getTermPositionsEnum(r, null, "field", new BytesRef(term));
|
||||
final DocsAndPositionsEnum postings = MultiFields.getTermPositionsEnum(r, null, "field", new BytesRef(term), false);
|
||||
|
||||
int docID = -1;
|
||||
while(docID < DocsEnum.NO_MORE_DOCS) {
|
||||
|
|
|
@ -86,7 +86,8 @@ public class TestMultiLevelSkipList extends LuceneTestCase {
|
|||
counter = 0;
|
||||
DocsAndPositionsEnum tp = reader.termPositionsEnum(reader.getLiveDocs(),
|
||||
term.field(),
|
||||
new BytesRef(term.text()));
|
||||
new BytesRef(term.text()),
|
||||
false);
|
||||
|
||||
checkSkipTo(tp, 14, 185); // no skips
|
||||
checkSkipTo(tp, 17, 190); // one skip on level 0
|
||||
|
|
|
@ -51,7 +51,7 @@ public class TestOmitPositions extends LuceneTestCase {
|
|||
IndexReader reader = w.getReader();
|
||||
w.close();
|
||||
|
||||
assertNull(MultiFields.getTermPositionsEnum(reader, null, "foo", new BytesRef("test")));
|
||||
assertNull(MultiFields.getTermPositionsEnum(reader, null, "foo", new BytesRef("test"), false));
|
||||
|
||||
DocsEnum de = _TestUtil.docs(random, reader, "foo", new BytesRef("test"), null, null, true);
|
||||
while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
|
|
|
@ -155,7 +155,7 @@ public class TestPayloadProcessorProvider extends LuceneTestCase {
|
|||
IndexReader reader = IndexReader.open(dir);
|
||||
try {
|
||||
int numPayloads = 0;
|
||||
DocsAndPositionsEnum tpe = MultiFields.getTermPositionsEnum(reader, null, field, text);
|
||||
DocsAndPositionsEnum tpe = MultiFields.getTermPositionsEnum(reader, null, field, text, false);
|
||||
while (tpe.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
tpe.nextPosition();
|
||||
if (tpe.hasPayload()) {
|
||||
|
|
|
@ -222,7 +222,8 @@ public class TestPayloads extends LuceneTestCase {
|
|||
tps[i] = MultiFields.getTermPositionsEnum(reader,
|
||||
MultiFields.getLiveDocs(reader),
|
||||
terms[i].field(),
|
||||
new BytesRef(terms[i].text()));
|
||||
new BytesRef(terms[i].text()),
|
||||
false);
|
||||
}
|
||||
|
||||
while (tps[0].nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
|
@ -259,7 +260,8 @@ public class TestPayloads extends LuceneTestCase {
|
|||
DocsAndPositionsEnum tp = MultiFields.getTermPositionsEnum(reader,
|
||||
MultiFields.getLiveDocs(reader),
|
||||
terms[0].field(),
|
||||
new BytesRef(terms[0].text()));
|
||||
new BytesRef(terms[0].text()),
|
||||
false);
|
||||
tp.nextDoc();
|
||||
tp.nextPosition();
|
||||
// NOTE: prior rev of this test was failing to first
|
||||
|
@ -287,7 +289,8 @@ public class TestPayloads extends LuceneTestCase {
|
|||
tp = MultiFields.getTermPositionsEnum(reader,
|
||||
MultiFields.getLiveDocs(reader),
|
||||
terms[1].field(),
|
||||
new BytesRef(terms[1].text()));
|
||||
new BytesRef(terms[1].text()),
|
||||
false);
|
||||
tp.nextDoc();
|
||||
tp.nextPosition();
|
||||
assertEquals("Wrong payload length.", 1, tp.getPayload().length);
|
||||
|
@ -330,7 +333,8 @@ public class TestPayloads extends LuceneTestCase {
|
|||
tp = MultiFields.getTermPositionsEnum(reader,
|
||||
MultiFields.getLiveDocs(reader),
|
||||
fieldName,
|
||||
new BytesRef(singleTerm));
|
||||
new BytesRef(singleTerm),
|
||||
false);
|
||||
tp.nextDoc();
|
||||
tp.nextPosition();
|
||||
|
||||
|
@ -532,7 +536,7 @@ public class TestPayloads extends LuceneTestCase {
|
|||
DocsAndPositionsEnum tp = null;
|
||||
while (terms.next() != null) {
|
||||
String termText = terms.term().utf8ToString();
|
||||
tp = terms.docsAndPositions(liveDocs, tp);
|
||||
tp = terms.docsAndPositions(liveDocs, tp, false);
|
||||
while(tp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
int freq = tp.freq();
|
||||
for (int i = 0; i < freq; i++) {
|
||||
|
|
|
@ -0,0 +1,240 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.CannedAnalyzer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.FieldType;
|
||||
import org.apache.lucene.document.NumericField;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.search.FieldCache;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.junit.Assume;
|
||||
|
||||
public class TestPostingsOffsets extends LuceneTestCase {
|
||||
|
||||
public void testBasic() throws Exception {
|
||||
|
||||
// Currently only SimpleText can index offsets into postings:
|
||||
Assume.assumeTrue(Codec.getDefault().getName().equals("SimpleText"));
|
||||
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter w = new RandomIndexWriter(random, dir);
|
||||
Document doc = new Document();
|
||||
|
||||
FieldType ft = new FieldType(TextField.TYPE_UNSTORED);
|
||||
ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
Token[] tokens = new Token[] {
|
||||
makeToken("a", 1, 0, 6),
|
||||
makeToken("b", 1, 8, 9),
|
||||
makeToken("a", 1, 9, 17),
|
||||
makeToken("c", 1, 19, 50),
|
||||
};
|
||||
doc.add(new Field("content", new CannedAnalyzer.CannedTokenizer(tokens), ft));
|
||||
|
||||
w.addDocument(doc, new CannedAnalyzer(tokens));
|
||||
IndexReader r = w.getReader();
|
||||
w.close();
|
||||
|
||||
DocsAndPositionsEnum dp = MultiFields.getTermPositionsEnum(r, null, "content", new BytesRef("a"), true);
|
||||
assertNotNull(dp);
|
||||
assertEquals(0, dp.nextDoc());
|
||||
assertEquals(2, dp.freq());
|
||||
assertEquals(0, dp.nextPosition());
|
||||
assertEquals(0, dp.startOffset());
|
||||
assertEquals(6, dp.endOffset());
|
||||
assertEquals(2, dp.nextPosition());
|
||||
assertEquals(9, dp.startOffset());
|
||||
assertEquals(17, dp.endOffset());
|
||||
assertEquals(DocsEnum.NO_MORE_DOCS, dp.nextDoc());
|
||||
|
||||
dp = MultiFields.getTermPositionsEnum(r, null, "content", new BytesRef("b"), true);
|
||||
assertNotNull(dp);
|
||||
assertEquals(0, dp.nextDoc());
|
||||
assertEquals(1, dp.freq());
|
||||
assertEquals(1, dp.nextPosition());
|
||||
assertEquals(8, dp.startOffset());
|
||||
assertEquals(9, dp.endOffset());
|
||||
assertEquals(DocsEnum.NO_MORE_DOCS, dp.nextDoc());
|
||||
|
||||
dp = MultiFields.getTermPositionsEnum(r, null, "content", new BytesRef("c"), true);
|
||||
assertNotNull(dp);
|
||||
assertEquals(0, dp.nextDoc());
|
||||
assertEquals(1, dp.freq());
|
||||
assertEquals(3, dp.nextPosition());
|
||||
assertEquals(19, dp.startOffset());
|
||||
assertEquals(50, dp.endOffset());
|
||||
assertEquals(DocsEnum.NO_MORE_DOCS, dp.nextDoc());
|
||||
|
||||
r.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testRandom() throws Exception {
|
||||
// Currently only SimpleText can index offsets into postings:
|
||||
Assume.assumeTrue(Codec.getDefault().getName().equals("SimpleText"));
|
||||
|
||||
// token -> docID -> tokens
|
||||
final Map<String,Map<Integer,List<Token>>> actualTokens = new HashMap<String,Map<Integer,List<Token>>>();
|
||||
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter w = new RandomIndexWriter(random, dir);
|
||||
|
||||
final int numDocs = atLeast(20);
|
||||
//final int numDocs = atLeast(5);
|
||||
|
||||
FieldType ft = new FieldType(TextField.TYPE_UNSTORED);
|
||||
|
||||
// TODO: randomize what IndexOptions we use; also test
|
||||
// changing this up in one IW buffered segment...:
|
||||
ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
if (random.nextBoolean()) {
|
||||
ft.setStoreTermVectors(true);
|
||||
ft.setStoreTermVectorOffsets(random.nextBoolean());
|
||||
ft.setStoreTermVectorPositions(random.nextBoolean());
|
||||
}
|
||||
|
||||
for(int docCount=0;docCount<numDocs;docCount++) {
|
||||
Document doc = new Document();
|
||||
doc.add(new NumericField("id", docCount));
|
||||
List<Token> tokens = new ArrayList<Token>();
|
||||
final int numTokens = atLeast(100);
|
||||
//final int numTokens = atLeast(20);
|
||||
int pos = -1;
|
||||
int offset = 0;
|
||||
//System.out.println("doc id=" + docCount);
|
||||
for(int tokenCount=0;tokenCount<numTokens;tokenCount++) {
|
||||
final String text;
|
||||
if (random.nextBoolean()) {
|
||||
text = "a";
|
||||
} else if (random.nextBoolean()) {
|
||||
text = "b";
|
||||
} else if (random.nextBoolean()) {
|
||||
text = "c";
|
||||
} else {
|
||||
text = "d";
|
||||
}
|
||||
|
||||
int posIncr = random.nextBoolean() ? 1 : random.nextInt(5);
|
||||
if (tokenCount == 0 && posIncr == 0) {
|
||||
posIncr = 1;
|
||||
}
|
||||
final int offIncr = random.nextBoolean() ? 0 : random.nextInt(5);
|
||||
final int tokenOffset = random.nextInt(5);
|
||||
|
||||
final Token token = makeToken(text, posIncr, offset+offIncr, offset+offIncr+tokenOffset);
|
||||
if (!actualTokens.containsKey(text)) {
|
||||
actualTokens.put(text, new HashMap<Integer,List<Token>>());
|
||||
}
|
||||
final Map<Integer,List<Token>> postingsByDoc = actualTokens.get(text);
|
||||
if (!postingsByDoc.containsKey(docCount)) {
|
||||
postingsByDoc.put(docCount, new ArrayList<Token>());
|
||||
}
|
||||
postingsByDoc.get(docCount).add(token);
|
||||
tokens.add(token);
|
||||
pos += posIncr;
|
||||
// stuff abs position into type:
|
||||
token.setType(""+pos);
|
||||
offset += offIncr + tokenOffset;
|
||||
//System.out.println(" " + token + " posIncr=" + token.getPositionIncrement() + " pos=" + pos + " off=" + token.startOffset() + "/" + token.endOffset() + " (freq=" + postingsByDoc.get(docCount).size() + ")");
|
||||
}
|
||||
doc.add(new Field("content", new CannedAnalyzer.CannedTokenizer(tokens.toArray(new Token[tokens.size()])), ft));
|
||||
w.addDocument(doc);
|
||||
}
|
||||
final IndexReader r = w.getReader();
|
||||
w.close();
|
||||
|
||||
final String[] terms = new String[] {"a", "b", "c", "d"};
|
||||
for(IndexReader sub : r.getSequentialSubReaders()) {
|
||||
//System.out.println("\nsub=" + sub);
|
||||
final TermsEnum termsEnum = sub.fields().terms("content").iterator(null);
|
||||
DocsEnum docs = null;
|
||||
DocsAndPositionsEnum docsAndPositions = null;
|
||||
DocsAndPositionsEnum docsAndPositionsAndOffsets = null;
|
||||
final int docIDToID[] = FieldCache.DEFAULT.getInts(sub, "id", false);
|
||||
for(String term : terms) {
|
||||
//System.out.println(" term=" + term);
|
||||
if (termsEnum.seekExact(new BytesRef(term), random.nextBoolean())) {
|
||||
docs = termsEnum.docs(null, docs, true);
|
||||
assertNotNull(docs);
|
||||
int doc;
|
||||
//System.out.println(" doc/freq");
|
||||
while((doc = docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
|
||||
final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
|
||||
//System.out.println(" doc=" + docIDToID[doc] + " docID=" + doc + " " + expected.size() + " freq");
|
||||
assertNotNull(expected);
|
||||
assertEquals(expected.size(), docs.freq());
|
||||
}
|
||||
|
||||
docsAndPositions = termsEnum.docsAndPositions(null, docsAndPositions, false);
|
||||
assertNotNull(docsAndPositions);
|
||||
//System.out.println(" doc/freq/pos");
|
||||
while((doc = docsAndPositions.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
|
||||
final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
|
||||
//System.out.println(" doc=" + docIDToID[doc] + " " + expected.size() + " freq");
|
||||
assertNotNull(expected);
|
||||
assertEquals(expected.size(), docsAndPositions.freq());
|
||||
for(Token token : expected) {
|
||||
int pos = Integer.parseInt(token.type());
|
||||
//System.out.println(" pos=" + pos);
|
||||
assertEquals(pos, docsAndPositions.nextPosition());
|
||||
}
|
||||
}
|
||||
|
||||
docsAndPositionsAndOffsets = termsEnum.docsAndPositions(null, docsAndPositions, true);
|
||||
assertNotNull(docsAndPositionsAndOffsets);
|
||||
//System.out.println(" doc/freq/pos/offs");
|
||||
while((doc = docsAndPositions.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
|
||||
final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
|
||||
//System.out.println(" doc=" + docIDToID[doc] + " " + expected.size() + " freq");
|
||||
assertNotNull(expected);
|
||||
assertEquals(expected.size(), docsAndPositions.freq());
|
||||
for(Token token : expected) {
|
||||
int pos = Integer.parseInt(token.type());
|
||||
//System.out.println(" pos=" + pos);
|
||||
assertEquals(pos, docsAndPositions.nextPosition());
|
||||
assertEquals(token.startOffset(), docsAndPositions.startOffset());
|
||||
assertEquals(token.endOffset(), docsAndPositions.endOffset());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// TODO: test advance:
|
||||
}
|
||||
r.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
private Token makeToken(String text, int posIncr, int startOffset, int endOffset) {
|
||||
final Token t = new Token();
|
||||
t.append(text);
|
||||
t.setPositionIncrement(posIncr);
|
||||
t.setOffset(startOffset, endOffset);
|
||||
return t;
|
||||
}
|
||||
}
|
|
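The new test above indexes offsets into postings. Outside the test, the minimal opt-in looks roughly like the sketch below; only codecs that support offsets (currently SimpleText, per the assumption in the test) will accept it, the field name and text are examples, and writer is an assumed IndexWriter in scope.

// Sketch: opting a field into postings offsets.
FieldType ft = new FieldType(TextField.TYPE_UNSTORED);
ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Document doc = new Document();
doc.add(new Field("body", "some analyzed text", ft));
writer.addDocument(doc);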
@ -148,7 +148,8 @@ public class TestSegmentReader extends LuceneTestCase {
|
|||
DocsAndPositionsEnum positions = MultiFields.getTermPositionsEnum(reader,
|
||||
MultiFields.getLiveDocs(reader),
|
||||
DocHelper.TEXT_FIELD_1_KEY,
|
||||
new BytesRef("field"));
|
||||
new BytesRef("field"),
|
||||
false);
|
||||
// NOTE: prior rev of this test was failing to first
|
||||
// call next here:
|
||||
assertTrue(positions.nextDoc() != DocsEnum.NO_MORE_DOCS);
|
||||
|
|
|
@ -406,7 +406,7 @@ public class TestStressIndexing2 extends LuceneTestCase {
|
|||
BytesRef term2;
|
||||
while((term2 = termsEnum3.next()) != null) {
|
||||
System.out.println(" " + term2.utf8ToString() + ": freq=" + termsEnum3.totalTermFreq());
|
||||
dpEnum = termsEnum3.docsAndPositions(null, dpEnum);
|
||||
dpEnum = termsEnum3.docsAndPositions(null, dpEnum, false);
|
||||
if (dpEnum != null) {
|
||||
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
|
||||
final int freq = dpEnum.freq();
|
||||
|
@ -440,7 +440,7 @@ public class TestStressIndexing2 extends LuceneTestCase {
|
|||
BytesRef term2;
|
||||
while((term2 = termsEnum3.next()) != null) {
|
||||
System.out.println(" " + term2.utf8ToString() + ": freq=" + termsEnum3.totalTermFreq());
|
||||
dpEnum = termsEnum3.docsAndPositions(null, dpEnum);
|
||||
dpEnum = termsEnum3.docsAndPositions(null, dpEnum, false);
|
||||
if (dpEnum != null) {
|
||||
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
|
||||
final int freq = dpEnum.freq();
|
||||
|
@ -630,8 +630,8 @@ public class TestStressIndexing2 extends LuceneTestCase {
|
|||
assertEquals(termsEnum1.totalTermFreq(),
|
||||
termsEnum2.totalTermFreq());
|
||||
|
||||
dpEnum1 = termsEnum1.docsAndPositions(null, dpEnum1);
|
||||
dpEnum2 = termsEnum2.docsAndPositions(null, dpEnum2);
|
||||
dpEnum1 = termsEnum1.docsAndPositions(null, dpEnum1, false);
|
||||
dpEnum2 = termsEnum2.docsAndPositions(null, dpEnum2, false);
|
||||
if (dpEnum1 != null) {
|
||||
assertNotNull(dpEnum2);
|
||||
int docID1 = dpEnum1.nextDoc();
|
||||
|
|
|
@ -259,7 +259,7 @@ public class TestTermVectorsReader extends LuceneTestCase {
|
|||
//System.out.println("Term: " + term);
|
||||
assertEquals(testTerms[i], term);
|
||||
|
||||
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
|
||||
dpEnum = termsEnum.docsAndPositions(null, dpEnum, false);
|
||||
assertNotNull(dpEnum);
|
||||
int doc = dpEnum.docID();
|
||||
assertTrue(doc == -1 || doc == DocIdSetIterator.NO_MORE_DOCS);
|
||||
|
@ -270,18 +270,16 @@ public class TestTermVectorsReader extends LuceneTestCase {
|
|||
}
|
||||
assertEquals(DocsEnum.NO_MORE_DOCS, dpEnum.nextDoc());
|
||||
|
||||
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
|
||||
dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
|
||||
doc = dpEnum.docID();
|
||||
assertTrue(doc == -1 || doc == DocIdSetIterator.NO_MORE_DOCS);
|
||||
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
|
||||
assertNotNull(dpEnum);
|
||||
final OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
|
||||
assertNotNull(offsetAtt);
|
||||
assertEquals(dpEnum.freq(), positions[i].length);
|
||||
for (int j = 0; j < positions[i].length; j++) {
|
||||
assertEquals(positions[i][j], dpEnum.nextPosition());
|
||||
assertEquals(j*10, offsetAtt.startOffset());
|
||||
assertEquals(j*10 + testTerms[i].length(), offsetAtt.endOffset());
|
||||
assertEquals(j*10, dpEnum.startOffset());
|
||||
assertEquals(j*10 + testTerms[i].length(), dpEnum.endOffset());
|
||||
}
|
||||
assertEquals(DocsEnum.NO_MORE_DOCS, dpEnum.nextDoc());
|
||||
}
|
||||
|
@ -315,7 +313,7 @@ public class TestTermVectorsReader extends LuceneTestCase {
|
|||
String term = text.utf8ToString();
|
||||
assertEquals(testTerms[i], term);
|
||||
|
||||
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
|
||||
dpEnum = termsEnum.docsAndPositions(null, dpEnum, false);
|
||||
assertNotNull(dpEnum);
|
||||
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
|
||||
assertEquals(dpEnum.freq(), positions[i].length);
|
||||
|
@ -324,16 +322,14 @@ public class TestTermVectorsReader extends LuceneTestCase {
|
|||
}
|
||||
assertEquals(DocsEnum.NO_MORE_DOCS, dpEnum.nextDoc());
|
||||
|
||||
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
|
||||
dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
|
||||
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
|
||||
final OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
|
||||
assertNotNull(offsetAtt);
|
||||
assertNotNull(dpEnum);
|
||||
assertEquals(dpEnum.freq(), positions[i].length);
|
||||
for (int j = 0; j < positions[i].length; j++) {
|
||||
assertEquals(positions[i][j], dpEnum.nextPosition());
|
||||
assertEquals(j*10, offsetAtt.startOffset());
|
||||
assertEquals(j*10 + testTerms[i].length(), offsetAtt.endOffset());
|
||||
assertEquals(j*10, dpEnum.startOffset());
|
||||
assertEquals(j*10 + testTerms[i].length(), dpEnum.endOffset());
|
||||
}
|
||||
assertEquals(DocsEnum.NO_MORE_DOCS, dpEnum.nextDoc());
|
||||
}
|
||||
|
|
|
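The term-vector tests above and below drop the OffsetAttribute lookup in favor of the enum's own accessors. The consuming pattern reduces to roughly the following; termsEnum is assumed to be a term-vector TermsEnum positioned on a term.

// Sketch: reading term-vector offsets directly from the enum (no OffsetAttribute round-trip).
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, true);
dpEnum.nextDoc();
final int freq = dpEnum.freq();
for (int i = 0; i < freq; i++) {
  final int pos = dpEnum.nextPosition();
  final int start = dpEnum.startOffset();
  final int end = dpEnum.endOffset();
  // pos/start/end describe the i-th occurrence in this document
}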
@ -26,7 +26,6 @@ import org.apache.lucene.analysis.MockAnalyzer;
|
|||
import org.apache.lucene.analysis.MockTokenFilter;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.FieldType;
|
||||
|
@ -69,34 +68,30 @@ public class TestTermVectorsWriter extends LuceneTestCase {
|
|||
// Token "" occurred once
|
||||
assertEquals(1, termsEnum.totalTermFreq());
|
||||
|
||||
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
|
||||
OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
|
||||
assertNotNull(offsetAtt);
|
||||
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, true);
|
||||
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
|
||||
dpEnum.nextPosition();
|
||||
assertEquals(8, offsetAtt.startOffset());
|
||||
assertEquals(8, offsetAtt.endOffset());
|
||||
assertEquals(8, dpEnum.startOffset());
|
||||
assertEquals(8, dpEnum.endOffset());
|
||||
assertEquals(DocsEnum.NO_MORE_DOCS, dpEnum.nextDoc());
|
||||
|
||||
// Token "abcd" occurred three times
|
||||
assertEquals(new BytesRef("abcd"), termsEnum.next());
|
||||
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
|
||||
offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
|
||||
assertNotNull(offsetAtt);
|
||||
dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
|
||||
assertEquals(3, termsEnum.totalTermFreq());
|
||||
|
||||
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
|
||||
dpEnum.nextPosition();
|
||||
assertEquals(0, offsetAtt.startOffset());
|
||||
assertEquals(4, offsetAtt.endOffset());
|
||||
assertEquals(0, dpEnum.startOffset());
|
||||
assertEquals(4, dpEnum.endOffset());
|
||||
|
||||
dpEnum.nextPosition();
|
||||
assertEquals(4, offsetAtt.startOffset());
|
||||
assertEquals(8, offsetAtt.endOffset());
|
||||
assertEquals(4, dpEnum.startOffset());
|
||||
assertEquals(8, dpEnum.endOffset());
|
||||
|
||||
dpEnum.nextPosition();
|
||||
assertEquals(8, offsetAtt.startOffset());
|
||||
assertEquals(12, offsetAtt.endOffset());
|
||||
assertEquals(8, dpEnum.startOffset());
|
||||
assertEquals(12, dpEnum.endOffset());
|
||||
|
||||
assertEquals(DocsEnum.NO_MORE_DOCS, dpEnum.nextDoc());
|
||||
assertNull(termsEnum.next());
|
||||
|
@ -122,19 +117,17 @@ public class TestTermVectorsWriter extends LuceneTestCase {
|
|||
IndexReader r = IndexReader.open(dir);
|
||||
TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null);
|
||||
assertNotNull(termsEnum.next());
|
||||
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
|
||||
OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
|
||||
assertNotNull(offsetAtt);
|
||||
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, true);
|
||||
assertEquals(2, termsEnum.totalTermFreq());
|
||||
|
||||
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
|
||||
dpEnum.nextPosition();
|
||||
assertEquals(0, offsetAtt.startOffset());
|
||||
assertEquals(4, offsetAtt.endOffset());
|
||||
assertEquals(0, dpEnum.startOffset());
|
||||
assertEquals(4, dpEnum.endOffset());
|
||||
|
||||
dpEnum.nextPosition();
|
||||
assertEquals(5, offsetAtt.startOffset());
|
||||
assertEquals(9, offsetAtt.endOffset());
|
||||
assertEquals(5, dpEnum.startOffset());
|
||||
assertEquals(9, dpEnum.endOffset());
|
||||
assertEquals(DocsEnum.NO_MORE_DOCS, dpEnum.nextDoc());
|
||||
|
||||
r.close();
|
||||
|
@ -159,19 +152,17 @@ public class TestTermVectorsWriter extends LuceneTestCase {
|
|||
IndexReader r = IndexReader.open(dir);
|
||||
TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null);
|
||||
assertNotNull(termsEnum.next());
|
||||
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
|
||||
OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
|
||||
assertNotNull(offsetAtt);
|
||||
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, true);
|
||||
assertEquals(2, termsEnum.totalTermFreq());
|
||||
|
||||
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
|
||||
dpEnum.nextPosition();
|
||||
assertEquals(0, offsetAtt.startOffset());
|
||||
assertEquals(4, offsetAtt.endOffset());
|
||||
assertEquals(0, dpEnum.startOffset());
|
||||
assertEquals(4, dpEnum.endOffset());
|
||||
|
||||
dpEnum.nextPosition();
|
||||
assertEquals(8, offsetAtt.startOffset());
|
||||
assertEquals(12, offsetAtt.endOffset());
|
||||
assertEquals(8, dpEnum.startOffset());
|
||||
assertEquals(12, dpEnum.endOffset());
|
||||
assertEquals(DocsEnum.NO_MORE_DOCS, dpEnum.nextDoc());
|
||||
|
||||
r.close();
|
||||
|
@ -200,19 +191,17 @@ public class TestTermVectorsWriter extends LuceneTestCase {
|
|||
IndexReader r = IndexReader.open(dir);
|
||||
TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null);
|
||||
assertNotNull(termsEnum.next());
|
||||
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
|
||||
OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
|
||||
assertNotNull(offsetAtt);
|
||||
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, true);
|
||||
assertEquals(2, termsEnum.totalTermFreq());
|
||||
|
||||
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
|
||||
dpEnum.nextPosition();
|
||||
assertEquals(0, offsetAtt.startOffset());
|
||||
assertEquals(4, offsetAtt.endOffset());
|
||||
assertEquals(0, dpEnum.startOffset());
|
||||
assertEquals(4, dpEnum.endOffset());
|
||||
|
||||
dpEnum.nextPosition();
|
||||
assertEquals(8, offsetAtt.startOffset());
|
||||
assertEquals(12, offsetAtt.endOffset());
|
||||
assertEquals(8, dpEnum.startOffset());
|
||||
assertEquals(12, dpEnum.endOffset());
|
||||
assertEquals(DocsEnum.NO_MORE_DOCS, dpEnum.nextDoc());
|
||||
|
||||
r.close();
|
||||
|
@ -238,19 +227,17 @@ public class TestTermVectorsWriter extends LuceneTestCase {
|
|||
IndexReader r = IndexReader.open(dir);
|
||||
TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null);
|
||||
assertNotNull(termsEnum.next());
|
||||
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
|
||||
OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
|
||||
assertNotNull(offsetAtt);
|
||||
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, true);
|
||||
assertEquals(2, termsEnum.totalTermFreq());
|
||||
|
||||
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
|
||||
dpEnum.nextPosition();
|
||||
assertEquals(0, offsetAtt.startOffset());
|
||||
assertEquals(4, offsetAtt.endOffset());
|
||||
assertEquals(0, dpEnum.startOffset());
|
||||
assertEquals(4, dpEnum.endOffset());
|
||||
|
||||
dpEnum.nextPosition();
|
||||
assertEquals(9, offsetAtt.startOffset());
|
||||
assertEquals(13, offsetAtt.endOffset());
|
||||
assertEquals(9, dpEnum.startOffset());
|
||||
assertEquals(13, dpEnum.endOffset());
|
||||
assertEquals(DocsEnum.NO_MORE_DOCS, dpEnum.nextDoc());
|
||||
|
||||
r.close();
|
||||
|
@ -277,32 +264,26 @@ public class TestTermVectorsWriter extends LuceneTestCase {
|
|||
IndexReader r = IndexReader.open(dir);
|
||||
TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null);
|
||||
assertNotNull(termsEnum.next());
|
||||
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
|
||||
OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
|
||||
assertNotNull(offsetAtt);
|
||||
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, true);
|
||||
|
||||
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
|
||||
dpEnum.nextPosition();
|
||||
assertEquals(0, offsetAtt.startOffset());
|
||||
assertEquals(4, offsetAtt.endOffset());
|
||||
assertEquals(0, dpEnum.startOffset());
|
||||
assertEquals(4, dpEnum.endOffset());
|
||||
|
||||
assertNotNull(termsEnum.next());
|
||||
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
|
||||
offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
|
||||
assertNotNull(offsetAtt);
|
||||
dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
|
||||
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
|
||||
dpEnum.nextPosition();
|
||||
assertEquals(11, offsetAtt.startOffset());
|
||||
assertEquals(17, offsetAtt.endOffset());
|
||||
assertEquals(11, dpEnum.startOffset());
|
||||
assertEquals(17, dpEnum.endOffset());
|
||||
|
||||
assertNotNull(termsEnum.next());
|
||||
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
|
||||
offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
|
||||
assertNotNull(offsetAtt);
|
||||
dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
|
||||
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
|
||||
dpEnum.nextPosition();
|
||||
assertEquals(18, offsetAtt.startOffset());
|
||||
assertEquals(21, offsetAtt.endOffset());
|
||||
assertEquals(18, dpEnum.startOffset());
|
||||
assertEquals(21, dpEnum.endOffset());
|
||||
|
||||
r.close();
|
||||
dir.close();
|
||||
|
@ -328,24 +309,20 @@ public class TestTermVectorsWriter extends LuceneTestCase {
IndexReader r = IndexReader.open(dir);
TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null);
assertNotNull(termsEnum.next());
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
assertNotNull(offsetAtt);
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, true);

assertEquals(1, (int) termsEnum.totalTermFreq());
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
dpEnum.nextPosition();
assertEquals(1, offsetAtt.startOffset());
assertEquals(7, offsetAtt.endOffset());
assertEquals(1, dpEnum.startOffset());
assertEquals(7, dpEnum.endOffset());

assertNotNull(termsEnum.next());
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
assertNotNull(offsetAtt);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
dpEnum.nextPosition();
assertEquals(8, offsetAtt.startOffset());
assertEquals(11, offsetAtt.endOffset());
assertEquals(8, dpEnum.startOffset());
assertEquals(11, dpEnum.endOffset());

r.close();
dir.close();

@ -375,24 +352,20 @@ public class TestTermVectorsWriter extends LuceneTestCase {
IndexReader r = IndexReader.open(dir);
TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null);
assertNotNull(termsEnum.next());
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
assertNotNull(offsetAtt);
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, true);

assertEquals(1, (int) termsEnum.totalTermFreq());
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
dpEnum.nextPosition();
assertEquals(0, offsetAtt.startOffset());
assertEquals(4, offsetAtt.endOffset());
assertEquals(0, dpEnum.startOffset());
assertEquals(4, dpEnum.endOffset());

assertNotNull(termsEnum.next());
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
assertNotNull(offsetAtt);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
dpEnum.nextPosition();
assertEquals(6, offsetAtt.startOffset());
assertEquals(12, offsetAtt.endOffset());
assertEquals(6, dpEnum.startOffset());
assertEquals(12, dpEnum.endOffset());


r.close();

@ -17,36 +17,38 @@ package org.apache.lucene.search;
* limitations under the License.
*/

import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TermContext;
import java.io.IOException;
import java.io.Reader;
import java.util.Collection;
import java.util.LinkedList;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CannedAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;

import java.io.IOException;
import java.util.Collection;
import java.util.LinkedList;
import java.io.Reader;
import org.apache.lucene.util.TermContext;

/**
* This class tests the MultiPhraseQuery class.

@ -329,68 +331,18 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
indexStore.close();
}

private static class TokenAndPos {
public final String token;
public final int pos;
public TokenAndPos(String token, int pos) {
this.token = token;
this.pos = pos;
}
}

private static class CannedAnalyzer extends Analyzer {
private final TokenAndPos[] tokens;

public CannedAnalyzer(TokenAndPos[] tokens) {
this.tokens = tokens;
}

@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
return new TokenStreamComponents(new CannedTokenizer(tokens));
}
}

private static class CannedTokenizer extends Tokenizer {
private final TokenAndPos[] tokens;
private int upto = 0;
private int lastPos = 0;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

public CannedTokenizer(TokenAndPos[] tokens) {
this.tokens = tokens;
}

@Override
public final boolean incrementToken() throws IOException {
clearAttributes();
if (upto < tokens.length) {
final TokenAndPos token = tokens[upto++];
termAtt.setEmpty();
termAtt.append(token.token);
posIncrAtt.setPositionIncrement(token.pos - lastPos);
lastPos = token.pos;
return true;
} else {
return false;
}
}

@Override
public void reset() throws IOException {
super.reset();
this.upto = 0;
this.lastPos = 0;
}
}

public void testZeroPosIncr() throws IOException {
Directory dir = new RAMDirectory();
final TokenAndPos[] tokens = new TokenAndPos[3];
tokens[0] = new TokenAndPos("a", 0);
tokens[1] = new TokenAndPos("b", 0);
tokens[2] = new TokenAndPos("c", 0);
final Token[] tokens = new Token[3];
tokens[0] = new Token();
tokens[0].append("a");
tokens[0].setPositionIncrement(1);
tokens[1] = new Token();
tokens[1].append("b");
tokens[1].setPositionIncrement(0);
tokens[2] = new Token();
tokens[2].append("c");
tokens[2].setPositionIncrement(0);

RandomIndexWriter writer = new RandomIndexWriter(random, dir, new CannedAnalyzer(tokens));
Document doc = new Document();

@ -429,40 +381,47 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
dir.close();
}

private final static TokenAndPos[] INCR_0_DOC_TOKENS = new TokenAndPos[] {
new TokenAndPos("x", 0),
new TokenAndPos("a", 1),
new TokenAndPos("1", 1),
new TokenAndPos("m", 2), // not existing, relying on slop=2
new TokenAndPos("b", 3),
new TokenAndPos("1", 3),
new TokenAndPos("n", 4), // not existing, relying on slop=2
new TokenAndPos("c", 5),
new TokenAndPos("y", 6)
private static Token makeToken(String text, int posIncr) {
final Token t = new Token();
t.append(text);
t.setPositionIncrement(posIncr);
return t;
}

private final static Token[] INCR_0_DOC_TOKENS = new Token[] {
makeToken("x", 1),
makeToken("a", 1),
makeToken("1", 0),
makeToken("m", 1), // not existing, relying on slop=2
makeToken("b", 1),
makeToken("1", 0),
makeToken("n", 1), // not existing, relying on slop=2
makeToken("c", 1),
makeToken("y", 1)
};

private final static TokenAndPos[] INCR_0_QUERY_TOKENS_AND = new TokenAndPos[] {
new TokenAndPos("a", 0),
new TokenAndPos("1", 0),
new TokenAndPos("b", 1),
new TokenAndPos("1", 1),
new TokenAndPos("c", 2)
private final static Token[] INCR_0_QUERY_TOKENS_AND = new Token[] {
makeToken("a", 1),
makeToken("1", 0),
makeToken("b", 1),
makeToken("1", 0),
makeToken("c", 1)
};

private final static TokenAndPos[][] INCR_0_QUERY_TOKENS_AND_OR_MATCH = new TokenAndPos[][] {
{ new TokenAndPos("a", 0) },
{ new TokenAndPos("x", 0), new TokenAndPos("1", 0) },
{ new TokenAndPos("b", 1) },
{ new TokenAndPos("x", 1), new TokenAndPos("1", 1) },
{ new TokenAndPos("c", 2) }
private final static Token[][] INCR_0_QUERY_TOKENS_AND_OR_MATCH = new Token[][] {
{ makeToken("a", 1) },
{ makeToken("x", 1), makeToken("1", 0) },
{ makeToken("b", 2) },
{ makeToken("x", 2), makeToken("1", 0) },
{ makeToken("c", 3) }
};

private final static TokenAndPos[][] INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN = new TokenAndPos[][] {
{ new TokenAndPos("x", 0) },
{ new TokenAndPos("a", 0), new TokenAndPos("1", 0) },
{ new TokenAndPos("x", 1) },
{ new TokenAndPos("b", 1), new TokenAndPos("1", 1) },
{ new TokenAndPos("c", 2) }
private final static Token[][] INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN = new Token[][] {
{ makeToken("x", 1) },
{ makeToken("a", 1), makeToken("1", 0) },
{ makeToken("x", 2) },
{ makeToken("b", 2), makeToken("1", 0) },
{ makeToken("c", 3) }
};

/**

@ -515,8 +474,10 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
*/
public void testZeroPosIncrSloppyPqAnd() throws IOException {
final PhraseQuery pq = new PhraseQuery();
for (TokenAndPos tap : INCR_0_QUERY_TOKENS_AND) {
pq.add(new Term("field",tap.token), tap.pos);
int pos = -1;
for (Token tap : INCR_0_QUERY_TOKENS_AND) {
pos += tap.getPositionIncrement();
pq.add(new Term("field",tap.toString()), pos);
}
doTestZeroPosIncrSloppy(pq, 0);
pq.setSlop(1);

@ -530,8 +491,10 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
*/
public void testZeroPosIncrSloppyMpqAnd() throws IOException {
final MultiPhraseQuery mpq = new MultiPhraseQuery();
for (TokenAndPos tap : INCR_0_QUERY_TOKENS_AND) {
mpq.add(new Term[]{new Term("field",tap.token)}, tap.pos); //AND logic
int pos = -1;
for (Token tap : INCR_0_QUERY_TOKENS_AND) {
pos += tap.getPositionIncrement();
mpq.add(new Term[]{new Term("field",tap.toString())}, pos); //AND logic
}
doTestZeroPosIncrSloppy(mpq, 0);
mpq.setSlop(1);

@ -545,9 +508,9 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
*/
public void testZeroPosIncrSloppyMpqAndOrMatch() throws IOException {
final MultiPhraseQuery mpq = new MultiPhraseQuery();
for (TokenAndPos tap[] : INCR_0_QUERY_TOKENS_AND_OR_MATCH) {
for (Token tap[] : INCR_0_QUERY_TOKENS_AND_OR_MATCH) {
Term[] terms = tapTerms(tap);
final int pos = tap[0].pos;
final int pos = tap[0].getPositionIncrement()-1;
mpq.add(terms, pos); //AND logic in pos, OR across lines
}
doTestZeroPosIncrSloppy(mpq, 0);

@ -562,9 +525,9 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
*/
public void testZeroPosIncrSloppyMpqAndOrNoMatch() throws IOException {
final MultiPhraseQuery mpq = new MultiPhraseQuery();
for (TokenAndPos tap[] : INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN) {
for (Token tap[] : INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN) {
Term[] terms = tapTerms(tap);
final int pos = tap[0].pos;
final int pos = tap[0].getPositionIncrement()-1;
mpq.add(terms, pos); //AND logic in pos, OR across lines
}
doTestZeroPosIncrSloppy(mpq, 0);

@ -572,10 +535,10 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
doTestZeroPosIncrSloppy(mpq, 0);
}

private Term[] tapTerms(TokenAndPos[] tap) {
private Term[] tapTerms(Token[] tap) {
Term[] terms = new Term[tap.length];
for (int i=0; i<terms.length; i++) {
terms[i] = new Term("field",tap[i].token);
terms[i] = new Term("field",tap[i].toString());
}
return terms;
}

@ -42,8 +42,6 @@ import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.BytesRef;

/**

@ -102,7 +100,8 @@ public class TestPositionIncrement extends LuceneTestCase {
DocsAndPositionsEnum pos = MultiFields.getTermPositionsEnum(searcher.getIndexReader(),
MultiFields.getLiveDocs(searcher.getIndexReader()),
"field",
new BytesRef("1"));
new BytesRef("1"),
false);
pos.nextDoc();
// first token should be at position 0
assertEquals(0, pos.nextPosition());

@ -110,7 +109,8 @@ public class TestPositionIncrement extends LuceneTestCase {
pos = MultiFields.getTermPositionsEnum(searcher.getIndexReader(),
MultiFields.getLiveDocs(searcher.getIndexReader()),
"field",
new BytesRef("2"));
new BytesRef("2"),
false);
pos.nextDoc();
// second token should be at position 2
assertEquals(2, pos.nextPosition());

@ -200,10 +200,6 @@ public class TestPositionIncrement extends LuceneTestCase {
store.close();
}

// stoplist that accepts case-insensitive "stop"
private static final CharacterRunAutomaton stopStopList =
new CharacterRunAutomaton(new RegExp("[sS][tT][oO][pP]").toAutomaton());

public void testPayloadsPos0() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random, dir, new MockPayloadAnalyzer());

@ -217,7 +213,8 @@ public class TestPositionIncrement extends LuceneTestCase {

DocsAndPositionsEnum tp = r.termPositionsEnum(r.getLiveDocs(),
"content",
new BytesRef("a"));
new BytesRef("a"),
false);

int count = 0;
assertTrue(tp.nextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS);

@ -23,7 +23,6 @@ import java.util.Map;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;

@ -135,19 +134,19 @@ public class TestTermVectors extends LuceneTestCase {
assertNotNull(terms);
TermsEnum termsEnum = terms.iterator(null);
assertEquals("content", termsEnum.next().utf8ToString());
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, false);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
assertEquals(1, dpEnum.freq());
assertEquals(expectedPositions[0], dpEnum.nextPosition());

assertEquals("here", termsEnum.next().utf8ToString());
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, false);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
assertEquals(1, dpEnum.freq());
assertEquals(expectedPositions[1], dpEnum.nextPosition());

assertEquals("some", termsEnum.next().utf8ToString());
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, false);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
assertEquals(1, dpEnum.freq());
assertEquals(expectedPositions[2], dpEnum.nextPosition());

@ -171,31 +170,21 @@ public class TestTermVectors extends LuceneTestCase {

TermsEnum termsEnum = vectors.terms("field").iterator(null);
assertNotNull(termsEnum.next());
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
OffsetAttribute offsetAtt = dpEnum == null ? null : dpEnum.attributes().getAttribute(OffsetAttribute.class);

boolean shouldBePosVector = hits[i].doc % 2 == 0;
assertTrue(!shouldBePosVector
|| (shouldBePosVector && dpEnum != null));

boolean shouldBeOffVector = hits[i].doc % 3 == 0;
assertTrue(!shouldBeOffVector
|| (shouldBeOffVector && offsetAtt != null));

if (shouldBePosVector || shouldBeOffVector) {
while(true) {
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, shouldBeOffVector);
assertNotNull(dpEnum);
offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);

dpEnum.nextPosition();

if (shouldBePosVector) {
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
}

if (shouldBeOffVector) {
assertNotNull(offsetAtt);
} else {
assertNull(offsetAtt);
assertTrue(dpEnum.startOffset() != -1);
assertTrue(dpEnum.endOffset() != -1);
}

if (termsEnum.next() == null) {

@ -437,7 +426,7 @@ public class TestTermVectors extends LuceneTestCase {
assertNotNull(termsEnum.next());
assertEquals("one", termsEnum.term().utf8ToString());
assertEquals(5, termsEnum.totalTermFreq());
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, false);
assertNotNull(dpEnum);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
assertEquals(5, dpEnum.freq());

@ -445,16 +434,14 @@ public class TestTermVectors extends LuceneTestCase {
assertEquals(i, dpEnum.nextPosition());
}

dpEnum = termsEnum.docsAndPositions(null, dpEnum);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, true);
assertNotNull(dpEnum);
OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
assertNotNull(offsetAtt);
assertTrue(dpEnum.nextDoc() != DocsEnum.NO_MORE_DOCS);
assertEquals(5, dpEnum.freq());
for(int i=0;i<5;i++) {
dpEnum.nextPosition();
assertEquals(4*i, offsetAtt.startOffset());
assertEquals(4*i+3, offsetAtt.endOffset());
assertEquals(4*i, dpEnum.startOffset());
assertEquals(4*i+3, dpEnum.endOffset());
}
reader.close();
}

@ -283,7 +283,8 @@ public class TestClassicAnalyzer extends BaseTokenStreamTestCase {
DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(reader,
MultiFields.getLiveDocs(reader),
"content",
new BytesRef("another"));
new BytesRef("another"),
false);
assertTrue(tps.nextDoc() != DocsEnum.NO_MORE_DOCS);
assertEquals(1, tps.freq());
assertEquals(3, tps.nextPosition());

@ -24,7 +24,6 @@ import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

@ -110,16 +109,15 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
TermsEnum termsEnum = vector.iterator(null);
termsEnum.next();
assertEquals(2, termsEnum.totalTermFreq());
DocsAndPositionsEnum positions = termsEnum.docsAndPositions(null, null);
OffsetAttribute offsetAtt = positions.attributes().getAttribute(OffsetAttribute.class);
DocsAndPositionsEnum positions = termsEnum.docsAndPositions(null, null, true);
assertTrue(positions.nextDoc() != DocsEnum.NO_MORE_DOCS);
assertEquals(2, positions.freq());
positions.nextPosition();
assertEquals(0, offsetAtt.startOffset());
assertEquals(4, offsetAtt.endOffset());
assertEquals(0, positions.startOffset());
assertEquals(4, positions.endOffset());
positions.nextPosition();
assertEquals(8, offsetAtt.startOffset());
assertEquals(12, offsetAtt.endOffset());
assertEquals(8, positions.startOffset());
assertEquals(12, positions.endOffset());
assertEquals(DocsEnum.NO_MORE_DOCS, positions.nextDoc());
r.close();
dir.close();

@ -53,7 +53,7 @@ public class PayloadIterator {
this.buffer = buffer;
// TODO (Facet): avoid Multi*?
Bits liveDocs = MultiFields.getLiveDocs(indexReader);
this.tp = MultiFields.getTermPositionsEnum(indexReader, liveDocs, term.field(), term.bytes());
this.tp = MultiFields.getTermPositionsEnum(indexReader, liveDocs, term.field(), term.bytes(), false);
}

/**

@ -104,7 +104,8 @@ class ParentArray {
// TODO (Facet): avoid Multi*?
Bits liveDocs = MultiFields.getLiveDocs(indexReader);
DocsAndPositionsEnum positions = MultiFields.getTermPositionsEnum(indexReader, liveDocs,
Consts.FIELD_PAYLOADS, new BytesRef(Consts.PAYLOAD_PARENT));
Consts.FIELD_PAYLOADS, new BytesRef(Consts.PAYLOAD_PARENT),
false);
if ((positions == null || positions.advance(first) == DocsAndPositionsEnum.NO_MORE_DOCS) && first < num) {
throw new CorruptIndexException("Missing parent data for category " + first);
}

@ -8,7 +8,6 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Fields;

@ -283,18 +282,17 @@ public class TermVectorComponent extends SearchComponent implements SolrCoreAwar
termInfo.add("tf", freq);
}

dpEnum = termsEnum.docsAndPositions(null, dpEnum);
dpEnum = termsEnum.docsAndPositions(null, dpEnum, fieldOptions.offsets);
boolean useOffsets = fieldOptions.offsets;
if (dpEnum == null) {
useOffsets = false;
dpEnum = termsEnum.docsAndPositions(null, dpEnum, false);
}

boolean usePositions = false;
boolean useOffsets = false;
OffsetAttribute offsetAtt = null;
if (dpEnum != null) {
dpEnum.nextDoc();
usePositions = fieldOptions.positions;
if (fieldOptions.offsets && dpEnum.attributes().hasAttribute(OffsetAttribute.class)) {
useOffsets = true;
offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
}
}

NamedList<Number> theOffsets = null;

@ -317,8 +315,8 @@ public class TermVectorComponent extends SearchComponent implements SolrCoreAwar
}

if (theOffsets != null) {
theOffsets.add("start", offsetAtt.startOffset());
theOffsets.add("end", offsetAtt.endOffset());
theOffsets.add("start", dpEnum.startOffset());
theOffsets.add("end", dpEnum.endOffset());
}
}
}