mirror of https://github.com/apache/lucene.git
LUCENE-4031: support offsets in Pulsing
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1334448 13f79535-47bb-0310-9956-ffa450edef68
parent 6979137588
commit 7c24a0367e
PulsingPostingsReader.java

@@ -148,7 +148,7 @@ public class PulsingPostingsReader extends PostingsReaderBase {
     PulsingTermState termState = (PulsingTermState) _termState;

     // if we have positions, its total TF, otherwise its computed based on docFreq.
-    long count = fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS ? termState.totalTermFreq : termState.docFreq;
+    long count = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 ? termState.totalTermFreq : termState.docFreq;
     //System.out.println(" count=" + count + " threshold=" + maxPositions);

     if (count <= maxPositions) {

@@ -217,7 +217,11 @@ public class PulsingPostingsReader extends PostingsReaderBase {
   @Override
   public DocsAndPositionsEnum docsAndPositions(FieldInfo field, BlockTermState _termState, Bits liveDocs, DocsAndPositionsEnum reuse,
                                                boolean needsOffsets) throws IOException {
     //System.out.println("D&P: field=" + field.name);
     if (field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
       return null;
+    } else if (needsOffsets && field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) < 0) {
+      return null;
     }

     final PulsingTermState termState = (PulsingTermState) _termState;

@@ -258,6 +262,7 @@ public class PulsingPostingsReader extends PostingsReaderBase {
     private final ByteArrayDataInput postings = new ByteArrayDataInput();
     private final IndexOptions indexOptions;
     private final boolean storePayloads;
+    private final boolean storeOffsets;
     private Bits liveDocs;
     private int docID = -1;
     private int accum;

@@ -267,6 +272,7 @@ public class PulsingPostingsReader extends PostingsReaderBase {
     public PulsingDocsEnum(FieldInfo fieldInfo) {
       indexOptions = fieldInfo.indexOptions;
       storePayloads = fieldInfo.storePayloads;
+      storeOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
     }

     public PulsingDocsEnum reset(Bits liveDocs, PulsingTermState termState) {

@@ -314,7 +320,7 @@ public class PulsingPostingsReader extends PostingsReaderBase {
           freq = postings.readVInt(); // else read freq
         }

-        if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
+        if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
           // Skip positions
           if (storePayloads) {
             for(int pos=0;pos<freq;pos++) {

@@ -322,6 +328,10 @@ public class PulsingPostingsReader extends PostingsReaderBase {
               if ((posCode & 1) != 0) {
                 payloadLength = postings.readVInt();
               }
+              if (storeOffsets && (postings.readVInt() & 1) != 0) {
+                // new offset length
+                postings.readVInt();
+              }
               if (payloadLength != 0) {
                 postings.skipBytes(payloadLength);
               }

@@ -330,6 +340,10 @@ public class PulsingPostingsReader extends PostingsReaderBase {
             for(int pos=0;pos<freq;pos++) {
               // TODO: skipVInt
               postings.readVInt();
+              if (storeOffsets && (postings.readVInt() & 1) != 0) {
+                // new offset length
+                postings.readVInt();
+              }
             }
           }
         }

@@ -367,6 +381,10 @@ public class PulsingPostingsReader extends PostingsReaderBase {
     private byte[] postingsBytes;
     private final ByteArrayDataInput postings = new ByteArrayDataInput();
     private final boolean storePayloads;
+    private final boolean storeOffsets;
+    // note: we could actually reuse across different options, if we passed this to reset()
+    // and re-init'ed storeOffsets accordingly (made it non-final)
+    private final IndexOptions indexOptions;

     private Bits liveDocs;
     private int docID = -1;

@@ -376,15 +394,19 @@ public class PulsingPostingsReader extends PostingsReaderBase {
     private int position;
     private int payloadLength;
     private BytesRef payload;
+    private int startOffset;
+    private int offsetLength;

     private boolean payloadRetrieved;

     public PulsingDocsAndPositionsEnum(FieldInfo fieldInfo) {
+      indexOptions = fieldInfo.indexOptions;
       storePayloads = fieldInfo.storePayloads;
+      storeOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
     }

     boolean canReuse(FieldInfo fieldInfo) {
-      return storePayloads == fieldInfo.storePayloads;
+      return indexOptions == fieldInfo.indexOptions && storePayloads == fieldInfo.storePayloads;
     }

     public PulsingDocsAndPositionsEnum reset(Bits liveDocs, PulsingTermState termState) {

@@ -401,6 +423,8 @@ public class PulsingPostingsReader extends PostingsReaderBase {
       posPending = 0;
       docID = -1;
       accum = 0;
+      startOffset = storeOffsets ? 0 : -1; // always return -1 if no offsets are stored
+      offsetLength = 0;
       //System.out.println("PR d&p reset storesPayloads=" + storePayloads + " bytes=" + bytes.length + " this=" + this);
       return this;
     }

@@ -427,6 +451,7 @@ public class PulsingPostingsReader extends PostingsReaderBase {
           freq = postings.readVInt(); // else read freq
         }
         posPending = freq;
+        startOffset = storeOffsets ? 0 : -1; // always return -1 if no offsets are stored

         if (liveDocs == null || liveDocs.get(accum)) {
           //System.out.println(" return docID=" + docID + " freq=" + freq);

@@ -480,6 +505,15 @@ public class PulsingPostingsReader extends PostingsReaderBase {
         } else {
           position += postings.readVInt();
         }
+
+        if (storeOffsets) {
+          int offsetCode = postings.readVInt();
+          if ((offsetCode & 1) != 0) {
+            // new offset length
+            offsetLength = postings.readVInt();
+          }
+          startOffset += offsetCode >>> 1;
+        }

         //System.out.println("PR d&p nextPos return pos=" + position + " this=" + this);
         return position;

@@ -487,12 +521,12 @@ public class PulsingPostingsReader extends PostingsReaderBase {
     @Override
     public int startOffset() {
-      return -1;
+      return startOffset;
     }

     @Override
     public int endOffset() {
-      return -1;
+      return startOffset + offsetLength;
     }

     private void skipPositions() throws IOException {
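Throughout the reader above (and the writer below), equality tests against IndexOptions are replaced with compareTo checks so that a field indexed with offsets still counts as "has positions". The following standalone sketch is only an illustration of why the ordinal comparison works (IndexOptionsSketch, hasPositions and hasOffsets are made-up names, not Lucene API); it assumes the constants are declared in increasing order of indexed detail, as they are in FieldInfo.IndexOptions.

// Illustrative sketch, not Lucene source: compareTo(...) >= 0 means
// "indexes at least this much", so the OFFSETS option also passes a
// "has positions" test, which a plain == comparison would fail.
enum IndexOptionsSketch {
  DOCS_ONLY,
  DOCS_AND_FREQS,
  DOCS_AND_FREQS_AND_POSITIONS,
  DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;

  boolean hasPositions() {
    return compareTo(DOCS_AND_FREQS_AND_POSITIONS) >= 0;
  }

  boolean hasOffsets() {
    return compareTo(DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
  }

  public static void main(String[] args) {
    System.out.println(DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.hasPositions()); // true
    System.out.println(DOCS_AND_FREQS_AND_POSITIONS.hasOffsets());               // false
  }
}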
PulsingPostingsWriter.java

@@ -79,6 +79,8 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
     int termFreq; // only incremented on first position for a given doc
     int pos;
     int docID;
+    int startOffset;
+    int endOffset;
   }

   // TODO: -- lazy init this? ie, if every single term

@@ -123,9 +125,6 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
   @Override
   public void setField(FieldInfo fieldInfo) {
     this.indexOptions = fieldInfo.indexOptions;
-    if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
-      throw new UnsupportedOperationException("this codec cannot index offsets: " + indexOptions);
-    }
     if (DEBUG) System.out.println("PW field=" + fieldInfo.name + " indexOptions=" + indexOptions);
     storePayloads = fieldInfo.storePayloads;
     wrappedPostingsWriter.setField(fieldInfo);

@@ -186,11 +185,13 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
     if (pendingCount == -1) {
       // We've already seen too many docs for this term --
       // just forward to our fallback writer
-      wrappedPostingsWriter.addPosition(position, payload, -1, -1);
+      wrappedPostingsWriter.addPosition(position, payload, startOffset, endOffset);
     } else {
       // buffer up
       final Position pos = pending[pendingCount++];
       pos.pos = position;
+      pos.startOffset = startOffset;
+      pos.endOffset = endOffset;
       pos.docID = currentDoc.docID;
       if (payload != null && payload.length > 0) {
         if (pos.payload == null) {

@@ -240,10 +241,11 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
     // given codec wants to store other interesting
     // stuff, it could use this pulsing codec to do so

-    if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
+    if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
       int lastDocID = 0;
       int pendingIDX = 0;
       int lastPayloadLength = -1;
+      int lastOffsetLength = -1;
       while(pendingIDX < pendingCount) {
         final Position doc = pending[pendingIDX];

@@ -260,14 +262,15 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
         }

         int lastPos = 0;
+        int lastOffset = 0;
         for(int posIDX=0;posIDX<doc.termFreq;posIDX++) {
           final Position pos = pending[pendingIDX++];
           assert pos.docID == doc.docID;
           final int posDelta = pos.pos - lastPos;
           lastPos = pos.pos;
           if (DEBUG) System.out.println(" write pos=" + pos.pos);
+          final int payloadLength = pos.payload == null ? 0 : pos.payload.length;
           if (storePayloads) {
-            final int payloadLength = pos.payload == null ? 0 : pos.payload.length;
             if (payloadLength != lastPayloadLength) {
               buffer.writeVInt((posDelta << 1)|1);
               buffer.writeVInt(payloadLength);

@@ -275,12 +278,28 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
             } else {
               buffer.writeVInt(posDelta << 1);
             }
-            if (payloadLength > 0) {
-              buffer.writeBytes(pos.payload.bytes, 0, pos.payload.length);
-            }
           } else {
             buffer.writeVInt(posDelta);
           }
+
+          if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
+            //System.out.println("write=" + pos.startOffset + "," + pos.endOffset);
+            int offsetDelta = pos.startOffset - lastOffset;
+            int offsetLength = pos.endOffset - pos.startOffset;
+            if (offsetLength != lastOffsetLength) {
+              buffer.writeVInt(offsetDelta << 1 | 1);
+              buffer.writeVInt(offsetLength);
+            } else {
+              buffer.writeVInt(offsetDelta << 1);
+            }
+            lastOffset = pos.startOffset;
+            lastOffsetLength = offsetLength;
+          }
+
+          if (payloadLength > 0) {
+            assert storePayloads;
+            buffer.writeBytes(pos.payload.bytes, 0, pos.payload.length);
+          }
         }
       }
     } else if (indexOptions == IndexOptions.DOCS_AND_FREQS) {

@@ -387,7 +406,7 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
             wrappedPostingsWriter.startDoc(doc.docID, doc.termFreq);
           }
           if (DEBUG) System.out.println("PW: wrapped.addPos pos=" + pos.pos);
-          wrappedPostingsWriter.addPosition(pos.pos, pos.payload, -1, -1);
+          wrappedPostingsWriter.addPosition(pos.pos, pos.payload, pos.startOffset, pos.endOffset);
         }
         //wrappedPostingsWriter.finishDoc();
       } else {
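The writer hunk above and the reader's nextPosition() hunk earlier agree on a small per-position offset encoding: one VInt whose low bit flags a changed offset length and whose remaining bits carry the startOffset delta, followed by the new length only when it changed. Below is a minimal standalone round-trip sketch of that scheme, not the actual codec code: the class and method names are illustrative, and a List<Integer> stands in for Lucene's VInt-encoded byte buffer.

import java.util.ArrayList;
import java.util.List;

// Sketch of the pulsing offset encoding shown in the diff (assumed names, not Lucene API).
class PulsingOffsetCodecSketch {

  static List<Integer> encode(int[][] offsets) { // each element: {startOffset, endOffset}
    List<Integer> out = new ArrayList<>();
    int lastOffset = 0;
    int lastOffsetLength = -1;
    for (int[] o : offsets) {
      int offsetDelta = o[0] - lastOffset;
      int offsetLength = o[1] - o[0];
      if (offsetLength != lastOffsetLength) {
        out.add(offsetDelta << 1 | 1);   // length changed: set the low bit...
        out.add(offsetLength);           // ...and write the new length
      } else {
        out.add(offsetDelta << 1);
      }
      lastOffset = o[0];
      lastOffsetLength = offsetLength;
    }
    return out;
  }

  static int[][] decode(List<Integer> in, int count) {
    int[][] out = new int[count][2];
    int startOffset = 0;
    int offsetLength = 0;
    int p = 0;
    for (int i = 0; i < count; i++) {
      int offsetCode = in.get(p++);
      if ((offsetCode & 1) != 0) {
        offsetLength = in.get(p++);      // new offset length
      }
      startOffset += offsetCode >>> 1;   // accumulate the startOffset delta
      out[i][0] = startOffset;
      out[i][1] = startOffset + offsetLength;
    }
    return out;
  }

  public static void main(String[] args) {
    // offsets mirror the test's makeToken("a", 1, 0, 6) / makeToken("b", 1, 8, 9) style
    int[][] original = { {0, 6}, {8, 9}, {9, 17} };
    int[][] roundTrip = decode(encode(original), original.length);
    System.out.println(java.util.Arrays.deepEquals(original, roundTrip)); // true
  }
}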
TestPostingsOffsets.java

@@ -30,6 +30,8 @@ import org.apache.lucene.analysis.Token;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
 import org.apache.lucene.codecs.memory.MemoryPostingsFormat;
+import org.apache.lucene.codecs.nestedpulsing.NestedPulsingPostingsFormat;
+import org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;

@@ -56,11 +58,13 @@ public class TestPostingsOffsets extends LuceneTestCase {
     iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));

     if (Codec.getDefault().getName().equals("Lucene40")) {
-      // pulsing etc are not implemented
-      if (random().nextBoolean()) {
-        iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
-      } else {
-        iwc.setCodec(_TestUtil.alwaysPostingsFormat(new MemoryPostingsFormat()));
+      // sep etc are not implemented
+      switch(random().nextInt(4)) {
+        case 0: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat())); break;
+        case 1: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new MemoryPostingsFormat())); break;
+        case 2: iwc.setCodec(_TestUtil.alwaysPostingsFormat(
+                    new Pulsing40PostingsFormat(_TestUtil.nextInt(random(), 1, 3)))); break;
+        case 3: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new NestedPulsingPostingsFormat())); break;
       }
     }
   }

@@ -73,6 +77,11 @@ public class TestPostingsOffsets extends LuceneTestCase {

     FieldType ft = new FieldType(TextField.TYPE_UNSTORED);
     ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    if (random().nextBoolean()) {
+      ft.setStoreTermVectors(true);
+      ft.setStoreTermVectorPositions(random().nextBoolean());
+      ft.setStoreTermVectorOffsets(random().nextBoolean());
+    }
     Token[] tokens = new Token[] {
       makeToken("a", 1, 0, 6),
       makeToken("b", 1, 8, 9),

@@ -132,11 +141,13 @@ public class TestPostingsOffsets extends LuceneTestCase {
     Analyzer analyzer = withPayloads ? new MockPayloadAnalyzer() : new MockAnalyzer(random());
     iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
     if (Codec.getDefault().getName().equals("Lucene40")) {
-      // pulsing etc are not implemented
-      if (random().nextBoolean()) {
-        iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
-      } else {
-        iwc.setCodec(_TestUtil.alwaysPostingsFormat(new MemoryPostingsFormat()));
+      // sep etc are not implemented
+      switch(random().nextInt(4)) {
+        case 0: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat())); break;
+        case 1: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new MemoryPostingsFormat())); break;
+        case 2: iwc.setCodec(_TestUtil.alwaysPostingsFormat(
+                    new Pulsing40PostingsFormat(_TestUtil.nextInt(random(), 1, 3)))); break;
+        case 3: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new NestedPulsingPostingsFormat())); break;
       }
     }
     iwc.setMergePolicy(newLogMergePolicy()); // will rely on docids a bit for skipping
_TestUtil.java

@@ -574,6 +574,12 @@ public class _TestUtil {
    * default codecs and formats, but always writes in the specified
    * format. */
   public static Codec alwaysPostingsFormat(final PostingsFormat format) {
+    // TODO: we really need for postings impls etc to announce themselves
+    // (and maybe their params, too) to infostream on flush and merge.
+    // otherwise in a real debugging situation we won't know whats going on!
+    if (LuceneTestCase.VERBOSE) {
+      System.out.println("forcing postings format to:" + format);
+    }
     return new Lucene40Codec() {
       @Override
       public PostingsFormat getPostingsFormatForField(String field) {