LUCENE-4031: support offsets in Pulsing

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1334448 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-05-05 16:53:16 +00:00
parent 6979137588
commit 7c24a0367e
4 changed files with 96 additions and 26 deletions

View File

@ -148,7 +148,7 @@ public class PulsingPostingsReader extends PostingsReaderBase {
PulsingTermState termState = (PulsingTermState) _termState;
// if we have positions, it's the total TF; otherwise it's computed based on docFreq.
long count = fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS ? termState.totalTermFreq : termState.docFreq;
long count = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 ? termState.totalTermFreq : termState.docFreq;
//System.out.println(" count=" + count + " threshold=" + maxPositions);
if (count <= maxPositions) {
@ -217,7 +217,11 @@ public class PulsingPostingsReader extends PostingsReaderBase {
@Override
public DocsAndPositionsEnum docsAndPositions(FieldInfo field, BlockTermState _termState, Bits liveDocs, DocsAndPositionsEnum reuse,
boolean needsOffsets) throws IOException {
//System.out.println("D&P: field=" + field.name);
if (field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
return null;
} else if (needsOffsets && field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) < 0) {
return null;
}
final PulsingTermState termState = (PulsingTermState) _termState;
@ -258,6 +262,7 @@ public class PulsingPostingsReader extends PostingsReaderBase {
private final ByteArrayDataInput postings = new ByteArrayDataInput();
private final IndexOptions indexOptions;
private final boolean storePayloads;
private final boolean storeOffsets;
private Bits liveDocs;
private int docID = -1;
private int accum;
@ -267,6 +272,7 @@ public class PulsingPostingsReader extends PostingsReaderBase {
// Sole constructor: snapshots the per-field index settings this enum needs
// to decode the term's inlined postings bytes.
public PulsingDocsEnum(FieldInfo fieldInfo) {
indexOptions = fieldInfo.indexOptions;
storePayloads = fieldInfo.storePayloads;
// offsets are encoded in the postings stream only when the field's index
// options include them (DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS or higher)
storeOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
}
public PulsingDocsEnum reset(Bits liveDocs, PulsingTermState termState) {
@ -314,7 +320,7 @@ public class PulsingPostingsReader extends PostingsReaderBase {
freq = postings.readVInt(); // else read freq
}
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
// Skip positions
if (storePayloads) {
for(int pos=0;pos<freq;pos++) {
@ -322,6 +328,10 @@ public class PulsingPostingsReader extends PostingsReaderBase {
if ((posCode & 1) != 0) {
payloadLength = postings.readVInt();
}
if (storeOffsets && (postings.readVInt() & 1) != 0) {
// new offset length
postings.readVInt();
}
if (payloadLength != 0) {
postings.skipBytes(payloadLength);
}
@ -330,6 +340,10 @@ public class PulsingPostingsReader extends PostingsReaderBase {
for(int pos=0;pos<freq;pos++) {
// TODO: skipVInt
postings.readVInt();
if (storeOffsets && (postings.readVInt() & 1) != 0) {
// new offset length
postings.readVInt();
}
}
}
}
@ -367,6 +381,10 @@ public class PulsingPostingsReader extends PostingsReaderBase {
private byte[] postingsBytes;
private final ByteArrayDataInput postings = new ByteArrayDataInput();
private final boolean storePayloads;
private final boolean storeOffsets;
// note: we could actually reuse across different options, if we passed this to reset()
// and re-init'ed storeOffsets accordingly (made it non-final)
private final IndexOptions indexOptions;
private Bits liveDocs;
private int docID = -1;
@ -376,15 +394,19 @@ public class PulsingPostingsReader extends PostingsReaderBase {
private int position;
private int payloadLength;
private BytesRef payload;
private int startOffset;
private int offsetLength;
private boolean payloadRetrieved;
// Sole constructor: captures the field's index options and payload/offset
// flags so nextPosition() knows which extra vInts to read per position.
public PulsingDocsAndPositionsEnum(FieldInfo fieldInfo) {
indexOptions = fieldInfo.indexOptions;
storePayloads = fieldInfo.storePayloads;
// true only when the field indexes offsets; controls decoding of the
// offsetCode/offsetLength vInts interleaved with each position
storeOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
}
boolean canReuse(FieldInfo fieldInfo) {
return storePayloads == fieldInfo.storePayloads;
return indexOptions == fieldInfo.indexOptions && storePayloads == fieldInfo.storePayloads;
}
public PulsingDocsAndPositionsEnum reset(Bits liveDocs, PulsingTermState termState) {
@ -401,6 +423,8 @@ public class PulsingPostingsReader extends PostingsReaderBase {
posPending = 0;
docID = -1;
accum = 0;
startOffset = storeOffsets ? 0 : -1; // always return -1 if no offsets are stored
offsetLength = 0;
//System.out.println("PR d&p reset storesPayloads=" + storePayloads + " bytes=" + bytes.length + " this=" + this);
return this;
}
@ -427,6 +451,7 @@ public class PulsingPostingsReader extends PostingsReaderBase {
freq = postings.readVInt(); // else read freq
}
posPending = freq;
startOffset = storeOffsets ? 0 : -1; // always return -1 if no offsets are stored
if (liveDocs == null || liveDocs.get(accum)) {
//System.out.println(" return docID=" + docID + " freq=" + freq);
@ -480,6 +505,15 @@ public class PulsingPostingsReader extends PostingsReaderBase {
} else {
position += postings.readVInt();
}
if (storeOffsets) {
int offsetCode = postings.readVInt();
if ((offsetCode & 1) != 0) {
// new offset length
offsetLength = postings.readVInt();
}
startOffset += offsetCode >>> 1;
}
//System.out.println("PR d&p nextPos return pos=" + position + " this=" + this);
return position;
@ -487,12 +521,12 @@ public class PulsingPostingsReader extends PostingsReaderBase {
@Override
public int startOffset() {
return -1;
return startOffset;
}
@Override
public int endOffset() {
return -1;
return startOffset + offsetLength;
}
private void skipPositions() throws IOException {

View File

@ -79,6 +79,8 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
int termFreq; // only incremented on first position for a given doc
int pos;
int docID;
int startOffset;
int endOffset;
}
// TODO: -- lazy init this? ie, if every single term
@ -123,9 +125,6 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
@Override
public void setField(FieldInfo fieldInfo) {
this.indexOptions = fieldInfo.indexOptions;
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
throw new UnsupportedOperationException("this codec cannot index offsets: " + indexOptions);
}
if (DEBUG) System.out.println("PW field=" + fieldInfo.name + " indexOptions=" + indexOptions);
storePayloads = fieldInfo.storePayloads;
wrappedPostingsWriter.setField(fieldInfo);
@ -186,11 +185,13 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
if (pendingCount == -1) {
// We've already seen too many docs for this term --
// just forward to our fallback writer
wrappedPostingsWriter.addPosition(position, payload, -1, -1);
wrappedPostingsWriter.addPosition(position, payload, startOffset, endOffset);
} else {
// buffer up
final Position pos = pending[pendingCount++];
pos.pos = position;
pos.startOffset = startOffset;
pos.endOffset = endOffset;
pos.docID = currentDoc.docID;
if (payload != null && payload.length > 0) {
if (pos.payload == null) {
@ -240,10 +241,11 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
// given codec wants to store other interesting
// stuff, it could use this pulsing codec to do so
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
int lastDocID = 0;
int pendingIDX = 0;
int lastPayloadLength = -1;
int lastOffsetLength = -1;
while(pendingIDX < pendingCount) {
final Position doc = pending[pendingIDX];
@ -260,14 +262,15 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
}
int lastPos = 0;
int lastOffset = 0;
for(int posIDX=0;posIDX<doc.termFreq;posIDX++) {
final Position pos = pending[pendingIDX++];
assert pos.docID == doc.docID;
final int posDelta = pos.pos - lastPos;
lastPos = pos.pos;
if (DEBUG) System.out.println(" write pos=" + pos.pos);
final int payloadLength = pos.payload == null ? 0 : pos.payload.length;
if (storePayloads) {
final int payloadLength = pos.payload == null ? 0 : pos.payload.length;
if (payloadLength != lastPayloadLength) {
buffer.writeVInt((posDelta << 1)|1);
buffer.writeVInt(payloadLength);
@ -275,12 +278,28 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
} else {
buffer.writeVInt(posDelta << 1);
}
if (payloadLength > 0) {
buffer.writeBytes(pos.payload.bytes, 0, pos.payload.length);
}
} else {
buffer.writeVInt(posDelta);
}
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
//System.out.println("write=" + pos.startOffset + "," + pos.endOffset);
int offsetDelta = pos.startOffset - lastOffset;
int offsetLength = pos.endOffset - pos.startOffset;
if (offsetLength != lastOffsetLength) {
buffer.writeVInt(offsetDelta << 1 | 1);
buffer.writeVInt(offsetLength);
} else {
buffer.writeVInt(offsetDelta << 1);
}
lastOffset = pos.startOffset;
lastOffsetLength = offsetLength;
}
if (payloadLength > 0) {
assert storePayloads;
buffer.writeBytes(pos.payload.bytes, 0, pos.payload.length);
}
}
}
} else if (indexOptions == IndexOptions.DOCS_AND_FREQS) {
@ -387,7 +406,7 @@ public final class PulsingPostingsWriter extends PostingsWriterBase {
wrappedPostingsWriter.startDoc(doc.docID, doc.termFreq);
}
if (DEBUG) System.out.println("PW: wrapped.addPos pos=" + pos.pos);
wrappedPostingsWriter.addPosition(pos.pos, pos.payload, -1, -1);
wrappedPostingsWriter.addPosition(pos.pos, pos.payload, pos.startOffset, pos.endOffset);
}
//wrappedPostingsWriter.finishDoc();
} else {

View File

@ -30,6 +30,8 @@ import org.apache.lucene.analysis.Token;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
import org.apache.lucene.codecs.memory.MemoryPostingsFormat;
import org.apache.lucene.codecs.nestedpulsing.NestedPulsingPostingsFormat;
import org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
@ -56,11 +58,13 @@ public class TestPostingsOffsets extends LuceneTestCase {
iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
if (Codec.getDefault().getName().equals("Lucene40")) {
// pulsing etc are not implemented
if (random().nextBoolean()) {
iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
} else {
iwc.setCodec(_TestUtil.alwaysPostingsFormat(new MemoryPostingsFormat()));
// sep etc are not implemented
switch(random().nextInt(4)) {
case 0: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat())); break;
case 1: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new MemoryPostingsFormat())); break;
case 2: iwc.setCodec(_TestUtil.alwaysPostingsFormat(
new Pulsing40PostingsFormat(_TestUtil.nextInt(random(), 1, 3)))); break;
case 3: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new NestedPulsingPostingsFormat())); break;
}
}
}
@ -73,6 +77,11 @@ public class TestPostingsOffsets extends LuceneTestCase {
FieldType ft = new FieldType(TextField.TYPE_UNSTORED);
ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
if (random().nextBoolean()) {
ft.setStoreTermVectors(true);
ft.setStoreTermVectorPositions(random().nextBoolean());
ft.setStoreTermVectorOffsets(random().nextBoolean());
}
Token[] tokens = new Token[] {
makeToken("a", 1, 0, 6),
makeToken("b", 1, 8, 9),
@ -132,11 +141,13 @@ public class TestPostingsOffsets extends LuceneTestCase {
Analyzer analyzer = withPayloads ? new MockPayloadAnalyzer() : new MockAnalyzer(random());
iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
if (Codec.getDefault().getName().equals("Lucene40")) {
// pulsing etc are not implemented
if (random().nextBoolean()) {
iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
} else {
iwc.setCodec(_TestUtil.alwaysPostingsFormat(new MemoryPostingsFormat()));
// sep etc are not implemented
switch(random().nextInt(4)) {
case 0: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat())); break;
case 1: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new MemoryPostingsFormat())); break;
case 2: iwc.setCodec(_TestUtil.alwaysPostingsFormat(
new Pulsing40PostingsFormat(_TestUtil.nextInt(random(), 1, 3)))); break;
case 3: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new NestedPulsingPostingsFormat())); break;
}
}
iwc.setMergePolicy(newLogMergePolicy()); // will rely on docids a bit for skipping

View File

@ -574,6 +574,12 @@ public class _TestUtil {
* default codecs and formats, but always writes in the specified
* format. */
public static Codec alwaysPostingsFormat(final PostingsFormat format) {
// TODO: we really need for postings impls etc to announce themselves
// (and maybe their params, too) to infostream on flush and merge.
// otherwise in a real debugging situation we won't know whats going on!
if (LuceneTestCase.VERBOSE) {
System.out.println("forcing postings format to:" + format);
}
return new Lucene40Codec() {
@Override
public PostingsFormat getPostingsFormatForField(String field) {