mirror of https://github.com/apache/lucene.git
LUCENE-3706: add offsets into lucene40 postings
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1235022 13f79535-47bb-0310-9956-ffa450edef68
parent edd8b91895
commit b1da6f5041
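This change teaches the Lucene40 (default 4.0) postings format to record character offsets in the positions (prox) stream. The writer encodes a per-position start-offset delta plus an offset length in addPosition, the skip lists carry the last offset length at each level, and the payloads-only positions enum is generalized into SegmentFullPositionsEnum, which decodes payloads and/or offsets. A new test, TestPostingsOffsets.doTestNumbers, reads the offsets back with and without payloads, including across skips.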
@@ -206,7 +206,7 @@ public class SegmentTermDocs {
         skipListReader = new Lucene40SkipListReader((IndexInput) freqStream.clone(), maxSkipLevels, skipInterval); // lazily clone
 
       if (!haveSkipped) {                      // lazily initialize skip stream
-        skipListReader.init(skipPointer, freqBasePointer, proxBasePointer, df, currentFieldStoresPayloads);
+        skipListReader.init(skipPointer, freqBasePointer, proxBasePointer, df, currentFieldStoresPayloads, false);
         haveSkipped = true;
       }
 
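Note: this 3.x-format reader can always pass false for the new storesOffsets argument, since pre-4.0 indexes never store offsets in postings.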
@@ -85,11 +85,11 @@ public class Lucene40FieldInfosReader extends FieldInfosReader {
       // LUCENE-3027: past indices were able to write
       // storePayloads=true when omitTFAP is also true,
       // which is invalid. We correct that, here:
-      if (indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
+      if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
         storePayloads = false;
       }
       hasVectors |= storeTermVector;
-      hasProx |= isIndexed && indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
+      hasProx |= isIndexed && indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
       hasFreq |= isIndexed && indexOptions != IndexOptions.DOCS_ONLY;
       // DV Types are packed in one byte
       byte val = input.readByte();
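Note on the recurring "==" to "compareTo(...)" rewrites throughout this patch: FieldInfo.IndexOptions is an enum whose declaration order encodes feature inclusion (DOCS_ONLY, then DOCS_AND_FREQS, then DOCS_AND_FREQS_AND_POSITIONS, then the new DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS), so "stores at least positions" becomes an ordering test rather than an equality test. A minimal sketch of the idiom (variable names assumed, not part of the patch):

    // ordinal comparison answers "does this field store at least X?"
    boolean hasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    boolean hasOffsets   = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    // hasOffsets implies hasPositions, which the asserts in this patch rely on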
@@ -58,7 +58,7 @@ public class Lucene40FieldInfosWriter extends FieldInfosWriter {
     output.writeVInt(FORMAT_CURRENT);
     output.writeVInt(infos.size());
     for (FieldInfo fi : infos) {
-      assert fi.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS || !fi.storePayloads;
+      assert fi.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !fi.storePayloads;
       byte bits = 0x0;
       if (fi.isIndexed) bits |= IS_INDEXED;
       if (fi.storeTermVector) bits |= STORE_TERMVECTOR;
@@ -197,7 +197,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
       // undefined
     }
 
-    if (fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
+    if (fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
       if (isFirstTerm) {
         termState.proxOffset = termState.bytesReader.readVLong();
       } else {
@@ -245,23 +245,23 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
                                                DocsAndPositionsEnum reuse, boolean needsOffsets)
     throws IOException {
 
-    if (needsOffsets) {
-      // TODO: once we index offsets into postings fix this!
-      return null;
+    boolean hasOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+    if (needsOffsets && !hasOffsets) {
+      return null; // not available
     }
 
     // TODO: refactor
-    if (fieldInfo.storePayloads) {
-      SegmentDocsAndPositionsAndPayloadsEnum docsEnum;
-      if (reuse == null || !(reuse instanceof SegmentDocsAndPositionsAndPayloadsEnum)) {
-        docsEnum = new SegmentDocsAndPositionsAndPayloadsEnum(freqIn, proxIn);
+    if (fieldInfo.storePayloads || hasOffsets) {
+      SegmentFullPositionsEnum docsEnum;
+      if (reuse == null || !(reuse instanceof SegmentFullPositionsEnum)) {
+        docsEnum = new SegmentFullPositionsEnum(freqIn, proxIn);
       } else {
-        docsEnum = (SegmentDocsAndPositionsAndPayloadsEnum) reuse;
+        docsEnum = (SegmentFullPositionsEnum) reuse;
         if (docsEnum.startFreqIn != freqIn) {
           // If you are using ParellelReader, and pass in a
           // reused DocsEnum, it could have come from another
           // reader also using standard codec
-          docsEnum = new SegmentDocsAndPositionsAndPayloadsEnum(freqIn, proxIn);
+          docsEnum = new SegmentFullPositionsEnum(freqIn, proxIn);
         }
       }
       return docsEnum.reset(fieldInfo, (StandardTermState) termState, liveDocs);
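Note: a caller that asks for offsets on a field indexed without them still gets null, so consumer code keeps its existing null check. A hedged usage sketch against the TermsEnum API of this era (variable names assumed):

    DocsAndPositionsEnum dp = termsEnum.docsAndPositions(liveDocs, null, /*needsOffsets=*/true);
    if (dp == null) {
      // field has no positions at all, or has positions but no offsets
    }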
@@ -295,6 +295,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
 
     protected boolean indexOmitsTF; // does current field omit term freq?
     protected boolean storePayloads; // does current field store payloads?
+    protected boolean storeOffsets; // does current field store offsets?
 
     protected int limit; // number of docs in this posting
     protected int ord; // how many docs we've read
@@ -324,6 +325,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
     DocsEnum reset(FieldInfo fieldInfo, StandardTermState termState) throws IOException {
       indexOmitsTF = fieldInfo.indexOptions == IndexOptions.DOCS_ONLY;
       storePayloads = fieldInfo.storePayloads;
+      storeOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
       freqOffset = termState.freqOffset;
       skipOffset = termState.skipOffset;
@@ -471,7 +473,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
 
           skipper.init(freqOffset + skipOffset,
                        freqOffset, 0,
-                       limit, storePayloads);
+                       limit, storePayloads, storeOffsets);
 
           skipped = true;
         }
@@ -665,7 +667,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
 
   // TODO specialize DocsAndPosEnum too
 
-  // Decodes docs & positions. payloads are not present.
+  // Decodes docs & positions. payloads nor offsets are present.
   private final class SegmentDocsAndPositionsEnum extends DocsAndPositionsEnum {
     final IndexInput startFreqIn;
     private final IndexInput freqIn;
@@ -792,7 +794,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
 
           skipper.init(freqOffset+skipOffset,
                        freqOffset, proxOffset,
-                       limit, false);
+                       limit, false, false);
 
           skipped = true;
         }
@@ -868,8 +870,8 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
       }
     }
 
-  // Decodes docs & positions & payloads
-  private class SegmentDocsAndPositionsAndPayloadsEnum extends DocsAndPositionsEnum {
+  // Decodes docs & positions & (payloads and/or offsets)
+  private class SegmentFullPositionsEnum extends DocsAndPositionsEnum {
     final IndexInput startFreqIn;
     private final IndexInput freqIn;
     private final IndexInput proxIn;
@@ -896,15 +898,23 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
     private BytesRef payload;
     private long lazyProxPointer;
 
-    public SegmentDocsAndPositionsAndPayloadsEnum(IndexInput freqIn, IndexInput proxIn) throws IOException {
+    boolean storePayloads;
+    boolean storeOffsets;
+
+    int offsetLength;
+    int startOffset;
+
+    public SegmentFullPositionsEnum(IndexInput freqIn, IndexInput proxIn) throws IOException {
       startFreqIn = freqIn;
       this.freqIn = (IndexInput) freqIn.clone();
       this.proxIn = (IndexInput) proxIn.clone();
     }
 
-    public SegmentDocsAndPositionsAndPayloadsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits liveDocs) throws IOException {
-      assert fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
-      assert fieldInfo.storePayloads;
+    public SegmentFullPositionsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits liveDocs) throws IOException {
+      storeOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+      storePayloads = fieldInfo.storePayloads;
+      assert fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+      assert storePayloads || storeOffsets;
       if (payload == null) {
         payload = new BytesRef();
         payload.bytes = new byte[1];
@@ -923,6 +933,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
       doc = -1;
       accum = 0;
       position = 0;
+      startOffset = 0;
 
       skipped = false;
       posPendingCount = 0;
@@ -963,6 +974,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
       }
 
       position = 0;
+      startOffset = 0;
 
       //System.out.println("StandardR.D&PE nextDoc seg=" + segment + " return doc=" + doc);
       return (doc = accum);
@@ -1001,7 +1013,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
           //System.out.println("  init skipper freqOffset=" + freqOffset + " skipOffset=" + skipOffset + " vs len=" + freqIn.length());
           skipper.init(freqOffset+skipOffset,
                        freqOffset, proxOffset,
-                       limit, true);
+                       limit, storePayloads, storeOffsets);
 
           skipped = true;
         }
@@ -1016,8 +1028,10 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
           lazyProxPointer = skipper.getProxPointer();
           posPendingCount = 0;
           position = 0;
+          startOffset = 0;
           payloadPending = false;
           payloadLength = skipper.getPayloadLength();
+          offsetLength = skipper.getOffsetLength();
         }
       }
 
@@ -1038,27 +1052,38 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
       }
 
       if (payloadPending && payloadLength > 0) {
-        // payload of last position as never retrieved -- skip it
+        // payload of last position was never retrieved -- skip it
         proxIn.seek(proxIn.getFilePointer() + payloadLength);
         payloadPending = false;
       }
 
       // scan over any docs that were iterated without their positions
       while(posPendingCount > freq) {
         final int code = proxIn.readVInt();
 
+        if (storePayloads) {
           if ((code & 1) != 0) {
             // new payload length
             payloadLength = proxIn.readVInt();
             assert payloadLength >= 0;
           }
           assert payloadLength != -1;
+        }
 
+        if (storeOffsets) {
+          if ((proxIn.readVInt() & 1) != 0) {
+            // new offset length
+            offsetLength = proxIn.readVInt();
+          }
+        }
 
+        if (storePayloads) {
           proxIn.seek(proxIn.getFilePointer() + payloadLength);
+        }
 
         posPendingCount--;
         position = 0;
+        startOffset = 0;
         payloadPending = false;
         //System.out.println("StandardR.D&PE skipPos");
       }
@@ -1069,7 +1094,8 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
         proxIn.seek(proxIn.getFilePointer()+payloadLength);
       }
 
-      final int code = proxIn.readVInt();
+      int code = proxIn.readVInt();
+      if (storePayloads) {
         if ((code & 1) != 0) {
           // new payload length
           payloadLength = proxIn.readVInt();
@@ -1078,7 +1104,18 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
         assert payloadLength != -1;
 
         payloadPending = true;
-        position += code >>> 1;
+        code >>>= 1;
+      }
+      position += code;
+
+      if (storeOffsets) {
+        int offsetCode = proxIn.readVInt();
+        if ((offsetCode & 1) != 0) {
+          // new offset length
+          offsetLength = proxIn.readVInt();
+        }
+        startOffset += offsetCode >>> 1;
+      }
 
       posPendingCount--;
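Note: nextPosition() now reads one extra vint per position when offsets are stored. Its low bit flags a change of offset length (mirroring the payload-length trick one block up), and the remaining bits are a delta on the start offset; the only accumulated offset state is startOffset plus the shared offsetLength. Also note the asymmetry this hunk introduces: with payloads the position vint still carries a flag bit (hence code >>>= 1), while without payloads the position delta is a plain vint.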
@@ -1090,18 +1127,19 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
 
     @Override
    public int startOffset() throws IOException {
-      return -1;
+      return storeOffsets ? startOffset : -1;
    }
 
    @Override
    public int endOffset() throws IOException {
-      return -1;
+      return storeOffsets ? startOffset + offsetLength : -1;
    }
 
    /** Returns the payload at this position, or null if no
     *  payload was indexed. */
    @Override
    public BytesRef getPayload() throws IOException {
+      if (storePayloads) {
        assert lazyProxPointer == -1;
        assert posPendingCount < freq;
        if (!payloadPending) {
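Note: endOffset is never stored; it is reconstructed as startOffset + offsetLength. Storing the length instead of the end offset pays off because the length is constant or near constant for most terms (roughly the term's character length unless the analyzer changed it), so the common same-length case costs no extra bytes beyond the flag bit; this is exactly the optimization anticipated by the TODO comment this patch removes from the writer.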
@@ -1116,6 +1154,9 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
         payloadPending = false;
 
         return payload;
+      } else {
+        throw new IOException("No payloads exist for this field!");
+      }
     }
 
     @Override
@@ -73,12 +73,15 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
 
   IndexOptions indexOptions;
   boolean storePayloads;
+  boolean storeOffsets;
   // Starts a new term
   long freqStart;
   long proxStart;
   FieldInfo fieldInfo;
   int lastPayloadLength;
+  int lastOffsetLength;
   int lastPosition;
+  int lastOffset;
 
   // private String segment;
@@ -137,6 +140,8 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
       proxStart = proxOut.getFilePointer();
       // force first payload to write its length
       lastPayloadLength = -1;
+      // force first offset to write its length
+      lastOffsetLength = -1;
     }
     skipListWriter.resetSkip();
   }
@@ -155,10 +160,8 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
      */
     this.fieldInfo = fieldInfo;
     indexOptions = fieldInfo.indexOptions;
-    if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
-      throw new UnsupportedOperationException("this codec cannot index offsets");
-    }
-
+    storeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
     storePayloads = fieldInfo.storePayloads;
     //System.out.println("  set init blockFreqStart=" + freqStart);
     //System.out.println("  set init blockProxStart=" + proxStart);
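Note: this removes the guard that made the writer reject offset-bearing fields with an UnsupportedOperationException; the codec now derives storeOffsets from the field's IndexOptions instead, which is the heart of this change.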
@@ -180,7 +183,7 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
     }
 
     if ((++df % skipInterval) == 0) {
-      skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength);
+      skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength, storeOffsets, lastOffsetLength);
       skipListWriter.bufferSkip(df);
     }
@@ -197,31 +200,26 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
     }
 
     lastPosition = 0;
+    lastOffset = 0;
   }
 
   /** Add a new position & payload */
   @Override
   public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
     //if (DEBUG) System.out.println("SPW: addPos pos=" + position + " payload=" + (payload == null ? "null" : (payload.length + " bytes")) + " proxFP=" + proxOut.getFilePointer());
-    assert indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS: "invalid indexOptions: " + indexOptions;
+    assert indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 : "invalid indexOptions: " + indexOptions;
     assert proxOut != null;
 
-    // TODO: when we add offsets... often
-    // endOffset-startOffset will be constant or near
-    // constant for all docs (eg if the term wasn't stemmed
-    // then this will usually be the utf16 length of the
-    // term); would be nice to write that length once up
-    // front and then not encode endOffset for each
-    // position..
-
     final int delta = position - lastPosition;
 
     assert delta >= 0: "position=" + position + " lastPosition=" + lastPosition; // not quite right (if pos=0 is repeated twice we don't catch it)
 
     lastPosition = position;
 
+    int payloadLength = 0;
+
     if (storePayloads) {
-      final int payloadLength = payload == null ? 0 : payload.length;
+      payloadLength = payload == null ? 0 : payload.length;
 
       if (payloadLength != lastPayloadLength) {
         lastPayloadLength = payloadLength;
@@ -230,13 +228,28 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
       } else {
         proxOut.writeVInt(delta << 1);
       }
+    } else {
+      proxOut.writeVInt(delta);
+    }
+
+    if (storeOffsets) {
+      // don't use startOffset - lastEndOffset, because this creates lots of negative vints for synonyms,
+      // and the numbers aren't that much smaller anyways.
+      int offsetDelta = startOffset - lastOffset;
+      int offsetLength = endOffset - startOffset;
+      if (offsetLength != lastOffsetLength) {
+        proxOut.writeVInt(offsetDelta << 1 | 1);
+        proxOut.writeVInt(offsetLength);
+      } else {
+        proxOut.writeVInt(offsetDelta << 1);
+      }
+      lastOffset = startOffset;
+      lastOffsetLength = offsetLength;
+    }
 
       if (payloadLength > 0) {
         proxOut.writeBytes(payload.bytes, payload.offset, payloadLength);
       }
-    } else {
-      proxOut.writeVInt(delta);
-    }
   }
 
   @Override
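Worked example of the offset encoding above (assumed values; payloads disabled, offsets enabled): suppose a term occurs at positions 5 and 9 of a document, with offsets (20,23) and (40,43). lastOffsetLength starts at -1, forcing the first length to be written:

    addPosition(5, null, 20, 23);  // writeVInt(5)            position delta, no payload flag bit
                                   // writeVInt(20 << 1 | 1)  offsetDelta = 20, low bit: length follows
                                   // writeVInt(3)            offsetLength = 23 - 20
    addPosition(9, null, 40, 43);  // writeVInt(4)            position delta = 9 - 5
                                   // writeVInt(20 << 1)      offsetDelta = 40 - 20, same length, no extra vint

The reader inverts this: position += code, startOffset += offsetCode >>> 1, and endOffset = startOffset + offsetLength.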
@@ -304,7 +317,7 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
       assert firstTerm.skipOffset > 0;
       bytesWriter.writeVInt(firstTerm.skipOffset);
     }
-    if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
+    if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
       bytesWriter.writeVLong(firstTerm.proxStart);
     }
     long lastFreqStart = firstTerm.freqStart;
@@ -319,7 +332,7 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
         assert term.skipOffset > 0;
         bytesWriter.writeVInt(term.skipOffset);
       }
-      if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
+      if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
         bytesWriter.writeVLong(term.proxStart - lastProxStart);
         lastProxStart = term.proxStart;
       }
@@ -30,13 +30,16 @@ import org.apache.lucene.store.IndexInput;
  */
 public class Lucene40SkipListReader extends MultiLevelSkipListReader {
   private boolean currentFieldStoresPayloads;
+  private boolean currentFieldStoresOffsets;
   private long freqPointer[];
   private long proxPointer[];
   private int payloadLength[];
+  private int offsetLength[];
 
   private long lastFreqPointer;
   private long lastProxPointer;
   private int lastPayloadLength;
+  private int lastOffsetLength;
 
 
   public Lucene40SkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval) {
@@ -44,17 +47,20 @@ public class Lucene40SkipListReader extends MultiLevelSkipListReader {
     freqPointer = new long[maxSkipLevels];
     proxPointer = new long[maxSkipLevels];
     payloadLength = new int[maxSkipLevels];
+    offsetLength = new int[maxSkipLevels];
   }
 
-  public void init(long skipPointer, long freqBasePointer, long proxBasePointer, int df, boolean storesPayloads) {
+  public void init(long skipPointer, long freqBasePointer, long proxBasePointer, int df, boolean storesPayloads, boolean storesOffsets) {
     super.init(skipPointer, df);
     this.currentFieldStoresPayloads = storesPayloads;
+    this.currentFieldStoresOffsets = storesOffsets;
     lastFreqPointer = freqBasePointer;
     lastProxPointer = proxBasePointer;
 
     Arrays.fill(freqPointer, freqBasePointer);
     Arrays.fill(proxPointer, proxBasePointer);
     Arrays.fill(payloadLength, 0);
+    Arrays.fill(offsetLength, 0);
   }
 
   /** Returns the freq pointer of the doc to which the last call of
@@ -76,12 +82,20 @@ public class Lucene40SkipListReader extends MultiLevelSkipListReader {
     return lastPayloadLength;
   }
 
+  /** Returns the offset length (endOffset-startOffset) of the position stored just before
+   *  the doc to which the last call of {@link MultiLevelSkipListReader#skipTo(int)}
+   *  has skipped.  */
+  public int getOffsetLength() {
+    return lastOffsetLength;
+  }
+
   @Override
   protected void seekChild(int level) throws IOException {
     super.seekChild(level);
     freqPointer[level] = lastFreqPointer;
     proxPointer[level] = lastProxPointer;
     payloadLength[level] = lastPayloadLength;
+    offsetLength[level] = lastOffsetLength;
   }
 
   @Override
@@ -90,6 +104,7 @@ public class Lucene40SkipListReader extends MultiLevelSkipListReader {
     lastFreqPointer = freqPointer[level];
     lastProxPointer = proxPointer[level];
     lastPayloadLength = payloadLength[level];
+    lastOffsetLength = offsetLength[level];
   }
 
 
@@ -110,6 +125,11 @@ public class Lucene40SkipListReader extends MultiLevelSkipListReader {
     } else {
       delta = skipStream.readVInt();
     }
+
+    if (currentFieldStoresOffsets) {
+      offsetLength[level] = skipStream.readVInt();
+    }
+
     freqPointer[level] += skipStream.readVInt();
     proxPointer[level] += skipStream.readVInt();
@@ -40,7 +40,9 @@ public class Lucene40SkipListWriter extends MultiLevelSkipListWriter {
 
   private int curDoc;
   private boolean curStorePayloads;
+  private boolean curStoreOffsets;
   private int curPayloadLength;
+  private int curOffsetLength;
   private long curFreqPointer;
   private long curProxPointer;
@@ -58,10 +60,12 @@ public class Lucene40SkipListWriter extends MultiLevelSkipListWriter {
   /**
    * Sets the values for the current skip data.
    */
-  public void setSkipData(int doc, boolean storePayloads, int payloadLength) {
+  public void setSkipData(int doc, boolean storePayloads, int payloadLength, boolean storeOffsets, int offsetLength) {
     this.curDoc = doc;
     this.curStorePayloads = storePayloads;
     this.curPayloadLength = payloadLength;
+    this.curStoreOffsets = storeOffsets;
+    this.curOffsetLength = offsetLength;
     this.curFreqPointer = freqOutput.getFilePointer();
     if (proxOutput != null)
       this.curProxPointer = proxOutput.getFilePointer();
@@ -116,6 +120,12 @@ public class Lucene40SkipListWriter extends MultiLevelSkipListWriter {
       // current field does not store payloads
       skipBuffer.writeVInt(curDoc - lastSkipDoc[level]);
     }
+
+    // TODO: not sure it really helps to shove this somewhere else if its the same as the last skip
+    if (curStoreOffsets) {
+      skipBuffer.writeVInt(curOffsetLength);
+    }
+
     skipBuffer.writeVInt((int) (curFreqPointer - lastSkipFreqPointer[level]));
     skipBuffer.writeVInt((int) (curProxPointer - lastSkipProxPointer[level]));
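Note: with the reader and writer changes above, each per-level skip entry now carries, in order (a sketch of the layout implied by writeSkipData here and readSkipData in the reader; the payload-flag detail follows the pre-existing format and is not shown in this diff):

    docDelta          // vint; when the field stores payloads, low bit flags a payload length
    [payloadLength]   // vint, only when flagged
    [offsetLength]    // vint, only when the field stores offsets (new in this patch)
    freqPointerDelta  // vint
    proxPointerDelta  // vint

Only the offset length rides the skip list; the start offset itself restarts at 0 per document (the enums reset startOffset on each doc), so skipping only needs to restore the pending length.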
@@ -137,7 +137,7 @@ class PreFlexRWFieldsWriter extends FieldsConsumer {
         }
 
         if ((++df % termsOut.skipInterval) == 0) {
-          skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength);
+          skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength, false, 0);
           skipListWriter.bufferSkip(df);
         }
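Note: the PreFlexRW test writer emulates the legacy 3.x index format, which has no offsets, so it pins the new arguments to false and 0.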
@@ -22,29 +22,46 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CannedAnalyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockPayloadAnalyzer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.NumericField;
+import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
+import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.FieldCache;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.English;
 import org.apache.lucene.util.LuceneTestCase;
-import org.junit.Assume;
+import org.apache.lucene.util._TestUtil;
 
 public class TestPostingsOffsets extends LuceneTestCase {
+  IndexWriterConfig iwc;
+
+  public void setUp() throws Exception {
+    super.setUp();
+    // Currently only SimpleText and Lucene40 can index offsets into postings:
+    assumeTrue("codec does not support offsets", Codec.getDefault().getName().equals("SimpleText") || Codec.getDefault().getName().equals("Lucene40"));
+    iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random));
+
+    if (Codec.getDefault().getName().equals("Lucene40")) {
+      // pulsing etc are not implemented
+      iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
+    }
+  }
 
   public void testBasic() throws Exception {
-
-    // Currently only SimpleText can index offsets into postings:
-    Assume.assumeTrue(Codec.getDefault().getName().equals("SimpleText"));
-
     Directory dir = newDirectory();
-    RandomIndexWriter w = new RandomIndexWriter(random, dir);
+    RandomIndexWriter w = new RandomIndexWriter(random, dir, iwc);
     Document doc = new Document();
 
     FieldType ft = new FieldType(TextField.TYPE_UNSTORED);
@@ -95,15 +112,116 @@ public class TestPostingsOffsets extends LuceneTestCase {
     dir.close();
   }
 
-  public void testRandom() throws Exception {
-    // Currently only SimpleText can index offsets into postings:
-    Assume.assumeTrue(Codec.getDefault().getName().equals("SimpleText"));
+  public void testSkipping() throws Exception {
+    doTestNumbers(false);
+  }
+
+  public void testPayloads() throws Exception {
+    doTestNumbers(true);
+  }
+
+  public void doTestNumbers(boolean withPayloads) throws Exception {
+    Directory dir = newDirectory();
+    Analyzer analyzer = withPayloads ? new MockPayloadAnalyzer() : new MockAnalyzer(random);
+    iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
+    if (Codec.getDefault().getName().equals("Lucene40")) {
+      // pulsing etc are not implemented
+      iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
+    }
+    iwc.setMergePolicy(newLogMergePolicy()); // will rely on docids a bit for skipping
+    RandomIndexWriter w = new RandomIndexWriter(random, dir, iwc);
+
+    FieldType ft = new FieldType(TextField.TYPE_STORED);
+    ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    if (random.nextBoolean()) {
+      ft.setStoreTermVectors(true);
+      ft.setStoreTermVectorOffsets(random.nextBoolean());
+      ft.setStoreTermVectorPositions(random.nextBoolean());
+    }
+
+    int numDocs = atLeast(500);
+    for (int i = 0; i < numDocs; i++) {
+      Document doc = new Document();
+      doc.add(new Field("numbers", English.intToEnglish(i), ft));
+      doc.add(new Field("oddeven", (i % 2) == 0 ? "even" : "odd", ft));
+      doc.add(new StringField("id", "" + i));
+      w.addDocument(doc);
+    }
+
+    IndexReader reader = w.getReader();
+    w.close();
+
+    String terms[] = { "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "hundred" };
+
+    for (String term : terms) {
+      DocsAndPositionsEnum dp = MultiFields.getTermPositionsEnum(reader, null, "numbers", new BytesRef(term), true);
+      int doc;
+      while((doc = dp.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
+        String storedNumbers = reader.document(doc).get("numbers");
+        int freq = dp.freq();
+        for (int i = 0; i < freq; i++) {
+          dp.nextPosition();
+          int start = dp.startOffset();
+          assert start >= 0;
+          int end = dp.endOffset();
+          assert end >= 0 && end >= start;
+          // check that the offsets correspond to the term in the src text
+          assertTrue(storedNumbers.substring(start, end).equals(term));
+          if (withPayloads) {
+            // check that we have a payload and it starts with "pos"
+            assertTrue(dp.hasPayload());
+            BytesRef payload = dp.getPayload();
+            assertTrue(payload.utf8ToString().startsWith("pos:"));
+          } // note: withPayloads=false doesnt necessarily mean we dont have them from MockAnalyzer!
+        }
+      }
+    }
+
+    // check we can skip correctly
+    int numSkippingTests = atLeast(50);
+
+    for (int j = 0; j < numSkippingTests; j++) {
+      int num = _TestUtil.nextInt(random, 100, Math.min(numDocs-1, 999));
+      DocsAndPositionsEnum dp = MultiFields.getTermPositionsEnum(reader, null, "numbers", new BytesRef("hundred"), true);
+      int doc = dp.advance(num);
+      assertEquals(num, doc);
+      int freq = dp.freq();
+      for (int i = 0; i < freq; i++) {
+        String storedNumbers = reader.document(doc).get("numbers");
+        dp.nextPosition();
+        int start = dp.startOffset();
+        assert start >= 0;
+        int end = dp.endOffset();
+        assert end >= 0 && end >= start;
+        // check that the offsets correspond to the term in the src text
+        assertTrue(storedNumbers.substring(start, end).equals("hundred"));
+        if (withPayloads) {
+          // check that we have a payload and it starts with "pos"
+          assertTrue(dp.hasPayload());
+          BytesRef payload = dp.getPayload();
+          assertTrue(payload.utf8ToString().startsWith("pos:"));
+        } // note: withPayloads=false doesnt necessarily mean we dont have them from MockAnalyzer!
      }
+    }
+
+    // check that other fields (without offsets) work correctly
+
+    for (int i = 0; i < numDocs; i++) {
+      DocsEnum dp = MultiFields.getTermDocsEnum(reader, null, "id", new BytesRef("" + i), false);
+      assertEquals(i, dp.nextDoc());
+      assertEquals(DocIdSetIterator.NO_MORE_DOCS, dp.nextDoc());
+    }
+
+    reader.close();
+    dir.close();
+  }
+
+  public void testRandom() throws Exception {
     // token -> docID -> tokens
     final Map<String,Map<Integer,List<Token>>> actualTokens = new HashMap<String,Map<Integer,List<Token>>>();
 
     Directory dir = newDirectory();
-    RandomIndexWriter w = new RandomIndexWriter(random, dir);
+    RandomIndexWriter w = new RandomIndexWriter(random, dir, iwc);
 
     final int numDocs = atLeast(20);
     //final int numDocs = atLeast(5);