From b1da6f5041af2d7c9f8c2ca5e9dd8f694278190f Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Mon, 23 Jan 2012 22:03:08 +0000
Subject: [PATCH] LUCENE-3706: add offsets into lucene40 postings

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1235022 13f79535-47bb-0310-9956-ffa450edef68
---
 .../codecs/lucene3x/SegmentTermDocs.java      |   2 +-
 .../lucene40/Lucene40FieldInfosReader.java    |   4 +-
 .../lucene40/Lucene40FieldInfosWriter.java    |   2 +-
 .../lucene40/Lucene40PostingsReader.java      | 143 +++++++++++-------
 .../lucene40/Lucene40PostingsWriter.java      |  55 ++++---
 .../lucene40/Lucene40SkipListReader.java      |  22 ++-
 .../lucene40/Lucene40SkipListWriter.java      |  12 +-
 .../preflexrw/PreFlexRWFieldsWriter.java      |   2 +-
 .../lucene/index/TestPostingsOffsets.java     | 138 +++++++++++++++--
 9 files changed, 291 insertions(+), 89 deletions(-)
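Before the per-file changes, a sketch of what the patch enables at the API level. This is editorial illustration, not part of the patch; it leans on the same 4.0-era test APIs exercised by TestPostingsOffsets at the bottom of this change (FieldType.setIndexOptions, MultiFields.getTermPositionsEnum), and `doc`/`reader` are assumed surrounding context:

// Hedged sketch: index a field with offsets under the Lucene40 codec, then read them back.
FieldType ft = new FieldType(TextField.TYPE_UNSTORED);
ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
doc.add(new Field("body", "the quick brown fox", ft));

// needsOffsets=true: before this patch the Lucene40 codec answered null here.
DocsAndPositionsEnum dp = MultiFields.getTermPositionsEnum(reader, null, "body", new BytesRef("quick"), true);
while (dp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
  for (int i = 0; i < dp.freq(); i++) {
    dp.nextPosition();              // must be called before asking for offsets
    int start = dp.startOffset();   // was always -1 for this codec before the patch
    int end = dp.endOffset();
  }
}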
diff --git a/lucene/src/java/org/apache/lucene/codecs/lucene3x/SegmentTermDocs.java b/lucene/src/java/org/apache/lucene/codecs/lucene3x/SegmentTermDocs.java
index c398c788534..f5020a2a6b8 100644
--- a/lucene/src/java/org/apache/lucene/codecs/lucene3x/SegmentTermDocs.java
+++ b/lucene/src/java/org/apache/lucene/codecs/lucene3x/SegmentTermDocs.java
@@ -206,7 +206,7 @@ public class SegmentTermDocs {
         skipListReader = new Lucene40SkipListReader((IndexInput) freqStream.clone(), maxSkipLevels, skipInterval); // lazily clone
 
       if (!haveSkipped) {                      // lazily initialize skip stream
-        skipListReader.init(skipPointer, freqBasePointer, proxBasePointer, df, currentFieldStoresPayloads);
+        skipListReader.init(skipPointer, freqBasePointer, proxBasePointer, df, currentFieldStoresPayloads, false);
         haveSkipped = true;
       }
 
diff --git a/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosReader.java b/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosReader.java
index a39ff7a85ad..598efe2b697 100644
--- a/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosReader.java
+++ b/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosReader.java
@@ -85,11 +85,11 @@ public class Lucene40FieldInfosReader extends FieldInfosReader {
       // LUCENE-3027: past indices were able to write
       // storePayloads=true when omitTFAP is also true,
       // which is invalid.  We correct that, here:
-      if (indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
+      if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
         storePayloads = false;
       }
       hasVectors |= storeTermVector;
-      hasProx |= isIndexed && indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
+      hasProx |= isIndexed && indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
       hasFreq |= isIndexed && indexOptions != IndexOptions.DOCS_ONLY;
       // DV Types are packed in one byte
       byte val = input.readByte();
diff --git a/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosWriter.java b/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosWriter.java
index ee7527f2eea..e1a64a6ab77 100644
--- a/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosWriter.java
+++ b/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosWriter.java
@@ -58,7 +58,7 @@ public class Lucene40FieldInfosWriter extends FieldInfosWriter {
     output.writeVInt(FORMAT_CURRENT);
     output.writeVInt(infos.size());
     for (FieldInfo fi : infos) {
-      assert fi.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS || !fi.storePayloads;
+      assert fi.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !fi.storePayloads;
       byte bits = 0x0;
       if (fi.isIndexed) bits |= IS_INDEXED;
       if (fi.storeTermVector) bits |= STORE_TERMVECTOR;
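The == comparisons become compareTo checks because IndexOptions is an ordered enum and the new offsets constant sorts after positions, so "has positions" must now mean "positions or better". Illustration (a sketch, not patch code):

// IndexOptions ordering (ordinal order of the enum constants):
//   DOCS_ONLY < DOCS_AND_FREQS < DOCS_AND_FREQS_AND_POSITIONS
//             < DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS
boolean hasProx    = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
boolean hasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
// hasProx is true for both plain positions and positions+offsets;
// an equality test would wrongly exclude the offsets case.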
diff --git a/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java b/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java
index 400a0902001..ea208254b21 100644
--- a/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java
+++ b/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java
@@ -197,7 +197,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
       // undefined
     }
 
-    if (fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
+    if (fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
       if (isFirstTerm) {
         termState.proxOffset = termState.bytesReader.readVLong();
       } else {
@@ -245,23 +245,23 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
                                                DocsAndPositionsEnum reuse, boolean needsOffsets)
     throws IOException {
 
-    if (needsOffsets) {
-      // TODO: once we index offsets into postings fix this!
-      return null;
+    boolean hasOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+    if (needsOffsets && !hasOffsets) {
+      return null; // not available
     }
 
     // TODO: refactor
-    if (fieldInfo.storePayloads) {
-      SegmentDocsAndPositionsAndPayloadsEnum docsEnum;
-      if (reuse == null || !(reuse instanceof SegmentDocsAndPositionsAndPayloadsEnum)) {
-        docsEnum = new SegmentDocsAndPositionsAndPayloadsEnum(freqIn, proxIn);
+    if (fieldInfo.storePayloads || hasOffsets) {
+      SegmentFullPositionsEnum docsEnum;
+      if (reuse == null || !(reuse instanceof SegmentFullPositionsEnum)) {
+        docsEnum = new SegmentFullPositionsEnum(freqIn, proxIn);
       } else {
-        docsEnum = (SegmentDocsAndPositionsAndPayloadsEnum) reuse;
+        docsEnum = (SegmentFullPositionsEnum) reuse;
         if (docsEnum.startFreqIn != freqIn) {
           // If you are using ParellelReader, and pass in a
           // reused DocsEnum, it could have come from another
           // reader also using standard codec
-          docsEnum = new SegmentDocsAndPositionsAndPayloadsEnum(freqIn, proxIn);
+          docsEnum = new SegmentFullPositionsEnum(freqIn, proxIn);
         }
       }
       return docsEnum.reset(fieldInfo, (StandardTermState) termState, liveDocs);
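Note the contract here: asking for offsets on a field indexed without them yields null rather than an exception. A hedged caller-side sketch, using the same helper the new tests below use (field and term names are placeholders):

DocsAndPositionsEnum dp = MultiFields.getTermPositionsEnum(reader, null, "body", new BytesRef("term"), true);
if (dp == null) {
  // Offsets were not indexed for this field (or the term is absent);
  // fall back to positions-only if that is acceptable.
  dp = MultiFields.getTermPositionsEnum(reader, null, "body", new BytesRef("term"), false);
}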
@@ -295,6 +295,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
 
     protected boolean indexOmitsTF;                   // does current field omit term freq?
     protected boolean storePayloads;                  // does current field store payloads?
+    protected boolean storeOffsets;                   // does current field store offsets?
 
     protected int limit;                              // number of docs in this posting
     protected int ord;                                // how many docs we've read
@@ -324,6 +325,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
     DocsEnum reset(FieldInfo fieldInfo, StandardTermState termState) throws IOException {
       indexOmitsTF = fieldInfo.indexOptions == IndexOptions.DOCS_ONLY;
       storePayloads = fieldInfo.storePayloads;
+      storeOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
       freqOffset = termState.freqOffset;
       skipOffset = termState.skipOffset;
 
@@ -471,7 +473,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
           skipper.init(freqOffset + skipOffset,
                        freqOffset, 0,
-                       limit, storePayloads);
+                       limit, storePayloads, storeOffsets);
 
           skipped = true;
         }
@@ -665,7 +667,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
 
   // TODO specialize DocsAndPosEnum too
 
-  // Decodes docs & positions. payloads are not present.
+  // Decodes docs & positions. Neither payloads nor offsets are present.
   private final class SegmentDocsAndPositionsEnum extends DocsAndPositionsEnum {
     final IndexInput startFreqIn;
     private final IndexInput freqIn;
@@ -792,7 +794,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
           skipper.init(freqOffset+skipOffset,
                        freqOffset, proxOffset,
-                       limit, false);
+                       limit, false, false);
 
           skipped = true;
         }
@@ -868,8 +870,8 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
     }
   }
 
-  // Decodes docs & positions & payloads
-  private class SegmentDocsAndPositionsAndPayloadsEnum extends DocsAndPositionsEnum {
+  // Decodes docs & positions & (payloads and/or offsets)
+  private class SegmentFullPositionsEnum extends DocsAndPositionsEnum {
     final IndexInput startFreqIn;
     private final IndexInput freqIn;
     private final IndexInput proxIn;
@@ -895,16 +897,24 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
     Lucene40SkipListReader skipper;
     private BytesRef payload;
     private long lazyProxPointer;
+
+    boolean storePayloads;
+    boolean storeOffsets;
+
+    int offsetLength;
+    int startOffset;
 
-    public SegmentDocsAndPositionsAndPayloadsEnum(IndexInput freqIn, IndexInput proxIn) throws IOException {
+    public SegmentFullPositionsEnum(IndexInput freqIn, IndexInput proxIn) throws IOException {
       startFreqIn = freqIn;
       this.freqIn = (IndexInput) freqIn.clone();
       this.proxIn = (IndexInput) proxIn.clone();
     }
 
-    public SegmentDocsAndPositionsAndPayloadsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits liveDocs) throws IOException {
-      assert fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
-      assert fieldInfo.storePayloads;
+    public SegmentFullPositionsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits liveDocs) throws IOException {
+      storeOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+      storePayloads = fieldInfo.storePayloads;
+      assert fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+      assert storePayloads || storeOffsets;
       if (payload == null) {
         payload = new BytesRef();
         payload.bytes = new byte[1];
@@ -923,6 +933,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
       doc = -1;
       accum = 0;
       position = 0;
+      startOffset = 0;
 
       skipped = false;
       posPendingCount = 0;
@@ -963,6 +974,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
       }
 
       position = 0;
+      startOffset = 0;
 
       //System.out.println("StandardR.D&PE nextDoc seg=" + segment + " return doc=" + doc);
       return (doc = accum);
@@ -1001,7 +1013,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
         //System.out.println("  init skipper freqOffset=" + freqOffset + " skipOffset=" + skipOffset + " vs len=" + freqIn.length());
         skipper.init(freqOffset+skipOffset,
                      freqOffset, proxOffset,
-                     limit, true);
+                     limit, storePayloads, storeOffsets);
 
         skipped = true;
       }
@@ -1016,8 +1028,10 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
           lazyProxPointer = skipper.getProxPointer();
           posPendingCount = 0;
           position = 0;
+          startOffset = 0;
           payloadPending = false;
           payloadLength = skipper.getPayloadLength();
+          offsetLength = skipper.getOffsetLength();
         }
       }
 
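For orientation before the decode loops that follow, this is the per-position record in the .prx stream as laid out by Lucene40PostingsWriter.addPosition (see that hunk further down):

// Per-position .prx record after this patch:
//   storePayloads:  vint(posDelta << 1 | lengthChanged)  [vint(payloadLength) if changed]
//   !storePayloads: vint(posDelta)
//   storeOffsets:   vint(startOffsetDelta << 1 | lengthChanged)  [vint(offsetLength) if changed]
//   storePayloads:  payloadLength bytes of payload data (note: written after the offset fields)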
@@ -1038,27 +1052,38 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
       }
 
       if (payloadPending && payloadLength > 0) {
-        // payload of last position as never retrieved -- skip it
+        // payload of last position was never retrieved -- skip it
         proxIn.seek(proxIn.getFilePointer() + payloadLength);
         payloadPending = false;
       }
 
       // scan over any docs that were iterated without their positions
       while(posPendingCount > freq) {
         final int code = proxIn.readVInt();
 
-        if ((code & 1) != 0) {
-          // new payload length
-          payloadLength = proxIn.readVInt();
-          assert payloadLength >= 0;
+        if (storePayloads) {
+          if ((code & 1) != 0) {
+            // new payload length
+            payloadLength = proxIn.readVInt();
+            assert payloadLength >= 0;
+          }
+          assert payloadLength != -1;
         }
-        assert payloadLength != -1;
-        proxIn.seek(proxIn.getFilePointer() + payloadLength);
+
+        if (storeOffsets) {
+          if ((proxIn.readVInt() & 1) != 0) {
+            // new offset length
+            offsetLength = proxIn.readVInt();
+          }
+        }
+
+        if (storePayloads) {
+          proxIn.seek(proxIn.getFilePointer() + payloadLength);
+        }
 
         posPendingCount--;
         position = 0;
+        startOffset = 0;
         payloadPending = false;
         //System.out.println("StandardR.D&PE skipPos");
       }
@@ -1069,16 +1094,28 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
         proxIn.seek(proxIn.getFilePointer()+payloadLength);
       }
 
-      final int code = proxIn.readVInt();
-      if ((code & 1) != 0) {
-        // new payload length
-        payloadLength = proxIn.readVInt();
-        assert payloadLength >= 0;
-      }
-      assert payloadLength != -1;
+      int code = proxIn.readVInt();
+      if (storePayloads) {
+        if ((code & 1) != 0) {
+          // new payload length
+          payloadLength = proxIn.readVInt();
+          assert payloadLength >= 0;
+        }
+        assert payloadLength != -1;
 
-      payloadPending = true;
-      position += code >>> 1;
+        payloadPending = true;
+        code >>>= 1;
+      }
+      position += code;
+
+      if (storeOffsets) {
+        int offsetCode = proxIn.readVInt();
+        if ((offsetCode & 1) != 0) {
+          // new offset length
+          offsetLength = proxIn.readVInt();
+        }
+        startOffset += offsetCode >>> 1;
+      }
 
       posPendingCount--;
@@ -1090,32 +1127,36 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
 
     @Override
     public int startOffset() throws IOException {
-      return -1;
+      return storeOffsets ? startOffset : -1;
     }
 
     @Override
     public int endOffset() throws IOException {
-      return -1;
+      return storeOffsets ? startOffset + offsetLength : -1;
     }
 
     /** Returns the payload at this position, or null if no
      *  payload was indexed. */
     @Override
     public BytesRef getPayload() throws IOException {
-      assert lazyProxPointer == -1;
-      assert posPendingCount < freq;
-      if (!payloadPending) {
-        throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once.");
-      }
-      if (payloadLength > payload.bytes.length) {
-        payload.grow(payloadLength);
-      }
+      if (storePayloads) {
+        assert lazyProxPointer == -1;
+        assert posPendingCount < freq;
+        if (!payloadPending) {
+          throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once.");
+        }
+        if (payloadLength > payload.bytes.length) {
+          payload.grow(payloadLength);
+        }
 
-      proxIn.readBytes(payload.bytes, 0, payloadLength);
-      payload.length = payloadLength;
-      payloadPending = false;
+        proxIn.readBytes(payload.bytes, 0, payloadLength);
+        payload.length = payloadLength;
+        payloadPending = false;
 
-      return payload;
+        return payload;
+      } else {
+        throw new IOException("No payloads exist for this field!");
+      }
     }
 
     @Override
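End of the reader changes. A consumer-side sketch of the resulting contract: offsets may be read freely after nextPosition(), but the payload may be fetched at most once per position (a second getPayload() hits the IOException above). `dp` is an assumed DocsAndPositionsEnum already positioned on a document:

// Hedged sketch of a consumer loop against SegmentFullPositionsEnum.
for (int i = 0; i < dp.freq(); i++) {
  dp.nextPosition();
  int start = dp.startOffset();          // -1 when the field has no offsets
  int end = dp.endOffset();
  if (dp.hasPayload()) {
    BytesRef payload = dp.getPayload();  // at most once per position
  }
}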
diff --git a/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java b/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java
index 86c68bccff7..75dbca28592 100644
--- a/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java
+++ b/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java
@@ -73,12 +73,15 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
 
   IndexOptions indexOptions;
   boolean storePayloads;
+  boolean storeOffsets;
 
   // Starts a new term
   long freqStart;
   long proxStart;
   FieldInfo fieldInfo;
   int lastPayloadLength;
+  int lastOffsetLength;
   int lastPosition;
+  int lastOffset;
 
   // private String segment;
 
@@ -137,6 +140,8 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
       proxStart = proxOut.getFilePointer();
       // force first payload to write its length
       lastPayloadLength = -1;
+      // force first offset to write its length
+      lastOffsetLength = -1;
     }
     skipListWriter.resetSkip();
   }
@@ -155,10 +160,8 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
     */
     this.fieldInfo = fieldInfo;
     indexOptions = fieldInfo.indexOptions;
-    if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
-      throw new UnsupportedOperationException("this codec cannot index offsets");
-    }
-
+
+    storeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
     storePayloads = fieldInfo.storePayloads;
     //System.out.println("  set init blockFreqStart=" + freqStart);
     //System.out.println("  set init blockProxStart=" + proxStart);
@@ -180,7 +183,7 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
     }
 
     if ((++df % skipInterval) == 0) {
-      skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength);
+      skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength, storeOffsets, lastOffsetLength);
       skipListWriter.bufferSkip(df);
     }
 
@@ -197,31 +200,26 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
     }
 
     lastPosition = 0;
+    lastOffset = 0;
   }
 
   /** Add a new position & payload */
   @Override
   public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
     //if (DEBUG) System.out.println("SPW: addPos pos=" + position + " payload=" + (payload == null ? "null" : (payload.length + " bytes")) + " proxFP=" + proxOut.getFilePointer());
-    assert indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS: "invalid indexOptions: " + indexOptions;
+    assert indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 : "invalid indexOptions: " + indexOptions;
     assert proxOut != null;
 
-    // TODO: when we add offsets... often
-    // endOffset-startOffset will be constant or near
-    // constant for all docs (eg if the term wasn't stemmed
-    // then this will usually be the utf16 length of the
-    // term); would be nice to write that length once up
-    // front and then not encode endOffset for each
-    // position..
-
     final int delta = position - lastPosition;
 
     assert delta >= 0: "position=" + position + " lastPosition=" + lastPosition;            // not quite right (if pos=0 is repeated twice we don't catch it)
 
     lastPosition = position;
 
+    int payloadLength = 0;
+
     if (storePayloads) {
-      final int payloadLength = payload == null ? 0 : payload.length;
+      payloadLength = payload == null ? 0 : payload.length;
 
       if (payloadLength != lastPayloadLength) {
         lastPayloadLength = payloadLength;
@@ -230,13 +228,28 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
       } else {
         proxOut.writeVInt(delta << 1);
       }
-
-      if (payloadLength > 0) {
-        proxOut.writeBytes(payload.bytes, payload.offset, payloadLength);
-      }
     } else {
       proxOut.writeVInt(delta);
     }
+
+    if (storeOffsets) {
+      // don't use startOffset - lastEndOffset, because this creates lots of negative vints for synonyms,
+      // and the numbers aren't that much smaller anyways.
+      int offsetDelta = startOffset - lastOffset;
+      int offsetLength = endOffset - startOffset;
+      if (offsetLength != lastOffsetLength) {
+        proxOut.writeVInt(offsetDelta << 1 | 1);
+        proxOut.writeVInt(offsetLength);
+      } else {
+        proxOut.writeVInt(offsetDelta << 1);
+      }
+      lastOffset = startOffset;
+      lastOffsetLength = offsetLength;
+    }
+
+    if (payloadLength > 0) {
+      proxOut.writeBytes(payload.bytes, payload.offset, payloadLength);
+    }
   }
 
   @Override
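The synonym comment deserves a concrete number. With a token stacked at the same position (position increment 0), a delta against the previous endOffset goes negative, and Lucene vints encode any negative int in five bytes; start-to-start deltas stay non-negative (hypothetical offsets for illustration):

// Text "the quick fox", with synonym "fast" stacked on "quick" (same position, same offsets):
//   "quick": startOffset=4, endOffset=9
//   "fast":  startOffset=4, endOffset=9
// delta vs. last endOffset:   4 - 9 = -5  -> negative, a 5-byte vint
// delta vs. last startOffset: 4 - 4 =  0  -> one small non-negative vint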
"null" : (payload.length + " bytes")) + " proxFP=" + proxOut.getFilePointer()); - assert indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS: "invalid indexOptions: " + indexOptions; + assert indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 : "invalid indexOptions: " + indexOptions; assert proxOut != null; - // TODO: when we add offsets... often - // endOffset-startOffset will be constant or near - // constant for all docs (eg if the term wasn't stemmed - // then this will usually be the utf16 length of the - // term); would be nice to write that length once up - // front and then not encode endOffset for each - // position.. - final int delta = position - lastPosition; assert delta >= 0: "position=" + position + " lastPosition=" + lastPosition; // not quite right (if pos=0 is repeated twice we don't catch it) lastPosition = position; + int payloadLength = 0; + if (storePayloads) { - final int payloadLength = payload == null ? 0 : payload.length; + payloadLength = payload == null ? 0 : payload.length; if (payloadLength != lastPayloadLength) { lastPayloadLength = payloadLength; @@ -230,13 +228,28 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase { } else { proxOut.writeVInt(delta << 1); } - - if (payloadLength > 0) { - proxOut.writeBytes(payload.bytes, payload.offset, payloadLength); - } } else { proxOut.writeVInt(delta); } + + if (storeOffsets) { + // don't use startOffset - lastEndOffset, because this creates lots of negative vints for synonyms, + // and the numbers aren't that much smaller anyways. + int offsetDelta = startOffset - lastOffset; + int offsetLength = endOffset - startOffset; + if (offsetLength != lastOffsetLength) { + proxOut.writeVInt(offsetDelta << 1 | 1); + proxOut.writeVInt(offsetLength); + } else { + proxOut.writeVInt(offsetDelta << 1); + } + lastOffset = startOffset; + lastOffsetLength = offsetLength; + } + + if (payloadLength > 0) { + proxOut.writeBytes(payload.bytes, payload.offset, payloadLength); + } } @Override @@ -304,7 +317,7 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase { assert firstTerm.skipOffset > 0; bytesWriter.writeVInt(firstTerm.skipOffset); } - if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { + if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) { bytesWriter.writeVLong(firstTerm.proxStart); } long lastFreqStart = firstTerm.freqStart; @@ -319,7 +332,7 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase { assert term.skipOffset > 0; bytesWriter.writeVInt(term.skipOffset); } - if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { + if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) { bytesWriter.writeVLong(term.proxStart - lastProxStart); lastProxStart = term.proxStart; } diff --git a/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40SkipListReader.java b/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40SkipListReader.java index e9fe6c84a68..754b839d7b2 100644 --- a/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40SkipListReader.java +++ b/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40SkipListReader.java @@ -30,13 +30,16 @@ import org.apache.lucene.store.IndexInput; */ public class Lucene40SkipListReader extends MultiLevelSkipListReader { private boolean currentFieldStoresPayloads; + private boolean currentFieldStoresOffsets; private long freqPointer[]; private long proxPointer[]; private int payloadLength[]; + private int offsetLength[]; private 
diff --git a/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40SkipListWriter.java b/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40SkipListWriter.java
index 17937512d62..d06a7214374 100644
--- a/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40SkipListWriter.java
+++ b/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40SkipListWriter.java
@@ -40,7 +40,9 @@ public class Lucene40SkipListWriter extends MultiLevelSkipListWriter {
   private int curDoc;
   private boolean curStorePayloads;
+  private boolean curStoreOffsets;
   private int curPayloadLength;
+  private int curOffsetLength;
   private long curFreqPointer;
   private long curProxPointer;
 
@@ -58,10 +60,12 @@ public class Lucene40SkipListWriter extends MultiLevelSkipListWriter {
   /**
    * Sets the values for the current skip data.
    */
-  public void setSkipData(int doc, boolean storePayloads, int payloadLength) {
+  public void setSkipData(int doc, boolean storePayloads, int payloadLength, boolean storeOffsets, int offsetLength) {
     this.curDoc = doc;
     this.curStorePayloads = storePayloads;
     this.curPayloadLength = payloadLength;
+    this.curStoreOffsets = storeOffsets;
+    this.curOffsetLength = offsetLength;
     this.curFreqPointer = freqOutput.getFilePointer();
     if (proxOutput != null)
       this.curProxPointer = proxOutput.getFilePointer();
@@ -116,6 +120,12 @@ public class Lucene40SkipListWriter extends MultiLevelSkipListWriter {
       // current field does not store payloads
       skipBuffer.writeVInt(curDoc - lastSkipDoc[level]);
     }
+
+    // TODO: not sure it really helps to shove this somewhere else if its the same as the last skip
+    if (curStoreOffsets) {
+      skipBuffer.writeVInt(curOffsetLength);
+    }
+
     skipBuffer.writeVInt((int) (curFreqPointer - lastSkipFreqPointer[level]));
     skipBuffer.writeVInt((int) (curProxPointer - lastSkipProxPointer[level]));
diff --git a/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWFieldsWriter.java b/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWFieldsWriter.java
index 801c20efe19..11aee56637f 100644
--- a/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWFieldsWriter.java
+++ b/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWFieldsWriter.java
@@ -137,7 +137,7 @@ class PreFlexRWFieldsWriter extends FieldsConsumer {
       }
 
       if ((++df % termsOut.skipInterval) == 0) {
-        skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength);
+        skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength, false, 0);
         skipListWriter.bufferSkip(df);
       }
 
diff --git a/lucene/src/test/org/apache/lucene/index/TestPostingsOffsets.java b/lucene/src/test/org/apache/lucene/index/TestPostingsOffsets.java
index 67bcaa1e190..92b9edc8fb7 100644
--- a/lucene/src/test/org/apache/lucene/index/TestPostingsOffsets.java
+++ b/lucene/src/test/org/apache/lucene/index/TestPostingsOffsets.java
@@ -22,29 +22,46 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CannedAnalyzer;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockPayloadAnalyzer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.NumericField;
+import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
+import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.FieldCache;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.English;
 import org.apache.lucene.util.LuceneTestCase;
-import org.junit.Assume;
+import org.apache.lucene.util._TestUtil;
 
 public class TestPostingsOffsets extends LuceneTestCase {
+  IndexWriterConfig iwc;
+
+  public void setUp() throws Exception {
+    super.setUp();
+    // Currently only SimpleText and Lucene40 can index offsets into postings:
+    assumeTrue("codec does not support offsets", Codec.getDefault().getName().equals("SimpleText") || Codec.getDefault().getName().equals("Lucene40"));
+    iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random));
+
+    if (Codec.getDefault().getName().equals("Lucene40")) {
+      // pulsing etc are not implemented
+      iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
+    }
+  }
 
   public void testBasic() throws Exception {
-
-    // Currently only SimpleText can index offsets into postings:
-    Assume.assumeTrue(Codec.getDefault().getName().equals("SimpleText"));
-
     Directory dir = newDirectory();
-    RandomIndexWriter w = new RandomIndexWriter(random, dir);
+
+    RandomIndexWriter w = new RandomIndexWriter(random, dir, iwc);
     Document doc = new Document();
 
     FieldType ft = new FieldType(TextField.TYPE_UNSTORED);
@@ -94,16 +111,117 @@ public class TestPostingsOffsets extends LuceneTestCase {
     r.close();
     dir.close();
   }
+
+  public void testSkipping() throws Exception {
+    doTestNumbers(false);
+  }
+
+  public void testPayloads() throws Exception {
+    doTestNumbers(true);
+  }
+
+  public void doTestNumbers(boolean withPayloads) throws Exception {
+    Directory dir = newDirectory();
+    Analyzer analyzer = withPayloads ? new MockPayloadAnalyzer() : new MockAnalyzer(random);
+    iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
+    if (Codec.getDefault().getName().equals("Lucene40")) {
+      // pulsing etc are not implemented
+      iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
+    }
+    iwc.setMergePolicy(newLogMergePolicy()); // will rely on docids a bit for skipping
+    RandomIndexWriter w = new RandomIndexWriter(random, dir, iwc);
+
+    FieldType ft = new FieldType(TextField.TYPE_STORED);
+    ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    if (random.nextBoolean()) {
+      ft.setStoreTermVectors(true);
+      ft.setStoreTermVectorOffsets(random.nextBoolean());
+      ft.setStoreTermVectorPositions(random.nextBoolean());
+    }
+
+    int numDocs = atLeast(500);
+    for (int i = 0; i < numDocs; i++) {
+      Document doc = new Document();
+      doc.add(new Field("numbers", English.intToEnglish(i), ft));
+      doc.add(new Field("oddeven", (i % 2) == 0 ? "even" : "odd", ft));
+      doc.add(new StringField("id", "" + i));
+      w.addDocument(doc);
+    }
+
+    IndexReader reader = w.getReader();
+    w.close();
+
+    String terms[] = { "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "hundred" };
+
+    for (String term : terms) {
+      DocsAndPositionsEnum dp = MultiFields.getTermPositionsEnum(reader, null, "numbers", new BytesRef(term), true);
+      int doc;
+      while((doc = dp.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
+        String storedNumbers = reader.document(doc).get("numbers");
+        int freq = dp.freq();
+        for (int i = 0; i < freq; i++) {
+          dp.nextPosition();
+          int start = dp.startOffset();
+          assert start >= 0;
+          int end = dp.endOffset();
+          assert end >= 0 && end >= start;
+          // check that the offsets correspond to the term in the src text
+          assertTrue(storedNumbers.substring(start, end).equals(term));
+          if (withPayloads) {
+            // check that we have a payload and it starts with "pos"
+            assertTrue(dp.hasPayload());
+            BytesRef payload = dp.getPayload();
+            assertTrue(payload.utf8ToString().startsWith("pos:"));
+          } // note: withPayloads=false doesnt necessarily mean we dont have them from MockAnalyzer!
+        }
+      }
+    }
+ } + } + } + + // check we can skip correctly + int numSkippingTests = atLeast(50); + + for (int j = 0; j < numSkippingTests; j++) { + int num = _TestUtil.nextInt(random, 100, Math.min(numDocs-1, 999)); + DocsAndPositionsEnum dp = MultiFields.getTermPositionsEnum(reader, null, "numbers", new BytesRef("hundred"), true); + int doc = dp.advance(num); + assertEquals(num, doc); + int freq = dp.freq(); + for (int i = 0; i < freq; i++) { + String storedNumbers = reader.document(doc).get("numbers"); + dp.nextPosition(); + int start = dp.startOffset(); + assert start >= 0; + int end = dp.endOffset(); + assert end >= 0 && end >= start; + // check that the offsets correspond to the term in the src text + assertTrue(storedNumbers.substring(start, end).equals("hundred")); + if (withPayloads) { + // check that we have a payload and it starts with "pos" + assertTrue(dp.hasPayload()); + BytesRef payload = dp.getPayload(); + assertTrue(payload.utf8ToString().startsWith("pos:")); + } // note: withPayloads=false doesnt necessarily mean we dont have them from MockAnalyzer! + } + } + + // check that other fields (without offsets) work correctly + + for (int i = 0; i < numDocs; i++) { + DocsEnum dp = MultiFields.getTermDocsEnum(reader, null, "id", new BytesRef("" + i), false); + assertEquals(i, dp.nextDoc()); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, dp.nextDoc()); + } + + reader.close(); + dir.close(); + } public void testRandom() throws Exception { - // Currently only SimpleText can index offsets into postings: - Assume.assumeTrue(Codec.getDefault().getName().equals("SimpleText")); - // token -> docID -> tokens final Map>> actualTokens = new HashMap>>(); Directory dir = newDirectory(); - RandomIndexWriter w = new RandomIndexWriter(random, dir); + RandomIndexWriter w = new RandomIndexWriter(random, dir, iwc); final int numDocs = atLeast(20); //final int numDocs = atLeast(5);