LUCENE-4225: BlockPostingsFormat

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/pforcodec_3892@1363421 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2012-07-19 16:53:58 +00:00
parent cf36fb9a58
commit 1e49670a55
12 changed files with 3449 additions and 6 deletions

View File

@ -24,7 +24,8 @@
</subant> </subant>
</target> </target>
<target name="test" description="Test both Lucene and Solr" depends="validate"> <!-- nocommit put depends="validate" back -->
<target name="test" description="Test both Lucene and Solr">
<sequential> <sequential>
<subant target="test" inheritall="false" failonerror="true"> <subant target="test" inheritall="false" failonerror="true">
<fileset dir="lucene" includes="build.xml" /> <fileset dir="lucene" includes="build.xml" />

View File

@ -724,7 +724,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
// Write term stats, to separate byte[] blob: // Write term stats, to separate byte[] blob:
bytesWriter2.writeVInt(term.stats.docFreq); bytesWriter2.writeVInt(term.stats.docFreq);
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) { if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
assert term.stats.totalTermFreq >= term.stats.docFreq; assert term.stats.totalTermFreq >= term.stats.docFreq: term.stats.totalTermFreq + " vs " + term.stats.docFreq;
bytesWriter2.writeVLong(term.stats.totalTermFreq - term.stats.docFreq); bytesWriter2.writeVLong(term.stats.totalTermFreq - term.stats.docFreq);
} }
} }

View File

@ -0,0 +1,108 @@
package org.apache.lucene.codecs.block;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.BlockTreeTermsReader;
import org.apache.lucene.codecs.BlockTreeTermsWriter;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
/**
* Pass ForFactory to a PostingsWriter/ReaderBase, and get
* customized postings format plugged.
*/
public final class BlockPostingsFormat extends PostingsFormat {
public static final String DOC_EXTENSION = "doc";
public static final String POS_EXTENSION = "pos";
public static final String PAY_EXTENSION = "pay";
private final int minTermBlockSize;
private final int maxTermBlockSize;
public final static int DEFAULT_BLOCK_SIZE = 128;
public BlockPostingsFormat() {
this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
}
public BlockPostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
super("Block");
this.minTermBlockSize = minTermBlockSize;
assert minTermBlockSize > 1;
this.maxTermBlockSize = maxTermBlockSize;
assert minTermBlockSize <= maxTermBlockSize;
}
@Override
public String toString() {
return getName() + "(blocksize=" + DEFAULT_BLOCK_SIZE + ")";
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
// TODO: implement a new PostingsWriterBase to improve skip-settings
PostingsWriterBase postingsWriter = new BlockPostingsWriter(state, 128);
boolean success = false;
try {
FieldsConsumer ret = new BlockTreeTermsWriter(state,
postingsWriter,
minTermBlockSize,
maxTermBlockSize);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsWriter);
}
}
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new BlockPostingsReader(state.dir,
state.fieldInfos,
state.segmentInfo,
state.context,
state.segmentSuffix,
128);
boolean success = false;
try {
FieldsProducer ret = new BlockTreeTermsReader(state.dir,
state.fieldInfos,
state.segmentInfo.name,
postingsReader,
state.context,
state.segmentSuffix,
state.termsIndexDivisor);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsReader);
}
}
}
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,574 @@
package org.apache.lucene.codecs.block;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.IntBuffer;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.codecs.pfor.ForUtil; // nocommit move here?
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
// nocommit javadocs
public final class BlockPostingsWriter extends PostingsWriterBase {
private boolean DEBUG = BlockPostingsReader.DEBUG;
// nocommit move these constants to the PF:
static final int maxSkipLevels = 10;
final static String TERMS_CODEC = "BlockPostingsWriterTerms";
final static String DOC_CODEC = "BlockPostingsWriterDoc";
final static String POS_CODEC = "BlockPostingsWriterPos";
final static String PAY_CODEC = "BlockPostingsWriterPay";
// Increment version to change it:
final static int VERSION_START = 0;
final static int VERSION_CURRENT = VERSION_START;
final IndexOutput docOut;
final IndexOutput posOut;
final IndexOutput payOut;
static final int DEFAULT_BLOCK_SIZE = 128;
final int blockSize;
private IndexOutput termsOut;
// How current field indexes postings:
private boolean fieldHasFreqs;
private boolean fieldHasPositions;
private boolean fieldHasOffsets;
private boolean fieldHasPayloads;
// Holds starting file pointers for each term:
private long docTermStartFP;
private long posTermStartFP;
private long payTermStartFP;
final int[] docDeltaBuffer;
final int[] freqBuffer;
private int docBufferUpto;
final int[] posDeltaBuffer;
final int[] payloadLengthBuffer;
final int[] offsetStartDeltaBuffer;
final int[] offsetLengthBuffer;
private int posBufferUpto;
private byte[] payloadBytes;
private int payloadByteUpto;
private int lastBlockDocID;
private boolean saveNextPosBlock;
private long lastBlockPosFP;
private long lastBlockPayFP;
private int lastBlockPosBufferUpto;
private int lastBlockEndOffset;
private int lastBlockPayloadByteUpto;
private int lastDocID;
private int lastPosition;
private int lastEndOffset;
private int docCount;
final byte[] encoded;
final IntBuffer encodedBuffer;
private final BlockSkipWriter skipWriter;
public BlockPostingsWriter(SegmentWriteState state, int blockSize) throws IOException {
super();
this.blockSize = blockSize;
docOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPostingsFormat.DOC_EXTENSION),
state.context);
IndexOutput posOut = null;
IndexOutput payOut = null;
boolean success = false;
try {
CodecUtil.writeHeader(docOut, DOC_CODEC, VERSION_CURRENT);
if (state.fieldInfos.hasProx()) {
posDeltaBuffer = new int[blockSize];
posOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPostingsFormat.POS_EXTENSION),
state.context);
CodecUtil.writeHeader(posOut, POS_CODEC, VERSION_CURRENT);
if (state.fieldInfos.hasPayloads()) {
payloadBytes = new byte[128];
payloadLengthBuffer = new int[blockSize];
} else {
payloadBytes = null;
payloadLengthBuffer = null;
}
if (state.fieldInfos.hasOffsets()) {
offsetStartDeltaBuffer = new int[blockSize];
offsetLengthBuffer = new int[blockSize];
} else {
offsetStartDeltaBuffer = null;
offsetLengthBuffer = null;
}
if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) {
payOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPostingsFormat.PAY_EXTENSION),
state.context);
CodecUtil.writeHeader(payOut, PAY_CODEC, VERSION_CURRENT);
}
} else {
posDeltaBuffer = null;
payloadLengthBuffer = null;
offsetStartDeltaBuffer = null;
offsetLengthBuffer = null;
payloadBytes = null;
}
this.payOut = payOut;
this.posOut = posOut;
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(docOut, posOut, payOut);
}
}
docDeltaBuffer = new int[blockSize];
freqBuffer = new int[blockSize];
skipWriter = new BlockSkipWriter(blockSize,
maxSkipLevels,
state.segmentInfo.getDocCount(),
docOut,
posOut,
payOut);
encoded = new byte[blockSize*4 + 4];
encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer();
}
@Override
public void start(IndexOutput termsOut) throws IOException {
this.termsOut = termsOut;
CodecUtil.writeHeader(termsOut, TERMS_CODEC, VERSION_CURRENT);
termsOut.writeVInt(blockSize);
}
@Override
public void setField(FieldInfo fieldInfo) {
IndexOptions indexOptions = fieldInfo.getIndexOptions();
fieldHasFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
fieldHasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
fieldHasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
fieldHasPayloads = fieldInfo.hasPayloads();
skipWriter.setField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads);
}
@Override
public void startTerm() {
docTermStartFP = docOut.getFilePointer();
if (fieldHasPositions) {
posTermStartFP = posOut.getFilePointer();
if (fieldHasPayloads || fieldHasOffsets) {
payTermStartFP = payOut.getFilePointer();
}
}
lastBlockDocID = -1;
lastDocID = 0;
if (DEBUG) {
System.out.println("FPW.startTerm startFP=" + docTermStartFP);
}
skipWriter.resetSkip();
}
private void writeBlock(int[] buffer, IndexOutput out) throws IOException {
final int header = ForUtil.compress(buffer, encodedBuffer);
//System.out.println(" block has " + numBytes + " bytes");
out.writeVInt(header);
out.writeBytes(encoded, ForUtil.getEncodedSize(header));
}
@Override
public void startDoc(int docID, int termDocFreq) throws IOException {
if (DEBUG) {
System.out.println("FPW.startDoc docID=" + docID);
}
// nocommit do this in finishDoc... but does it fail...?
// is it not always called...?
if (posOut != null && saveNextPosBlock) {
lastBlockPosFP = posOut.getFilePointer();
if (payOut != null) {
lastBlockPayFP = payOut.getFilePointer();
}
lastBlockPosBufferUpto = posBufferUpto;
lastBlockEndOffset = lastEndOffset;
lastBlockPayloadByteUpto = payloadByteUpto;
saveNextPosBlock = false;
if (DEBUG) {
System.out.println(" now save lastBlockPosFP=" + lastBlockPosFP + " lastBlockPosBufferUpto=" + lastBlockPosBufferUpto + " lastBlockPayloadByteUpto=" + lastBlockPayloadByteUpto);
}
}
final int docDelta = docID - lastDocID;
if (docID < 0 || (docCount > 0 && docDelta <= 0)) {
throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " ) (docOut: " + docOut + ")");
}
lastDocID = docID;
docDeltaBuffer[docBufferUpto] = docDelta;
if (DEBUG) {
System.out.println(" docDeltaBuffer[" + docBufferUpto + "]=" + docDelta);
}
if (fieldHasFreqs) {
freqBuffer[docBufferUpto] = termDocFreq;
}
docBufferUpto++;
docCount++;
if (docBufferUpto == blockSize) {
// nocommit maybe instead of buffering skip before
// writing a block based on last block's end data
// ... we could buffer after writing the block? only
// iffiness with that approach is it could be a
// pointlness skip? like we may stop adding docs
// right after that, then we have skip point AFTER
// last doc. the thing is, in finishTerm we are
// already sometimes adding a skip point AFTER the
// last doc?
if (lastBlockDocID != -1) {
if (DEBUG) {
System.out.println(" bufferSkip at writeBlock: lastDocID=" + lastBlockDocID + " docCount=" + (docCount-blockSize));
}
skipWriter.bufferSkip(lastBlockDocID, docCount-blockSize, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockEndOffset, lastBlockPayloadByteUpto);
}
lastBlockDocID = docID;
saveNextPosBlock = true;
if (DEBUG) {
System.out.println(" write docDelta block @ fp=" + docOut.getFilePointer());
}
writeBlock(docDeltaBuffer, docOut);
if (fieldHasFreqs) {
if (DEBUG) {
System.out.println(" write freq block @ fp=" + docOut.getFilePointer());
}
writeBlock(freqBuffer, docOut);
}
docBufferUpto = 0;
}
lastPosition = 0;
lastEndOffset = 0;
}
/** Add a new position & payload */
@Override
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
if (DEBUG) {
System.out.println("FPW.addPosition pos=" + position + " posBufferUpto=" + posBufferUpto + (fieldHasPayloads ? " payloadByteUpto=" + payloadByteUpto: ""));
}
posDeltaBuffer[posBufferUpto] = position - lastPosition;
if (fieldHasPayloads) {
if (payload == null || payload.length == 0) {
// no payload
payloadLengthBuffer[posBufferUpto] = 0;
} else {
payloadLengthBuffer[posBufferUpto] = payload.length;
if (payloadByteUpto + payload.length > payloadBytes.length) {
payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payload.length);
}
System.arraycopy(payload.bytes, payload.offset, payloadBytes, payloadByteUpto, payload.length);
payloadByteUpto += payload.length;
}
}
if (fieldHasOffsets) {
assert startOffset >= lastEndOffset;
assert endOffset >= startOffset;
offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastEndOffset;
offsetLengthBuffer[posBufferUpto] = endOffset - startOffset;
lastEndOffset = endOffset;
}
posBufferUpto++;
lastPosition = position;
if (posBufferUpto == blockSize) {
if (DEBUG) {
System.out.println(" write pos bulk block @ fp=" + posOut.getFilePointer());
}
writeBlock(posDeltaBuffer, posOut);
if (fieldHasPayloads) {
writeBlock(payloadLengthBuffer, payOut);
payOut.writeVInt(payloadByteUpto);
payOut.writeBytes(payloadBytes, 0, payloadByteUpto);
payloadByteUpto = 0;
}
if (fieldHasOffsets) {
writeBlock(offsetStartDeltaBuffer, payOut);
writeBlock(offsetLengthBuffer, payOut);
}
posBufferUpto = 0;
}
}
@Override
public void finishDoc() {
}
private static class PendingTerm {
public final long docStartFP;
public final long posStartFP;
public final long payStartFP;
public final int skipOffset;
public final int lastPosBlockOffset;
public PendingTerm(long docStartFP, long posStartFP, long payStartFP, int skipOffset, int lastPosBlockOffset) {
this.docStartFP = docStartFP;
this.posStartFP = posStartFP;
this.payStartFP = payStartFP;
this.skipOffset = skipOffset;
this.lastPosBlockOffset = lastPosBlockOffset;
}
}
private final List<PendingTerm> pendingTerms = new ArrayList<PendingTerm>();
/** Called when we are done adding docs to this term */
@Override
public void finishTerm(TermStats stats) throws IOException {
assert stats.docFreq > 0;
// TODO: wasteful we are counting this (counting # docs
// for this term) in two places?
assert stats.docFreq == docCount: stats.docFreq + " vs " + docCount;
if (DEBUG) {
System.out.println("FPW.finishTerm docFreq=" + stats.docFreq);
}
// nocommit silly that skipper must write skip when we no
// postings come after it, but if we don't do this, skip
// reader incorrectly thinks it can read another level 0
// skip entry here!:
//if (docCount > blockSize && docBufferUpto > 0) {
if (docCount > blockSize) {
final int lastDocCount = blockSize*(docCount/blockSize);
if (DEBUG) {
System.out.println(" bufferSkip at finishTerm: lastDocID=" + lastBlockDocID + " docCount=" + lastDocCount);
}
skipWriter.bufferSkip(lastBlockDocID, lastDocCount, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockEndOffset, lastBlockPayloadByteUpto);
}
if (DEBUG) {
if (docBufferUpto > 0) {
System.out.println(" write doc/freq vInt block (count=" + docBufferUpto + ") at fp=" + docOut.getFilePointer() + " docTermStartFP=" + docTermStartFP);
}
}
// vInt encode the remaining doc deltas and freqs:
for(int i=0;i<docBufferUpto;i++) {
final int docDelta = docDeltaBuffer[i];
final int freq = freqBuffer[i];
if (!fieldHasFreqs) {
docOut.writeVInt(docDelta);
} else if (freqBuffer[i] == 1) {
docOut.writeVInt((docDelta<<1)|1);
} else {
docOut.writeVInt(docDelta<<1);
docOut.writeVInt(freq);
}
}
final int lastPosBlockOffset;
if (fieldHasPositions) {
if (DEBUG) {
if (posBufferUpto > 0) {
System.out.println(" write pos vInt block (count=" + posBufferUpto + ") at fp=" + posOut.getFilePointer() + " posTermStartFP=" + posTermStartFP + " hasPayloads=" + fieldHasPayloads + " hasOffsets=" + fieldHasOffsets);
}
}
assert stats.totalTermFreq != -1;
if (stats.totalTermFreq > blockSize) {
lastPosBlockOffset = (int) (posOut.getFilePointer() - posTermStartFP);
} else {
lastPosBlockOffset = -1;
}
if (posBufferUpto > 0) {
posOut.writeVInt(posBufferUpto);
// nocommit should we send offsets/payloads to
// .pay...? seems wasteful (have to store extra
// vLong for low (< blockSize) DF terms = vast vast
// majority)
// vInt encode the remaining positions/payloads/offsets:
int lastPayloadLength = -1;
int payloadBytesReadUpto = 0;
for(int i=0;i<posBufferUpto;i++) {
final int posDelta = posDeltaBuffer[i];
if (fieldHasPayloads) {
final int payloadLength = payloadLengthBuffer[i];
if (payloadLength != lastPayloadLength) {
lastPayloadLength = payloadLength;
posOut.writeVInt((posDelta<<1)|1);
posOut.writeVInt(payloadLength);
} else {
posOut.writeVInt(posDelta<<1);
}
if (DEBUG) {
System.out.println(" i=" + i + " payloadLen=" + payloadLength);
}
if (payloadLength != 0) {
if (DEBUG) {
System.out.println(" write payload @ pos.fp=" + posOut.getFilePointer());
}
posOut.writeBytes(payloadBytes, payloadBytesReadUpto, payloadLength);
payloadBytesReadUpto += payloadLength;
}
} else {
posOut.writeVInt(posDelta);
}
if (fieldHasOffsets) {
if (DEBUG) {
System.out.println(" write offset @ pos.fp=" + posOut.getFilePointer());
}
posOut.writeVInt(offsetStartDeltaBuffer[i]);
posOut.writeVInt(offsetLengthBuffer[i]);
}
}
if (fieldHasPayloads) {
assert payloadBytesReadUpto == payloadByteUpto;
payloadByteUpto = 0;
}
}
if (DEBUG) {
System.out.println(" totalTermFreq=" + stats.totalTermFreq + " lastPosBlockOffset=" + lastPosBlockOffset);
}
} else {
lastPosBlockOffset = -1;
}
int skipOffset;
if (docCount > blockSize) {
skipOffset = (int) (skipWriter.writeSkip(docOut)-docTermStartFP);
if (DEBUG) {
System.out.println("skip packet " + (docOut.getFilePointer() - (docTermStartFP + skipOffset)) + " bytes");
}
} else {
skipOffset = -1;
if (DEBUG) {
System.out.println(" no skip: docCount=" + docCount);
}
}
long payStartFP;
if (stats.totalTermFreq >= blockSize) {
payStartFP = payTermStartFP;
} else {
payStartFP = -1;
}
if (DEBUG) {
System.out.println(" payStartFP=" + payStartFP);
}
pendingTerms.add(new PendingTerm(docTermStartFP, posTermStartFP, payStartFP, skipOffset, lastPosBlockOffset));
docBufferUpto = 0;
posBufferUpto = 0;
lastDocID = 0;
docCount = 0;
}
private final RAMOutputStream bytesWriter = new RAMOutputStream();
@Override
public void flushTermsBlock(int start, int count) throws IOException {
if (count == 0) {
termsOut.writeByte((byte) 0);
return;
}
assert start <= pendingTerms.size();
assert count <= start;
final int limit = pendingTerms.size() - start + count;
long lastDocStartFP = 0;
long lastPosStartFP = 0;
long lastPayStartFP = 0;
for(int idx=limit-count; idx<limit; idx++) {
PendingTerm term = pendingTerms.get(idx);
bytesWriter.writeVLong(term.docStartFP - lastDocStartFP);
lastDocStartFP = term.docStartFP;
if (fieldHasPositions) {
bytesWriter.writeVLong(term.posStartFP - lastPosStartFP);
lastPosStartFP = term.posStartFP;
if (term.lastPosBlockOffset != -1) {
bytesWriter.writeVInt(term.lastPosBlockOffset);
}
if ((fieldHasPayloads || fieldHasOffsets) && term.payStartFP != -1) {
bytesWriter.writeVLong(term.payStartFP - lastPayStartFP);
lastPayStartFP = term.payStartFP;
}
}
if (term.skipOffset != -1) {
bytesWriter.writeVInt(term.skipOffset);
}
}
termsOut.writeVInt((int) bytesWriter.getFilePointer());
bytesWriter.writeTo(termsOut);
bytesWriter.reset();
// Remove the terms we just wrote:
pendingTerms.subList(limit-count, limit).clear();
}
@Override
public void close() throws IOException {
IOUtils.close(docOut, posOut, payOut);
}
}

View File

@ -0,0 +1,205 @@
package org.apache.lucene.codecs.block;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.codecs.MultiLevelSkipListReader;
import org.apache.lucene.store.IndexInput;
/**
* Implements the skip list reader for the 4.0 posting list format
* that stores positions and payloads.
*
* @see Lucene40PostingsFormat
* @lucene.experimental
*/
final class BlockSkipReader extends MultiLevelSkipListReader {
private boolean DEBUG = BlockPostingsReader.DEBUG;
private long docPointer[];
private long posPointer[];
private long payPointer[];
private int posBufferUpto[];
private int endOffset[];
private int payloadByteUpto[];
private long lastPosPointer;
private long lastPayPointer;
private int lastEndOffset;
private int lastPayloadByteUpto;
private long lastDocPointer;
private int lastPosBufferUpto;
public BlockSkipReader(IndexInput skipStream, int maxSkipLevels, int skipInterval, boolean hasPos, boolean hasOffsets, boolean hasPayloads) {
super(skipStream, maxSkipLevels, skipInterval);
docPointer = new long[maxSkipLevels];
if (hasPos) {
posPointer = new long[maxSkipLevels];
posBufferUpto = new int[maxSkipLevels];
if (hasPayloads) {
payloadByteUpto = new int[maxSkipLevels];
} else {
payloadByteUpto = null;
}
if (hasOffsets) {
endOffset = new int[maxSkipLevels];
} else {
endOffset = null;
}
if (hasOffsets || hasPayloads) {
payPointer = new long[maxSkipLevels];
} else {
payPointer = null;
}
} else {
posPointer = null;
}
}
public void init(long skipPointer, long docBasePointer, long posBasePointer, long payBasePointer, int df) {
super.init(skipPointer, df);
lastDocPointer = docBasePointer;
lastPosPointer = posBasePointer;
lastPayPointer = payBasePointer;
Arrays.fill(docPointer, docBasePointer);
if (posPointer != null) {
Arrays.fill(posPointer, posBasePointer);
if (payPointer != null) {
Arrays.fill(payPointer, payBasePointer);
}
} else {
assert posBasePointer == 0;
}
}
/** Returns the doc pointer of the doc to which the last call of
* {@link MultiLevelSkipListReader#skipTo(int)} has skipped. */
public long getDocPointer() {
return lastDocPointer;
}
public long getPosPointer() {
return lastPosPointer;
}
public int getPosBufferUpto() {
return lastPosBufferUpto;
}
public long getPayPointer() {
return lastPayPointer;
}
public int getEndOffset() {
return lastEndOffset;
}
public int getPayloadByteUpto() {
return lastPayloadByteUpto;
}
@Override
protected void seekChild(int level) throws IOException {
super.seekChild(level);
if (DEBUG) {
System.out.println("seekChild level=" + level);
}
docPointer[level] = lastDocPointer;
if (posPointer != null) {
posPointer[level] = lastPosPointer;
posBufferUpto[level] = lastPosBufferUpto;
if (endOffset != null) {
endOffset[level] = lastEndOffset;
}
if (payloadByteUpto != null) {
payloadByteUpto[level] = lastPayloadByteUpto;
}
if (payPointer != null) {
payPointer[level] = lastPayPointer;
}
}
}
@Override
protected void setLastSkipData(int level) {
super.setLastSkipData(level);
lastDocPointer = docPointer[level];
if (DEBUG) {
System.out.println("setLastSkipData level=" + level);
System.out.println(" lastDocPointer=" + lastDocPointer);
}
if (posPointer != null) {
lastPosPointer = posPointer[level];
lastPosBufferUpto = posBufferUpto[level];
if (DEBUG) {
System.out.println(" lastPosPointer=" + lastPosPointer + " lastPosBUfferUpto=" + lastPosBufferUpto);
}
if (payPointer != null) {
lastPayPointer = payPointer[level];
}
if (endOffset != null) {
lastEndOffset = endOffset[level];
}
if (payloadByteUpto != null) {
lastPayloadByteUpto = payloadByteUpto[level];
}
}
}
@Override
protected int readSkipData(int level, IndexInput skipStream) throws IOException {
if (DEBUG) {
System.out.println("readSkipData level=" + level);
}
int delta = skipStream.readVInt();
if (DEBUG) {
System.out.println(" delta=" + delta);
}
docPointer[level] += skipStream.readVInt();
if (DEBUG) {
System.out.println(" docFP=" + docPointer[level]);
}
if (posPointer != null) {
posPointer[level] += skipStream.readVInt();
if (DEBUG) {
System.out.println(" posFP=" + posPointer[level]);
}
posBufferUpto[level] = skipStream.readVInt();
if (DEBUG) {
System.out.println(" posBufferUpto=" + posBufferUpto[level]);
}
if (payloadByteUpto != null) {
payloadByteUpto[level] = skipStream.readVInt();
}
if (endOffset != null) {
endOffset[level] += skipStream.readVInt();
}
if (payPointer != null) {
payPointer[level] += skipStream.readVInt();
}
}
return delta;
}
}

View File

@ -0,0 +1,147 @@
package org.apache.lucene.codecs.block;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.codecs.MultiLevelSkipListWriter;
// nocommit do we need more frequent skips at level > 0?
// 128*128 is immense? may need to decouple
// baseSkipInterval & theRestSkipInterval?
final class BlockSkipWriter extends MultiLevelSkipListWriter {
private boolean DEBUG = BlockPostingsReader.DEBUG;
private int[] lastSkipDoc;
private long[] lastSkipDocPointer;
private long[] lastSkipPosPointer;
private long[] lastSkipPayPointer;
private int[] lastEndOffset;
private int[] lastPayloadByteUpto;
private final IndexOutput docOut;
private final IndexOutput posOut;
private final IndexOutput payOut;
private int curDoc;
private long curDocPointer;
private long curPosPointer;
private long curPayPointer;
private int curPosBufferUpto;
private int curEndOffset;
private int curPayloadByteUpto;
private boolean fieldHasPositions;
private boolean fieldHasOffsets;
private boolean fieldHasPayloads;
public BlockSkipWriter(int skipInterval, int maxSkipLevels, int docCount, IndexOutput docOut, IndexOutput posOut, IndexOutput payOut) {
super(skipInterval, maxSkipLevels, docCount);
this.docOut = docOut;
this.posOut = posOut;
this.payOut = payOut;
lastSkipDoc = new int[maxSkipLevels];
lastSkipDocPointer = new long[maxSkipLevels];
if (posOut != null) {
lastSkipPosPointer = new long[maxSkipLevels];
if (payOut != null) {
lastSkipPayPointer = new long[maxSkipLevels];
}
lastEndOffset = new int[maxSkipLevels];
lastPayloadByteUpto = new int[maxSkipLevels];
}
}
public void setField(boolean fieldHasPositions, boolean fieldHasOffsets, boolean fieldHasPayloads) {
this.fieldHasPositions = fieldHasPositions;
this.fieldHasOffsets = fieldHasOffsets;
this.fieldHasPayloads = fieldHasPayloads;
}
@Override
public void resetSkip() {
super.resetSkip();
Arrays.fill(lastSkipDoc, 0);
Arrays.fill(lastSkipDocPointer, docOut.getFilePointer());
if (fieldHasPositions) {
Arrays.fill(lastSkipPosPointer, posOut.getFilePointer());
if (fieldHasOffsets) {
Arrays.fill(lastEndOffset, 0);
}
if (fieldHasPayloads) {
Arrays.fill(lastPayloadByteUpto, 0);
}
if (fieldHasOffsets || fieldHasPayloads) {
Arrays.fill(lastSkipPayPointer, payOut.getFilePointer());
}
}
}
/**
* Sets the values for the current skip data.
*/
public void bufferSkip(int doc, int numDocs, long posFP, long payFP, int posBufferUpto, int endOffset, int payloadByteUpto) throws IOException {
this.curDoc = doc;
this.curDocPointer = docOut.getFilePointer();
this.curPosPointer = posFP;
this.curPayPointer = payFP;
this.curPosBufferUpto = posBufferUpto;
this.curPayloadByteUpto = payloadByteUpto;
this.curEndOffset = endOffset;
bufferSkip(numDocs);
}
@Override
protected void writeSkipData(int level, IndexOutput skipBuffer) throws IOException {
int delta = curDoc - lastSkipDoc[level];
if (DEBUG) {
System.out.println("writeSkipData level=" + level + " lastDoc=" + curDoc + " delta=" + delta + " curDocPointer=" + curDocPointer);
}
skipBuffer.writeVInt(delta);
lastSkipDoc[level] = curDoc;
skipBuffer.writeVInt((int) (curDocPointer - lastSkipDocPointer[level]));
lastSkipDocPointer[level] = curDocPointer;
if (fieldHasPositions) {
if (DEBUG) {
System.out.println(" curPosPointer=" + curPosPointer + " curPosBufferUpto=" + curPosBufferUpto);
}
skipBuffer.writeVInt((int) (curPosPointer - lastSkipPosPointer[level]));
lastSkipPosPointer[level] = curPosPointer;
skipBuffer.writeVInt(curPosBufferUpto);
if (fieldHasPayloads) {
skipBuffer.writeVInt(curPayloadByteUpto);
}
if (fieldHasOffsets) {
skipBuffer.writeVInt(curEndOffset - lastEndOffset[level]);
lastEndOffset[level] = curEndOffset;
}
if (fieldHasOffsets || fieldHasPayloads) {
skipBuffer.writeVInt((int) (curPayPointer - lastSkipPayPointer[level]));
lastSkipPayPointer[level] = curPayPointer;
}
}
}
}

View File

@ -185,8 +185,6 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
int lastDocID; int lastDocID;
int df; int df;
/** Adds a new doc in this term. If this returns null
* then we just skip consuming positions/payloads. */
@Override @Override
public void startDoc(int docID, int termDocFreq) throws IOException { public void startDoc(int docID, int termDocFreq) throws IOException {
// if (DEBUG) System.out.println("SPW: startDoc seg=" + segment + " docID=" + docID + " tf=" + termDocFreq + " freqOut.fp=" + freqOut.getFilePointer()); // if (DEBUG) System.out.println("SPW: startDoc seg=" + segment + " docID=" + docID + " tf=" + termDocFreq + " freqOut.fp=" + freqOut.getFilePointer());

View File

@ -17,8 +17,6 @@ package org.apache.lucene.codecs.pfor;
*/ */
import java.nio.IntBuffer; import java.nio.IntBuffer;
import java.nio.ByteBuffer;
import java.util.Arrays;
/** /**
* Encode all values in normal area with fixed bit width, * Encode all values in normal area with fixed bit width,
@ -73,6 +71,9 @@ public class ForUtil {
// since this buffer is reused at upper level, rewind first // since this buffer is reused at upper level, rewind first
intBuffer.rewind(); intBuffer.rewind();
// nocommit assert header isn't "malformed", ie besides
// numBytes / bit-width there is nothing else!
int numBits = ((header >> 8) & MASK[6]); int numBits = ((header >> 8) & MASK[6]);
decompressCore(intBuffer, data, numBits); decompressCore(intBuffer, data, numBits);

View File

@ -34,6 +34,8 @@ import org.apache.lucene.index.FieldInfo.IndexOptions;
public class FieldInfos implements Iterable<FieldInfo> { public class FieldInfos implements Iterable<FieldInfo> {
private final boolean hasFreq; private final boolean hasFreq;
private final boolean hasProx; private final boolean hasProx;
private final boolean hasPayloads;
private final boolean hasOffsets;
private final boolean hasVectors; private final boolean hasVectors;
private final boolean hasNorms; private final boolean hasNorms;
private final boolean hasDocValues; private final boolean hasDocValues;
@ -45,6 +47,8 @@ public class FieldInfos implements Iterable<FieldInfo> {
public FieldInfos(FieldInfo[] infos) { public FieldInfos(FieldInfo[] infos) {
boolean hasVectors = false; boolean hasVectors = false;
boolean hasProx = false; boolean hasProx = false;
boolean hasPayloads = false;
boolean hasOffsets = false;
boolean hasFreq = false; boolean hasFreq = false;
boolean hasNorms = false; boolean hasNorms = false;
boolean hasDocValues = false; boolean hasDocValues = false;
@ -58,12 +62,16 @@ public class FieldInfos implements Iterable<FieldInfo> {
hasVectors |= info.hasVectors(); hasVectors |= info.hasVectors();
hasProx |= info.isIndexed() && info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; hasProx |= info.isIndexed() && info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
hasFreq |= info.isIndexed() && info.getIndexOptions() != IndexOptions.DOCS_ONLY; hasFreq |= info.isIndexed() && info.getIndexOptions() != IndexOptions.DOCS_ONLY;
hasOffsets |= info.isIndexed() && info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
hasNorms |= info.hasNorms(); hasNorms |= info.hasNorms();
hasDocValues |= info.hasDocValues(); hasDocValues |= info.hasDocValues();
hasPayloads |= info.hasPayloads();
} }
this.hasVectors = hasVectors; this.hasVectors = hasVectors;
this.hasProx = hasProx; this.hasProx = hasProx;
this.hasPayloads = hasPayloads;
this.hasOffsets = hasOffsets;
this.hasFreq = hasFreq; this.hasFreq = hasFreq;
this.hasNorms = hasNorms; this.hasNorms = hasNorms;
this.hasDocValues = hasDocValues; this.hasDocValues = hasDocValues;
@ -80,6 +88,16 @@ public class FieldInfos implements Iterable<FieldInfo> {
return hasProx; return hasProx;
} }
/** Returns true if any fields have payloads */
public boolean hasPayloads() {
return hasPayloads;
}
/** Returns true if any fields have offsets */
public boolean hasOffsets() {
return hasOffsets;
}
/** /**
* @return true if at least one field has any vectors * @return true if at least one field has any vectors
*/ */

View File

@ -20,3 +20,4 @@ org.apache.lucene.codecs.memory.MemoryPostingsFormat
org.apache.lucene.codecs.pfor.ForPostingsFormat org.apache.lucene.codecs.pfor.ForPostingsFormat
org.apache.lucene.codecs.pfor.PForPostingsFormat org.apache.lucene.codecs.pfor.PForPostingsFormat
org.apache.lucene.codecs.bulkvint.BulkVIntPostingsFormat org.apache.lucene.codecs.bulkvint.BulkVIntPostingsFormat
org.apache.lucene.codecs.block.BlockPostingsFormat

View File

@ -0,0 +1,866 @@
package org.apache.lucene.index;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsConsumer;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.junit.BeforeClass;
/* NOTE: This test focuses on the postings
* (docs/freqs/positions/payloads/offsets) impl, not the
* terms dict. The [stretch] goal is for this test to be
* so thorough in testing a new PostingsFormat that if this
* test passes, then all Lucene/Solr tests should also pass. Ie,
* if there is some bug in a given PostingsFormat that this
* test fails to catch then this test needs to be improved! */
// nocommit can we make it easy for testing to pair up a "random terms dict impl" with your postings base format...
// nocommit test when you reuse after skipping a term or two, eg the block reuse case
// nocommit hmm contract says .doc() can return NO_MORE_DOCS
// before nextDoc too...?
/* TODO
- threads
- assert doc=-1 before any nextDoc
- if a PF passes this test but fails other tests then this
test has a bug!!
- test tricky reuse cases, eg across fields
- verify you get null if you pass needFreq/needOffset but
they weren't indexed
*/
public class TestPostingsFormat extends LuceneTestCase {
private enum Option {
// Sometimes use .advance():
SKIPPING,
// Sometimes reuse the Docs/AndPositionsEnum across terms:
REUSE_ENUMS,
// Sometimes pass non-null live docs:
LIVE_DOCS,
// Sometimes seek to term using previously saved TermState:
TERM_STATE,
// Sometimes don't fully consume docs from the enum
PARTIAL_DOC_CONSUME,
// Sometimes don't fully consume positions at each doc
PARTIAL_POS_CONSUME,
// Sometimes check payloads
PAYLOADS,
// Test w/ multiple threads
THREADS};
private static class FieldAndTerm {
String field;
BytesRef term;
public FieldAndTerm(String field, BytesRef term) {
this.field = field;
this.term = BytesRef.deepCopyOf(term);
}
}
private static class Position {
int position;
byte[] payload;
int startOffset;
int endOffset;
}
private static class Posting implements Comparable<Posting>{
int docID;
List<Position> positions;
public int compareTo(Posting other) {
return docID - other.docID;
}
}
// Holds all postings:
private static Map<String,Map<BytesRef,List<Posting>>> fields = new TreeMap<String,Map<BytesRef,List<Posting>>>();
// Holds only live doc postings:
private static Map<String,Map<BytesRef,List<Posting>>> fieldsLive = new TreeMap<String,Map<BytesRef,List<Posting>>>();
private static FieldInfos fieldInfos;
private static int maxDocID;
private static FixedBitSet globalLiveDocs;
private static List<FieldAndTerm> allTerms;
@BeforeClass
public static void createPostings() throws IOException {
final int numFields = _TestUtil.nextInt(random(), 1, 5);
if (VERBOSE) {
System.out.println("TEST: " + numFields + " fields");
}
FieldInfo[] fieldInfoArray = new FieldInfo[numFields];
int fieldUpto = 0;
int numMediumTerms = 0;
int numBigTerms = 0;
int numManyPositions = 0;
while (fieldUpto < numFields) {
String field = _TestUtil.randomSimpleString(random());
if (fields.containsKey(field)) {
continue;
}
boolean fieldHasPayloads = random().nextBoolean();
fieldInfoArray[fieldUpto] = new FieldInfo(field, true, fieldUpto, false, false, fieldHasPayloads,
IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS,
null, DocValues.Type.FIXED_INTS_8, null);
fieldUpto++;
Map<BytesRef,List<Posting>> postings = new TreeMap<BytesRef,List<Posting>>();
fields.put(field, postings);
Set<String> seenTerms = new HashSet<String>();
// nocommit
//final int numTerms = atLeast(10);
final int numTerms = 4;
for(int termUpto=0;termUpto<numTerms;termUpto++) {
String term = _TestUtil.randomSimpleString(random());
if (seenTerms.contains(term)) {
continue;
}
seenTerms.add(term);
int numDocs;
if (random().nextInt(10) == 3 && numBigTerms < 3) {
// 10% of the time make a highish freq term:
numDocs = _TestUtil.nextInt(random(), 50000, 70000);
numBigTerms++;
term = "big_" + term;
} else if (random().nextInt(10) == 3 && numMediumTerms < 10) {
// 10% of the time make a medium freq term:
// nocommit not high enough to test level 1 skipping:
numDocs = atLeast(3000);
numMediumTerms++;
term = "medium_" + term;
} else {
// Low freq term:
numDocs = _TestUtil.nextInt(random(), 1, 40);
term = "low_" + term;
}
numDocs *= RANDOM_MULTIPLIER;
List<Posting> termPostings = new ArrayList<Posting>();
postings.put(new BytesRef(term), termPostings);
int docID = 0;
// TODO: more realistic to inversely tie this to numDocs:
int maxDocSpacing = _TestUtil.nextInt(random(), 1, 100);
// 10% of the time create big payloads:
int payloadSize;
if (!fieldHasPayloads) {
payloadSize = 0;
} else if (random().nextInt(10) == 7) {
payloadSize = random().nextInt(50);
} else {
payloadSize = random().nextInt(10);
}
boolean fixedPayloads = random().nextBoolean();
for(int docUpto=0;docUpto<numDocs;docUpto++) {
if (docUpto == 0 && random().nextBoolean()) {
// Sometimes index docID = 0
} else if (maxDocSpacing == 1) {
docID++;
} else {
// nocommit: sometimes have a biggish gap here!
docID += _TestUtil.nextInt(random(), 1, maxDocSpacing);
}
Posting posting = new Posting();
posting.docID = docID;
maxDocID = Math.max(docID, maxDocID);
posting.positions = new ArrayList<Position>();
termPostings.add(posting);
int freq;
if (random().nextInt(30) == 17 && numManyPositions < 10) {
freq = _TestUtil.nextInt(random(), 1, 1000);
numManyPositions++;
} else {
freq = _TestUtil.nextInt(random(), 1, 20);
}
int pos = 0;
int offset = 0;
int posSpacing = _TestUtil.nextInt(random(), 1, 100);
for(int posUpto=0;posUpto<freq;posUpto++) {
if (posUpto == 0 && random().nextBoolean()) {
// Sometimes index pos = 0
} else if (posSpacing == 1) {
pos++;
} else {
pos += _TestUtil.nextInt(random(), 1, posSpacing);
}
Position position = new Position();
posting.positions.add(position);
position.position = pos;
if (payloadSize != 0) {
if (fixedPayloads) {
position.payload = new byte[payloadSize];
} else {
int thisPayloadSize = random().nextInt(payloadSize);
if (thisPayloadSize != 0) {
position.payload = new byte[thisPayloadSize];
}
}
}
if (position.payload != null) {
random().nextBytes(position.payload);
}
position.startOffset = offset + random().nextInt(5);
position.endOffset = position.startOffset + random().nextInt(10);
offset = position.endOffset;
}
}
}
}
fieldInfos = new FieldInfos(fieldInfoArray);
globalLiveDocs = new FixedBitSet(1+maxDocID);
double liveRatio = random().nextDouble();
for(int i=0;i<1+maxDocID;i++) {
if (random().nextDouble() <= liveRatio) {
globalLiveDocs.set(i);
}
}
// Pre-filter postings by globalLiveDocs:
for(Map.Entry<String,Map<BytesRef,List<Posting>>> fieldEnt : fields.entrySet()) {
Map<BytesRef,List<Posting>> postingsLive = new TreeMap<BytesRef,List<Posting>>();
fieldsLive.put(fieldEnt.getKey(), postingsLive);
for(Map.Entry<BytesRef,List<Posting>> termEnt : fieldEnt.getValue().entrySet()) {
List<Posting> termPostingsLive = new ArrayList<Posting>();
postingsLive.put(termEnt.getKey(), termPostingsLive);
for(Posting posting : termEnt.getValue()) {
if (globalLiveDocs.get(posting.docID)) {
termPostingsLive.add(posting);
}
}
}
}
allTerms = new ArrayList<FieldAndTerm>();
for(Map.Entry<String,Map<BytesRef,List<Posting>>> fieldEnt : fields.entrySet()) {
String field = fieldEnt.getKey();
for(Map.Entry<BytesRef,List<Posting>> termEnt : fieldEnt.getValue().entrySet()) {
allTerms.add(new FieldAndTerm(field, termEnt.getKey()));
}
}
if (VERBOSE) {
System.out.println("TEST: done init postings; maxDocID=" + maxDocID + "; " + allTerms.size() + " total terms, across " + fieldInfos.size() + " fields");
}
}
// nocommit maybe instead of @BeforeClass just make a single test run: build postings & index & test it?
private FieldInfos currentFieldInfos;
// maxAllowed = the "highest" we can index, but we will still
// randomly index at lower IndexOption
private FieldsProducer buildIndex(Directory dir, IndexOptions maxAllowed, boolean allowPayloads) throws IOException {
SegmentInfo segmentInfo = new SegmentInfo(dir, Constants.LUCENE_MAIN_VERSION, "_0", 1+maxDocID, false, Codec.getDefault(), null, null);
int maxIndexOption = Arrays.asList(IndexOptions.values()).indexOf(maxAllowed);
if (VERBOSE) {
System.out.println("\nTEST: now build index");
}
// nocommit use allowPayloads
FieldInfo[] newFieldInfoArray = new FieldInfo[fields.size()];
for(int fieldUpto=0;fieldUpto<fields.size();fieldUpto++) {
FieldInfo oldFieldInfo = fieldInfos.fieldInfo(fieldUpto);
// Randomly picked the IndexOptions to index this
// field with:
IndexOptions indexOptions = IndexOptions.values()[random().nextInt(1+maxIndexOption)];
boolean doPayloads = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 && allowPayloads;
newFieldInfoArray[fieldUpto] = new FieldInfo(oldFieldInfo.name,
true,
fieldUpto,
false,
false,
doPayloads,
indexOptions,
null,
DocValues.Type.FIXED_INTS_8,
null);
}
FieldInfos newFieldInfos = new FieldInfos(newFieldInfoArray);
SegmentWriteState writeState = new SegmentWriteState(null, dir,
segmentInfo, newFieldInfos,
32, null, IOContext.DEFAULT);
FieldsConsumer fieldsConsumer = Codec.getDefault().postingsFormat().fieldsConsumer(writeState);
for(Map.Entry<String,Map<BytesRef,List<Posting>>> fieldEnt : fields.entrySet()) {
String field = fieldEnt.getKey();
Map<BytesRef,List<Posting>> terms = fieldEnt.getValue();
FieldInfo fieldInfo = newFieldInfos.fieldInfo(field);
if (VERBOSE) {
System.out.println("field=" + field);
}
IndexOptions indexOptions = fieldInfo.getIndexOptions();
boolean doFreq = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
boolean doPos = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
boolean doPayloads = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 && allowPayloads;
TermsConsumer termsConsumer = fieldsConsumer.addField(fieldInfo);
long sumTotalTF = 0;
long sumDF = 0;
FixedBitSet seenDocs = new FixedBitSet(maxDocID+1);
for(Map.Entry<BytesRef,List<Posting>> termEnt : terms.entrySet()) {
BytesRef term = termEnt.getKey();
List<Posting> postings = termEnt.getValue();
if (VERBOSE) {
System.out.println(" term=" + field + ":" + term.utf8ToString() + " docFreq=" + postings.size());
}
PostingsConsumer postingsConsumer = termsConsumer.startTerm(term);
long totalTF = 0;
int docCount = 0;
for(Posting posting : postings) {
if (VERBOSE) {
System.out.println(" " + docCount + ": docID=" + posting.docID + " freq=" + posting.positions.size());
}
postingsConsumer.startDoc(posting.docID, posting.positions.size());
seenDocs.set(posting.docID);
if (doPos) {
totalTF += posting.positions.size();
for(Position pos : posting.positions) {
if (VERBOSE) {
if (doPayloads) {
System.out.println(" pos=" + pos.position + " payload=" + (pos.payload == null ? "null" : pos.payload.length + " bytes"));
} else {
System.out.println(" pos=" + pos.position);
}
}
postingsConsumer.addPosition(pos.position, (doPayloads && pos.payload != null) ? new BytesRef(pos.payload) : null, pos.startOffset, pos.endOffset);
}
} else if (doFreq) {
totalTF += posting.positions.size();
} else {
totalTF++;
}
docCount++;
}
termsConsumer.finishTerm(term, new TermStats(postings.size(), totalTF));
sumTotalTF += totalTF;
sumDF += postings.size();
}
termsConsumer.finish(sumTotalTF, sumDF, seenDocs.cardinality());
}
fieldsConsumer.close();
if (VERBOSE) {
System.out.println("TEST: after indexing: files=");
for(String file : dir.listAll()) {
System.out.println(" " + file + ": " + dir.fileLength(file) + " bytes");
}
}
currentFieldInfos = newFieldInfos;
SegmentReadState readState = new SegmentReadState(dir, segmentInfo, newFieldInfos, IOContext.DEFAULT, 1);
return Codec.getDefault().postingsFormat().fieldsProducer(readState);
}
private static class ThreadState {
// Only used with REUSE option:
public DocsEnum reuseDocsEnum;
public DocsAndPositionsEnum reuseDocsAndPositionsEnum;
}
private void verifyEnum(ThreadState threadState,
String field,
BytesRef term,
TermsEnum termsEnum,
// Maximum options (docs/freqs/positions/offsets) to test:
IndexOptions maxIndexOptions,
EnumSet<Option> options) throws IOException {
if (VERBOSE) {
System.out.println(" verifyEnum: options=" + options + " maxIndexOptions=" + maxIndexOptions);
}
// 50% of the time time pass liveDocs:
Bits liveDocs;
Map<String,Map<BytesRef,List<Posting>>> fieldsToUse;
if (options.contains(Option.LIVE_DOCS) && random().nextBoolean()) {
liveDocs = globalLiveDocs;
fieldsToUse = fieldsLive;
if (VERBOSE) {
System.out.println(" use liveDocs");
}
} else {
liveDocs = null;
fieldsToUse = fields;
if (VERBOSE) {
System.out.println(" no liveDocs");
}
}
FieldInfo fieldInfo = currentFieldInfos.fieldInfo(field);
assertEquals(fields.get(field).get(term).size(), termsEnum.docFreq());
// NOTE: can be empty list if we are using liveDocs:
List<Posting> expected = fieldsToUse.get(field).get(term);
boolean allowFreqs = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0 &&
maxIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
boolean doCheckFreqs = allowFreqs && random().nextInt(3) <= 2;
boolean allowPositions = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 &&
maxIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
boolean doCheckPositions = allowPositions && random().nextInt(3) <= 2;
boolean allowOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0 &&
maxIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
boolean doCheckOffsets = allowOffsets && random().nextInt(3) <= 2;
boolean doCheckPayloads = options.contains(Option.PAYLOADS) && allowPositions && fieldInfo.hasPayloads();
DocsEnum prevDocsEnum = null;
DocsEnum docsEnum;
DocsAndPositionsEnum docsAndPositionsEnum;
if (!doCheckPositions) {
if (allowPositions && random().nextInt(10) == 7) {
// 10% of the time, even though we will not check positions, pull a DocsAndPositions enum
if (VERBOSE) {
System.out.println(" get DocsAndPositionsEnum (but we won't check positions)");
}
if (options.contains(Option.REUSE_ENUMS) && random().nextInt(10) < 9) {
prevDocsEnum = threadState.reuseDocsAndPositionsEnum;
}
threadState.reuseDocsAndPositionsEnum = termsEnum.docsAndPositions(liveDocs, (DocsAndPositionsEnum) prevDocsEnum, false);
docsEnum = threadState.reuseDocsAndPositionsEnum;
docsAndPositionsEnum = threadState.reuseDocsAndPositionsEnum;
} else {
if (VERBOSE) {
System.out.println(" get DocsEnum");
}
if (options.contains(Option.REUSE_ENUMS) && random().nextInt(10) < 9) {
prevDocsEnum = threadState.reuseDocsEnum;
}
threadState.reuseDocsEnum = termsEnum.docs(liveDocs, prevDocsEnum, doCheckFreqs);
docsEnum = threadState.reuseDocsEnum;
docsAndPositionsEnum = null;
}
} else {
if (VERBOSE) {
System.out.println(" get DocsAndPositionsEnum");
}
if (options.contains(Option.REUSE_ENUMS) && random().nextInt(10) < 9) {
prevDocsEnum = threadState.reuseDocsAndPositionsEnum;
}
threadState.reuseDocsAndPositionsEnum = termsEnum.docsAndPositions(liveDocs, (DocsAndPositionsEnum) prevDocsEnum, doCheckOffsets);
docsEnum = threadState.reuseDocsAndPositionsEnum;
docsAndPositionsEnum = threadState.reuseDocsAndPositionsEnum;
}
assertNotNull(docsEnum);
int initialDocID = docsEnum.docID();
assertTrue(initialDocID == -1 || initialDocID == DocsEnum.NO_MORE_DOCS);
if (VERBOSE) {
if (prevDocsEnum == null) {
System.out.println(" got enum=" + docsEnum);
} else if (prevDocsEnum == docsEnum) {
System.out.println(" got reuse enum=" + docsEnum);
} else {
System.out.println(" got enum=" + docsEnum + " (reuse of " + prevDocsEnum + " failed)");
}
}
// 10% of the time don't consume all docs:
int stopAt;
if (options.contains(Option.PARTIAL_DOC_CONSUME) && expected.size() > 1 && random().nextInt(10) == 7) {
stopAt = random().nextInt(expected.size()-1);
if (VERBOSE) {
System.out.println(" will not consume all docs (" + stopAt + " vs " + expected.size() + ")");
}
} else {
stopAt = expected.size();
if (VERBOSE) {
System.out.println(" consume all docs");
}
}
double skipChance = random().nextDouble();
int numSkips = expected.size() < 3 ? 1 : _TestUtil.nextInt(random(), 1, Math.min(20, expected.size()/3));
int skipInc = expected.size()/numSkips;
int skipDocInc = (1+maxDocID)/numSkips;
// Sometimes do 100% skipping:
boolean doAllSkipping = options.contains(Option.SKIPPING) && random().nextInt(7) == 1;
double freqAskChance = random().nextDouble();
double payloadCheckChance = random().nextDouble();
double offsetCheckChance = random().nextDouble();
if (VERBOSE) {
if (options.contains(Option.SKIPPING)) {
System.out.println(" skipChance=" + skipChance + " numSkips=" + numSkips);
} else {
System.out.println(" no skipping");
}
if (doCheckFreqs) {
System.out.println(" freqAskChance=" + freqAskChance);
}
if (doCheckPayloads) {
System.out.println(" payloadCheckChance=" + payloadCheckChance);
}
if (doCheckOffsets) {
System.out.println(" offsetCheckChance=" + offsetCheckChance);
}
}
int nextPosting = 0;
while (nextPosting <= stopAt) {
if (nextPosting == stopAt) {
if (stopAt == expected.size()) {
assertEquals(DocsEnum.NO_MORE_DOCS, docsEnum.nextDoc());
// Common bug is to forget to set this.doc=NO_MORE_DOCS in the enum!:
assertEquals(DocsEnum.NO_MORE_DOCS, docsEnum.docID());
}
break;
}
Posting posting;
if (options.contains(Option.SKIPPING) && (doAllSkipping || random().nextDouble() <= skipChance)) {
int targetDocID = -1;
if (nextPosting < stopAt && random().nextBoolean()) {
// Pick target we know exists:
nextPosting = _TestUtil.nextInt(random(), nextPosting, nextPosting+skipInc);
} else {
// Pick random target (might not exist):
Posting target = new Posting();
target.docID = _TestUtil.nextInt(random(), expected.get(nextPosting).docID, expected.get(nextPosting).docID+skipDocInc);
targetDocID = target.docID;
int loc = Collections.binarySearch(expected.subList(nextPosting, expected.size()), target);
if (loc < 0) {
loc = -loc-1;
}
nextPosting = nextPosting + loc;
}
if (nextPosting >= stopAt) {
int target = random().nextBoolean() ? (maxDocID+1) : DocsEnum.NO_MORE_DOCS;
if (VERBOSE) {
System.out.println(" now advance to end (target=" + target + ")");
}
assertEquals(DocsEnum.NO_MORE_DOCS, docsEnum.advance(target));
break;
} else {
posting = expected.get(nextPosting++);
if (VERBOSE) {
if (targetDocID != -1) {
System.out.println(" now advance to random target=" + targetDocID + " (" + nextPosting + " of " + stopAt + ")");
} else {
System.out.println(" now advance to known-exists target=" + posting.docID + " (" + nextPosting + " of " + stopAt + ")");
}
}
int docID = docsEnum.advance(targetDocID != -1 ? targetDocID : posting.docID);
assertEquals(posting.docID, docID);
}
} else {
posting = expected.get(nextPosting++);
if (VERBOSE) {
System.out.println(" now nextDoc to " + posting.docID + " (" + nextPosting + " of " + stopAt + ")");
}
int docID = docsEnum.nextDoc();
assertEquals(posting.docID, docID);
}
if (doCheckFreqs && random().nextDouble() <= freqAskChance) {
if (VERBOSE) {
System.out.println(" now freq()=" + posting.positions.size());
}
int freq = docsEnum.freq();
assertEquals(posting.positions.size(), freq);
}
if (doCheckPositions) {
int freq = docsEnum.freq();
int numPosToConsume;
if (options.contains(Option.PARTIAL_POS_CONSUME) && random().nextInt(5) == 1) {
numPosToConsume = random().nextInt(freq);
} else {
numPosToConsume = freq;
}
for(int i=0;i<numPosToConsume;i++) {
Position position = posting.positions.get(i);
if (VERBOSE) {
System.out.println(" now nextPosition to " + position.position);
}
assertEquals(position.position, docsAndPositionsEnum.nextPosition());
// nocommit sometimes don't pull the payload even
// though we pulled the position
if (doCheckPayloads) {
if (random().nextDouble() <= payloadCheckChance) {
if (VERBOSE) {
System.out.println(" now check payload length=" + (position.payload == null ? 0 : position.payload.length));
}
if (position.payload == null || position.payload.length == 0) {
assertFalse(docsAndPositionsEnum.hasPayload());
} else {
assertTrue(docsAndPositionsEnum.hasPayload());
BytesRef payload = docsAndPositionsEnum.getPayload();
assertFalse(docsAndPositionsEnum.hasPayload());
assertNotNull(payload);
assertEquals(position.payload.length, payload.length);
for(int byteUpto=0;byteUpto<position.payload.length;byteUpto++) {
assertEquals(position.payload[byteUpto],
payload.bytes[payload.offset+byteUpto]);
}
}
} else {
if (VERBOSE) {
System.out.println(" skip check payload length=" + (position.payload == null ? 0 : position.payload.length));
}
}
}
if (doCheckOffsets) {
if (random().nextDouble() <= offsetCheckChance) {
if (VERBOSE) {
System.out.println(" now check offsets: startOff=" + position.startOffset + " endOffset=" + position.endOffset);
}
assertEquals(position.startOffset, docsAndPositionsEnum.startOffset());
assertEquals(position.endOffset, docsAndPositionsEnum.endOffset());
} else {
if (VERBOSE) {
System.out.println(" skip check offsets");
}
}
} else {
assertEquals(-1, docsAndPositionsEnum.startOffset());
assertEquals(-1, docsAndPositionsEnum.endOffset());
}
}
}
}
}
private void testTerms(final Fields fieldsSource, final EnumSet<Option> options, final IndexOptions maxIndexOptions) throws Exception {
if (options.contains(Option.THREADS)) {
int numThreads = _TestUtil.nextInt(random(), 2, 5);
Thread[] threads = new Thread[numThreads];
for(int threadUpto=0;threadUpto<numThreads;threadUpto++) {
threads[threadUpto] = new Thread() {
@Override
public void run() {
try {
testTermsOneThread(fieldsSource, options, maxIndexOptions);
} catch (Throwable t) {
throw new RuntimeException(t);
}
}
};
threads[threadUpto].start();
}
for(int threadUpto=0;threadUpto<numThreads;threadUpto++) {
threads[threadUpto].join();
}
} else {
testTermsOneThread(fieldsSource, options, maxIndexOptions);
}
}
private void testTermsOneThread(Fields fieldsSource, EnumSet<Option> options, IndexOptions maxIndexOptions) throws IOException {
ThreadState threadState = new ThreadState();
// Test random terms/fields:
List<TermState> termStates = new ArrayList<TermState>();
List<FieldAndTerm> termStateTerms = new ArrayList<FieldAndTerm>();
Collections.shuffle(allTerms, random());
int upto = 0;
while (upto < allTerms.size()) {
boolean useTermState = termStates.size() != 0 && random().nextInt(5) == 1;
FieldAndTerm fieldAndTerm;
TermsEnum termsEnum;
TermState termState = null;
if (!useTermState) {
// Seek by random field+term:
fieldAndTerm = allTerms.get(upto++);
if (VERBOSE) {
System.out.println("\nTEST: seek to term=" + fieldAndTerm.field + ":" + fieldAndTerm.term.utf8ToString() );
}
} else {
// Seek by previous saved TermState
int idx = random().nextInt(termStates.size());
fieldAndTerm = termStateTerms.get(idx);
if (VERBOSE) {
System.out.println("\nTEST: seek using TermState to term=" + fieldAndTerm.field + ":" + fieldAndTerm.term.utf8ToString());
}
termState = termStates.get(idx);
}
Terms terms = fieldsSource.terms(fieldAndTerm.field);
assertNotNull(terms);
termsEnum = terms.iterator(null);
if (!useTermState) {
assertTrue(termsEnum.seekExact(fieldAndTerm.term, true));
} else {
termsEnum.seekExact(fieldAndTerm.term, termState);
}
boolean savedTermState = false;
if (options.contains(Option.TERM_STATE) && !useTermState && random().nextInt(5) == 1) {
// Save away this TermState:
termStates.add(termsEnum.termState());
termStateTerms.add(fieldAndTerm);
savedTermState = true;
}
verifyEnum(threadState,
fieldAndTerm.field,
fieldAndTerm.term,
termsEnum,
maxIndexOptions,
options);
// Sometimes save term state after pulling the enum:
if (options.contains(Option.TERM_STATE) && !useTermState && !savedTermState && random().nextInt(5) == 1) {
// Save away this TermState:
termStates.add(termsEnum.termState());
termStateTerms.add(fieldAndTerm);
useTermState = true;
}
// 10% of the time make sure you can pull another enum
// from the same term:
if (random().nextInt(10) == 7) {
// Try same term again
if (VERBOSE) {
System.out.println("TEST: try enum again on same term");
}
verifyEnum(threadState,
fieldAndTerm.field,
fieldAndTerm.term,
termsEnum,
maxIndexOptions,
options);
}
}
}
public void test() throws Exception {
Directory dir = newDirectory();
boolean indexPayloads = random().nextBoolean();
// nocommit test thread safety of buildIndex too
FieldsProducer fieldsProducer = buildIndex(dir, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, indexPayloads);
//testTerms(fieldsProducer, EnumSet.noneOf(Option.class), IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
//testTerms(fieldsProducer, EnumSet.of(Option.LIVE_DOCS), IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
//testTerms(fieldsProducer, EnumSet.of(Option.TERM_STATE, Option.LIVE_DOCS, Option.PARTIAL_DOC_CONSUME, Option.PARTIAL_POS_CONSUME), IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
//testTerms(fieldsProducer, EnumSet.of(Option.SKIPPING), IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
//testTerms(fieldsProducer, EnumSet.of(Option.THREADS, Option.TERM_STATE, Option.SKIPPING, Option.PARTIAL_DOC_CONSUME, Option.PARTIAL_POS_CONSUME), IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
//testTerms(fieldsProducer, EnumSet.of(Option.TERM_STATE, Option.SKIPPING, Option.PARTIAL_DOC_CONSUME, Option.PARTIAL_POS_CONSUME), IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
testTerms(fieldsProducer, EnumSet.of(Option.TERM_STATE, Option.PAYLOADS, Option.PARTIAL_DOC_CONSUME, Option.PARTIAL_POS_CONSUME, Option.SKIPPING), IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
fieldsProducer.close();
dir.close();
}
}
// nocommit test that start/endOffset return -1 if field has
// no offsets