mirror of https://github.com/apache/lucene.git
LUCENE-4225: BlockPostingsFormat
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/pforcodec_3892@1363421 13f79535-47bb-0310-9956-ffa450edef68
parent cf36fb9a58
commit 1e49670a55

@@ -24,7 +24,8 @@
     </subant>
   </target>

-  <target name="test" description="Test both Lucene and Solr" depends="validate">
+  <!-- nocommit put depends="validate" back -->
+  <target name="test" description="Test both Lucene and Solr">
     <sequential>
       <subant target="test" inheritall="false" failonerror="true">
         <fileset dir="lucene" includes="build.xml" />
@@ -724,7 +724,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
       // Write term stats, to separate byte[] blob:
       bytesWriter2.writeVInt(term.stats.docFreq);
       if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
-        assert term.stats.totalTermFreq >= term.stats.docFreq;
+        assert term.stats.totalTermFreq >= term.stats.docFreq: term.stats.totalTermFreq + " vs " + term.stats.docFreq;
         bytesWriter2.writeVLong(term.stats.totalTermFreq - term.stats.docFreq);
       }
     }
@@ -0,0 +1,108 @@
package org.apache.lucene.codecs.block;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.codecs.BlockTreeTermsReader;
import org.apache.lucene.codecs.BlockTreeTermsWriter;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;

/**
 * Postings format that encodes postings in fixed-size packed int blocks
 * (see BlockPostingsWriter/Reader), plugged into the BlockTree terms
 * dictionary.
 */
public final class BlockPostingsFormat extends PostingsFormat {
  public static final String DOC_EXTENSION = "doc";
  public static final String POS_EXTENSION = "pos";
  public static final String PAY_EXTENSION = "pay";

  private final int minTermBlockSize;
  private final int maxTermBlockSize;
  public final static int DEFAULT_BLOCK_SIZE = 128;

  public BlockPostingsFormat() {
    this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
  }

  public BlockPostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
    super("Block");
    this.minTermBlockSize = minTermBlockSize;
    assert minTermBlockSize > 1;
    this.maxTermBlockSize = maxTermBlockSize;
    assert minTermBlockSize <= maxTermBlockSize;
  }

  @Override
  public String toString() {
    return getName() + "(blocksize=" + DEFAULT_BLOCK_SIZE + ")";
  }

  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    // TODO: implement a new PostingsWriterBase to improve skip-settings
    PostingsWriterBase postingsWriter = new BlockPostingsWriter(state, 128);

    boolean success = false;
    try {
      FieldsConsumer ret = new BlockTreeTermsWriter(state,
                                                    postingsWriter,
                                                    minTermBlockSize,
                                                    maxTermBlockSize);
      success = true;
      return ret;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(postingsWriter);
      }
    }
  }

  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
    PostingsReaderBase postingsReader = new BlockPostingsReader(state.dir,
                                                                state.fieldInfos,
                                                                state.segmentInfo,
                                                                state.context,
                                                                state.segmentSuffix,
                                                                128);
    boolean success = false;
    try {
      FieldsProducer ret = new BlockTreeTermsReader(state.dir,
                                                    state.fieldInfos,
                                                    state.segmentInfo.name,
                                                    postingsReader,
                                                    state.context,
                                                    state.segmentSuffix,
                                                    state.termsIndexDivisor);
      success = true;
      return ret;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(postingsReader);
      }
    }
  }
}
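
Note: to actually exercise this format, fields must be routed to it through a codec. A minimal sketch, assuming the SPI registration added near the end of this commit; the BlockCodec class here is hypothetical, not part of this change:

// Hypothetical: select the "Block" postings format for every field.
public class BlockCodec extends org.apache.lucene.codecs.lucene40.Lucene40Codec {
  @Override
  public org.apache.lucene.codecs.PostingsFormat getPostingsFormatForField(String field) {
    // Resolved via META-INF/services from the name passed to super("Block"):
    return org.apache.lucene.codecs.PostingsFormat.forName("Block");
  }
}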

File diff suppressed because it is too large

@@ -0,0 +1,574 @@
package org.apache.lucene.codecs.block;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.IntBuffer;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.codecs.pfor.ForUtil; // nocommit move here?
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;

// nocommit javadocs

public final class BlockPostingsWriter extends PostingsWriterBase {

  private boolean DEBUG = BlockPostingsReader.DEBUG;

  // nocommit move these constants to the PF:

  static final int maxSkipLevels = 10;

  final static String TERMS_CODEC = "BlockPostingsWriterTerms";
  final static String DOC_CODEC = "BlockPostingsWriterDoc";
  final static String POS_CODEC = "BlockPostingsWriterPos";
  final static String PAY_CODEC = "BlockPostingsWriterPay";

  // Increment version to change it:
  final static int VERSION_START = 0;
  final static int VERSION_CURRENT = VERSION_START;

  final IndexOutput docOut;
  final IndexOutput posOut;
  final IndexOutput payOut;

  static final int DEFAULT_BLOCK_SIZE = 128;

  final int blockSize;

  private IndexOutput termsOut;

  // How current field indexes postings:
  private boolean fieldHasFreqs;
  private boolean fieldHasPositions;
  private boolean fieldHasOffsets;
  private boolean fieldHasPayloads;

  // Holds starting file pointers for each term:
  private long docTermStartFP;
  private long posTermStartFP;
  private long payTermStartFP;

  final int[] docDeltaBuffer;
  final int[] freqBuffer;
  private int docBufferUpto;

  final int[] posDeltaBuffer;
  final int[] payloadLengthBuffer;
  final int[] offsetStartDeltaBuffer;
  final int[] offsetLengthBuffer;
  private int posBufferUpto;

  private byte[] payloadBytes;
  private int payloadByteUpto;

  private int lastBlockDocID;
  private boolean saveNextPosBlock;
  private long lastBlockPosFP;
  private long lastBlockPayFP;
  private int lastBlockPosBufferUpto;
  private int lastBlockEndOffset;
  private int lastBlockPayloadByteUpto;
  private int lastDocID;
  private int lastPosition;
  private int lastEndOffset;
  private int docCount;

  final byte[] encoded;
  final IntBuffer encodedBuffer;

  private final BlockSkipWriter skipWriter;

  public BlockPostingsWriter(SegmentWriteState state, int blockSize) throws IOException {
    super();
    this.blockSize = blockSize;

    docOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPostingsFormat.DOC_EXTENSION),
                                          state.context);
    IndexOutput posOut = null;
    IndexOutput payOut = null;
    boolean success = false;
    try {
      CodecUtil.writeHeader(docOut, DOC_CODEC, VERSION_CURRENT);
      if (state.fieldInfos.hasProx()) {
        posDeltaBuffer = new int[blockSize];
        posOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPostingsFormat.POS_EXTENSION),
                                              state.context);
        CodecUtil.writeHeader(posOut, POS_CODEC, VERSION_CURRENT);

        if (state.fieldInfos.hasPayloads()) {
          payloadBytes = new byte[128];
          payloadLengthBuffer = new int[blockSize];
        } else {
          payloadBytes = null;
          payloadLengthBuffer = null;
        }

        if (state.fieldInfos.hasOffsets()) {
          offsetStartDeltaBuffer = new int[blockSize];
          offsetLengthBuffer = new int[blockSize];
        } else {
          offsetStartDeltaBuffer = null;
          offsetLengthBuffer = null;
        }

        if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) {
          payOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPostingsFormat.PAY_EXTENSION),
                                                state.context);
          CodecUtil.writeHeader(payOut, PAY_CODEC, VERSION_CURRENT);
        }
      } else {
        posDeltaBuffer = null;
        payloadLengthBuffer = null;
        offsetStartDeltaBuffer = null;
        offsetLengthBuffer = null;
        payloadBytes = null;
      }
      this.payOut = payOut;
      this.posOut = posOut;
      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(docOut, posOut, payOut);
      }
    }

    docDeltaBuffer = new int[blockSize];
    freqBuffer = new int[blockSize];

    skipWriter = new BlockSkipWriter(blockSize,
                                     maxSkipLevels,
                                     state.segmentInfo.getDocCount(),
                                     docOut,
                                     posOut,
                                     payOut);

    encoded = new byte[blockSize*4 + 4];
    encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer();
  }

  @Override
  public void start(IndexOutput termsOut) throws IOException {
    this.termsOut = termsOut;
    CodecUtil.writeHeader(termsOut, TERMS_CODEC, VERSION_CURRENT);
    termsOut.writeVInt(blockSize);
  }

  @Override
  public void setField(FieldInfo fieldInfo) {
    IndexOptions indexOptions = fieldInfo.getIndexOptions();
    fieldHasFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
    fieldHasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    fieldHasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    fieldHasPayloads = fieldInfo.hasPayloads();
    skipWriter.setField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads);
  }

  @Override
  public void startTerm() {
    docTermStartFP = docOut.getFilePointer();
    if (fieldHasPositions) {
      posTermStartFP = posOut.getFilePointer();
      if (fieldHasPayloads || fieldHasOffsets) {
        payTermStartFP = payOut.getFilePointer();
      }
    }
    lastBlockDocID = -1;
    lastDocID = 0;
    if (DEBUG) {
      System.out.println("FPW.startTerm startFP=" + docTermStartFP);
    }
    skipWriter.resetSkip();
  }

  private void writeBlock(int[] buffer, IndexOutput out) throws IOException {
    final int header = ForUtil.compress(buffer, encodedBuffer);
    //System.out.println("  block has " + numBytes + " bytes");
    out.writeVInt(header);
    out.writeBytes(encoded, ForUtil.getEncodedSize(header));
  }
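
  // ForUtil.compress packs all blockSize buffered ints at a single fixed bit
  // width and returns a header describing the encoding; per the ForUtil hunk
  // further down, the bit width is recovered as (header >> 8) & MASK[6], and
  // ForUtil.getEncodedSize(header) yields how many encoded bytes to flush.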

  @Override
  public void startDoc(int docID, int termDocFreq) throws IOException {
    if (DEBUG) {
      System.out.println("FPW.startDoc docID=" + docID);
    }

    // nocommit do this in finishDoc... but does it fail...?
    // is it not always called...?
    if (posOut != null && saveNextPosBlock) {
      lastBlockPosFP = posOut.getFilePointer();
      if (payOut != null) {
        lastBlockPayFP = payOut.getFilePointer();
      }
      lastBlockPosBufferUpto = posBufferUpto;
      lastBlockEndOffset = lastEndOffset;
      lastBlockPayloadByteUpto = payloadByteUpto;
      saveNextPosBlock = false;
      if (DEBUG) {
        System.out.println("  now save lastBlockPosFP=" + lastBlockPosFP + " lastBlockPosBufferUpto=" + lastBlockPosBufferUpto + " lastBlockPayloadByteUpto=" + lastBlockPayloadByteUpto);
      }
    }

    final int docDelta = docID - lastDocID;
    if (docID < 0 || (docCount > 0 && docDelta <= 0)) {
      throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " ) (docOut: " + docOut + ")");
    }
    lastDocID = docID;

    docDeltaBuffer[docBufferUpto] = docDelta;
    if (DEBUG) {
      System.out.println("  docDeltaBuffer[" + docBufferUpto + "]=" + docDelta);
    }
    if (fieldHasFreqs) {
      freqBuffer[docBufferUpto] = termDocFreq;
    }

    docBufferUpto++;
    docCount++;

    if (docBufferUpto == blockSize) {
      // nocommit maybe instead of buffering skip before
      // writing a block based on last block's end data
      // ... we could buffer after writing the block?  only
      // iffiness with that approach is it could be a
      // pointless skip?  like we may stop adding docs
      // right after that, then we have skip point AFTER
      // last doc.  the thing is, in finishTerm we are
      // already sometimes adding a skip point AFTER the
      // last doc?
      if (lastBlockDocID != -1) {
        if (DEBUG) {
          System.out.println("  bufferSkip at writeBlock: lastDocID=" + lastBlockDocID + " docCount=" + (docCount-blockSize));
        }
        skipWriter.bufferSkip(lastBlockDocID, docCount-blockSize, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockEndOffset, lastBlockPayloadByteUpto);
      }
      lastBlockDocID = docID;
      saveNextPosBlock = true;

      if (DEBUG) {
        System.out.println("  write docDelta block @ fp=" + docOut.getFilePointer());
      }
      writeBlock(docDeltaBuffer, docOut);
      if (fieldHasFreqs) {
        if (DEBUG) {
          System.out.println("  write freq block @ fp=" + docOut.getFilePointer());
        }
        writeBlock(freqBuffer, docOut);
      }
      docBufferUpto = 0;
    }

    lastPosition = 0;
    lastEndOffset = 0;
  }

  /** Add a new position & payload */
  @Override
  public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
    if (DEBUG) {
      System.out.println("FPW.addPosition pos=" + position + " posBufferUpto=" + posBufferUpto + (fieldHasPayloads ? " payloadByteUpto=" + payloadByteUpto: ""));
    }
    posDeltaBuffer[posBufferUpto] = position - lastPosition;
    if (fieldHasPayloads) {
      if (payload == null || payload.length == 0) {
        // no payload
        payloadLengthBuffer[posBufferUpto] = 0;
      } else {
        payloadLengthBuffer[posBufferUpto] = payload.length;
        if (payloadByteUpto + payload.length > payloadBytes.length) {
          payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payload.length);
        }
        System.arraycopy(payload.bytes, payload.offset, payloadBytes, payloadByteUpto, payload.length);
        payloadByteUpto += payload.length;
      }
    }

    if (fieldHasOffsets) {
      assert startOffset >= lastEndOffset;
      assert endOffset >= startOffset;
      offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastEndOffset;
      offsetLengthBuffer[posBufferUpto] = endOffset - startOffset;
      lastEndOffset = endOffset;
    }

    posBufferUpto++;
    lastPosition = position;
    if (posBufferUpto == blockSize) {
      if (DEBUG) {
        System.out.println("  write pos bulk block @ fp=" + posOut.getFilePointer());
      }
      writeBlock(posDeltaBuffer, posOut);

      if (fieldHasPayloads) {
        writeBlock(payloadLengthBuffer, payOut);
        payOut.writeVInt(payloadByteUpto);
        payOut.writeBytes(payloadBytes, 0, payloadByteUpto);
        payloadByteUpto = 0;
      }
      if (fieldHasOffsets) {
        writeBlock(offsetStartDeltaBuffer, payOut);
        writeBlock(offsetLengthBuffer, payOut);
      }
      posBufferUpto = 0;
    }
  }

  @Override
  public void finishDoc() {
  }

  private static class PendingTerm {
    public final long docStartFP;
    public final long posStartFP;
    public final long payStartFP;
    public final int skipOffset;
    public final int lastPosBlockOffset;

    public PendingTerm(long docStartFP, long posStartFP, long payStartFP, int skipOffset, int lastPosBlockOffset) {
      this.docStartFP = docStartFP;
      this.posStartFP = posStartFP;
      this.payStartFP = payStartFP;
      this.skipOffset = skipOffset;
      this.lastPosBlockOffset = lastPosBlockOffset;
    }
  }

  private final List<PendingTerm> pendingTerms = new ArrayList<PendingTerm>();

  /** Called when we are done adding docs to this term */
  @Override
  public void finishTerm(TermStats stats) throws IOException {

    assert stats.docFreq > 0;

    // TODO: wasteful we are counting this (counting # docs
    // for this term) in two places?
    assert stats.docFreq == docCount: stats.docFreq + " vs " + docCount;

    if (DEBUG) {
      System.out.println("FPW.finishTerm docFreq=" + stats.docFreq);
    }

    // nocommit silly that skipper must write skip when no
    // postings come after it, but if we don't do this, skip
    // reader incorrectly thinks it can read another level 0
    // skip entry here!:
    //if (docCount > blockSize && docBufferUpto > 0) {
    if (docCount > blockSize) {
      final int lastDocCount = blockSize*(docCount/blockSize);
      if (DEBUG) {
        System.out.println("  bufferSkip at finishTerm: lastDocID=" + lastBlockDocID + " docCount=" + lastDocCount);
      }
      skipWriter.bufferSkip(lastBlockDocID, lastDocCount, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockEndOffset, lastBlockPayloadByteUpto);
    }

    if (DEBUG) {
      if (docBufferUpto > 0) {
        System.out.println("  write doc/freq vInt block (count=" + docBufferUpto + ") at fp=" + docOut.getFilePointer() + " docTermStartFP=" + docTermStartFP);
      }
    }

    // vInt encode the remaining doc deltas and freqs:
    for(int i=0;i<docBufferUpto;i++) {
      final int docDelta = docDeltaBuffer[i];
      final int freq = freqBuffer[i];
      if (!fieldHasFreqs) {
        docOut.writeVInt(docDelta);
      } else if (freqBuffer[i] == 1) {
        docOut.writeVInt((docDelta<<1)|1);
      } else {
        docOut.writeVInt(docDelta<<1);
        docOut.writeVInt(freq);
      }
    }
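
    // Note the flag bit above: an odd value means freq == 1 is implied by the
    // low bit (no second vInt), while an even value means an explicit freq
    // vInt follows.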

    final int lastPosBlockOffset;

    if (fieldHasPositions) {
      if (DEBUG) {
        if (posBufferUpto > 0) {
          System.out.println("  write pos vInt block (count=" + posBufferUpto + ") at fp=" + posOut.getFilePointer() + " posTermStartFP=" + posTermStartFP + " hasPayloads=" + fieldHasPayloads + " hasOffsets=" + fieldHasOffsets);
        }
      }

      assert stats.totalTermFreq != -1;
      if (stats.totalTermFreq > blockSize) {
        lastPosBlockOffset = (int) (posOut.getFilePointer() - posTermStartFP);
      } else {
        lastPosBlockOffset = -1;
      }
      if (posBufferUpto > 0) {
        posOut.writeVInt(posBufferUpto);

        // nocommit should we send offsets/payloads to
        // .pay...?  seems wasteful (have to store extra
        // vLong for low (< blockSize) DF terms = vast
        // majority)

        // vInt encode the remaining positions/payloads/offsets:
        int lastPayloadLength = -1;
        int payloadBytesReadUpto = 0;
        for(int i=0;i<posBufferUpto;i++) {
          final int posDelta = posDeltaBuffer[i];
          if (fieldHasPayloads) {
            final int payloadLength = payloadLengthBuffer[i];
            if (payloadLength != lastPayloadLength) {
              lastPayloadLength = payloadLength;
              posOut.writeVInt((posDelta<<1)|1);
              posOut.writeVInt(payloadLength);
            } else {
              posOut.writeVInt(posDelta<<1);
            }

            if (DEBUG) {
              System.out.println("        i=" + i + " payloadLen=" + payloadLength);
            }

            if (payloadLength != 0) {
              if (DEBUG) {
                System.out.println("          write payload @ pos.fp=" + posOut.getFilePointer());
              }
              posOut.writeBytes(payloadBytes, payloadBytesReadUpto, payloadLength);
              payloadBytesReadUpto += payloadLength;
            }
          } else {
            posOut.writeVInt(posDelta);
          }

          if (fieldHasOffsets) {
            if (DEBUG) {
              System.out.println("          write offset @ pos.fp=" + posOut.getFilePointer());
            }
            posOut.writeVInt(offsetStartDeltaBuffer[i]);
            posOut.writeVInt(offsetLengthBuffer[i]);
          }
        }

        if (fieldHasPayloads) {
          assert payloadBytesReadUpto == payloadByteUpto;
          payloadByteUpto = 0;
        }
      }
      if (DEBUG) {
        System.out.println("  totalTermFreq=" + stats.totalTermFreq + " lastPosBlockOffset=" + lastPosBlockOffset);
      }
    } else {
      lastPosBlockOffset = -1;
    }

    int skipOffset;
    if (docCount > blockSize) {
      skipOffset = (int) (skipWriter.writeSkip(docOut)-docTermStartFP);

      if (DEBUG) {
        System.out.println("skip packet " + (docOut.getFilePointer() - (docTermStartFP + skipOffset)) + " bytes");
      }
    } else {
      skipOffset = -1;
      if (DEBUG) {
        System.out.println("  no skip: docCount=" + docCount);
      }
    }

    long payStartFP;
    if (stats.totalTermFreq >= blockSize) {
      payStartFP = payTermStartFP;
    } else {
      payStartFP = -1;
    }

    if (DEBUG) {
      System.out.println("  payStartFP=" + payStartFP);
    }

    pendingTerms.add(new PendingTerm(docTermStartFP, posTermStartFP, payStartFP, skipOffset, lastPosBlockOffset));
    docBufferUpto = 0;
    posBufferUpto = 0;
    lastDocID = 0;
    docCount = 0;
  }

  private final RAMOutputStream bytesWriter = new RAMOutputStream();

  @Override
  public void flushTermsBlock(int start, int count) throws IOException {

    if (count == 0) {
      termsOut.writeByte((byte) 0);
      return;
    }

    assert start <= pendingTerms.size();
    assert count <= start;

    final int limit = pendingTerms.size() - start + count;

    long lastDocStartFP = 0;
    long lastPosStartFP = 0;
    long lastPayStartFP = 0;
    for(int idx=limit-count; idx<limit; idx++) {
      PendingTerm term = pendingTerms.get(idx);

      bytesWriter.writeVLong(term.docStartFP - lastDocStartFP);
      lastDocStartFP = term.docStartFP;

      if (fieldHasPositions) {
        bytesWriter.writeVLong(term.posStartFP - lastPosStartFP);
        lastPosStartFP = term.posStartFP;
        if (term.lastPosBlockOffset != -1) {
          bytesWriter.writeVInt(term.lastPosBlockOffset);
        }
        if ((fieldHasPayloads || fieldHasOffsets) && term.payStartFP != -1) {
          bytesWriter.writeVLong(term.payStartFP - lastPayStartFP);
          lastPayStartFP = term.payStartFP;
        }
      }

      if (term.skipOffset != -1) {
        bytesWriter.writeVInt(term.skipOffset);
      }
    }

    termsOut.writeVInt((int) bytesWriter.getFilePointer());
    bytesWriter.writeTo(termsOut);
    bytesWriter.reset();

    // Remove the terms we just wrote:
    pendingTerms.subList(limit-count, limit).clear();
  }

  @Override
  public void close() throws IOException {
    IOUtils.close(docOut, posOut, payOut);
  }
}
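
The tail encoding in finishTerm is worth spelling out. A minimal standalone sketch of just the arithmetic (plain Java; not the Lucene classes, and the helper names are hypothetical):

// Sketch of the partial-block (tail) doc encoding used above.
static void writeTailDoc(java.io.DataOutput out, int docDelta, int freq, boolean hasFreqs) throws java.io.IOException {
  if (!hasFreqs) {
    writeVInt(out, docDelta);
  } else if (freq == 1) {
    writeVInt(out, (docDelta << 1) | 1);  // low bit set: freq == 1 implied
  } else {
    writeVInt(out, docDelta << 1);        // low bit clear: explicit freq follows
    writeVInt(out, freq);
  }
}

// Lucene-style vInt: 7 data bits per byte, LSB first, high bit = continuation.
static void writeVInt(java.io.DataOutput out, int i) throws java.io.IOException {
  while ((i & ~0x7F) != 0) {
    out.writeByte((byte) ((i & 0x7F) | 0x80));
    i >>>= 7;
  }
  out.writeByte((byte) i);
}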

@@ -0,0 +1,205 @@
package org.apache.lucene.codecs.block;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.codecs.MultiLevelSkipListReader;
import org.apache.lucene.store.IndexInput;

/**
 * Implements the skip list reader for the block postings format
 * that stores positions and payloads.
 *
 * @see BlockPostingsFormat
 * @lucene.experimental
 */
final class BlockSkipReader extends MultiLevelSkipListReader {
  private boolean DEBUG = BlockPostingsReader.DEBUG;

  private long docPointer[];
  private long posPointer[];
  private long payPointer[];
  private int posBufferUpto[];
  private int endOffset[];
  private int payloadByteUpto[];

  private long lastPosPointer;
  private long lastPayPointer;
  private int lastEndOffset;
  private int lastPayloadByteUpto;
  private long lastDocPointer;
  private int lastPosBufferUpto;

  public BlockSkipReader(IndexInput skipStream, int maxSkipLevels, int skipInterval, boolean hasPos, boolean hasOffsets, boolean hasPayloads) {
    super(skipStream, maxSkipLevels, skipInterval);
    docPointer = new long[maxSkipLevels];
    if (hasPos) {
      posPointer = new long[maxSkipLevels];
      posBufferUpto = new int[maxSkipLevels];
      if (hasPayloads) {
        payloadByteUpto = new int[maxSkipLevels];
      } else {
        payloadByteUpto = null;
      }
      if (hasOffsets) {
        endOffset = new int[maxSkipLevels];
      } else {
        endOffset = null;
      }
      if (hasOffsets || hasPayloads) {
        payPointer = new long[maxSkipLevels];
      } else {
        payPointer = null;
      }
    } else {
      posPointer = null;
    }
  }

  public void init(long skipPointer, long docBasePointer, long posBasePointer, long payBasePointer, int df) {
    super.init(skipPointer, df);
    lastDocPointer = docBasePointer;
    lastPosPointer = posBasePointer;
    lastPayPointer = payBasePointer;

    Arrays.fill(docPointer, docBasePointer);
    if (posPointer != null) {
      Arrays.fill(posPointer, posBasePointer);
      if (payPointer != null) {
        Arrays.fill(payPointer, payBasePointer);
      }
    } else {
      assert posBasePointer == 0;
    }
  }

  /** Returns the doc pointer of the doc to which the last call of
   * {@link MultiLevelSkipListReader#skipTo(int)} has skipped.  */
  public long getDocPointer() {
    return lastDocPointer;
  }

  public long getPosPointer() {
    return lastPosPointer;
  }

  public int getPosBufferUpto() {
    return lastPosBufferUpto;
  }

  public long getPayPointer() {
    return lastPayPointer;
  }

  public int getEndOffset() {
    return lastEndOffset;
  }

  public int getPayloadByteUpto() {
    return lastPayloadByteUpto;
  }

  @Override
  protected void seekChild(int level) throws IOException {
    super.seekChild(level);
    if (DEBUG) {
      System.out.println("seekChild level=" + level);
    }
    docPointer[level] = lastDocPointer;
    if (posPointer != null) {
      posPointer[level] = lastPosPointer;
      posBufferUpto[level] = lastPosBufferUpto;
      if (endOffset != null) {
        endOffset[level] = lastEndOffset;
      }
      if (payloadByteUpto != null) {
        payloadByteUpto[level] = lastPayloadByteUpto;
      }
      if (payPointer != null) {
        payPointer[level] = lastPayPointer;
      }
    }
  }

  @Override
  protected void setLastSkipData(int level) {
    super.setLastSkipData(level);
    lastDocPointer = docPointer[level];
    if (DEBUG) {
      System.out.println("setLastSkipData level=" + level);
      System.out.println("  lastDocPointer=" + lastDocPointer);
    }
    if (posPointer != null) {
      lastPosPointer = posPointer[level];
      lastPosBufferUpto = posBufferUpto[level];
      if (DEBUG) {
        System.out.println("  lastPosPointer=" + lastPosPointer + " lastPosBufferUpto=" + lastPosBufferUpto);
      }
      if (payPointer != null) {
        lastPayPointer = payPointer[level];
      }
      if (endOffset != null) {
        lastEndOffset = endOffset[level];
      }
      if (payloadByteUpto != null) {
        lastPayloadByteUpto = payloadByteUpto[level];
      }
    }
  }

  @Override
  protected int readSkipData(int level, IndexInput skipStream) throws IOException {
    if (DEBUG) {
      System.out.println("readSkipData level=" + level);
    }
    int delta = skipStream.readVInt();
    if (DEBUG) {
      System.out.println("  delta=" + delta);
    }
    docPointer[level] += skipStream.readVInt();
    if (DEBUG) {
      System.out.println("  docFP=" + docPointer[level]);
    }

    if (posPointer != null) {
      posPointer[level] += skipStream.readVInt();
      if (DEBUG) {
        System.out.println("  posFP=" + posPointer[level]);
      }
      posBufferUpto[level] = skipStream.readVInt();
      if (DEBUG) {
        System.out.println("  posBufferUpto=" + posBufferUpto[level]);
      }

      if (payloadByteUpto != null) {
        payloadByteUpto[level] = skipStream.readVInt();
      }

      if (endOffset != null) {
        endOffset[level] += skipStream.readVInt();
      }

      if (payPointer != null) {
        payPointer[level] += skipStream.readVInt();
      }
    }
    return delta;
  }
}
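
The enclosing BlockPostingsReader is not part of this diff, so as a hedged sketch only, the expected call pattern on the consuming side would mirror other MultiLevelSkipListReader users:

// Hedged sketch (BlockPostingsReader itself is not shown in this commit):
// position the skip reader at a term's skip data, skip toward a target,
// then re-seek the doc stream at the block boundary the skipper found.
skipper.init(docTermStartFP + skipOffset, docTermStartFP, posTermStartFP, payTermStartFP, docFreq);
final int newDocUpto = skipper.skipTo(targetDocID) + 1;  // docs consumed up to the skip point
docIn.seek(skipper.getDocPointer());                     // start of a 128-doc block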

@@ -0,0 +1,147 @@
package org.apache.lucene.codecs.block;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.codecs.MultiLevelSkipListWriter;

// nocommit do we need more frequent skips at level > 0?
// 128*128 is immense?  may need to decouple
// baseSkipInterval & theRestSkipInterval?

final class BlockSkipWriter extends MultiLevelSkipListWriter {
  private boolean DEBUG = BlockPostingsReader.DEBUG;

  private int[] lastSkipDoc;
  private long[] lastSkipDocPointer;
  private long[] lastSkipPosPointer;
  private long[] lastSkipPayPointer;
  private int[] lastEndOffset;
  private int[] lastPayloadByteUpto;

  private final IndexOutput docOut;
  private final IndexOutput posOut;
  private final IndexOutput payOut;

  private int curDoc;
  private long curDocPointer;
  private long curPosPointer;
  private long curPayPointer;
  private int curPosBufferUpto;
  private int curEndOffset;
  private int curPayloadByteUpto;
  private boolean fieldHasPositions;
  private boolean fieldHasOffsets;
  private boolean fieldHasPayloads;

  public BlockSkipWriter(int skipInterval, int maxSkipLevels, int docCount, IndexOutput docOut, IndexOutput posOut, IndexOutput payOut) {
    super(skipInterval, maxSkipLevels, docCount);
    this.docOut = docOut;
    this.posOut = posOut;
    this.payOut = payOut;

    lastSkipDoc = new int[maxSkipLevels];
    lastSkipDocPointer = new long[maxSkipLevels];
    if (posOut != null) {
      lastSkipPosPointer = new long[maxSkipLevels];
      if (payOut != null) {
        lastSkipPayPointer = new long[maxSkipLevels];
      }
      lastEndOffset = new int[maxSkipLevels];
      lastPayloadByteUpto = new int[maxSkipLevels];
    }
  }

  public void setField(boolean fieldHasPositions, boolean fieldHasOffsets, boolean fieldHasPayloads) {
    this.fieldHasPositions = fieldHasPositions;
    this.fieldHasOffsets = fieldHasOffsets;
    this.fieldHasPayloads = fieldHasPayloads;
  }

  @Override
  public void resetSkip() {
    super.resetSkip();
    Arrays.fill(lastSkipDoc, 0);
    Arrays.fill(lastSkipDocPointer, docOut.getFilePointer());
    if (fieldHasPositions) {
      Arrays.fill(lastSkipPosPointer, posOut.getFilePointer());
      if (fieldHasOffsets) {
        Arrays.fill(lastEndOffset, 0);
      }
      if (fieldHasPayloads) {
        Arrays.fill(lastPayloadByteUpto, 0);
      }
      if (fieldHasOffsets || fieldHasPayloads) {
        Arrays.fill(lastSkipPayPointer, payOut.getFilePointer());
      }
    }
  }

  /**
   * Sets the values for the current skip data.
   */
  public void bufferSkip(int doc, int numDocs, long posFP, long payFP, int posBufferUpto, int endOffset, int payloadByteUpto) throws IOException {
    this.curDoc = doc;
    this.curDocPointer = docOut.getFilePointer();
    this.curPosPointer = posFP;
    this.curPayPointer = payFP;
    this.curPosBufferUpto = posBufferUpto;
    this.curPayloadByteUpto = payloadByteUpto;
    this.curEndOffset = endOffset;
    bufferSkip(numDocs);
  }

  @Override
  protected void writeSkipData(int level, IndexOutput skipBuffer) throws IOException {
    int delta = curDoc - lastSkipDoc[level];
    if (DEBUG) {
      System.out.println("writeSkipData level=" + level + " lastDoc=" + curDoc + " delta=" + delta + " curDocPointer=" + curDocPointer);
    }
    skipBuffer.writeVInt(delta);
    lastSkipDoc[level] = curDoc;

    skipBuffer.writeVInt((int) (curDocPointer - lastSkipDocPointer[level]));
    lastSkipDocPointer[level] = curDocPointer;

    if (fieldHasPositions) {
      if (DEBUG) {
        System.out.println("  curPosPointer=" + curPosPointer + " curPosBufferUpto=" + curPosBufferUpto);
      }
      skipBuffer.writeVInt((int) (curPosPointer - lastSkipPosPointer[level]));
      lastSkipPosPointer[level] = curPosPointer;
      skipBuffer.writeVInt(curPosBufferUpto);

      if (fieldHasPayloads) {
        skipBuffer.writeVInt(curPayloadByteUpto);
      }

      if (fieldHasOffsets) {
        skipBuffer.writeVInt(curEndOffset - lastEndOffset[level]);
        lastEndOffset[level] = curEndOffset;
      }

      if (fieldHasOffsets || fieldHasPayloads) {
        skipBuffer.writeVInt((int) (curPayPointer - lastSkipPayPointer[level]));
        lastSkipPayPointer[level] = curPayPointer;
      }
    }
  }
}
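
To put the "128*128 is immense?" nocommit in numbers, a small sketch of entry density under the MultiLevelSkipListWriter contract (skipInterval here is the 128-doc block size):

// Entries recorded at a given skip level for a term with docCount docs:
// level L gets one entry per skipInterval^(L+1) docs. With skipInterval=128,
// a 10M-doc term has ~78125 level-0 entries but only ~610 at level 1.
static long entriesAtLevel(long docCount, long skipInterval, int level) {
  long interval = 1;
  for (int i = 0; i <= level; i++) {
    interval *= skipInterval;
  }
  return docCount / interval;
}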

@@ -185,8 +185,6 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
   int lastDocID;
   int df;

-  /** Adds a new doc in this term.  If this returns null
-   *  then we just skip consuming positions/payloads. */
   @Override
   public void startDoc(int docID, int termDocFreq) throws IOException {
     // if (DEBUG) System.out.println("SPW: startDoc seg=" + segment + " docID=" + docID + " tf=" + termDocFreq + " freqOut.fp=" + freqOut.getFilePointer());
@@ -17,8 +17,6 @@ package org.apache.lucene.codecs.pfor;
 */

import java.nio.IntBuffer;
import java.nio.ByteBuffer;
import java.util.Arrays;

/**
 * Encode all values in normal area with fixed bit width,
@@ -73,6 +71,9 @@ public class ForUtil {
     // since this buffer is reused at upper level, rewind first
     intBuffer.rewind();

+    // nocommit assert header isn't "malformed", ie besides
+    // numBytes / bit-width there is nothing else!
+
     int numBits = ((header >> 8) & MASK[6]);

     decompressCore(intBuffer, data, numBits);
@@ -34,6 +34,8 @@ import org.apache.lucene.index.FieldInfo.IndexOptions;
 public class FieldInfos implements Iterable<FieldInfo> {
   private final boolean hasFreq;
   private final boolean hasProx;
+  private final boolean hasPayloads;
+  private final boolean hasOffsets;
   private final boolean hasVectors;
   private final boolean hasNorms;
   private final boolean hasDocValues;
@@ -45,6 +47,8 @@ public class FieldInfos implements Iterable<FieldInfo> {
   public FieldInfos(FieldInfo[] infos) {
     boolean hasVectors = false;
     boolean hasProx = false;
+    boolean hasPayloads = false;
+    boolean hasOffsets = false;
     boolean hasFreq = false;
     boolean hasNorms = false;
     boolean hasDocValues = false;
@@ -58,12 +62,16 @@ public class FieldInfos implements Iterable<FieldInfo> {
       hasVectors |= info.hasVectors();
       hasProx |= info.isIndexed() && info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
       hasFreq |= info.isIndexed() && info.getIndexOptions() != IndexOptions.DOCS_ONLY;
+      hasOffsets |= info.isIndexed() && info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
       hasNorms |= info.hasNorms();
       hasDocValues |= info.hasDocValues();
+      hasPayloads |= info.hasPayloads();
     }

     this.hasVectors = hasVectors;
     this.hasProx = hasProx;
+    this.hasPayloads = hasPayloads;
+    this.hasOffsets = hasOffsets;
     this.hasFreq = hasFreq;
     this.hasNorms = hasNorms;
     this.hasDocValues = hasDocValues;
@@ -79,6 +87,16 @@ public class FieldInfos implements Iterable<FieldInfo> {
   public boolean hasProx() {
     return hasProx;
   }

+  /** Returns true if any fields have payloads */
+  public boolean hasPayloads() {
+    return hasPayloads;
+  }
+
+  /** Returns true if any fields have offsets */
+  public boolean hasOffsets() {
+    return hasOffsets;
+  }
+
   /**
    * @return true if at least one field has any vectors
@@ -20,3 +20,4 @@ org.apache.lucene.codecs.memory.MemoryPostingsFormat
 org.apache.lucene.codecs.pfor.ForPostingsFormat
 org.apache.lucene.codecs.pfor.PForPostingsFormat
 org.apache.lucene.codecs.bulkvint.BulkVIntPostingsFormat
+org.apache.lucene.codecs.block.BlockPostingsFormat
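
This services entry is what makes the name passed to super("Block") resolvable at runtime; a quick hedged check:

// Assumes the entry above is on the classpath:
PostingsFormat pf = PostingsFormat.forName("Block");
assert pf instanceof org.apache.lucene.codecs.block.BlockPostingsFormat;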
@@ -0,0 +1,866 @@
package org.apache.lucene.index;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsConsumer;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.junit.BeforeClass;

/* NOTE: This test focuses on the postings
 * (docs/freqs/positions/payloads/offsets) impl, not the
 * terms dict.  The [stretch] goal is for this test to be
 * so thorough in testing a new PostingsFormat that if this
 * test passes, then all Lucene/Solr tests should also pass.  Ie,
 * if there is some bug in a given PostingsFormat that this
 * test fails to catch then this test needs to be improved! */

// nocommit can we make it easy for testing to pair up a "random terms dict impl" with your postings base format...

// nocommit test when you reuse after skipping a term or two, eg the block reuse case

// nocommit hmm contract says .doc() can return NO_MORE_DOCS
// before nextDoc too...?

/* TODO
  - threads
  - assert doc=-1 before any nextDoc
  - if a PF passes this test but fails other tests then this
    test has a bug!!
  - test tricky reuse cases, eg across fields
  - verify you get null if you pass needFreq/needOffset but
    they weren't indexed
*/

public class TestPostingsFormat extends LuceneTestCase {

  private enum Option {
    // Sometimes use .advance():
    SKIPPING,

    // Sometimes reuse the Docs/AndPositionsEnum across terms:
    REUSE_ENUMS,

    // Sometimes pass non-null live docs:
    LIVE_DOCS,

    // Sometimes seek to term using previously saved TermState:
    TERM_STATE,

    // Sometimes don't fully consume docs from the enum
    PARTIAL_DOC_CONSUME,

    // Sometimes don't fully consume positions at each doc
    PARTIAL_POS_CONSUME,

    // Sometimes check payloads
    PAYLOADS,

    // Test w/ multiple threads
    THREADS};

  private static class FieldAndTerm {
    String field;
    BytesRef term;

    public FieldAndTerm(String field, BytesRef term) {
      this.field = field;
      this.term = BytesRef.deepCopyOf(term);
    }
  }

  private static class Position {
    int position;
    byte[] payload;
    int startOffset;
    int endOffset;
  }

  private static class Posting implements Comparable<Posting> {
    int docID;
    List<Position> positions;

    public int compareTo(Posting other) {
      return docID - other.docID;
    }
  }

  // Holds all postings:
  private static Map<String,Map<BytesRef,List<Posting>>> fields = new TreeMap<String,Map<BytesRef,List<Posting>>>();

  // Holds only live doc postings:
  private static Map<String,Map<BytesRef,List<Posting>>> fieldsLive = new TreeMap<String,Map<BytesRef,List<Posting>>>();

  private static FieldInfos fieldInfos;

  private static int maxDocID;

  private static FixedBitSet globalLiveDocs;

  private static List<FieldAndTerm> allTerms;

  @BeforeClass
  public static void createPostings() throws IOException {

    final int numFields = _TestUtil.nextInt(random(), 1, 5);
    if (VERBOSE) {
      System.out.println("TEST: " + numFields + " fields");
    }

    FieldInfo[] fieldInfoArray = new FieldInfo[numFields];
    int fieldUpto = 0;
    int numMediumTerms = 0;
    int numBigTerms = 0;
    int numManyPositions = 0;
    while (fieldUpto < numFields) {
      String field = _TestUtil.randomSimpleString(random());
      if (fields.containsKey(field)) {
        continue;
      }

      boolean fieldHasPayloads = random().nextBoolean();

      fieldInfoArray[fieldUpto] = new FieldInfo(field, true, fieldUpto, false, false, fieldHasPayloads,
                                                IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS,
                                                null, DocValues.Type.FIXED_INTS_8, null);
      fieldUpto++;

      Map<BytesRef,List<Posting>> postings = new TreeMap<BytesRef,List<Posting>>();
      fields.put(field, postings);
      Set<String> seenTerms = new HashSet<String>();

      // nocommit
      //final int numTerms = atLeast(10);
      final int numTerms = 4;
      for(int termUpto=0;termUpto<numTerms;termUpto++) {
        String term = _TestUtil.randomSimpleString(random());
        if (seenTerms.contains(term)) {
          continue;
        }
        seenTerms.add(term);

        int numDocs;
        if (random().nextInt(10) == 3 && numBigTerms < 3) {
          // 10% of the time make a highish freq term:
          numDocs = _TestUtil.nextInt(random(), 50000, 70000);
          numBigTerms++;
          term = "big_" + term;
        } else if (random().nextInt(10) == 3 && numMediumTerms < 10) {
          // 10% of the time make a medium freq term:
          // nocommit not high enough to test level 1 skipping:
          numDocs = atLeast(3000);
          numMediumTerms++;
          term = "medium_" + term;
        } else {
          // Low freq term:
          numDocs = _TestUtil.nextInt(random(), 1, 40);
          term = "low_" + term;
        }

        numDocs *= RANDOM_MULTIPLIER;

        List<Posting> termPostings = new ArrayList<Posting>();
        postings.put(new BytesRef(term), termPostings);

        int docID = 0;

        // TODO: more realistic to inversely tie this to numDocs:
        int maxDocSpacing = _TestUtil.nextInt(random(), 1, 100);

        // 10% of the time create big payloads:
        int payloadSize;
        if (!fieldHasPayloads) {
          payloadSize = 0;
        } else if (random().nextInt(10) == 7) {
          payloadSize = random().nextInt(50);
        } else {
          payloadSize = random().nextInt(10);
        }

        boolean fixedPayloads = random().nextBoolean();

        for(int docUpto=0;docUpto<numDocs;docUpto++) {
          if (docUpto == 0 && random().nextBoolean()) {
            // Sometimes index docID = 0
          } else if (maxDocSpacing == 1) {
            docID++;
          } else {
            // nocommit: sometimes have a biggish gap here!
            docID += _TestUtil.nextInt(random(), 1, maxDocSpacing);
          }

          Posting posting = new Posting();
          posting.docID = docID;
          maxDocID = Math.max(docID, maxDocID);
          posting.positions = new ArrayList<Position>();
          termPostings.add(posting);

          int freq;
          if (random().nextInt(30) == 17 && numManyPositions < 10) {
            freq = _TestUtil.nextInt(random(), 1, 1000);
            numManyPositions++;
          } else {
            freq = _TestUtil.nextInt(random(), 1, 20);
          }
          int pos = 0;
          int offset = 0;
          int posSpacing = _TestUtil.nextInt(random(), 1, 100);
          for(int posUpto=0;posUpto<freq;posUpto++) {
            if (posUpto == 0 && random().nextBoolean()) {
              // Sometimes index pos = 0
            } else if (posSpacing == 1) {
              pos++;
            } else {
              pos += _TestUtil.nextInt(random(), 1, posSpacing);
            }

            Position position = new Position();
            posting.positions.add(position);
            position.position = pos;
            if (payloadSize != 0) {
              if (fixedPayloads) {
                position.payload = new byte[payloadSize];
              } else {
                int thisPayloadSize = random().nextInt(payloadSize);
                if (thisPayloadSize != 0) {
                  position.payload = new byte[thisPayloadSize];
                }
              }
            }

            if (position.payload != null) {
              random().nextBytes(position.payload);
            }

            position.startOffset = offset + random().nextInt(5);
            position.endOffset = position.startOffset + random().nextInt(10);
            offset = position.endOffset;
          }
        }
      }
    }

    fieldInfos = new FieldInfos(fieldInfoArray);

    globalLiveDocs = new FixedBitSet(1+maxDocID);
    double liveRatio = random().nextDouble();
    for(int i=0;i<1+maxDocID;i++) {
      if (random().nextDouble() <= liveRatio) {
        globalLiveDocs.set(i);
      }
    }

    // Pre-filter postings by globalLiveDocs:
    for(Map.Entry<String,Map<BytesRef,List<Posting>>> fieldEnt : fields.entrySet()) {
      Map<BytesRef,List<Posting>> postingsLive = new TreeMap<BytesRef,List<Posting>>();
      fieldsLive.put(fieldEnt.getKey(), postingsLive);
      for(Map.Entry<BytesRef,List<Posting>> termEnt : fieldEnt.getValue().entrySet()) {
        List<Posting> termPostingsLive = new ArrayList<Posting>();
        postingsLive.put(termEnt.getKey(), termPostingsLive);
        for(Posting posting : termEnt.getValue()) {
          if (globalLiveDocs.get(posting.docID)) {
            termPostingsLive.add(posting);
          }
        }
      }
    }

    allTerms = new ArrayList<FieldAndTerm>();
    for(Map.Entry<String,Map<BytesRef,List<Posting>>> fieldEnt : fields.entrySet()) {
      String field = fieldEnt.getKey();
      for(Map.Entry<BytesRef,List<Posting>> termEnt : fieldEnt.getValue().entrySet()) {
        allTerms.add(new FieldAndTerm(field, termEnt.getKey()));
      }
    }

    if (VERBOSE) {
      System.out.println("TEST: done init postings; maxDocID=" + maxDocID + "; " + allTerms.size() + " total terms, across " + fieldInfos.size() + " fields");
    }
  }
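
  // The maps built above are the oracle for the rest of the test: each enum
  // returned by the PostingsFormat under test is walked doc by doc (and
  // position by position) against fields/fieldsLive in verifyEnum below.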
|
||||
|
||||
// nocommit maybe instead of @BeforeClass just make a single test run: build postings & index & test it?
|
||||
|
||||
private FieldInfos currentFieldInfos;
|
||||
|
||||
// maxAllowed = the "highest" we can index, but we will still
|
||||
// randomly index at lower IndexOption
|
||||
private FieldsProducer buildIndex(Directory dir, IndexOptions maxAllowed, boolean allowPayloads) throws IOException {
|
||||
SegmentInfo segmentInfo = new SegmentInfo(dir, Constants.LUCENE_MAIN_VERSION, "_0", 1+maxDocID, false, Codec.getDefault(), null, null);
|
||||
|
||||
int maxIndexOption = Arrays.asList(IndexOptions.values()).indexOf(maxAllowed);
|
||||
if (VERBOSE) {
|
||||
System.out.println("\nTEST: now build index");
|
||||
}
|
||||
|
||||
// nocommit use allowPayloads
|
||||
|
||||
FieldInfo[] newFieldInfoArray = new FieldInfo[fields.size()];
|
||||
for(int fieldUpto=0;fieldUpto<fields.size();fieldUpto++) {
|
||||
FieldInfo oldFieldInfo = fieldInfos.fieldInfo(fieldUpto);
|
||||
|
||||
// Randomly picked the IndexOptions to index this
|
||||
// field with:
|
||||
IndexOptions indexOptions = IndexOptions.values()[random().nextInt(1+maxIndexOption)];
|
||||
boolean doPayloads = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 && allowPayloads;
|
||||
|
||||
newFieldInfoArray[fieldUpto] = new FieldInfo(oldFieldInfo.name,
|
||||
true,
|
||||
fieldUpto,
|
||||
false,
|
||||
false,
|
||||
doPayloads,
|
||||
indexOptions,
|
||||
null,
|
||||
DocValues.Type.FIXED_INTS_8,
|
||||
null);
|
||||
}
|
||||
|
||||
FieldInfos newFieldInfos = new FieldInfos(newFieldInfoArray);
|
||||
|
||||
SegmentWriteState writeState = new SegmentWriteState(null, dir,
|
||||
segmentInfo, newFieldInfos,
|
||||
32, null, IOContext.DEFAULT);
|
||||
|
||||
FieldsConsumer fieldsConsumer = Codec.getDefault().postingsFormat().fieldsConsumer(writeState);
|
||||
|
||||
for(Map.Entry<String,Map<BytesRef,List<Posting>>> fieldEnt : fields.entrySet()) {
|
||||
String field = fieldEnt.getKey();
|
||||
Map<BytesRef,List<Posting>> terms = fieldEnt.getValue();
|
||||
|
||||
FieldInfo fieldInfo = newFieldInfos.fieldInfo(field);
|
||||
if (VERBOSE) {
|
||||
System.out.println("field=" + field);
|
||||
}
|
||||
|
||||
IndexOptions indexOptions = fieldInfo.getIndexOptions();
|
||||
|
||||
boolean doFreq = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
|
||||
boolean doPos = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||
boolean doPayloads = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 && allowPayloads;
|
||||
|
||||
TermsConsumer termsConsumer = fieldsConsumer.addField(fieldInfo);
|
||||
long sumTotalTF = 0;
|
||||
long sumDF = 0;
|
||||
FixedBitSet seenDocs = new FixedBitSet(maxDocID+1);
|
||||
for(Map.Entry<BytesRef,List<Posting>> termEnt : terms.entrySet()) {
|
||||
BytesRef term = termEnt.getKey();
|
||||
List<Posting> postings = termEnt.getValue();
|
||||
if (VERBOSE) {
|
||||
System.out.println(" term=" + field + ":" + term.utf8ToString() + " docFreq=" + postings.size());
|
||||
}
|
||||
|
||||
PostingsConsumer postingsConsumer = termsConsumer.startTerm(term);
|
||||
long totalTF = 0;
|
||||
int docCount = 0;
|
||||
for(Posting posting : postings) {
|
||||
if (VERBOSE) {
|
||||
System.out.println(" " + docCount + ": docID=" + posting.docID + " freq=" + posting.positions.size());
|
||||
}
|
||||
postingsConsumer.startDoc(posting.docID, posting.positions.size());
|
||||
seenDocs.set(posting.docID);
|
||||
if (doPos) {
|
||||
totalTF += posting.positions.size();
|
||||
for(Position pos : posting.positions) {
|
||||
if (VERBOSE) {
|
||||
if (doPayloads) {
|
||||
System.out.println(" pos=" + pos.position + " payload=" + (pos.payload == null ? "null" : pos.payload.length + " bytes"));
|
||||
} else {
|
||||
System.out.println(" pos=" + pos.position);
|
||||
}
|
||||
}
|
||||
postingsConsumer.addPosition(pos.position, (doPayloads && pos.payload != null) ? new BytesRef(pos.payload) : null, pos.startOffset, pos.endOffset);
|
||||
}
|
||||
} else if (doFreq) {
|
||||
totalTF += posting.positions.size();
|
||||
} else {
|
||||
totalTF++;
|
||||
}
|
||||
docCount++;
|
||||
}
|
||||
termsConsumer.finishTerm(term, new TermStats(postings.size(), totalTF));
|
||||
sumTotalTF += totalTF;
|
||||
sumDF += postings.size();
|
||||
}
|
||||
|
||||
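      // Per-field stats: total term freq summed across terms, sum of
      // docFreqs, and the number of unique docs that had at least one
      // term for this field: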
      termsConsumer.finish(sumTotalTF, sumDF, seenDocs.cardinality());
    }

    fieldsConsumer.close();

    if (VERBOSE) {
      System.out.println("TEST: after indexing: files=");
      for(String file : dir.listAll()) {
        System.out.println(" " + file + ": " + dir.fileLength(file) + " bytes");
      }
    }

    currentFieldInfos = newFieldInfos;

    SegmentReadState readState = new SegmentReadState(dir, segmentInfo, newFieldInfos, IOContext.DEFAULT, 1);

    return Codec.getDefault().postingsFormat().fieldsProducer(readState);
  }

  private static class ThreadState {
    // Only used with REUSE option:
    public DocsEnum reuseDocsEnum;
    public DocsAndPositionsEnum reuseDocsAndPositionsEnum;
  }

  private void verifyEnum(ThreadState threadState,
                          String field,
                          BytesRef term,
                          TermsEnum termsEnum,

                          // Maximum options (docs/freqs/positions/offsets) to test:
                          IndexOptions maxIndexOptions,

                          EnumSet<Option> options) throws IOException {

    if (VERBOSE) {
      System.out.println(" verifyEnum: options=" + options + " maxIndexOptions=" + maxIndexOptions);
    }

    // 50% of the time pass liveDocs:
    Bits liveDocs;
    Map<String,Map<BytesRef,List<Posting>>> fieldsToUse;
    if (options.contains(Option.LIVE_DOCS) && random().nextBoolean()) {
      liveDocs = globalLiveDocs;
      fieldsToUse = fieldsLive;
      if (VERBOSE) {
        System.out.println(" use liveDocs");
      }
    } else {
      liveDocs = null;
      fieldsToUse = fields;
      if (VERBOSE) {
        System.out.println(" no liveDocs");
      }
    }

    FieldInfo fieldInfo = currentFieldInfos.fieldInfo(field);

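    // docFreq is always checked against the full (undeleted) postings
    // because docFreq stats never take liveDocs into account: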
    assertEquals(fields.get(field).get(term).size(), termsEnum.docFreq());

    // NOTE: can be empty list if we are using liveDocs:
    List<Posting> expected = fieldsToUse.get(field).get(term);

    boolean allowFreqs = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0 &&
      maxIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
    boolean doCheckFreqs = allowFreqs && random().nextInt(3) <= 2;
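    // Note: nextInt(3) <= 2 is always true, so these doCheckXXX gates
    // currently fire whenever the index options allow it; presumably a
    // leftover knob for dialing the check probability down.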

    boolean allowPositions = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 &&
      maxIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    boolean doCheckPositions = allowPositions && random().nextInt(3) <= 2;

    boolean allowOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0 &&
      maxIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    boolean doCheckOffsets = allowOffsets && random().nextInt(3) <= 2;

    boolean doCheckPayloads = options.contains(Option.PAYLOADS) && allowPositions && fieldInfo.hasPayloads();

    DocsEnum prevDocsEnum = null;

    DocsEnum docsEnum;
    DocsAndPositionsEnum docsAndPositionsEnum;

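    // DocsAndPositionsEnum extends DocsEnum, so when we pull a
    // positions enum the same instance serves as both below: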
    if (!doCheckPositions) {
      if (allowPositions && random().nextInt(10) == 7) {
        // 10% of the time, even though we will not check positions, pull a DocsAndPositions enum
        if (VERBOSE) {
          System.out.println(" get DocsAndPositionsEnum (but we won't check positions)");
        }

        if (options.contains(Option.REUSE_ENUMS) && random().nextInt(10) < 9) {
          prevDocsEnum = threadState.reuseDocsAndPositionsEnum;
        }

        threadState.reuseDocsAndPositionsEnum = termsEnum.docsAndPositions(liveDocs, (DocsAndPositionsEnum) prevDocsEnum, false);
        docsEnum = threadState.reuseDocsAndPositionsEnum;
        docsAndPositionsEnum = threadState.reuseDocsAndPositionsEnum;
      } else {
        if (VERBOSE) {
          System.out.println(" get DocsEnum");
        }
        if (options.contains(Option.REUSE_ENUMS) && random().nextInt(10) < 9) {
          prevDocsEnum = threadState.reuseDocsEnum;
        }
        threadState.reuseDocsEnum = termsEnum.docs(liveDocs, prevDocsEnum, doCheckFreqs);
        docsEnum = threadState.reuseDocsEnum;
        docsAndPositionsEnum = null;
      }
    } else {
      if (VERBOSE) {
        System.out.println(" get DocsAndPositionsEnum");
      }
      if (options.contains(Option.REUSE_ENUMS) && random().nextInt(10) < 9) {
        prevDocsEnum = threadState.reuseDocsAndPositionsEnum;
      }
      threadState.reuseDocsAndPositionsEnum = termsEnum.docsAndPositions(liveDocs, (DocsAndPositionsEnum) prevDocsEnum, doCheckOffsets);
      docsEnum = threadState.reuseDocsAndPositionsEnum;
      docsAndPositionsEnum = threadState.reuseDocsAndPositionsEnum;
    }

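    // Before any nextDoc()/advance() the enum must be unpositioned:
    // docID() returns -1 (NO_MORE_DOCS is also tolerated here):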
    assertNotNull(docsEnum);
    int initialDocID = docsEnum.docID();
    assertTrue(initialDocID == -1 || initialDocID == DocsEnum.NO_MORE_DOCS);

    if (VERBOSE) {
      if (prevDocsEnum == null) {
        System.out.println(" got enum=" + docsEnum);
      } else if (prevDocsEnum == docsEnum) {
        System.out.println(" got reuse enum=" + docsEnum);
      } else {
        System.out.println(" got enum=" + docsEnum + " (reuse of " + prevDocsEnum + " failed)");
      }
    }

    // 10% of the time don't consume all docs:
    int stopAt;
    if (options.contains(Option.PARTIAL_DOC_CONSUME) && expected.size() > 1 && random().nextInt(10) == 7) {
      stopAt = random().nextInt(expected.size()-1);
      if (VERBOSE) {
        System.out.println(" will not consume all docs (" + stopAt + " vs " + expected.size() + ")");
      }
    } else {
      stopAt = expected.size();
      if (VERBOSE) {
        System.out.println(" consume all docs");
      }
    }

    double skipChance = random().nextDouble();
    int numSkips = expected.size() < 3 ? 1 : _TestUtil.nextInt(random(), 1, Math.min(20, expected.size()/3));
    int skipInc = expected.size()/numSkips;
    int skipDocInc = (1+maxDocID)/numSkips;
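    // skipInc strides through the expected-postings list; skipDocInc
    // strides through docID space for advance() targets that may not
    // correspond to a real posting: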

    // Sometimes do 100% skipping:
    boolean doAllSkipping = options.contains(Option.SKIPPING) && random().nextInt(7) == 1;

    double freqAskChance = random().nextDouble();
    double payloadCheckChance = random().nextDouble();
    double offsetCheckChance = random().nextDouble();

    if (VERBOSE) {
      if (options.contains(Option.SKIPPING)) {
        System.out.println(" skipChance=" + skipChance + " numSkips=" + numSkips);
      } else {
        System.out.println(" no skipping");
      }
      if (doCheckFreqs) {
        System.out.println(" freqAskChance=" + freqAskChance);
      }
      if (doCheckPayloads) {
        System.out.println(" payloadCheckChance=" + payloadCheckChance);
      }
      if (doCheckOffsets) {
        System.out.println(" offsetCheckChance=" + offsetCheckChance);
      }
    }

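    // Walk the expected postings: each step either skips ahead via
    // advance() or steps with nextDoc(), then spot-checks freq,
    // positions, payloads and offsets against the in-memory model: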
    int nextPosting = 0;
    while (nextPosting <= stopAt) {
      if (nextPosting == stopAt) {
        if (stopAt == expected.size()) {
          assertEquals(DocsEnum.NO_MORE_DOCS, docsEnum.nextDoc());

          // A common bug is to forget to set this.doc=NO_MORE_DOCS in the enum!:
          assertEquals(DocsEnum.NO_MORE_DOCS, docsEnum.docID());
        }
        break;
      }

      Posting posting;
      if (options.contains(Option.SKIPPING) && (doAllSkipping || random().nextDouble() <= skipChance)) {
        int targetDocID = -1;
        if (nextPosting < stopAt && random().nextBoolean()) {
          // Pick a target we know exists:
          nextPosting = _TestUtil.nextInt(random(), nextPosting, nextPosting+skipInc);
        } else {
          // Pick a random target (might not exist):
          Posting target = new Posting();
          target.docID = _TestUtil.nextInt(random(), expected.get(nextPosting).docID, expected.get(nextPosting).docID+skipDocInc);
          targetDocID = target.docID;
          int loc = Collections.binarySearch(expected.subList(nextPosting, expected.size()), target);
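          // binarySearch returns (-(insertion point) - 1) when the
          // target is absent; convert that to the index of the first
          // posting with docID >= target: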
if (loc < 0) {
|
||||
loc = -loc-1;
|
||||
}
|
||||
nextPosting = nextPosting + loc;
|
||||
}
|
||||
|
||||
if (nextPosting >= stopAt) {
|
||||
int target = random().nextBoolean() ? (maxDocID+1) : DocsEnum.NO_MORE_DOCS;
|
||||
if (VERBOSE) {
|
||||
System.out.println(" now advance to end (target=" + target + ")");
|
||||
}
|
||||
assertEquals(DocsEnum.NO_MORE_DOCS, docsEnum.advance(target));
|
||||
break;
|
||||
} else {
|
||||
posting = expected.get(nextPosting++);
|
||||
if (VERBOSE) {
|
||||
if (targetDocID != -1) {
|
||||
System.out.println(" now advance to random target=" + targetDocID + " (" + nextPosting + " of " + stopAt + ")");
|
||||
} else {
|
||||
System.out.println(" now advance to known-exists target=" + posting.docID + " (" + nextPosting + " of " + stopAt + ")");
|
||||
}
|
||||
}
|
||||
int docID = docsEnum.advance(targetDocID != -1 ? targetDocID : posting.docID);
|
||||
assertEquals(posting.docID, docID);
|
||||
}
|
||||
} else {
|
||||
posting = expected.get(nextPosting++);
|
||||
if (VERBOSE) {
|
||||
System.out.println(" now nextDoc to " + posting.docID + " (" + nextPosting + " of " + stopAt + ")");
|
||||
}
|
||||
int docID = docsEnum.nextDoc();
|
||||
assertEquals(posting.docID, docID);
|
||||
}
|
||||
|
||||
if (doCheckFreqs && random().nextDouble() <= freqAskChance) {
|
||||
if (VERBOSE) {
|
||||
System.out.println(" now freq()=" + posting.positions.size());
|
||||
}
|
||||
int freq = docsEnum.freq();
|
||||
assertEquals(posting.positions.size(), freq);
|
||||
}
|
||||
|
||||
if (doCheckPositions) {
|
||||
int freq = docsEnum.freq();
|
||||
int numPosToConsume;
|
||||
if (options.contains(Option.PARTIAL_POS_CONSUME) && random().nextInt(5) == 1) {
|
||||
numPosToConsume = random().nextInt(freq);
|
||||
} else {
|
||||
numPosToConsume = freq;
|
||||
}
|
||||
|
||||
for(int i=0;i<numPosToConsume;i++) {
|
||||
Position position = posting.positions.get(i);
|
||||
if (VERBOSE) {
|
||||
System.out.println(" now nextPosition to " + position.position);
|
||||
}
|
||||
assertEquals(position.position, docsAndPositionsEnum.nextPosition());
|
||||
|
||||
// nocommit sometimes don't pull the payload even
|
||||
// though we pulled the position
|
||||
|
||||
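          // The enum contract exercised here appears to be that the
          // payload may be pulled at most once per position: after
          // getPayload() the hasPayload() flag is cleared, which the
          // asserts below rely on: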
          if (doCheckPayloads) {
            if (random().nextDouble() <= payloadCheckChance) {
              if (VERBOSE) {
                System.out.println(" now check payload length=" + (position.payload == null ? 0 : position.payload.length));
              }
              if (position.payload == null || position.payload.length == 0) {
                assertFalse(docsAndPositionsEnum.hasPayload());
              } else {
                assertTrue(docsAndPositionsEnum.hasPayload());

                BytesRef payload = docsAndPositionsEnum.getPayload();
                assertFalse(docsAndPositionsEnum.hasPayload());

                assertNotNull(payload);
                assertEquals(position.payload.length, payload.length);
                for(int byteUpto=0;byteUpto<position.payload.length;byteUpto++) {
                  assertEquals(position.payload[byteUpto],
                               payload.bytes[payload.offset+byteUpto]);
                }
              }
            } else {
              if (VERBOSE) {
                System.out.println(" skip check payload length=" + (position.payload == null ? 0 : position.payload.length));
              }
            }
          }

          if (doCheckOffsets) {
            if (random().nextDouble() <= offsetCheckChance) {
              if (VERBOSE) {
                System.out.println(" now check offsets: startOff=" + position.startOffset + " endOffset=" + position.endOffset);
              }
              assertEquals(position.startOffset, docsAndPositionsEnum.startOffset());
              assertEquals(position.endOffset, docsAndPositionsEnum.endOffset());
            } else {
              if (VERBOSE) {
                System.out.println(" skip check offsets");
              }
            }
          } else {
            assertEquals(-1, docsAndPositionsEnum.startOffset());
            assertEquals(-1, docsAndPositionsEnum.endOffset());
          }
        }
      }
    }
  }

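  // With Option.THREADS, several threads verify concurrently; each
  // thread gets its own ThreadState so reused enums are never shared: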
  private void testTerms(final Fields fieldsSource, final EnumSet<Option> options, final IndexOptions maxIndexOptions) throws Exception {

    if (options.contains(Option.THREADS)) {
      int numThreads = _TestUtil.nextInt(random(), 2, 5);
      Thread[] threads = new Thread[numThreads];
      for(int threadUpto=0;threadUpto<numThreads;threadUpto++) {
        threads[threadUpto] = new Thread() {
            @Override
            public void run() {
              try {
                testTermsOneThread(fieldsSource, options, maxIndexOptions);
              } catch (Throwable t) {
                throw new RuntimeException(t);
              }
            }
          };
        threads[threadUpto].start();
      }
      for(int threadUpto=0;threadUpto<numThreads;threadUpto++) {
        threads[threadUpto].join();
      }
    } else {
      testTermsOneThread(fieldsSource, options, maxIndexOptions);
    }
  }

  private void testTermsOneThread(Fields fieldsSource, EnumSet<Option> options, IndexOptions maxIndexOptions) throws IOException {

    ThreadState threadState = new ThreadState();

    // Test random terms/fields:
    List<TermState> termStates = new ArrayList<TermState>();
    List<FieldAndTerm> termStateTerms = new ArrayList<FieldAndTerm>();

    Collections.shuffle(allTerms, random());
    int upto = 0;
    while (upto < allTerms.size()) {

      boolean useTermState = termStates.size() != 0 && random().nextInt(5) == 1;
      FieldAndTerm fieldAndTerm;
      TermsEnum termsEnum;

      TermState termState = null;

      if (!useTermState) {
        // Seek by random field+term:
        fieldAndTerm = allTerms.get(upto++);
        if (VERBOSE) {
          System.out.println("\nTEST: seek to term=" + fieldAndTerm.field + ":" + fieldAndTerm.term.utf8ToString());
        }
      } else {
        // Seek by a previously saved TermState:
        int idx = random().nextInt(termStates.size());
        fieldAndTerm = termStateTerms.get(idx);
        if (VERBOSE) {
          System.out.println("\nTEST: seek using TermState to term=" + fieldAndTerm.field + ":" + fieldAndTerm.term.utf8ToString());
        }
        termState = termStates.get(idx);
      }

      Terms terms = fieldsSource.terms(fieldAndTerm.field);
      assertNotNull(terms);
      termsEnum = terms.iterator(null);

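      // Two seek flavors in this 4.0-dev API: seekExact(term, true)
      // does a fresh dictionary lookup (the boolean enables the terms
      // cache), while seekExact(term, termState) restores a previously
      // captured position without re-seeking: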
      if (!useTermState) {
        assertTrue(termsEnum.seekExact(fieldAndTerm.term, true));
      } else {
        termsEnum.seekExact(fieldAndTerm.term, termState);
      }

      boolean savedTermState = false;

      if (options.contains(Option.TERM_STATE) && !useTermState && random().nextInt(5) == 1) {
        // Save away this TermState:
        termStates.add(termsEnum.termState());
        termStateTerms.add(fieldAndTerm);
        savedTermState = true;
      }

      verifyEnum(threadState,
                 fieldAndTerm.field,
                 fieldAndTerm.term,
                 termsEnum,
                 maxIndexOptions,
                 options);

      // Sometimes save term state after pulling the enum:
      if (options.contains(Option.TERM_STATE) && !useTermState && !savedTermState && random().nextInt(5) == 1) {
        // Save away this TermState:
        termStates.add(termsEnum.termState());
        termStateTerms.add(fieldAndTerm);
        useTermState = true;
      }

      // 10% of the time make sure you can pull another enum
      // from the same term:
      if (random().nextInt(10) == 7) {
        // Try same term again
        if (VERBOSE) {
          System.out.println("TEST: try enum again on same term");
        }

        verifyEnum(threadState,
                   fieldAndTerm.field,
                   fieldAndTerm.term,
                   termsEnum,
                   maxIndexOptions,
                   options);
      }
    }
  }

  public void test() throws Exception {
    Directory dir = newDirectory();

    boolean indexPayloads = random().nextBoolean();
    // nocommit test thread safety of buildIndex too
    FieldsProducer fieldsProducer = buildIndex(dir, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, indexPayloads);

    //testTerms(fieldsProducer, EnumSet.noneOf(Option.class), IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    //testTerms(fieldsProducer, EnumSet.of(Option.LIVE_DOCS), IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    //testTerms(fieldsProducer, EnumSet.of(Option.TERM_STATE, Option.LIVE_DOCS, Option.PARTIAL_DOC_CONSUME, Option.PARTIAL_POS_CONSUME), IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);

    //testTerms(fieldsProducer, EnumSet.of(Option.SKIPPING), IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    //testTerms(fieldsProducer, EnumSet.of(Option.THREADS, Option.TERM_STATE, Option.SKIPPING, Option.PARTIAL_DOC_CONSUME, Option.PARTIAL_POS_CONSUME), IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    //testTerms(fieldsProducer, EnumSet.of(Option.TERM_STATE, Option.SKIPPING, Option.PARTIAL_DOC_CONSUME, Option.PARTIAL_POS_CONSUME), IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    testTerms(fieldsProducer, EnumSet.of(Option.TERM_STATE, Option.PAYLOADS, Option.PARTIAL_DOC_CONSUME, Option.PARTIAL_POS_CONSUME, Option.SKIPPING), IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);

    fieldsProducer.close();
    dir.close();
  }
}

// nocommit test that start/endOffset return -1 if field has
// no offsets