mirror of https://github.com/apache/lucene.git
LUCENE-4225: BlockPostingsFormat
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/pforcodec_3892@1363421 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
cf36fb9a58
commit
1e49670a55
|
@ -24,7 +24,8 @@
|
||||||
</subant>
|
</subant>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<target name="test" description="Test both Lucene and Solr" depends="validate">
|
<!-- nocommit put depends="validate" back -->
|
||||||
|
<target name="test" description="Test both Lucene and Solr">
|
||||||
<sequential>
|
<sequential>
|
||||||
<subant target="test" inheritall="false" failonerror="true">
|
<subant target="test" inheritall="false" failonerror="true">
|
||||||
<fileset dir="lucene" includes="build.xml" />
|
<fileset dir="lucene" includes="build.xml" />
|
||||||
|
|
|
@ -724,7 +724,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
|
||||||
// Write term stats, to separate byte[] blob:
|
// Write term stats, to separate byte[] blob:
|
||||||
bytesWriter2.writeVInt(term.stats.docFreq);
|
bytesWriter2.writeVInt(term.stats.docFreq);
|
||||||
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
|
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
|
||||||
assert term.stats.totalTermFreq >= term.stats.docFreq;
|
assert term.stats.totalTermFreq >= term.stats.docFreq: term.stats.totalTermFreq + " vs " + term.stats.docFreq;
|
||||||
bytesWriter2.writeVLong(term.stats.totalTermFreq - term.stats.docFreq);
|
bytesWriter2.writeVLong(term.stats.totalTermFreq - term.stats.docFreq);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,108 @@
|
||||||
|
package org.apache.lucene.codecs.block;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.codecs.BlockTreeTermsReader;
|
||||||
|
import org.apache.lucene.codecs.BlockTreeTermsWriter;
|
||||||
|
import org.apache.lucene.codecs.FieldsConsumer;
|
||||||
|
import org.apache.lucene.codecs.FieldsProducer;
|
||||||
|
import org.apache.lucene.codecs.PostingsFormat;
|
||||||
|
import org.apache.lucene.codecs.PostingsReaderBase;
|
||||||
|
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||||
|
import org.apache.lucene.index.SegmentReadState;
|
||||||
|
import org.apache.lucene.index.SegmentWriteState;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Pass ForFactory to a PostingsWriter/ReaderBase, and get
|
||||||
|
* customized postings format plugged.
|
||||||
|
*/
|
||||||
|
public final class BlockPostingsFormat extends PostingsFormat {
|
||||||
|
public static final String DOC_EXTENSION = "doc";
|
||||||
|
public static final String POS_EXTENSION = "pos";
|
||||||
|
public static final String PAY_EXTENSION = "pay";
|
||||||
|
|
||||||
|
private final int minTermBlockSize;
|
||||||
|
private final int maxTermBlockSize;
|
||||||
|
public final static int DEFAULT_BLOCK_SIZE = 128;
|
||||||
|
|
||||||
|
public BlockPostingsFormat() {
|
||||||
|
this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
|
public BlockPostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
|
||||||
|
super("Block");
|
||||||
|
this.minTermBlockSize = minTermBlockSize;
|
||||||
|
assert minTermBlockSize > 1;
|
||||||
|
this.maxTermBlockSize = maxTermBlockSize;
|
||||||
|
assert minTermBlockSize <= maxTermBlockSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return getName() + "(blocksize=" + DEFAULT_BLOCK_SIZE + ")";
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
|
||||||
|
// TODO: implement a new PostingsWriterBase to improve skip-settings
|
||||||
|
PostingsWriterBase postingsWriter = new BlockPostingsWriter(state, 128);
|
||||||
|
|
||||||
|
boolean success = false;
|
||||||
|
try {
|
||||||
|
FieldsConsumer ret = new BlockTreeTermsWriter(state,
|
||||||
|
postingsWriter,
|
||||||
|
minTermBlockSize,
|
||||||
|
maxTermBlockSize);
|
||||||
|
success = true;
|
||||||
|
return ret;
|
||||||
|
} finally {
|
||||||
|
if (!success) {
|
||||||
|
IOUtils.closeWhileHandlingException(postingsWriter);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
|
||||||
|
PostingsReaderBase postingsReader = new BlockPostingsReader(state.dir,
|
||||||
|
state.fieldInfos,
|
||||||
|
state.segmentInfo,
|
||||||
|
state.context,
|
||||||
|
state.segmentSuffix,
|
||||||
|
128);
|
||||||
|
boolean success = false;
|
||||||
|
try {
|
||||||
|
FieldsProducer ret = new BlockTreeTermsReader(state.dir,
|
||||||
|
state.fieldInfos,
|
||||||
|
state.segmentInfo.name,
|
||||||
|
postingsReader,
|
||||||
|
state.context,
|
||||||
|
state.segmentSuffix,
|
||||||
|
state.termsIndexDivisor);
|
||||||
|
success = true;
|
||||||
|
return ret;
|
||||||
|
} finally {
|
||||||
|
if (!success) {
|
||||||
|
IOUtils.closeWhileHandlingException(postingsReader);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,574 @@
|
||||||
|
package org.apache.lucene.codecs.block;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.ByteBuffer;
|
||||||
|
import java.nio.IntBuffer;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.lucene.codecs.CodecUtil;
|
||||||
|
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||||
|
import org.apache.lucene.codecs.TermStats;
|
||||||
|
import org.apache.lucene.codecs.pfor.ForUtil; // nocommit move here?
|
||||||
|
import org.apache.lucene.index.CorruptIndexException;
|
||||||
|
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||||
|
import org.apache.lucene.index.FieldInfo;
|
||||||
|
import org.apache.lucene.index.IndexFileNames;
|
||||||
|
import org.apache.lucene.index.SegmentWriteState;
|
||||||
|
import org.apache.lucene.store.IndexOutput;
|
||||||
|
import org.apache.lucene.store.RAMOutputStream;
|
||||||
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
|
||||||
|
// nocommit javadocs
|
||||||
|
|
||||||
|
public final class BlockPostingsWriter extends PostingsWriterBase {
|
||||||
|
|
||||||
|
private boolean DEBUG = BlockPostingsReader.DEBUG;
|
||||||
|
|
||||||
|
// nocommit move these constants to the PF:
|
||||||
|
|
||||||
|
static final int maxSkipLevels = 10;
|
||||||
|
|
||||||
|
final static String TERMS_CODEC = "BlockPostingsWriterTerms";
|
||||||
|
final static String DOC_CODEC = "BlockPostingsWriterDoc";
|
||||||
|
final static String POS_CODEC = "BlockPostingsWriterPos";
|
||||||
|
final static String PAY_CODEC = "BlockPostingsWriterPay";
|
||||||
|
|
||||||
|
// Increment version to change it:
|
||||||
|
final static int VERSION_START = 0;
|
||||||
|
final static int VERSION_CURRENT = VERSION_START;
|
||||||
|
|
||||||
|
final IndexOutput docOut;
|
||||||
|
final IndexOutput posOut;
|
||||||
|
final IndexOutput payOut;
|
||||||
|
|
||||||
|
static final int DEFAULT_BLOCK_SIZE = 128;
|
||||||
|
|
||||||
|
final int blockSize;
|
||||||
|
|
||||||
|
private IndexOutput termsOut;
|
||||||
|
|
||||||
|
// How current field indexes postings:
|
||||||
|
private boolean fieldHasFreqs;
|
||||||
|
private boolean fieldHasPositions;
|
||||||
|
private boolean fieldHasOffsets;
|
||||||
|
private boolean fieldHasPayloads;
|
||||||
|
|
||||||
|
// Holds starting file pointers for each term:
|
||||||
|
private long docTermStartFP;
|
||||||
|
private long posTermStartFP;
|
||||||
|
private long payTermStartFP;
|
||||||
|
|
||||||
|
final int[] docDeltaBuffer;
|
||||||
|
final int[] freqBuffer;
|
||||||
|
private int docBufferUpto;
|
||||||
|
|
||||||
|
final int[] posDeltaBuffer;
|
||||||
|
final int[] payloadLengthBuffer;
|
||||||
|
final int[] offsetStartDeltaBuffer;
|
||||||
|
final int[] offsetLengthBuffer;
|
||||||
|
private int posBufferUpto;
|
||||||
|
|
||||||
|
private byte[] payloadBytes;
|
||||||
|
private int payloadByteUpto;
|
||||||
|
|
||||||
|
private int lastBlockDocID;
|
||||||
|
private boolean saveNextPosBlock;
|
||||||
|
private long lastBlockPosFP;
|
||||||
|
private long lastBlockPayFP;
|
||||||
|
private int lastBlockPosBufferUpto;
|
||||||
|
private int lastBlockEndOffset;
|
||||||
|
private int lastBlockPayloadByteUpto;
|
||||||
|
private int lastDocID;
|
||||||
|
private int lastPosition;
|
||||||
|
private int lastEndOffset;
|
||||||
|
private int docCount;
|
||||||
|
|
||||||
|
final byte[] encoded;
|
||||||
|
final IntBuffer encodedBuffer;
|
||||||
|
|
||||||
|
private final BlockSkipWriter skipWriter;
|
||||||
|
|
||||||
|
public BlockPostingsWriter(SegmentWriteState state, int blockSize) throws IOException {
|
||||||
|
super();
|
||||||
|
this.blockSize = blockSize;
|
||||||
|
|
||||||
|
docOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPostingsFormat.DOC_EXTENSION),
|
||||||
|
state.context);
|
||||||
|
IndexOutput posOut = null;
|
||||||
|
IndexOutput payOut = null;
|
||||||
|
boolean success = false;
|
||||||
|
try {
|
||||||
|
CodecUtil.writeHeader(docOut, DOC_CODEC, VERSION_CURRENT);
|
||||||
|
if (state.fieldInfos.hasProx()) {
|
||||||
|
posDeltaBuffer = new int[blockSize];
|
||||||
|
posOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPostingsFormat.POS_EXTENSION),
|
||||||
|
state.context);
|
||||||
|
CodecUtil.writeHeader(posOut, POS_CODEC, VERSION_CURRENT);
|
||||||
|
|
||||||
|
if (state.fieldInfos.hasPayloads()) {
|
||||||
|
payloadBytes = new byte[128];
|
||||||
|
payloadLengthBuffer = new int[blockSize];
|
||||||
|
} else {
|
||||||
|
payloadBytes = null;
|
||||||
|
payloadLengthBuffer = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (state.fieldInfos.hasOffsets()) {
|
||||||
|
offsetStartDeltaBuffer = new int[blockSize];
|
||||||
|
offsetLengthBuffer = new int[blockSize];
|
||||||
|
} else {
|
||||||
|
offsetStartDeltaBuffer = null;
|
||||||
|
offsetLengthBuffer = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) {
|
||||||
|
payOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPostingsFormat.PAY_EXTENSION),
|
||||||
|
state.context);
|
||||||
|
CodecUtil.writeHeader(payOut, PAY_CODEC, VERSION_CURRENT);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
posDeltaBuffer = null;
|
||||||
|
payloadLengthBuffer = null;
|
||||||
|
offsetStartDeltaBuffer = null;
|
||||||
|
offsetLengthBuffer = null;
|
||||||
|
payloadBytes = null;
|
||||||
|
}
|
||||||
|
this.payOut = payOut;
|
||||||
|
this.posOut = posOut;
|
||||||
|
success = true;
|
||||||
|
} finally {
|
||||||
|
if (!success) {
|
||||||
|
IOUtils.closeWhileHandlingException(docOut, posOut, payOut);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
docDeltaBuffer = new int[blockSize];
|
||||||
|
freqBuffer = new int[blockSize];
|
||||||
|
|
||||||
|
skipWriter = new BlockSkipWriter(blockSize,
|
||||||
|
maxSkipLevels,
|
||||||
|
state.segmentInfo.getDocCount(),
|
||||||
|
docOut,
|
||||||
|
posOut,
|
||||||
|
payOut);
|
||||||
|
|
||||||
|
encoded = new byte[blockSize*4 + 4];
|
||||||
|
encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void start(IndexOutput termsOut) throws IOException {
|
||||||
|
this.termsOut = termsOut;
|
||||||
|
CodecUtil.writeHeader(termsOut, TERMS_CODEC, VERSION_CURRENT);
|
||||||
|
termsOut.writeVInt(blockSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setField(FieldInfo fieldInfo) {
|
||||||
|
IndexOptions indexOptions = fieldInfo.getIndexOptions();
|
||||||
|
fieldHasFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
|
||||||
|
fieldHasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||||
|
fieldHasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||||
|
fieldHasPayloads = fieldInfo.hasPayloads();
|
||||||
|
skipWriter.setField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void startTerm() {
|
||||||
|
docTermStartFP = docOut.getFilePointer();
|
||||||
|
if (fieldHasPositions) {
|
||||||
|
posTermStartFP = posOut.getFilePointer();
|
||||||
|
if (fieldHasPayloads || fieldHasOffsets) {
|
||||||
|
payTermStartFP = payOut.getFilePointer();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
lastBlockDocID = -1;
|
||||||
|
lastDocID = 0;
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println("FPW.startTerm startFP=" + docTermStartFP);
|
||||||
|
}
|
||||||
|
skipWriter.resetSkip();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void writeBlock(int[] buffer, IndexOutput out) throws IOException {
|
||||||
|
final int header = ForUtil.compress(buffer, encodedBuffer);
|
||||||
|
//System.out.println(" block has " + numBytes + " bytes");
|
||||||
|
out.writeVInt(header);
|
||||||
|
out.writeBytes(encoded, ForUtil.getEncodedSize(header));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void startDoc(int docID, int termDocFreq) throws IOException {
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println("FPW.startDoc docID=" + docID);
|
||||||
|
}
|
||||||
|
|
||||||
|
// nocommit do this in finishDoc... but does it fail...?
|
||||||
|
// is it not always called...?
|
||||||
|
if (posOut != null && saveNextPosBlock) {
|
||||||
|
lastBlockPosFP = posOut.getFilePointer();
|
||||||
|
if (payOut != null) {
|
||||||
|
lastBlockPayFP = payOut.getFilePointer();
|
||||||
|
}
|
||||||
|
lastBlockPosBufferUpto = posBufferUpto;
|
||||||
|
lastBlockEndOffset = lastEndOffset;
|
||||||
|
lastBlockPayloadByteUpto = payloadByteUpto;
|
||||||
|
saveNextPosBlock = false;
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println(" now save lastBlockPosFP=" + lastBlockPosFP + " lastBlockPosBufferUpto=" + lastBlockPosBufferUpto + " lastBlockPayloadByteUpto=" + lastBlockPayloadByteUpto);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
final int docDelta = docID - lastDocID;
|
||||||
|
if (docID < 0 || (docCount > 0 && docDelta <= 0)) {
|
||||||
|
throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " ) (docOut: " + docOut + ")");
|
||||||
|
}
|
||||||
|
lastDocID = docID;
|
||||||
|
|
||||||
|
docDeltaBuffer[docBufferUpto] = docDelta;
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println(" docDeltaBuffer[" + docBufferUpto + "]=" + docDelta);
|
||||||
|
}
|
||||||
|
if (fieldHasFreqs) {
|
||||||
|
freqBuffer[docBufferUpto] = termDocFreq;
|
||||||
|
}
|
||||||
|
|
||||||
|
docBufferUpto++;
|
||||||
|
docCount++;
|
||||||
|
|
||||||
|
if (docBufferUpto == blockSize) {
|
||||||
|
// nocommit maybe instead of buffering skip before
|
||||||
|
// writing a block based on last block's end data
|
||||||
|
// ... we could buffer after writing the block? only
|
||||||
|
// iffiness with that approach is it could be a
|
||||||
|
// pointlness skip? like we may stop adding docs
|
||||||
|
// right after that, then we have skip point AFTER
|
||||||
|
// last doc. the thing is, in finishTerm we are
|
||||||
|
// already sometimes adding a skip point AFTER the
|
||||||
|
// last doc?
|
||||||
|
if (lastBlockDocID != -1) {
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println(" bufferSkip at writeBlock: lastDocID=" + lastBlockDocID + " docCount=" + (docCount-blockSize));
|
||||||
|
}
|
||||||
|
skipWriter.bufferSkip(lastBlockDocID, docCount-blockSize, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockEndOffset, lastBlockPayloadByteUpto);
|
||||||
|
}
|
||||||
|
lastBlockDocID = docID;
|
||||||
|
saveNextPosBlock = true;
|
||||||
|
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println(" write docDelta block @ fp=" + docOut.getFilePointer());
|
||||||
|
}
|
||||||
|
writeBlock(docDeltaBuffer, docOut);
|
||||||
|
if (fieldHasFreqs) {
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println(" write freq block @ fp=" + docOut.getFilePointer());
|
||||||
|
}
|
||||||
|
writeBlock(freqBuffer, docOut);
|
||||||
|
}
|
||||||
|
docBufferUpto = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
lastPosition = 0;
|
||||||
|
lastEndOffset = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Add a new position & payload */
|
||||||
|
@Override
|
||||||
|
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println("FPW.addPosition pos=" + position + " posBufferUpto=" + posBufferUpto + (fieldHasPayloads ? " payloadByteUpto=" + payloadByteUpto: ""));
|
||||||
|
}
|
||||||
|
posDeltaBuffer[posBufferUpto] = position - lastPosition;
|
||||||
|
if (fieldHasPayloads) {
|
||||||
|
if (payload == null || payload.length == 0) {
|
||||||
|
// no payload
|
||||||
|
payloadLengthBuffer[posBufferUpto] = 0;
|
||||||
|
} else {
|
||||||
|
payloadLengthBuffer[posBufferUpto] = payload.length;
|
||||||
|
if (payloadByteUpto + payload.length > payloadBytes.length) {
|
||||||
|
payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payload.length);
|
||||||
|
}
|
||||||
|
System.arraycopy(payload.bytes, payload.offset, payloadBytes, payloadByteUpto, payload.length);
|
||||||
|
payloadByteUpto += payload.length;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (fieldHasOffsets) {
|
||||||
|
assert startOffset >= lastEndOffset;
|
||||||
|
assert endOffset >= startOffset;
|
||||||
|
offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastEndOffset;
|
||||||
|
offsetLengthBuffer[posBufferUpto] = endOffset - startOffset;
|
||||||
|
lastEndOffset = endOffset;
|
||||||
|
}
|
||||||
|
|
||||||
|
posBufferUpto++;
|
||||||
|
lastPosition = position;
|
||||||
|
if (posBufferUpto == blockSize) {
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println(" write pos bulk block @ fp=" + posOut.getFilePointer());
|
||||||
|
}
|
||||||
|
writeBlock(posDeltaBuffer, posOut);
|
||||||
|
|
||||||
|
if (fieldHasPayloads) {
|
||||||
|
writeBlock(payloadLengthBuffer, payOut);
|
||||||
|
payOut.writeVInt(payloadByteUpto);
|
||||||
|
payOut.writeBytes(payloadBytes, 0, payloadByteUpto);
|
||||||
|
payloadByteUpto = 0;
|
||||||
|
}
|
||||||
|
if (fieldHasOffsets) {
|
||||||
|
writeBlock(offsetStartDeltaBuffer, payOut);
|
||||||
|
writeBlock(offsetLengthBuffer, payOut);
|
||||||
|
}
|
||||||
|
posBufferUpto = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void finishDoc() {
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class PendingTerm {
|
||||||
|
public final long docStartFP;
|
||||||
|
public final long posStartFP;
|
||||||
|
public final long payStartFP;
|
||||||
|
public final int skipOffset;
|
||||||
|
public final int lastPosBlockOffset;
|
||||||
|
|
||||||
|
public PendingTerm(long docStartFP, long posStartFP, long payStartFP, int skipOffset, int lastPosBlockOffset) {
|
||||||
|
this.docStartFP = docStartFP;
|
||||||
|
this.posStartFP = posStartFP;
|
||||||
|
this.payStartFP = payStartFP;
|
||||||
|
this.skipOffset = skipOffset;
|
||||||
|
this.lastPosBlockOffset = lastPosBlockOffset;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private final List<PendingTerm> pendingTerms = new ArrayList<PendingTerm>();
|
||||||
|
|
||||||
|
/** Called when we are done adding docs to this term */
|
||||||
|
@Override
|
||||||
|
public void finishTerm(TermStats stats) throws IOException {
|
||||||
|
|
||||||
|
assert stats.docFreq > 0;
|
||||||
|
|
||||||
|
// TODO: wasteful we are counting this (counting # docs
|
||||||
|
// for this term) in two places?
|
||||||
|
assert stats.docFreq == docCount: stats.docFreq + " vs " + docCount;
|
||||||
|
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println("FPW.finishTerm docFreq=" + stats.docFreq);
|
||||||
|
}
|
||||||
|
|
||||||
|
// nocommit silly that skipper must write skip when we no
|
||||||
|
// postings come after it, but if we don't do this, skip
|
||||||
|
// reader incorrectly thinks it can read another level 0
|
||||||
|
// skip entry here!:
|
||||||
|
//if (docCount > blockSize && docBufferUpto > 0) {
|
||||||
|
if (docCount > blockSize) {
|
||||||
|
final int lastDocCount = blockSize*(docCount/blockSize);
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println(" bufferSkip at finishTerm: lastDocID=" + lastBlockDocID + " docCount=" + lastDocCount);
|
||||||
|
}
|
||||||
|
skipWriter.bufferSkip(lastBlockDocID, lastDocCount, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockEndOffset, lastBlockPayloadByteUpto);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (DEBUG) {
|
||||||
|
if (docBufferUpto > 0) {
|
||||||
|
System.out.println(" write doc/freq vInt block (count=" + docBufferUpto + ") at fp=" + docOut.getFilePointer() + " docTermStartFP=" + docTermStartFP);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// vInt encode the remaining doc deltas and freqs:
|
||||||
|
for(int i=0;i<docBufferUpto;i++) {
|
||||||
|
final int docDelta = docDeltaBuffer[i];
|
||||||
|
final int freq = freqBuffer[i];
|
||||||
|
if (!fieldHasFreqs) {
|
||||||
|
docOut.writeVInt(docDelta);
|
||||||
|
} else if (freqBuffer[i] == 1) {
|
||||||
|
docOut.writeVInt((docDelta<<1)|1);
|
||||||
|
} else {
|
||||||
|
docOut.writeVInt(docDelta<<1);
|
||||||
|
docOut.writeVInt(freq);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
final int lastPosBlockOffset;
|
||||||
|
|
||||||
|
if (fieldHasPositions) {
|
||||||
|
if (DEBUG) {
|
||||||
|
if (posBufferUpto > 0) {
|
||||||
|
System.out.println(" write pos vInt block (count=" + posBufferUpto + ") at fp=" + posOut.getFilePointer() + " posTermStartFP=" + posTermStartFP + " hasPayloads=" + fieldHasPayloads + " hasOffsets=" + fieldHasOffsets);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
assert stats.totalTermFreq != -1;
|
||||||
|
if (stats.totalTermFreq > blockSize) {
|
||||||
|
lastPosBlockOffset = (int) (posOut.getFilePointer() - posTermStartFP);
|
||||||
|
} else {
|
||||||
|
lastPosBlockOffset = -1;
|
||||||
|
}
|
||||||
|
if (posBufferUpto > 0) {
|
||||||
|
posOut.writeVInt(posBufferUpto);
|
||||||
|
|
||||||
|
// nocommit should we send offsets/payloads to
|
||||||
|
// .pay...? seems wasteful (have to store extra
|
||||||
|
// vLong for low (< blockSize) DF terms = vast vast
|
||||||
|
// majority)
|
||||||
|
|
||||||
|
// vInt encode the remaining positions/payloads/offsets:
|
||||||
|
int lastPayloadLength = -1;
|
||||||
|
int payloadBytesReadUpto = 0;
|
||||||
|
for(int i=0;i<posBufferUpto;i++) {
|
||||||
|
final int posDelta = posDeltaBuffer[i];
|
||||||
|
if (fieldHasPayloads) {
|
||||||
|
final int payloadLength = payloadLengthBuffer[i];
|
||||||
|
if (payloadLength != lastPayloadLength) {
|
||||||
|
lastPayloadLength = payloadLength;
|
||||||
|
posOut.writeVInt((posDelta<<1)|1);
|
||||||
|
posOut.writeVInt(payloadLength);
|
||||||
|
} else {
|
||||||
|
posOut.writeVInt(posDelta<<1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println(" i=" + i + " payloadLen=" + payloadLength);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (payloadLength != 0) {
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println(" write payload @ pos.fp=" + posOut.getFilePointer());
|
||||||
|
}
|
||||||
|
posOut.writeBytes(payloadBytes, payloadBytesReadUpto, payloadLength);
|
||||||
|
payloadBytesReadUpto += payloadLength;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
posOut.writeVInt(posDelta);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (fieldHasOffsets) {
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println(" write offset @ pos.fp=" + posOut.getFilePointer());
|
||||||
|
}
|
||||||
|
posOut.writeVInt(offsetStartDeltaBuffer[i]);
|
||||||
|
posOut.writeVInt(offsetLengthBuffer[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (fieldHasPayloads) {
|
||||||
|
assert payloadBytesReadUpto == payloadByteUpto;
|
||||||
|
payloadByteUpto = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println(" totalTermFreq=" + stats.totalTermFreq + " lastPosBlockOffset=" + lastPosBlockOffset);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
lastPosBlockOffset = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int skipOffset;
|
||||||
|
if (docCount > blockSize) {
|
||||||
|
skipOffset = (int) (skipWriter.writeSkip(docOut)-docTermStartFP);
|
||||||
|
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println("skip packet " + (docOut.getFilePointer() - (docTermStartFP + skipOffset)) + " bytes");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
skipOffset = -1;
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println(" no skip: docCount=" + docCount);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
long payStartFP;
|
||||||
|
if (stats.totalTermFreq >= blockSize) {
|
||||||
|
payStartFP = payTermStartFP;
|
||||||
|
} else {
|
||||||
|
payStartFP = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println(" payStartFP=" + payStartFP);
|
||||||
|
}
|
||||||
|
|
||||||
|
pendingTerms.add(new PendingTerm(docTermStartFP, posTermStartFP, payStartFP, skipOffset, lastPosBlockOffset));
|
||||||
|
docBufferUpto = 0;
|
||||||
|
posBufferUpto = 0;
|
||||||
|
lastDocID = 0;
|
||||||
|
docCount = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
private final RAMOutputStream bytesWriter = new RAMOutputStream();
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void flushTermsBlock(int start, int count) throws IOException {
|
||||||
|
|
||||||
|
if (count == 0) {
|
||||||
|
termsOut.writeByte((byte) 0);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
assert start <= pendingTerms.size();
|
||||||
|
assert count <= start;
|
||||||
|
|
||||||
|
final int limit = pendingTerms.size() - start + count;
|
||||||
|
|
||||||
|
long lastDocStartFP = 0;
|
||||||
|
long lastPosStartFP = 0;
|
||||||
|
long lastPayStartFP = 0;
|
||||||
|
for(int idx=limit-count; idx<limit; idx++) {
|
||||||
|
PendingTerm term = pendingTerms.get(idx);
|
||||||
|
|
||||||
|
bytesWriter.writeVLong(term.docStartFP - lastDocStartFP);
|
||||||
|
lastDocStartFP = term.docStartFP;
|
||||||
|
|
||||||
|
if (fieldHasPositions) {
|
||||||
|
bytesWriter.writeVLong(term.posStartFP - lastPosStartFP);
|
||||||
|
lastPosStartFP = term.posStartFP;
|
||||||
|
if (term.lastPosBlockOffset != -1) {
|
||||||
|
bytesWriter.writeVInt(term.lastPosBlockOffset);
|
||||||
|
}
|
||||||
|
if ((fieldHasPayloads || fieldHasOffsets) && term.payStartFP != -1) {
|
||||||
|
bytesWriter.writeVLong(term.payStartFP - lastPayStartFP);
|
||||||
|
lastPayStartFP = term.payStartFP;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (term.skipOffset != -1) {
|
||||||
|
bytesWriter.writeVInt(term.skipOffset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
termsOut.writeVInt((int) bytesWriter.getFilePointer());
|
||||||
|
bytesWriter.writeTo(termsOut);
|
||||||
|
bytesWriter.reset();
|
||||||
|
|
||||||
|
// Remove the terms we just wrote:
|
||||||
|
pendingTerms.subList(limit-count, limit).clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() throws IOException {
|
||||||
|
IOUtils.close(docOut, posOut, payOut);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,205 @@
|
||||||
|
package org.apache.lucene.codecs.block;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
import org.apache.lucene.codecs.MultiLevelSkipListReader;
|
||||||
|
import org.apache.lucene.store.IndexInput;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Implements the skip list reader for the block postings format
|
||||||
|
* that stores positions and payloads.
|
||||||
|
*
|
||||||
|
* @see BlockPostingsFormat
|
||||||
|
* @lucene.experimental
|
||||||
|
*/
|
||||||
|
final class BlockSkipReader extends MultiLevelSkipListReader {
|
||||||
|
private boolean DEBUG = BlockPostingsReader.DEBUG;
|
||||||
|
|
||||||
|
private long docPointer[];
|
||||||
|
private long posPointer[];
|
||||||
|
private long payPointer[];
|
||||||
|
private int posBufferUpto[];
|
||||||
|
private int endOffset[];
|
||||||
|
private int payloadByteUpto[];
|
||||||
|
|
||||||
|
private long lastPosPointer;
|
||||||
|
private long lastPayPointer;
|
||||||
|
private int lastEndOffset;
|
||||||
|
private int lastPayloadByteUpto;
|
||||||
|
private long lastDocPointer;
|
||||||
|
private int lastPosBufferUpto;
|
||||||
|
|
||||||
|
public BlockSkipReader(IndexInput skipStream, int maxSkipLevels, int skipInterval, boolean hasPos, boolean hasOffsets, boolean hasPayloads) {
|
||||||
|
super(skipStream, maxSkipLevels, skipInterval);
|
||||||
|
docPointer = new long[maxSkipLevels];
|
||||||
|
if (hasPos) {
|
||||||
|
posPointer = new long[maxSkipLevels];
|
||||||
|
posBufferUpto = new int[maxSkipLevels];
|
||||||
|
if (hasPayloads) {
|
||||||
|
payloadByteUpto = new int[maxSkipLevels];
|
||||||
|
} else {
|
||||||
|
payloadByteUpto = null;
|
||||||
|
}
|
||||||
|
if (hasOffsets) {
|
||||||
|
endOffset = new int[maxSkipLevels];
|
||||||
|
} else {
|
||||||
|
endOffset = null;
|
||||||
|
}
|
||||||
|
if (hasOffsets || hasPayloads) {
|
||||||
|
payPointer = new long[maxSkipLevels];
|
||||||
|
} else {
|
||||||
|
payPointer = null;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
posPointer = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void init(long skipPointer, long docBasePointer, long posBasePointer, long payBasePointer, int df) {
|
||||||
|
super.init(skipPointer, df);
|
||||||
|
lastDocPointer = docBasePointer;
|
||||||
|
lastPosPointer = posBasePointer;
|
||||||
|
lastPayPointer = payBasePointer;
|
||||||
|
|
||||||
|
Arrays.fill(docPointer, docBasePointer);
|
||||||
|
if (posPointer != null) {
|
||||||
|
Arrays.fill(posPointer, posBasePointer);
|
||||||
|
if (payPointer != null) {
|
||||||
|
Arrays.fill(payPointer, payBasePointer);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
assert posBasePointer == 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the doc pointer of the doc to which the last call of
|
||||||
|
* {@link MultiLevelSkipListReader#skipTo(int)} has skipped. */
|
||||||
|
public long getDocPointer() {
|
||||||
|
return lastDocPointer;
|
||||||
|
}
|
||||||
|
|
||||||
|
public long getPosPointer() {
|
||||||
|
return lastPosPointer;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getPosBufferUpto() {
|
||||||
|
return lastPosBufferUpto;
|
||||||
|
}
|
||||||
|
|
||||||
|
public long getPayPointer() {
|
||||||
|
return lastPayPointer;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getEndOffset() {
|
||||||
|
return lastEndOffset;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getPayloadByteUpto() {
|
||||||
|
return lastPayloadByteUpto;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void seekChild(int level) throws IOException {
|
||||||
|
super.seekChild(level);
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println("seekChild level=" + level);
|
||||||
|
}
|
||||||
|
docPointer[level] = lastDocPointer;
|
||||||
|
if (posPointer != null) {
|
||||||
|
posPointer[level] = lastPosPointer;
|
||||||
|
posBufferUpto[level] = lastPosBufferUpto;
|
||||||
|
if (endOffset != null) {
|
||||||
|
endOffset[level] = lastEndOffset;
|
||||||
|
}
|
||||||
|
if (payloadByteUpto != null) {
|
||||||
|
payloadByteUpto[level] = lastPayloadByteUpto;
|
||||||
|
}
|
||||||
|
if (payPointer != null) {
|
||||||
|
payPointer[level] = lastPayPointer;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void setLastSkipData(int level) {
|
||||||
|
super.setLastSkipData(level);
|
||||||
|
lastDocPointer = docPointer[level];
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println("setLastSkipData level=" + level);
|
||||||
|
System.out.println(" lastDocPointer=" + lastDocPointer);
|
||||||
|
}
|
||||||
|
if (posPointer != null) {
|
||||||
|
lastPosPointer = posPointer[level];
|
||||||
|
lastPosBufferUpto = posBufferUpto[level];
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println(" lastPosPointer=" + lastPosPointer + " lastPosBUfferUpto=" + lastPosBufferUpto);
|
||||||
|
}
|
||||||
|
if (payPointer != null) {
|
||||||
|
lastPayPointer = payPointer[level];
|
||||||
|
}
|
||||||
|
if (endOffset != null) {
|
||||||
|
lastEndOffset = endOffset[level];
|
||||||
|
}
|
||||||
|
if (payloadByteUpto != null) {
|
||||||
|
lastPayloadByteUpto = payloadByteUpto[level];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected int readSkipData(int level, IndexInput skipStream) throws IOException {
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println("readSkipData level=" + level);
|
||||||
|
}
|
||||||
|
int delta = skipStream.readVInt();
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println(" delta=" + delta);
|
||||||
|
}
|
||||||
|
docPointer[level] += skipStream.readVInt();
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println(" docFP=" + docPointer[level]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (posPointer != null) {
|
||||||
|
posPointer[level] += skipStream.readVInt();
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println(" posFP=" + posPointer[level]);
|
||||||
|
}
|
||||||
|
posBufferUpto[level] = skipStream.readVInt();
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println(" posBufferUpto=" + posBufferUpto[level]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (payloadByteUpto != null) {
|
||||||
|
payloadByteUpto[level] = skipStream.readVInt();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (endOffset != null) {
|
||||||
|
endOffset[level] += skipStream.readVInt();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (payPointer != null) {
|
||||||
|
payPointer[level] += skipStream.readVInt();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return delta;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,147 @@
|
||||||
|
package org.apache.lucene.codecs.block;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
import org.apache.lucene.store.IndexOutput;
|
||||||
|
import org.apache.lucene.codecs.MultiLevelSkipListWriter;
|
||||||
|
|
||||||
|
// nocommit do we need more frequent skips at level > 0?
|
||||||
|
// 128*128 is immense? may need to decouple
|
||||||
|
// baseSkipInterval & theRestSkipInterval?
|
||||||
|
|
||||||
|
final class BlockSkipWriter extends MultiLevelSkipListWriter {
|
||||||
|
private boolean DEBUG = BlockPostingsReader.DEBUG;
|
||||||
|
|
||||||
|
private int[] lastSkipDoc;
|
||||||
|
private long[] lastSkipDocPointer;
|
||||||
|
private long[] lastSkipPosPointer;
|
||||||
|
private long[] lastSkipPayPointer;
|
||||||
|
private int[] lastEndOffset;
|
||||||
|
private int[] lastPayloadByteUpto;
|
||||||
|
|
||||||
|
private final IndexOutput docOut;
|
||||||
|
private final IndexOutput posOut;
|
||||||
|
private final IndexOutput payOut;
|
||||||
|
|
||||||
|
private int curDoc;
|
||||||
|
private long curDocPointer;
|
||||||
|
private long curPosPointer;
|
||||||
|
private long curPayPointer;
|
||||||
|
private int curPosBufferUpto;
|
||||||
|
private int curEndOffset;
|
||||||
|
private int curPayloadByteUpto;
|
||||||
|
private boolean fieldHasPositions;
|
||||||
|
private boolean fieldHasOffsets;
|
||||||
|
private boolean fieldHasPayloads;
|
||||||
|
|
||||||
|
public BlockSkipWriter(int skipInterval, int maxSkipLevels, int docCount, IndexOutput docOut, IndexOutput posOut, IndexOutput payOut) {
|
||||||
|
super(skipInterval, maxSkipLevels, docCount);
|
||||||
|
this.docOut = docOut;
|
||||||
|
this.posOut = posOut;
|
||||||
|
this.payOut = payOut;
|
||||||
|
|
||||||
|
lastSkipDoc = new int[maxSkipLevels];
|
||||||
|
lastSkipDocPointer = new long[maxSkipLevels];
|
||||||
|
if (posOut != null) {
|
||||||
|
lastSkipPosPointer = new long[maxSkipLevels];
|
||||||
|
if (payOut != null) {
|
||||||
|
lastSkipPayPointer = new long[maxSkipLevels];
|
||||||
|
}
|
||||||
|
lastEndOffset = new int[maxSkipLevels];
|
||||||
|
lastPayloadByteUpto = new int[maxSkipLevels];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setField(boolean fieldHasPositions, boolean fieldHasOffsets, boolean fieldHasPayloads) {
|
||||||
|
this.fieldHasPositions = fieldHasPositions;
|
||||||
|
this.fieldHasOffsets = fieldHasOffsets;
|
||||||
|
this.fieldHasPayloads = fieldHasPayloads;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void resetSkip() {
|
||||||
|
super.resetSkip();
|
||||||
|
Arrays.fill(lastSkipDoc, 0);
|
||||||
|
Arrays.fill(lastSkipDocPointer, docOut.getFilePointer());
|
||||||
|
if (fieldHasPositions) {
|
||||||
|
Arrays.fill(lastSkipPosPointer, posOut.getFilePointer());
|
||||||
|
if (fieldHasOffsets) {
|
||||||
|
Arrays.fill(lastEndOffset, 0);
|
||||||
|
}
|
||||||
|
if (fieldHasPayloads) {
|
||||||
|
Arrays.fill(lastPayloadByteUpto, 0);
|
||||||
|
}
|
||||||
|
if (fieldHasOffsets || fieldHasPayloads) {
|
||||||
|
Arrays.fill(lastSkipPayPointer, payOut.getFilePointer());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the values for the current skip data.
|
||||||
|
*/
|
||||||
|
public void bufferSkip(int doc, int numDocs, long posFP, long payFP, int posBufferUpto, int endOffset, int payloadByteUpto) throws IOException {
|
||||||
|
this.curDoc = doc;
|
||||||
|
this.curDocPointer = docOut.getFilePointer();
|
||||||
|
this.curPosPointer = posFP;
|
||||||
|
this.curPayPointer = payFP;
|
||||||
|
this.curPosBufferUpto = posBufferUpto;
|
||||||
|
this.curPayloadByteUpto = payloadByteUpto;
|
||||||
|
this.curEndOffset = endOffset;
|
||||||
|
bufferSkip(numDocs);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void writeSkipData(int level, IndexOutput skipBuffer) throws IOException {
|
||||||
|
int delta = curDoc - lastSkipDoc[level];
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println("writeSkipData level=" + level + " lastDoc=" + curDoc + " delta=" + delta + " curDocPointer=" + curDocPointer);
|
||||||
|
}
|
||||||
|
skipBuffer.writeVInt(delta);
|
||||||
|
lastSkipDoc[level] = curDoc;
|
||||||
|
|
||||||
|
skipBuffer.writeVInt((int) (curDocPointer - lastSkipDocPointer[level]));
|
||||||
|
lastSkipDocPointer[level] = curDocPointer;
|
||||||
|
|
||||||
|
if (fieldHasPositions) {
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println(" curPosPointer=" + curPosPointer + " curPosBufferUpto=" + curPosBufferUpto);
|
||||||
|
}
|
||||||
|
skipBuffer.writeVInt((int) (curPosPointer - lastSkipPosPointer[level]));
|
||||||
|
lastSkipPosPointer[level] = curPosPointer;
|
||||||
|
skipBuffer.writeVInt(curPosBufferUpto);
|
||||||
|
|
||||||
|
if (fieldHasPayloads) {
|
||||||
|
skipBuffer.writeVInt(curPayloadByteUpto);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (fieldHasOffsets) {
|
||||||
|
skipBuffer.writeVInt(curEndOffset - lastEndOffset[level]);
|
||||||
|
lastEndOffset[level] = curEndOffset;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (fieldHasOffsets || fieldHasPayloads) {
|
||||||
|
skipBuffer.writeVInt((int) (curPayPointer - lastSkipPayPointer[level]));
|
||||||
|
lastSkipPayPointer[level] = curPayPointer;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -185,8 +185,6 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
|
||||||
int lastDocID;
|
int lastDocID;
|
||||||
int df;
|
int df;
|
||||||
|
|
||||||
/** Adds a new doc in this term. If this returns null
|
|
||||||
* then we just skip consuming positions/payloads. */
|
|
||||||
@Override
|
@Override
|
||||||
public void startDoc(int docID, int termDocFreq) throws IOException {
|
public void startDoc(int docID, int termDocFreq) throws IOException {
|
||||||
// if (DEBUG) System.out.println("SPW: startDoc seg=" + segment + " docID=" + docID + " tf=" + termDocFreq + " freqOut.fp=" + freqOut.getFilePointer());
|
// if (DEBUG) System.out.println("SPW: startDoc seg=" + segment + " docID=" + docID + " tf=" + termDocFreq + " freqOut.fp=" + freqOut.getFilePointer());
|
||||||
|
|
|
@ -17,8 +17,6 @@ package org.apache.lucene.codecs.pfor;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.nio.IntBuffer;
|
import java.nio.IntBuffer;
|
||||||
import java.nio.ByteBuffer;
|
|
||||||
import java.util.Arrays;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Encode all values in normal area with fixed bit width,
|
* Encode all values in normal area with fixed bit width,
|
||||||
|
@ -73,6 +71,9 @@ public class ForUtil {
|
||||||
// since this buffer is reused at upper level, rewind first
|
// since this buffer is reused at upper level, rewind first
|
||||||
intBuffer.rewind();
|
intBuffer.rewind();
|
||||||
|
|
||||||
|
// nocommit assert header isn't "malformed", ie besides
|
||||||
|
// numBytes / bit-width there is nothing else!
|
||||||
|
|
||||||
int numBits = ((header >> 8) & MASK[6]);
|
int numBits = ((header >> 8) & MASK[6]);
|
||||||
|
|
||||||
decompressCore(intBuffer, data, numBits);
|
decompressCore(intBuffer, data, numBits);
|
||||||
|
|
|
@ -34,6 +34,8 @@ import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||||
public class FieldInfos implements Iterable<FieldInfo> {
|
public class FieldInfos implements Iterable<FieldInfo> {
|
||||||
private final boolean hasFreq;
|
private final boolean hasFreq;
|
||||||
private final boolean hasProx;
|
private final boolean hasProx;
|
||||||
|
private final boolean hasPayloads;
|
||||||
|
private final boolean hasOffsets;
|
||||||
private final boolean hasVectors;
|
private final boolean hasVectors;
|
||||||
private final boolean hasNorms;
|
private final boolean hasNorms;
|
||||||
private final boolean hasDocValues;
|
private final boolean hasDocValues;
|
||||||
|
@ -45,6 +47,8 @@ public class FieldInfos implements Iterable<FieldInfo> {
|
||||||
public FieldInfos(FieldInfo[] infos) {
|
public FieldInfos(FieldInfo[] infos) {
|
||||||
boolean hasVectors = false;
|
boolean hasVectors = false;
|
||||||
boolean hasProx = false;
|
boolean hasProx = false;
|
||||||
|
boolean hasPayloads = false;
|
||||||
|
boolean hasOffsets = false;
|
||||||
boolean hasFreq = false;
|
boolean hasFreq = false;
|
||||||
boolean hasNorms = false;
|
boolean hasNorms = false;
|
||||||
boolean hasDocValues = false;
|
boolean hasDocValues = false;
|
||||||
|
@ -58,12 +62,16 @@ public class FieldInfos implements Iterable<FieldInfo> {
|
||||||
hasVectors |= info.hasVectors();
|
hasVectors |= info.hasVectors();
|
||||||
hasProx |= info.isIndexed() && info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
hasProx |= info.isIndexed() && info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||||
hasFreq |= info.isIndexed() && info.getIndexOptions() != IndexOptions.DOCS_ONLY;
|
hasFreq |= info.isIndexed() && info.getIndexOptions() != IndexOptions.DOCS_ONLY;
|
||||||
|
hasOffsets |= info.isIndexed() && info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||||
hasNorms |= info.hasNorms();
|
hasNorms |= info.hasNorms();
|
||||||
hasDocValues |= info.hasDocValues();
|
hasDocValues |= info.hasDocValues();
|
||||||
|
hasPayloads |= info.hasPayloads();
|
||||||
}
|
}
|
||||||
|
|
||||||
this.hasVectors = hasVectors;
|
this.hasVectors = hasVectors;
|
||||||
this.hasProx = hasProx;
|
this.hasProx = hasProx;
|
||||||
|
this.hasPayloads = hasPayloads;
|
||||||
|
this.hasOffsets = hasOffsets;
|
||||||
this.hasFreq = hasFreq;
|
this.hasFreq = hasFreq;
|
||||||
this.hasNorms = hasNorms;
|
this.hasNorms = hasNorms;
|
||||||
this.hasDocValues = hasDocValues;
|
this.hasDocValues = hasDocValues;
|
||||||
|
@ -80,6 +88,16 @@ public class FieldInfos implements Iterable<FieldInfo> {
|
||||||
return hasProx;
|
return hasProx;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Returns true if any fields have payloads */
|
||||||
|
public boolean hasPayloads() {
|
||||||
|
return hasPayloads;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns true if any fields have offsets */
|
||||||
|
public boolean hasOffsets() {
|
||||||
|
return hasOffsets;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return true if at least one field has any vectors
|
* @return true if at least one field has any vectors
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -20,3 +20,4 @@ org.apache.lucene.codecs.memory.MemoryPostingsFormat
|
||||||
org.apache.lucene.codecs.pfor.ForPostingsFormat
|
org.apache.lucene.codecs.pfor.ForPostingsFormat
|
||||||
org.apache.lucene.codecs.pfor.PForPostingsFormat
|
org.apache.lucene.codecs.pfor.PForPostingsFormat
|
||||||
org.apache.lucene.codecs.bulkvint.BulkVIntPostingsFormat
|
org.apache.lucene.codecs.bulkvint.BulkVIntPostingsFormat
|
||||||
|
org.apache.lucene.codecs.block.BlockPostingsFormat
|
||||||
|
|
|
@ -0,0 +1,866 @@
|
||||||
|
package org.apache.lucene.index;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.EnumSet;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.TreeMap;
|
||||||
|
|
||||||
|
import org.apache.lucene.codecs.Codec;
|
||||||
|
import org.apache.lucene.codecs.FieldsConsumer;
|
||||||
|
import org.apache.lucene.codecs.FieldsProducer;
|
||||||
|
import org.apache.lucene.codecs.PostingsConsumer;
|
||||||
|
import org.apache.lucene.codecs.TermStats;
|
||||||
|
import org.apache.lucene.codecs.TermsConsumer;
|
||||||
|
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.store.IOContext;
|
||||||
|
import org.apache.lucene.util.Bits;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.Constants;
|
||||||
|
import org.apache.lucene.util.FixedBitSet;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
import org.apache.lucene.util._TestUtil;
|
||||||
|
import org.junit.BeforeClass;
|
||||||
|
|
||||||
|
/* NOTE: This test focuses on the postings
|
||||||
|
* (docs/freqs/positions/payloads/offsets) impl, not the
|
||||||
|
* terms dict. The [stretch] goal is for this test to be
|
||||||
|
* so thorough in testing a new PostingsFormat that if this
|
||||||
|
* test passes, then all Lucene/Solr tests should also pass. Ie,
|
||||||
|
* if there is some bug in a given PostingsFormat that this
|
||||||
|
* test fails to catch then this test needs to be improved! */
|
||||||
|
|
||||||
|
// nocommit can we make it easy for testing to pair up a "random terms dict impl" with your postings base format...
|
||||||
|
|
||||||
|
// nocommit test when you reuse after skipping a term or two, eg the block reuse case
|
||||||
|
|
||||||
|
// nocommit hmm contract says .doc() can return NO_MORE_DOCS
|
||||||
|
// before nextDoc too...?
|
||||||
|
|
||||||
|
/* TODO
|
||||||
|
- threads
|
||||||
|
- assert doc=-1 before any nextDoc
|
||||||
|
- if a PF passes this test but fails other tests then this
|
||||||
|
test has a bug!!
|
||||||
|
- test tricky reuse cases, eg across fields
|
||||||
|
- verify you get null if you pass needFreq/needOffset but
|
||||||
|
they weren't indexed
|
||||||
|
*/
|
||||||
|
|
||||||
|
public class TestPostingsFormat extends LuceneTestCase {
|
||||||
|
|
||||||
|
private enum Option {
|
||||||
|
// Sometimes use .advance():
|
||||||
|
SKIPPING,
|
||||||
|
|
||||||
|
// Sometimes reuse the Docs/AndPositionsEnum across terms:
|
||||||
|
REUSE_ENUMS,
|
||||||
|
|
||||||
|
// Sometimes pass non-null live docs:
|
||||||
|
LIVE_DOCS,
|
||||||
|
|
||||||
|
// Sometimes seek to term using previously saved TermState:
|
||||||
|
TERM_STATE,
|
||||||
|
|
||||||
|
// Sometimes don't fully consume docs from the enum
|
||||||
|
PARTIAL_DOC_CONSUME,
|
||||||
|
|
||||||
|
// Sometimes don't fully consume positions at each doc
|
||||||
|
PARTIAL_POS_CONSUME,
|
||||||
|
|
||||||
|
// Sometimes check payloads
|
||||||
|
PAYLOADS,
|
||||||
|
|
||||||
|
// Test w/ multiple threads
|
||||||
|
THREADS};
|
||||||
|
|
||||||
|
private static class FieldAndTerm {
|
||||||
|
String field;
|
||||||
|
BytesRef term;
|
||||||
|
|
||||||
|
public FieldAndTerm(String field, BytesRef term) {
|
||||||
|
this.field = field;
|
||||||
|
this.term = BytesRef.deepCopyOf(term);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class Position {
|
||||||
|
int position;
|
||||||
|
byte[] payload;
|
||||||
|
int startOffset;
|
||||||
|
int endOffset;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class Posting implements Comparable<Posting>{
|
||||||
|
int docID;
|
||||||
|
List<Position> positions;
|
||||||
|
|
||||||
|
public int compareTo(Posting other) {
|
||||||
|
return docID - other.docID;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Holds all postings:
|
||||||
|
private static Map<String,Map<BytesRef,List<Posting>>> fields = new TreeMap<String,Map<BytesRef,List<Posting>>>();
|
||||||
|
|
||||||
|
// Holds only live doc postings:
|
||||||
|
private static Map<String,Map<BytesRef,List<Posting>>> fieldsLive = new TreeMap<String,Map<BytesRef,List<Posting>>>();
|
||||||
|
|
||||||
|
private static FieldInfos fieldInfos;
|
||||||
|
|
||||||
|
private static int maxDocID;
|
||||||
|
|
||||||
|
private static FixedBitSet globalLiveDocs;
|
||||||
|
|
||||||
|
private static List<FieldAndTerm> allTerms;
|
||||||
|
|
||||||
|
@BeforeClass
|
||||||
|
public static void createPostings() throws IOException {
|
||||||
|
|
||||||
|
final int numFields = _TestUtil.nextInt(random(), 1, 5);
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println("TEST: " + numFields + " fields");
|
||||||
|
}
|
||||||
|
|
||||||
|
FieldInfo[] fieldInfoArray = new FieldInfo[numFields];
|
||||||
|
int fieldUpto = 0;
|
||||||
|
int numMediumTerms = 0;
|
||||||
|
int numBigTerms = 0;
|
||||||
|
int numManyPositions = 0;
|
||||||
|
while (fieldUpto < numFields) {
|
||||||
|
String field = _TestUtil.randomSimpleString(random());
|
||||||
|
if (fields.containsKey(field)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean fieldHasPayloads = random().nextBoolean();
|
||||||
|
|
||||||
|
fieldInfoArray[fieldUpto] = new FieldInfo(field, true, fieldUpto, false, false, fieldHasPayloads,
|
||||||
|
IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS,
|
||||||
|
null, DocValues.Type.FIXED_INTS_8, null);
|
||||||
|
fieldUpto++;
|
||||||
|
|
||||||
|
Map<BytesRef,List<Posting>> postings = new TreeMap<BytesRef,List<Posting>>();
|
||||||
|
fields.put(field, postings);
|
||||||
|
Set<String> seenTerms = new HashSet<String>();
|
||||||
|
|
||||||
|
// nocommit
|
||||||
|
//final int numTerms = atLeast(10);
|
||||||
|
final int numTerms = 4;
|
||||||
|
for(int termUpto=0;termUpto<numTerms;termUpto++) {
|
||||||
|
String term = _TestUtil.randomSimpleString(random());
|
||||||
|
if (seenTerms.contains(term)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
seenTerms.add(term);
|
||||||
|
|
||||||
|
int numDocs;
|
||||||
|
if (random().nextInt(10) == 3 && numBigTerms < 3) {
|
||||||
|
// 10% of the time make a highish freq term:
|
||||||
|
numDocs = _TestUtil.nextInt(random(), 50000, 70000);
|
||||||
|
numBigTerms++;
|
||||||
|
term = "big_" + term;
|
||||||
|
} else if (random().nextInt(10) == 3 && numMediumTerms < 10) {
|
||||||
|
// 10% of the time make a medium freq term:
|
||||||
|
// nocommit not high enough to test level 1 skipping:
|
||||||
|
numDocs = atLeast(3000);
|
||||||
|
numMediumTerms++;
|
||||||
|
term = "medium_" + term;
|
||||||
|
} else {
|
||||||
|
// Low freq term:
|
||||||
|
numDocs = _TestUtil.nextInt(random(), 1, 40);
|
||||||
|
term = "low_" + term;
|
||||||
|
}
|
||||||
|
|
||||||
|
numDocs *= RANDOM_MULTIPLIER;
|
||||||
|
|
||||||
|
List<Posting> termPostings = new ArrayList<Posting>();
|
||||||
|
postings.put(new BytesRef(term), termPostings);
|
||||||
|
|
||||||
|
int docID = 0;
|
||||||
|
|
||||||
|
// TODO: more realistic to inversely tie this to numDocs:
|
||||||
|
int maxDocSpacing = _TestUtil.nextInt(random(), 1, 100);
|
||||||
|
|
||||||
|
// 10% of the time create big payloads:
|
||||||
|
int payloadSize;
|
||||||
|
if (!fieldHasPayloads) {
|
||||||
|
payloadSize = 0;
|
||||||
|
} else if (random().nextInt(10) == 7) {
|
||||||
|
payloadSize = random().nextInt(50);
|
||||||
|
} else {
|
||||||
|
payloadSize = random().nextInt(10);
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean fixedPayloads = random().nextBoolean();
|
||||||
|
|
||||||
|
for(int docUpto=0;docUpto<numDocs;docUpto++) {
|
||||||
|
if (docUpto == 0 && random().nextBoolean()) {
|
||||||
|
// Sometimes index docID = 0
|
||||||
|
} else if (maxDocSpacing == 1) {
|
||||||
|
docID++;
|
||||||
|
} else {
|
||||||
|
// nocommit: sometimes have a biggish gap here!
|
||||||
|
docID += _TestUtil.nextInt(random(), 1, maxDocSpacing);
|
||||||
|
}
|
||||||
|
|
||||||
|
Posting posting = new Posting();
|
||||||
|
posting.docID = docID;
|
||||||
|
maxDocID = Math.max(docID, maxDocID);
|
||||||
|
posting.positions = new ArrayList<Position>();
|
||||||
|
termPostings.add(posting);
|
||||||
|
|
||||||
|
int freq;
|
||||||
|
if (random().nextInt(30) == 17 && numManyPositions < 10) {
|
||||||
|
freq = _TestUtil.nextInt(random(), 1, 1000);
|
||||||
|
numManyPositions++;
|
||||||
|
} else {
|
||||||
|
freq = _TestUtil.nextInt(random(), 1, 20);
|
||||||
|
}
|
||||||
|
int pos = 0;
|
||||||
|
int offset = 0;
|
||||||
|
int posSpacing = _TestUtil.nextInt(random(), 1, 100);
|
||||||
|
for(int posUpto=0;posUpto<freq;posUpto++) {
|
||||||
|
if (posUpto == 0 && random().nextBoolean()) {
|
||||||
|
// Sometimes index pos = 0
|
||||||
|
} else if (posSpacing == 1) {
|
||||||
|
pos++;
|
||||||
|
} else {
|
||||||
|
pos += _TestUtil.nextInt(random(), 1, posSpacing);
|
||||||
|
}
|
||||||
|
|
||||||
|
Position position = new Position();
|
||||||
|
posting.positions.add(position);
|
||||||
|
position.position = pos;
|
||||||
|
if (payloadSize != 0) {
|
||||||
|
if (fixedPayloads) {
|
||||||
|
position.payload = new byte[payloadSize];
|
||||||
|
} else {
|
||||||
|
int thisPayloadSize = random().nextInt(payloadSize);
|
||||||
|
if (thisPayloadSize != 0) {
|
||||||
|
position.payload = new byte[thisPayloadSize];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (position.payload != null) {
|
||||||
|
random().nextBytes(position.payload);
|
||||||
|
}
|
||||||
|
|
||||||
|
position.startOffset = offset + random().nextInt(5);
|
||||||
|
position.endOffset = position.startOffset + random().nextInt(10);
|
||||||
|
offset = position.endOffset;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fieldInfos = new FieldInfos(fieldInfoArray);
|
||||||
|
|
||||||
|
globalLiveDocs = new FixedBitSet(1+maxDocID);
|
||||||
|
double liveRatio = random().nextDouble();
|
||||||
|
for(int i=0;i<1+maxDocID;i++) {
|
||||||
|
if (random().nextDouble() <= liveRatio) {
|
||||||
|
globalLiveDocs.set(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pre-filter postings by globalLiveDocs:
|
||||||
|
for(Map.Entry<String,Map<BytesRef,List<Posting>>> fieldEnt : fields.entrySet()) {
|
||||||
|
Map<BytesRef,List<Posting>> postingsLive = new TreeMap<BytesRef,List<Posting>>();
|
||||||
|
fieldsLive.put(fieldEnt.getKey(), postingsLive);
|
||||||
|
for(Map.Entry<BytesRef,List<Posting>> termEnt : fieldEnt.getValue().entrySet()) {
|
||||||
|
List<Posting> termPostingsLive = new ArrayList<Posting>();
|
||||||
|
postingsLive.put(termEnt.getKey(), termPostingsLive);
|
||||||
|
for(Posting posting : termEnt.getValue()) {
|
||||||
|
if (globalLiveDocs.get(posting.docID)) {
|
||||||
|
termPostingsLive.add(posting);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
allTerms = new ArrayList<FieldAndTerm>();
|
||||||
|
for(Map.Entry<String,Map<BytesRef,List<Posting>>> fieldEnt : fields.entrySet()) {
|
||||||
|
String field = fieldEnt.getKey();
|
||||||
|
for(Map.Entry<BytesRef,List<Posting>> termEnt : fieldEnt.getValue().entrySet()) {
|
||||||
|
allTerms.add(new FieldAndTerm(field, termEnt.getKey()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println("TEST: done init postings; maxDocID=" + maxDocID + "; " + allTerms.size() + " total terms, across " + fieldInfos.size() + " fields");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// nocommit maybe instead of @BeforeClass just make a single test run: build postings & index & test it?
|
||||||
|
|
||||||
|
private FieldInfos currentFieldInfos;
|
||||||
|
|
||||||
|
// maxAllowed = the "highest" we can index, but we will still
|
||||||
|
// randomly index at lower IndexOption
|
||||||
|
private FieldsProducer buildIndex(Directory dir, IndexOptions maxAllowed, boolean allowPayloads) throws IOException {
|
||||||
|
SegmentInfo segmentInfo = new SegmentInfo(dir, Constants.LUCENE_MAIN_VERSION, "_0", 1+maxDocID, false, Codec.getDefault(), null, null);
|
||||||
|
|
||||||
|
int maxIndexOption = Arrays.asList(IndexOptions.values()).indexOf(maxAllowed);
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println("\nTEST: now build index");
|
||||||
|
}
|
||||||
|
|
||||||
|
// nocommit use allowPayloads
|
||||||
|
|
||||||
|
FieldInfo[] newFieldInfoArray = new FieldInfo[fields.size()];
|
||||||
|
for(int fieldUpto=0;fieldUpto<fields.size();fieldUpto++) {
|
||||||
|
FieldInfo oldFieldInfo = fieldInfos.fieldInfo(fieldUpto);
|
||||||
|
|
||||||
|
// Randomly picked the IndexOptions to index this
|
||||||
|
// field with:
|
||||||
|
IndexOptions indexOptions = IndexOptions.values()[random().nextInt(1+maxIndexOption)];
|
||||||
|
boolean doPayloads = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 && allowPayloads;
|
||||||
|
|
||||||
|
newFieldInfoArray[fieldUpto] = new FieldInfo(oldFieldInfo.name,
|
||||||
|
true,
|
||||||
|
fieldUpto,
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
doPayloads,
|
||||||
|
indexOptions,
|
||||||
|
null,
|
||||||
|
DocValues.Type.FIXED_INTS_8,
|
||||||
|
null);
|
||||||
|
}
|
||||||
|
|
||||||
|
FieldInfos newFieldInfos = new FieldInfos(newFieldInfoArray);
|
||||||
|
|
||||||
|
SegmentWriteState writeState = new SegmentWriteState(null, dir,
|
||||||
|
segmentInfo, newFieldInfos,
|
||||||
|
32, null, IOContext.DEFAULT);
|
||||||
|
|
||||||
|
FieldsConsumer fieldsConsumer = Codec.getDefault().postingsFormat().fieldsConsumer(writeState);
|
||||||
|
|
||||||
|
for(Map.Entry<String,Map<BytesRef,List<Posting>>> fieldEnt : fields.entrySet()) {
|
||||||
|
String field = fieldEnt.getKey();
|
||||||
|
Map<BytesRef,List<Posting>> terms = fieldEnt.getValue();
|
||||||
|
|
||||||
|
FieldInfo fieldInfo = newFieldInfos.fieldInfo(field);
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println("field=" + field);
|
||||||
|
}
|
||||||
|
|
||||||
|
IndexOptions indexOptions = fieldInfo.getIndexOptions();
|
||||||
|
|
||||||
|
boolean doFreq = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
|
||||||
|
boolean doPos = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||||
|
boolean doPayloads = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 && allowPayloads;
|
||||||
|
|
||||||
|
TermsConsumer termsConsumer = fieldsConsumer.addField(fieldInfo);
|
||||||
|
long sumTotalTF = 0;
|
||||||
|
long sumDF = 0;
|
||||||
|
FixedBitSet seenDocs = new FixedBitSet(maxDocID+1);
|
||||||
|
for(Map.Entry<BytesRef,List<Posting>> termEnt : terms.entrySet()) {
|
||||||
|
BytesRef term = termEnt.getKey();
|
||||||
|
List<Posting> postings = termEnt.getValue();
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" term=" + field + ":" + term.utf8ToString() + " docFreq=" + postings.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
PostingsConsumer postingsConsumer = termsConsumer.startTerm(term);
|
||||||
|
long totalTF = 0;
|
||||||
|
int docCount = 0;
|
||||||
|
for(Posting posting : postings) {
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" " + docCount + ": docID=" + posting.docID + " freq=" + posting.positions.size());
|
||||||
|
}
|
||||||
|
postingsConsumer.startDoc(posting.docID, posting.positions.size());
|
||||||
|
seenDocs.set(posting.docID);
|
||||||
|
if (doPos) {
|
||||||
|
totalTF += posting.positions.size();
|
||||||
|
for(Position pos : posting.positions) {
|
||||||
|
if (VERBOSE) {
|
||||||
|
if (doPayloads) {
|
||||||
|
System.out.println(" pos=" + pos.position + " payload=" + (pos.payload == null ? "null" : pos.payload.length + " bytes"));
|
||||||
|
} else {
|
||||||
|
System.out.println(" pos=" + pos.position);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
postingsConsumer.addPosition(pos.position, (doPayloads && pos.payload != null) ? new BytesRef(pos.payload) : null, pos.startOffset, pos.endOffset);
|
||||||
|
}
|
||||||
|
} else if (doFreq) {
|
||||||
|
totalTF += posting.positions.size();
|
||||||
|
} else {
|
||||||
|
totalTF++;
|
||||||
|
}
|
||||||
|
docCount++;
|
||||||
|
}
|
||||||
|
termsConsumer.finishTerm(term, new TermStats(postings.size(), totalTF));
|
||||||
|
sumTotalTF += totalTF;
|
||||||
|
sumDF += postings.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
termsConsumer.finish(sumTotalTF, sumDF, seenDocs.cardinality());
|
||||||
|
}
|
||||||
|
|
||||||
|
fieldsConsumer.close();
|
||||||
|
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println("TEST: after indexing: files=");
|
||||||
|
for(String file : dir.listAll()) {
|
||||||
|
System.out.println(" " + file + ": " + dir.fileLength(file) + " bytes");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
currentFieldInfos = newFieldInfos;
|
||||||
|
|
||||||
|
SegmentReadState readState = new SegmentReadState(dir, segmentInfo, newFieldInfos, IOContext.DEFAULT, 1);
|
||||||
|
|
||||||
|
return Codec.getDefault().postingsFormat().fieldsProducer(readState);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class ThreadState {
|
||||||
|
// Only used with REUSE option:
|
||||||
|
public DocsEnum reuseDocsEnum;
|
||||||
|
public DocsAndPositionsEnum reuseDocsAndPositionsEnum;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void verifyEnum(ThreadState threadState,
|
||||||
|
String field,
|
||||||
|
BytesRef term,
|
||||||
|
TermsEnum termsEnum,
|
||||||
|
|
||||||
|
// Maximum options (docs/freqs/positions/offsets) to test:
|
||||||
|
IndexOptions maxIndexOptions,
|
||||||
|
|
||||||
|
EnumSet<Option> options) throws IOException {
|
||||||
|
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" verifyEnum: options=" + options + " maxIndexOptions=" + maxIndexOptions);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 50% of the time time pass liveDocs:
|
||||||
|
Bits liveDocs;
|
||||||
|
Map<String,Map<BytesRef,List<Posting>>> fieldsToUse;
|
||||||
|
if (options.contains(Option.LIVE_DOCS) && random().nextBoolean()) {
|
||||||
|
liveDocs = globalLiveDocs;
|
||||||
|
fieldsToUse = fieldsLive;
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" use liveDocs");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
liveDocs = null;
|
||||||
|
fieldsToUse = fields;
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" no liveDocs");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
FieldInfo fieldInfo = currentFieldInfos.fieldInfo(field);
|
||||||
|
|
||||||
|
assertEquals(fields.get(field).get(term).size(), termsEnum.docFreq());
|
||||||
|
|
||||||
|
// NOTE: can be empty list if we are using liveDocs:
|
||||||
|
List<Posting> expected = fieldsToUse.get(field).get(term);
|
||||||
|
|
||||||
|
boolean allowFreqs = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0 &&
|
||||||
|
maxIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
|
||||||
|
boolean doCheckFreqs = allowFreqs && random().nextInt(3) <= 2;
|
||||||
|
|
||||||
|
boolean allowPositions = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 &&
|
||||||
|
maxIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||||
|
boolean doCheckPositions = allowPositions && random().nextInt(3) <= 2;
|
||||||
|
|
||||||
|
boolean allowOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0 &&
|
||||||
|
maxIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||||
|
boolean doCheckOffsets = allowOffsets && random().nextInt(3) <= 2;
|
||||||
|
|
||||||
|
boolean doCheckPayloads = options.contains(Option.PAYLOADS) && allowPositions && fieldInfo.hasPayloads();
|
||||||
|
|
||||||
|
DocsEnum prevDocsEnum = null;
|
||||||
|
|
||||||
|
DocsEnum docsEnum;
|
||||||
|
DocsAndPositionsEnum docsAndPositionsEnum;
|
||||||
|
|
||||||
|
if (!doCheckPositions) {
|
||||||
|
if (allowPositions && random().nextInt(10) == 7) {
|
||||||
|
// 10% of the time, even though we will not check positions, pull a DocsAndPositions enum
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" get DocsAndPositionsEnum (but we won't check positions)");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (options.contains(Option.REUSE_ENUMS) && random().nextInt(10) < 9) {
|
||||||
|
prevDocsEnum = threadState.reuseDocsAndPositionsEnum;
|
||||||
|
}
|
||||||
|
|
||||||
|
threadState.reuseDocsAndPositionsEnum = termsEnum.docsAndPositions(liveDocs, (DocsAndPositionsEnum) prevDocsEnum, false);
|
||||||
|
docsEnum = threadState.reuseDocsAndPositionsEnum;
|
||||||
|
docsAndPositionsEnum = threadState.reuseDocsAndPositionsEnum;
|
||||||
|
} else {
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" get DocsEnum");
|
||||||
|
}
|
||||||
|
if (options.contains(Option.REUSE_ENUMS) && random().nextInt(10) < 9) {
|
||||||
|
prevDocsEnum = threadState.reuseDocsEnum;
|
||||||
|
}
|
||||||
|
threadState.reuseDocsEnum = termsEnum.docs(liveDocs, prevDocsEnum, doCheckFreqs);
|
||||||
|
docsEnum = threadState.reuseDocsEnum;
|
||||||
|
docsAndPositionsEnum = null;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" get DocsAndPositionsEnum");
|
||||||
|
}
|
||||||
|
if (options.contains(Option.REUSE_ENUMS) && random().nextInt(10) < 9) {
|
||||||
|
prevDocsEnum = threadState.reuseDocsAndPositionsEnum;
|
||||||
|
}
|
||||||
|
threadState.reuseDocsAndPositionsEnum = termsEnum.docsAndPositions(liveDocs, (DocsAndPositionsEnum) prevDocsEnum, doCheckOffsets);
|
||||||
|
docsEnum = threadState.reuseDocsAndPositionsEnum;
|
||||||
|
docsAndPositionsEnum = threadState.reuseDocsAndPositionsEnum;
|
||||||
|
}
|
||||||
|
|
||||||
|
assertNotNull(docsEnum);
|
||||||
|
int initialDocID = docsEnum.docID();
|
||||||
|
assertTrue(initialDocID == -1 || initialDocID == DocsEnum.NO_MORE_DOCS);
|
||||||
|
|
||||||
|
if (VERBOSE) {
|
||||||
|
if (prevDocsEnum == null) {
|
||||||
|
System.out.println(" got enum=" + docsEnum);
|
||||||
|
} else if (prevDocsEnum == docsEnum) {
|
||||||
|
System.out.println(" got reuse enum=" + docsEnum);
|
||||||
|
} else {
|
||||||
|
System.out.println(" got enum=" + docsEnum + " (reuse of " + prevDocsEnum + " failed)");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 10% of the time don't consume all docs:
|
||||||
|
int stopAt;
|
||||||
|
if (options.contains(Option.PARTIAL_DOC_CONSUME) && expected.size() > 1 && random().nextInt(10) == 7) {
|
||||||
|
stopAt = random().nextInt(expected.size()-1);
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" will not consume all docs (" + stopAt + " vs " + expected.size() + ")");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
stopAt = expected.size();
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" consume all docs");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
double skipChance = random().nextDouble();
|
||||||
|
int numSkips = expected.size() < 3 ? 1 : _TestUtil.nextInt(random(), 1, Math.min(20, expected.size()/3));
|
||||||
|
int skipInc = expected.size()/numSkips;
|
||||||
|
int skipDocInc = (1+maxDocID)/numSkips;
|
||||||
|
|
||||||
|
// Sometimes do 100% skipping:
|
||||||
|
boolean doAllSkipping = options.contains(Option.SKIPPING) && random().nextInt(7) == 1;
|
||||||
|
|
||||||
|
double freqAskChance = random().nextDouble();
|
||||||
|
double payloadCheckChance = random().nextDouble();
|
||||||
|
double offsetCheckChance = random().nextDouble();
|
||||||
|
|
||||||
|
if (VERBOSE) {
|
||||||
|
if (options.contains(Option.SKIPPING)) {
|
||||||
|
System.out.println(" skipChance=" + skipChance + " numSkips=" + numSkips);
|
||||||
|
} else {
|
||||||
|
System.out.println(" no skipping");
|
||||||
|
}
|
||||||
|
if (doCheckFreqs) {
|
||||||
|
System.out.println(" freqAskChance=" + freqAskChance);
|
||||||
|
}
|
||||||
|
if (doCheckPayloads) {
|
||||||
|
System.out.println(" payloadCheckChance=" + payloadCheckChance);
|
||||||
|
}
|
||||||
|
if (doCheckOffsets) {
|
||||||
|
System.out.println(" offsetCheckChance=" + offsetCheckChance);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int nextPosting = 0;
|
||||||
|
while (nextPosting <= stopAt) {
|
||||||
|
if (nextPosting == stopAt) {
|
||||||
|
if (stopAt == expected.size()) {
|
||||||
|
assertEquals(DocsEnum.NO_MORE_DOCS, docsEnum.nextDoc());
|
||||||
|
|
||||||
|
// Common bug is to forget to set this.doc=NO_MORE_DOCS in the enum!:
|
||||||
|
assertEquals(DocsEnum.NO_MORE_DOCS, docsEnum.docID());
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
Posting posting;
|
||||||
|
if (options.contains(Option.SKIPPING) && (doAllSkipping || random().nextDouble() <= skipChance)) {
|
||||||
|
int targetDocID = -1;
|
||||||
|
if (nextPosting < stopAt && random().nextBoolean()) {
|
||||||
|
// Pick target we know exists:
|
||||||
|
nextPosting = _TestUtil.nextInt(random(), nextPosting, nextPosting+skipInc);
|
||||||
|
} else {
|
||||||
|
// Pick random target (might not exist):
|
||||||
|
Posting target = new Posting();
|
||||||
|
target.docID = _TestUtil.nextInt(random(), expected.get(nextPosting).docID, expected.get(nextPosting).docID+skipDocInc);
|
||||||
|
targetDocID = target.docID;
|
||||||
|
int loc = Collections.binarySearch(expected.subList(nextPosting, expected.size()), target);
|
||||||
|
if (loc < 0) {
|
||||||
|
loc = -loc-1;
|
||||||
|
}
|
||||||
|
nextPosting = nextPosting + loc;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (nextPosting >= stopAt) {
|
||||||
|
int target = random().nextBoolean() ? (maxDocID+1) : DocsEnum.NO_MORE_DOCS;
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" now advance to end (target=" + target + ")");
|
||||||
|
}
|
||||||
|
assertEquals(DocsEnum.NO_MORE_DOCS, docsEnum.advance(target));
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
posting = expected.get(nextPosting++);
|
||||||
|
if (VERBOSE) {
|
||||||
|
if (targetDocID != -1) {
|
||||||
|
System.out.println(" now advance to random target=" + targetDocID + " (" + nextPosting + " of " + stopAt + ")");
|
||||||
|
} else {
|
||||||
|
System.out.println(" now advance to known-exists target=" + posting.docID + " (" + nextPosting + " of " + stopAt + ")");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int docID = docsEnum.advance(targetDocID != -1 ? targetDocID : posting.docID);
|
||||||
|
assertEquals(posting.docID, docID);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
posting = expected.get(nextPosting++);
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" now nextDoc to " + posting.docID + " (" + nextPosting + " of " + stopAt + ")");
|
||||||
|
}
|
||||||
|
int docID = docsEnum.nextDoc();
|
||||||
|
assertEquals(posting.docID, docID);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (doCheckFreqs && random().nextDouble() <= freqAskChance) {
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" now freq()=" + posting.positions.size());
|
||||||
|
}
|
||||||
|
int freq = docsEnum.freq();
|
||||||
|
assertEquals(posting.positions.size(), freq);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (doCheckPositions) {
|
||||||
|
int freq = docsEnum.freq();
|
||||||
|
int numPosToConsume;
|
||||||
|
if (options.contains(Option.PARTIAL_POS_CONSUME) && random().nextInt(5) == 1) {
|
||||||
|
numPosToConsume = random().nextInt(freq);
|
||||||
|
} else {
|
||||||
|
numPosToConsume = freq;
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int i=0;i<numPosToConsume;i++) {
|
||||||
|
Position position = posting.positions.get(i);
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" now nextPosition to " + position.position);
|
||||||
|
}
|
||||||
|
assertEquals(position.position, docsAndPositionsEnum.nextPosition());
|
||||||
|
|
||||||
|
// nocommit sometimes don't pull the payload even
|
||||||
|
// though we pulled the position
|
||||||
|
|
||||||
|
if (doCheckPayloads) {
|
||||||
|
if (random().nextDouble() <= payloadCheckChance) {
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" now check payload length=" + (position.payload == null ? 0 : position.payload.length));
|
||||||
|
}
|
||||||
|
if (position.payload == null || position.payload.length == 0) {
|
||||||
|
assertFalse(docsAndPositionsEnum.hasPayload());
|
||||||
|
} else {
|
||||||
|
assertTrue(docsAndPositionsEnum.hasPayload());
|
||||||
|
|
||||||
|
BytesRef payload = docsAndPositionsEnum.getPayload();
|
||||||
|
assertFalse(docsAndPositionsEnum.hasPayload());
|
||||||
|
|
||||||
|
assertNotNull(payload);
|
||||||
|
assertEquals(position.payload.length, payload.length);
|
||||||
|
for(int byteUpto=0;byteUpto<position.payload.length;byteUpto++) {
|
||||||
|
assertEquals(position.payload[byteUpto],
|
||||||
|
payload.bytes[payload.offset+byteUpto]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" skip check payload length=" + (position.payload == null ? 0 : position.payload.length));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (doCheckOffsets) {
|
||||||
|
if (random().nextDouble() <= offsetCheckChance) {
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" now check offsets: startOff=" + position.startOffset + " endOffset=" + position.endOffset);
|
||||||
|
}
|
||||||
|
assertEquals(position.startOffset, docsAndPositionsEnum.startOffset());
|
||||||
|
assertEquals(position.endOffset, docsAndPositionsEnum.endOffset());
|
||||||
|
} else {
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" skip check offsets");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
assertEquals(-1, docsAndPositionsEnum.startOffset());
|
||||||
|
assertEquals(-1, docsAndPositionsEnum.endOffset());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void testTerms(final Fields fieldsSource, final EnumSet<Option> options, final IndexOptions maxIndexOptions) throws Exception {
|
||||||
|
|
||||||
|
if (options.contains(Option.THREADS)) {
|
||||||
|
int numThreads = _TestUtil.nextInt(random(), 2, 5);
|
||||||
|
Thread[] threads = new Thread[numThreads];
|
||||||
|
for(int threadUpto=0;threadUpto<numThreads;threadUpto++) {
|
||||||
|
threads[threadUpto] = new Thread() {
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
try {
|
||||||
|
testTermsOneThread(fieldsSource, options, maxIndexOptions);
|
||||||
|
} catch (Throwable t) {
|
||||||
|
throw new RuntimeException(t);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
threads[threadUpto].start();
|
||||||
|
}
|
||||||
|
for(int threadUpto=0;threadUpto<numThreads;threadUpto++) {
|
||||||
|
threads[threadUpto].join();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
testTermsOneThread(fieldsSource, options, maxIndexOptions);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void testTermsOneThread(Fields fieldsSource, EnumSet<Option> options, IndexOptions maxIndexOptions) throws IOException {
|
||||||
|
|
||||||
|
ThreadState threadState = new ThreadState();
|
||||||
|
|
||||||
|
// Test random terms/fields:
|
||||||
|
List<TermState> termStates = new ArrayList<TermState>();
|
||||||
|
List<FieldAndTerm> termStateTerms = new ArrayList<FieldAndTerm>();
|
||||||
|
|
||||||
|
Collections.shuffle(allTerms, random());
|
||||||
|
int upto = 0;
|
||||||
|
while (upto < allTerms.size()) {
|
||||||
|
|
||||||
|
boolean useTermState = termStates.size() != 0 && random().nextInt(5) == 1;
|
||||||
|
FieldAndTerm fieldAndTerm;
|
||||||
|
TermsEnum termsEnum;
|
||||||
|
|
||||||
|
TermState termState = null;
|
||||||
|
|
||||||
|
if (!useTermState) {
|
||||||
|
// Seek by random field+term:
|
||||||
|
fieldAndTerm = allTerms.get(upto++);
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println("\nTEST: seek to term=" + fieldAndTerm.field + ":" + fieldAndTerm.term.utf8ToString() );
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Seek by previous saved TermState
|
||||||
|
int idx = random().nextInt(termStates.size());
|
||||||
|
fieldAndTerm = termStateTerms.get(idx);
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println("\nTEST: seek using TermState to term=" + fieldAndTerm.field + ":" + fieldAndTerm.term.utf8ToString());
|
||||||
|
}
|
||||||
|
termState = termStates.get(idx);
|
||||||
|
}
|
||||||
|
|
||||||
|
Terms terms = fieldsSource.terms(fieldAndTerm.field);
|
||||||
|
assertNotNull(terms);
|
||||||
|
termsEnum = terms.iterator(null);
|
||||||
|
|
||||||
|
if (!useTermState) {
|
||||||
|
assertTrue(termsEnum.seekExact(fieldAndTerm.term, true));
|
||||||
|
} else {
|
||||||
|
termsEnum.seekExact(fieldAndTerm.term, termState);
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean savedTermState = false;
|
||||||
|
|
||||||
|
if (options.contains(Option.TERM_STATE) && !useTermState && random().nextInt(5) == 1) {
|
||||||
|
// Save away this TermState:
|
||||||
|
termStates.add(termsEnum.termState());
|
||||||
|
termStateTerms.add(fieldAndTerm);
|
||||||
|
savedTermState = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
verifyEnum(threadState,
|
||||||
|
fieldAndTerm.field,
|
||||||
|
fieldAndTerm.term,
|
||||||
|
termsEnum,
|
||||||
|
maxIndexOptions,
|
||||||
|
options);
|
||||||
|
|
||||||
|
// Sometimes save term state after pulling the enum:
|
||||||
|
if (options.contains(Option.TERM_STATE) && !useTermState && !savedTermState && random().nextInt(5) == 1) {
|
||||||
|
// Save away this TermState:
|
||||||
|
termStates.add(termsEnum.termState());
|
||||||
|
termStateTerms.add(fieldAndTerm);
|
||||||
|
useTermState = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 10% of the time make sure you can pull another enum
|
||||||
|
// from the same term:
|
||||||
|
if (random().nextInt(10) == 7) {
|
||||||
|
// Try same term again
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println("TEST: try enum again on same term");
|
||||||
|
}
|
||||||
|
|
||||||
|
verifyEnum(threadState,
|
||||||
|
fieldAndTerm.field,
|
||||||
|
fieldAndTerm.term,
|
||||||
|
termsEnum,
|
||||||
|
maxIndexOptions,
|
||||||
|
options);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void test() throws Exception {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
|
||||||
|
boolean indexPayloads = random().nextBoolean();
|
||||||
|
// nocommit test thread safety of buildIndex too
|
||||||
|
FieldsProducer fieldsProducer = buildIndex(dir, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, indexPayloads);
|
||||||
|
|
||||||
|
//testTerms(fieldsProducer, EnumSet.noneOf(Option.class), IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
|
||||||
|
//testTerms(fieldsProducer, EnumSet.of(Option.LIVE_DOCS), IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
|
||||||
|
//testTerms(fieldsProducer, EnumSet.of(Option.TERM_STATE, Option.LIVE_DOCS, Option.PARTIAL_DOC_CONSUME, Option.PARTIAL_POS_CONSUME), IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
|
||||||
|
|
||||||
|
//testTerms(fieldsProducer, EnumSet.of(Option.SKIPPING), IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
|
||||||
|
//testTerms(fieldsProducer, EnumSet.of(Option.THREADS, Option.TERM_STATE, Option.SKIPPING, Option.PARTIAL_DOC_CONSUME, Option.PARTIAL_POS_CONSUME), IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
|
||||||
|
//testTerms(fieldsProducer, EnumSet.of(Option.TERM_STATE, Option.SKIPPING, Option.PARTIAL_DOC_CONSUME, Option.PARTIAL_POS_CONSUME), IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||||
|
testTerms(fieldsProducer, EnumSet.of(Option.TERM_STATE, Option.PAYLOADS, Option.PARTIAL_DOC_CONSUME, Option.PARTIAL_POS_CONSUME, Option.SKIPPING), IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||||
|
|
||||||
|
fieldsProducer.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// nocommit test that start/endOffset return -1 if field has
|
||||||
|
// no offsets
|
Loading…
Reference in New Issue