Mirror of https://github.com/apache/lucene.git (synced 2025-02-16 23:15:46 +00:00)
LUCENE-3892: replace Block with BlockPacked
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/pforcodec_3892@1371519 13f79535-47bb-0310-9956-ffa450edef68
parent 90918ad2d6
commit 4a5496ac97
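At a glance: the hand-rolled For compression (the IntBuffer-based compress/decompress plus the generated PackedIntsDecompress) is replaced by a ForUtil built on org.apache.lucene.util.packed.PackedInts, and call sites move from the static readBlock/writeBlock/skipBlock helpers to instance methods on ForUtil. Scratch arrays grow from BLOCK_SIZE ints to ForUtil.MAX_DATA_SIZE ints and from BLOCK_SIZE*4 bytes to ForUtil.MAX_ENCODED_SIZE bytes. A minimal round-trip sketch using only the calls introduced in this patch (the helper method and variable names are illustrative, not part of the change):

    void roundTripOneBlock(ForUtil forUtil, IndexOutput docOut, IndexInput docIn) throws IOException {
      int[] buffer = new int[ForUtil.MAX_DATA_SIZE];        // was new int[BLOCK_SIZE]
      byte[] encoded = new byte[ForUtil.MAX_ENCODED_SIZE];  // was new byte[BLOCK_SIZE*4]
      for (int i = 0; i < BlockPostingsFormat.BLOCK_SIZE; i++) {
        buffer[i] = i;                                      // any non-negative values
      }
      forUtil.writeBlock(buffer, encoded, docOut);          // replaces writeBlock(buffer, docOut)
      forUtil.readBlock(docIn, encoded, buffer);            // replaces readBlock(docIn, encoded, encodedBuffer, buffer)
    }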
@ -43,7 +43,8 @@ public final class BlockPostingsFormat extends PostingsFormat {
  private final int maxTermBlockSize;

  // nocommit is this right?:
  // NOTE: must be factor of .... 32?
  // NOTE: should be at least 64 because of PackedInts long-aligned encoding/decoding
  // NOTE: must be factor of ... 64?
  public final static int BLOCK_SIZE = 128;

  public BlockPostingsFormat() {
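On the notes above: PackedInts packs values into 64-bit blocks, so keeping BLOCK_SIZE a multiple of 64 keeps every encoded block long-aligned; 128 satisfies this. Quick sanity check (illustrative, not part of the patch):

    // 128 values at b bits per value occupy 128*b bits = 2*b longs, i.e. a whole number of longs
    assert BlockPostingsFormat.BLOCK_SIZE % 64 == 0;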
@ -17,9 +17,11 @@ package org.apache.lucene.codecs.block;
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import static org.apache.lucene.codecs.block.BlockPostingsFormat.BLOCK_SIZE;
|
||||
import static org.apache.lucene.codecs.block.ForUtil.MAX_DATA_SIZE;
|
||||
import static org.apache.lucene.codecs.block.ForUtil.MAX_ENCODED_SIZE;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.codecs.BlockTermState;
|
||||
@ -27,8 +29,8 @@ import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.PostingsReaderBase;
|
||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
import org.apache.lucene.index.FieldInfos;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentInfo;
|
||||
@ -43,8 +45,6 @@ import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
import static org.apache.lucene.codecs.blockpacked.BlockPackedPostingsFormat.BLOCK_SIZE;
|
||||
|
||||
/**
|
||||
* Concrete class that reads docId(maybe frq,pos,offset,payloads) list
|
||||
* with postings format.
|
||||
@ -58,6 +58,8 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
||||
private final IndexInput posIn;
|
||||
private final IndexInput payIn;
|
||||
|
||||
private final ForUtil forUtil;
|
||||
|
||||
public static boolean DEBUG = false;
|
||||
|
||||
// nocommit
|
||||
@ -76,6 +78,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
||||
BlockPostingsWriter.DOC_CODEC,
|
||||
BlockPostingsWriter.VERSION_START,
|
||||
BlockPostingsWriter.VERSION_START);
|
||||
forUtil = new ForUtil(docIn);
|
||||
|
||||
if (fieldInfos.hasProx()) {
|
||||
posIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, BlockPostingsFormat.POS_EXTENSION),
|
||||
@ -119,7 +122,11 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
||||
}
|
||||
}
|
||||
|
||||
static void readVIntBlock(IndexInput docIn, int[] docBuffer, int[] freqBuffer, int num, boolean indexHasFreq) throws IOException {
|
||||
/**
|
||||
* Read values that have been written using variable-length encoding instead of bit-packing.
|
||||
*/
|
||||
private static void readVIntBlock(IndexInput docIn, int[] docBuffer,
|
||||
int[] freqBuffer, int num, boolean indexHasFreq) throws IOException {
|
||||
if (indexHasFreq) {
|
||||
for(int i=0;i<num;i++) {
|
||||
final int code = docIn.readVInt();
|
||||
@ -137,17 +144,6 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
||||
}
|
||||
}
|
||||
|
||||
static void readBlock(IndexInput in, byte[] encoded, IntBuffer encodedBuffer, int[] buffer) throws IOException {
|
||||
int header = in.readVInt();
|
||||
in.readBytes(encoded, 0, ForUtil.getEncodedSize(header));
|
||||
ForUtil.decompress(encodedBuffer, buffer, header);
|
||||
}
|
||||
|
||||
static void skipBlock(IndexInput in) throws IOException {
|
||||
int header = in.readVInt();
|
||||
in.seek(in.getFilePointer() + ForUtil.getEncodedSize(header));
|
||||
}
|
||||
|
||||
// Must keep final because we do non-standard clone
|
||||
private final static class IntBlockTermState extends BlockTermState {
|
||||
long docStartFP;
|
||||
@ -323,10 +319,9 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
||||
|
||||
final class BlockDocsEnum extends DocsEnum {
|
||||
private final byte[] encoded;
|
||||
private final IntBuffer encodedBuffer;
|
||||
|
||||
private final int[] docDeltaBuffer = new int[BLOCK_SIZE];
|
||||
private final int[] freqBuffer = new int[BLOCK_SIZE];
|
||||
private final int[] docDeltaBuffer = new int[MAX_DATA_SIZE];
|
||||
private final int[] freqBuffer = new int[MAX_DATA_SIZE];
|
||||
|
||||
private int docBufferUpto;
|
||||
|
||||
@ -368,8 +363,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
||||
indexHasPos = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||
indexHasPayloads = fieldInfo.hasPayloads();
|
||||
encoded = new byte[BLOCK_SIZE*4];
|
||||
encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer();
|
||||
encoded = new byte[MAX_ENCODED_SIZE];
|
||||
}
|
||||
|
||||
public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) {
|
||||
@ -419,14 +413,16 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
||||
if (DEBUG) {
|
||||
System.out.println(" fill doc block from fp=" + docIn.getFilePointer());
|
||||
}
|
||||
readBlock(docIn, encoded, encodedBuffer, docDeltaBuffer);
|
||||
forUtil.readBlock(docIn, encoded, docDeltaBuffer);
|
||||
|
||||
if (indexHasFreq) {
|
||||
if (DEBUG) {
|
||||
System.out.println(" fill freq block from fp=" + docIn.getFilePointer());
|
||||
}
|
||||
readBlock(docIn, encoded, encodedBuffer, freqBuffer);
|
||||
forUtil.readBlock(docIn, encoded, freqBuffer);
|
||||
}
|
||||
} else {
|
||||
// Read vInts:
|
||||
if (DEBUG) {
|
||||
System.out.println(" fill last vInt block from fp=" + docIn.getFilePointer());
|
||||
}
|
||||
@ -444,6 +440,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
||||
if (DEBUG) {
|
||||
System.out.println(" docUpto=" + docUpto + " (of df=" + docFreq + ") docBufferUpto=" + docBufferUpto);
|
||||
}
|
||||
|
||||
if (docUpto == docFreq) {
|
||||
if (DEBUG) {
|
||||
System.out.println(" return doc=END");
|
||||
@ -453,6 +450,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
||||
if (docBufferUpto == BLOCK_SIZE) {
|
||||
refillDocs();
|
||||
}
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println(" accum=" + accum + " docDeltaBuffer[" + docBufferUpto + "]=" + docDeltaBuffer[docBufferUpto]);
|
||||
}
|
||||
@ -571,11 +569,10 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
||||
final class BlockDocsAndPositionsEnum extends DocsAndPositionsEnum {
|
||||
|
||||
private final byte[] encoded;
|
||||
private final IntBuffer encodedBuffer;
|
||||
|
||||
private final int[] docDeltaBuffer = new int[BLOCK_SIZE];
|
||||
private final int[] freqBuffer = new int[BLOCK_SIZE];
|
||||
private final int[] posDeltaBuffer = new int[BLOCK_SIZE];
|
||||
private final int[] docDeltaBuffer = new int[MAX_DATA_SIZE];
|
||||
private final int[] freqBuffer = new int[MAX_DATA_SIZE];
|
||||
private final int[] posDeltaBuffer = new int[MAX_DATA_SIZE];
|
||||
|
||||
private int docBufferUpto;
|
||||
private int posBufferUpto;
|
||||
@ -634,8 +631,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
||||
this.startDocIn = BlockPostingsReader.this.docIn;
|
||||
this.docIn = (IndexInput) startDocIn.clone();
|
||||
this.posIn = (IndexInput) BlockPostingsReader.this.posIn.clone();
|
||||
encoded = new byte[BLOCK_SIZE*4];
|
||||
encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer();
|
||||
encoded = new byte[MAX_ENCODED_SIZE];
|
||||
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||
indexHasPayloads = fieldInfo.hasPayloads();
|
||||
}
|
||||
@ -694,12 +690,13 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
||||
if (DEBUG) {
|
||||
System.out.println(" fill doc block from fp=" + docIn.getFilePointer());
|
||||
}
|
||||
readBlock(docIn, encoded, encodedBuffer, docDeltaBuffer);
|
||||
forUtil.readBlock(docIn, encoded, docDeltaBuffer);
|
||||
if (DEBUG) {
|
||||
System.out.println(" fill freq block from fp=" + docIn.getFilePointer());
|
||||
}
|
||||
readBlock(docIn, encoded, encodedBuffer, freqBuffer);
|
||||
forUtil.readBlock(docIn, encoded, freqBuffer);
|
||||
} else {
|
||||
// Read vInts:
|
||||
if (DEBUG) {
|
||||
System.out.println(" fill last vInt doc block from fp=" + docIn.getFilePointer());
|
||||
}
|
||||
@ -740,7 +737,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
||||
if (DEBUG) {
|
||||
System.out.println(" bulk pos block @ fp=" + posIn.getFilePointer());
|
||||
}
|
||||
readBlock(posIn, encoded, encodedBuffer, posDeltaBuffer);
|
||||
forUtil.readBlock(posIn, encoded, posDeltaBuffer);
|
||||
}
|
||||
}
|
||||
|
||||
@ -905,7 +902,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
||||
System.out.println(" skip whole block @ fp=" + posIn.getFilePointer());
|
||||
}
|
||||
assert posIn.getFilePointer() != lastPosBlockFP;
|
||||
skipBlock(posIn);
|
||||
forUtil.skipBlock(posIn);
|
||||
toSkip -= BLOCK_SIZE;
|
||||
}
|
||||
refillPositions();
|
||||
@ -976,11 +973,10 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
||||
final class EverythingEnum extends DocsAndPositionsEnum {
|
||||
|
||||
private final byte[] encoded;
|
||||
private final IntBuffer encodedBuffer;
|
||||
|
||||
private final int[] docDeltaBuffer = new int[BLOCK_SIZE];
|
||||
private final int[] freqBuffer = new int[BLOCK_SIZE];
|
||||
private final int[] posDeltaBuffer = new int[BLOCK_SIZE];
|
||||
private final int[] docDeltaBuffer = new int[MAX_DATA_SIZE];
|
||||
private final int[] freqBuffer = new int[MAX_DATA_SIZE];
|
||||
private final int[] posDeltaBuffer = new int[MAX_DATA_SIZE];
|
||||
|
||||
private final int[] payloadLengthBuffer;
|
||||
private final int[] offsetStartDeltaBuffer;
|
||||
@ -1058,12 +1054,11 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
||||
this.docIn = (IndexInput) startDocIn.clone();
|
||||
this.posIn = (IndexInput) BlockPostingsReader.this.posIn.clone();
|
||||
this.payIn = (IndexInput) BlockPostingsReader.this.payIn.clone();
|
||||
encoded = new byte[BLOCK_SIZE*4];
|
||||
encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer();
|
||||
encoded = new byte[MAX_ENCODED_SIZE];
|
||||
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||
if (indexHasOffsets) {
|
||||
offsetStartDeltaBuffer = new int[BLOCK_SIZE];
|
||||
offsetLengthBuffer = new int[BLOCK_SIZE];
|
||||
offsetStartDeltaBuffer = new int[MAX_DATA_SIZE];
|
||||
offsetLengthBuffer = new int[MAX_DATA_SIZE];
|
||||
} else {
|
||||
offsetStartDeltaBuffer = null;
|
||||
offsetLengthBuffer = null;
|
||||
@ -1073,7 +1068,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
||||
|
||||
indexHasPayloads = fieldInfo.hasPayloads();
|
||||
if (indexHasPayloads) {
|
||||
payloadLengthBuffer = new int[BLOCK_SIZE];
|
||||
payloadLengthBuffer = new int[MAX_DATA_SIZE];
|
||||
payloadBytes = new byte[128];
|
||||
payload = new BytesRef();
|
||||
} else {
|
||||
@ -1138,11 +1133,11 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
||||
if (DEBUG) {
|
||||
System.out.println(" fill doc block from fp=" + docIn.getFilePointer());
|
||||
}
|
||||
readBlock(docIn, encoded, encodedBuffer, docDeltaBuffer);
|
||||
forUtil.readBlock(docIn, encoded, docDeltaBuffer);
|
||||
if (DEBUG) {
|
||||
System.out.println(" fill freq block from fp=" + docIn.getFilePointer());
|
||||
}
|
||||
readBlock(docIn, encoded, encodedBuffer, freqBuffer);
|
||||
forUtil.readBlock(docIn, encoded, freqBuffer);
|
||||
} else {
|
||||
if (DEBUG) {
|
||||
System.out.println(" fill last vInt doc block from fp=" + docIn.getFilePointer());
|
||||
@ -1202,13 +1197,13 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
||||
if (DEBUG) {
|
||||
System.out.println(" bulk pos block @ fp=" + posIn.getFilePointer());
|
||||
}
|
||||
readBlock(posIn, encoded, encodedBuffer, posDeltaBuffer);
|
||||
forUtil.readBlock(posIn, encoded, posDeltaBuffer);
|
||||
|
||||
if (indexHasPayloads) {
|
||||
if (DEBUG) {
|
||||
System.out.println(" bulk payload block @ pay.fp=" + payIn.getFilePointer());
|
||||
}
|
||||
readBlock(payIn, encoded, encodedBuffer, payloadLengthBuffer);
|
||||
forUtil.readBlock(payIn, encoded, payloadLengthBuffer);
|
||||
int numBytes = payIn.readVInt();
|
||||
if (DEBUG) {
|
||||
System.out.println(" " + numBytes + " payload bytes @ pay.fp=" + payIn.getFilePointer());
|
||||
@ -1224,8 +1219,8 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
||||
if (DEBUG) {
|
||||
System.out.println(" bulk offset block @ pay.fp=" + payIn.getFilePointer());
|
||||
}
|
||||
readBlock(payIn, encoded, encodedBuffer, offsetStartDeltaBuffer);
|
||||
readBlock(payIn, encoded, encodedBuffer, offsetLengthBuffer);
|
||||
forUtil.readBlock(payIn, encoded, offsetStartDeltaBuffer);
|
||||
forUtil.readBlock(payIn, encoded, offsetLengthBuffer);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1268,6 +1263,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
||||
lastStartOffset = 0;
|
||||
return doc;
|
||||
}
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println(" doc=" + accum + " is deleted; try next doc");
|
||||
}
|
||||
@ -1412,11 +1408,11 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
||||
System.out.println(" skip whole block @ fp=" + posIn.getFilePointer());
|
||||
}
|
||||
assert posIn.getFilePointer() != lastPosBlockFP;
|
||||
skipBlock(posIn);
|
||||
forUtil.skipBlock(posIn);
|
||||
|
||||
if (indexHasPayloads) {
|
||||
// Skip payloadLength block:
|
||||
skipBlock(payIn);
|
||||
forUtil.skipBlock(payIn);
|
||||
|
||||
// Skip payloadBytes block:
|
||||
int numBytes = payIn.readVInt();
|
||||
@ -1426,8 +1422,8 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
||||
if (indexHasOffsets) {
|
||||
// Must load offset blocks merely to sum
|
||||
// up into lastStartOffset:
|
||||
readBlock(payIn, encoded, encodedBuffer, offsetStartDeltaBuffer);
|
||||
readBlock(payIn, encoded, encodedBuffer, offsetLengthBuffer);
|
||||
forUtil.readBlock(payIn, encoded, offsetStartDeltaBuffer);
|
||||
forUtil.readBlock(payIn, encoded, offsetLengthBuffer);
|
||||
for(int i=0;i<BLOCK_SIZE;i++) {
|
||||
lastStartOffset += offsetStartDeltaBuffer[i] + offsetLengthBuffer[i];
|
||||
}
|
||||
|
@ -17,9 +17,12 @@ package org.apache.lucene.codecs.block;
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import static org.apache.lucene.codecs.block.BlockPostingsFormat.BLOCK_SIZE;
|
||||
import static org.apache.lucene.codecs.block.BlockPostingsReader.DEBUG;
|
||||
import static org.apache.lucene.codecs.block.ForUtil.MAX_DATA_SIZE;
|
||||
import static org.apache.lucene.codecs.block.ForUtil.MAX_ENCODED_SIZE;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@ -27,8 +30,8 @@ import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||
import org.apache.lucene.codecs.TermStats;
|
||||
import org.apache.lucene.index.CorruptIndexException;
|
||||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
@ -36,8 +39,8 @@ import org.apache.lucene.store.RAMOutputStream;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
import static org.apache.lucene.codecs.blockpacked.BlockPackedPostingsFormat.BLOCK_SIZE;
|
||||
|
||||
/**
|
||||
* Concrete class that writes docId(maybe frq,pos,offset,payloads) list
|
||||
@ -50,8 +53,6 @@ import static org.apache.lucene.codecs.blockpacked.BlockPackedPostingsFormat.BLO
|
||||
*/
|
||||
public final class BlockPostingsWriter extends PostingsWriterBase {
|
||||
|
||||
private boolean DEBUG = BlockPostingsReader.DEBUG;
|
||||
|
||||
// nocommit move these constants to the PF:
|
||||
|
||||
static final int maxSkipLevels = 10;
|
||||
@ -108,11 +109,11 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
|
||||
private int docCount;
|
||||
|
||||
final byte[] encoded;
|
||||
final IntBuffer encodedBuffer;
|
||||
|
||||
private final ForUtil forUtil;
|
||||
private final BlockSkipWriter skipWriter;
|
||||
|
||||
public BlockPostingsWriter(SegmentWriteState state) throws IOException {
|
||||
public BlockPostingsWriter(SegmentWriteState state, float acceptableOverheadRatio) throws IOException {
|
||||
super();
|
||||
|
||||
docOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPostingsFormat.DOC_EXTENSION),
|
||||
@ -122,23 +123,24 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
|
||||
boolean success = false;
|
||||
try {
|
||||
CodecUtil.writeHeader(docOut, DOC_CODEC, VERSION_CURRENT);
|
||||
forUtil = new ForUtil(acceptableOverheadRatio, docOut);
|
||||
if (state.fieldInfos.hasProx()) {
|
||||
posDeltaBuffer = new int[BLOCK_SIZE];
|
||||
posDeltaBuffer = new int[MAX_DATA_SIZE];
|
||||
posOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPostingsFormat.POS_EXTENSION),
|
||||
state.context);
|
||||
CodecUtil.writeHeader(posOut, POS_CODEC, VERSION_CURRENT);
|
||||
|
||||
if (state.fieldInfos.hasPayloads()) {
|
||||
payloadBytes = new byte[128];
|
||||
payloadLengthBuffer = new int[BLOCK_SIZE];
|
||||
payloadLengthBuffer = new int[MAX_DATA_SIZE];
|
||||
} else {
|
||||
payloadBytes = null;
|
||||
payloadLengthBuffer = null;
|
||||
}
|
||||
|
||||
if (state.fieldInfos.hasOffsets()) {
|
||||
offsetStartDeltaBuffer = new int[BLOCK_SIZE];
|
||||
offsetLengthBuffer = new int[BLOCK_SIZE];
|
||||
offsetStartDeltaBuffer = new int[MAX_DATA_SIZE];
|
||||
offsetLengthBuffer = new int[MAX_DATA_SIZE];
|
||||
} else {
|
||||
offsetStartDeltaBuffer = null;
|
||||
offsetLengthBuffer = null;
|
||||
@ -165,19 +167,22 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
|
||||
}
|
||||
}
|
||||
|
||||
docDeltaBuffer = new int[BLOCK_SIZE];
|
||||
freqBuffer = new int[BLOCK_SIZE];
|
||||
docDeltaBuffer = new int[MAX_DATA_SIZE];
|
||||
freqBuffer = new int[MAX_DATA_SIZE];
|
||||
|
||||
// nocommit should we try skipping every 2/4 blocks...?
|
||||
skipWriter = new BlockSkipWriter(maxSkipLevels,
|
||||
BLOCK_SIZE,
|
||||
skipWriter = new BlockSkipWriter(maxSkipLevels,
|
||||
BLOCK_SIZE,
|
||||
state.segmentInfo.getDocCount(),
|
||||
docOut,
|
||||
posOut,
|
||||
payOut);
|
||||
|
||||
encoded = new byte[BLOCK_SIZE*4];
|
||||
encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer();
|
||||
encoded = new byte[MAX_ENCODED_SIZE];
|
||||
}
|
||||
|
||||
public BlockPostingsWriter(SegmentWriteState state) throws IOException {
|
||||
this(state, PackedInts.DEFAULT);
|
||||
}
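The new float argument is passed straight to PackedInts.fastestFormatAndBits and trades disk space against decode speed when a packed format is picked per bits-per-value; PackedInts.DEFAULT is the balanced setting used by the no-arg constructor. A hedged usage sketch (PackedInts.COMPACT and PackedInts.FASTEST are the standard PackedInts constants; only DEFAULT appears in this diff):

    // smallest encoding, possibly slower to decode:
    PostingsWriterBase w1 = new BlockPostingsWriter(state, PackedInts.COMPACT);
    // fastest decode, possibly some wasted bits per block:
    PostingsWriterBase w2 = new BlockPostingsWriter(state, PackedInts.FASTEST);
    // equivalent to new BlockPostingsWriter(state):
    PostingsWriterBase w3 = new BlockPostingsWriter(state, PackedInts.DEFAULT);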
|
||||
|
||||
@Override
|
||||
@ -214,17 +219,12 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
|
||||
skipWriter.resetSkip();
|
||||
}
|
||||
|
||||
private void writeBlock(int[] buffer, IndexOutput out) throws IOException {
|
||||
final int header = ForUtil.compress(buffer, encodedBuffer);
|
||||
out.writeVInt(header);
|
||||
out.writeBytes(encoded, ForUtil.getEncodedSize(header));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startDoc(int docID, int termDocFreq) throws IOException {
|
||||
if (DEBUG) {
|
||||
System.out.println("FPW.startDoc docID["+docBufferUpto+"]=" + docID);
|
||||
}
|
||||
|
||||
final int docDelta = docID - lastDocID;
|
||||
|
||||
if (docID < 0 || (docCount > 0 && docDelta <= 0)) {
|
||||
@ -245,17 +245,18 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
|
||||
if (DEBUG) {
|
||||
System.out.println(" write docDelta block @ fp=" + docOut.getFilePointer());
|
||||
}
|
||||
writeBlock(docDeltaBuffer, docOut);
|
||||
forUtil.writeBlock(docDeltaBuffer, encoded, docOut);
|
||||
if (fieldHasFreqs) {
|
||||
if (DEBUG) {
|
||||
System.out.println(" write freq block @ fp=" + docOut.getFilePointer());
|
||||
}
|
||||
writeBlock(freqBuffer, docOut);
|
||||
forUtil.writeBlock(freqBuffer, encoded, docOut);
|
||||
}
|
||||
// NOTE: don't set docBufferUpto back to 0 here;
|
||||
// finishDoc will do so (because it needs to see that
|
||||
// the block was filled so it can save skip data)
|
||||
}
|
||||
|
||||
lastDocID = docID;
|
||||
lastPosition = 0;
|
||||
lastStartOffset = 0;
|
||||
@ -296,17 +297,17 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
|
||||
if (DEBUG) {
|
||||
System.out.println(" write pos bulk block @ fp=" + posOut.getFilePointer());
|
||||
}
|
||||
writeBlock(posDeltaBuffer, posOut);
|
||||
forUtil.writeBlock(posDeltaBuffer, encoded, posOut);
|
||||
|
||||
if (fieldHasPayloads) {
|
||||
writeBlock(payloadLengthBuffer, payOut);
|
||||
forUtil.writeBlock(payloadLengthBuffer, encoded, payOut);
|
||||
payOut.writeVInt(payloadByteUpto);
|
||||
payOut.writeBytes(payloadBytes, 0, payloadByteUpto);
|
||||
payloadByteUpto = 0;
|
||||
}
|
||||
if (fieldHasOffsets) {
|
||||
writeBlock(offsetStartDeltaBuffer, payOut);
|
||||
writeBlock(offsetLengthBuffer, payOut);
|
||||
forUtil.writeBlock(offsetStartDeltaBuffer, encoded, payOut);
|
||||
forUtil.writeBlock(offsetLengthBuffer, encoded, payOut);
|
||||
}
|
||||
posBufferUpto = 0;
|
||||
}
|
||||
@ -475,7 +476,7 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
|
||||
|
||||
int skipOffset;
|
||||
if (docCount > BLOCK_SIZE) {
|
||||
skipOffset = (int) (skipWriter.writeSkip(docOut)-docTermStartFP);
|
||||
skipOffset = (int) (skipWriter.writeSkip(docOut) - docTermStartFP);
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println("skip packet " + (docOut.getFilePointer() - (docTermStartFP + skipOffset)) + " bytes");
|
||||
|
@ -52,7 +52,7 @@ import org.apache.lucene.store.IndexInput;
 */
final class BlockSkipReader extends MultiLevelSkipListReader {
  private boolean DEBUG = BlockPostingsReader.DEBUG;
  private int blockSize;
  private final int blockSize;

  private long docPointer[];
  private long posPointer[];
@ -95,7 +95,6 @@ final class BlockSkipReader extends MultiLevelSkipListReader {
    }
  }


  /**
   * Trim original docFreq to tell skipReader read proper number of skip points.
   *
@ -37,7 +37,7 @@ import org.apache.lucene.codecs.MultiLevelSkipListWriter;
 * block, only record skip data at the start its start point(if it exist).
 *
 * For each skip point, we will record:
 * 1. docID in former position, i.e. for position 12, record docID[11], etc.
 * 1. docID in former position, i.e. for position 12, record docID[11], etc.
 * 2. its related file points(position, payload),
 * 3. related numbers or uptos(position, payload).
 * 4. start offset.
@ -16,149 +16,208 @@ package org.apache.lucene.codecs.block;
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.nio.IntBuffer;
|
||||
|
||||
import static org.apache.lucene.codecs.block.BlockPostingsFormat.BLOCK_SIZE;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
import org.apache.lucene.util.packed.PackedInts.Decoder;
|
||||
import org.apache.lucene.util.packed.PackedInts.FormatAndBits;
|
||||
|
||||
/**
|
||||
* Encode all values in normal area with fixed bit width,
|
||||
* which is determined by the max value in this block.
|
||||
*/
|
||||
public final class ForUtil {
|
||||
protected static final int[] MASK = { 0x00000000,
|
||||
0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f,
|
||||
0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
|
||||
0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff, 0x0003ffff,
|
||||
0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
|
||||
0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff,
|
||||
0x7fffffff, 0xffffffff};
|
||||
|
||||
/** Compress given int[] into Integer buffer, with For format
|
||||
*
|
||||
* @param data uncompressed data
|
||||
* @param intBuffer integer buffer to hold compressed data
|
||||
* @return the header for the current block
|
||||
*/
|
||||
static int compress(final int[] data, IntBuffer intBuffer) {
|
||||
int numBits = getNumBits(data);
|
||||
if (numBits == 0) {
|
||||
return compressDuplicateBlock(data, intBuffer);
|
||||
}
|
||||
|
||||
for (int i=0; i<BLOCK_SIZE; ++i) {
|
||||
assert data[i] >= 0;
|
||||
encodeNormalValue(intBuffer, i, data[i], numBits);
|
||||
}
|
||||
|
||||
return numBits;
|
||||
}
|
||||
final class ForUtil {
|
||||
|
||||
/**
|
||||
* Save only one int when the whole block equals to a
|
||||
* single value.
|
||||
* Special number of bits per value used whenever all values to encode are equal.
|
||||
*/
|
||||
static int compressDuplicateBlock(final int[] data, IntBuffer intBuffer) {
|
||||
intBuffer.put(0, data[0]);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/** Decompress given Integer buffer into int array.
|
||||
*
|
||||
* @param intBuffer integer buffer to hold compressed data
|
||||
* @param data int array to hold uncompressed data
|
||||
* @param header header of current block, which contains numFrameBits
|
||||
*/
|
||||
static void decompress(IntBuffer intBuffer, int[] data, int header) {
|
||||
// since this buffer is reused at upper level, rewind first
|
||||
intBuffer.rewind();
|
||||
|
||||
// NOTE: header == numBits now, but we may change that
|
||||
final int numBits = header;
|
||||
assert numBits >=0 && numBits < 32;
|
||||
decompressCore(intBuffer, data, numBits);
|
||||
}
|
||||
|
||||
public static void decompressCore(IntBuffer intBuffer, int[] data, int numBits) {
|
||||
switch(numBits) {
|
||||
case 0: PackedIntsDecompress.decode0(intBuffer, data); break;
|
||||
case 1: PackedIntsDecompress.decode1(intBuffer, data); break;
|
||||
case 2: PackedIntsDecompress.decode2(intBuffer, data); break;
|
||||
case 3: PackedIntsDecompress.decode3(intBuffer, data); break;
|
||||
case 4: PackedIntsDecompress.decode4(intBuffer, data); break;
|
||||
case 5: PackedIntsDecompress.decode5(intBuffer, data); break;
|
||||
case 6: PackedIntsDecompress.decode6(intBuffer, data); break;
|
||||
case 7: PackedIntsDecompress.decode7(intBuffer, data); break;
|
||||
case 8: PackedIntsDecompress.decode8(intBuffer, data); break;
|
||||
case 9: PackedIntsDecompress.decode9(intBuffer, data); break;
|
||||
case 10: PackedIntsDecompress.decode10(intBuffer, data); break;
|
||||
case 11: PackedIntsDecompress.decode11(intBuffer, data); break;
|
||||
case 12: PackedIntsDecompress.decode12(intBuffer, data); break;
|
||||
case 13: PackedIntsDecompress.decode13(intBuffer, data); break;
|
||||
case 14: PackedIntsDecompress.decode14(intBuffer, data); break;
|
||||
case 15: PackedIntsDecompress.decode15(intBuffer, data); break;
|
||||
case 16: PackedIntsDecompress.decode16(intBuffer, data); break;
|
||||
case 17: PackedIntsDecompress.decode17(intBuffer, data); break;
|
||||
case 18: PackedIntsDecompress.decode18(intBuffer, data); break;
|
||||
case 19: PackedIntsDecompress.decode19(intBuffer, data); break;
|
||||
case 20: PackedIntsDecompress.decode20(intBuffer, data); break;
|
||||
case 21: PackedIntsDecompress.decode21(intBuffer, data); break;
|
||||
case 22: PackedIntsDecompress.decode22(intBuffer, data); break;
|
||||
case 23: PackedIntsDecompress.decode23(intBuffer, data); break;
|
||||
case 24: PackedIntsDecompress.decode24(intBuffer, data); break;
|
||||
case 25: PackedIntsDecompress.decode25(intBuffer, data); break;
|
||||
case 26: PackedIntsDecompress.decode26(intBuffer, data); break;
|
||||
case 27: PackedIntsDecompress.decode27(intBuffer, data); break;
|
||||
case 28: PackedIntsDecompress.decode28(intBuffer, data); break;
|
||||
case 29: PackedIntsDecompress.decode29(intBuffer, data); break;
|
||||
case 30: PackedIntsDecompress.decode30(intBuffer, data); break;
|
||||
case 31: PackedIntsDecompress.decode31(intBuffer, data); break;
|
||||
// nocommit have default throw exc? or add assert up above
|
||||
}
|
||||
}
|
||||
|
||||
static void encodeNormalValue(IntBuffer intBuffer, int pos, int value, int numBits) {
|
||||
final int globalBitPos = numBits*pos; // position in bit stream
|
||||
final int localBitPos = globalBitPos & 31; // position inside an int
|
||||
int intPos = globalBitPos/32; // which integer to locate
|
||||
setBufferIntBits(intBuffer, intPos, localBitPos, numBits, value);
|
||||
if ((localBitPos + numBits) > 32) { // value does not fit in this int, fill tail
|
||||
setBufferIntBits(intBuffer, intPos+1, 0,
|
||||
(localBitPos+numBits-32),
|
||||
(value >>> (32-localBitPos)));
|
||||
}
|
||||
}
|
||||
|
||||
static void setBufferIntBits(IntBuffer intBuffer, int intPos, int firstBitPos, int numBits, int value) {
|
||||
assert (value & ~MASK[numBits]) == 0;
|
||||
// safely discards those msb parts when firstBitPos+numBits>32
|
||||
intBuffer.put(intPos,
|
||||
(intBuffer.get(intPos) & ~(MASK[numBits] << firstBitPos))
|
||||
| (value << firstBitPos));
|
||||
}
|
||||
private static final int ALL_VALUES_EQUAL = 0;
|
||||
private static final int PACKED_INTS_VERSION = 0; // nocommit: encode in the stream?
|
||||
|
||||
/**
|
||||
* Returns number of bits necessary to represent max value.
|
||||
* Upper limit of the number of bytes that might be required to stored
|
||||
* <code>BLOCK_SIZE</code> encoded values.
|
||||
*/
|
||||
static int getNumBits(final int[] data) {
|
||||
if (isAllEqual(data)) {
|
||||
return 0;
|
||||
}
|
||||
int size=data.length;
|
||||
int optBits=1;
|
||||
for (int i=0; i<size; ++i) {
|
||||
while ((data[i] & ~MASK[optBits]) != 0) {
|
||||
optBits++;
|
||||
static final int MAX_ENCODED_SIZE = BLOCK_SIZE * 4;
|
||||
|
||||
/**
|
||||
* Upper limit of the number of values that might be decoded in a single call to
|
||||
* {@link #readBlock(IndexInput, byte[], int[])}. Although values after
|
||||
* <code>BLOCK_SIZE</code> are garbage, it is necessary to allocate value buffers
|
||||
* whose size is >= MAX_DATA_SIZE to avoid {@link ArrayIndexOutOfBoundsException}s.
|
||||
*/
|
||||
static final int MAX_DATA_SIZE;
|
||||
static {
|
||||
int minDataSize = 0;
|
||||
for (PackedInts.Format format : PackedInts.Format.values()) {
|
||||
for (int bpv = 1; bpv <= 32; ++bpv) {
|
||||
if (!format.isSupported(bpv)) {
|
||||
continue;
|
||||
}
|
||||
final PackedInts.Decoder decoder = PackedInts.getDecoder(format, PACKED_INTS_VERSION, bpv);
|
||||
final int iterations = (int) Math.ceil((float) BLOCK_SIZE / decoder.valueCount());
|
||||
minDataSize = Math.max(minDataSize, iterations * decoder.valueCount());
|
||||
}
|
||||
}
|
||||
assert optBits < 32;
|
||||
return optBits;
|
||||
MAX_DATA_SIZE = minDataSize;
|
||||
}
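Why the value buffers must be MAX_DATA_SIZE rather than BLOCK_SIZE: decoders work in whole iterations of decoder.valueCount() values, and 128 need not be a multiple of that count, so the last iteration may write a few values past index 127. Worked example with a made-up count (the real numbers depend on the chosen format and bitsPerValue):

    int valueCount = 48;                                         // hypothetical decoder.valueCount()
    int iterations = (int) Math.ceil((float) 128 / valueCount);  // 3
    int written = iterations * valueCount;                       // 144 > BLOCK_SIZE, hence MAX_DATA_SIZE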
|
||||
|
||||
/**
|
||||
* Compute the number of iterations required to decode <code>BLOCK_SIZE</code>
|
||||
* values with the provided {@link Decoder}.
|
||||
*/
|
||||
private static int computeIterations(PackedInts.Decoder decoder) {
|
||||
return (int) Math.ceil((float) BLOCK_SIZE / decoder.valueCount());
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the number of bytes required to encode a block of values that require
|
||||
* <code>bitsPerValue</code> bits per value with format <code>format</code>.
|
||||
*/
|
||||
private static int encodedSize(PackedInts.Format format, int bitsPerValue) {
|
||||
return format.nblocks(bitsPerValue, BLOCK_SIZE) << 3;
|
||||
}
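For the plain packed layout this is easy to check by hand: nblocks counts 64-bit blocks and << 3 converts them to bytes, so 128 values at bitsPerValue bits occupy 2 * bitsPerValue longs = 16 * bitsPerValue bytes; at the maximum of 32 bits per value that is 512 bytes, exactly MAX_ENCODED_SIZE = BLOCK_SIZE * 4. Other formats can need slightly different byte counts, which is why the size is looked up per chosen format rather than computed inline. Worked arithmetic (packed case only):

    int packedBytes = 128 * bitsPerValue / 64 * 8;  // == 16 * bitsPerValue; 512 when bitsPerValue == 32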
|
||||
|
||||
private final int[] encodedSizes;
|
||||
private final PackedInts.Encoder[] encoders;
|
||||
private final PackedInts.Decoder[] decoders;
|
||||
private final int[] iterations;
|
||||
|
||||
/**
|
||||
* Create a new {@link ForUtil} instance and save state into <code>out</code>.
|
||||
*/
|
||||
ForUtil(float acceptableOverheadRatio, DataOutput out) throws IOException {
|
||||
encodedSizes = new int[33];
|
||||
encoders = new PackedInts.Encoder[33];
|
||||
decoders = new PackedInts.Decoder[33];
|
||||
iterations = new int[33];
|
||||
|
||||
for (int bpv = 1; bpv <= 32; ++bpv) {
|
||||
final FormatAndBits formatAndBits = PackedInts.fastestFormatAndBits(
|
||||
BLOCK_SIZE, bpv, acceptableOverheadRatio);
|
||||
assert formatAndBits.format.isSupported(formatAndBits.bitsPerValue);
|
||||
assert formatAndBits.bitsPerValue <= 32;
|
||||
encodedSizes[bpv] = encodedSize(formatAndBits.format, formatAndBits.bitsPerValue);
|
||||
encoders[bpv] = PackedInts.getEncoder(
|
||||
formatAndBits.format, PACKED_INTS_VERSION, formatAndBits.bitsPerValue);
|
||||
decoders[bpv] = PackedInts.getDecoder(
|
||||
formatAndBits.format, PACKED_INTS_VERSION, formatAndBits.bitsPerValue);
|
||||
iterations[bpv] = computeIterations(decoders[bpv]);
|
||||
|
||||
out.writeVInt(formatAndBits.format.getId() << 5 | (formatAndBits.bitsPerValue - 1));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Restore a {@link ForUtil} from a {@link DataInput}.
|
||||
*/
|
||||
ForUtil(DataInput in) throws IOException {
|
||||
encodedSizes = new int[33];
|
||||
encoders = new PackedInts.Encoder[33];
|
||||
decoders = new PackedInts.Decoder[33];
|
||||
iterations = new int[33];
|
||||
|
||||
for (int bpv = 1; bpv <= 32; ++bpv) {
|
||||
final int code = in.readVInt();
|
||||
final int formatId = code >>> 5;
|
||||
final int bitsPerValue = (code & 31) + 1;
|
||||
|
||||
final PackedInts.Format format = PackedInts.Format.byId(formatId);
|
||||
assert format.isSupported(bitsPerValue);
|
||||
encodedSizes[bpv] = encodedSize(format, bitsPerValue);
|
||||
encoders[bpv] = PackedInts.getEncoder(
|
||||
format, PACKED_INTS_VERSION, bitsPerValue);
|
||||
decoders[bpv] = PackedInts.getDecoder(
|
||||
format, PACKED_INTS_VERSION, bitsPerValue);
|
||||
iterations[bpv] = computeIterations(decoders[bpv]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Write a block of data (<code>For</code> format).
|
||||
*
|
||||
* @param data the data to write
|
||||
* @param encoded a buffer to use to encode data
|
||||
* @param out the destination output
|
||||
* @throws IOException
|
||||
*/
|
||||
void writeBlock(int[] data, byte[] encoded, IndexOutput out) throws IOException {
|
||||
if (isAllEqual(data)) {
|
||||
out.writeVInt(ALL_VALUES_EQUAL);
|
||||
out.writeInt(data[0]);
|
||||
return;
|
||||
}
|
||||
|
||||
final int numBits = bitsRequired(data);
|
||||
assert numBits > 0 && numBits <= 32 : numBits;
|
||||
final PackedInts.Encoder encoder = encoders[numBits];
|
||||
final int iters = iterations[numBits];
|
||||
assert iters * encoder.valueCount() >= BLOCK_SIZE;
|
||||
final int encodedSize = encodedSizes[numBits];
|
||||
assert (iters * encoder.blockCount()) << 3 >= encodedSize;
|
||||
|
||||
out.writeVInt(numBits);
|
||||
|
||||
encoder.encode(data, 0, encoded, 0, iters);
|
||||
out.writeBytes(encoded, encodedSize);
|
||||
}
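The ALL_VALUES_EQUAL shortcut matters most for freq blocks, where every value is often 1. The two on-disk layouts writeBlock can produce, with sizes for the packed case worked out:

    // all 128 values equal (e.g. freq == 1 for every doc):
    //   writeVInt(0)   -> 1 byte   (ALL_VALUES_EQUAL marker)
    //   writeInt(1)    -> 4 bytes
    //   total: 5 bytes, versus 1 + 16 bytes for a packed 1-bit-per-value block
    //
    // general block:
    //   writeVInt(numBits)                         -> 1 byte (numBits <= 32 fits in one vInt byte)
    //   writeBytes(encoded, encodedSizes[numBits])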
|
||||
|
||||
/**
|
||||
* Read the next block of data (<code>For</code> format).
|
||||
*
|
||||
* @param in the input to use to read data
|
||||
* @param encoded a buffer that can be used to store encoded data
|
||||
* @param decoded where to write decoded data
|
||||
* @throws IOException
|
||||
*/
|
||||
void readBlock(IndexInput in, byte[] encoded, int[] decoded) throws IOException {
|
||||
final int numBits = in.readVInt();
|
||||
assert numBits <= 32 : numBits;
|
||||
|
||||
if (numBits == ALL_VALUES_EQUAL) {
|
||||
final int value = in.readInt();
|
||||
Arrays.fill(decoded, 0, BLOCK_SIZE, value);
|
||||
return;
|
||||
}
|
||||
|
||||
final int encodedSize = encodedSizes[numBits];
|
||||
in.readBytes(encoded, 0, encodedSize);
|
||||
|
||||
final PackedInts.Decoder decoder = decoders[numBits];
|
||||
final int iters = iterations[numBits];
|
||||
assert iters * decoder.valueCount() >= BLOCK_SIZE;
|
||||
|
||||
decoder.decode(encoded, 0, decoded, 0, iters);
|
||||
}
|
||||
|
||||
/**
|
||||
* Skip the next block of data.
|
||||
*
|
||||
* @param in the input where to read data
|
||||
* @throws IOException
|
||||
*/
|
||||
void skipBlock(IndexInput in) throws IOException {
|
||||
final int numBits = in.readVInt();
|
||||
if (numBits == ALL_VALUES_EQUAL) {
|
||||
in.seek(in.getFilePointer() + 4);
|
||||
return;
|
||||
}
|
||||
assert numBits > 0 && numBits <= 32 : numBits;
|
||||
final int encodedSize = encodedSizes[numBits];
|
||||
in.seek(in.getFilePointer() + encodedSize);
|
||||
}
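Because the vInt header alone determines the payload size, a consumer can step over blocks it does not need without decoding them. Small usage sketch (the helper name and arguments are illustrative):

    void readOrSkip(ForUtil forUtil, IndexInput in, boolean needed) throws IOException {
      byte[] encoded = new byte[ForUtil.MAX_ENCODED_SIZE];
      int[] values = new int[ForUtil.MAX_DATA_SIZE];
      if (needed) {
        forUtil.readBlock(in, encoded, values);  // decodes BLOCK_SIZE values into values[0..127]
      } else {
        forUtil.skipBlock(in);                   // seeks past the encoded block
      }
    }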
|
||||
|
||||
// nocommit: we must have a util function for this, hmm?
|
||||
protected static boolean isAllEqual(final int[] data) {
|
||||
int len = data.length;
|
||||
int v = data[0];
|
||||
for (int i=1; i<len; i++) {
|
||||
private static boolean isAllEqual(final int[] data) {
|
||||
final long v = data[0];
|
||||
for (int i = 1; i < BLOCK_SIZE; ++i) {
|
||||
if (data[i] != v) {
|
||||
return false;
|
||||
}
|
||||
@ -166,11 +225,16 @@ public final class ForUtil {
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Expert: get compressed block size(in byte)
|
||||
/**
|
||||
* Compute the number of bits required to serialize any of the longs in
|
||||
* <code>data</code>.
|
||||
*/
|
||||
static int getEncodedSize(int numBits) {
|
||||
// NOTE: works only because BLOCK_SIZE is 0 mod 8:
|
||||
return numBits == 0 ? 4 : numBits*BLOCK_SIZE/8;
|
||||
private static int bitsRequired(final int[] data) {
|
||||
long or = 0;
|
||||
for (int i = 0; i < BLOCK_SIZE; ++i) {
|
||||
or |= data[i];
|
||||
}
|
||||
return PackedInts.bitsRequired(or);
|
||||
}
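OR-ing all values yields the bit count of the largest one in a single pass. Tiny worked example (illustrative deltas):

    // deltas 3 (00011), 17 (10001), 9 (01001):
    long or = 3 | 17 | 9;                        // 0b11011 = 27
    int numBits = PackedInts.bitsRequired(or);   // 5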
|
||||
|
||||
}
File diff suppressed because it is too large
@ -1,107 +0,0 @@
|
||||
#!/usr/bin/env python2
|
||||
"""
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
"""
|
||||
|
||||
"""
|
||||
Generate source code for java classes for For or PFor decompression.
|
||||
"""
|
||||
|
||||
def bitsExpr(i, numFrameBits):
|
||||
framePos = i * numFrameBits
|
||||
intValNum = (framePos / 32)
|
||||
bitPos = framePos % 32
|
||||
bitsInInt = "intValue" + str(intValNum)
|
||||
needBrackets = 0
|
||||
if bitPos > 0:
|
||||
bitsInInt += " >>> " + str(bitPos)
|
||||
needBrackets = 1
|
||||
if bitPos + numFrameBits > 32:
|
||||
if needBrackets:
|
||||
bitsInInt = "(" + bitsInInt + ")"
|
||||
bitsInInt += " | (intValue" + str(intValNum+1) + " << "+ str(32 - bitPos) + ")"
|
||||
needBrackets = 1
|
||||
if bitPos + numFrameBits != 32:
|
||||
if needBrackets:
|
||||
bitsInInt = "(" + bitsInInt + ")"
|
||||
bitsInInt += " & mask"
|
||||
return bitsInInt
|
||||
|
||||
|
||||
def genDecompress():
|
||||
className = "PackedIntsDecompress"
|
||||
fileName = className + ".java"
|
||||
imports = "import java.nio.IntBuffer;\n"
|
||||
f = open(fileName, 'w')
|
||||
w = f.write
|
||||
try:
|
||||
w("package org.apache.lucene.codecs.block;\n")
|
||||
w("""/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
""")
|
||||
|
||||
w("\n/* This code is generated, do not modify. See gendecompress.py */\n\n")
|
||||
|
||||
w("import java.nio.IntBuffer;\n")
|
||||
w("import java.util.Arrays;\n\n")
|
||||
|
||||
w("final class PackedIntsDecompress {\n")
|
||||
|
||||
w('\n // nocommit: assess perf of this to see if specializing is really needed\n')
|
||||
w('\n // NOTE: hardwired to blockSize == 128\n\n')
|
||||
|
||||
w(' public static void decode0(final IntBuffer compressedBuffer, final int[] output) {\n')
|
||||
w(' Arrays.fill(output, compressedBuffer.get());\n')
|
||||
w(' }\n')
|
||||
|
||||
for numFrameBits in xrange(1, 32):
|
||||
w(' public static void decode%d(final IntBuffer compressedBuffer, final int[] output) {\n' % numFrameBits)
|
||||
w(' final int numFrameBits = %d;\n' % numFrameBits)
|
||||
w(' final int mask = (int) ((1L<<numFrameBits) - 1);\n')
|
||||
w(' int outputOffset = 0;\n')
|
||||
w(' for(int step=0;step<4;step++) {\n')
|
||||
|
||||
for i in range(numFrameBits): # declare int vars and init from buffer
|
||||
w(" int intValue" + str(i) + " = compressedBuffer.get();\n")
|
||||
|
||||
for i in range(32): # set output from int vars
|
||||
w(" output[" + str(i) + " + outputOffset] = " + bitsExpr(i, numFrameBits) + ";\n")
|
||||
|
||||
w(' outputOffset += 32;\n')
|
||||
w(' }\n')
|
||||
w(' }\n')
|
||||
|
||||
w('}\n')
|
||||
|
||||
finally:
|
||||
f.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
genDecompress()
|
@ -1,110 +0,0 @@
|
||||
package org.apache.lucene.codecs.blockpacked;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.codecs.BlockTreeTermsReader;
|
||||
import org.apache.lucene.codecs.BlockTreeTermsWriter;
|
||||
import org.apache.lucene.codecs.FieldsConsumer;
|
||||
import org.apache.lucene.codecs.FieldsProducer;
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.PostingsReaderBase;
|
||||
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
/**
|
||||
* Encodes/decode postings in packed int blocks for faster
|
||||
* decode.
|
||||
*/
|
||||
public final class BlockPackedPostingsFormat extends PostingsFormat {
|
||||
public static final String DOC_EXTENSION = "doc";
|
||||
public static final String POS_EXTENSION = "pos";
|
||||
public static final String PAY_EXTENSION = "pay";
|
||||
|
||||
private final int minTermBlockSize;
|
||||
private final int maxTermBlockSize;
|
||||
|
||||
// nocommit is this right?:
|
||||
// NOTE: should be at least 64 because of PackedInts long-aligned encoding/decoding
|
||||
// NOTE: must be factor of ... 64?
|
||||
public final static int BLOCK_SIZE = 128;
|
||||
|
||||
public BlockPackedPostingsFormat() {
|
||||
this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
|
||||
}
|
||||
|
||||
public BlockPackedPostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
|
||||
super("BlockPacked");
|
||||
this.minTermBlockSize = minTermBlockSize;
|
||||
assert minTermBlockSize > 1;
|
||||
this.maxTermBlockSize = maxTermBlockSize;
|
||||
assert minTermBlockSize <= maxTermBlockSize;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getName() + "(blocksize=" + BLOCK_SIZE + ")";
|
||||
}
|
||||
|
||||
@Override
|
||||
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
|
||||
PostingsWriterBase postingsWriter = new BlockPackedPostingsWriter(state);
|
||||
|
||||
boolean success = false;
|
||||
try {
|
||||
FieldsConsumer ret = new BlockTreeTermsWriter(state,
|
||||
postingsWriter,
|
||||
minTermBlockSize,
|
||||
maxTermBlockSize);
|
||||
success = true;
|
||||
return ret;
|
||||
} finally {
|
||||
if (!success) {
|
||||
IOUtils.closeWhileHandlingException(postingsWriter);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
|
||||
PostingsReaderBase postingsReader = new BlockPackedPostingsReader(state.dir,
|
||||
state.fieldInfos,
|
||||
state.segmentInfo,
|
||||
state.context,
|
||||
state.segmentSuffix);
|
||||
boolean success = false;
|
||||
try {
|
||||
FieldsProducer ret = new BlockTreeTermsReader(state.dir,
|
||||
state.fieldInfos,
|
||||
state.segmentInfo.name,
|
||||
postingsReader,
|
||||
state.context,
|
||||
state.segmentSuffix,
|
||||
state.termsIndexDivisor);
|
||||
success = true;
|
||||
return ret;
|
||||
} finally {
|
||||
if (!success) {
|
||||
IOUtils.closeWhileHandlingException(postingsReader);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
File diff suppressed because it is too large
@ -1,562 +0,0 @@
|
||||
package org.apache.lucene.codecs.blockpacked;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import static org.apache.lucene.codecs.blockpacked.BlockPackedPostingsFormat.BLOCK_SIZE;
|
||||
import static org.apache.lucene.codecs.blockpacked.BlockPackedPostingsReader.DEBUG;
|
||||
import static org.apache.lucene.codecs.blockpacked.ForUtil.MAX_DATA_SIZE;
|
||||
import static org.apache.lucene.codecs.blockpacked.ForUtil.MAX_ENCODED_SIZE;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||
import org.apache.lucene.codecs.TermStats;
|
||||
import org.apache.lucene.index.CorruptIndexException;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.store.RAMOutputStream;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
|
||||
/**
|
||||
* Concrete class that writes docId(maybe frq,pos,offset,payloads) list
|
||||
* with postings format.
|
||||
*
|
||||
* Postings list for each term will be stored separately.
|
||||
*
|
||||
* @see BlockPackedSkipWriter for details about skipping setting and postings layout.
|
||||
*
|
||||
*/
|
||||
public final class BlockPackedPostingsWriter extends PostingsWriterBase {
|
||||
|
||||
// nocommit move these constants to the PF:
|
||||
|
||||
static final int maxSkipLevels = 10;
|
||||
|
||||
final static String TERMS_CODEC = "BlockPackedPostingsWriterTerms";
|
||||
final static String DOC_CODEC = "BlockPackedPostingsWriterDoc";
|
||||
final static String POS_CODEC = "BlockPackedPostingsWriterPos";
|
||||
final static String PAY_CODEC = "BlockPackedPostingsWriterPay";
|
||||
|
||||
// Increment version to change it:
|
||||
final static int VERSION_START = 0;
|
||||
final static int VERSION_CURRENT = VERSION_START;
|
||||
|
||||
final IndexOutput docOut;
|
||||
final IndexOutput posOut;
|
||||
final IndexOutput payOut;
|
||||
|
||||
private IndexOutput termsOut;
|
||||
|
||||
// How current field indexes postings:
|
||||
private boolean fieldHasFreqs;
|
||||
private boolean fieldHasPositions;
|
||||
private boolean fieldHasOffsets;
|
||||
private boolean fieldHasPayloads;
|
||||
|
||||
// Holds starting file pointers for each term:
|
||||
private long docTermStartFP;
|
||||
private long posTermStartFP;
|
||||
private long payTermStartFP;
|
||||
|
||||
final int[] docDeltaBuffer;
|
||||
final int[] freqBuffer;
|
||||
private int docBufferUpto;
|
||||
|
||||
final int[] posDeltaBuffer;
|
||||
final int[] payloadLengthBuffer;
|
||||
final int[] offsetStartDeltaBuffer;
|
||||
final int[] offsetLengthBuffer;
|
||||
private int posBufferUpto;
|
||||
|
||||
private byte[] payloadBytes;
|
||||
private int payloadByteUpto;
|
||||
|
||||
private int lastBlockDocID;
|
||||
private long lastBlockPosFP;
|
||||
private long lastBlockPayFP;
|
||||
private int lastBlockPosBufferUpto;
|
||||
private int lastBlockStartOffset;
|
||||
private int lastBlockPayloadByteUpto;
|
||||
|
||||
private int lastDocID;
|
||||
private int lastPosition;
|
||||
private int lastStartOffset;
|
||||
private int docCount;
|
||||
|
||||
final byte[] encoded;
|
||||
|
||||
private final ForUtil forUtil;
|
||||
private final BlockPackedSkipWriter skipWriter;
|
||||
|
||||
public BlockPackedPostingsWriter(SegmentWriteState state, float acceptableOverheadRatio) throws IOException {
|
||||
super();
|
||||
|
||||
docOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPackedPostingsFormat.DOC_EXTENSION),
|
||||
state.context);
|
||||
IndexOutput posOut = null;
|
||||
IndexOutput payOut = null;
|
||||
boolean success = false;
|
||||
try {
|
||||
CodecUtil.writeHeader(docOut, DOC_CODEC, VERSION_CURRENT);
|
||||
forUtil = new ForUtil(acceptableOverheadRatio, docOut);
|
||||
if (state.fieldInfos.hasProx()) {
|
||||
posDeltaBuffer = new int[MAX_DATA_SIZE];
|
||||
posOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPackedPostingsFormat.POS_EXTENSION),
|
||||
state.context);
|
||||
CodecUtil.writeHeader(posOut, POS_CODEC, VERSION_CURRENT);
|
||||
|
||||
if (state.fieldInfos.hasPayloads()) {
|
||||
payloadBytes = new byte[128];
|
||||
payloadLengthBuffer = new int[MAX_DATA_SIZE];
|
||||
} else {
|
||||
payloadBytes = null;
|
||||
payloadLengthBuffer = null;
|
||||
}
|
||||
|
||||
if (state.fieldInfos.hasOffsets()) {
|
||||
offsetStartDeltaBuffer = new int[MAX_DATA_SIZE];
|
||||
offsetLengthBuffer = new int[MAX_DATA_SIZE];
|
||||
} else {
|
||||
offsetStartDeltaBuffer = null;
|
||||
offsetLengthBuffer = null;
|
||||
}
|
||||
|
||||
if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) {
|
||||
payOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPackedPostingsFormat.PAY_EXTENSION),
|
||||
state.context);
|
||||
CodecUtil.writeHeader(payOut, PAY_CODEC, VERSION_CURRENT);
|
||||
}
|
||||
} else {
|
||||
posDeltaBuffer = null;
|
||||
payloadLengthBuffer = null;
|
||||
offsetStartDeltaBuffer = null;
|
||||
offsetLengthBuffer = null;
|
||||
payloadBytes = null;
|
||||
}
|
||||
this.payOut = payOut;
|
||||
this.posOut = posOut;
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
IOUtils.closeWhileHandlingException(docOut, posOut, payOut);
|
||||
}
|
||||
}
|
||||
|
||||
docDeltaBuffer = new int[MAX_DATA_SIZE];
|
||||
freqBuffer = new int[MAX_DATA_SIZE];
|
||||
|
||||
// nocommit should we try skipping every 2/4 blocks...?
|
||||
skipWriter = new BlockPackedSkipWriter(maxSkipLevels,
|
||||
BlockPackedPostingsFormat.BLOCK_SIZE,
|
||||
state.segmentInfo.getDocCount(),
|
||||
docOut,
|
||||
posOut,
|
||||
payOut);
|
||||
|
||||
encoded = new byte[MAX_ENCODED_SIZE];
|
||||
}

  public BlockPackedPostingsWriter(SegmentWriteState state) throws IOException {
    this(state, PackedInts.DEFAULT);
  }

  @Override
  public void start(IndexOutput termsOut) throws IOException {
    this.termsOut = termsOut;
    CodecUtil.writeHeader(termsOut, TERMS_CODEC, VERSION_CURRENT);
    termsOut.writeVInt(BLOCK_SIZE);
  }

  @Override
  public void setField(FieldInfo fieldInfo) {
    IndexOptions indexOptions = fieldInfo.getIndexOptions();
    fieldHasFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
    fieldHasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    fieldHasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    fieldHasPayloads = fieldInfo.hasPayloads();
    skipWriter.setField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads);
  }

  @Override
  public void startTerm() {
    docTermStartFP = docOut.getFilePointer();
    if (fieldHasPositions) {
      posTermStartFP = posOut.getFilePointer();
      if (fieldHasPayloads || fieldHasOffsets) {
        payTermStartFP = payOut.getFilePointer();
      }
    }
    lastDocID = 0;
    lastBlockDocID = -1;
    if (DEBUG) {
      System.out.println("FPW.startTerm startFP=" + docTermStartFP);
    }
    skipWriter.resetSkip();
  }

  @Override
  public void startDoc(int docID, int termDocFreq) throws IOException {
    if (DEBUG) {
      System.out.println("FPW.startDoc docID["+docBufferUpto+"]=" + docID);
    }

    final int docDelta = docID - lastDocID;

    if (docID < 0 || (docCount > 0 && docDelta <= 0)) {
      throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " ) (docOut: " + docOut + ")");
    }

    docDeltaBuffer[docBufferUpto] = docDelta;
    // if (DEBUG) {
    //   System.out.println("  docDeltaBuffer[" + docBufferUpto + "]=" + docDelta);
    // }
    if (fieldHasFreqs) {
      freqBuffer[docBufferUpto] = termDocFreq;
    }
    docBufferUpto++;
    docCount++;

    if (docBufferUpto == BLOCK_SIZE) {
      if (DEBUG) {
        System.out.println("  write docDelta block @ fp=" + docOut.getFilePointer());
      }
      forUtil.writeBlock(docDeltaBuffer, encoded, docOut);
      if (fieldHasFreqs) {
        if (DEBUG) {
          System.out.println("  write freq block @ fp=" + docOut.getFilePointer());
        }
        forUtil.writeBlock(freqBuffer, encoded, docOut);
      }
      // NOTE: don't set docBufferUpto back to 0 here;
      // finishDoc will do so (because it needs to see that
      // the block was filled so it can save skip data)
    }

    lastDocID = docID;
    lastPosition = 0;
    lastStartOffset = 0;
  }

  /** Add a new position & payload */
  @Override
  public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
    // if (DEBUG) {
    //   System.out.println("FPW.addPosition pos=" + position + " posBufferUpto=" + posBufferUpto + (fieldHasPayloads ? " payloadByteUpto=" + payloadByteUpto: ""));
    // }
    posDeltaBuffer[posBufferUpto] = position - lastPosition;
    if (fieldHasPayloads) {
      if (payload == null || payload.length == 0) {
        // no payload
        payloadLengthBuffer[posBufferUpto] = 0;
      } else {
        payloadLengthBuffer[posBufferUpto] = payload.length;
        if (payloadByteUpto + payload.length > payloadBytes.length) {
          payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payload.length);
        }
        System.arraycopy(payload.bytes, payload.offset, payloadBytes, payloadByteUpto, payload.length);
        payloadByteUpto += payload.length;
      }
    }

    if (fieldHasOffsets) {
      assert startOffset >= lastStartOffset;
      assert endOffset >= startOffset;
      offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastStartOffset;
      offsetLengthBuffer[posBufferUpto] = endOffset - startOffset;
      lastStartOffset = startOffset;
    }

    posBufferUpto++;
    lastPosition = position;
    if (posBufferUpto == BLOCK_SIZE) {
      if (DEBUG) {
        System.out.println("  write pos bulk block @ fp=" + posOut.getFilePointer());
      }
      forUtil.writeBlock(posDeltaBuffer, encoded, posOut);

      if (fieldHasPayloads) {
        forUtil.writeBlock(payloadLengthBuffer, encoded, payOut);
        payOut.writeVInt(payloadByteUpto);
        payOut.writeBytes(payloadBytes, 0, payloadByteUpto);
        payloadByteUpto = 0;
      }
      if (fieldHasOffsets) {
        forUtil.writeBlock(offsetStartDeltaBuffer, encoded, payOut);
        forUtil.writeBlock(offsetLengthBuffer, encoded, payOut);
      }
      posBufferUpto = 0;
    }
  }

  @Override
  public void finishDoc() throws IOException {
    // We have collected a block of docs and are starting a new doc;
    // write skip data as well as the postings list for the current block.

    if (lastBlockDocID != -1 && docBufferUpto == 1) {
      // nocommit move to startDoc?  ie we can write skip
      // data as soon as the next doc starts...
      if (DEBUG) {
        System.out.println("  bufferSkip at writeBlock: lastDocID=" + lastBlockDocID + " docCount=" + (docCount-1));
      }
      skipWriter.bufferSkip(lastBlockDocID, docCount-1, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockStartOffset, lastBlockPayloadByteUpto);
    }

    // Since we don't know the df for the current term, we buffer
    // skip data for each block, and write it to the skip file
    // when a new doc arrives.
    if (docBufferUpto == BLOCK_SIZE) {
      lastBlockDocID = lastDocID;
      if (posOut != null) {
        if (payOut != null) {
          lastBlockPayFP = payOut.getFilePointer();
        }
        lastBlockPosFP = posOut.getFilePointer();
        lastBlockPosBufferUpto = posBufferUpto;
        lastBlockStartOffset = lastStartOffset;
        lastBlockPayloadByteUpto = payloadByteUpto;
      }
      if (DEBUG) {
        System.out.println("  docBufferUpto="+docBufferUpto+" now get lastBlockDocID="+lastBlockDocID+" lastBlockPosFP=" + lastBlockPosFP + " lastBlockPosBufferUpto=" + lastBlockPosBufferUpto + " lastBlockPayloadByteUpto=" + lastBlockPayloadByteUpto);
      }
      docBufferUpto = 0;
    }
  }

  private static class PendingTerm {
    public final long docStartFP;
    public final long posStartFP;
    public final long payStartFP;
    public final int skipOffset;
    public final int lastPosBlockOffset;

    public PendingTerm(long docStartFP, long posStartFP, long payStartFP, int skipOffset, int lastPosBlockOffset) {
      this.docStartFP = docStartFP;
      this.posStartFP = posStartFP;
      this.payStartFP = payStartFP;
      this.skipOffset = skipOffset;
      this.lastPosBlockOffset = lastPosBlockOffset;
    }
  }

  private final List<PendingTerm> pendingTerms = new ArrayList<PendingTerm>();

  /** Called when we are done adding docs to this term */
  @Override
  public void finishTerm(TermStats stats) throws IOException {
    assert stats.docFreq > 0;

    // TODO: wasteful we are counting this (counting # docs
    // for this term) in two places?
    assert stats.docFreq == docCount: stats.docFreq + " vs " + docCount;

    if (DEBUG) {
      System.out.println("FPW.finishTerm docFreq=" + stats.docFreq);
    }

    if (DEBUG) {
      if (docBufferUpto > 0) {
        System.out.println("  write doc/freq vInt block (count=" + docBufferUpto + ") at fp=" + docOut.getFilePointer() + " docTermStartFP=" + docTermStartFP);
      }
    }

    // vInt encode the remaining doc deltas and freqs:
    for(int i=0;i<docBufferUpto;i++) {
      final int docDelta = docDeltaBuffer[i];
      final int freq = freqBuffer[i];
      if (!fieldHasFreqs) {
        docOut.writeVInt(docDelta);
      } else if (freqBuffer[i] == 1) {
        docOut.writeVInt((docDelta<<1)|1);
      } else {
        docOut.writeVInt(docDelta<<1);
        docOut.writeVInt(freq);
      }
    }
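
    // Example of the folding above: a doc delta of 5 with freq 1 is written
    // as the single vInt (5<<1)|1 = 11, while the same delta with freq 3 is
    // written as the two vInts 5<<1 = 10 followed by 3; the low bit tells the
    // reader whether an explicit freq follows.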

    final int lastPosBlockOffset;

    if (fieldHasPositions) {
      if (DEBUG) {
        if (posBufferUpto > 0) {
          System.out.println("  write pos vInt block (count=" + posBufferUpto + ") at fp=" + posOut.getFilePointer() + " posTermStartFP=" + posTermStartFP + " hasPayloads=" + fieldHasPayloads + " hasOffsets=" + fieldHasOffsets);
        }
      }

      assert stats.totalTermFreq != -1;
      if (stats.totalTermFreq > BLOCK_SIZE) {
        lastPosBlockOffset = (int) (posOut.getFilePointer() - posTermStartFP);
      } else {
        lastPosBlockOffset = -1;
      }
      if (posBufferUpto > 0) {
        posOut.writeVInt(posBufferUpto);

        // nocommit should we send offsets/payloads to
        // .pay...?  seems wasteful (have to store extra
        // vLong for low (< BLOCK_SIZE) DF terms = vast vast
        // majority)

        // vInt encode the remaining positions/payloads/offsets:
        int lastPayloadLength = -1;
        int payloadBytesReadUpto = 0;
        for(int i=0;i<posBufferUpto;i++) {
          final int posDelta = posDeltaBuffer[i];
          if (fieldHasPayloads) {
            final int payloadLength = payloadLengthBuffer[i];
            if (payloadLength != lastPayloadLength) {
              lastPayloadLength = payloadLength;
              posOut.writeVInt((posDelta<<1)|1);
              posOut.writeVInt(payloadLength);
            } else {
              posOut.writeVInt(posDelta<<1);
            }

            if (DEBUG) {
              System.out.println("        i=" + i + " payloadLen=" + payloadLength);
            }

            if (payloadLength != 0) {
              if (DEBUG) {
                System.out.println("          write payload @ pos.fp=" + posOut.getFilePointer());
              }
              posOut.writeBytes(payloadBytes, payloadBytesReadUpto, payloadLength);
              payloadBytesReadUpto += payloadLength;
            }
          } else {
            posOut.writeVInt(posDelta);
          }

          if (fieldHasOffsets) {
            if (DEBUG) {
              System.out.println("          write offset @ pos.fp=" + posOut.getFilePointer());
            }
            posOut.writeVInt(offsetStartDeltaBuffer[i]);
            posOut.writeVInt(offsetLengthBuffer[i]);
          }
        }
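
        // Example: when payloads are enabled, the low bit of each position
        // delta marks whether the payload length changed.  A delta of 2 with
        // an unchanged payload length is written as 2<<1 = 4; if the length
        // changes to 7, the writer emits (2<<1)|1 = 5 followed by the vInt 7.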

        if (fieldHasPayloads) {
          assert payloadBytesReadUpto == payloadByteUpto;
          payloadByteUpto = 0;
        }
      }
      if (DEBUG) {
        System.out.println("  totalTermFreq=" + stats.totalTermFreq + " lastPosBlockOffset=" + lastPosBlockOffset);
      }
    } else {
      lastPosBlockOffset = -1;
    }

    int skipOffset;
    if (docCount > BLOCK_SIZE) {
      skipOffset = (int) (skipWriter.writeSkip(docOut) - docTermStartFP);

      if (DEBUG) {
        System.out.println("skip packet " + (docOut.getFilePointer() - (docTermStartFP + skipOffset)) + " bytes");
      }
    } else {
      skipOffset = -1;
      if (DEBUG) {
        System.out.println("  no skip: docCount=" + docCount);
      }
    }

    long payStartFP;
    if (stats.totalTermFreq >= BLOCK_SIZE) {
      payStartFP = payTermStartFP;
    } else {
      payStartFP = -1;
    }

    if (DEBUG) {
      System.out.println("  payStartFP=" + payStartFP);
    }

    pendingTerms.add(new PendingTerm(docTermStartFP, posTermStartFP, payStartFP, skipOffset, lastPosBlockOffset));
    docBufferUpto = 0;
    posBufferUpto = 0;
    lastDocID = 0;
    docCount = 0;
  }

  private final RAMOutputStream bytesWriter = new RAMOutputStream();

  @Override
  public void flushTermsBlock(int start, int count) throws IOException {

    if (count == 0) {
      termsOut.writeByte((byte) 0);
      return;
    }

    assert start <= pendingTerms.size();
    assert count <= start;

    final int limit = pendingTerms.size() - start + count;

    long lastDocStartFP = 0;
    long lastPosStartFP = 0;
    long lastPayStartFP = 0;
    for(int idx=limit-count; idx<limit; idx++) {
      PendingTerm term = pendingTerms.get(idx);

      bytesWriter.writeVLong(term.docStartFP - lastDocStartFP);
      lastDocStartFP = term.docStartFP;

      if (fieldHasPositions) {
        bytesWriter.writeVLong(term.posStartFP - lastPosStartFP);
        lastPosStartFP = term.posStartFP;
        if (term.lastPosBlockOffset != -1) {
          bytesWriter.writeVInt(term.lastPosBlockOffset);
        }
        if ((fieldHasPayloads || fieldHasOffsets) && term.payStartFP != -1) {
          bytesWriter.writeVLong(term.payStartFP - lastPayStartFP);
          lastPayStartFP = term.payStartFP;
        }
      }

      if (term.skipOffset != -1) {
        bytesWriter.writeVInt(term.skipOffset);
      }
    }

    termsOut.writeVInt((int) bytesWriter.getFilePointer());
    bytesWriter.writeTo(termsOut);
    bytesWriter.reset();

    // Remove the terms we just wrote:
    pendingTerms.subList(limit-count, limit).clear();
  }

  @Override
  public void close() throws IOException {
    IOUtils.close(docOut, posOut, payOut);
  }
}
@ -1,244 +0,0 @@
package org.apache.lucene.codecs.blockpacked;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.codecs.MultiLevelSkipListReader;
import org.apache.lucene.store.IndexInput;

/**
 * Implements the skip list reader for the block postings format
 * that stores positions and payloads.
 *
 * Although this skipper uses MultiLevelSkipListReader as an interface,
 * its definition of a skip position is a little different.
 *
 * For example, when skipInterval = blockSize = 3, df = 2*skipInterval = 6,
 *
 * 0 1 2 3 4 5
 * d d d d d d    (posting list)
 *     ^     ^    (skip point in MultiLevelSkipWriter)
 *     ^          (skip point in BlockSkipWriter)
 *
 * In this case, MultiLevelSkipListReader will use the last document as a skip point,
 * while BlockSkipReader should assume that no skip point will come.
 *
 * If we use the interface directly in BlockSkipReader, it may naively try to read
 * more skip data after the only skip point is loaded.
 *
 * To illustrate this, suppose we call skipTo(d[5]): since skip point d[3] has a smaller docId
 * and numSkipped+blockSize == df, the MultiLevelSkipListReader will assume the skip list
 * isn't exhausted yet, and try to load a non-existent skip point.
 *
 * Therefore, we'll trim df before passing it to the interface; see trim(int).
 *
 */
final class BlockPackedSkipReader extends MultiLevelSkipListReader {
  private boolean DEBUG = BlockPackedPostingsReader.DEBUG;
  private final int blockSize;

  private long docPointer[];
  private long posPointer[];
  private long payPointer[];
  private int posBufferUpto[];
  private int startOffset[];
  private int payloadByteUpto[];

  private long lastPosPointer;
  private long lastPayPointer;
  private int lastStartOffset;
  private int lastPayloadByteUpto;
  private long lastDocPointer;
  private int lastPosBufferUpto;

  public BlockPackedSkipReader(IndexInput skipStream, int maxSkipLevels, int blockSize, boolean hasPos, boolean hasOffsets, boolean hasPayloads) {
    super(skipStream, maxSkipLevels, blockSize, 8);
    this.blockSize = blockSize;
    docPointer = new long[maxSkipLevels];
    if (hasPos) {
      posPointer = new long[maxSkipLevels];
      posBufferUpto = new int[maxSkipLevels];
      if (hasPayloads) {
        payloadByteUpto = new int[maxSkipLevels];
      } else {
        payloadByteUpto = null;
      }
      if (hasOffsets) {
        startOffset = new int[maxSkipLevels];
      } else {
        startOffset = null;
      }
      if (hasOffsets || hasPayloads) {
        payPointer = new long[maxSkipLevels];
      } else {
        payPointer = null;
      }
    } else {
      posPointer = null;
    }
  }

  /**
   * Trim the original docFreq to tell the skipReader to read the proper number of skip points.
   *
   * Since our definition in BlockSkip* is a little different from MultiLevelSkip*,
   * this trimmed docFreq prevents the skipReader from:
   * 1. naively reading a non-existent skip point after the last block boundary
   * 2. moving into the vInt block
   *
   */
  protected int trim(int df) {
    return df % blockSize == 0? df - 1: df;
  }
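
  // Example: with blockSize = 128, trim(256) returns 255, so the reader will
  // not look for a skip point at the exact end of the postings, which would
  // otherwise lead it into the trailing vInt block.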

  public void init(long skipPointer, long docBasePointer, long posBasePointer, long payBasePointer, int df) {
    super.init(skipPointer, trim(df));
    lastDocPointer = docBasePointer;
    lastPosPointer = posBasePointer;
    lastPayPointer = payBasePointer;

    Arrays.fill(docPointer, docBasePointer);
    if (posPointer != null) {
      Arrays.fill(posPointer, posBasePointer);
      if (payPointer != null) {
        Arrays.fill(payPointer, payBasePointer);
      }
    } else {
      assert posBasePointer == 0;
    }
  }

  /** Returns the doc pointer of the doc to which the last call of
   *  {@link MultiLevelSkipListReader#skipTo(int)} has skipped. */
  public long getDocPointer() {
    return lastDocPointer;
  }

  public long getPosPointer() {
    return lastPosPointer;
  }

  public int getPosBufferUpto() {
    return lastPosBufferUpto;
  }

  public long getPayPointer() {
    return lastPayPointer;
  }

  public int getStartOffset() {
    return lastStartOffset;
  }

  public int getPayloadByteUpto() {
    return lastPayloadByteUpto;
  }

  public int getNextSkipDoc() {
    return skipDoc[0];
  }

  @Override
  protected void seekChild(int level) throws IOException {
    super.seekChild(level);
    if (DEBUG) {
      System.out.println("seekChild level=" + level);
    }
    docPointer[level] = lastDocPointer;
    if (posPointer != null) {
      posPointer[level] = lastPosPointer;
      posBufferUpto[level] = lastPosBufferUpto;
      if (startOffset != null) {
        startOffset[level] = lastStartOffset;
      }
      if (payloadByteUpto != null) {
        payloadByteUpto[level] = lastPayloadByteUpto;
      }
      if (payPointer != null) {
        payPointer[level] = lastPayPointer;
      }
    }
  }

  @Override
  protected void setLastSkipData(int level) {
    super.setLastSkipData(level);
    lastDocPointer = docPointer[level];
    if (DEBUG) {
      System.out.println("setLastSkipData level=" + level);
      System.out.println("  lastDocPointer=" + lastDocPointer);
    }
    if (posPointer != null) {
      lastPosPointer = posPointer[level];
      lastPosBufferUpto = posBufferUpto[level];
      if (DEBUG) {
        System.out.println("  lastPosPointer=" + lastPosPointer + " lastPosBufferUpto=" + lastPosBufferUpto);
      }
      if (payPointer != null) {
        lastPayPointer = payPointer[level];
      }
      if (startOffset != null) {
        lastStartOffset = startOffset[level];
      }
      if (payloadByteUpto != null) {
        lastPayloadByteUpto = payloadByteUpto[level];
      }
    }
  }

  @Override
  protected int readSkipData(int level, IndexInput skipStream) throws IOException {
    if (DEBUG) {
      System.out.println("readSkipData level=" + level);
    }
    int delta = skipStream.readVInt();
    if (DEBUG) {
      System.out.println("  delta=" + delta);
    }
    docPointer[level] += skipStream.readVInt();
    if (DEBUG) {
      System.out.println("  docFP=" + docPointer[level]);
    }

    if (posPointer != null) {
      posPointer[level] += skipStream.readVInt();
      if (DEBUG) {
        System.out.println("  posFP=" + posPointer[level]);
      }
      posBufferUpto[level] = skipStream.readVInt();
      if (DEBUG) {
        System.out.println("  posBufferUpto=" + posBufferUpto[level]);
      }

      if (payloadByteUpto != null) {
        payloadByteUpto[level] = skipStream.readVInt();
      }

      if (startOffset != null) {
        startOffset[level] += skipStream.readVInt();
      }

      if (payPointer != null) {
        payPointer[level] += skipStream.readVInt();
      }
    }
    return delta;
  }
}
@ -1,163 +0,0 @@
package org.apache.lucene.codecs.blockpacked;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.codecs.MultiLevelSkipListWriter;

/**
 * Writes skip lists with multiple levels, and supports skipping within a block of ints.
 *
 * Assume that docFreq = 28, skipInterval = blockSize = 12
 *
 *  | block#0 | | block#1 | |vInts|
 *  d d d d d d d d d d d d d d d d d d d d d d d d d d d d (posting list)
 *                        ^                       ^         (level 0 skip point)
 *
 * Note that skipWriter will ignore the first document in block#0, since
 * it is useless as a skip point.  Also, we'll never skip into the vInts
 * block; we only record skip data at its start point (if it exists).
 *
 * For each skip point, we will record:
 * 1. the docID just before the skip position, i.e. for position 12, record docID[11], etc.
 * 2. its related file pointers (position, payload),
 * 3. related numbers or uptos (position, payload).
 * 4. start offset.
 *
 */
final class BlockPackedSkipWriter extends MultiLevelSkipListWriter {
  private boolean DEBUG = BlockPackedPostingsReader.DEBUG;

  private int[] lastSkipDoc;
  private long[] lastSkipDocPointer;
  private long[] lastSkipPosPointer;
  private long[] lastSkipPayPointer;
  private int[] lastStartOffset;
  private int[] lastPayloadByteUpto;

  private final IndexOutput docOut;
  private final IndexOutput posOut;
  private final IndexOutput payOut;

  private int curDoc;
  private long curDocPointer;
  private long curPosPointer;
  private long curPayPointer;
  private int curPosBufferUpto;
  private int curStartOffset;
  private int curPayloadByteUpto;
  private boolean fieldHasPositions;
  private boolean fieldHasOffsets;
  private boolean fieldHasPayloads;

  public BlockPackedSkipWriter(int maxSkipLevels, int blockSize, int docCount, IndexOutput docOut, IndexOutput posOut, IndexOutput payOut) {
    super(blockSize, 8, maxSkipLevels, docCount);
    this.docOut = docOut;
    this.posOut = posOut;
    this.payOut = payOut;

    lastSkipDoc = new int[maxSkipLevels];
    lastSkipDocPointer = new long[maxSkipLevels];
    if (posOut != null) {
      lastSkipPosPointer = new long[maxSkipLevels];
      if (payOut != null) {
        lastSkipPayPointer = new long[maxSkipLevels];
      }
      lastStartOffset = new int[maxSkipLevels];
      lastPayloadByteUpto = new int[maxSkipLevels];
    }
  }

  public void setField(boolean fieldHasPositions, boolean fieldHasOffsets, boolean fieldHasPayloads) {
    this.fieldHasPositions = fieldHasPositions;
    this.fieldHasOffsets = fieldHasOffsets;
    this.fieldHasPayloads = fieldHasPayloads;
  }

  @Override
  public void resetSkip() {
    super.resetSkip();
    Arrays.fill(lastSkipDoc, 0);
    Arrays.fill(lastSkipDocPointer, docOut.getFilePointer());
    if (fieldHasPositions) {
      Arrays.fill(lastSkipPosPointer, posOut.getFilePointer());
      if (fieldHasOffsets) {
        Arrays.fill(lastStartOffset, 0);
      }
      if (fieldHasPayloads) {
        Arrays.fill(lastPayloadByteUpto, 0);
      }
      if (fieldHasOffsets || fieldHasPayloads) {
        Arrays.fill(lastSkipPayPointer, payOut.getFilePointer());
      }
    }
  }

  /**
   * Sets the values for the current skip data.
   */
  public void bufferSkip(int doc, int numDocs, long posFP, long payFP, int posBufferUpto, int startOffset, int payloadByteUpto) throws IOException {
    this.curDoc = doc;
    this.curDocPointer = docOut.getFilePointer();
    this.curPosPointer = posFP;
    this.curPayPointer = payFP;
    this.curPosBufferUpto = posBufferUpto;
    this.curPayloadByteUpto = payloadByteUpto;
    this.curStartOffset = startOffset;
    bufferSkip(numDocs);
  }

  @Override
  protected void writeSkipData(int level, IndexOutput skipBuffer) throws IOException {
    int delta = curDoc - lastSkipDoc[level];
    if (DEBUG) {
      System.out.println("writeSkipData level=" + level + " lastDoc=" + curDoc + " delta=" + delta + " curDocPointer=" + curDocPointer);
    }
    skipBuffer.writeVInt(delta);
    lastSkipDoc[level] = curDoc;

    skipBuffer.writeVInt((int) (curDocPointer - lastSkipDocPointer[level]));
    lastSkipDocPointer[level] = curDocPointer;

    if (fieldHasPositions) {
      if (DEBUG) {
        System.out.println("  curPosPointer=" + curPosPointer + " curPosBufferUpto=" + curPosBufferUpto);
      }
      skipBuffer.writeVInt((int) (curPosPointer - lastSkipPosPointer[level]));
      lastSkipPosPointer[level] = curPosPointer;
      skipBuffer.writeVInt(curPosBufferUpto);

      if (fieldHasPayloads) {
        skipBuffer.writeVInt(curPayloadByteUpto);
      }

      if (fieldHasOffsets) {
        skipBuffer.writeVInt(curStartOffset - lastStartOffset[level]);
        lastStartOffset[level] = curStartOffset;
      }

      if (fieldHasOffsets || fieldHasPayloads) {
        skipBuffer.writeVInt((int) (curPayPointer - lastSkipPayPointer[level]));
        lastSkipPayPointer[level] = curPayPointer;
      }
    }
  }
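
  // Note: the doc, file-pointer and start-offset entries written above are
  // deltas against the previous skip entry at the same level, which is why
  // BlockPackedSkipReader.readSkipData accumulates them with "+=".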
}
@ -1,240 +0,0 @@
package org.apache.lucene.codecs.blockpacked;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static org.apache.lucene.codecs.blockpacked.BlockPackedPostingsFormat.BLOCK_SIZE;

import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedInts.Decoder;
import org.apache.lucene.util.packed.PackedInts.FormatAndBits;

/**
 * Encodes all values of a block with a fixed bit width, which is
 * determined by the max value in the block.
 */
final class ForUtil {

  /**
   * Special number of bits per value used whenever all values to encode are equal.
   */
  private static final int ALL_VALUES_EQUAL = 0;
  private static final int PACKED_INTS_VERSION = 0; // nocommit: encode in the stream?

  /**
   * Upper limit of the number of bytes that might be required to store
   * <code>BLOCK_SIZE</code> encoded values.
   */
  static final int MAX_ENCODED_SIZE = BLOCK_SIZE * 4;

  /**
   * Upper limit of the number of values that might be decoded in a single call to
   * {@link #readBlock(IndexInput, byte[], int[])}. Although values after
   * <code>BLOCK_SIZE</code> are garbage, it is necessary to allocate value buffers
   * whose size is >= MAX_DATA_SIZE to avoid {@link ArrayIndexOutOfBoundsException}s.
   */
  static final int MAX_DATA_SIZE;
  static {
    int minDataSize = 0;
    for (PackedInts.Format format : PackedInts.Format.values()) {
      for (int bpv = 1; bpv <= 32; ++bpv) {
        if (!format.isSupported(bpv)) {
          continue;
        }
        final PackedInts.Decoder decoder = PackedInts.getDecoder(format, PACKED_INTS_VERSION, bpv);
        final int iterations = (int) Math.ceil((float) BLOCK_SIZE / decoder.valueCount());
        minDataSize = Math.max(minDataSize, iterations * decoder.valueCount());
      }
    }
    MAX_DATA_SIZE = minDataSize;
  }

  /**
   * Compute the number of iterations required to decode <code>BLOCK_SIZE</code>
   * values with the provided {@link Decoder}.
   */
  private static int computeIterations(PackedInts.Decoder decoder) {
    return (int) Math.ceil((float) BLOCK_SIZE / decoder.valueCount());
  }

  /**
   * Compute the number of bytes required to encode a block of values that require
   * <code>bitsPerValue</code> bits per value with format <code>format</code>.
   */
  private static int encodedSize(PackedInts.Format format, int bitsPerValue) {
    return format.nblocks(bitsPerValue, BLOCK_SIZE) << 3;
  }
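
  // nblocks() returns a count of 64-bit blocks needed by the packed format,
  // so the << 3 above converts that block count into a size in bytes.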

  private final int[] encodedSizes;
  private final PackedInts.Encoder[] encoders;
  private final PackedInts.Decoder[] decoders;
  private final int[] iterations;

  /**
   * Create a new {@link ForUtil} instance and save state into <code>out</code>.
   */
  ForUtil(float acceptableOverheadRatio, DataOutput out) throws IOException {
    encodedSizes = new int[33];
    encoders = new PackedInts.Encoder[33];
    decoders = new PackedInts.Decoder[33];
    iterations = new int[33];

    for (int bpv = 1; bpv <= 32; ++bpv) {
      final FormatAndBits formatAndBits = PackedInts.fastestFormatAndBits(
          BLOCK_SIZE, bpv, acceptableOverheadRatio);
      assert formatAndBits.format.isSupported(formatAndBits.bitsPerValue);
      assert formatAndBits.bitsPerValue <= 32;
      encodedSizes[bpv] = encodedSize(formatAndBits.format, formatAndBits.bitsPerValue);
      encoders[bpv] = PackedInts.getEncoder(
          formatAndBits.format, PACKED_INTS_VERSION, formatAndBits.bitsPerValue);
      decoders[bpv] = PackedInts.getDecoder(
          formatAndBits.format, PACKED_INTS_VERSION, formatAndBits.bitsPerValue);
      iterations[bpv] = computeIterations(decoders[bpv]);

      out.writeVInt(formatAndBits.format.getId() << 5 | (formatAndBits.bitsPerValue - 1));
    }
  }

  /**
   * Restore a {@link ForUtil} from a {@link DataInput}.
   */
  ForUtil(DataInput in) throws IOException {
    encodedSizes = new int[33];
    encoders = new PackedInts.Encoder[33];
    decoders = new PackedInts.Decoder[33];
    iterations = new int[33];

    for (int bpv = 1; bpv <= 32; ++bpv) {
      final int code = in.readVInt();
      final int formatId = code >>> 5;
      final int bitsPerValue = (code & 31) + 1;

      final PackedInts.Format format = PackedInts.Format.byId(formatId);
      assert format.isSupported(bitsPerValue);
      encodedSizes[bpv] = encodedSize(format, bitsPerValue);
      encoders[bpv] = PackedInts.getEncoder(
          format, PACKED_INTS_VERSION, bitsPerValue);
      decoders[bpv] = PackedInts.getDecoder(
          format, PACKED_INTS_VERSION, bitsPerValue);
      iterations[bpv] = computeIterations(decoders[bpv]);
    }
  }

  /**
   * Write a block of data (<code>For</code> format).
   *
   * @param data     the data to write
   * @param encoded  a buffer to use to encode data
   * @param out      the destination output
   * @throws IOException
   */
  void writeBlock(int[] data, byte[] encoded, IndexOutput out) throws IOException {
    if (isAllEqual(data)) {
      out.writeVInt(ALL_VALUES_EQUAL);
      out.writeInt(data[0]);
      return;
    }

    final int numBits = bitsRequired(data);
    assert numBits > 0 && numBits <= 32 : numBits;
    final PackedInts.Encoder encoder = encoders[numBits];
    final int iters = iterations[numBits];
    assert iters * encoder.valueCount() >= BLOCK_SIZE;
    final int encodedSize = encodedSizes[numBits];
    assert (iters * encoder.blockCount()) << 3 >= encodedSize;

    out.writeVInt(numBits);

    encoder.encode(data, 0, encoded, 0, iters);
    out.writeBytes(encoded, encodedSize);
  }
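
  // Illustrative usage, mirroring the methods above: a block written with
  // writeBlock(values, encoded, out) is restored by calling
  // readBlock(in, encoded, decoded) at the same file position; only the first
  // BLOCK_SIZE entries of 'decoded' are meaningful afterwards.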

  /**
   * Read the next block of data (<code>For</code> format).
   *
   * @param in       the input to use to read data
   * @param encoded  a buffer that can be used to store encoded data
   * @param decoded  where to write decoded data
   * @throws IOException
   */
  void readBlock(IndexInput in, byte[] encoded, int[] decoded) throws IOException {
    final int numBits = in.readVInt();
    assert numBits <= 32 : numBits;

    if (numBits == ALL_VALUES_EQUAL) {
      final int value = in.readInt();
      Arrays.fill(decoded, 0, BLOCK_SIZE, value);
      return;
    }

    final int encodedSize = encodedSizes[numBits];
    in.readBytes(encoded, 0, encodedSize);

    final PackedInts.Decoder decoder = decoders[numBits];
    final int iters = iterations[numBits];
    assert iters * decoder.valueCount() >= BLOCK_SIZE;

    decoder.decode(encoded, 0, decoded, 0, iters);
  }

  /**
   * Skip the next block of data.
   *
   * @param in the input where to read data
   * @throws IOException
   */
  void skipBlock(IndexInput in) throws IOException {
    final int numBits = in.readVInt();
    if (numBits == ALL_VALUES_EQUAL) {
      in.seek(in.getFilePointer() + 4);
      return;
    }
    assert numBits > 0 && numBits <= 32 : numBits;
    final int encodedSize = encodedSizes[numBits];
    in.seek(in.getFilePointer() + encodedSize);
  }
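
  // The 4-byte seek above skips the single int written by writeBlock for the
  // all-values-equal case; otherwise the block occupies exactly encodedSize bytes.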

  // nocommit: we must have a util function for this, hmm?
  private static boolean isAllEqual(final int[] data) {
    final long v = data[0];
    for (int i = 1; i < BLOCK_SIZE; ++i) {
      if (data[i] != v) {
        return false;
      }
    }
    return true;
  }

  /**
   * Compute the number of bits required to serialize any of the values in
   * <code>data</code>.
   */
  private static int bitsRequired(final int[] data) {
    long or = 0;
    for (int i = 0; i < BLOCK_SIZE; ++i) {
      or |= data[i];
    }
    return PackedInts.bitsRequired(or);
  }

}
@ -21,4 +21,3 @@ org.apache.lucene.codecs.bulkvint.BulkVIntPostingsFormat
org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat
org.apache.lucene.codecs.memory.DirectPostingsFormat
org.apache.lucene.codecs.block.BlockPostingsFormat
org.apache.lucene.codecs.blockpacked.BlockPackedPostingsFormat
@ -1,4 +1,4 @@
package org.apache.lucene.codecs.blockpacked;
package org.apache.lucene.codecs.block;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
@ -17,9 +17,9 @@ package org.apache.lucene.codecs.blockpacked;
 * limitations under the License.
 */

import static org.apache.lucene.codecs.blockpacked.BlockPackedPostingsFormat.BLOCK_SIZE;
import static org.apache.lucene.codecs.blockpacked.ForUtil.MAX_DATA_SIZE;
import static org.apache.lucene.codecs.blockpacked.ForUtil.MAX_ENCODED_SIZE;
import static org.apache.lucene.codecs.block.BlockPostingsFormat.BLOCK_SIZE;
import static org.apache.lucene.codecs.block.ForUtil.MAX_DATA_SIZE;
import static org.apache.lucene.codecs.block.ForUtil.MAX_ENCODED_SIZE;

import java.io.IOException;
import java.util.Arrays;