LUCENE-3892: replace Block with BlockPacked

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/pforcodec_3892@1371519 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2012-08-09 22:29:36 +00:00
parent 90918ad2d6
commit 4a5496ac97
16 changed files with 284 additions and 4968 deletions

View File

@@ -43,7 +43,8 @@ public final class BlockPostingsFormat extends PostingsFormat {
private final int maxTermBlockSize;
// nocommit is this right?:
// NOTE: must be factor of .... 32?
// NOTE: should be at least 64 because of PackedInts long-aligned encoding/decoding
// NOTE: must be factor of ... 64?
public final static int BLOCK_SIZE = 128;
public BlockPostingsFormat() {
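A hedged reading of the unresolved notes above: BLOCK_SIZE should be at least 64 and, on my reading of "factor of ... 64", a multiple of 64, so that PackedInts' long-aligned encode/decode never straddles a block; the chosen value of 128 satisfies both. A hypothetical sanity check, not part of the patch:

// Hypothetical check (my reading of the notes, not from the patch):
// BLOCK_SIZE = 128 satisfies 128 >= 64 and 128 % 64 == 0.
static {
  assert BLOCK_SIZE >= 64 && BLOCK_SIZE % 64 == 0;
}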

View File

@@ -17,9 +17,11 @@ package org.apache.lucene.codecs.block;
* limitations under the License.
*/
import static org.apache.lucene.codecs.block.BlockPostingsFormat.BLOCK_SIZE;
import static org.apache.lucene.codecs.block.ForUtil.MAX_DATA_SIZE;
import static org.apache.lucene.codecs.block.ForUtil.MAX_ENCODED_SIZE;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.IntBuffer;
import java.util.Arrays;
import org.apache.lucene.codecs.BlockTermState;
@@ -27,8 +29,8 @@ import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
@@ -43,8 +45,6 @@ import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import static org.apache.lucene.codecs.blockpacked.BlockPackedPostingsFormat.BLOCK_SIZE;
/**
* Concrete class that reads the docID (and optionally freq, pos, offset, payload) lists
* written in this postings format.
@@ -58,6 +58,8 @@ public final class BlockPostingsReader extends PostingsReaderBase {
private final IndexInput posIn;
private final IndexInput payIn;
private final ForUtil forUtil;
public static boolean DEBUG = false;
// nocommit
@@ -76,6 +78,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
BlockPostingsWriter.DOC_CODEC,
BlockPostingsWriter.VERSION_START,
BlockPostingsWriter.VERSION_START);
forUtil = new ForUtil(docIn);
if (fieldInfos.hasProx()) {
posIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, BlockPostingsFormat.POS_EXTENSION),
@@ -119,7 +122,11 @@ public final class BlockPostingsReader extends PostingsReaderBase {
}
}
static void readVIntBlock(IndexInput docIn, int[] docBuffer, int[] freqBuffer, int num, boolean indexHasFreq) throws IOException {
/**
* Read values that have been written using variable-length encoding instead of bit-packing.
*/
private static void readVIntBlock(IndexInput docIn, int[] docBuffer,
int[] freqBuffer, int num, boolean indexHasFreq) throws IOException {
if (indexHasFreq) {
for(int i=0;i<num;i++) {
final int code = docIn.readVInt();
@@ -137,17 +144,6 @@ public final class BlockPostingsReader extends PostingsReaderBase {
}
}
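The hunk elides most of the loop body, but the convention being decoded is visible on the writer side (finishTerm, further down): when freqs are indexed, each doc delta is shifted left by one and the low bit marks freq == 1. A self-contained sketch of that convention, assuming the elided body mirrors the writer exactly:

// Hedged sketch (assumed to mirror the writer's
// writeVInt((docDelta<<1)|1) / writeVInt(docDelta<<1) + writeVInt(freq)
// logic shown later in finishTerm); DataInput is the Lucene store type.
static void decodeVIntTail(DataInput in, int[] docDeltas, int[] freqs,
                           int num, boolean hasFreq) throws IOException {
  for (int i = 0; i < num; i++) {
    if (hasFreq) {
      final int code = in.readVInt();
      docDeltas[i] = code >>> 1;                      // delta stored shifted by one
      freqs[i] = (code & 1) != 0 ? 1 : in.readVInt(); // low bit set => freq == 1
    } else {
      docDeltas[i] = in.readVInt();                   // no freqs: plain vInt deltas
    }
  }
}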
static void readBlock(IndexInput in, byte[] encoded, IntBuffer encodedBuffer, int[] buffer) throws IOException {
int header = in.readVInt();
in.readBytes(encoded, 0, ForUtil.getEncodedSize(header));
ForUtil.decompress(encodedBuffer, buffer, header);
}
static void skipBlock(IndexInput in) throws IOException {
int header = in.readVInt();
in.seek(in.getFilePointer() + ForUtil.getEncodedSize(header));
}
// Must keep final because we do non-standard clone
private final static class IntBlockTermState extends BlockTermState {
long docStartFP;
@@ -323,10 +319,9 @@ public final class BlockPostingsReader extends PostingsReaderBase {
final class BlockDocsEnum extends DocsEnum {
private final byte[] encoded;
private final IntBuffer encodedBuffer;
private final int[] docDeltaBuffer = new int[BLOCK_SIZE];
private final int[] freqBuffer = new int[BLOCK_SIZE];
private final int[] docDeltaBuffer = new int[MAX_DATA_SIZE];
private final int[] freqBuffer = new int[MAX_DATA_SIZE];
private int docBufferUpto;
@@ -368,8 +363,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
indexHasPos = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
indexHasPayloads = fieldInfo.hasPayloads();
encoded = new byte[BLOCK_SIZE*4];
encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer();
encoded = new byte[MAX_ENCODED_SIZE];
}
public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) {
@@ -419,14 +413,16 @@ public final class BlockPostingsReader extends PostingsReaderBase {
if (DEBUG) {
System.out.println(" fill doc block from fp=" + docIn.getFilePointer());
}
readBlock(docIn, encoded, encodedBuffer, docDeltaBuffer);
forUtil.readBlock(docIn, encoded, docDeltaBuffer);
if (indexHasFreq) {
if (DEBUG) {
System.out.println(" fill freq block from fp=" + docIn.getFilePointer());
}
readBlock(docIn, encoded, encodedBuffer, freqBuffer);
forUtil.readBlock(docIn, encoded, freqBuffer);
}
} else {
// Read vInts:
if (DEBUG) {
System.out.println(" fill last vInt block from fp=" + docIn.getFilePointer());
}
@@ -444,6 +440,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
if (DEBUG) {
System.out.println(" docUpto=" + docUpto + " (of df=" + docFreq + ") docBufferUpto=" + docBufferUpto);
}
if (docUpto == docFreq) {
if (DEBUG) {
System.out.println(" return doc=END");
@@ -453,6 +450,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
if (docBufferUpto == BLOCK_SIZE) {
refillDocs();
}
if (DEBUG) {
System.out.println(" accum=" + accum + " docDeltaBuffer[" + docBufferUpto + "]=" + docDeltaBuffer[docBufferUpto]);
}
@@ -571,11 +569,10 @@ public final class BlockPostingsReader extends PostingsReaderBase {
final class BlockDocsAndPositionsEnum extends DocsAndPositionsEnum {
private final byte[] encoded;
private final IntBuffer encodedBuffer;
private final int[] docDeltaBuffer = new int[BLOCK_SIZE];
private final int[] freqBuffer = new int[BLOCK_SIZE];
private final int[] posDeltaBuffer = new int[BLOCK_SIZE];
private final int[] docDeltaBuffer = new int[MAX_DATA_SIZE];
private final int[] freqBuffer = new int[MAX_DATA_SIZE];
private final int[] posDeltaBuffer = new int[MAX_DATA_SIZE];
private int docBufferUpto;
private int posBufferUpto;
@@ -634,8 +631,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
this.startDocIn = BlockPostingsReader.this.docIn;
this.docIn = (IndexInput) startDocIn.clone();
this.posIn = (IndexInput) BlockPostingsReader.this.posIn.clone();
encoded = new byte[BLOCK_SIZE*4];
encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer();
encoded = new byte[MAX_ENCODED_SIZE];
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
indexHasPayloads = fieldInfo.hasPayloads();
}
@@ -694,12 +690,13 @@ public final class BlockPostingsReader extends PostingsReaderBase {
if (DEBUG) {
System.out.println(" fill doc block from fp=" + docIn.getFilePointer());
}
readBlock(docIn, encoded, encodedBuffer, docDeltaBuffer);
forUtil.readBlock(docIn, encoded, docDeltaBuffer);
if (DEBUG) {
System.out.println(" fill freq block from fp=" + docIn.getFilePointer());
}
readBlock(docIn, encoded, encodedBuffer, freqBuffer);
forUtil.readBlock(docIn, encoded, freqBuffer);
} else {
// Read vInts:
if (DEBUG) {
System.out.println(" fill last vInt doc block from fp=" + docIn.getFilePointer());
}
@@ -740,7 +737,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
if (DEBUG) {
System.out.println(" bulk pos block @ fp=" + posIn.getFilePointer());
}
readBlock(posIn, encoded, encodedBuffer, posDeltaBuffer);
forUtil.readBlock(posIn, encoded, posDeltaBuffer);
}
}
@@ -905,7 +902,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
System.out.println(" skip whole block @ fp=" + posIn.getFilePointer());
}
assert posIn.getFilePointer() != lastPosBlockFP;
skipBlock(posIn);
forUtil.skipBlock(posIn);
toSkip -= BLOCK_SIZE;
}
refillPositions();
@@ -976,11 +973,10 @@ public final class BlockPostingsReader extends PostingsReaderBase {
final class EverythingEnum extends DocsAndPositionsEnum {
private final byte[] encoded;
private final IntBuffer encodedBuffer;
private final int[] docDeltaBuffer = new int[BLOCK_SIZE];
private final int[] freqBuffer = new int[BLOCK_SIZE];
private final int[] posDeltaBuffer = new int[BLOCK_SIZE];
private final int[] docDeltaBuffer = new int[MAX_DATA_SIZE];
private final int[] freqBuffer = new int[MAX_DATA_SIZE];
private final int[] posDeltaBuffer = new int[MAX_DATA_SIZE];
private final int[] payloadLengthBuffer;
private final int[] offsetStartDeltaBuffer;
@@ -1058,12 +1054,11 @@ public final class BlockPostingsReader extends PostingsReaderBase {
this.docIn = (IndexInput) startDocIn.clone();
this.posIn = (IndexInput) BlockPostingsReader.this.posIn.clone();
this.payIn = (IndexInput) BlockPostingsReader.this.payIn.clone();
encoded = new byte[BLOCK_SIZE*4];
encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer();
encoded = new byte[MAX_ENCODED_SIZE];
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
if (indexHasOffsets) {
offsetStartDeltaBuffer = new int[BLOCK_SIZE];
offsetLengthBuffer = new int[BLOCK_SIZE];
offsetStartDeltaBuffer = new int[MAX_DATA_SIZE];
offsetLengthBuffer = new int[MAX_DATA_SIZE];
} else {
offsetStartDeltaBuffer = null;
offsetLengthBuffer = null;
@@ -1073,7 +1068,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
indexHasPayloads = fieldInfo.hasPayloads();
if (indexHasPayloads) {
payloadLengthBuffer = new int[BLOCK_SIZE];
payloadLengthBuffer = new int[MAX_DATA_SIZE];
payloadBytes = new byte[128];
payload = new BytesRef();
} else {
@@ -1138,11 +1133,11 @@ public final class BlockPostingsReader extends PostingsReaderBase {
if (DEBUG) {
System.out.println(" fill doc block from fp=" + docIn.getFilePointer());
}
readBlock(docIn, encoded, encodedBuffer, docDeltaBuffer);
forUtil.readBlock(docIn, encoded, docDeltaBuffer);
if (DEBUG) {
System.out.println(" fill freq block from fp=" + docIn.getFilePointer());
}
readBlock(docIn, encoded, encodedBuffer, freqBuffer);
forUtil.readBlock(docIn, encoded, freqBuffer);
} else {
if (DEBUG) {
System.out.println(" fill last vInt doc block from fp=" + docIn.getFilePointer());
@@ -1202,13 +1197,13 @@ public final class BlockPostingsReader extends PostingsReaderBase {
if (DEBUG) {
System.out.println(" bulk pos block @ fp=" + posIn.getFilePointer());
}
readBlock(posIn, encoded, encodedBuffer, posDeltaBuffer);
forUtil.readBlock(posIn, encoded, posDeltaBuffer);
if (indexHasPayloads) {
if (DEBUG) {
System.out.println(" bulk payload block @ pay.fp=" + payIn.getFilePointer());
}
readBlock(payIn, encoded, encodedBuffer, payloadLengthBuffer);
forUtil.readBlock(payIn, encoded, payloadLengthBuffer);
int numBytes = payIn.readVInt();
if (DEBUG) {
System.out.println(" " + numBytes + " payload bytes @ pay.fp=" + payIn.getFilePointer());
@@ -1224,8 +1219,8 @@ public final class BlockPostingsReader extends PostingsReaderBase {
if (DEBUG) {
System.out.println(" bulk offset block @ pay.fp=" + payIn.getFilePointer());
}
readBlock(payIn, encoded, encodedBuffer, offsetStartDeltaBuffer);
readBlock(payIn, encoded, encodedBuffer, offsetLengthBuffer);
forUtil.readBlock(payIn, encoded, offsetStartDeltaBuffer);
forUtil.readBlock(payIn, encoded, offsetLengthBuffer);
}
}
}
@@ -1268,6 +1263,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
lastStartOffset = 0;
return doc;
}
if (DEBUG) {
System.out.println(" doc=" + accum + " is deleted; try next doc");
}
@@ -1412,11 +1408,11 @@ public final class BlockPostingsReader extends PostingsReaderBase {
System.out.println(" skip whole block @ fp=" + posIn.getFilePointer());
}
assert posIn.getFilePointer() != lastPosBlockFP;
skipBlock(posIn);
forUtil.skipBlock(posIn);
if (indexHasPayloads) {
// Skip payloadLength block:
skipBlock(payIn);
forUtil.skipBlock(payIn);
// Skip payloadBytes block:
int numBytes = payIn.readVInt();
@@ -1426,8 +1422,8 @@ public final class BlockPostingsReader extends PostingsReaderBase {
if (indexHasOffsets) {
// Must load offset blocks merely to sum
// up into lastStartOffset:
readBlock(payIn, encoded, encodedBuffer, offsetStartDeltaBuffer);
readBlock(payIn, encoded, encodedBuffer, offsetLengthBuffer);
forUtil.readBlock(payIn, encoded, offsetStartDeltaBuffer);
forUtil.readBlock(payIn, encoded, offsetLengthBuffer);
for(int i=0;i<BLOCK_SIZE;i++) {
lastStartOffset += offsetStartDeltaBuffer[i] + offsetLengthBuffer[i];
}

View File

@@ -17,9 +17,12 @@ package org.apache.lucene.codecs.block;
* limitations under the License.
*/
import static org.apache.lucene.codecs.block.BlockPostingsFormat.BLOCK_SIZE;
import static org.apache.lucene.codecs.block.BlockPostingsReader.DEBUG;
import static org.apache.lucene.codecs.block.ForUtil.MAX_DATA_SIZE;
import static org.apache.lucene.codecs.block.ForUtil.MAX_ENCODED_SIZE;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.IntBuffer;
import java.util.ArrayList;
import java.util.List;
@@ -27,8 +30,8 @@ import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
@@ -36,8 +39,8 @@ import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;
import static org.apache.lucene.codecs.blockpacked.BlockPackedPostingsFormat.BLOCK_SIZE;
/**
* Concrete class that writes the docID (and optionally freq, pos, offset, payload) lists
@@ -50,8 +53,6 @@ import static org.apache.lucene.codecs.blockpacked.BlockPackedPostingsFormat.BLOCK_SIZE;
*/
public final class BlockPostingsWriter extends PostingsWriterBase {
private boolean DEBUG = BlockPostingsReader.DEBUG;
// nocommit move these constants to the PF:
static final int maxSkipLevels = 10;
@@ -108,11 +109,11 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
private int docCount;
final byte[] encoded;
final IntBuffer encodedBuffer;
private final ForUtil forUtil;
private final BlockSkipWriter skipWriter;
public BlockPostingsWriter(SegmentWriteState state) throws IOException {
public BlockPostingsWriter(SegmentWriteState state, float acceptableOverheadRatio) throws IOException {
super();
docOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPostingsFormat.DOC_EXTENSION),
@@ -122,23 +123,24 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
boolean success = false;
try {
CodecUtil.writeHeader(docOut, DOC_CODEC, VERSION_CURRENT);
forUtil = new ForUtil(acceptableOverheadRatio, docOut);
if (state.fieldInfos.hasProx()) {
posDeltaBuffer = new int[BLOCK_SIZE];
posDeltaBuffer = new int[MAX_DATA_SIZE];
posOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPostingsFormat.POS_EXTENSION),
state.context);
CodecUtil.writeHeader(posOut, POS_CODEC, VERSION_CURRENT);
if (state.fieldInfos.hasPayloads()) {
payloadBytes = new byte[128];
payloadLengthBuffer = new int[BLOCK_SIZE];
payloadLengthBuffer = new int[MAX_DATA_SIZE];
} else {
payloadBytes = null;
payloadLengthBuffer = null;
}
if (state.fieldInfos.hasOffsets()) {
offsetStartDeltaBuffer = new int[BLOCK_SIZE];
offsetLengthBuffer = new int[BLOCK_SIZE];
offsetStartDeltaBuffer = new int[MAX_DATA_SIZE];
offsetLengthBuffer = new int[MAX_DATA_SIZE];
} else {
offsetStartDeltaBuffer = null;
offsetLengthBuffer = null;
@@ -165,19 +167,22 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
}
}
docDeltaBuffer = new int[BLOCK_SIZE];
freqBuffer = new int[BLOCK_SIZE];
docDeltaBuffer = new int[MAX_DATA_SIZE];
freqBuffer = new int[MAX_DATA_SIZE];
// nocommit should we try skipping every 2/4 blocks...?
skipWriter = new BlockSkipWriter(maxSkipLevels,
BLOCK_SIZE,
state.segmentInfo.getDocCount(),
docOut,
posOut,
payOut);
encoded = new byte[BLOCK_SIZE*4];
encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer();
encoded = new byte[MAX_ENCODED_SIZE];
}
public BlockPostingsWriter(SegmentWriteState state) throws IOException {
this(state, PackedInts.DEFAULT);
}
@Override
@@ -214,17 +219,12 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
skipWriter.resetSkip();
}
private void writeBlock(int[] buffer, IndexOutput out) throws IOException {
final int header = ForUtil.compress(buffer, encodedBuffer);
out.writeVInt(header);
out.writeBytes(encoded, ForUtil.getEncodedSize(header));
}
@Override
public void startDoc(int docID, int termDocFreq) throws IOException {
if (DEBUG) {
System.out.println("FPW.startDoc docID["+docBufferUpto+"]=" + docID);
}
final int docDelta = docID - lastDocID;
if (docID < 0 || (docCount > 0 && docDelta <= 0)) {
@@ -245,17 +245,18 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
if (DEBUG) {
System.out.println(" write docDelta block @ fp=" + docOut.getFilePointer());
}
writeBlock(docDeltaBuffer, docOut);
forUtil.writeBlock(docDeltaBuffer, encoded, docOut);
if (fieldHasFreqs) {
if (DEBUG) {
System.out.println(" write freq block @ fp=" + docOut.getFilePointer());
}
writeBlock(freqBuffer, docOut);
forUtil.writeBlock(freqBuffer, encoded, docOut);
}
// NOTE: don't set docBufferUpto back to 0 here;
// finishDoc will do so (because it needs to see that
// the block was filled so it can save skip data)
}
lastDocID = docID;
lastPosition = 0;
lastStartOffset = 0;
@@ -296,17 +297,17 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
if (DEBUG) {
System.out.println(" write pos bulk block @ fp=" + posOut.getFilePointer());
}
writeBlock(posDeltaBuffer, posOut);
forUtil.writeBlock(posDeltaBuffer, encoded, posOut);
if (fieldHasPayloads) {
writeBlock(payloadLengthBuffer, payOut);
forUtil.writeBlock(payloadLengthBuffer, encoded, payOut);
payOut.writeVInt(payloadByteUpto);
payOut.writeBytes(payloadBytes, 0, payloadByteUpto);
payloadByteUpto = 0;
}
if (fieldHasOffsets) {
writeBlock(offsetStartDeltaBuffer, payOut);
writeBlock(offsetLengthBuffer, payOut);
forUtil.writeBlock(offsetStartDeltaBuffer, encoded, payOut);
forUtil.writeBlock(offsetLengthBuffer, encoded, payOut);
}
posBufferUpto = 0;
}
@@ -475,7 +476,7 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
int skipOffset;
if (docCount > BLOCK_SIZE) {
skipOffset = (int) (skipWriter.writeSkip(docOut)-docTermStartFP);
skipOffset = (int) (skipWriter.writeSkip(docOut) - docTermStartFP);
if (DEBUG) {
System.out.println("skip packet " + (docOut.getFilePointer() - (docTermStartFP + skipOffset)) + " bytes");

View File

@@ -52,7 +52,7 @@ import org.apache.lucene.store.IndexInput;
*/
final class BlockSkipReader extends MultiLevelSkipListReader {
private boolean DEBUG = BlockPostingsReader.DEBUG;
private int blockSize;
private final int blockSize;
private long docPointer[];
private long posPointer[];
@@ -95,7 +95,6 @@ final class BlockSkipReader extends MultiLevelSkipListReader {
}
}
/**
* Trim the original docFreq so the skip reader reads the proper number of skip points.
*

View File

@@ -37,7 +37,7 @@ import org.apache.lucene.codecs.MultiLevelSkipListWriter;
* block, only record skip data at its start point (if it exists).
*
* For each skip point, we will record:
* 1. docID in former position, i.e. for position 12, record docID[11], etc.
* 2. the related file pointers (position, payload),
* 3. the related counts and uptos (position, payload),
* 4. the start offset (see the sketch below).
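A hypothetical value class illustrating the four items above (the field names are mine; in the patch these travel as loose arguments to BlockSkipWriter.bufferSkip, visible in the writer sources below):

// Illustrative only -- the patch does not define such a class.
final class SkipPoint {
  int lastDocID;             // 1. docID in the former position
  long docFP, posFP, payFP;  // 2. related file pointers
  int posBufferUpto;         // 3. upto within the current position block
  int payloadByteUpto;       // 3. upto within the payload bytes
  int lastStartOffset;       // 4. start offset
}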

View File

@@ -16,149 +16,208 @@ package org.apache.lucene.codecs.block;
* limitations under the License.
*/
import java.nio.IntBuffer;
import static org.apache.lucene.codecs.block.BlockPostingsFormat.BLOCK_SIZE;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedInts.Decoder;
import org.apache.lucene.util.packed.PackedInts.FormatAndBits;
/**
* Encode all values in the normal area with a fixed bit width,
* which is determined by the max value in this block.
*/
public final class ForUtil {
protected static final int[] MASK = { 0x00000000,
0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f,
0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff, 0x0003ffff,
0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff,
0x7fffffff, 0xffffffff};
/** Compress given int[] into Integer buffer, with For format
*
* @param data uncompressed data
* @param intBuffer integer buffer to hold compressed data
* @return the header for the current block
*/
static int compress(final int[] data, IntBuffer intBuffer) {
int numBits = getNumBits(data);
if (numBits == 0) {
return compressDuplicateBlock(data, intBuffer);
}
for (int i=0; i<BLOCK_SIZE; ++i) {
assert data[i] >= 0;
encodeNormalValue(intBuffer, i, data[i], numBits);
}
return numBits;
}
final class ForUtil {
/**
* Save only one int when the whole block is equal to a
* single value.
* Special number of bits per value used whenever all values to encode are equal.
*/
static int compressDuplicateBlock(final int[] data, IntBuffer intBuffer) {
intBuffer.put(0, data[0]);
return 0;
}
/** Decompress given Integer buffer into int array.
*
* @param intBuffer integer buffer to hold compressed data
* @param data int array to hold uncompressed data
* @param header header of current block, which contains numFrameBits
*/
static void decompress(IntBuffer intBuffer, int[] data, int header) {
// since this buffer is reused at upper level, rewind first
intBuffer.rewind();
// NOTE: header == numBits now, but we may change that
final int numBits = header;
assert numBits >=0 && numBits < 32;
decompressCore(intBuffer, data, numBits);
}
public static void decompressCore(IntBuffer intBuffer, int[] data, int numBits) {
switch(numBits) {
case 0: PackedIntsDecompress.decode0(intBuffer, data); break;
case 1: PackedIntsDecompress.decode1(intBuffer, data); break;
case 2: PackedIntsDecompress.decode2(intBuffer, data); break;
case 3: PackedIntsDecompress.decode3(intBuffer, data); break;
case 4: PackedIntsDecompress.decode4(intBuffer, data); break;
case 5: PackedIntsDecompress.decode5(intBuffer, data); break;
case 6: PackedIntsDecompress.decode6(intBuffer, data); break;
case 7: PackedIntsDecompress.decode7(intBuffer, data); break;
case 8: PackedIntsDecompress.decode8(intBuffer, data); break;
case 9: PackedIntsDecompress.decode9(intBuffer, data); break;
case 10: PackedIntsDecompress.decode10(intBuffer, data); break;
case 11: PackedIntsDecompress.decode11(intBuffer, data); break;
case 12: PackedIntsDecompress.decode12(intBuffer, data); break;
case 13: PackedIntsDecompress.decode13(intBuffer, data); break;
case 14: PackedIntsDecompress.decode14(intBuffer, data); break;
case 15: PackedIntsDecompress.decode15(intBuffer, data); break;
case 16: PackedIntsDecompress.decode16(intBuffer, data); break;
case 17: PackedIntsDecompress.decode17(intBuffer, data); break;
case 18: PackedIntsDecompress.decode18(intBuffer, data); break;
case 19: PackedIntsDecompress.decode19(intBuffer, data); break;
case 20: PackedIntsDecompress.decode20(intBuffer, data); break;
case 21: PackedIntsDecompress.decode21(intBuffer, data); break;
case 22: PackedIntsDecompress.decode22(intBuffer, data); break;
case 23: PackedIntsDecompress.decode23(intBuffer, data); break;
case 24: PackedIntsDecompress.decode24(intBuffer, data); break;
case 25: PackedIntsDecompress.decode25(intBuffer, data); break;
case 26: PackedIntsDecompress.decode26(intBuffer, data); break;
case 27: PackedIntsDecompress.decode27(intBuffer, data); break;
case 28: PackedIntsDecompress.decode28(intBuffer, data); break;
case 29: PackedIntsDecompress.decode29(intBuffer, data); break;
case 30: PackedIntsDecompress.decode30(intBuffer, data); break;
case 31: PackedIntsDecompress.decode31(intBuffer, data); break;
// nocommit have default throw exc? or add assert up above
}
}
static void encodeNormalValue(IntBuffer intBuffer, int pos, int value, int numBits) {
final int globalBitPos = numBits*pos; // position in bit stream
final int localBitPos = globalBitPos & 31; // position inside an int
int intPos = globalBitPos/32; // which integer to locate
setBufferIntBits(intBuffer, intPos, localBitPos, numBits, value);
if ((localBitPos + numBits) > 32) { // value does not fit in this int, fill tail
setBufferIntBits(intBuffer, intPos+1, 0,
(localBitPos+numBits-32),
(value >>> (32-localBitPos)));
}
}
static void setBufferIntBits(IntBuffer intBuffer, int intPos, int firstBitPos, int numBits, int value) {
assert (value & ~MASK[numBits]) == 0;
// safely discards those msb parts when firstBitPos+numBits>32
intBuffer.put(intPos,
(intBuffer.get(intPos) & ~(MASK[numBits] << firstBitPos))
| (value << firstBitPos));
}
private static final int ALL_VALUES_EQUAL = 0;
private static final int PACKED_INTS_VERSION = 0; // nocommit: encode in the stream?
/**
* Returns number of bits necessary to represent max value.
* Upper limit of the number of bytes that might be required to store
* <code>BLOCK_SIZE</code> encoded values.
*/
static int getNumBits(final int[] data) {
if (isAllEqual(data)) {
return 0;
}
int size=data.length;
int optBits=1;
for (int i=0; i<size; ++i) {
while ((data[i] & ~MASK[optBits]) != 0) {
optBits++;
static final int MAX_ENCODED_SIZE = BLOCK_SIZE * 4;
/**
* Upper limit of the number of values that might be decoded in a single call to
* {@link #readBlock(IndexInput, byte[], int[])}. Although values after
* <code>BLOCK_SIZE</code> are garbage, it is necessary to allocate value buffers
* whose size is >= MAX_DATA_SIZE to avoid {@link ArrayIndexOutOfBoundsException}s.
*/
static final int MAX_DATA_SIZE;
static {
int minDataSize = 0;
for (PackedInts.Format format : PackedInts.Format.values()) {
for (int bpv = 1; bpv <= 32; ++bpv) {
if (!format.isSupported(bpv)) {
continue;
}
final PackedInts.Decoder decoder = PackedInts.getDecoder(format, PACKED_INTS_VERSION, bpv);
final int iterations = (int) Math.ceil((float) BLOCK_SIZE / decoder.valueCount());
minDataSize = Math.max(minDataSize, iterations * decoder.valueCount());
}
}
assert optBits < 32;
return optBits;
MAX_DATA_SIZE = minDataSize;
}
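To make the bound concrete, a worked example (the valueCount of 48 is hypothetical, chosen only to show why MAX_DATA_SIZE can exceed BLOCK_SIZE):

// With BLOCK_SIZE = 128 and a decoder whose valueCount() is 48:
int valueCount = 48;                                          // hypothetical
int iters = (int) Math.ceil((float) BLOCK_SIZE / valueCount); // ceil(128/48) = 3
int produced = iters * valueCount;                            // 3 * 48 = 144 > 128
// A full decode writes 144 ints, so value buffers must be sized with
// MAX_DATA_SIZE even though only the first BLOCK_SIZE values are used.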
/**
* Compute the number of iterations required to decode <code>BLOCK_SIZE</code>
* values with the provided {@link Decoder}.
*/
private static int computeIterations(PackedInts.Decoder decoder) {
return (int) Math.ceil((float) BLOCK_SIZE / decoder.valueCount());
}
/**
* Compute the number of bytes required to encode a block of values that require
* <code>bitsPerValue</code> bits per value with format <code>format</code>.
*/
private static int encodedSize(PackedInts.Format format, int bitsPerValue) {
return format.nblocks(bitsPerValue, BLOCK_SIZE) << 3;
}
private final int[] encodedSizes;
private final PackedInts.Encoder[] encoders;
private final PackedInts.Decoder[] decoders;
private final int[] iterations;
/**
* Create a new {@link ForUtil} instance and save state into <code>out</code>.
*/
ForUtil(float acceptableOverheadRatio, DataOutput out) throws IOException {
encodedSizes = new int[33];
encoders = new PackedInts.Encoder[33];
decoders = new PackedInts.Decoder[33];
iterations = new int[33];
for (int bpv = 1; bpv <= 32; ++bpv) {
final FormatAndBits formatAndBits = PackedInts.fastestFormatAndBits(
BLOCK_SIZE, bpv, acceptableOverheadRatio);
assert formatAndBits.format.isSupported(formatAndBits.bitsPerValue);
assert formatAndBits.bitsPerValue <= 32;
encodedSizes[bpv] = encodedSize(formatAndBits.format, formatAndBits.bitsPerValue);
encoders[bpv] = PackedInts.getEncoder(
formatAndBits.format, PACKED_INTS_VERSION, formatAndBits.bitsPerValue);
decoders[bpv] = PackedInts.getDecoder(
formatAndBits.format, PACKED_INTS_VERSION, formatAndBits.bitsPerValue);
iterations[bpv] = computeIterations(decoders[bpv]);
out.writeVInt(formatAndBits.format.getId() << 5 | (formatAndBits.bitsPerValue - 1));
}
}
/**
* Restore a {@link ForUtil} from a {@link DataInput}.
*/
ForUtil(DataInput in) throws IOException {
encodedSizes = new int[33];
encoders = new PackedInts.Encoder[33];
decoders = new PackedInts.Decoder[33];
iterations = new int[33];
for (int bpv = 1; bpv <= 32; ++bpv) {
final int code = in.readVInt();
final int formatId = code >>> 5;
final int bitsPerValue = (code & 31) + 1;
final PackedInts.Format format = PackedInts.Format.byId(formatId);
assert format.isSupported(bitsPerValue);
encodedSizes[bpv] = encodedSize(format, bitsPerValue);
encoders[bpv] = PackedInts.getEncoder(
format, PACKED_INTS_VERSION, bitsPerValue);
decoders[bpv] = PackedInts.getDecoder(
format, PACKED_INTS_VERSION, bitsPerValue);
iterations[bpv] = computeIterations(decoders[bpv]);
}
}
/**
* Write a block of data (<code>For</code> format).
*
* @param data the data to write
* @param encoded a buffer to use to encode data
* @param out the destination output
* @throws IOException
*/
void writeBlock(int[] data, byte[] encoded, IndexOutput out) throws IOException {
if (isAllEqual(data)) {
out.writeVInt(ALL_VALUES_EQUAL);
out.writeInt(data[0]);
return;
}
final int numBits = bitsRequired(data);
assert numBits > 0 && numBits <= 32 : numBits;
final PackedInts.Encoder encoder = encoders[numBits];
final int iters = iterations[numBits];
assert iters * encoder.valueCount() >= BLOCK_SIZE;
final int encodedSize = encodedSizes[numBits];
assert (iters * encoder.blockCount()) << 3 >= encodedSize;
out.writeVInt(numBits);
encoder.encode(data, 0, encoded, 0, iters);
out.writeBytes(encoded, encodedSize);
}
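For intuition on the two paths above, a rough size comparison (assuming the PACKED format with no padding; the real size per bit width is whatever encodedSizes[bpv] computed):

// All-equal block: vInt header (0) + one 4-byte int = 5 bytes total.
// Generic block at 7 bits per value:
int bpv = 7;
int payloadBytes = BLOCK_SIZE * bpv / 8; // 128 * 7 / 8 = 112 bytes, + 1 header byte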
/**
* Read the next block of data (<code>For</code> format).
*
* @param in the input to use to read data
* @param encoded a buffer that can be used to store encoded data
* @param decoded where to write decoded data
* @throws IOException
*/
void readBlock(IndexInput in, byte[] encoded, int[] decoded) throws IOException {
final int numBits = in.readVInt();
assert numBits <= 32 : numBits;
if (numBits == ALL_VALUES_EQUAL) {
final int value = in.readInt();
Arrays.fill(decoded, 0, BLOCK_SIZE, value);
return;
}
final int encodedSize = encodedSizes[numBits];
in.readBytes(encoded, 0, encodedSize);
final PackedInts.Decoder decoder = decoders[numBits];
final int iters = iterations[numBits];
assert iters * decoder.valueCount() >= BLOCK_SIZE;
decoder.decode(encoded, 0, decoded, 0, iters);
}
/**
* Skip the next block of data.
*
* @param in the input where to read data
* @throws IOException
*/
void skipBlock(IndexInput in) throws IOException {
final int numBits = in.readVInt();
if (numBits == ALL_VALUES_EQUAL) {
in.seek(in.getFilePointer() + 4);
return;
}
assert numBits > 0 && numBits <= 32 : numBits;
final int encodedSize = encodedSizes[numBits];
in.seek(in.getFilePointer() + encodedSize);
}
// nocommit: we must have a util function for this, hmm?
protected static boolean isAllEqual(final int[] data) {
int len = data.length;
int v = data[0];
for (int i=1; i<len; i++) {
private static boolean isAllEqual(final int[] data) {
final long v = data[0];
for (int i = 1; i < BLOCK_SIZE; ++i) {
if (data[i] != v) {
return false;
}
@@ -166,11 +225,16 @@ public final class ForUtil {
return true;
}
/**
* Expert: get compressed block size(in byte)
/**
* Compute the number of bits required to serialize any of the longs in
* <code>data</code>.
*/
static int getEncodedSize(int numBits) {
// NOTE: works only because BLOCK_SIZE is 0 mod 8:
return numBits == 0 ? 4 : numBits*BLOCK_SIZE/8;
private static int bitsRequired(final int[] data) {
long or = 0;
for (int i = 0; i < BLOCK_SIZE; ++i) {
or |= data[i];
}
return PackedInts.bitsRequired(or);
}
}
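A minimal round-trip sketch of the new ForUtil API (my usage example, not from the patch), assuming a RAMDirectory and the Lucene 4.x store classes used elsewhere in this branch:

// Assumed imports: org.apache.lucene.store.*, org.apache.lucene.util.packed.PackedInts.
// Encode one block, then restore the per-bpv configuration and decode it back.
Directory dir = new RAMDirectory();
IndexOutput out = dir.createOutput("block.bin", IOContext.DEFAULT);
ForUtil forUtil = new ForUtil(PackedInts.DEFAULT, out);  // writes per-bpv formats
int[] values = new int[ForUtil.MAX_DATA_SIZE];
byte[] encoded = new byte[ForUtil.MAX_ENCODED_SIZE];
for (int i = 0; i < BlockPostingsFormat.BLOCK_SIZE; ++i) {
  values[i] = i % 17;                                    // arbitrary small values
}
forUtil.writeBlock(values, encoded, out);
out.close();

IndexInput in = dir.openInput("block.bin", IOContext.DEFAULT);
ForUtil sameConfig = new ForUtil(in);                    // reads per-bpv formats back
int[] decoded = new int[ForUtil.MAX_DATA_SIZE];
sameConfig.readBlock(in, encoded, decoded);              // first BLOCK_SIZE values valid
in.close();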

View File

@@ -1,107 +0,0 @@
#!/usr/bin/env python2
"""
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
"""
Generate source code for java classes for For or PFor decompression.
"""
def bitsExpr(i, numFrameBits):
framePos = i * numFrameBits
intValNum = (framePos / 32)
bitPos = framePos % 32
bitsInInt = "intValue" + str(intValNum)
needBrackets = 0
if bitPos > 0:
bitsInInt += " >>> " + str(bitPos)
needBrackets = 1
if bitPos + numFrameBits > 32:
if needBrackets:
bitsInInt = "(" + bitsInInt + ")"
bitsInInt += " | (intValue" + str(intValNum+1) + " << "+ str(32 - bitPos) + ")"
needBrackets = 1
if bitPos + numFrameBits != 32:
if needBrackets:
bitsInInt = "(" + bitsInInt + ")"
bitsInInt += " & mask"
return bitsInInt
def genDecompress():
className = "PackedIntsDecompress"
fileName = className + ".java"
imports = "import java.nio.IntBuffer;\n"
f = open(fileName, 'w')
w = f.write
try:
w("package org.apache.lucene.codecs.block;\n")
w("""/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
""")
w("\n/* This code is generated, do not modify. See gendecompress.py */\n\n")
w("import java.nio.IntBuffer;\n")
w("import java.util.Arrays;\n\n")
w("final class PackedIntsDecompress {\n")
w('\n // nocommit: assess perf of this to see if specializing is really needed\n')
w('\n // NOTE: hardwired to blockSize == 128\n\n')
w(' public static void decode0(final IntBuffer compressedBuffer, final int[] output) {\n')
w(' Arrays.fill(output, compressedBuffer.get());\n')
w(' }\n')
for numFrameBits in xrange(1, 32):
w(' public static void decode%d(final IntBuffer compressedBuffer, final int[] output) {\n' % numFrameBits)
w(' final int numFrameBits = %d;\n' % numFrameBits)
w(' final int mask = (int) ((1L<<numFrameBits) - 1);\n')
w(' int outputOffset = 0;\n')
w(' for(int step=0;step<4;step++) {\n')
for i in range(numFrameBits): # declare int vars and init from buffer
w(" int intValue" + str(i) + " = compressedBuffer.get();\n")
for i in range(32): # set output from int vars
w(" output[" + str(i) + " + outputOffset] = " + bitsExpr(i, numFrameBits) + ";\n")
w(' outputOffset += 32;\n')
w(' }\n')
w(' }\n')
w('}\n')
finally:
f.close()
if __name__ == "__main__":
genDecompress()

View File

@@ -1,110 +0,0 @@
package org.apache.lucene.codecs.blockpacked;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.BlockTreeTermsReader;
import org.apache.lucene.codecs.BlockTreeTermsWriter;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
/**
* Encodes/decodes postings in packed int blocks for faster
* decoding.
*/
public final class BlockPackedPostingsFormat extends PostingsFormat {
public static final String DOC_EXTENSION = "doc";
public static final String POS_EXTENSION = "pos";
public static final String PAY_EXTENSION = "pay";
private final int minTermBlockSize;
private final int maxTermBlockSize;
// nocommit is this right?:
// NOTE: should be at least 64 because of PackedInts long-aligned encoding/decoding
// NOTE: must be factor of ... 64?
public final static int BLOCK_SIZE = 128;
public BlockPackedPostingsFormat() {
this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
}
public BlockPackedPostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
super("BlockPacked");
this.minTermBlockSize = minTermBlockSize;
assert minTermBlockSize > 1;
this.maxTermBlockSize = maxTermBlockSize;
assert minTermBlockSize <= maxTermBlockSize;
}
@Override
public String toString() {
return getName() + "(blocksize=" + BLOCK_SIZE + ")";
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new BlockPackedPostingsWriter(state);
boolean success = false;
try {
FieldsConsumer ret = new BlockTreeTermsWriter(state,
postingsWriter,
minTermBlockSize,
maxTermBlockSize);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsWriter);
}
}
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new BlockPackedPostingsReader(state.dir,
state.fieldInfos,
state.segmentInfo,
state.context,
state.segmentSuffix);
boolean success = false;
try {
FieldsProducer ret = new BlockTreeTermsReader(state.dir,
state.fieldInfos,
state.segmentInfo.name,
postingsReader,
state.context,
state.segmentSuffix,
state.termsIndexDivisor);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsReader);
}
}
}
}

View File

@@ -1,562 +0,0 @@
package org.apache.lucene.codecs.blockpacked;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.lucene.codecs.blockpacked.BlockPackedPostingsFormat.BLOCK_SIZE;
import static org.apache.lucene.codecs.blockpacked.BlockPackedPostingsReader.DEBUG;
import static org.apache.lucene.codecs.blockpacked.ForUtil.MAX_DATA_SIZE;
import static org.apache.lucene.codecs.blockpacked.ForUtil.MAX_ENCODED_SIZE;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;
/**
* Concrete class that writes the docID (and optionally freq, pos, offset, payload) lists
* in this postings format.
*
* Postings list for each term will be stored separately.
*
* @see BlockPackedSkipWriter for details about the skip settings and postings layout.
*
*/
public final class BlockPackedPostingsWriter extends PostingsWriterBase {
// nocommit move these constants to the PF:
static final int maxSkipLevels = 10;
final static String TERMS_CODEC = "BlockPackedPostingsWriterTerms";
final static String DOC_CODEC = "BlockPackedPostingsWriterDoc";
final static String POS_CODEC = "BlockPackedPostingsWriterPos";
final static String PAY_CODEC = "BlockPackedPostingsWriterPay";
// Increment version to change it:
final static int VERSION_START = 0;
final static int VERSION_CURRENT = VERSION_START;
final IndexOutput docOut;
final IndexOutput posOut;
final IndexOutput payOut;
private IndexOutput termsOut;
// How current field indexes postings:
private boolean fieldHasFreqs;
private boolean fieldHasPositions;
private boolean fieldHasOffsets;
private boolean fieldHasPayloads;
// Holds starting file pointers for each term:
private long docTermStartFP;
private long posTermStartFP;
private long payTermStartFP;
final int[] docDeltaBuffer;
final int[] freqBuffer;
private int docBufferUpto;
final int[] posDeltaBuffer;
final int[] payloadLengthBuffer;
final int[] offsetStartDeltaBuffer;
final int[] offsetLengthBuffer;
private int posBufferUpto;
private byte[] payloadBytes;
private int payloadByteUpto;
private int lastBlockDocID;
private long lastBlockPosFP;
private long lastBlockPayFP;
private int lastBlockPosBufferUpto;
private int lastBlockStartOffset;
private int lastBlockPayloadByteUpto;
private int lastDocID;
private int lastPosition;
private int lastStartOffset;
private int docCount;
final byte[] encoded;
private final ForUtil forUtil;
private final BlockPackedSkipWriter skipWriter;
public BlockPackedPostingsWriter(SegmentWriteState state, float acceptableOverheadRatio) throws IOException {
super();
docOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPackedPostingsFormat.DOC_EXTENSION),
state.context);
IndexOutput posOut = null;
IndexOutput payOut = null;
boolean success = false;
try {
CodecUtil.writeHeader(docOut, DOC_CODEC, VERSION_CURRENT);
forUtil = new ForUtil(acceptableOverheadRatio, docOut);
if (state.fieldInfos.hasProx()) {
posDeltaBuffer = new int[MAX_DATA_SIZE];
posOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPackedPostingsFormat.POS_EXTENSION),
state.context);
CodecUtil.writeHeader(posOut, POS_CODEC, VERSION_CURRENT);
if (state.fieldInfos.hasPayloads()) {
payloadBytes = new byte[128];
payloadLengthBuffer = new int[MAX_DATA_SIZE];
} else {
payloadBytes = null;
payloadLengthBuffer = null;
}
if (state.fieldInfos.hasOffsets()) {
offsetStartDeltaBuffer = new int[MAX_DATA_SIZE];
offsetLengthBuffer = new int[MAX_DATA_SIZE];
} else {
offsetStartDeltaBuffer = null;
offsetLengthBuffer = null;
}
if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) {
payOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPackedPostingsFormat.PAY_EXTENSION),
state.context);
CodecUtil.writeHeader(payOut, PAY_CODEC, VERSION_CURRENT);
}
} else {
posDeltaBuffer = null;
payloadLengthBuffer = null;
offsetStartDeltaBuffer = null;
offsetLengthBuffer = null;
payloadBytes = null;
}
this.payOut = payOut;
this.posOut = posOut;
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(docOut, posOut, payOut);
}
}
docDeltaBuffer = new int[MAX_DATA_SIZE];
freqBuffer = new int[MAX_DATA_SIZE];
// nocommit should we try skipping every 2/4 blocks...?
skipWriter = new BlockPackedSkipWriter(maxSkipLevels,
BlockPackedPostingsFormat.BLOCK_SIZE,
state.segmentInfo.getDocCount(),
docOut,
posOut,
payOut);
encoded = new byte[MAX_ENCODED_SIZE];
}
public BlockPackedPostingsWriter(SegmentWriteState state) throws IOException {
this(state, PackedInts.DEFAULT);
}
@Override
public void start(IndexOutput termsOut) throws IOException {
this.termsOut = termsOut;
CodecUtil.writeHeader(termsOut, TERMS_CODEC, VERSION_CURRENT);
termsOut.writeVInt(BLOCK_SIZE);
}
@Override
public void setField(FieldInfo fieldInfo) {
IndexOptions indexOptions = fieldInfo.getIndexOptions();
fieldHasFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
fieldHasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
fieldHasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
fieldHasPayloads = fieldInfo.hasPayloads();
skipWriter.setField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads);
}
@Override
public void startTerm() {
docTermStartFP = docOut.getFilePointer();
if (fieldHasPositions) {
posTermStartFP = posOut.getFilePointer();
if (fieldHasPayloads || fieldHasOffsets) {
payTermStartFP = payOut.getFilePointer();
}
}
lastDocID = 0;
lastBlockDocID = -1;
if (DEBUG) {
System.out.println("FPW.startTerm startFP=" + docTermStartFP);
}
skipWriter.resetSkip();
}
@Override
public void startDoc(int docID, int termDocFreq) throws IOException {
if (DEBUG) {
System.out.println("FPW.startDoc docID["+docBufferUpto+"]=" + docID);
}
final int docDelta = docID - lastDocID;
if (docID < 0 || (docCount > 0 && docDelta <= 0)) {
throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " ) (docOut: " + docOut + ")");
}
docDeltaBuffer[docBufferUpto] = docDelta;
// if (DEBUG) {
// System.out.println(" docDeltaBuffer[" + docBufferUpto + "]=" + docDelta);
// }
if (fieldHasFreqs) {
freqBuffer[docBufferUpto] = termDocFreq;
}
docBufferUpto++;
docCount++;
if (docBufferUpto == BLOCK_SIZE) {
if (DEBUG) {
System.out.println(" write docDelta block @ fp=" + docOut.getFilePointer());
}
forUtil.writeBlock(docDeltaBuffer, encoded, docOut);
if (fieldHasFreqs) {
if (DEBUG) {
System.out.println(" write freq block @ fp=" + docOut.getFilePointer());
}
forUtil.writeBlock(freqBuffer, encoded, docOut);
}
// NOTE: don't set docBufferUpto back to 0 here;
// finishDoc will do so (because it needs to see that
// the block was filled so it can save skip data)
}
lastDocID = docID;
lastPosition = 0;
lastStartOffset = 0;
}
/** Add a new position & payload */
@Override
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
// if (DEBUG) {
// System.out.println("FPW.addPosition pos=" + position + " posBufferUpto=" + posBufferUpto + (fieldHasPayloads ? " payloadByteUpto=" + payloadByteUpto: ""));
// }
posDeltaBuffer[posBufferUpto] = position - lastPosition;
if (fieldHasPayloads) {
if (payload == null || payload.length == 0) {
// no payload
payloadLengthBuffer[posBufferUpto] = 0;
} else {
payloadLengthBuffer[posBufferUpto] = payload.length;
if (payloadByteUpto + payload.length > payloadBytes.length) {
payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payload.length);
}
System.arraycopy(payload.bytes, payload.offset, payloadBytes, payloadByteUpto, payload.length);
payloadByteUpto += payload.length;
}
}
if (fieldHasOffsets) {
assert startOffset >= lastStartOffset;
assert endOffset >= startOffset;
offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastStartOffset;
offsetLengthBuffer[posBufferUpto] = endOffset - startOffset;
lastStartOffset = startOffset;
}
posBufferUpto++;
lastPosition = position;
if (posBufferUpto == BLOCK_SIZE) {
if (DEBUG) {
System.out.println(" write pos bulk block @ fp=" + posOut.getFilePointer());
}
forUtil.writeBlock(posDeltaBuffer, encoded, posOut);
if (fieldHasPayloads) {
forUtil.writeBlock(payloadLengthBuffer, encoded, payOut);
payOut.writeVInt(payloadByteUpto);
payOut.writeBytes(payloadBytes, 0, payloadByteUpto);
payloadByteUpto = 0;
}
if (fieldHasOffsets) {
forUtil.writeBlock(offsetStartDeltaBuffer, encoded, payOut);
forUtil.writeBlock(offsetLengthBuffer, encoded, payOut);
}
posBufferUpto = 0;
}
}
@Override
public void finishDoc() throws IOException {
// We have collected a block of docs and a new doc has arrived;
// we should write the skip data as well as the postings list for
// the current block.
if (lastBlockDocID != -1 && docBufferUpto == 1) {
// nocommit move to startDoc? ie we can write skip
// data as soon as the next doc starts...
if (DEBUG) {
System.out.println(" bufferSkip at writeBlock: lastDocID=" + lastBlockDocID + " docCount=" + (docCount-1));
}
skipWriter.bufferSkip(lastBlockDocID, docCount-1, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockStartOffset, lastBlockPayloadByteUpto);
}
// Since we don't know the df for the current term, we have to
// buffer skip data for each block, and write it to the skip
// file when a new doc arrives.
if (docBufferUpto == BLOCK_SIZE) {
lastBlockDocID = lastDocID;
if (posOut != null) {
if (payOut != null) {
lastBlockPayFP = payOut.getFilePointer();
}
lastBlockPosFP = posOut.getFilePointer();
lastBlockPosBufferUpto = posBufferUpto;
lastBlockStartOffset = lastStartOffset;
lastBlockPayloadByteUpto = payloadByteUpto;
}
if (DEBUG) {
System.out.println(" docBufferUpto="+docBufferUpto+" now get lastBlockDocID="+lastBlockDocID+" lastBlockPosFP=" + lastBlockPosFP + " lastBlockPosBufferUpto=" + lastBlockPosBufferUpto + " lastBlockPayloadByteUpto=" + lastBlockPayloadByteUpto);
}
docBufferUpto = 0;
}
}
private static class PendingTerm {
public final long docStartFP;
public final long posStartFP;
public final long payStartFP;
public final int skipOffset;
public final int lastPosBlockOffset;
public PendingTerm(long docStartFP, long posStartFP, long payStartFP, int skipOffset, int lastPosBlockOffset) {
this.docStartFP = docStartFP;
this.posStartFP = posStartFP;
this.payStartFP = payStartFP;
this.skipOffset = skipOffset;
this.lastPosBlockOffset = lastPosBlockOffset;
}
}
private final List<PendingTerm> pendingTerms = new ArrayList<PendingTerm>();
/** Called when we are done adding docs to this term */
@Override
public void finishTerm(TermStats stats) throws IOException {
assert stats.docFreq > 0;
// TODO: wasteful we are counting this (counting # docs
// for this term) in two places?
assert stats.docFreq == docCount: stats.docFreq + " vs " + docCount;
if (DEBUG) {
System.out.println("FPW.finishTerm docFreq=" + stats.docFreq);
}
if (DEBUG) {
if (docBufferUpto > 0) {
System.out.println(" write doc/freq vInt block (count=" + docBufferUpto + ") at fp=" + docOut.getFilePointer() + " docTermStartFP=" + docTermStartFP);
}
}
// vInt encode the remaining doc deltas and freqs:
for(int i=0;i<docBufferUpto;i++) {
final int docDelta = docDeltaBuffer[i];
final int freq = freqBuffer[i];
if (!fieldHasFreqs) {
docOut.writeVInt(docDelta);
} else if (freqBuffer[i] == 1) {
docOut.writeVInt((docDelta<<1)|1);
} else {
docOut.writeVInt(docDelta<<1);
docOut.writeVInt(freq);
}
}
final int lastPosBlockOffset;
if (fieldHasPositions) {
if (DEBUG) {
if (posBufferUpto > 0) {
System.out.println(" write pos vInt block (count=" + posBufferUpto + ") at fp=" + posOut.getFilePointer() + " posTermStartFP=" + posTermStartFP + " hasPayloads=" + fieldHasPayloads + " hasOffsets=" + fieldHasOffsets);
}
}
assert stats.totalTermFreq != -1;
if (stats.totalTermFreq > BLOCK_SIZE) {
lastPosBlockOffset = (int) (posOut.getFilePointer() - posTermStartFP);
} else {
lastPosBlockOffset = -1;
}
if (posBufferUpto > 0) {
posOut.writeVInt(posBufferUpto);
// nocommit should we send offsets/payloads to
// .pay...? seems wasteful (have to store extra
// vLong for low (< BLOCK_SIZE) DF terms = vast vast
// majority)
// vInt encode the remaining positions/payloads/offsets:
int lastPayloadLength = -1;
int payloadBytesReadUpto = 0;
for(int i=0;i<posBufferUpto;i++) {
final int posDelta = posDeltaBuffer[i];
if (fieldHasPayloads) {
final int payloadLength = payloadLengthBuffer[i];
if (payloadLength != lastPayloadLength) {
lastPayloadLength = payloadLength;
posOut.writeVInt((posDelta<<1)|1);
posOut.writeVInt(payloadLength);
} else {
posOut.writeVInt(posDelta<<1);
}
if (DEBUG) {
System.out.println(" i=" + i + " payloadLen=" + payloadLength);
}
if (payloadLength != 0) {
if (DEBUG) {
System.out.println(" write payload @ pos.fp=" + posOut.getFilePointer());
}
posOut.writeBytes(payloadBytes, payloadBytesReadUpto, payloadLength);
payloadBytesReadUpto += payloadLength;
}
} else {
posOut.writeVInt(posDelta);
}
if (fieldHasOffsets) {
if (DEBUG) {
System.out.println(" write offset @ pos.fp=" + posOut.getFilePointer());
}
posOut.writeVInt(offsetStartDeltaBuffer[i]);
posOut.writeVInt(offsetLengthBuffer[i]);
}
}
if (fieldHasPayloads) {
assert payloadBytesReadUpto == payloadByteUpto;
payloadByteUpto = 0;
}
}
if (DEBUG) {
System.out.println(" totalTermFreq=" + stats.totalTermFreq + " lastPosBlockOffset=" + lastPosBlockOffset);
}
} else {
lastPosBlockOffset = -1;
}
int skipOffset;
if (docCount > BLOCK_SIZE) {
skipOffset = (int) (skipWriter.writeSkip(docOut) - docTermStartFP);
if (DEBUG) {
System.out.println("skip packet " + (docOut.getFilePointer() - (docTermStartFP + skipOffset)) + " bytes");
}
} else {
skipOffset = -1;
if (DEBUG) {
System.out.println(" no skip: docCount=" + docCount);
}
}
long payStartFP;
if (stats.totalTermFreq >= BLOCK_SIZE) {
payStartFP = payTermStartFP;
} else {
payStartFP = -1;
}
if (DEBUG) {
System.out.println(" payStartFP=" + payStartFP);
}
pendingTerms.add(new PendingTerm(docTermStartFP, posTermStartFP, payStartFP, skipOffset, lastPosBlockOffset));
docBufferUpto = 0;
posBufferUpto = 0;
lastDocID = 0;
docCount = 0;
}
private final RAMOutputStream bytesWriter = new RAMOutputStream();
@Override
public void flushTermsBlock(int start, int count) throws IOException {
if (count == 0) {
termsOut.writeByte((byte) 0);
return;
}
assert start <= pendingTerms.size();
assert count <= start;
final int limit = pendingTerms.size() - start + count;
long lastDocStartFP = 0;
long lastPosStartFP = 0;
long lastPayStartFP = 0;
for(int idx=limit-count; idx<limit; idx++) {
PendingTerm term = pendingTerms.get(idx);
bytesWriter.writeVLong(term.docStartFP - lastDocStartFP);
lastDocStartFP = term.docStartFP;
if (fieldHasPositions) {
bytesWriter.writeVLong(term.posStartFP - lastPosStartFP);
lastPosStartFP = term.posStartFP;
if (term.lastPosBlockOffset != -1) {
bytesWriter.writeVInt(term.lastPosBlockOffset);
}
if ((fieldHasPayloads || fieldHasOffsets) && term.payStartFP != -1) {
bytesWriter.writeVLong(term.payStartFP - lastPayStartFP);
lastPayStartFP = term.payStartFP;
}
}
if (term.skipOffset != -1) {
bytesWriter.writeVInt(term.skipOffset);
}
}
termsOut.writeVInt((int) bytesWriter.getFilePointer());
bytesWriter.writeTo(termsOut);
bytesWriter.reset();
// Remove the terms we just wrote:
pendingTerms.subList(limit-count, limit).clear();
}
@Override
public void close() throws IOException {
IOUtils.close(docOut, posOut, payOut);
}
}

View File

@@ -1,244 +0,0 @@
package org.apache.lucene.codecs.blockpacked;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.codecs.MultiLevelSkipListReader;
import org.apache.lucene.store.IndexInput;
/**
* Implements the skip list reader for block postings format
* that stores positions and payloads.
*
* Although this skipper uses MultiLevelSkipListReader as an interface,
* its definition of skip position will be a little different.
*
* For example, when skipInterval = blockSize = 3, df = 2*skipInterval = 6,
*
 *  0 1 2 3 4 5
 *  d d d d d d    (posting list)
 *      ^     ^    (skip point in MultiLevelSkipListWriter)
 *      ^          (skip point in BlockSkipWriter)
*
 * In this case, MultiLevelSkipListReader will use the last document as a skip point,
 * while BlockSkipReader should assume that no skip point will come.
 *
 * If we used the interface directly in BlockSkipReader, it might naively try to read
 * another skip datum after the only skip point is loaded.
 *
 * To illustrate this, we can call skipTo(d[5]): since skip point d[3] has a smaller docId,
 * and numSkipped + blockSize == df, MultiLevelSkipListReader will assume the skip list
 * isn't exhausted yet, and try to load a non-existent skip point.
 *
 * Therefore, we trim df before passing it to the interface; see trim(int).
*
*/
final class BlockPackedSkipReader extends MultiLevelSkipListReader {
private boolean DEBUG = BlockPackedPostingsReader.DEBUG;
private final int blockSize;
private long docPointer[];
private long posPointer[];
private long payPointer[];
private int posBufferUpto[];
private int startOffset[];
private int payloadByteUpto[];
private long lastPosPointer;
private long lastPayPointer;
private int lastStartOffset;
private int lastPayloadByteUpto;
private long lastDocPointer;
private int lastPosBufferUpto;
public BlockPackedSkipReader(IndexInput skipStream, int maxSkipLevels, int blockSize, boolean hasPos, boolean hasOffsets, boolean hasPayloads) {
super(skipStream, maxSkipLevels, blockSize, 8);
this.blockSize = blockSize;
docPointer = new long[maxSkipLevels];
if (hasPos) {
posPointer = new long[maxSkipLevels];
posBufferUpto = new int[maxSkipLevels];
if (hasPayloads) {
payloadByteUpto = new int[maxSkipLevels];
} else {
payloadByteUpto = null;
}
if (hasOffsets) {
startOffset = new int[maxSkipLevels];
} else {
startOffset = null;
}
if (hasOffsets || hasPayloads) {
payPointer = new long[maxSkipLevels];
} else {
payPointer = null;
}
} else {
posPointer = null;
}
}
/**
 * Trim the original docFreq so that the skip reader reads the proper number of skip points.
 *
 * Since our definition in BlockSkip* differs slightly from MultiLevelSkip*,
 * the trimmed docFreq prevents the skip reader from:
 * 1. naively reading a non-existent skip point after the last block boundary
 * 2. moving into the vInt block
*
*/
protected int trim(int df) {
    return df % blockSize == 0 ? df - 1 : df;
}
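  // Worked example (hypothetical values, matching the class Javadoc above, blockSize = 3):
  //   trim(6) == 5   -> df is a multiple of the block size, so drop the phantom last
  //                     skip point that MultiLevelSkipListReader would otherwise expect
  //   trim(7) == 7   -> the remainder docs live in the vInt tail; df is already safe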
public void init(long skipPointer, long docBasePointer, long posBasePointer, long payBasePointer, int df) {
super.init(skipPointer, trim(df));
lastDocPointer = docBasePointer;
lastPosPointer = posBasePointer;
lastPayPointer = payBasePointer;
Arrays.fill(docPointer, docBasePointer);
if (posPointer != null) {
Arrays.fill(posPointer, posBasePointer);
if (payPointer != null) {
Arrays.fill(payPointer, payBasePointer);
}
} else {
assert posBasePointer == 0;
}
}
/** Returns the doc pointer of the doc to which the last call of
* {@link MultiLevelSkipListReader#skipTo(int)} has skipped. */
public long getDocPointer() {
return lastDocPointer;
}
public long getPosPointer() {
return lastPosPointer;
}
public int getPosBufferUpto() {
return lastPosBufferUpto;
}
public long getPayPointer() {
return lastPayPointer;
}
public int getStartOffset() {
return lastStartOffset;
}
public int getPayloadByteUpto() {
return lastPayloadByteUpto;
}
public int getNextSkipDoc() {
return skipDoc[0];
}
@Override
protected void seekChild(int level) throws IOException {
super.seekChild(level);
if (DEBUG) {
System.out.println("seekChild level=" + level);
}
docPointer[level] = lastDocPointer;
if (posPointer != null) {
posPointer[level] = lastPosPointer;
posBufferUpto[level] = lastPosBufferUpto;
if (startOffset != null) {
startOffset[level] = lastStartOffset;
}
if (payloadByteUpto != null) {
payloadByteUpto[level] = lastPayloadByteUpto;
}
if (payPointer != null) {
payPointer[level] = lastPayPointer;
}
}
}
@Override
protected void setLastSkipData(int level) {
super.setLastSkipData(level);
lastDocPointer = docPointer[level];
if (DEBUG) {
System.out.println("setLastSkipData level=" + level);
System.out.println(" lastDocPointer=" + lastDocPointer);
}
if (posPointer != null) {
lastPosPointer = posPointer[level];
lastPosBufferUpto = posBufferUpto[level];
if (DEBUG) {
System.out.println(" lastPosPointer=" + lastPosPointer + " lastPosBUfferUpto=" + lastPosBufferUpto);
}
if (payPointer != null) {
lastPayPointer = payPointer[level];
}
if (startOffset != null) {
lastStartOffset = startOffset[level];
}
if (payloadByteUpto != null) {
lastPayloadByteUpto = payloadByteUpto[level];
}
}
}
@Override
protected int readSkipData(int level, IndexInput skipStream) throws IOException {
if (DEBUG) {
System.out.println("readSkipData level=" + level);
}
int delta = skipStream.readVInt();
if (DEBUG) {
System.out.println(" delta=" + delta);
}
docPointer[level] += skipStream.readVInt();
if (DEBUG) {
System.out.println(" docFP=" + docPointer[level]);
}
if (posPointer != null) {
posPointer[level] += skipStream.readVInt();
if (DEBUG) {
System.out.println(" posFP=" + posPointer[level]);
}
posBufferUpto[level] = skipStream.readVInt();
if (DEBUG) {
System.out.println(" posBufferUpto=" + posBufferUpto[level]);
}
if (payloadByteUpto != null) {
payloadByteUpto[level] = skipStream.readVInt();
}
if (startOffset != null) {
startOffset[level] += skipStream.readVInt();
}
if (payPointer != null) {
payPointer[level] += skipStream.readVInt();
}
}
return delta;
}
}
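For orientation, here is a sketch of how a postings enumerator might drive this skipper. The call pattern (init once per term, skipTo toward a target, then re-seek the streams from the recovered pointers) follows the accessors above, but the variable names and surrounding code are assumptions, not the actual reader:

  BlockPackedSkipReader skipper = new BlockPackedSkipReader(
      (IndexInput) skipIn.clone(), maxSkipLevels, blockSize, true, true, true);
  skipper.init(skipPointer, docBasePointer, posBasePointer, payBasePointer, docFreq);
  if (target > skipper.getNextSkipDoc()) {
    skipper.skipTo(target);                           // inherited from MultiLevelSkipListReader
    docIn.seek(skipper.getDocPointer());              // jump the doc stream to the block start
    posIn.seek(skipper.getPosPointer());              // restore position/payload stream state
    payIn.seek(skipper.getPayPointer());
    int posBufferUpto = skipper.getPosBufferUpto();   // resume mid-block upto state
    int payloadByteUpto = skipper.getPayloadByteUpto();
  }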

View File

@ -1,163 +0,0 @@
package org.apache.lucene.codecs.blockpacked;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.codecs.MultiLevelSkipListWriter;
/**
 * Write skip lists with multiple levels, and support skipping to within a block of ints.
 *
 * Assume that docFreq = 28, skipInterval = blockSize = 12:
 *
 *  |       block#0       | |       block#1       | |vInts|
 *  d d d d d d d d d d d d d d d d d d d d d d d d d d d d   (posting list)
 *                        ^                       ^           (level 0 skip point)
 *
 * Note that skipWriter will ignore the first document in block#0, since
 * it is useless as a skip point. Also, we'll never skip into the vInts
 * block; we only record skip data at its start point (if it exists).
 *
 * For each skip point, we will record:
 * 1. the docID at the preceding position, i.e. for position 12, record docID[11], etc.,
 * 2. its related file pointers (position, payload),
 * 3. its related in-block uptos (posBufferUpto, payloadByteUpto),
 * 4. the start offset.
*
*/
final class BlockPackedSkipWriter extends MultiLevelSkipListWriter {
private boolean DEBUG = BlockPackedPostingsReader.DEBUG;
private int[] lastSkipDoc;
private long[] lastSkipDocPointer;
private long[] lastSkipPosPointer;
private long[] lastSkipPayPointer;
private int[] lastStartOffset;
private int[] lastPayloadByteUpto;
private final IndexOutput docOut;
private final IndexOutput posOut;
private final IndexOutput payOut;
private int curDoc;
private long curDocPointer;
private long curPosPointer;
private long curPayPointer;
private int curPosBufferUpto;
private int curStartOffset;
private int curPayloadByteUpto;
private boolean fieldHasPositions;
private boolean fieldHasOffsets;
private boolean fieldHasPayloads;
public BlockPackedSkipWriter(int maxSkipLevels, int blockSize, int docCount, IndexOutput docOut, IndexOutput posOut, IndexOutput payOut) {
super(blockSize, 8, maxSkipLevels, docCount);
this.docOut = docOut;
this.posOut = posOut;
this.payOut = payOut;
lastSkipDoc = new int[maxSkipLevels];
lastSkipDocPointer = new long[maxSkipLevels];
if (posOut != null) {
lastSkipPosPointer = new long[maxSkipLevels];
if (payOut != null) {
lastSkipPayPointer = new long[maxSkipLevels];
}
lastStartOffset = new int[maxSkipLevels];
lastPayloadByteUpto = new int[maxSkipLevels];
}
}
public void setField(boolean fieldHasPositions, boolean fieldHasOffsets, boolean fieldHasPayloads) {
this.fieldHasPositions = fieldHasPositions;
this.fieldHasOffsets = fieldHasOffsets;
this.fieldHasPayloads = fieldHasPayloads;
}
@Override
public void resetSkip() {
super.resetSkip();
Arrays.fill(lastSkipDoc, 0);
Arrays.fill(lastSkipDocPointer, docOut.getFilePointer());
if (fieldHasPositions) {
Arrays.fill(lastSkipPosPointer, posOut.getFilePointer());
if (fieldHasOffsets) {
Arrays.fill(lastStartOffset, 0);
}
if (fieldHasPayloads) {
Arrays.fill(lastPayloadByteUpto, 0);
}
if (fieldHasOffsets || fieldHasPayloads) {
Arrays.fill(lastSkipPayPointer, payOut.getFilePointer());
}
}
}
/**
* Sets the values for the current skip data.
*/
public void bufferSkip(int doc, int numDocs, long posFP, long payFP, int posBufferUpto, int startOffset, int payloadByteUpto) throws IOException {
this.curDoc = doc;
this.curDocPointer = docOut.getFilePointer();
this.curPosPointer = posFP;
this.curPayPointer = payFP;
this.curPosBufferUpto = posBufferUpto;
this.curPayloadByteUpto = payloadByteUpto;
this.curStartOffset = startOffset;
bufferSkip(numDocs);
}
@Override
protected void writeSkipData(int level, IndexOutput skipBuffer) throws IOException {
int delta = curDoc - lastSkipDoc[level];
if (DEBUG) {
System.out.println("writeSkipData level=" + level + " lastDoc=" + curDoc + " delta=" + delta + " curDocPointer=" + curDocPointer);
}
skipBuffer.writeVInt(delta);
lastSkipDoc[level] = curDoc;
skipBuffer.writeVInt((int) (curDocPointer - lastSkipDocPointer[level]));
lastSkipDocPointer[level] = curDocPointer;
if (fieldHasPositions) {
if (DEBUG) {
System.out.println(" curPosPointer=" + curPosPointer + " curPosBufferUpto=" + curPosBufferUpto);
}
skipBuffer.writeVInt((int) (curPosPointer - lastSkipPosPointer[level]));
lastSkipPosPointer[level] = curPosPointer;
skipBuffer.writeVInt(curPosBufferUpto);
if (fieldHasPayloads) {
skipBuffer.writeVInt(curPayloadByteUpto);
}
if (fieldHasOffsets) {
skipBuffer.writeVInt(curStartOffset - lastStartOffset[level]);
lastStartOffset[level] = curStartOffset;
}
if (fieldHasOffsets || fieldHasPayloads) {
skipBuffer.writeVInt((int) (curPayPointer - lastSkipPayPointer[level]));
lastSkipPayPointer[level] = curPayPointer;
}
}
}
}
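The writer side is driven from the postings writer; a hedged sketch of the expected call pattern (the local names here are assumptions, not the actual BlockPackedPostingsWriter code):

  skipWriter.setField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads);
  skipWriter.resetSkip();                            // once per term, before the first block
  // ... each time a full block of docs has been flushed:
  skipWriter.bufferSkip(lastDocInBlock, docCount,
                        posOut.getFilePointer(), payOut.getFilePointer(),
                        posBufferUpto, lastStartOffset, payloadByteUpto);
  // ... when the term ends (cf. finishTerm in the postings writer above):
  long skipStartFP = skipWriter.writeSkip(docOut);   // flushes the multi-level tree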

View File

@ -1,240 +0,0 @@
package org.apache.lucene.codecs.blockpacked;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.lucene.codecs.blockpacked.BlockPackedPostingsFormat.BLOCK_SIZE;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedInts.Decoder;
import org.apache.lucene.util.packed.PackedInts.FormatAndBits;
/**
 * Encode all values of a block with the same fixed bit width,
 * which is determined by the max value in the block.
*/
final class ForUtil {
/**
* Special number of bits per value used whenever all values to encode are equal.
*/
private static final int ALL_VALUES_EQUAL = 0;
private static final int PACKED_INTS_VERSION = 0; // nocommit: encode in the stream?
/**
 * Upper limit of the number of bytes that might be required to store
* <code>BLOCK_SIZE</code> encoded values.
*/
static final int MAX_ENCODED_SIZE = BLOCK_SIZE * 4;
/**
* Upper limit of the number of values that might be decoded in a single call to
* {@link #readBlock(IndexInput, byte[], int[])}. Although values after
* <code>BLOCK_SIZE</code> are garbage, it is necessary to allocate value buffers
* whose size is >= MAX_DATA_SIZE to avoid {@link ArrayIndexOutOfBoundsException}s.
*/
static final int MAX_DATA_SIZE;
static {
    int maxDataSize = 0;
    for (PackedInts.Format format : PackedInts.Format.values()) {
      for (int bpv = 1; bpv <= 32; ++bpv) {
        if (!format.isSupported(bpv)) {
          continue;
        }
        final PackedInts.Decoder decoder = PackedInts.getDecoder(format, PACKED_INTS_VERSION, bpv);
        maxDataSize = Math.max(maxDataSize, computeIterations(decoder) * decoder.valueCount());
      }
    }
    MAX_DATA_SIZE = maxDataSize;
}
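  // Callers size their scratch buffers from these constants; e.g. (a sketch of the
  // allocation pattern, not code from this file):
  //   final int[]  docDeltaBuffer = new int[MAX_DATA_SIZE];
  //   final byte[] encoded        = new byte[MAX_ENCODED_SIZE];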
/**
* Compute the number of iterations required to decode <code>BLOCK_SIZE</code>
* values with the provided {@link Decoder}.
*/
private static int computeIterations(PackedInts.Decoder decoder) {
return (int) Math.ceil((float) BLOCK_SIZE / decoder.valueCount());
}
/**
* Compute the number of bytes required to encode a block of values that require
* <code>bitsPerValue</code> bits per value with format <code>format</code>.
*/
private static int encodedSize(PackedInts.Format format, int bitsPerValue) {
return format.nblocks(bitsPerValue, BLOCK_SIZE) << 3;
}
private final int[] encodedSizes;
private final PackedInts.Encoder[] encoders;
private final PackedInts.Decoder[] decoders;
private final int[] iterations;
/**
* Create a new {@link ForUtil} instance and save state into <code>out</code>.
*/
ForUtil(float acceptableOverheadRatio, DataOutput out) throws IOException {
encodedSizes = new int[33];
encoders = new PackedInts.Encoder[33];
decoders = new PackedInts.Decoder[33];
iterations = new int[33];
for (int bpv = 1; bpv <= 32; ++bpv) {
final FormatAndBits formatAndBits = PackedInts.fastestFormatAndBits(
BLOCK_SIZE, bpv, acceptableOverheadRatio);
assert formatAndBits.format.isSupported(formatAndBits.bitsPerValue);
assert formatAndBits.bitsPerValue <= 32;
encodedSizes[bpv] = encodedSize(formatAndBits.format, formatAndBits.bitsPerValue);
encoders[bpv] = PackedInts.getEncoder(
formatAndBits.format, PACKED_INTS_VERSION, formatAndBits.bitsPerValue);
decoders[bpv] = PackedInts.getDecoder(
formatAndBits.format, PACKED_INTS_VERSION, formatAndBits.bitsPerValue);
iterations[bpv] = computeIterations(decoders[bpv]);
out.writeVInt(formatAndBits.format.getId() << 5 | (formatAndBits.bitsPerValue - 1));
}
}
/**
* Restore a {@link ForUtil} from a {@link DataInput}.
*/
ForUtil(DataInput in) throws IOException {
encodedSizes = new int[33];
encoders = new PackedInts.Encoder[33];
decoders = new PackedInts.Decoder[33];
iterations = new int[33];
for (int bpv = 1; bpv <= 32; ++bpv) {
final int code = in.readVInt();
final int formatId = code >>> 5;
final int bitsPerValue = (code & 31) + 1;
final PackedInts.Format format = PackedInts.Format.byId(formatId);
assert format.isSupported(bitsPerValue);
encodedSizes[bpv] = encodedSize(format, bitsPerValue);
encoders[bpv] = PackedInts.getEncoder(
format, PACKED_INTS_VERSION, bitsPerValue);
decoders[bpv] = PackedInts.getDecoder(
format, PACKED_INTS_VERSION, bitsPerValue);
iterations[bpv] = computeIterations(decoders[bpv]);
}
}
/**
* Write a block of data (<code>For</code> format).
*
* @param data the data to write
* @param encoded a buffer to use to encode data
* @param out the destination output
* @throws IOException
*/
void writeBlock(int[] data, byte[] encoded, IndexOutput out) throws IOException {
if (isAllEqual(data)) {
out.writeVInt(ALL_VALUES_EQUAL);
out.writeInt(data[0]);
return;
}
final int numBits = bitsRequired(data);
assert numBits > 0 && numBits <= 32 : numBits;
final PackedInts.Encoder encoder = encoders[numBits];
final int iters = iterations[numBits];
assert iters * encoder.valueCount() >= BLOCK_SIZE;
final int encodedSize = encodedSizes[numBits];
assert (iters * encoder.blockCount()) << 3 >= encodedSize;
out.writeVInt(numBits);
encoder.encode(data, 0, encoded, 0, iters);
out.writeBytes(encoded, encodedSize);
}
/**
* Read the next block of data (<code>For</code> format).
*
* @param in the input to use to read data
* @param encoded a buffer that can be used to store encoded data
* @param decoded where to write decoded data
* @throws IOException
*/
void readBlock(IndexInput in, byte[] encoded, int[] decoded) throws IOException {
final int numBits = in.readVInt();
assert numBits <= 32 : numBits;
if (numBits == ALL_VALUES_EQUAL) {
final int value = in.readInt();
Arrays.fill(decoded, 0, BLOCK_SIZE, value);
return;
}
final int encodedSize = encodedSizes[numBits];
in.readBytes(encoded, 0, encodedSize);
final PackedInts.Decoder decoder = decoders[numBits];
final int iters = iterations[numBits];
assert iters * decoder.valueCount() >= BLOCK_SIZE;
decoder.decode(encoded, 0, decoded, 0, iters);
}
/**
* Skip the next block of data.
*
* @param in the input where to read data
* @throws IOException
*/
void skipBlock(IndexInput in) throws IOException {
final int numBits = in.readVInt();
if (numBits == ALL_VALUES_EQUAL) {
in.seek(in.getFilePointer() + 4);
return;
}
assert numBits > 0 && numBits <= 32 : numBits;
final int encodedSize = encodedSizes[numBits];
in.seek(in.getFilePointer() + encodedSize);
}
// nocommit: we must have a util function for this, hmm?
private static boolean isAllEqual(final int[] data) {
final long v = data[0];
for (int i = 1; i < BLOCK_SIZE; ++i) {
if (data[i] != v) {
return false;
}
}
return true;
}
/**
 * Compute the number of bits required to serialize any of the values in
 * <code>data</code>.
*/
private static int bitsRequired(final int[] data) {
long or = 0;
for (int i = 0; i < BLOCK_SIZE; ++i) {
or |= data[i];
}
return PackedInts.bitsRequired(or);
}
}
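To make the encode/decode contract concrete, here is a minimal round-trip sketch. It is hypothetical driver code, not a call site from this commit: out is an IndexOutput, in is an IndexInput later opened on the same bytes, and PackedInts.COMPACT is just one valid acceptableOverheadRatio:

  ForUtil writeUtil = new ForUtil(PackedInts.COMPACT, out); // persists the per-bpv header table
  int[] values = new int[ForUtil.MAX_DATA_SIZE];            // oversized on purpose, see MAX_DATA_SIZE
  byte[] encoded = new byte[ForUtil.MAX_ENCODED_SIZE];
  for (int i = 0; i < BLOCK_SIZE; ++i) {
    values[i] = i % 7;                                      // any non-negative ints
  }
  writeUtil.writeBlock(values, encoded, out);               // one bit-packed block
  out.close();

  ForUtil readUtil = new ForUtil(in);                       // re-reads the header written above
  int[] decoded = new int[ForUtil.MAX_DATA_SIZE];
  readUtil.readBlock(in, encoded, decoded);                 // decoded[0..BLOCK_SIZE) now match values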

View File

@ -21,4 +21,3 @@ org.apache.lucene.codecs.bulkvint.BulkVIntPostingsFormat
org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat
org.apache.lucene.codecs.memory.DirectPostingsFormat
org.apache.lucene.codecs.block.BlockPostingsFormat
org.apache.lucene.codecs.blockpacked.BlockPackedPostingsFormat

View File

@ -1,4 +1,4 @@
package org.apache.lucene.codecs.blockpacked;
package org.apache.lucene.codecs.block;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -17,9 +17,9 @@ package org.apache.lucene.codecs.blockpacked;
* limitations under the License.
*/
import static org.apache.lucene.codecs.blockpacked.BlockPackedPostingsFormat.BLOCK_SIZE;
import static org.apache.lucene.codecs.blockpacked.ForUtil.MAX_DATA_SIZE;
import static org.apache.lucene.codecs.blockpacked.ForUtil.MAX_ENCODED_SIZE;
import static org.apache.lucene.codecs.block.BlockPostingsFormat.BLOCK_SIZE;
import static org.apache.lucene.codecs.block.ForUtil.MAX_DATA_SIZE;
import static org.apache.lucene.codecs.block.ForUtil.MAX_ENCODED_SIZE;
import java.io.IOException;
import java.util.Arrays;