LUCENE-3892: some cleanup/refactoring

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/pforcodec_3892@1369697 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2012-08-05 22:39:32 +00:00
parent aed1b5d760
commit de9586104c
6 changed files with 199 additions and 210 deletions

View File

@ -31,8 +31,8 @@ import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
/**
* Pass ForFactory to a PostingsWriter/ReaderBase, and get
* customized postings format plugged.
* Encodes/decodes postings in packed int blocks for faster
* decoding.
*/
public final class BlockPostingsFormat extends PostingsFormat {
public static final String DOC_EXTENSION = "doc";
@ -41,7 +41,8 @@ public final class BlockPostingsFormat extends PostingsFormat {
private final int minTermBlockSize;
private final int maxTermBlockSize;
public final static int DEFAULT_BLOCK_SIZE = 128;
public final static int BLOCK_SIZE = 128;
public BlockPostingsFormat() {
this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
@ -57,13 +58,12 @@ public final class BlockPostingsFormat extends PostingsFormat {
@Override
public String toString() {
return getName() + "(blocksize=" + DEFAULT_BLOCK_SIZE + ")";
return getName() + "(blocksize=" + BLOCK_SIZE + ")";
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
// TODO: implement a new PostingsWriterBase to improve skip-settings
PostingsWriterBase postingsWriter = new BlockPostingsWriter(state, 128);
PostingsWriterBase postingsWriter = new BlockPostingsWriter(state);
boolean success = false;
try {
@ -86,8 +86,7 @@ public final class BlockPostingsFormat extends PostingsFormat {
state.fieldInfos,
state.segmentInfo,
state.context,
state.segmentSuffix,
128);
state.segmentSuffix);
boolean success = false;
try {
FieldsProducer ret = new BlockTreeTermsReader(state.dir,

View File

@ -43,7 +43,14 @@ import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
// nocommit javadocs
/**
* Concrete class that reads the docId (and maybe freq, pos, offset, payload)
* lists written in the block postings format.
*
* @see BlockSkipReader for details
*
*/
public final class BlockPostingsReader extends PostingsReaderBase {
private final IndexInput docIn;
@ -56,9 +63,9 @@ public final class BlockPostingsReader extends PostingsReaderBase {
final String segment;
// NOTE: not private to avoid access$NNN methods:
final int blockSize;
final static int blockSize = BlockPostingsFormat.BLOCK_SIZE;
public BlockPostingsReader(Directory dir, FieldInfos fieldInfos, SegmentInfo segmentInfo, IOContext ioContext, String segmentSuffix, int blockSize) throws IOException {
public BlockPostingsReader(Directory dir, FieldInfos fieldInfos, SegmentInfo segmentInfo, IOContext ioContext, String segmentSuffix) throws IOException {
boolean success = false;
segment = segmentInfo.name;
IndexInput docIn = null;
@ -99,8 +106,6 @@ public final class BlockPostingsReader extends PostingsReaderBase {
IOUtils.closeWhileHandlingException(docIn, posIn, payIn);
}
}
this.blockSize = blockSize;
}
@Override
@ -116,6 +121,24 @@ public final class BlockPostingsReader extends PostingsReaderBase {
}
}
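// Decodes the final partial block of a postings list: when fewer than a
// full block of docs remain, doc deltas (and freqs) are stored as vInts
// rather than as a packed int block.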
static void readVIntBlock(IndexInput docIn, int[] docBuffer, int[] freqBuffer, int num, boolean indexHasFreq) throws IOException {
if (indexHasFreq) {
for(int i=0;i<num;i++) {
final int code = docIn.readVInt();
docBuffer[i] = code >>> 1;
if ((code & 1) != 0) {
freqBuffer[i] = 1;
} else {
freqBuffer[i] = docIn.readVInt();
}
}
} else {
for(int i=0;i<num;i++) {
docBuffer[i] = docIn.readVInt();
}
}
}
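For reference, a hedged sketch of the matching writer side of this vInt tail encoding (the committed encoder lives in BlockPostingsWriter's finishTerm; writeVIntBlock is a made-up name for illustration, assuming org.apache.lucene.store.IndexOutput):

  // Illustration only: mirrors what readVIntBlock above expects.
  static void writeVIntBlock(IndexOutput docOut, int[] docDeltas, int[] freqs,
                             int num, boolean indexHasFreq) throws IOException {
    if (indexHasFreq) {
      for(int i=0;i<num;i++) {
        if (freqs[i] == 1) {
          // low bit set means freq == 1, so the separate freq vInt is omitted
          docOut.writeVInt((docDeltas[i] << 1) | 1);
        } else {
          docOut.writeVInt(docDeltas[i] << 1);
          docOut.writeVInt(freqs[i]);
        }
      }
    } else {
      for(int i=0;i<num;i++) {
        docOut.writeVInt(docDeltas[i]);
      }
    }
  }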
static void readBlock(IndexInput in, byte[] encoded, IntBuffer encodedBuffer, int[] buffer) throws IOException {
int header = in.readVInt();
in.readBytes(encoded, 0, ForUtil.getEncodedSize(header));
@ -343,7 +366,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
indexHasPos = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
indexHasPayloads = fieldInfo.hasPayloads();
encoded = new byte[blockSize*4 + 4];
encoded = new byte[blockSize*4];
encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer();
}
@ -386,6 +409,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
}
private void refillDocs() throws IOException {
//System.out.println("["+docFreq+"]"+" refillDoc");
final int left = docFreq - docUpto;
assert left > 0;
@ -394,7 +418,6 @@ public final class BlockPostingsReader extends PostingsReaderBase {
System.out.println(" fill doc block from fp=" + docIn.getFilePointer());
}
readBlock(docIn, encoded, encodedBuffer, docDeltaBuffer);
if (indexHasFreq) {
if (DEBUG) {
System.out.println(" fill freq block from fp=" + docIn.getFilePointer());
@ -402,50 +425,33 @@ public final class BlockPostingsReader extends PostingsReaderBase {
readBlock(docIn, encoded, encodedBuffer, freqBuffer);
}
} else {
// Read vInts:
if (DEBUG) {
System.out.println(" fill last vInt block from fp=" + docIn.getFilePointer());
}
for(int i=0;i<left;i++) {
final int code = docIn.readVInt();
if (indexHasFreq) {
docDeltaBuffer[i] = code >>> 1;
if ((code & 1) != 0) {
freqBuffer[i] = 1;
} else {
freqBuffer[i] = docIn.readVInt();
}
} else {
docDeltaBuffer[i] = code;
}
}
readVIntBlock(docIn, docDeltaBuffer, freqBuffer, left, indexHasFreq);
}
docBufferUpto = 0;
}
@Override
public int nextDoc() throws IOException {
if (DEBUG) {
System.out.println("\nFPR.nextDoc");
}
while (true) {
if (DEBUG) {
System.out.println(" docUpto=" + docUpto + " (of df=" + docFreq + ") docBufferUpto=" + docBufferUpto);
}
if (docUpto == docFreq) {
if (DEBUG) {
System.out.println(" return doc=END");
}
return doc = NO_MORE_DOCS;
}
//System.out.println("["+docFreq+"]"+" nextDoc");
if (docBufferUpto == blockSize) {
refillDocs();
}
if (DEBUG) {
System.out.println(" accum=" + accum + " docDeltaBuffer[" + docBufferUpto + "]=" + docDeltaBuffer[docBufferUpto]);
}
@ -461,23 +467,19 @@ public final class BlockPostingsReader extends PostingsReaderBase {
}
return doc;
}
if (DEBUG) {
System.out.println(" doc=" + accum + " is deleted; try next doc");
}
docBufferUpto++;
}
}
@Override
public int advance(int target) throws IOException {
// nocommit make frq block load lazy/skippable
// nocommit 2 is heuristic guess!!
// nocommit put cheating back! does it help?
// nocommit use skipper!!! it has next last doc id!!
//if (docFreq > blockSize && target - (blockSize - docBufferUpto) - 2*blockSize > accum) {
if (docFreq > blockSize && target - accum > blockSize) {
if (DEBUG) {
@ -506,18 +508,16 @@ public final class BlockPostingsReader extends PostingsReaderBase {
if (newDocUpto > docUpto) {
// Skipper moved
if (DEBUG) {
System.out.println("skipper moved to docUpto=" + newDocUpto + " vs current=" + docUpto + "; docID=" + skipper.getDoc() + " fp=" + skipper.getDocPointer());
}
assert newDocUpto % blockSize == (blockSize-1): "got " + newDocUpto;
docUpto = newDocUpto+1;
// Force block read next:
// Force the next block to be read
docBufferUpto = blockSize;
accum = skipper.getDoc();
docIn.seek(skipper.getDocPointer());
accum = skipper.getDoc(); // actually, this is just lastSkipEntry
docIn.seek(skipper.getDocPointer()); // now point to the block we want to search
}
}
@ -530,11 +530,9 @@ public final class BlockPostingsReader extends PostingsReaderBase {
return doc;
}
}
if (DEBUG) {
System.out.println(" advance return doc=END");
}
return NO_MORE_DOCS;
}
}
@ -604,7 +602,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
this.startDocIn = BlockPostingsReader.this.docIn;
this.docIn = (IndexInput) startDocIn.clone();
this.posIn = (IndexInput) BlockPostingsReader.this.posIn.clone();
encoded = new byte[blockSize*4 + 4];
encoded = new byte[blockSize*4];
encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer();
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
indexHasPayloads = fieldInfo.hasPayloads();
@ -656,35 +654,23 @@ public final class BlockPostingsReader extends PostingsReaderBase {
}
private void refillDocs() throws IOException {
//System.out.println("["+docFreq+"]"+" refillDoc");
final int left = docFreq - docUpto;
assert left > 0;
if (left >= blockSize) {
if (DEBUG) {
System.out.println(" fill doc block from fp=" + docIn.getFilePointer());
}
readBlock(docIn, encoded, encodedBuffer, docDeltaBuffer);
if (DEBUG) {
System.out.println(" fill freq block from fp=" + docIn.getFilePointer());
}
readBlock(docIn, encoded, encodedBuffer, freqBuffer);
} else {
// Read vInts:
if (DEBUG) {
System.out.println(" fill last vInt doc block from fp=" + docIn.getFilePointer());
}
for(int i=0;i<left;i++) {
final int code = docIn.readVInt();
docDeltaBuffer[i] = code >>> 1;
if ((code & 1) != 0) {
freqBuffer[i] = 1;
} else {
freqBuffer[i] = docIn.readVInt();
}
}
readVIntBlock(docIn, docDeltaBuffer, freqBuffer, left, true);
}
docBufferUpto = 0;
}
@ -712,7 +698,6 @@ public final class BlockPostingsReader extends PostingsReaderBase {
} else {
posDeltaBuffer[i] = code;
}
if (indexHasOffsets) {
posIn.readVInt();
posIn.readVInt();
@ -728,24 +713,20 @@ public final class BlockPostingsReader extends PostingsReaderBase {
@Override
public int nextDoc() throws IOException {
if (DEBUG) {
System.out.println(" FPR.nextDoc");
}
while (true) {
if (DEBUG) {
System.out.println(" docUpto=" + docUpto + " (of df=" + docFreq + ") docBufferUpto=" + docBufferUpto);
}
if (docUpto == docFreq) {
return doc = NO_MORE_DOCS;
}
//System.out.println("["+docFreq+"]"+" nextDoc");
if (docBufferUpto == blockSize) {
refillDocs();
}
if (DEBUG) {
System.out.println(" accum=" + accum + " docDeltaBuffer[" + docBufferUpto + "]=" + docDeltaBuffer[docBufferUpto]);
}
@ -757,13 +738,12 @@ public final class BlockPostingsReader extends PostingsReaderBase {
if (liveDocs == null || liveDocs.get(accum)) {
doc = accum;
position = 0;
if (DEBUG) {
System.out.println(" return doc=" + doc + " freq=" + freq + " posPendingCount=" + posPendingCount);
}
position = 0;
return doc;
}
if (DEBUG) {
System.out.println(" doc=" + accum + " is deleted; try next doc");
}
@ -782,11 +762,9 @@ public final class BlockPostingsReader extends PostingsReaderBase {
// nocommit use skipper!!! it has next last doc id!!
//if (docFreq > blockSize && target - (blockSize - docBufferUpto) - 2*blockSize > accum) {
if (docFreq > blockSize && target - accum > blockSize) {
if (DEBUG) {
System.out.println(" try skipper");
}
if (skipper == null) {
// Lazy init: first time this enum has ever been used for skipping
if (DEBUG) {
@ -815,7 +793,6 @@ public final class BlockPostingsReader extends PostingsReaderBase {
if (newDocUpto > docUpto) {
// Skipper moved
if (DEBUG) {
System.out.println(" skipper moved to docUpto=" + newDocUpto + " vs current=" + docUpto + "; docID=" + skipper.getDoc() + " fp=" + skipper.getDocPointer() + " pos.fp=" + skipper.getPosPointer() + " pos.bufferUpto=" + skipper.getPosBufferUpto());
}
@ -823,7 +800,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
assert newDocUpto % blockSize == (blockSize-1): "got " + newDocUpto;
docUpto = newDocUpto+1;
// Force block read next:
// Force the next block to be read
docBufferUpto = blockSize;
accum = skipper.getDoc();
docIn.seek(skipper.getDocPointer());
@ -1024,7 +1001,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
this.docIn = (IndexInput) startDocIn.clone();
this.posIn = (IndexInput) BlockPostingsReader.this.posIn.clone();
this.payIn = (IndexInput) BlockPostingsReader.this.payIn.clone();
encoded = new byte[blockSize*4 + 4];
encoded = new byte[blockSize*4];
encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer();
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
if (indexHasOffsets) {
@ -1096,6 +1073,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
}
private void refillDocs() throws IOException {
//System.out.println("["+docFreq+"]"+" refillDoc");
final int left = docFreq - docUpto;
assert left > 0;
@ -1103,28 +1081,16 @@ public final class BlockPostingsReader extends PostingsReaderBase {
if (DEBUG) {
System.out.println(" fill doc block from fp=" + docIn.getFilePointer());
}
readBlock(docIn, encoded, encodedBuffer, docDeltaBuffer);
if (DEBUG) {
System.out.println(" fill freq block from fp=" + docIn.getFilePointer());
}
readBlock(docIn, encoded, encodedBuffer, freqBuffer);
} else {
// Read vInts:
if (DEBUG) {
System.out.println(" fill last vInt doc block from fp=" + docIn.getFilePointer());
}
for(int i=0;i<left;i++) {
final int code = docIn.readVInt();
docDeltaBuffer[i] = code >>> 1;
if ((code & 1) != 0) {
freqBuffer[i] = 1;
} else {
freqBuffer[i] = docIn.readVInt();
}
}
readVIntBlock(docIn, docDeltaBuffer, freqBuffer, left, true);
}
docBufferUpto = 0;
}
@ -1209,29 +1175,24 @@ public final class BlockPostingsReader extends PostingsReaderBase {
@Override
public int nextDoc() throws IOException {
if (DEBUG) {
System.out.println(" FPR.nextDoc");
}
if (indexHasPayloads) {
payloadByteUpto += payloadLength;
payloadLength = 0;
}
while (true) {
if (DEBUG) {
System.out.println(" docUpto=" + docUpto + " (of df=" + docFreq + ") docBufferUpto=" + docBufferUpto);
}
if (docUpto == docFreq) {
return doc = NO_MORE_DOCS;
}
//System.out.println("["+docFreq+"]"+" nextDoc");
if (docBufferUpto == blockSize) {
refillDocs();
}
if (DEBUG) {
System.out.println(" accum=" + accum + " docDeltaBuffer[" + docBufferUpto + "]=" + docDeltaBuffer[docBufferUpto]);
}
@ -1251,7 +1212,6 @@ public final class BlockPostingsReader extends PostingsReaderBase {
lastStartOffset = 0;
return doc;
}
if (DEBUG) {
System.out.println(" doc=" + accum + " is deleted; try next doc");
}
@ -1303,15 +1263,13 @@ public final class BlockPostingsReader extends PostingsReaderBase {
if (newDocUpto > docUpto) {
// Skipper moved
if (DEBUG) {
System.out.println(" skipper moved to docUpto=" + newDocUpto + " vs current=" + docUpto + "; docID=" + skipper.getDoc() + " fp=" + skipper.getDocPointer() + " pos.fp=" + skipper.getPosPointer() + " pos.bufferUpto=" + skipper.getPosBufferUpto() + " pay.fp=" + skipper.getPayPointer() + " lastStartOffset=" + lastStartOffset);
}
assert newDocUpto % blockSize == (blockSize-1): "got " + newDocUpto;
docUpto = newDocUpto+1;
// Force block read next:
// Force the next block to be read
docBufferUpto = blockSize;
accum = skipper.getDoc();
docIn.seek(skipper.getDocPointer());

View File

@ -37,8 +37,15 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
// nocommit javadocs
/**
* Concrete class that writes the docId (and maybe freq, pos, offset, payload)
* lists in the block postings format.
*
* The postings list for each term is stored separately.
*
* @see BlockSkipWriter for details about skip settings and the postings layout.
*
*/
public final class BlockPostingsWriter extends PostingsWriterBase {
private boolean DEBUG = BlockPostingsReader.DEBUG;
@ -60,9 +67,7 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
final IndexOutput posOut;
final IndexOutput payOut;
static final int DEFAULT_BLOCK_SIZE = 128;
final int blockSize;
final static int blockSize = BlockPostingsFormat.BLOCK_SIZE;
private IndexOutput termsOut;
@ -91,12 +96,12 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
private int payloadByteUpto;
private int lastBlockDocID;
private boolean saveNextPosBlock;
private long lastBlockPosFP;
private long lastBlockPayFP;
private int lastBlockPosBufferUpto;
private int lastBlockStartOffset;
private int lastBlockPayloadByteUpto;
private int lastDocID;
private int lastPosition;
private int lastStartOffset;
@ -107,9 +112,8 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
private final BlockSkipWriter skipWriter;
public BlockPostingsWriter(SegmentWriteState state, int blockSize) throws IOException {
public BlockPostingsWriter(SegmentWriteState state) throws IOException {
super();
this.blockSize = blockSize;
docOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPostingsFormat.DOC_EXTENSION),
state.context);
@ -164,14 +168,14 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
docDeltaBuffer = new int[blockSize];
freqBuffer = new int[blockSize];
skipWriter = new BlockSkipWriter(blockSize,
maxSkipLevels,
skipWriter = new BlockSkipWriter(maxSkipLevels,
blockSize,
state.segmentInfo.getDocCount(),
docOut,
posOut,
payOut);
encoded = new byte[blockSize*4 + 4];
encoded = new byte[blockSize*4];
encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer();
}
@ -201,8 +205,8 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
payTermStartFP = payOut.getFilePointer();
}
}
lastBlockDocID = -1;
lastDocID = 0;
lastBlockDocID = -1;
if (DEBUG) {
System.out.println("FPW.startTerm startFP=" + docTermStartFP);
}
@ -211,7 +215,6 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
private void writeBlock(int[] buffer, IndexOutput out) throws IOException {
final int header = ForUtil.compress(buffer, encodedBuffer);
//System.out.println(" block has " + numBytes + " bytes");
out.writeVInt(header);
out.writeBytes(encoded, ForUtil.getEncodedSize(header));
}
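For context, a hedged round-trip sketch of the block format that writeBlock and readBlock share (demo scaffolding, not test code; assumes java.nio.ByteBuffer/IntBuffer and java.util.Arrays):

  int[] values = new int[128];                       // one full block
  for(int i=0;i<values.length;i++) values[i] = i % 17;
  byte[] encoded = new byte[128*4];
  IntBuffer encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer();
  int header = ForUtil.compress(values, encodedBuffer);
  // on disk this becomes: writeVInt(header) + ForUtil.getEncodedSize(header) bytes
  int[] decoded = new int[128];
  ForUtil.decompress(encodedBuffer, decoded, header);
  assert Arrays.equals(values, decoded);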
@ -219,61 +222,25 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
@Override
public void startDoc(int docID, int termDocFreq) throws IOException {
if (DEBUG) {
System.out.println("FPW.startDoc docID=" + docID);
System.out.println("FPW.startDoc docID["+docBufferUpto+"]=" + docID);
}
// nocommit do this in finishDoc... but does it fail...?
// is it not always called...?
if (posOut != null && saveNextPosBlock) {
lastBlockPosFP = posOut.getFilePointer();
if (payOut != null) {
lastBlockPayFP = payOut.getFilePointer();
}
lastBlockPosBufferUpto = posBufferUpto;
lastBlockStartOffset = lastStartOffset;
lastBlockPayloadByteUpto = payloadByteUpto;
saveNextPosBlock = false;
if (DEBUG) {
System.out.println(" now save lastBlockPosFP=" + lastBlockPosFP + " lastBlockPosBufferUpto=" + lastBlockPosBufferUpto + " lastBlockPayloadByteUpto=" + lastBlockPayloadByteUpto);
}
}
final int docDelta = docID - lastDocID;
if (docID < 0 || (docCount > 0 && docDelta <= 0)) {
throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " ) (docOut: " + docOut + ")");
}
lastDocID = docID;
docDeltaBuffer[docBufferUpto] = docDelta;
if (DEBUG) {
System.out.println(" docDeltaBuffer[" + docBufferUpto + "]=" + docDelta);
}
// if (DEBUG) {
// System.out.println(" docDeltaBuffer[" + docBufferUpto + "]=" + docDelta);
// }
if (fieldHasFreqs) {
freqBuffer[docBufferUpto] = termDocFreq;
}
docBufferUpto++;
docCount++;
if (docBufferUpto == blockSize) {
// nocommit maybe instead of buffering skip before
// writing a block based on last block's end data
// ... we could buffer after writing the block? only
// iffiness with that approach is it could be a
// pointlness skip? like we may stop adding docs
// right after that, then we have skip point AFTER
// last doc. the thing is, in finishTerm we are
// already sometimes adding a skip point AFTER the
// last doc?
if (lastBlockDocID != -1) {
if (DEBUG) {
System.out.println(" bufferSkip at writeBlock: lastDocID=" + lastBlockDocID + " docCount=" + (docCount-blockSize));
}
skipWriter.bufferSkip(lastBlockDocID, docCount-blockSize, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockStartOffset, lastBlockPayloadByteUpto);
}
lastBlockDocID = docID;
saveNextPosBlock = true;
if (DEBUG) {
System.out.println(" write docDelta block @ fp=" + docOut.getFilePointer());
}
@ -284,9 +251,11 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
}
writeBlock(freqBuffer, docOut);
}
docBufferUpto = 0;
// NOTE: don't set docBufferUpto back to 0 here;
// finishDoc will do so (because it needs to see that
// the block was filled so it can save skip data)
}
lastDocID = docID;
lastPosition = 0;
lastStartOffset = 0;
}
@ -294,9 +263,9 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
/** Add a new position & payload */
@Override
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
if (DEBUG) {
System.out.println("FPW.addPosition pos=" + position + " posBufferUpto=" + posBufferUpto + (fieldHasPayloads ? " payloadByteUpto=" + payloadByteUpto: ""));
}
// if (DEBUG) {
// System.out.println("FPW.addPosition pos=" + position + " posBufferUpto=" + posBufferUpto + (fieldHasPayloads ? " payloadByteUpto=" + payloadByteUpto: ""));
// }
posDeltaBuffer[posBufferUpto] = position - lastPosition;
if (fieldHasPayloads) {
if (payload == null || payload.length == 0) {
@ -343,7 +312,39 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
}
@Override
public void finishDoc() {
public void finishDoc() throws IOException {
// We have collected a block of docs and a new doc has arrived;
// we should write the skip data, as well as the postings list, for
// the completed block
if (lastBlockDocID != -1 && docBufferUpto == 1) {
// nocommit move to startDoc? ie we can write skip
// data as soon as the next doc starts...
if (DEBUG) {
System.out.println(" bufferSkip at writeBlock: lastDocID=" + lastBlockDocID + " docCount=" + (docCount-1));
}
skipWriter.bufferSkip(lastBlockDocID, docCount-1, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockStartOffset, lastBlockPayloadByteUpto);
}
// Since we don't know the df for the current term, we have to buffer
// the skip data for each block, and write it to the skip file once
// a new doc arrives.
if (docBufferUpto == blockSize) {
lastBlockDocID = lastDocID;
if (posOut != null) {
if (payOut != null) {
lastBlockPayFP = payOut.getFilePointer();
}
lastBlockPosFP = posOut.getFilePointer();
lastBlockPosBufferUpto = posBufferUpto;
lastBlockStartOffset = lastStartOffset;
lastBlockPayloadByteUpto = payloadByteUpto;
}
if (DEBUG) {
System.out.println(" docBufferUpto="+docBufferUpto+" now get lastBlockDocID="+lastBlockDocID+" lastBlockPosFP=" + lastBlockPosFP + " lastBlockPosBufferUpto=" + lastBlockPosBufferUpto + " lastBlockPayloadByteUpto=" + lastBlockPayloadByteUpto);
}
docBufferUpto = 0;
}
}
private static class PendingTerm {
@ -367,7 +368,6 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
/** Called when we are done adding docs to this term */
@Override
public void finishTerm(TermStats stats) throws IOException {
assert stats.docFreq > 0;
// TODO: wasteful we are counting this (counting # docs
@ -378,19 +378,6 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
System.out.println("FPW.finishTerm docFreq=" + stats.docFreq);
}
// nocommit silly that skipper must write skip when we no
// postings come after it, but if we don't do this, skip
// reader incorrectly thinks it can read another level 0
// skip entry here!:
//if (docCount > blockSize && docBufferUpto > 0) {
if (docCount > blockSize) {
final int lastDocCount = blockSize*(docCount/blockSize);
if (DEBUG) {
System.out.println(" bufferSkip at finishTerm: lastDocID=" + lastBlockDocID + " docCount=" + lastDocCount);
}
skipWriter.bufferSkip(lastBlockDocID, lastDocCount, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockStartOffset, lastBlockPayloadByteUpto);
}
if (DEBUG) {
if (docBufferUpto > 0) {
System.out.println(" write doc/freq vInt block (count=" + docBufferUpto + ") at fp=" + docOut.getFilePointer() + " docTermStartFP=" + docTermStartFP);

View File

@ -24,14 +24,35 @@ import org.apache.lucene.codecs.MultiLevelSkipListReader;
import org.apache.lucene.store.IndexInput;
/**
* Implements the skip list reader for the 4.0 posting list format
* Implements the skip list reader for the block postings format
* that stores positions and payloads.
*
* @see Lucene40PostingsFormat
* @lucene.experimental
* Although this skipper uses MultiLevelSkipListReader as an interface,
* its definition of a skip position is a little different.
*
* For example, when skipInterval = blockSize = 3 and df = 2*skipInterval = 6:
*
* 0 1 2 3 4 5
* d d d d d d    (posting list)
*     ^     ^    (skip point in MultiLevelSkipListWriter)
*     ^          (skip point in BlockSkipWriter)
*
* In this case, MultiLevelSkipListReader will use the last document as a skip point,
* while BlockSkipReader should assume that no skip point comes there.
*
* If we used the interface directly in BlockSkipReader, it might foolishly try to read
* more skip data after the only skip point has been loaded.
*
* To illustrate: if we call skipTo(d[5]), then since skip point d[2] has a smaller docId
* and numSkipped+blockSize == df, MultiLevelSkipListReader will assume the skip list
* isn't exhausted yet and try to load a non-existent skip point.
*
* Therefore, we trim df before passing it to the interface; see trim(int).
*
*/
final class BlockSkipReader extends MultiLevelSkipListReader {
private boolean DEBUG = BlockPostingsReader.DEBUG;
private int blockSize;
private long docPointer[];
private long posPointer[];
@ -47,8 +68,9 @@ final class BlockSkipReader extends MultiLevelSkipListReader {
private long lastDocPointer;
private int lastPosBufferUpto;
public BlockSkipReader(IndexInput skipStream, int maxSkipLevels, int skipInterval, boolean hasPos, boolean hasOffsets, boolean hasPayloads) {
super(skipStream, maxSkipLevels, skipInterval);
public BlockSkipReader(IndexInput skipStream, int maxSkipLevels, int blockSize, boolean hasPos, boolean hasOffsets, boolean hasPayloads) {
super(skipStream, maxSkipLevels, blockSize);
this.blockSize = blockSize;
docPointer = new long[maxSkipLevels];
if (hasPos) {
posPointer = new long[maxSkipLevels];
@ -73,8 +95,22 @@ final class BlockSkipReader extends MultiLevelSkipListReader {
}
}
/**
* Trim the original docFreq to tell the skip reader to read the proper number of skip points.
*
* Since our definition in BlockSkip* is a little different from MultiLevelSkip*,
* this trimmed docFreq prevents the skip reader from:
* 1. foolishly reading a non-existent skip point after the last block boundary
* 2. moving into the vInt block
*
*/
protected int trim(int df) {
return df % blockSize == 0? df - 1: df;
}
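A quick worked check of trim, using the blockSize = 3, df = 6 example from the class javadoc (illustration only):

  // df on a block boundary is dropped by one, so the phantom level-0 entry
  // after the last block is never read; other values pass through unchanged
  assert trim(6) == 5;
  assert trim(5) == 5;
  assert trim(7) == 7;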
public void init(long skipPointer, long docBasePointer, long posBasePointer, long payBasePointer, int df) {
super.init(skipPointer, df);
super.init(skipPointer, trim(df));
lastDocPointer = docBasePointer;
lastPosPointer = posBasePointer;
lastPayPointer = payBasePointer;

View File

@ -23,10 +23,29 @@ import java.util.Arrays;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.codecs.MultiLevelSkipListWriter;
// nocommit do we need more frequent skips at level > 0?
// 128*128 is immense? may need to decouple
// nocommit may need to decouple
// baseSkipInterval & theRestSkipInterval?
/**
* Writes skip lists with multiple levels, and supports skipping within a block of packed ints.
*
* Assume that docFreq = 28 and skipInterval = blockSize = 12:
*
* |       block#0       | |       block#1       | |vInts|
* d d d d d d d d d d d d d d d d d d d d d d d d d d d d    (posting list)
*                       ^                       ^             (level 0 skip point)
*
* Note that the skipWriter will ignore the first document in block#0, since
* it is useless as a skip point. Also, we'll never skip into the vInt
* block; we only record skip data at its start point (if it exists).
*
* For each skip point, we will record:
* 1. lastDocID,
* 2. its related file pointers (position, payload),
* 3. related numbers or uptos (position, payload),
* 4. start offset.
*
*/
final class BlockSkipWriter extends MultiLevelSkipListWriter {
private boolean DEBUG = BlockPostingsReader.DEBUG;
@ -52,8 +71,8 @@ final class BlockSkipWriter extends MultiLevelSkipListWriter {
private boolean fieldHasOffsets;
private boolean fieldHasPayloads;
public BlockSkipWriter(int skipInterval, int maxSkipLevels, int docCount, IndexOutput docOut, IndexOutput posOut, IndexOutput payOut) {
super(skipInterval, maxSkipLevels, docCount);
public BlockSkipWriter(int maxSkipLevels, int blockSize, int docCount, IndexOutput docOut, IndexOutput posOut, IndexOutput payOut) {
super(blockSize, maxSkipLevels, docCount);
this.docOut = docOut;
this.posOut = posOut;
this.payOut = payOut;
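To make the class javadoc's diagram concrete, a small sketch that computes the level-0 skip points for docFreq = 28, blockSize = 12 (plain arithmetic under the javadoc's description, not writer code):

  int blockSize = 12, docFreq = 28;
  int fullBlocks = docFreq / blockSize;  // 2 packed blocks; docs 24..27 fall into the vInt tail
  for(int b=0;b<fullBlocks;b++) {
    // a level-0 skip point records the last doc of each filled packed block
    System.out.println("skip point at posting #" + (b*blockSize + blockSize - 1));
  }
  // prints 11 and 23 -- the two carets in the diagram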

View File

@ -22,7 +22,7 @@ import java.nio.IntBuffer;
* Encode all values in normal area with fixed bit width,
* which is determined by the max value in this block.
*/
public class ForUtil {
public final class ForUtil {
protected static final int[] MASK = { 0x00000000,
0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f,
0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
@ -31,24 +31,24 @@ public class ForUtil {
0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff,
0x7fffffff, 0xffffffff};
final static int blockSize = BlockPostingsFormat.BLOCK_SIZE;
/** Compress given int[] into Integer buffer, with For format
*
* @param data uncompressed data
* @param size num of ints to compress
* @param intBuffer integer buffer to hold compressed data
* @return encoded block byte size
* @return the header for the current block
*/
public static int compress(final int[] data, IntBuffer intBuffer) {
int numBits=getNumBits(data);
int numBits = getNumBits(data);
if (numBits == 0) {
return compressDuplicateBlock(data,intBuffer);
return compressDuplicateBlock(data, intBuffer);
}
int size=data.length;
int encodedSize = (size*numBits+31)/32;
int encodedSize = (blockSize*numBits+31)/32;
for (int i=0; i<size; ++i) {
encodeNormalValue(intBuffer,i,data[i], numBits);
for (int i=0; i<blockSize; ++i) {
encodeNormalValue(intBuffer, i, data[i], numBits);
}
return getHeader(encodedSize, numBits);
@ -58,7 +58,7 @@ public class ForUtil {
* Save only one int when all values in the block are equal
*/
static int compressDuplicateBlock(final int[] data, IntBuffer intBuffer) {
intBuffer.put(0,data[0]);
intBuffer.put(0, data[0]);
return getHeader(1, 0);
}
@ -66,6 +66,7 @@ public class ForUtil {
*
* @param intBuffer integer buffer to hold compressed data
* @param data int array to hold uncompressed data
* @param header header of the current block, which contains numFrameBits
*/
public static void decompress(IntBuffer intBuffer, int[] data, int header) {
// since this buffer is reused at upper level, rewind first
@ -73,22 +74,12 @@ public class ForUtil {
// nocommit assert header isn't "malformed", ie besides
// numBytes / bit-width there is nothing else!
int numBits = ((header >> 8) & MASK[6]);
decompressCore(intBuffer, data, numBits);
}
/**
* IntBuffer will not be rewinded in this method, therefore
* caller should ensure that the position is set to the first
* encoded int before decoding.
*/
static void decompressCore(IntBuffer intBuffer, int[] data, int numBits) {
assert numBits<=32;
assert numBits>=0;
// TODO: PackedIntsDecompress is hardwired to size==128 only
switch(numBits) {
case 0: PackedIntsDecompress.decode0(intBuffer, data); break;
case 1: PackedIntsDecompress.decode1(intBuffer, data); break;
@ -163,6 +154,7 @@ public class ForUtil {
return optBits;
}
// nocommit: we must have a util function for this, hmm?
protected static boolean isAllEqual(final int[] data) {
int len = data.length;
int v = data[0];
@ -177,23 +169,21 @@ public class ForUtil {
/**
* Generate the 4 byte header, which contains (from lsb to msb):
*
* 8 bits for encoded block int size (excluded header, this limits DEFAULT_BLOCK_SIZE <= 2^8)
* 6 bits for num of frame bits (when 0, values in this block are all the same)
* other bits unused
* other bits for the encoded block int size (header excluded), so we can use crazy block sizes
*
*/
static int getHeader(int encodedSize, int numBits) {
return (encodedSize)
| ((numBits) << 8);
return numBits | (encodedSize << 6);
}
/**
* Expert: get metadata from header.
*/
public static int getEncodedSize(int header) {
return ((header & MASK[8]))*4;
static int getNumBits(int header) {
return ((header & MASK[6]));
}
public static int getNumBits(int header) {
return ((header >> 8) & MASK[6]);
static int getEncodedSize(int header) {
return ((header >>> 6))*4;
}
}
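To make the new header layout concrete, a standalone demo (HeaderDemo is my own name, not part of the patch):

  // low 6 bits = numFrameBits; remaining bits = encoded size in ints, so the
  // block size is no longer capped by an 8-bit size field
  public class HeaderDemo {
    static int getHeader(int encodedSize, int numBits) {
      return numBits | (encodedSize << 6);
    }
    public static void main(String[] args) {
      // a 128-int block at 7 bits/value packs into (128*7+31)/32 = 28 ints
      int header = getHeader(28, 7);
      System.out.println("numBits=" + (header & 0x3f));     // 7
      System.out.println("bytes=" + ((header >>> 6) * 4));  // 112
      // all-equal block: a single int is stored and numBits == 0
      int dup = getHeader(1, 0);
      System.out.println((dup & 0x3f) + " " + (dup >>> 6)); // prints: 0 1
    }
  }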