mirror of https://github.com/apache/lucene.git
LUCENE-3892: some cleanup/refactoring
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/pforcodec_3892@1369697 13f79535-47bb-0310-9956-ffa450edef68
parent aed1b5d760
commit de9586104c

BlockPostingsFormat.java
@@ -31,8 +31,8 @@ import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.util.IOUtils;
 
 /**
- * Pass ForFactory to a PostingsWriter/ReaderBase, and get
- * customized postings format plugged.
+ * Encodes/decode postings in packed int blocks for faster
+ * decode.
  */
 public final class BlockPostingsFormat extends PostingsFormat {
   public static final String DOC_EXTENSION = "doc";
@@ -41,7 +41,8 @@ public final class BlockPostingsFormat extends PostingsFormat {
 
   private final int minTermBlockSize;
   private final int maxTermBlockSize;
-  public final static int DEFAULT_BLOCK_SIZE = 128;
+
+  public final static int BLOCK_SIZE = 128;
 
   public BlockPostingsFormat() {
     this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
@@ -57,13 +58,12 @@ public final class BlockPostingsFormat extends PostingsFormat {
 
   @Override
   public String toString() {
-    return getName() + "(blocksize=" + DEFAULT_BLOCK_SIZE + ")";
+    return getName() + "(blocksize=" + BLOCK_SIZE + ")";
   }
 
   @Override
   public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
-    // TODO: implement a new PostingsWriterBase to improve skip-settings
-    PostingsWriterBase postingsWriter = new BlockPostingsWriter(state, 128);
+    PostingsWriterBase postingsWriter = new BlockPostingsWriter(state);
 
     boolean success = false;
     try {
@@ -86,8 +86,7 @@ public final class BlockPostingsFormat extends PostingsFormat {
                                                     state.fieldInfos,
                                                     state.segmentInfo,
                                                     state.context,
-                                                    state.segmentSuffix,
-                                                    128);
+                                                    state.segmentSuffix);
     boolean success = false;
     try {
       FieldsProducer ret = new BlockTreeTermsReader(state.dir,
BlockPostingsReader.java
@@ -43,7 +43,14 @@ import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
 
 // nocommit javadocs
 
+/**
+ * Concrete class that reads docId(maybe frq,pos,offset,payloads) list
+ * with postings format.
+ *
+ * @see BlockSkipReader for details
+ *
+ */
 public final class BlockPostingsReader extends PostingsReaderBase {
 
   private final IndexInput docIn;
@@ -56,9 +63,9 @@ public final class BlockPostingsReader extends PostingsReaderBase {
   final String segment;
 
   // NOTE: not private to avoid access$NNN methods:
-  final int blockSize;
+  final static int blockSize = BlockPostingsFormat.BLOCK_SIZE;
 
-  public BlockPostingsReader(Directory dir, FieldInfos fieldInfos, SegmentInfo segmentInfo, IOContext ioContext, String segmentSuffix, int blockSize) throws IOException {
+  public BlockPostingsReader(Directory dir, FieldInfos fieldInfos, SegmentInfo segmentInfo, IOContext ioContext, String segmentSuffix) throws IOException {
     boolean success = false;
     segment = segmentInfo.name;
     IndexInput docIn = null;
@@ -99,8 +106,6 @@ public final class BlockPostingsReader extends PostingsReaderBase {
         IOUtils.closeWhileHandlingException(docIn, posIn, payIn);
       }
     }
-
-    this.blockSize = blockSize;
   }
 
   @Override
@@ -116,6 +121,24 @@ public final class BlockPostingsReader extends PostingsReaderBase {
     }
   }
 
+  static void readVIntBlock(IndexInput docIn, int[] docBuffer, int[] freqBuffer, int num, boolean indexHasFreq) throws IOException {
+    if (indexHasFreq) {
+      for(int i=0;i<num;i++) {
+        final int code = docIn.readVInt();
+        docBuffer[i] = code >>> 1;
+        if ((code & 1) != 0) {
+          freqBuffer[i] = 1;
+        } else {
+          freqBuffer[i] = docIn.readVInt();
+        }
+      }
+    } else {
+      for(int i=0;i<num;i++) {
+        docBuffer[i] = docIn.readVInt();
+      }
+    }
+  }
+
   static void readBlock(IndexInput in, byte[] encoded, IntBuffer encodedBuffer, int[] buffer) throws IOException {
     int header = in.readVInt();
     in.readBytes(encoded, 0, ForUtil.getEncodedSize(header));
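The new readVIntBlock helper centralizes the tail-block decode that was previously inlined in each enum (see the hunks below). For reference, a minimal sketch of the matching encode side, illustrative only and not part of this commit: when the field has freqs, each doc delta is shifted left one bit and the low bit marks the common freq == 1 case, so a freq is only written when it differs from 1.

    // Illustrative encode-side counterpart of readVIntBlock (not in this patch):
    static void writeVIntBlock(IndexOutput docOut, int[] docDeltaBuffer, int[] freqBuffer,
                               int num, boolean fieldHasFreqs) throws IOException {
      for (int i = 0; i < num; i++) {
        if (fieldHasFreqs) {
          if (freqBuffer[i] == 1) {
            docOut.writeVInt((docDeltaBuffer[i] << 1) | 1);  // low bit set: freq == 1 implied
          } else {
            docOut.writeVInt(docDeltaBuffer[i] << 1);        // low bit clear: freq follows
            docOut.writeVInt(freqBuffer[i]);
          }
        } else {
          docOut.writeVInt(docDeltaBuffer[i]);
        }
      }
    }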
@@ -343,7 +366,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
       indexHasPos = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
       indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
       indexHasPayloads = fieldInfo.hasPayloads();
-      encoded = new byte[blockSize*4 + 4];
+      encoded = new byte[blockSize*4];
       encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer();
     }
 
@@ -386,6 +409,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
     }
 
     private void refillDocs() throws IOException {
+      //System.out.println("["+docFreq+"]"+" refillDoc");
      final int left = docFreq - docUpto;
      assert left > 0;
 
@@ -394,7 +418,6 @@ public final class BlockPostingsReader extends PostingsReaderBase {
           System.out.println("  fill doc block from fp=" + docIn.getFilePointer());
         }
         readBlock(docIn, encoded, encodedBuffer, docDeltaBuffer);
-
         if (indexHasFreq) {
           if (DEBUG) {
             System.out.println("  fill freq block from fp=" + docIn.getFilePointer());
@@ -402,50 +425,33 @@ public final class BlockPostingsReader extends PostingsReaderBase {
           readBlock(docIn, encoded, encodedBuffer, freqBuffer);
         }
       } else {
         // Read vInts:
         if (DEBUG) {
           System.out.println("  fill last vInt block from fp=" + docIn.getFilePointer());
         }
-        for(int i=0;i<left;i++) {
-          final int code = docIn.readVInt();
-          if (indexHasFreq) {
-            docDeltaBuffer[i] = code >>> 1;
-            if ((code & 1) != 0) {
-              freqBuffer[i] = 1;
-            } else {
-              freqBuffer[i] = docIn.readVInt();
-            }
-          } else {
-            docDeltaBuffer[i] = code;
-          }
-        }
+        readVIntBlock(docIn, docDeltaBuffer, freqBuffer, left, indexHasFreq);
       }
       docBufferUpto = 0;
     }
 
     @Override
     public int nextDoc() throws IOException {
 
      if (DEBUG) {
        System.out.println("\nFPR.nextDoc");
      }
 
      while (true) {
        if (DEBUG) {
          System.out.println("  docUpto=" + docUpto + " (of df=" + docFreq + ") docBufferUpto=" + docBufferUpto);
        }
 
        if (docUpto == docFreq) {
          if (DEBUG) {
            System.out.println("  return doc=END");
          }
          return doc = NO_MORE_DOCS;
        }
 
+        //System.out.println("["+docFreq+"]"+" nextDoc");
        if (docBufferUpto == blockSize) {
          refillDocs();
        }
 
        if (DEBUG) {
          System.out.println("  accum=" + accum + " docDeltaBuffer[" + docBufferUpto + "]=" + docDeltaBuffer[docBufferUpto]);
        }
 
@@ -461,23 +467,19 @@ public final class BlockPostingsReader extends PostingsReaderBase {
          }
          return doc;
        }
 
        if (DEBUG) {
          System.out.println("  doc=" + accum + " is deleted; try next doc");
        }
 
        docBufferUpto++;
      }
    }
 
    @Override
    public int advance(int target) throws IOException {
      // nocommit make frq block load lazy/skippable
-
-      // nocommit 2 is heuristic guess!!
-      // nocommit put cheating back! does it help?
-
      // nocommit use skipper!!! it has next last doc id!!
      //if (docFreq > blockSize && target - (blockSize - docBufferUpto) - 2*blockSize > accum) {
      if (docFreq > blockSize && target - accum > blockSize) {
 
        if (DEBUG) {
@@ -506,18 +508,16 @@ public final class BlockPostingsReader extends PostingsReaderBase {
 
        if (newDocUpto > docUpto) {
          // Skipper moved
 
          if (DEBUG) {
            System.out.println("skipper moved to docUpto=" + newDocUpto + " vs current=" + docUpto + "; docID=" + skipper.getDoc() + " fp=" + skipper.getDocPointer());
          }
 
          assert newDocUpto % blockSize == (blockSize-1): "got " + newDocUpto;
          docUpto = newDocUpto+1;
-
-          // Force block read next:
+          // Force to read next block
          docBufferUpto = blockSize;
-          accum = skipper.getDoc();
-          docIn.seek(skipper.getDocPointer());
+          accum = skipper.getDoc();               // actually, this is just lastSkipEntry
+          docIn.seek(skipper.getDocPointer());    // now point to the block we want to search
        }
      }
 
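A small worked example of the invariant asserted above (editorial illustration, assuming BLOCK_SIZE = 128): skip entries are only recorded at block boundaries, so any landing point must be the last doc of a filled block.

    // Illustration only: valid skipper landings with blockSize = 128.
    for (int newDocUpto : new int[] {127, 255, 383}) {
      assert newDocUpto % 128 == 128 - 1;  // last doc of a filled block
      int docUpto = newDocUpto + 1;        // resume at the first doc of the next block
    }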
@@ -530,11 +530,9 @@ public final class BlockPostingsReader extends PostingsReaderBase {
          return doc;
        }
      }
-
      if (DEBUG) {
        System.out.println("  advance return doc=END");
      }
-
      return NO_MORE_DOCS;
    }
  }
@@ -604,7 +602,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
      this.startDocIn = BlockPostingsReader.this.docIn;
      this.docIn = (IndexInput) startDocIn.clone();
      this.posIn = (IndexInput) BlockPostingsReader.this.posIn.clone();
-      encoded = new byte[blockSize*4 + 4];
+      encoded = new byte[blockSize*4];
      encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer();
      indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
      indexHasPayloads = fieldInfo.hasPayloads();
@@ -656,35 +654,23 @@ public final class BlockPostingsReader extends PostingsReaderBase {
    }
 
    private void refillDocs() throws IOException {
+      //System.out.println("["+docFreq+"]"+" refillDoc");
      final int left = docFreq - docUpto;
      assert left > 0;
 
      if (left >= blockSize) {
        if (DEBUG) {
          System.out.println("  fill doc block from fp=" + docIn.getFilePointer());
        }
-
        readBlock(docIn, encoded, encodedBuffer, docDeltaBuffer);
-
        if (DEBUG) {
          System.out.println("  fill freq block from fp=" + docIn.getFilePointer());
        }
-
        readBlock(docIn, encoded, encodedBuffer, freqBuffer);
      } else {
        // Read vInts:
        if (DEBUG) {
          System.out.println("  fill last vInt doc block from fp=" + docIn.getFilePointer());
        }
-        for(int i=0;i<left;i++) {
-          final int code = docIn.readVInt();
-          docDeltaBuffer[i] = code >>> 1;
-          if ((code & 1) != 0) {
-            freqBuffer[i] = 1;
-          } else {
-            freqBuffer[i] = docIn.readVInt();
-          }
-        }
+        readVIntBlock(docIn, docDeltaBuffer, freqBuffer, left, true);
      }
      docBufferUpto = 0;
    }
 
@@ -712,7 +698,6 @@ public final class BlockPostingsReader extends PostingsReaderBase {
        } else {
          posDeltaBuffer[i] = code;
        }
-
        if (indexHasOffsets) {
          posIn.readVInt();
          posIn.readVInt();
@@ -728,24 +713,20 @@ public final class BlockPostingsReader extends PostingsReaderBase {
 
    @Override
    public int nextDoc() throws IOException {
-
      if (DEBUG) {
        System.out.println("  FPR.nextDoc");
      }
-
      while (true) {
        if (DEBUG) {
          System.out.println("  docUpto=" + docUpto + " (of df=" + docFreq + ") docBufferUpto=" + docBufferUpto);
        }
-
        if (docUpto == docFreq) {
          return doc = NO_MORE_DOCS;
        }
-
+        //System.out.println("["+docFreq+"]"+" nextDoc");
        if (docBufferUpto == blockSize) {
          refillDocs();
        }
-
        if (DEBUG) {
          System.out.println("  accum=" + accum + " docDeltaBuffer[" + docBufferUpto + "]=" + docDeltaBuffer[docBufferUpto]);
        }
@@ -757,13 +738,12 @@ public final class BlockPostingsReader extends PostingsReaderBase {
 
        if (liveDocs == null || liveDocs.get(accum)) {
          doc = accum;
-          position = 0;
          if (DEBUG) {
            System.out.println("  return doc=" + doc + " freq=" + freq + " posPendingCount=" + posPendingCount);
          }
+          position = 0;
          return doc;
        }
 
        if (DEBUG) {
          System.out.println("  doc=" + accum + " is deleted; try next doc");
        }
@@ -782,11 +762,9 @@ public final class BlockPostingsReader extends PostingsReaderBase {
      // nocommit use skipper!!! it has next last doc id!!
      //if (docFreq > blockSize && target - (blockSize - docBufferUpto) - 2*blockSize > accum) {
      if (docFreq > blockSize && target - accum > blockSize) {
-
        if (DEBUG) {
          System.out.println("  try skipper");
        }
-
        if (skipper == null) {
          // Lazy init: first time this enum has ever been used for skipping
          if (DEBUG) {
@@ -815,7 +793,6 @@ public final class BlockPostingsReader extends PostingsReaderBase {
 
        if (newDocUpto > docUpto) {
          // Skipper moved
-
          if (DEBUG) {
            System.out.println("  skipper moved to docUpto=" + newDocUpto + " vs current=" + docUpto + "; docID=" + skipper.getDoc() + " fp=" + skipper.getDocPointer() + " pos.fp=" + skipper.getPosPointer() + " pos.bufferUpto=" + skipper.getPosBufferUpto());
          }
@@ -823,7 +800,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
          assert newDocUpto % blockSize == (blockSize-1): "got " + newDocUpto;
          docUpto = newDocUpto+1;
 
-          // Force block read next:
+          // Force to read next block
          docBufferUpto = blockSize;
          accum = skipper.getDoc();
          docIn.seek(skipper.getDocPointer());
@@ -1024,7 +1001,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
      this.docIn = (IndexInput) startDocIn.clone();
      this.posIn = (IndexInput) BlockPostingsReader.this.posIn.clone();
      this.payIn = (IndexInput) BlockPostingsReader.this.payIn.clone();
-      encoded = new byte[blockSize*4 + 4];
+      encoded = new byte[blockSize*4];
      encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer();
      indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
      if (indexHasOffsets) {
@@ -1096,6 +1073,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
    }
 
    private void refillDocs() throws IOException {
+      //System.out.println("["+docFreq+"]"+" refillDoc");
      final int left = docFreq - docUpto;
      assert left > 0;
 
@@ -1103,28 +1081,16 @@ public final class BlockPostingsReader extends PostingsReaderBase {
      if (DEBUG) {
        System.out.println("  fill doc block from fp=" + docIn.getFilePointer());
      }
-
      readBlock(docIn, encoded, encodedBuffer, docDeltaBuffer);
-
      if (DEBUG) {
        System.out.println("  fill freq block from fp=" + docIn.getFilePointer());
      }
-
      readBlock(docIn, encoded, encodedBuffer, freqBuffer);
    } else {
      // Read vInts:
      if (DEBUG) {
        System.out.println("  fill last vInt doc block from fp=" + docIn.getFilePointer());
      }
-      for(int i=0;i<left;i++) {
-        final int code = docIn.readVInt();
-        docDeltaBuffer[i] = code >>> 1;
-        if ((code & 1) != 0) {
-          freqBuffer[i] = 1;
-        } else {
-          freqBuffer[i] = docIn.readVInt();
-        }
-      }
+      readVIntBlock(docIn, docDeltaBuffer, freqBuffer, left, true);
    }
    docBufferUpto = 0;
  }
@@ -1209,29 +1175,24 @@ public final class BlockPostingsReader extends PostingsReaderBase {
 
    @Override
    public int nextDoc() throws IOException {
-
      if (DEBUG) {
        System.out.println("  FPR.nextDoc");
      }
-
      if (indexHasPayloads) {
        payloadByteUpto += payloadLength;
        payloadLength = 0;
      }
 
      while (true) {
        if (DEBUG) {
          System.out.println("  docUpto=" + docUpto + " (of df=" + docFreq + ") docBufferUpto=" + docBufferUpto);
        }
-
        if (docUpto == docFreq) {
          return doc = NO_MORE_DOCS;
        }
-
+        //System.out.println("["+docFreq+"]"+" nextDoc");
        if (docBufferUpto == blockSize) {
          refillDocs();
        }
-
        if (DEBUG) {
          System.out.println("  accum=" + accum + " docDeltaBuffer[" + docBufferUpto + "]=" + docDeltaBuffer[docBufferUpto]);
        }
@@ -1251,7 +1212,6 @@ public final class BlockPostingsReader extends PostingsReaderBase {
          lastStartOffset = 0;
          return doc;
        }
-
        if (DEBUG) {
          System.out.println("  doc=" + accum + " is deleted; try next doc");
        }
@@ -1303,15 +1263,13 @@ public final class BlockPostingsReader extends PostingsReaderBase {
 
        if (newDocUpto > docUpto) {
          // Skipper moved
-
          if (DEBUG) {
            System.out.println("  skipper moved to docUpto=" + newDocUpto + " vs current=" + docUpto + "; docID=" + skipper.getDoc() + " fp=" + skipper.getDocPointer() + " pos.fp=" + skipper.getPosPointer() + " pos.bufferUpto=" + skipper.getPosBufferUpto() + " pay.fp=" + skipper.getPayPointer() + " lastStartOffset=" + lastStartOffset);
          }
-
          assert newDocUpto % blockSize == (blockSize-1): "got " + newDocUpto;
          docUpto = newDocUpto+1;
 
-          // Force block read next:
+          // Force to read next block
          docBufferUpto = blockSize;
          accum = skipper.getDoc();
          docIn.seek(skipper.getDocPointer());
BlockPostingsWriter.java
@@ -37,8 +37,15 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
 
-// nocommit javadocs
-
+/**
+ * Concrete class that writes docId(maybe frq,pos,offset,payloads) list
+ * with postings format.
+ *
+ * Postings list for each term will be stored separately.
+ *
+ * @see BlockSkipWriter for details about skipping setting and postings layout.
+ *
+ */
 public final class BlockPostingsWriter extends PostingsWriterBase {
 
   private boolean DEBUG = BlockPostingsReader.DEBUG;
@@ -60,9 +67,7 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
   final IndexOutput posOut;
   final IndexOutput payOut;
 
-  static final int DEFAULT_BLOCK_SIZE = 128;
-
-  final int blockSize;
+  final static int blockSize = BlockPostingsFormat.BLOCK_SIZE;
 
   private IndexOutput termsOut;
 
@@ -91,12 +96,12 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
   private int payloadByteUpto;
 
   private int lastBlockDocID;
-  private boolean saveNextPosBlock;
   private long lastBlockPosFP;
   private long lastBlockPayFP;
   private int lastBlockPosBufferUpto;
   private int lastBlockStartOffset;
   private int lastBlockPayloadByteUpto;
+
   private int lastDocID;
   private int lastPosition;
   private int lastStartOffset;
@@ -107,9 +112,8 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
 
   private final BlockSkipWriter skipWriter;
 
-  public BlockPostingsWriter(SegmentWriteState state, int blockSize) throws IOException {
+  public BlockPostingsWriter(SegmentWriteState state) throws IOException {
     super();
-    this.blockSize = blockSize;
 
     docOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPostingsFormat.DOC_EXTENSION),
                                           state.context);
@@ -164,14 +168,14 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
     docDeltaBuffer = new int[blockSize];
     freqBuffer = new int[blockSize];
 
-    skipWriter = new BlockSkipWriter(blockSize,
-                                     maxSkipLevels,
+    skipWriter = new BlockSkipWriter(maxSkipLevels,
+                                     blockSize,
                                      state.segmentInfo.getDocCount(),
                                      docOut,
                                      posOut,
                                      payOut);
 
-    encoded = new byte[blockSize*4 + 4];
+    encoded = new byte[blockSize*4];
     encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer();
   }
 
@@ -201,8 +205,8 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
        payTermStartFP = payOut.getFilePointer();
      }
    }
-    lastBlockDocID = -1;
    lastDocID = 0;
+    lastBlockDocID = -1;
    if (DEBUG) {
      System.out.println("FPW.startTerm startFP=" + docTermStartFP);
    }
@@ -211,7 +215,6 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
 
   private void writeBlock(int[] buffer, IndexOutput out) throws IOException {
     final int header = ForUtil.compress(buffer, encodedBuffer);
-    //System.out.println("  block has " + numBytes + " bytes");
     out.writeVInt(header);
     out.writeBytes(encoded, ForUtil.getEncodedSize(header));
   }
@@ -219,61 +222,25 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
   @Override
   public void startDoc(int docID, int termDocFreq) throws IOException {
     if (DEBUG) {
-      System.out.println("FPW.startDoc docID=" + docID);
+      System.out.println("FPW.startDoc docID["+docBufferUpto+"]=" + docID);
     }
 
-    // nocommit do this in finishDoc... but does it fail...?
-    // is it not always called...?
-    if (posOut != null && saveNextPosBlock) {
-      lastBlockPosFP = posOut.getFilePointer();
-      if (payOut != null) {
-        lastBlockPayFP = payOut.getFilePointer();
-      }
-      lastBlockPosBufferUpto = posBufferUpto;
-      lastBlockStartOffset = lastStartOffset;
-      lastBlockPayloadByteUpto = payloadByteUpto;
-      saveNextPosBlock = false;
-      if (DEBUG) {
-        System.out.println("  now save lastBlockPosFP=" + lastBlockPosFP + " lastBlockPosBufferUpto=" + lastBlockPosBufferUpto + " lastBlockPayloadByteUpto=" + lastBlockPayloadByteUpto);
-      }
-    }
-
     final int docDelta = docID - lastDocID;
 
     if (docID < 0 || (docCount > 0 && docDelta <= 0)) {
       throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " ) (docOut: " + docOut + ")");
     }
-    lastDocID = docID;
 
     docDeltaBuffer[docBufferUpto] = docDelta;
-    if (DEBUG) {
-      System.out.println("  docDeltaBuffer[" + docBufferUpto + "]=" + docDelta);
-    }
+    // if (DEBUG) {
+    //   System.out.println("  docDeltaBuffer[" + docBufferUpto + "]=" + docDelta);
+    // }
     if (fieldHasFreqs) {
       freqBuffer[docBufferUpto] = termDocFreq;
     }
 
     docBufferUpto++;
     docCount++;
 
     if (docBufferUpto == blockSize) {
-      // nocommit maybe instead of buffering skip before
-      // writing a block based on last block's end data
-      // ... we could buffer after writing the block?  only
-      // iffiness with that approach is it could be a
-      // pointlness skip?  like we may stop adding docs
-      // right after that, then we have skip point AFTER
-      // last doc.  the thing is, in finishTerm we are
-      // already sometimes adding a skip point AFTER the
-      // last doc?
-      if (lastBlockDocID != -1) {
-        if (DEBUG) {
-          System.out.println("  bufferSkip at writeBlock: lastDocID=" + lastBlockDocID + " docCount=" + (docCount-blockSize));
-        }
-        skipWriter.bufferSkip(lastBlockDocID, docCount-blockSize, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockStartOffset, lastBlockPayloadByteUpto);
-      }
-      lastBlockDocID = docID;
-      saveNextPosBlock = true;
-
       if (DEBUG) {
         System.out.println("  write docDelta block @ fp=" + docOut.getFilePointer());
       }
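One subtlety in the out-of-order check above, noted editorially: lastDocID starts at 0, so a term whose first document is doc 0 produces docDelta == 0; the docCount > 0 guard keeps that legitimate first doc from tripping the check.

    // Illustration only: why the docCount > 0 guard matters for the first doc.
    int lastDocID = 0, docCount = 0;
    int docID = 0;                          // first doc of a term may legally be doc 0
    int docDelta = docID - lastDocID;       // == 0
    boolean outOfOrder = docID < 0 || (docCount > 0 && docDelta <= 0);
    assert !outOfOrder;                     // only subsequent docs must strictly increase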
@@ -284,9 +251,11 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
        }
        writeBlock(freqBuffer, docOut);
      }
-      docBufferUpto = 0;
+      // NOTE: don't set docBufferUpto back to 0 here;
+      // finishDoc will do so (because it needs to see that
+      // the block was filled so it can save skip data)
    }
 
+    lastDocID = docID;
    lastPosition = 0;
    lastStartOffset = 0;
  }
@@ -294,9 +263,9 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
   /** Add a new position & payload */
   @Override
   public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
-    if (DEBUG) {
-      System.out.println("FPW.addPosition pos=" + position + " posBufferUpto=" + posBufferUpto + (fieldHasPayloads ? " payloadByteUpto=" + payloadByteUpto: ""));
-    }
+    // if (DEBUG) {
+    //   System.out.println("FPW.addPosition pos=" + position + " posBufferUpto=" + posBufferUpto + (fieldHasPayloads ? " payloadByteUpto=" + payloadByteUpto: ""));
+    // }
     posDeltaBuffer[posBufferUpto] = position - lastPosition;
     if (fieldHasPayloads) {
       if (payload == null || payload.length == 0) {
@@ -343,7 +312,39 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
   }
 
   @Override
-  public void finishDoc() {
-  }
+  public void finishDoc() throws IOException {
+    // Have collected a block of docs, and get a new doc.
+    // Should write skip data as well as postings list for
+    // current block
+
+    if (lastBlockDocID != -1 && docBufferUpto == 1) {
+      // nocomit move to startDoc?  ie we can write skip
+      // data as soon as the next doc starts...
+      if (DEBUG) {
+        System.out.println("  bufferSkip at writeBlock: lastDocID=" + lastBlockDocID + " docCount=" + (docCount-1));
+      }
+      skipWriter.bufferSkip(lastBlockDocID, docCount-1, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockStartOffset, lastBlockPayloadByteUpto);
+    }
+
+    // Since we don't know df for current term, we had to buffer
+    // those skip data for each block, and when a new doc comes,
+    // write them to skip file.
+    if (docBufferUpto == blockSize) {
+      lastBlockDocID = lastDocID;
+      if (posOut != null) {
+        if (payOut != null) {
+          lastBlockPayFP = payOut.getFilePointer();
+        }
+        lastBlockPosFP = posOut.getFilePointer();
+        lastBlockPosBufferUpto = posBufferUpto;
+        lastBlockStartOffset = lastStartOffset;
+        lastBlockPayloadByteUpto = payloadByteUpto;
+      }
+      if (DEBUG) {
+        System.out.println("  docBufferUpto="+docBufferUpto+" now get lastBlockDocID="+lastBlockDocID+" lastBlockPosFP=" + lastBlockPosFP + " lastBlockPosBufferUpto=" + lastBlockPosBufferUpto + " lastBlockPayloadByteUpto=" + lastBlockPayloadByteUpto);
+      }
+      docBufferUpto = 0;
+    }
+  }
 
   private static class PendingTerm {
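The timing of the moved skip logic, sketched editorially (assuming blockSize = 128): finishDoc captures a block's end state the moment the buffer fills, but the skip entry itself is only flushed once the next doc proves more postings follow.

    // Illustration only: order of events for one term with blockSize = 128.
    //
    // finishDoc after doc #128  (docBufferUpto == blockSize):
    //   capture lastBlockDocID and the doc/pos/pay end state; reset docBufferUpto to 0
    // startDoc / finishDoc for doc #129  (docBufferUpto == 1):
    //   a new doc exists, so bufferSkip(lastBlockDocID, docCount - 1, ...) is safe to write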
@@ -367,7 +368,6 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
   /** Called when we are done adding docs to this term */
   @Override
   public void finishTerm(TermStats stats) throws IOException {
-
     assert stats.docFreq > 0;
 
     // TODO: wasteful we are counting this (counting # docs
@@ -378,19 +378,6 @@ public final class BlockPostingsWriter extends PostingsWriterBase {
       System.out.println("FPW.finishTerm docFreq=" + stats.docFreq);
     }
 
-    // nocommit silly that skipper must write skip when we no
-    // postings come after it, but if we don't do this, skip
-    // reader incorrectly thinks it can read another level 0
-    // skip entry here!:
-    //if (docCount > blockSize && docBufferUpto > 0) {
-    if (docCount > blockSize) {
-      final int lastDocCount = blockSize*(docCount/blockSize);
-      if (DEBUG) {
-        System.out.println("  bufferSkip at finishTerm: lastDocID=" + lastBlockDocID + " docCount=" + lastDocCount);
-      }
-      skipWriter.bufferSkip(lastBlockDocID, lastDocCount, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockStartOffset, lastBlockPayloadByteUpto);
-    }
-
     if (DEBUG) {
       if (docBufferUpto > 0) {
         System.out.println("  write doc/freq vInt block (count=" + docBufferUpto + ") at fp=" + docOut.getFilePointer() + " docTermStartFP=" + docTermStartFP);
BlockSkipReader.java
@@ -24,14 +24,35 @@ import org.apache.lucene.codecs.MultiLevelSkipListReader;
 import org.apache.lucene.store.IndexInput;
 
 /**
- * Implements the skip list reader for the 4.0 posting list format
+ * Implements the skip list reader for block postings format
  * that stores positions and payloads.
  *
- * @see Lucene40PostingsFormat
- * @lucene.experimental
+ * Although this skipper uses MultiLevelSkipListReader as an interface,
+ * its definition of skip position will be a little different.
+ *
+ * For example, when skipInterval = blockSize = 3, df = 2*skipInterval = 6,
+ *
+ * 0 1 2 3 4 5
+ * d d d d d d    (posting list)
+ *     ^     ^    (skip point in MultiLeveSkipWriter)
+ *     ^          (skip point in BlockSkipWriter)
+ *
+ * In this case, MultiLevelSkipListReader will use the last document as a skip point,
+ * while BlockSkipReader should assume no skip point will comes.
+ *
+ * If we use the interface directly in BlockSkipReader, it may silly try to read
+ * another skip data after the only skip point is loaded.
+ *
+ * To illustrate this, we can call skipTo(d[5]), since skip point d[3] has smaller docId,
+ * and numSkipped+blockSize== df, the MultiLevelSkipListReader will assume the skip list
+ * isn't exhausted yet, and try to load a non-existed skip point
+ *
+ * Therefore, we'll trim df before passing it to the interface. see trim(int)
+ *
  */
 final class BlockSkipReader extends MultiLevelSkipListReader {
   private boolean DEBUG = BlockPostingsReader.DEBUG;
+  private int blockSize;
 
   private long docPointer[];
   private long posPointer[];
@@ -47,8 +68,9 @@ final class BlockSkipReader extends MultiLevelSkipListReader {
   private long lastDocPointer;
   private int lastPosBufferUpto;
 
-  public BlockSkipReader(IndexInput skipStream, int maxSkipLevels, int skipInterval, boolean hasPos, boolean hasOffsets, boolean hasPayloads) {
-    super(skipStream, maxSkipLevels, skipInterval);
+  public BlockSkipReader(IndexInput skipStream, int maxSkipLevels, int blockSize, boolean hasPos, boolean hasOffsets, boolean hasPayloads) {
+    super(skipStream, maxSkipLevels, blockSize);
+    this.blockSize = blockSize;
     docPointer = new long[maxSkipLevels];
     if (hasPos) {
       posPointer = new long[maxSkipLevels];
@@ -73,8 +95,22 @@ final class BlockSkipReader extends MultiLevelSkipListReader {
     }
   }
 
+  /**
+   * Trim original docFreq to tell skipReader read proper number of skip points.
+   *
+   * Since our definition in BlockSkip* is a little different from MultiLevelSkip*
+   * This trimed docFreq will prevent skipReader from:
+   * 1. silly reading a non-existed skip point after the last block boundary
+   * 2. moving into the vInt block
+   *
+   */
+  protected int trim(int df) {
+    return df % blockSize == 0? df - 1: df;
+  }
+
   public void init(long skipPointer, long docBasePointer, long posBasePointer, long payBasePointer, int df) {
-    super.init(skipPointer, df);
+    super.init(skipPointer, trim(df));
     lastDocPointer = docBasePointer;
     lastPosPointer = posBasePointer;
     lastPayPointer = payBasePointer;
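Concretely (an editorial check, assuming blockSize = 128): trim only adjusts df when it is an exact multiple of the block size, the case where MultiLevelSkipListReader would otherwise expect one more level-0 entry than BlockSkipWriter ever wrote.

    // Illustration only: trim() with blockSize = 128.
    assert (256 % 128 == 0 ? 256 - 1 : 256) == 255;  // exact multiple: back off by one doc
    assert (300 % 128 == 0 ? 300 - 1 : 300) == 300;  // vInt tail present: df unchanged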
BlockSkipWriter.java
@@ -23,10 +23,29 @@ import java.util.Arrays;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.codecs.MultiLevelSkipListWriter;
 
 // nocommit do we need more frequent skips at level > 0?
-// 128*128 is immense?  may need to decouple
-
+// nocommit may need to decouple
 // baseSkipInterval & theRestSkipInterval?
 
+/**
+ * Write skip lists with multiple levels, and support skip within block ints.
+ *
+ * Assume that docFreq = 28, skipInterval = blockSize = 12
+ *
+ * |       block#0        | |       block#1        | |vInts|
+ * d d d d d d d d d d d d d d d d d d d d d d d d d d d d   (posting list)
+ *                       ^                        ^           (level 0 skip point)
+ *
+ * Note that skipWriter will ignore first document in block#0, since
+ * it is useless as a skip point.  Also, we'll never skip into the vInts
+ * block, only record skip data at the start its start point(if it exist).
+ *
+ * For each skip point, we will record:
+ * 1. lastDocID,
+ * 2. its related file points(position, payload),
+ * 3. related numbers or uptos(position, payload).
+ * 4. start offset.
+ *
+ */
 final class BlockSkipWriter extends MultiLevelSkipListWriter {
   private boolean DEBUG = BlockPostingsReader.DEBUG;
 
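The arithmetic behind the javadoc's docFreq = 28, blockSize = 12 example, spelled out editorially:

    // Illustration only: layout from the javadoc above, computed.
    int docFreq = 28, blockSize = 12;
    int fullBlocks = docFreq / blockSize;   // 2 packed blocks: docs 0-11 and 12-23
    int vIntDocs   = docFreq % blockSize;   // 4 trailing docs, written as vInts
    int skipPoints = fullBlocks;            // one level-0 entry per filled block,
                                            // landing after doc 11 and doc 23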
@@ -52,8 +71,8 @@ final class BlockSkipWriter extends MultiLevelSkipListWriter {
   private boolean fieldHasOffsets;
   private boolean fieldHasPayloads;
 
-  public BlockSkipWriter(int skipInterval, int maxSkipLevels, int docCount, IndexOutput docOut, IndexOutput posOut, IndexOutput payOut) {
-    super(skipInterval, maxSkipLevels, docCount);
+  public BlockSkipWriter(int maxSkipLevels, int blockSize, int docCount, IndexOutput docOut, IndexOutput posOut, IndexOutput payOut) {
+    super(blockSize, maxSkipLevels, docCount);
     this.docOut = docOut;
     this.posOut = posOut;
     this.payOut = payOut;
ForUtil.java
@@ -22,7 +22,7 @@ import java.nio.IntBuffer;
  * Encode all values in normal area with fixed bit width,
  * which is determined by the max value in this block.
  */
-public class ForUtil {
+public final class ForUtil {
   protected static final int[] MASK = { 0x00000000,
     0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f,
     0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
@@ -31,24 +31,24 @@ public final class ForUtil {
     0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff,
     0x7fffffff, 0xffffffff};
 
+  final static int blockSize = BlockPostingsFormat.BLOCK_SIZE;
+
   /** Compress given int[] into Integer buffer, with For format
    *
    * @param data      uncompressed data
-   * @param size      num of ints to compress
    * @param intBuffer integer buffer to hold compressed data
-   * @return encoded block byte size
+   * @return the header for current block
    */
   public static int compress(final int[] data, IntBuffer intBuffer) {
-    int numBits=getNumBits(data);
+    int numBits = getNumBits(data);
     if (numBits == 0) {
-      return compressDuplicateBlock(data,intBuffer);
+      return compressDuplicateBlock(data, intBuffer);
     }
 
-    int size=data.length;
-    int encodedSize = (size*numBits+31)/32;
+    int encodedSize = (blockSize*numBits+31)/32;
 
-    for (int i=0; i<size; ++i) {
-      encodeNormalValue(intBuffer,i,data[i], numBits);
+    for (int i=0; i<blockSize; ++i) {
+      encodeNormalValue(intBuffer, i, data[i], numBits);
     }
 
     return getHeader(encodedSize, numBits);
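A quick editorial sanity check of the size computation above: with blockSize fixed at 128, the ceiling division never actually rounds, since 128*numBits is always a multiple of 32.

    // Illustration only: packed size for a 128-int block.
    for (int numBits = 1; numBits <= 32; numBits++) {
      int encodedSize = (128 * numBits + 31) / 32;  // ints needed for 128 values
      assert encodedSize == 4 * numBits;            // exact: 128 / 32 == 4
    }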
@@ -58,7 +58,7 @@ public final class ForUtil {
    * Save only one int when the whole block equals to 1
    */
   static int compressDuplicateBlock(final int[] data, IntBuffer intBuffer) {
-    intBuffer.put(0,data[0]);
+    intBuffer.put(0, data[0]);
     return getHeader(1, 0);
   }
 
@@ -66,6 +66,7 @@ public final class ForUtil {
    *
    * @param intBuffer integer buffer to hold compressed data
    * @param data      int array to hold uncompressed data
+   * @param header    header of current block, which contains numFrameBits
    */
   public static void decompress(IntBuffer intBuffer, int[] data, int header) {
     // since this buffer is reused at upper level, rewind first
@@ -73,22 +74,12 @@ public final class ForUtil {
 
     // nocommit assert header isn't "malformed", ie besides
     // numBytes / bit-width there is nothing else!
 
-    int numBits = ((header >> 8) & MASK[6]);
-
-    decompressCore(intBuffer, data, numBits);
-  }
-
-  /**
-   * IntBuffer will not be rewinded in this method, therefore
-   * caller should ensure that the position is set to the first
-   * encoded int before decoding.
-   */
-  static void decompressCore(IntBuffer intBuffer, int[] data, int numBits) {
+    int numBits = getNumBits(header);
+
     assert numBits<=32;
     assert numBits>=0;
 
     // TODO: PackedIntsDecompress is hardewired to size==128 only
     switch(numBits) {
       case 0: PackedIntsDecompress.decode0(intBuffer, data); break;
       case 1: PackedIntsDecompress.decode1(intBuffer, data); break;
@@ -163,6 +154,7 @@ public final class ForUtil {
     return optBits;
   }
 
+  // nocommit: we must have a util function for this, hmm?
   protected static boolean isAllEqual(final int[] data) {
     int len = data.length;
     int v = data[0];
@@ -177,23 +169,21 @@ public final class ForUtil {
   /**
    * Generate the 4 byte header, which contains (from lsb to msb):
    *
-   * 8 bits for encoded block int size (excluded header, this limits DEFAULT_BLOCK_SIZE <= 2^8)
    * 6 bits for num of frame bits (when 0, values in this block are all the same)
-   * other bits unused
+   * other bits for encoded block int size (excluded header), so we can use crazy block size
    *
    */
   static int getHeader(int encodedSize, int numBits) {
-    return  (encodedSize)
-          | ((numBits) << 8);
+    return numBits | (encodedSize << 6);
   }
 
   /**
    * Expert: get metadata from header.
    */
-  public static int getEncodedSize(int header) {
-    return ((header & MASK[8]))*4;
+  static int getNumBits(int header) {
+    return ((header & MASK[6]));
   }
 
-  public static int getNumBits(int header) {
-    return ((header >> 8) & MASK[6]);
+  static int getEncodedSize(int header) {
+    return ((header >>> 6))*4;
   }
 }
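To make the reshuffled header concrete, an editorial round-trip with illustrative values: numBits now sits in the low 6 bits and the encoded int count in the bits above, so block sizes are no longer capped by an 8-bit size field.

    // Illustration only: header round-trip under the new layout.
    int numBits = 5;
    int encodedSize = (128 * 5 + 31) / 32;        // 20 ints for a 128-value block
    int header = numBits | (encodedSize << 6);    // == 1285
    assert (header & 0x3f) == 5;                  // getNumBits
    assert (header >>> 6) * 4 == 80;              // getEncodedSize, in bytes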