LUCENE-3892: remove oal.codecs.pfor (it's slower than block); add new BlockPacked postings format (copy of Block postings format except it uses oal.util.packed for packed ints encode/decode)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/pforcodec_3892@1367338 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2012-07-30 22:15:06 +00:00
parent 33f6da286e
commit 94aecff6c3
19 changed files with 2631 additions and 1057 deletions

View File

@ -24,7 +24,6 @@ import java.nio.IntBuffer;
import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.pfor.ForUtil;
import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldInfo.IndexOptions;
@ -43,8 +42,6 @@ import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
// nocommit move ForUtil here?
// nocommit javadocs // nocommit javadocs
public final class BlockPostingsReader extends PostingsReaderBase { public final class BlockPostingsReader extends PostingsReaderBase {

View File

@ -26,7 +26,6 @@ import java.util.List;
import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsWriterBase; import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.TermStats; import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.codecs.pfor.ForUtil; // nocommit move here?
import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfo;

View File

@ -1,4 +1,4 @@
package org.apache.lucene.codecs.pfor; package org.apache.lucene.codecs.block;
/* /*
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with

View File

@ -1,4 +1,4 @@
package org.apache.lucene.codecs.pfor; package org.apache.lucene.codecs.block;
/* /*
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with

View File

@ -48,7 +48,7 @@ def genDecompress():
f = open(fileName, 'w') f = open(fileName, 'w')
w = f.write w = f.write
try: try:
w("package org.apache.lucene.codecs.pfor;\n") w("package org.apache.lucene.codecs.block;\n")
w("""/* w("""/*
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with

View File

@ -1,4 +1,4 @@
package org.apache.lucene.codecs.pfor; package org.apache.lucene.codecs.blockpacked;
/* /*
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
@ -18,49 +18,41 @@ package org.apache.lucene.codecs.pfor;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.Set;
import org.apache.lucene.codecs.BlockTreeTermsReader; import org.apache.lucene.codecs.BlockTreeTermsReader;
import org.apache.lucene.codecs.BlockTreeTermsWriter; import org.apache.lucene.codecs.BlockTreeTermsWriter;
import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.FixedGapTermsIndexReader;
import org.apache.lucene.codecs.FixedGapTermsIndexWriter;
import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase; import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.TermsIndexReaderBase;
import org.apache.lucene.codecs.TermsIndexWriterBase;
import org.apache.lucene.codecs.sep.SepPostingsReader;
import org.apache.lucene.codecs.sep.SepPostingsWriter;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
/** /**
* Pass ForFactory to a PostingsWriter/ReaderBase, and get * Pass ForFactory to a PostingsWriter/ReaderBase, and get
* customized postings format plugged. * customized postings format plugged.
*/ */
public final class ForPostingsFormat extends PostingsFormat { public final class BlockPackedPostingsFormat extends PostingsFormat {
private final int minBlockSize; public static final String DOC_EXTENSION = "doc";
private final int maxBlockSize; public static final String POS_EXTENSION = "pos";
public static final String PAY_EXTENSION = "pay";
private final int minTermBlockSize;
private final int maxTermBlockSize;
public final static int DEFAULT_BLOCK_SIZE = 128; public final static int DEFAULT_BLOCK_SIZE = 128;
public ForPostingsFormat() { public BlockPackedPostingsFormat() {
super("For"); this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
this.minBlockSize = BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE;
this.maxBlockSize = BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE;
} }
public ForPostingsFormat(int minBlockSize, int maxBlockSize) { public BlockPackedPostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
super("For"); super("BlockPacked");
this.minBlockSize = minBlockSize; this.minTermBlockSize = minTermBlockSize;
assert minBlockSize > 1; assert minTermBlockSize > 1;
this.maxBlockSize = maxBlockSize; this.maxTermBlockSize = maxTermBlockSize;
assert minBlockSize <= maxBlockSize; assert minTermBlockSize <= maxTermBlockSize;
} }
@Override @Override
@ -71,13 +63,14 @@ public final class ForPostingsFormat extends PostingsFormat {
@Override @Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
// TODO: implement a new PostingsWriterBase to improve skip-settings // TODO: implement a new PostingsWriterBase to improve skip-settings
PostingsWriterBase postingsWriter = new SepPostingsWriter(state, new ForFactory()); PostingsWriterBase postingsWriter = new BlockPackedPostingsWriter(state, 128);
boolean success = false; boolean success = false;
try { try {
FieldsConsumer ret = new BlockTreeTermsWriter(state, FieldsConsumer ret = new BlockTreeTermsWriter(state,
postingsWriter, postingsWriter,
minBlockSize, minTermBlockSize,
maxBlockSize); maxTermBlockSize);
success = true; success = true;
return ret; return ret;
} finally { } finally {
@ -89,13 +82,12 @@ public final class ForPostingsFormat extends PostingsFormat {
@Override @Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new SepPostingsReader(state.dir, PostingsReaderBase postingsReader = new BlockPackedPostingsReader(state.dir,
state.fieldInfos, state.fieldInfos,
state.segmentInfo, state.segmentInfo,
state.context, state.context,
new ForFactory(), state.segmentSuffix,
state.segmentSuffix); 128);
boolean success = false; boolean success = false;
try { try {
FieldsProducer ret = new BlockTreeTermsReader(state.dir, FieldsProducer ret = new BlockTreeTermsReader(state.dir,

View File

@ -0,0 +1,592 @@
package org.apache.lucene.codecs.blockpacked;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.IntBuffer;
import java.nio.LongBuffer;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
// nocommit javadocs
public final class BlockPackedPostingsWriter extends PostingsWriterBase {
private boolean DEBUG = BlockPackedPostingsReader.DEBUG;
// nocommit move these constants to the PF:
static final int maxSkipLevels = 10;
final static String TERMS_CODEC = "BlockPackedPostingsWriterTerms";
final static String DOC_CODEC = "BlockPackedPostingsWriterDoc";
final static String POS_CODEC = "BlockPackedPostingsWriterPos";
final static String PAY_CODEC = "BlockPackedPostingsWriterPay";
// Increment version to change it:
final static int VERSION_START = 0;
final static int VERSION_CURRENT = VERSION_START;
final IndexOutput docOut;
final IndexOutput posOut;
final IndexOutput payOut;
static final int DEFAULT_BLOCK_SIZE = 128;
final int blockSize;
private IndexOutput termsOut;
// How current field indexes postings:
private boolean fieldHasFreqs;
private boolean fieldHasPositions;
private boolean fieldHasOffsets;
private boolean fieldHasPayloads;
// Holds starting file pointers for each term:
private long docTermStartFP;
private long posTermStartFP;
private long payTermStartFP;
final long[] docDeltaBuffer;
final long[] freqBuffer;
final LongBuffer docDeltaLBuffer;
final LongBuffer freqLBuffer;
private int docBufferUpto;
final long[] posDeltaBuffer;
final long[] payloadLengthBuffer;
final long[] offsetStartDeltaBuffer;
final long[] offsetLengthBuffer;
final LongBuffer posDeltaLBuffer;
final LongBuffer payloadLengthLBuffer;
final LongBuffer offsetStartDeltaLBuffer;
final LongBuffer offsetLengthLBuffer;
private int posBufferUpto;
private byte[] payloadBytes;
private int payloadByteUpto;
private int lastBlockDocID;
private boolean saveNextPosBlock;
private long lastBlockPosFP;
private long lastBlockPayFP;
private int lastBlockPosBufferUpto;
private int lastBlockEndOffset;
private int lastBlockPayloadByteUpto;
private int lastDocID;
private int lastPosition;
private int lastEndOffset;
private int docCount;
final byte[] encoded;
final LongBuffer encodedBuffer;
private final BlockPackedSkipWriter skipWriter;
public BlockPackedPostingsWriter(SegmentWriteState state, int blockSize) throws IOException {
super();
this.blockSize = blockSize;
docOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPackedPostingsFormat.DOC_EXTENSION),
state.context);
IndexOutput posOut = null;
IndexOutput payOut = null;
boolean success = false;
try {
CodecUtil.writeHeader(docOut, DOC_CODEC, VERSION_CURRENT);
if (state.fieldInfos.hasProx()) {
posDeltaBuffer = new long[blockSize];
posDeltaLBuffer = LongBuffer.wrap(posDeltaBuffer);
posOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPackedPostingsFormat.POS_EXTENSION),
state.context);
CodecUtil.writeHeader(posOut, POS_CODEC, VERSION_CURRENT);
if (state.fieldInfos.hasPayloads()) {
payloadBytes = new byte[128];
payloadLengthBuffer = new long[blockSize];
payloadLengthLBuffer = LongBuffer.wrap(payloadLengthBuffer);
} else {
payloadBytes = null;
payloadLengthBuffer = null;
payloadLengthLBuffer = null;
}
if (state.fieldInfos.hasOffsets()) {
offsetStartDeltaBuffer = new long[blockSize];
offsetLengthBuffer = new long[blockSize];
offsetStartDeltaLBuffer = LongBuffer.wrap(offsetStartDeltaBuffer);
offsetLengthLBuffer = LongBuffer.wrap(offsetLengthBuffer);
} else {
offsetStartDeltaBuffer = null;
offsetLengthBuffer = null;
offsetStartDeltaLBuffer = null;
offsetLengthLBuffer = null;
}
if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) {
payOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPackedPostingsFormat.PAY_EXTENSION),
state.context);
CodecUtil.writeHeader(payOut, PAY_CODEC, VERSION_CURRENT);
}
} else {
posDeltaBuffer = null;
payloadLengthBuffer = null;
offsetStartDeltaBuffer = null;
offsetLengthBuffer = null;
payloadBytes = null;
posDeltaLBuffer = null;
payloadLengthLBuffer = null;
offsetStartDeltaLBuffer = null;
offsetLengthLBuffer = null;
}
this.payOut = payOut;
this.posOut = posOut;
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(docOut, posOut, payOut);
}
}
docDeltaBuffer = new long[blockSize];
freqBuffer = new long[blockSize];
docDeltaLBuffer = LongBuffer.wrap(docDeltaBuffer);
freqLBuffer = LongBuffer.wrap(freqBuffer);
skipWriter = new BlockPackedSkipWriter(blockSize,
maxSkipLevels,
state.segmentInfo.getDocCount(),
docOut,
posOut,
payOut);
encoded = new byte[blockSize*4];
encodedBuffer = ByteBuffer.wrap(encoded).asLongBuffer();
}
@Override
public void start(IndexOutput termsOut) throws IOException {
this.termsOut = termsOut;
CodecUtil.writeHeader(termsOut, TERMS_CODEC, VERSION_CURRENT);
termsOut.writeVInt(blockSize);
}
@Override
public void setField(FieldInfo fieldInfo) {
IndexOptions indexOptions = fieldInfo.getIndexOptions();
fieldHasFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
fieldHasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
fieldHasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
fieldHasPayloads = fieldInfo.hasPayloads();
skipWriter.setField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads);
}
@Override
public void startTerm() {
docTermStartFP = docOut.getFilePointer();
if (fieldHasPositions) {
posTermStartFP = posOut.getFilePointer();
if (fieldHasPayloads || fieldHasOffsets) {
payTermStartFP = payOut.getFilePointer();
}
}
lastBlockDocID = -1;
lastDocID = 0;
if (DEBUG) {
System.out.println("FPW.startTerm startFP=" + docTermStartFP);
}
skipWriter.resetSkip();
}
private void writeBlock(LongBuffer buffer, IndexOutput out) throws IOException {
final int header = ForUtil.compress(buffer, encodedBuffer);
out.writeVInt(header);
out.writeBytes(encoded, ForUtil.getEncodedSize(header));
}
@Override
public void startDoc(int docID, int termDocFreq) throws IOException {
if (DEBUG) {
System.out.println("FPW.startDoc docID=" + docID);
}
// nocommit do this in finishDoc... but does it fail...?
// is it not always called...?
if (posOut != null && saveNextPosBlock) {
lastBlockPosFP = posOut.getFilePointer();
if (payOut != null) {
lastBlockPayFP = payOut.getFilePointer();
}
lastBlockPosBufferUpto = posBufferUpto;
lastBlockEndOffset = lastEndOffset;
lastBlockPayloadByteUpto = payloadByteUpto;
saveNextPosBlock = false;
if (DEBUG) {
System.out.println(" now save lastBlockPosFP=" + lastBlockPosFP + " lastBlockPosBufferUpto=" + lastBlockPosBufferUpto + " lastBlockPayloadByteUpto=" + lastBlockPayloadByteUpto);
}
}
final int docDelta = docID - lastDocID;
if (docID < 0 || (docCount > 0 && docDelta <= 0)) {
throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " ) (docOut: " + docOut + ")");
}
lastDocID = docID;
docDeltaBuffer[docBufferUpto] = docDelta;
if (DEBUG) {
System.out.println(" docDeltaBuffer[" + docBufferUpto + "]=" + docDelta);
}
if (fieldHasFreqs) {
freqBuffer[docBufferUpto] = termDocFreq;
}
docBufferUpto++;
docCount++;
if (docBufferUpto == blockSize) {
// nocommit maybe instead of buffering skip before
// writing a block based on last block's end data
// ... we could buffer after writing the block? only
// iffiness with that approach is it could be a
// pointlness skip? like we may stop adding docs
// right after that, then we have skip point AFTER
// last doc. the thing is, in finishTerm we are
// already sometimes adding a skip point AFTER the
// last doc?
if (lastBlockDocID != -1) {
if (DEBUG) {
System.out.println(" bufferSkip at writeBlock: lastDocID=" + lastBlockDocID + " docCount=" + (docCount-blockSize));
}
skipWriter.bufferSkip(lastBlockDocID, docCount-blockSize, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockEndOffset, lastBlockPayloadByteUpto);
}
lastBlockDocID = docID;
saveNextPosBlock = true;
if (DEBUG) {
System.out.println(" write docDelta block @ fp=" + docOut.getFilePointer());
}
writeBlock(docDeltaLBuffer, docOut);
if (fieldHasFreqs) {
if (DEBUG) {
System.out.println(" write freq block @ fp=" + docOut.getFilePointer());
}
writeBlock(freqLBuffer, docOut);
}
docBufferUpto = 0;
}
lastPosition = 0;
lastEndOffset = 0;
}
/** Add a new position & payload */
@Override
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
if (DEBUG) {
System.out.println("FPW.addPosition pos=" + position + " posBufferUpto=" + posBufferUpto + (fieldHasPayloads ? " payloadByteUpto=" + payloadByteUpto: ""));
}
posDeltaBuffer[posBufferUpto] = position - lastPosition;
if (fieldHasPayloads) {
if (payload == null || payload.length == 0) {
// no payload
payloadLengthBuffer[posBufferUpto] = 0;
} else {
payloadLengthBuffer[posBufferUpto] = payload.length;
if (payloadByteUpto + payload.length > payloadBytes.length) {
payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payload.length);
}
System.arraycopy(payload.bytes, payload.offset, payloadBytes, payloadByteUpto, payload.length);
payloadByteUpto += payload.length;
}
}
if (fieldHasOffsets) {
assert startOffset >= lastEndOffset;
assert endOffset >= startOffset;
offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastEndOffset;
offsetLengthBuffer[posBufferUpto] = endOffset - startOffset;
lastEndOffset = endOffset;
}
posBufferUpto++;
lastPosition = position;
if (posBufferUpto == blockSize) {
if (DEBUG) {
System.out.println(" write pos bulk block @ fp=" + posOut.getFilePointer());
}
writeBlock(posDeltaLBuffer, posOut);
if (fieldHasPayloads) {
writeBlock(payloadLengthLBuffer, payOut);
payOut.writeVInt(payloadByteUpto);
payOut.writeBytes(payloadBytes, 0, payloadByteUpto);
payloadByteUpto = 0;
}
if (fieldHasOffsets) {
writeBlock(offsetStartDeltaLBuffer, payOut);
writeBlock(offsetLengthLBuffer, payOut);
}
posBufferUpto = 0;
}
}
@Override
public void finishDoc() {
}
private static class PendingTerm {
public final long docStartFP;
public final long posStartFP;
public final long payStartFP;
public final int skipOffset;
public final int lastPosBlockOffset;
public PendingTerm(long docStartFP, long posStartFP, long payStartFP, int skipOffset, int lastPosBlockOffset) {
this.docStartFP = docStartFP;
this.posStartFP = posStartFP;
this.payStartFP = payStartFP;
this.skipOffset = skipOffset;
this.lastPosBlockOffset = lastPosBlockOffset;
}
}
private final List<PendingTerm> pendingTerms = new ArrayList<PendingTerm>();
/** Called when we are done adding docs to this term */
@Override
public void finishTerm(TermStats stats) throws IOException {
assert stats.docFreq > 0;
// TODO: wasteful we are counting this (counting # docs
// for this term) in two places?
assert stats.docFreq == docCount: stats.docFreq + " vs " + docCount;
if (DEBUG) {
System.out.println("FPW.finishTerm docFreq=" + stats.docFreq);
}
// nocommit silly that skipper must write skip when we no
// postings come after it, but if we don't do this, skip
// reader incorrectly thinks it can read another level 0
// skip entry here!:
//if (docCount > blockSize && docBufferUpto > 0) {
if (docCount > blockSize) {
final int lastDocCount = blockSize*(docCount/blockSize);
if (DEBUG) {
System.out.println(" bufferSkip at finishTerm: lastDocID=" + lastBlockDocID + " docCount=" + lastDocCount);
}
skipWriter.bufferSkip(lastBlockDocID, lastDocCount, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockEndOffset, lastBlockPayloadByteUpto);
}
if (DEBUG) {
if (docBufferUpto > 0) {
System.out.println(" write doc/freq vInt block (count=" + docBufferUpto + ") at fp=" + docOut.getFilePointer() + " docTermStartFP=" + docTermStartFP);
}
}
// vInt encode the remaining doc deltas and freqs:
for(int i=0;i<docBufferUpto;i++) {
final int docDelta = (int)docDeltaBuffer[i];
final int freq = (int)freqBuffer[i];
if (!fieldHasFreqs) {
docOut.writeVInt(docDelta);
} else if (freqBuffer[i] == 1) {
docOut.writeVInt((docDelta<<1)|1);
} else {
docOut.writeVInt(docDelta<<1);
docOut.writeVInt(freq);
}
}
final int lastPosBlockOffset;
if (fieldHasPositions) {
if (DEBUG) {
if (posBufferUpto > 0) {
System.out.println(" write pos vInt block (count=" + posBufferUpto + ") at fp=" + posOut.getFilePointer() + " posTermStartFP=" + posTermStartFP + " hasPayloads=" + fieldHasPayloads + " hasOffsets=" + fieldHasOffsets);
}
}
assert stats.totalTermFreq != -1;
if (stats.totalTermFreq > blockSize) {
lastPosBlockOffset = (int) (posOut.getFilePointer() - posTermStartFP);
} else {
lastPosBlockOffset = -1;
}
if (posBufferUpto > 0) {
posOut.writeVInt(posBufferUpto);
// nocommit should we send offsets/payloads to
// .pay...? seems wasteful (have to store extra
// vLong for low (< blockSize) DF terms = vast vast
// majority)
// vInt encode the remaining positions/payloads/offsets:
int lastPayloadLength = -1;
int payloadBytesReadUpto = 0;
for(int i=0;i<posBufferUpto;i++) {
final int posDelta = (int)posDeltaBuffer[i];
if (fieldHasPayloads) {
final int payloadLength = (int)payloadLengthBuffer[i];
if (payloadLength != lastPayloadLength) {
lastPayloadLength = payloadLength;
posOut.writeVInt((posDelta<<1)|1);
posOut.writeVInt(payloadLength);
} else {
posOut.writeVInt(posDelta<<1);
}
if (DEBUG) {
System.out.println(" i=" + i + " payloadLen=" + payloadLength);
}
if (payloadLength != 0) {
if (DEBUG) {
System.out.println(" write payload @ pos.fp=" + posOut.getFilePointer());
}
posOut.writeBytes(payloadBytes, payloadBytesReadUpto, payloadLength);
payloadBytesReadUpto += payloadLength;
}
} else {
posOut.writeVInt(posDelta);
}
if (fieldHasOffsets) {
if (DEBUG) {
System.out.println(" write offset @ pos.fp=" + posOut.getFilePointer());
}
posOut.writeVInt((int)offsetStartDeltaBuffer[i]);
posOut.writeVInt((int)offsetLengthBuffer[i]);
}
}
if (fieldHasPayloads) {
assert payloadBytesReadUpto == payloadByteUpto;
payloadByteUpto = 0;
}
}
if (DEBUG) {
System.out.println(" totalTermFreq=" + stats.totalTermFreq + " lastPosBlockOffset=" + lastPosBlockOffset);
}
} else {
lastPosBlockOffset = -1;
}
int skipOffset;
if (docCount > blockSize) {
skipOffset = (int) (skipWriter.writeSkip(docOut)-docTermStartFP);
if (DEBUG) {
System.out.println("skip packet " + (docOut.getFilePointer() - (docTermStartFP + skipOffset)) + " bytes");
}
} else {
skipOffset = -1;
if (DEBUG) {
System.out.println(" no skip: docCount=" + docCount);
}
}
long payStartFP;
if (stats.totalTermFreq >= blockSize) {
payStartFP = payTermStartFP;
} else {
payStartFP = -1;
}
if (DEBUG) {
System.out.println(" payStartFP=" + payStartFP);
}
pendingTerms.add(new PendingTerm(docTermStartFP, posTermStartFP, payStartFP, skipOffset, lastPosBlockOffset));
docBufferUpto = 0;
posBufferUpto = 0;
lastDocID = 0;
docCount = 0;
}
private final RAMOutputStream bytesWriter = new RAMOutputStream();
@Override
public void flushTermsBlock(int start, int count) throws IOException {
if (count == 0) {
termsOut.writeByte((byte) 0);
return;
}
assert start <= pendingTerms.size();
assert count <= start;
final int limit = pendingTerms.size() - start + count;
long lastDocStartFP = 0;
long lastPosStartFP = 0;
long lastPayStartFP = 0;
for(int idx=limit-count; idx<limit; idx++) {
PendingTerm term = pendingTerms.get(idx);
bytesWriter.writeVLong(term.docStartFP - lastDocStartFP);
lastDocStartFP = term.docStartFP;
if (fieldHasPositions) {
bytesWriter.writeVLong(term.posStartFP - lastPosStartFP);
lastPosStartFP = term.posStartFP;
if (term.lastPosBlockOffset != -1) {
bytesWriter.writeVInt(term.lastPosBlockOffset);
}
if ((fieldHasPayloads || fieldHasOffsets) && term.payStartFP != -1) {
bytesWriter.writeVLong(term.payStartFP - lastPayStartFP);
lastPayStartFP = term.payStartFP;
}
}
if (term.skipOffset != -1) {
bytesWriter.writeVInt(term.skipOffset);
}
}
termsOut.writeVInt((int) bytesWriter.getFilePointer());
bytesWriter.writeTo(termsOut);
bytesWriter.reset();
// Remove the terms we just wrote:
pendingTerms.subList(limit-count, limit).clear();
}
@Override
public void close() throws IOException {
IOUtils.close(docOut, posOut, payOut);
}
}

View File

@ -0,0 +1,205 @@
package org.apache.lucene.codecs.blockpacked;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.codecs.MultiLevelSkipListReader;
import org.apache.lucene.store.IndexInput;
/**
* Implements the skip list reader for the 4.0 posting list format
* that stores positions and payloads.
*
* @see Lucene40PostingsFormat
* @lucene.experimental
*/
final class BlockPackedSkipReader extends MultiLevelSkipListReader {
private boolean DEBUG = BlockPackedPostingsReader.DEBUG;
private long docPointer[];
private long posPointer[];
private long payPointer[];
private int posBufferUpto[];
private int endOffset[];
private int payloadByteUpto[];
private long lastPosPointer;
private long lastPayPointer;
private int lastEndOffset;
private int lastPayloadByteUpto;
private long lastDocPointer;
private int lastPosBufferUpto;
public BlockPackedSkipReader(IndexInput skipStream, int maxSkipLevels, int skipInterval, boolean hasPos, boolean hasOffsets, boolean hasPayloads) {
super(skipStream, maxSkipLevels, skipInterval);
docPointer = new long[maxSkipLevels];
if (hasPos) {
posPointer = new long[maxSkipLevels];
posBufferUpto = new int[maxSkipLevels];
if (hasPayloads) {
payloadByteUpto = new int[maxSkipLevels];
} else {
payloadByteUpto = null;
}
if (hasOffsets) {
endOffset = new int[maxSkipLevels];
} else {
endOffset = null;
}
if (hasOffsets || hasPayloads) {
payPointer = new long[maxSkipLevels];
} else {
payPointer = null;
}
} else {
posPointer = null;
}
}
public void init(long skipPointer, long docBasePointer, long posBasePointer, long payBasePointer, int df) {
super.init(skipPointer, df);
lastDocPointer = docBasePointer;
lastPosPointer = posBasePointer;
lastPayPointer = payBasePointer;
Arrays.fill(docPointer, docBasePointer);
if (posPointer != null) {
Arrays.fill(posPointer, posBasePointer);
if (payPointer != null) {
Arrays.fill(payPointer, payBasePointer);
}
} else {
assert posBasePointer == 0;
}
}
/** Returns the doc pointer of the doc to which the last call of
* {@link MultiLevelSkipListReader#skipTo(int)} has skipped. */
public long getDocPointer() {
return lastDocPointer;
}
public long getPosPointer() {
return lastPosPointer;
}
public int getPosBufferUpto() {
return lastPosBufferUpto;
}
public long getPayPointer() {
return lastPayPointer;
}
public int getEndOffset() {
return lastEndOffset;
}
public int getPayloadByteUpto() {
return lastPayloadByteUpto;
}
@Override
protected void seekChild(int level) throws IOException {
super.seekChild(level);
if (DEBUG) {
System.out.println("seekChild level=" + level);
}
docPointer[level] = lastDocPointer;
if (posPointer != null) {
posPointer[level] = lastPosPointer;
posBufferUpto[level] = lastPosBufferUpto;
if (endOffset != null) {
endOffset[level] = lastEndOffset;
}
if (payloadByteUpto != null) {
payloadByteUpto[level] = lastPayloadByteUpto;
}
if (payPointer != null) {
payPointer[level] = lastPayPointer;
}
}
}
@Override
protected void setLastSkipData(int level) {
super.setLastSkipData(level);
lastDocPointer = docPointer[level];
if (DEBUG) {
System.out.println("setLastSkipData level=" + level);
System.out.println(" lastDocPointer=" + lastDocPointer);
}
if (posPointer != null) {
lastPosPointer = posPointer[level];
lastPosBufferUpto = posBufferUpto[level];
if (DEBUG) {
System.out.println(" lastPosPointer=" + lastPosPointer + " lastPosBUfferUpto=" + lastPosBufferUpto);
}
if (payPointer != null) {
lastPayPointer = payPointer[level];
}
if (endOffset != null) {
lastEndOffset = endOffset[level];
}
if (payloadByteUpto != null) {
lastPayloadByteUpto = payloadByteUpto[level];
}
}
}
@Override
protected int readSkipData(int level, IndexInput skipStream) throws IOException {
if (DEBUG) {
System.out.println("readSkipData level=" + level);
}
int delta = skipStream.readVInt();
if (DEBUG) {
System.out.println(" delta=" + delta);
}
docPointer[level] += skipStream.readVInt();
if (DEBUG) {
System.out.println(" docFP=" + docPointer[level]);
}
if (posPointer != null) {
posPointer[level] += skipStream.readVInt();
if (DEBUG) {
System.out.println(" posFP=" + posPointer[level]);
}
posBufferUpto[level] = skipStream.readVInt();
if (DEBUG) {
System.out.println(" posBufferUpto=" + posBufferUpto[level]);
}
if (payloadByteUpto != null) {
payloadByteUpto[level] = skipStream.readVInt();
}
if (endOffset != null) {
endOffset[level] += skipStream.readVInt();
}
if (payPointer != null) {
payPointer[level] += skipStream.readVInt();
}
}
return delta;
}
}

View File

@ -0,0 +1,147 @@
package org.apache.lucene.codecs.blockpacked;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.codecs.MultiLevelSkipListWriter;
// nocommit do we need more frequent skips at level > 0?
// 128*128 is immense? may need to decouple
// baseSkipInterval & theRestSkipInterval?
final class BlockPackedSkipWriter extends MultiLevelSkipListWriter {
private boolean DEBUG = BlockPackedPostingsReader.DEBUG;
private int[] lastSkipDoc;
private long[] lastSkipDocPointer;
private long[] lastSkipPosPointer;
private long[] lastSkipPayPointer;
private int[] lastEndOffset;
private int[] lastPayloadByteUpto;
private final IndexOutput docOut;
private final IndexOutput posOut;
private final IndexOutput payOut;
private int curDoc;
private long curDocPointer;
private long curPosPointer;
private long curPayPointer;
private int curPosBufferUpto;
private int curEndOffset;
private int curPayloadByteUpto;
private boolean fieldHasPositions;
private boolean fieldHasOffsets;
private boolean fieldHasPayloads;
public BlockPackedSkipWriter(int skipInterval, int maxSkipLevels, int docCount, IndexOutput docOut, IndexOutput posOut, IndexOutput payOut) {
super(skipInterval, maxSkipLevels, docCount);
this.docOut = docOut;
this.posOut = posOut;
this.payOut = payOut;
lastSkipDoc = new int[maxSkipLevels];
lastSkipDocPointer = new long[maxSkipLevels];
if (posOut != null) {
lastSkipPosPointer = new long[maxSkipLevels];
if (payOut != null) {
lastSkipPayPointer = new long[maxSkipLevels];
}
lastEndOffset = new int[maxSkipLevels];
lastPayloadByteUpto = new int[maxSkipLevels];
}
}
public void setField(boolean fieldHasPositions, boolean fieldHasOffsets, boolean fieldHasPayloads) {
this.fieldHasPositions = fieldHasPositions;
this.fieldHasOffsets = fieldHasOffsets;
this.fieldHasPayloads = fieldHasPayloads;
}
@Override
public void resetSkip() {
super.resetSkip();
Arrays.fill(lastSkipDoc, 0);
Arrays.fill(lastSkipDocPointer, docOut.getFilePointer());
if (fieldHasPositions) {
Arrays.fill(lastSkipPosPointer, posOut.getFilePointer());
if (fieldHasOffsets) {
Arrays.fill(lastEndOffset, 0);
}
if (fieldHasPayloads) {
Arrays.fill(lastPayloadByteUpto, 0);
}
if (fieldHasOffsets || fieldHasPayloads) {
Arrays.fill(lastSkipPayPointer, payOut.getFilePointer());
}
}
}
/**
* Sets the values for the current skip data.
*/
public void bufferSkip(int doc, int numDocs, long posFP, long payFP, int posBufferUpto, int endOffset, int payloadByteUpto) throws IOException {
this.curDoc = doc;
this.curDocPointer = docOut.getFilePointer();
this.curPosPointer = posFP;
this.curPayPointer = payFP;
this.curPosBufferUpto = posBufferUpto;
this.curPayloadByteUpto = payloadByteUpto;
this.curEndOffset = endOffset;
bufferSkip(numDocs);
}
@Override
protected void writeSkipData(int level, IndexOutput skipBuffer) throws IOException {
int delta = curDoc - lastSkipDoc[level];
if (DEBUG) {
System.out.println("writeSkipData level=" + level + " lastDoc=" + curDoc + " delta=" + delta + " curDocPointer=" + curDocPointer);
}
skipBuffer.writeVInt(delta);
lastSkipDoc[level] = curDoc;
skipBuffer.writeVInt((int) (curDocPointer - lastSkipDocPointer[level]));
lastSkipDocPointer[level] = curDocPointer;
if (fieldHasPositions) {
if (DEBUG) {
System.out.println(" curPosPointer=" + curPosPointer + " curPosBufferUpto=" + curPosBufferUpto);
}
skipBuffer.writeVInt((int) (curPosPointer - lastSkipPosPointer[level]));
lastSkipPosPointer[level] = curPosPointer;
skipBuffer.writeVInt(curPosBufferUpto);
if (fieldHasPayloads) {
skipBuffer.writeVInt(curPayloadByteUpto);
}
if (fieldHasOffsets) {
skipBuffer.writeVInt(curEndOffset - lastEndOffset[level]);
lastEndOffset[level] = curEndOffset;
}
if (fieldHasOffsets || fieldHasPayloads) {
skipBuffer.writeVInt((int) (curPayPointer - lastSkipPayPointer[level]));
lastSkipPayPointer[level] = curPayPointer;
}
}
}
}

View File

@ -0,0 +1,130 @@
package org.apache.lucene.codecs.blockpacked;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.nio.LongBuffer;
import java.nio.IntBuffer;
import java.nio.ByteBuffer;
import java.util.Arrays;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedInts.Reader;
import org.apache.lucene.util.packed.PackedInts.Writer;
import org.apache.lucene.util.packed.PackedInts.Mutable;
import org.apache.lucene.util.packed.PackedInts.Encoder;
import org.apache.lucene.util.packed.PackedInts.Decoder;
/**
* Encode all values in normal area with fixed bit width,
* which is determined by the max value in this block.
*/
public class ForUtil {
protected static final int[] MASK = { 0x00000000,
0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f,
0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff, 0x0003ffff,
0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff,
0x7fffffff, 0xffffffff};
/** Compress given int[] into output stream, with For format
*/
public static int compress(final LongBuffer data, LongBuffer packed) throws IOException {
int numBits=getNumBits(data.array());
if (numBits == 0) { // when block is equal, save the value once
packed.put(0, data.get(0)<<32); // java uses big endian for LongBuffer impl
return (getHeader(1,numBits));
}
PackedInts.Format format = PackedInts.fastestFormatAndBits(128, numBits, PackedInts.FASTEST).format;
PackedInts.Encoder encoder = PackedInts.getEncoder(format, PackedInts.VERSION_CURRENT, numBits);
int perIter = encoder.values();
int iters = 128/perIter;
int nblocks = encoder.blocks()*iters;
assert 128 % perIter == 0;
packed.rewind();
data.rewind();
encoder.encode(data, packed, iters);
int encodedSize = nblocks*2;
return getHeader(encodedSize,numBits);
}
/** Decompress given ouput stream into int array.
*/
public static void decompress(LongBuffer data, LongBuffer packed, int header) throws IOException {
// nocommit assert header isn't "malformed", ie besides
// numBytes / bit-width there is nothing else!
packed.rewind();
data.rewind();
int numBits = ((header >> 8) & MASK[6]);
if (numBits == 0) {
Arrays.fill(data.array(), (int)(packed.get(0)>>>32));
return;
}
PackedInts.Format format = PackedInts.fastestFormatAndBits(128, numBits, PackedInts.FASTEST).format;
PackedInts.Decoder decoder = PackedInts.getDecoder(format, PackedInts.VERSION_CURRENT, numBits);
int perIter = decoder.values();
int iters = 128/perIter;
int nblocks = decoder.blocks()*iters;
assert 128 % perIter == 0;
decoder.decode(packed, data, iters);
}
static int getNumBits(final long[] data) {
if (isAllEqual(data)) {
return 0;
}
int size=data.length;
int optBits=1;
for (int i=0; i<size; ++i) {
while ((data[i] & ~MASK[optBits]) != 0) {
optBits++;
}
}
return optBits;
}
protected static boolean isAllEqual(final long[] data) {
int len = data.length;
long v = data[0];
for (int i=1; i<len; i++) {
if (data[i] != v) {
return false;
}
}
return true;
}
static int getHeader(int encodedSize, int numBits) {
return (encodedSize)
| ((numBits) << 8);
}
public static int getEncodedSize(int header) {
return ((header & MASK[8]))*4;
}
public static int getNumBits(int header) {
return ((header >> 8) & MASK[6]);
}
}

View File

@ -1,128 +0,0 @@
package org.apache.lucene.codecs.pfor;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.IntBuffer;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.codecs.sep.IntStreamFactory;
import org.apache.lucene.codecs.sep.IntIndexInput;
import org.apache.lucene.codecs.sep.IntIndexOutput;
import org.apache.lucene.codecs.intblock.FixedIntBlockIndexInput;
import org.apache.lucene.codecs.intblock.FixedIntBlockIndexOutput;
/**
* Used to plug to PostingsReader/WriterBase.
* Encoder and decoder in lower layers are called by
* flushBlock() and readBlock()
*/
public final class ForFactory extends IntStreamFactory {
public ForFactory() {
}
@Override
public IntIndexOutput createOutput(Directory dir, String fileName, IOContext context) throws IOException {
boolean success = false;
IndexOutput out = dir.createOutput(fileName, context);
try {
IntIndexOutput ret = new ForIndexOutput(out);
success = true;
return ret;
} finally {
if (!success) {
// For some cases (e.g. disk full), the IntIndexOutput may not be
// properly created. So we should close those opened files.
IOUtils.closeWhileHandlingException(out);
}
}
}
@Override
public IntIndexInput openInput(Directory dir, String fileName, IOContext context) throws IOException {
return new ForIndexInput(dir.openInput(fileName, context));
}
/**
* Here we'll hold both input buffer and output buffer for
* encoder/decoder.
*/
private class ForIndexInput extends FixedIntBlockIndexInput {
ForIndexInput(final IndexInput in) throws IOException {
super(in);
}
class ForBlockReader implements FixedIntBlockIndexInput.BlockReader {
private final byte[] encoded;
private final int[] buffer;
private final IndexInput in;
private final IntBuffer encodedBuffer;
ForBlockReader(final IndexInput in, final int[] buffer) {
// upperbound for encoded value should include(here header is not buffered):
// blockSize of normal value when numFrameBits=32(4x bytes);
this.encoded = new byte[ForPostingsFormat.DEFAULT_BLOCK_SIZE*4];
this.in = in;
this.buffer = buffer;
this.encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer();
}
// TODO: implement public void skipBlock() {} ?
@Override
public void readBlock() throws IOException {
final int header = in.readInt();
final int numBytes = ForUtil.getEncodedSize(header);
assert numBytes <= ForPostingsFormat.DEFAULT_BLOCK_SIZE*4;
in.readBytes(encoded,0,numBytes);
ForUtil.decompress(encodedBuffer,buffer,header);
}
}
@Override
protected BlockReader getBlockReader(final IndexInput in, final int[] buffer) throws IOException {
return new ForBlockReader(in,buffer);
}
}
private class ForIndexOutput extends FixedIntBlockIndexOutput {
private final byte[] encoded;
private final IntBuffer encodedBuffer;
ForIndexOutput(IndexOutput out) throws IOException {
super(out,ForPostingsFormat.DEFAULT_BLOCK_SIZE);
this.encoded = new byte[ForPostingsFormat.DEFAULT_BLOCK_SIZE*4];
this.encodedBuffer=ByteBuffer.wrap(encoded).asIntBuffer();
}
@Override
protected void flushBlock() throws IOException {
final int header = ForUtil.compress(buffer,encodedBuffer);
final int numBytes = ForUtil.getEncodedSize(header);
// nocommit writeVInt instead?
out.writeInt(header);
out.writeBytes(encoded, numBytes);
}
}
}

View File

@ -1,129 +0,0 @@
package org.apache.lucene.codecs.pfor;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.IntBuffer;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.codecs.sep.IntStreamFactory;
import org.apache.lucene.codecs.sep.IntIndexInput;
import org.apache.lucene.codecs.sep.IntIndexOutput;
import org.apache.lucene.codecs.intblock.FixedIntBlockIndexInput;
import org.apache.lucene.codecs.intblock.FixedIntBlockIndexOutput;
/**
* Used to plug to PostingsReader/WriterBase.
* Encoder and decoder in lower layers are called by
* flushBlock() and readBlock()
*/
public final class PForFactory extends IntStreamFactory {
public PForFactory() {
}
@Override
public IntIndexOutput createOutput(Directory dir, String fileName, IOContext context) throws IOException {
boolean success = false;
IndexOutput out = dir.createOutput(fileName, context);
try {
IntIndexOutput ret = new PForIndexOutput(out);
success = true;
return ret;
} finally {
if (!success) {
// For some cases (e.g. disk full), the IntIndexOutput may not be
// properly created. So we should close those opened files.
IOUtils.closeWhileHandlingException(out);
}
}
}
@Override
public IntIndexInput openInput(Directory dir, String fileName, IOContext context) throws IOException {
return new PForIndexInput(dir.openInput(fileName, context));
}
/**
* Here we'll hold both input buffer and output buffer for
* encoder/decoder.
*/
private class PForIndexInput extends FixedIntBlockIndexInput {
PForIndexInput(final IndexInput in) throws IOException {
super(in);
}
class PForBlockReader implements FixedIntBlockIndexInput.BlockReader {
private final byte[] encoded;
private final int[] buffer;
private final IndexInput in;
private final IntBuffer encodedBuffer;
PForBlockReader(final IndexInput in, final int[] buffer) {
// upperbound for encoded value should include(here header is not buffered):
// 1. blockSize of normal value (4x bytes);
// 2. blockSize of exception value (4x bytes);
this.encoded = new byte[PForPostingsFormat.DEFAULT_BLOCK_SIZE*8];
this.in = in;
this.buffer = buffer;
this.encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer();
}
// TODO: implement public void skipBlock() {} ?
@Override
public void readBlock() throws IOException {
final int header = in.readInt();
final int numBytes = PForUtil.getEncodedSize(header);
assert numBytes <= PForPostingsFormat.DEFAULT_BLOCK_SIZE*8;
in.readBytes(encoded,0,numBytes);
PForUtil.decompress(encodedBuffer,buffer,header);
}
}
@Override
protected BlockReader getBlockReader(final IndexInput in, final int[] buffer) throws IOException {
return new PForBlockReader(in,buffer);
}
}
private class PForIndexOutput extends FixedIntBlockIndexOutput {
private final byte[] encoded;
private final IntBuffer encodedBuffer;
PForIndexOutput(IndexOutput out) throws IOException {
super(out, PForPostingsFormat.DEFAULT_BLOCK_SIZE);
this.encoded = new byte[PForPostingsFormat.DEFAULT_BLOCK_SIZE*8];
this.encodedBuffer=ByteBuffer.wrap(encoded).asIntBuffer();
}
@Override
protected void flushBlock() throws IOException {
final int header = PForUtil.compress(buffer,encodedBuffer);
final int numBytes = PForUtil.getEncodedSize(header);
// nocommit writeVInt instead?
out.writeInt(header);
out.writeBytes(encoded, numBytes);
}
}
}

View File

@ -1,115 +0,0 @@
package org.apache.lucene.codecs.pfor;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Set;
import org.apache.lucene.codecs.BlockTreeTermsReader;
import org.apache.lucene.codecs.BlockTreeTermsWriter;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.FixedGapTermsIndexReader;
import org.apache.lucene.codecs.FixedGapTermsIndexWriter;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.TermsIndexReaderBase;
import org.apache.lucene.codecs.TermsIndexWriterBase;
import org.apache.lucene.codecs.sep.SepPostingsReader;
import org.apache.lucene.codecs.sep.SepPostingsWriter;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
/**
* Pass PForFactory to a PostingsWriter/ReaderBase, and get
* customized postings format plugged.
*/
public final class PForPostingsFormat extends PostingsFormat {
private final int minBlockSize;
private final int maxBlockSize;
public final static int DEFAULT_BLOCK_SIZE = 128;
public PForPostingsFormat() {
super("PFor");
this.minBlockSize = BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE;
this.maxBlockSize = BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE;
}
public PForPostingsFormat(int minBlockSize, int maxBlockSize) {
super("PFor");
this.minBlockSize = minBlockSize;
assert minBlockSize > 1;
this.maxBlockSize = maxBlockSize;
assert minBlockSize <= maxBlockSize;
}
@Override
public String toString() {
return getName() + "(blocksize=" + DEFAULT_BLOCK_SIZE+ ")";
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
// TODO: implement a new PostingsWriterBase to improve skip-settings
PostingsWriterBase postingsWriter = new SepPostingsWriter(state, new PForFactory());
boolean success = false;
try {
FieldsConsumer ret = new BlockTreeTermsWriter(state,
postingsWriter,
minBlockSize,
maxBlockSize);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsWriter);
}
}
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new SepPostingsReader(state.dir,
state.fieldInfos,
state.segmentInfo,
state.context,
new PForFactory(),
state.segmentSuffix);
boolean success = false;
try {
FieldsProducer ret = new BlockTreeTermsReader(state.dir,
state.fieldInfos,
state.segmentInfo.name,
postingsReader,
state.context,
state.segmentSuffix,
state.termsIndexDivisor);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsReader);
}
}
}
}

View File

@ -1,343 +0,0 @@
package org.apache.lucene.codecs.pfor;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.nio.IntBuffer;
import java.nio.ByteBuffer;
import java.util.Arrays;
/**
* Encode all small values and exception pointers in normal area;
* Encode large values in exception area;
* Size per exception is variable, possibly: 1byte, 2bytes, or 4bytes
*/
public final class PForUtil extends ForUtil {
protected static final int[] PER_EXCEPTION_SIZE = {1,2,4};
/** Compress given int[] into Integer buffer, with PFor format
*
* @param data uncompressed data
* @param intBuffer integer buffer to hold compressed data
* @return block header
*/
public static int compress(final int[] data, IntBuffer intBuffer) {
/** estimate minimum compress size to determine numFrameBits */
int numBits=getNumBits(data);
if (numBits == 0) {
return compressDuplicateBlock(data,intBuffer);
}
int size = data.length;
int[] excValues = new int[size];
int excNum = 0, excLastPos = -1, excFirstPos = -1, excLastNonForcePos = -1;
// num of exception until the last non-forced exception
int excNumBase = 0;
// bytes per exception
int excBytes = 1;
// bytes before exception area, e.g. header and normal area
int excByteOffset = 0;
// the max value possible for current exception pointer,
// value of the first pointer is limited by header as 254
// (first exception ranges from -1 ~ 254)
long maxChainFirst = 254;
long maxChain = maxChainFirst + 1;
boolean conValue, conForce, conEnd;
int i=0;
/** estimate exceptions */
for (i=0; i<size; ++i) {
conValue = ((data[i] & MASK[numBits]) != data[i]); // value exception
conForce = (i >= maxChain + excLastPos); // force exception
if (conValue || conForce) {
excValues[excNum++] = data[i];
if (excLastPos == -1) {
maxChain = 1L<<numBits;
excFirstPos = i;
}
if (conValue) {
excLastNonForcePos = i;
excNumBase = excNum;
}
excLastPos = i;
}
}
/** encode normal area, record exception positions */
excNum = 0;
if (excFirstPos < 0) { // no exception
for (i=0; i<size; ++i) {
encodeNormalValue(intBuffer,i,data[i], numBits);
}
excLastPos = -1;
} else {
for (i=0; i<excFirstPos; ++i) {
encodeNormalValue(intBuffer,i,data[i], numBits);
}
maxChain = 1L<<numBits;
excLastPos = excFirstPos;
excNum = i<size? 1:0;
for (i=excFirstPos+1; i<size; ++i) {
conValue = ((data[i] & MASK[numBits]) != data[i]); // value exception
conForce = (i >= maxChain + excLastPos); // force exception
conEnd = (excNum == excNumBase); // following forced ignored
if ((!conValue && !conForce) || conEnd) {
encodeNormalValue(intBuffer,i,data[i], numBits);
} else {
encodeNormalValue(intBuffer, excLastPos, i-excLastPos-1, numBits);
excNum++;
excLastPos = i;
}
}
}
/** encode exception area */
for (i=0; i<excNum; ++i) {
if (excBytes < 2 && (excValues[i] & ~MASK[8]) != 0) {
excBytes=2;
}
if (excBytes < 4 && (excValues[i] & ~MASK[16]) != 0) {
excBytes=4;
}
}
excByteOffset = (size*numBits + 7)/8;
encodeExcValues(intBuffer, excValues, excNum, excBytes, excByteOffset);
/** encode header */
int encodedSize = (excByteOffset + excBytes*excNum + 3)/4;
return getHeader(encodedSize, numBits, excNum, excFirstPos, excBytes);
}
/** Decompress given Integer buffer into int array.
*
* @param intBuffer integer buffer to hold compressed data
* @param data int array to hold uncompressed data
*/
public static void decompress(IntBuffer intBuffer, int[] data, int header) {
// since this buffer is reused at upper level, rewind first
intBuffer.rewind();
int excNum = ((header >> 8) & MASK[8]) + 1;
int excFirstPos = ((header >> 16) & MASK[8]) - 1;
int excBytes = PER_EXCEPTION_SIZE[(header >> 30) & MASK[2]];
int numBits = ((header >> 24) & MASK[6]);
decompressCore(intBuffer, data, numBits);
patchException(intBuffer,data,excNum,excFirstPos,excBytes);
}
/**
* Encode exception values into exception area.
* The width for each exception will be fixed as:
* 1, 2, or 4 byte(s).
*/
static void encodeExcValues(IntBuffer intBuffer, int[] values, int num, int perbytes, int byteOffset) {
if (num == 0)
return;
if (perbytes == 1) {
int curBytePos = byteOffset;
for (int i=0; i<num; ++i) {
int curIntPos = curBytePos / 4;
setBufferIntBits(intBuffer, curIntPos, (curBytePos & 3)*8, 8, values[i]);
curBytePos++;
}
} else if (perbytes == 2) {
int shortOffset = (byteOffset+1)/2;
int curIntPos = shortOffset/2;
int i=0;
if ((shortOffset & 1) == 1) { // cut head to ensure remaining fit ints
setBufferIntBits(intBuffer, curIntPos++, 16, 16, values[i++]);
}
for (; i<num-1; i+=2) {
intBuffer.put(curIntPos++, (values[i+1]<<16) | values[i]);
}
if (i<num) {
intBuffer.put(curIntPos, values[i]); // cut tail, also clear high 16 bits
}
} else if (perbytes == 4) {
int curIntPos = (byteOffset+3) / 4;
for (int i=0; i<num; ++i) {
intBuffer.put(curIntPos++, values[i]);
}
}
}
/**
* Save only header when the whole block equals to 1
*/
static int compressDuplicateBlock(final int[] data, IntBuffer intBuffer) {
intBuffer.put(0,data[0]);
return getHeader(1, 0, 0, -1, 0);
}
/**
* Decode exception values base on the exception pointers in normal area,
* and values in exception area.
* As for current implementation, numInts is hardwired as 128, so the
* tail of normal area is naturally aligned to 32 bits, and we don't need to
* rewind intBuffer here.
* However, the normal area may share a same int with exception area,
* when numFrameBits * numInts % 32 != 0,
* In this case we should preprocess patch several heading exceptions,
* before calling this method.
*
*/
public static void patchException(IntBuffer intBuffer, int[] data, int excNum, int excFirstPos, int excBytes) {
if (excFirstPos == -1) {
return;
}
int curPos=excFirstPos;
int i,j;
if (excBytes == 1) { // each exception consumes 1 byte
for (i=0; i+3<excNum; i+=4) {
final int curInt = intBuffer.get();
curPos = patch(data, curPos, (curInt) & MASK[8]);
curPos = patch(data, curPos, (curInt >>> 8) & MASK[8]);
curPos = patch(data, curPos, (curInt >>> 16) & MASK[8]);
curPos = patch(data, curPos, (curInt >>> 24) & MASK[8]);
}
if (i<excNum) {
final int curInt = intBuffer.get();
for (j=0; j<32 && i<excNum; j+=8,i++) {
curPos = patch(data, curPos, (curInt >>> j) & MASK[8]);
}
}
} else if (excBytes == 2) { // each exception consumes 2 bytes
for (i=0; i+1<excNum; i+=2) {
final int curInt = intBuffer.get();
curPos = patch(data, curPos, (curInt) & MASK[16]);
curPos = patch(data, curPos, (curInt >>> 16) & MASK[16]);
}
if (i<excNum) {
final int curInt = intBuffer.get();
curPos = patch(data, curPos, (curInt) & MASK[16]);
}
} else if (excBytes == 4) { // each exception consumes 4 bytes
for (i=0; i<excNum; i++) {
curPos = patch(data, curPos, intBuffer.get());
}
}
}
static int patch(int[]data, int pos, int value) {
int nextPos = data[pos] + pos + 1;
data[pos] = value;
assert nextPos > pos;
return nextPos;
}
/**
* Estimate best number of frame bits according to minimum compressed size.
* It will run 32 times.
*/
static int getNumBits(final int[] data) {
if (isAllEqual(data)) {
return 0;
}
int optBits=1;
int optSize=estimateCompressedSize(data,optBits);
for (int i=2; i<=32; ++i) {
int curSize=estimateCompressedSize(data,i);
if (curSize<optSize) {
optSize=curSize;
optBits=i;
}
}
return optBits;
}
/**
* Iterate the whole block to get maximum exception bits,
* and estimate compressed size without forced exception.
* TODO: foresee forced exception for better estimation
*/
static int estimateCompressedSize(final int[] data, int numBits) {
int size=data.length;
int totalBytes=(numBits*size+7)/8; // always round to byte
int excNum=0;
int curExcBytes=1;
for (int i=0; i<size; ++i) {
if ((data[i] & ~MASK[numBits]) != 0) { // exception
excNum++;
if (curExcBytes<2 && (data[i] & ~MASK[8]) != 0) { // exceed 1 byte exception
curExcBytes=2;
}
if (curExcBytes<4 && (data[i] & ~MASK[16]) != 0) { // exceed 2 byte exception
curExcBytes=4;
}
}
}
if (curExcBytes==2) {
totalBytes=((totalBytes+1)/2)*2; // round up to 2x bytes before filling exceptions
}
else if (curExcBytes==4) {
totalBytes=((totalBytes+3)/4)*4; // round up to 4x bytes
}
totalBytes+=excNum*curExcBytes;
return totalBytes/4*4; // round up to ints
}
/**
* Generate the 4 byte header which contains (from lsb to msb):
*
* 8 bits for encoded block int size (excluding header, this limits DEFAULT_BLOCK_SIZE <= 2^(8-1))
*
* 8 bits for exception num - 1 (when no exceptions, this is undefined)
*
* 8 bits for the index of the first exception + 1 (when no exception, this is 0)
*
* 6 bits for num of frame bits (when 0, values in this block are all the same)
* 2 bits for the exception code: 00: byte, 01: short, 10: int
*
*/
static int getHeader(int encodedSize, int numBits, int excNum, int excFirstPos, int excBytes) {
return (encodedSize)
| (((excNum-1) & MASK[8]) << 8)
| ((excFirstPos+1) << 16)
| ((numBits) << 24)
| ((excBytes/2) << 30);
}
/**
* Expert: get metadata from header.
*/
public static int getEncodedSize(int header) {
return ((header & MASK[8]))*4;
}
public static int getExcNum(int header) {
return ((header >> 8) & MASK[8]) + 1;
}
public static int getFirstPos(int header) {
return ((header >> 16) & MASK[8]) - 1;
}
public static int getExcBytes(int header) {
return PER_EXCEPTION_SIZE[(header >> 30) & MASK[2]];
}
public static int getNumBits(int header) {
return ((header >> 24) & MASK[6]);
}
}

View File

@ -17,8 +17,6 @@ org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat
org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat
org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat
org.apache.lucene.codecs.memory.MemoryPostingsFormat org.apache.lucene.codecs.memory.MemoryPostingsFormat
org.apache.lucene.codecs.pfor.ForPostingsFormat
org.apache.lucene.codecs.pfor.PForPostingsFormat
org.apache.lucene.codecs.bulkvint.BulkVIntPostingsFormat org.apache.lucene.codecs.bulkvint.BulkVIntPostingsFormat
org.apache.lucene.codecs.block.BlockPostingsFormat org.apache.lucene.codecs.block.BlockPostingsFormat
org.apache.lucene.codecs.memory.DirectPostingsFormat org.apache.lucene.codecs.memory.DirectPostingsFormat

View File

@ -1,293 +0,0 @@
package org.apache.lucene.codecs.pfor;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.nio.ByteBuffer;
import java.nio.IntBuffer;
import java.util.Arrays;
import java.util.Collections;
import java.util.Locale;
import java.util.Random;
import org.apache.lucene.codecs.pfor.ForPostingsFormat;
import org.apache.lucene.codecs.pfor.PForUtil;
import org.apache.lucene.util.LuceneTestCase;
/**
* Test the core utility for PFor compress and decompress
* We don't specially provide test case for For encoder/decoder, since
* PFor is a extended version of For, and most methods will be reused
* here.
*/
public class TestPForUtil extends LuceneTestCase {
static final int[] MASK={ 0x00000000,
0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f,
0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff, 0x0003ffff,
0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff,
0x7fffffff, 0xffffffff};
Random gen;
public void initRandom() {
this.gen = random();
}
/**
* Should not encode extra information other than single int
*/
public void testAllEqual() throws Exception {
initRandom();
int sz=ForPostingsFormat.DEFAULT_BLOCK_SIZE;
int[] data=new int[sz];
byte[] res = new byte[sz*8];
int[] copy = new int[sz];
IntBuffer resBuffer = ByteBuffer.wrap(res).asIntBuffer();
int ensz;
int header;
Arrays.fill(data,gen.nextInt());
header = ForUtil.compress(data,resBuffer); // test For
ensz = ForUtil.getEncodedSize(header);
assert ensz == 4;
ForUtil.decompress(resBuffer,copy,header);
assert cmp(data,sz,copy,sz)==true;
Arrays.fill(data,gen.nextInt());
header = PForUtil.compress(data,resBuffer); // test PFor
ensz = PForUtil.getEncodedSize(header);
assert ensz == 4;
PForUtil.decompress(resBuffer,copy,header);
assert cmp(data,sz,copy,sz)==true;
}
/**
* Test correctness of forced exception.
* the forced ones should exactly fit max chain
*/
public void testForcedExceptionDistance() throws Exception {
initRandom();
int sz=ForPostingsFormat.DEFAULT_BLOCK_SIZE;
int[] data=new int[sz];
byte[] res = new byte[sz*8];
int[] copy = new int[sz];
IntBuffer resBuffer = ByteBuffer.wrap(res).asIntBuffer();
int numBits = gen.nextInt(5)+1;
int i,j;
int pace, ensz, header;
int expect, got;
// fill exception value with same pace, there should
// be no forced exceptions.
createDistribution(data, sz, 1, MASK[numBits], MASK[numBits]);
pace = 1<<numBits;
for (i=0,j=0; i<sz; i+=pace) {
int exc = gen.nextInt();
data[i] = (exc & 0xffff0000) == 0 ? exc | 0xffff0000 : exc;
j++;
}
header = PForUtil.compress(data,resBuffer);
ensz = PForUtil.getEncodedSize(header);
expect = j;
got = PForUtil.getExcNum(header);
assert expect == got: expect+" expected but got "+got;
// there should exactly one forced exception before each
// exception when i>0
createDistribution(data, sz, 1, MASK[numBits], MASK[numBits]);
pace = (1<<numBits)+1;
for (i=0,j=0; i<sz; i+=pace) {
int exc = gen.nextInt();
data[i] = (exc & 0xffff0000) == 0 ? exc | 0xffff0000 : exc;
j++;
}
header = PForUtil.compress(data,resBuffer);
ensz = PForUtil.getEncodedSize(header);
expect = 2*(j-1)+1;
got = PForUtil.getExcNum(header);
assert expect == got: expect+" expected but got "+got;
// two forced exception
createDistribution(data, sz, 1, MASK[numBits], MASK[numBits]);
pace = (1<<numBits)*2+1;
for (i=0,j=0; i<sz; i+=pace) {
int exc = gen.nextInt();
data[i] = (exc & 0xffff0000) == 0 ? exc | 0xffff0000 : exc;
j++;
}
header = PForUtil.compress(data,resBuffer);
ensz = PForUtil.getEncodedSize(header);
expect = 3*(j-1)+1;
got = PForUtil.getExcNum(header);
assert expect == got: expect+" expected but got "+got;
}
/**
* Test correctness of ignored forced exception.
* The trailing forced exceptions should always be reverted
* since they're not necessary.
*/
public void testTrailingForcedException() throws Exception {
initRandom();
int sz=ForPostingsFormat.DEFAULT_BLOCK_SIZE;
assert sz % 32 == 0;
Integer[] buff= new Integer[sz];
int[] data = new int[sz];
int[] copy = new int[sz];
byte[] res = new byte[sz*8];
IntBuffer resBuffer = ByteBuffer.wrap(res).asIntBuffer();
int excIndex = gen.nextInt(sz/2);
int excValue = gen.nextInt();
if ((excValue & 0xffff0000) == 0) {
excValue |= 0xffff0000; // always prepare a 4 bytes exception
}
// make value of numFrameBits to be small,
// thus easy to get forced exceptions
for (int i=0; i<sz; ++i) {
buff[i]=gen.nextInt() & 1;
}
// create only one value exception
buff[excIndex]=excValue;
for (int i=0; i<sz; ++i)
data[i] = buff[i];
int header = PForUtil.compress(data,resBuffer);
int ensz = PForUtil.getEncodedSize(header);
assert (ensz <= sz*8): ensz+" > "+sz*8; // must not exceed the loose upperbound
assert (ensz >= 4); // at least we have an exception, right?
PForUtil.decompress(resBuffer,copy,header);
// println(getHex(data,sz)+"\n");
// println(getHex(res,ensz)+"\n");
// println(getHex(copy,sz)+"\n");
// fetch the last int, i.e. last exception.
int lastExc = (res[ensz-4] << 24) |
((0xff & res[ensz-3]) << 16) |
((0xff & res[ensz-2]) << 8 ) |
(0xff & res[ensz-1]);
// trailing forced exceptions are suppressed,
// so the last exception should be what we assigned.
assert lastExc==excValue;
assert cmp(data,sz,copy,sz)==true;
}
/**
* Test correctness of compressing and decompressing.
* Here we randomly assign a rate of exception (i.e. 1-alpha),
* and test different scale of normal/exception values.
*/
public void testAllDistribution() throws Exception {
initRandom();
int sz = ForPostingsFormat.DEFAULT_BLOCK_SIZE;
int[] data = new int[sz];
for (int i=0; i<=32; ++i) { // try to test every kinds of distribution
double alpha=gen.nextDouble(); // rate of normal value
for (int j=i; j<=32; ++j) {
createDistribution(data,sz,alpha,MASK[i],MASK[j]);
tryCompressAndDecompress(data, sz);
}
}
}
public void createDistribution(int[] data, int sz, double alpha, int masknorm, int maskexc) {
Integer[] buff= new Integer[sz];
int i=0;
for (; i<sz*alpha; ++i)
buff[i]=gen.nextInt() & masknorm;
for (; i<sz; ++i)
buff[i]=gen.nextInt() & maskexc;
Collections.shuffle(Arrays.asList(buff),gen);
for (i=0; i<sz; ++i)
data[i] = buff[i];
}
public void tryCompressAndDecompress(final int[] data, int sz) throws Exception {
byte[] res = new byte[sz*8]; // loosely upperbound
IntBuffer resBuffer = ByteBuffer.wrap(res).asIntBuffer();
int header = PForUtil.compress(data,resBuffer);
int ensz = PForUtil.getEncodedSize(header);
assert (ensz <= sz*8); // must not exceed the loose upperbound
int[] copy = new int[sz];
PForUtil.decompress(resBuffer,copy,header);
// println(getHex(data,sz)+"\n");
// println(getHex(res,ensz)+"\n");
// println(getHex(copy,sz)+"\n");
assert cmp(data,sz,copy,sz)==true;
}
public boolean cmp(int[] a, int sza, int[] b, int szb) {
if (sza!=szb)
return false;
for (int i=0; i<sza; ++i) {
if (a[i]!=b[i]) {
System.err.println(String.format(Locale.ENGLISH, "! %08x != %08x in %d",a[i],b[i],i));
return false;
}
}
return true;
}
public static String getHex( byte [] raw, int sz ) {
final String HEXES = "0123456789ABCDEF";
if ( raw == null ) {
return null;
}
final StringBuilder hex = new StringBuilder( 2 * raw.length );
for ( int i=0; i<sz; i++ ) {
if (i>0 && (i)%16 == 0)
hex.append("\n");
byte b=raw[i];
hex.append(HEXES.charAt((b & 0xF0) >> 4))
.append(HEXES.charAt((b & 0x0F)))
.append(" ");
}
return hex.toString();
}
public static String getHex( int [] raw, int sz ) {
if ( raw == null ) {
return null;
}
final StringBuilder hex = new StringBuilder( 4 * raw.length );
for ( int i=0; i<sz; i++ ) {
if (i>0 && i%8 == 0)
hex.append("\n");
hex.append(String.format(Locale.ENGLISH, "%08x ",raw[i]));
}
return hex.toString();
}
static void eprintln(String format, Object... args) {
System.err.println(String.format(Locale.ENGLISH, format,args));
}
static void println(String format, Object... args) {
System.out.println(String.format(Locale.ENGLISH, format,args));
}
static void print(String format, Object... args) {
System.out.print(String.format(Locale.ENGLISH, format,args));
}
}

View File

@ -61,7 +61,6 @@ import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil; import org.apache.lucene.util._TestUtil;
import org.apache.lucene.codecs.pfor.*;
/** /**
* Randomly combines terms index impl w/ postings impls. * Randomly combines terms index impl w/ postings impls.
@ -103,8 +102,6 @@ public class MockRandomPostingsFormat extends PostingsFormat {
final int baseBlockSize = _TestUtil.nextInt(random, 1, 127); final int baseBlockSize = _TestUtil.nextInt(random, 1, 127);
delegates.add(new MockVariableIntBlockPostingsFormat.MockIntFactory(baseBlockSize)); delegates.add(new MockVariableIntBlockPostingsFormat.MockIntFactory(baseBlockSize));
// TODO: others // TODO: others
delegates.add(new ForFactory());
delegates.add(new PForFactory());
} }
private static String getExtension(String fileName) { private static String getExtension(String fileName) {

View File

@ -282,9 +282,7 @@ public abstract class LuceneTestCase extends Assert {
"MockFixedIntBlock", "MockFixedIntBlock",
"MockVariableIntBlock", "MockVariableIntBlock",
"MockSep", "MockSep",
"MockRandom", "MockRandom"
"For",
"PFor"
)); ));
// ----------------------------------------------------------------- // -----------------------------------------------------------------