mirror of https://github.com/apache/lucene.git
LUCENE-3892: remove oal.codecs.pfor (it's slower than block); add new BlockPacked postings format (copy of Block postings format except it uses oal.util.packed for packed ints encode/decode)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/pforcodec_3892@1367338 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 33f6da286e
commit 94aecff6c3
@@ -24,7 +24,6 @@ import java.nio.IntBuffer;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.pfor.ForUtil;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
@@ -43,8 +42,6 @@ import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;

// nocommit move ForUtil here?

// nocommit javadocs
public final class BlockPostingsReader extends PostingsReaderBase {
@@ -26,7 +26,6 @@ import java.util.List;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.codecs.pfor.ForUtil; // nocommit move here?
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
@@ -1,4 +1,4 @@
package org.apache.lucene.codecs.pfor;
package org.apache.lucene.codecs.block;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
@@ -1,4 +1,4 @@
package org.apache.lucene.codecs.pfor;
package org.apache.lucene.codecs.block;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
@@ -48,7 +48,7 @@ def genDecompress():
  f = open(fileName, 'w')
  w = f.write
  try:
    w("package org.apache.lucene.codecs.pfor;\n")
    w("package org.apache.lucene.codecs.block;\n")
    w("""/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
@@ -1,4 +1,4 @@
package org.apache.lucene.codecs.pfor;
package org.apache.lucene.codecs.blockpacked;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -18,49 +18,41 @@ package org.apache.lucene.codecs.pfor;
 */

import java.io.IOException;
import java.util.Set;

import org.apache.lucene.codecs.BlockTreeTermsReader;
import org.apache.lucene.codecs.BlockTreeTermsWriter;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.FixedGapTermsIndexReader;
import org.apache.lucene.codecs.FixedGapTermsIndexWriter;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.TermsIndexReaderBase;
import org.apache.lucene.codecs.TermsIndexWriterBase;
import org.apache.lucene.codecs.sep.SepPostingsReader;
import org.apache.lucene.codecs.sep.SepPostingsWriter;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;

/**
 * Pass ForFactory to a PostingsWriter/ReaderBase, and get
 * customized postings format plugged.
 */
public final class ForPostingsFormat extends PostingsFormat {
  private final int minBlockSize;
  private final int maxBlockSize;
public final class BlockPackedPostingsFormat extends PostingsFormat {
  public static final String DOC_EXTENSION = "doc";
  public static final String POS_EXTENSION = "pos";
  public static final String PAY_EXTENSION = "pay";

  private final int minTermBlockSize;
  private final int maxTermBlockSize;
  public final static int DEFAULT_BLOCK_SIZE = 128;

  public ForPostingsFormat() {
    super("For");
    this.minBlockSize = BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE;
    this.maxBlockSize = BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE;
  public BlockPackedPostingsFormat() {
    this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
  }

  public ForPostingsFormat(int minBlockSize, int maxBlockSize) {
    super("For");
    this.minBlockSize = minBlockSize;
    assert minBlockSize > 1;
    this.maxBlockSize = maxBlockSize;
    assert minBlockSize <= maxBlockSize;
  public BlockPackedPostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
    super("BlockPacked");
    this.minTermBlockSize = minTermBlockSize;
    assert minTermBlockSize > 1;
    this.maxTermBlockSize = maxTermBlockSize;
    assert minTermBlockSize <= maxTermBlockSize;
  }

  @Override
@@ -71,13 +63,14 @@ public final class ForPostingsFormat extends PostingsFormat {
  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    // TODO: implement a new PostingsWriterBase to improve skip-settings
    PostingsWriterBase postingsWriter = new SepPostingsWriter(state, new ForFactory());
    PostingsWriterBase postingsWriter = new BlockPackedPostingsWriter(state, 128);

    boolean success = false;
    try {
      FieldsConsumer ret = new BlockTreeTermsWriter(state,
                                                    postingsWriter,
                                                    minBlockSize,
                                                    maxBlockSize);
                                                    minTermBlockSize,
                                                    maxTermBlockSize);
      success = true;
      return ret;
    } finally {
@@ -89,13 +82,12 @@ public final class ForPostingsFormat extends PostingsFormat {

  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
    PostingsReaderBase postingsReader = new SepPostingsReader(state.dir,
                                                              state.fieldInfos,
                                                              state.segmentInfo,
                                                              state.context,
                                                              new ForFactory(),
                                                              state.segmentSuffix);

    PostingsReaderBase postingsReader = new BlockPackedPostingsReader(state.dir,
                                                                      state.fieldInfos,
                                                                      state.segmentInfo,
                                                                      state.context,
                                                                      state.segmentSuffix,
                                                                      128);
    boolean success = false;
    try {
      FieldsProducer ret = new BlockTreeTermsReader(state.dir,
File diff suppressed because it is too large
@@ -0,0 +1,592 @@
package org.apache.lucene.codecs.blockpacked;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.IntBuffer;
import java.nio.LongBuffer;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;

// nocommit javadocs

public final class BlockPackedPostingsWriter extends PostingsWriterBase {

  private boolean DEBUG = BlockPackedPostingsReader.DEBUG;

  // nocommit move these constants to the PF:

  static final int maxSkipLevels = 10;

  final static String TERMS_CODEC = "BlockPackedPostingsWriterTerms";
  final static String DOC_CODEC = "BlockPackedPostingsWriterDoc";
  final static String POS_CODEC = "BlockPackedPostingsWriterPos";
  final static String PAY_CODEC = "BlockPackedPostingsWriterPay";

  // Increment version to change it:
  final static int VERSION_START = 0;
  final static int VERSION_CURRENT = VERSION_START;

  final IndexOutput docOut;
  final IndexOutput posOut;
  final IndexOutput payOut;

  static final int DEFAULT_BLOCK_SIZE = 128;

  final int blockSize;

  private IndexOutput termsOut;

  // How current field indexes postings:
  private boolean fieldHasFreqs;
  private boolean fieldHasPositions;
  private boolean fieldHasOffsets;
  private boolean fieldHasPayloads;

  // Holds starting file pointers for each term:
  private long docTermStartFP;
  private long posTermStartFP;
  private long payTermStartFP;

  final long[] docDeltaBuffer;
  final long[] freqBuffer;
  final LongBuffer docDeltaLBuffer;
  final LongBuffer freqLBuffer;
  private int docBufferUpto;

  final long[] posDeltaBuffer;
  final long[] payloadLengthBuffer;
  final long[] offsetStartDeltaBuffer;
  final long[] offsetLengthBuffer;
  final LongBuffer posDeltaLBuffer;
  final LongBuffer payloadLengthLBuffer;
  final LongBuffer offsetStartDeltaLBuffer;
  final LongBuffer offsetLengthLBuffer;
  private int posBufferUpto;

  private byte[] payloadBytes;
  private int payloadByteUpto;

  private int lastBlockDocID;
  private boolean saveNextPosBlock;
  private long lastBlockPosFP;
  private long lastBlockPayFP;
  private int lastBlockPosBufferUpto;
  private int lastBlockEndOffset;
  private int lastBlockPayloadByteUpto;
  private int lastDocID;
  private int lastPosition;
  private int lastEndOffset;
  private int docCount;

  final byte[] encoded;
  final LongBuffer encodedBuffer;

  private final BlockPackedSkipWriter skipWriter;

  public BlockPackedPostingsWriter(SegmentWriteState state, int blockSize) throws IOException {
    super();
    this.blockSize = blockSize;

    docOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPackedPostingsFormat.DOC_EXTENSION),
                                          state.context);
    IndexOutput posOut = null;
    IndexOutput payOut = null;
    boolean success = false;
    try {
      CodecUtil.writeHeader(docOut, DOC_CODEC, VERSION_CURRENT);
      if (state.fieldInfos.hasProx()) {
        posDeltaBuffer = new long[blockSize];
        posDeltaLBuffer = LongBuffer.wrap(posDeltaBuffer);
        posOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPackedPostingsFormat.POS_EXTENSION),
                                              state.context);
        CodecUtil.writeHeader(posOut, POS_CODEC, VERSION_CURRENT);

        if (state.fieldInfos.hasPayloads()) {
          payloadBytes = new byte[128];
          payloadLengthBuffer = new long[blockSize];
          payloadLengthLBuffer = LongBuffer.wrap(payloadLengthBuffer);
        } else {
          payloadBytes = null;
          payloadLengthBuffer = null;
          payloadLengthLBuffer = null;
        }

        if (state.fieldInfos.hasOffsets()) {
          offsetStartDeltaBuffer = new long[blockSize];
          offsetLengthBuffer = new long[blockSize];
          offsetStartDeltaLBuffer = LongBuffer.wrap(offsetStartDeltaBuffer);
          offsetLengthLBuffer = LongBuffer.wrap(offsetLengthBuffer);
        } else {
          offsetStartDeltaBuffer = null;
          offsetLengthBuffer = null;
          offsetStartDeltaLBuffer = null;
          offsetLengthLBuffer = null;
        }

        if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) {
          payOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockPackedPostingsFormat.PAY_EXTENSION),
                                                state.context);
          CodecUtil.writeHeader(payOut, PAY_CODEC, VERSION_CURRENT);
        }
      } else {
        posDeltaBuffer = null;
        payloadLengthBuffer = null;
        offsetStartDeltaBuffer = null;
        offsetLengthBuffer = null;
        payloadBytes = null;
        posDeltaLBuffer = null;
        payloadLengthLBuffer = null;
        offsetStartDeltaLBuffer = null;
        offsetLengthLBuffer = null;
      }
      this.payOut = payOut;
      this.posOut = posOut;
      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(docOut, posOut, payOut);
      }
    }

    docDeltaBuffer = new long[blockSize];
    freqBuffer = new long[blockSize];
    docDeltaLBuffer = LongBuffer.wrap(docDeltaBuffer);
    freqLBuffer = LongBuffer.wrap(freqBuffer);

    skipWriter = new BlockPackedSkipWriter(blockSize,
                                           maxSkipLevels,
                                           state.segmentInfo.getDocCount(),
                                           docOut,
                                           posOut,
                                           payOut);

    encoded = new byte[blockSize*4];
    encodedBuffer = ByteBuffer.wrap(encoded).asLongBuffer();
  }

  @Override
  public void start(IndexOutput termsOut) throws IOException {
    this.termsOut = termsOut;
    CodecUtil.writeHeader(termsOut, TERMS_CODEC, VERSION_CURRENT);
    termsOut.writeVInt(blockSize);
  }

  @Override
  public void setField(FieldInfo fieldInfo) {
    IndexOptions indexOptions = fieldInfo.getIndexOptions();
    fieldHasFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
    fieldHasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    fieldHasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    fieldHasPayloads = fieldInfo.hasPayloads();
    skipWriter.setField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads);
  }

  @Override
  public void startTerm() {
    docTermStartFP = docOut.getFilePointer();
    if (fieldHasPositions) {
      posTermStartFP = posOut.getFilePointer();
      if (fieldHasPayloads || fieldHasOffsets) {
        payTermStartFP = payOut.getFilePointer();
      }
    }
    lastBlockDocID = -1;
    lastDocID = 0;
    if (DEBUG) {
      System.out.println("FPW.startTerm startFP=" + docTermStartFP);
    }
    skipWriter.resetSkip();
  }

  private void writeBlock(LongBuffer buffer, IndexOutput out) throws IOException {
    final int header = ForUtil.compress(buffer, encodedBuffer);
    out.writeVInt(header);
    out.writeBytes(encoded, ForUtil.getEncodedSize(header));
  }

  @Override
  public void startDoc(int docID, int termDocFreq) throws IOException {
    if (DEBUG) {
      System.out.println("FPW.startDoc docID=" + docID);
    }

    // nocommit do this in finishDoc... but does it fail...?
    // is it not always called...?
    if (posOut != null && saveNextPosBlock) {
      lastBlockPosFP = posOut.getFilePointer();
      if (payOut != null) {
        lastBlockPayFP = payOut.getFilePointer();
      }
      lastBlockPosBufferUpto = posBufferUpto;
      lastBlockEndOffset = lastEndOffset;
      lastBlockPayloadByteUpto = payloadByteUpto;
      saveNextPosBlock = false;
      if (DEBUG) {
        System.out.println(" now save lastBlockPosFP=" + lastBlockPosFP + " lastBlockPosBufferUpto=" + lastBlockPosBufferUpto + " lastBlockPayloadByteUpto=" + lastBlockPayloadByteUpto);
      }
    }

    final int docDelta = docID - lastDocID;
    if (docID < 0 || (docCount > 0 && docDelta <= 0)) {
      throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " ) (docOut: " + docOut + ")");
    }
    lastDocID = docID;

    docDeltaBuffer[docBufferUpto] = docDelta;
    if (DEBUG) {
      System.out.println(" docDeltaBuffer[" + docBufferUpto + "]=" + docDelta);
    }
    if (fieldHasFreqs) {
      freqBuffer[docBufferUpto] = termDocFreq;
    }

    docBufferUpto++;
    docCount++;

    if (docBufferUpto == blockSize) {
      // nocommit maybe instead of buffering skip before
      // writing a block based on last block's end data
      // ... we could buffer after writing the block? only
      // iffiness with that approach is it could be a
      // pointless skip? like we may stop adding docs
      // right after that, then we have skip point AFTER
      // last doc. the thing is, in finishTerm we are
      // already sometimes adding a skip point AFTER the
      // last doc?
      if (lastBlockDocID != -1) {
        if (DEBUG) {
          System.out.println(" bufferSkip at writeBlock: lastDocID=" + lastBlockDocID + " docCount=" + (docCount-blockSize));
        }
        skipWriter.bufferSkip(lastBlockDocID, docCount-blockSize, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockEndOffset, lastBlockPayloadByteUpto);
      }
      lastBlockDocID = docID;
      saveNextPosBlock = true;

      if (DEBUG) {
        System.out.println(" write docDelta block @ fp=" + docOut.getFilePointer());
      }
      writeBlock(docDeltaLBuffer, docOut);
      if (fieldHasFreqs) {
        if (DEBUG) {
          System.out.println(" write freq block @ fp=" + docOut.getFilePointer());
        }
        writeBlock(freqLBuffer, docOut);
      }
      docBufferUpto = 0;
    }

    lastPosition = 0;
    lastEndOffset = 0;
  }

  /** Add a new position & payload */
  @Override
  public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
    if (DEBUG) {
      System.out.println("FPW.addPosition pos=" + position + " posBufferUpto=" + posBufferUpto + (fieldHasPayloads ? " payloadByteUpto=" + payloadByteUpto: ""));
    }
    posDeltaBuffer[posBufferUpto] = position - lastPosition;
    if (fieldHasPayloads) {
      if (payload == null || payload.length == 0) {
        // no payload
        payloadLengthBuffer[posBufferUpto] = 0;
      } else {
        payloadLengthBuffer[posBufferUpto] = payload.length;
        if (payloadByteUpto + payload.length > payloadBytes.length) {
          payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payload.length);
        }
        System.arraycopy(payload.bytes, payload.offset, payloadBytes, payloadByteUpto, payload.length);
        payloadByteUpto += payload.length;
      }
    }

    if (fieldHasOffsets) {
      assert startOffset >= lastEndOffset;
      assert endOffset >= startOffset;
      offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastEndOffset;
      offsetLengthBuffer[posBufferUpto] = endOffset - startOffset;
      lastEndOffset = endOffset;
    }

    posBufferUpto++;
    lastPosition = position;
    if (posBufferUpto == blockSize) {
      if (DEBUG) {
        System.out.println(" write pos bulk block @ fp=" + posOut.getFilePointer());
      }
      writeBlock(posDeltaLBuffer, posOut);

      if (fieldHasPayloads) {
        writeBlock(payloadLengthLBuffer, payOut);
        payOut.writeVInt(payloadByteUpto);
        payOut.writeBytes(payloadBytes, 0, payloadByteUpto);
        payloadByteUpto = 0;
      }
      if (fieldHasOffsets) {
        writeBlock(offsetStartDeltaLBuffer, payOut);
        writeBlock(offsetLengthLBuffer, payOut);
      }
      posBufferUpto = 0;
    }
  }

  @Override
  public void finishDoc() {
  }

  private static class PendingTerm {
    public final long docStartFP;
    public final long posStartFP;
    public final long payStartFP;
    public final int skipOffset;
    public final int lastPosBlockOffset;

    public PendingTerm(long docStartFP, long posStartFP, long payStartFP, int skipOffset, int lastPosBlockOffset) {
      this.docStartFP = docStartFP;
      this.posStartFP = posStartFP;
      this.payStartFP = payStartFP;
      this.skipOffset = skipOffset;
      this.lastPosBlockOffset = lastPosBlockOffset;
    }
  }

  private final List<PendingTerm> pendingTerms = new ArrayList<PendingTerm>();

  /** Called when we are done adding docs to this term */
  @Override
  public void finishTerm(TermStats stats) throws IOException {

    assert stats.docFreq > 0;

    // TODO: wasteful we are counting this (counting # docs
    // for this term) in two places?
    assert stats.docFreq == docCount: stats.docFreq + " vs " + docCount;

    if (DEBUG) {
      System.out.println("FPW.finishTerm docFreq=" + stats.docFreq);
    }

    // nocommit silly that skipper must write skip when no
    // postings come after it, but if we don't do this, skip
    // reader incorrectly thinks it can read another level 0
    // skip entry here!:
    //if (docCount > blockSize && docBufferUpto > 0) {
    if (docCount > blockSize) {
      final int lastDocCount = blockSize*(docCount/blockSize);
      if (DEBUG) {
        System.out.println(" bufferSkip at finishTerm: lastDocID=" + lastBlockDocID + " docCount=" + lastDocCount);
      }
      skipWriter.bufferSkip(lastBlockDocID, lastDocCount, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockEndOffset, lastBlockPayloadByteUpto);
    }

    if (DEBUG) {
      if (docBufferUpto > 0) {
        System.out.println(" write doc/freq vInt block (count=" + docBufferUpto + ") at fp=" + docOut.getFilePointer() + " docTermStartFP=" + docTermStartFP);
      }
    }

    // vInt encode the remaining doc deltas and freqs:
    for(int i=0;i<docBufferUpto;i++) {
      final int docDelta = (int)docDeltaBuffer[i];
      final int freq = (int)freqBuffer[i];
      if (!fieldHasFreqs) {
        docOut.writeVInt(docDelta);
      } else if (freqBuffer[i] == 1) {
        docOut.writeVInt((docDelta<<1)|1);
      } else {
        docOut.writeVInt(docDelta<<1);
        docOut.writeVInt(freq);
      }
    }

    final int lastPosBlockOffset;

    if (fieldHasPositions) {
      if (DEBUG) {
        if (posBufferUpto > 0) {
          System.out.println(" write pos vInt block (count=" + posBufferUpto + ") at fp=" + posOut.getFilePointer() + " posTermStartFP=" + posTermStartFP + " hasPayloads=" + fieldHasPayloads + " hasOffsets=" + fieldHasOffsets);
        }
      }

      assert stats.totalTermFreq != -1;
      if (stats.totalTermFreq > blockSize) {
        lastPosBlockOffset = (int) (posOut.getFilePointer() - posTermStartFP);
      } else {
        lastPosBlockOffset = -1;
      }
      if (posBufferUpto > 0) {
        posOut.writeVInt(posBufferUpto);

        // nocommit should we send offsets/payloads to
        // .pay...? seems wasteful (have to store extra
        // vLong for low (< blockSize) DF terms = vast vast
        // majority)

        // vInt encode the remaining positions/payloads/offsets:
        int lastPayloadLength = -1;
        int payloadBytesReadUpto = 0;
        for(int i=0;i<posBufferUpto;i++) {
          final int posDelta = (int)posDeltaBuffer[i];
          if (fieldHasPayloads) {
            final int payloadLength = (int)payloadLengthBuffer[i];
            if (payloadLength != lastPayloadLength) {
              lastPayloadLength = payloadLength;
              posOut.writeVInt((posDelta<<1)|1);
              posOut.writeVInt(payloadLength);
            } else {
              posOut.writeVInt(posDelta<<1);
            }

            if (DEBUG) {
              System.out.println(" i=" + i + " payloadLen=" + payloadLength);
            }

            if (payloadLength != 0) {
              if (DEBUG) {
                System.out.println(" write payload @ pos.fp=" + posOut.getFilePointer());
              }
              posOut.writeBytes(payloadBytes, payloadBytesReadUpto, payloadLength);
              payloadBytesReadUpto += payloadLength;
            }
          } else {
            posOut.writeVInt(posDelta);
          }

          if (fieldHasOffsets) {
            if (DEBUG) {
              System.out.println(" write offset @ pos.fp=" + posOut.getFilePointer());
            }
            posOut.writeVInt((int)offsetStartDeltaBuffer[i]);
            posOut.writeVInt((int)offsetLengthBuffer[i]);
          }
        }

        if (fieldHasPayloads) {
          assert payloadBytesReadUpto == payloadByteUpto;
          payloadByteUpto = 0;
        }
      }
      if (DEBUG) {
        System.out.println(" totalTermFreq=" + stats.totalTermFreq + " lastPosBlockOffset=" + lastPosBlockOffset);
      }
    } else {
      lastPosBlockOffset = -1;
    }

    int skipOffset;
    if (docCount > blockSize) {
      skipOffset = (int) (skipWriter.writeSkip(docOut)-docTermStartFP);

      if (DEBUG) {
        System.out.println("skip packet " + (docOut.getFilePointer() - (docTermStartFP + skipOffset)) + " bytes");
      }
    } else {
      skipOffset = -1;
      if (DEBUG) {
        System.out.println(" no skip: docCount=" + docCount);
      }
    }

    long payStartFP;
    if (stats.totalTermFreq >= blockSize) {
      payStartFP = payTermStartFP;
    } else {
      payStartFP = -1;
    }

    if (DEBUG) {
      System.out.println(" payStartFP=" + payStartFP);
    }

    pendingTerms.add(new PendingTerm(docTermStartFP, posTermStartFP, payStartFP, skipOffset, lastPosBlockOffset));
    docBufferUpto = 0;
    posBufferUpto = 0;
    lastDocID = 0;
    docCount = 0;
  }

  private final RAMOutputStream bytesWriter = new RAMOutputStream();

  @Override
  public void flushTermsBlock(int start, int count) throws IOException {

    if (count == 0) {
      termsOut.writeByte((byte) 0);
      return;
    }

    assert start <= pendingTerms.size();
    assert count <= start;

    final int limit = pendingTerms.size() - start + count;

    long lastDocStartFP = 0;
    long lastPosStartFP = 0;
    long lastPayStartFP = 0;
    for(int idx=limit-count; idx<limit; idx++) {
      PendingTerm term = pendingTerms.get(idx);

      bytesWriter.writeVLong(term.docStartFP - lastDocStartFP);
      lastDocStartFP = term.docStartFP;

      if (fieldHasPositions) {
        bytesWriter.writeVLong(term.posStartFP - lastPosStartFP);
        lastPosStartFP = term.posStartFP;
        if (term.lastPosBlockOffset != -1) {
          bytesWriter.writeVInt(term.lastPosBlockOffset);
        }
        if ((fieldHasPayloads || fieldHasOffsets) && term.payStartFP != -1) {
          bytesWriter.writeVLong(term.payStartFP - lastPayStartFP);
          lastPayStartFP = term.payStartFP;
        }
      }

      if (term.skipOffset != -1) {
        bytesWriter.writeVInt(term.skipOffset);
      }
    }

    termsOut.writeVInt((int) bytesWriter.getFilePointer());
    bytesWriter.writeTo(termsOut);
    bytesWriter.reset();

    // Remove the terms we just wrote:
    pendingTerms.subList(limit-count, limit).clear();
  }

  @Override
  public void close() throws IOException {
    IOUtils.close(docOut, posOut, payOut);
  }
}
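A minimal standalone sketch (not part of this commit) of the vInt "freq folding" used by finishTerm above: for fields with freqs, the doc delta is shifted left one bit and the low bit flags the common freq==1 case, so a separate freq vInt can be skipped. The writeVInt helper here is a hypothetical stand-in mirroring Lucene's vInt layout (7 data bits per byte, high bit as continuation); the values are illustrative.

import java.io.ByteArrayOutputStream;

public class FreqFoldingDemo {
  // Mirrors Lucene's vInt wire format: 7 bits per byte, high bit = more bytes follow.
  static void writeVInt(ByteArrayOutputStream out, int i) {
    while ((i & ~0x7F) != 0) {
      out.write((i & 0x7F) | 0x80);
      i >>>= 7;
    }
    out.write(i);
  }

  public static void main(String[] args) {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    int[] docDeltas = {1, 3, 2};
    int[] freqs     = {1, 7, 1};
    for (int i = 0; i < docDeltas.length; i++) {
      if (freqs[i] == 1) {
        writeVInt(out, (docDeltas[i] << 1) | 1);  // low bit set: freq == 1, nothing follows
      } else {
        writeVInt(out, docDeltas[i] << 1);        // low bit clear: a freq vInt follows
        writeVInt(out, freqs[i]);
      }
    }
    System.out.println(out.size() + " bytes");    // 4 bytes for 3 postings
  }
}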
@@ -0,0 +1,205 @@
package org.apache.lucene.codecs.blockpacked;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.codecs.MultiLevelSkipListReader;
import org.apache.lucene.store.IndexInput;

/**
 * Implements the skip list reader for the 4.0 posting list format
 * that stores positions and payloads.
 *
 * @see Lucene40PostingsFormat
 * @lucene.experimental
 */
final class BlockPackedSkipReader extends MultiLevelSkipListReader {
  private boolean DEBUG = BlockPackedPostingsReader.DEBUG;

  private long docPointer[];
  private long posPointer[];
  private long payPointer[];
  private int posBufferUpto[];
  private int endOffset[];
  private int payloadByteUpto[];

  private long lastPosPointer;
  private long lastPayPointer;
  private int lastEndOffset;
  private int lastPayloadByteUpto;
  private long lastDocPointer;
  private int lastPosBufferUpto;

  public BlockPackedSkipReader(IndexInput skipStream, int maxSkipLevels, int skipInterval, boolean hasPos, boolean hasOffsets, boolean hasPayloads) {
    super(skipStream, maxSkipLevels, skipInterval);
    docPointer = new long[maxSkipLevels];
    if (hasPos) {
      posPointer = new long[maxSkipLevels];
      posBufferUpto = new int[maxSkipLevels];
      if (hasPayloads) {
        payloadByteUpto = new int[maxSkipLevels];
      } else {
        payloadByteUpto = null;
      }
      if (hasOffsets) {
        endOffset = new int[maxSkipLevels];
      } else {
        endOffset = null;
      }
      if (hasOffsets || hasPayloads) {
        payPointer = new long[maxSkipLevels];
      } else {
        payPointer = null;
      }
    } else {
      posPointer = null;
    }
  }

  public void init(long skipPointer, long docBasePointer, long posBasePointer, long payBasePointer, int df) {
    super.init(skipPointer, df);
    lastDocPointer = docBasePointer;
    lastPosPointer = posBasePointer;
    lastPayPointer = payBasePointer;

    Arrays.fill(docPointer, docBasePointer);
    if (posPointer != null) {
      Arrays.fill(posPointer, posBasePointer);
      if (payPointer != null) {
        Arrays.fill(payPointer, payBasePointer);
      }
    } else {
      assert posBasePointer == 0;
    }
  }

  /** Returns the doc pointer of the doc to which the last call of
   * {@link MultiLevelSkipListReader#skipTo(int)} has skipped. */
  public long getDocPointer() {
    return lastDocPointer;
  }

  public long getPosPointer() {
    return lastPosPointer;
  }

  public int getPosBufferUpto() {
    return lastPosBufferUpto;
  }

  public long getPayPointer() {
    return lastPayPointer;
  }

  public int getEndOffset() {
    return lastEndOffset;
  }

  public int getPayloadByteUpto() {
    return lastPayloadByteUpto;
  }

  @Override
  protected void seekChild(int level) throws IOException {
    super.seekChild(level);
    if (DEBUG) {
      System.out.println("seekChild level=" + level);
    }
    docPointer[level] = lastDocPointer;
    if (posPointer != null) {
      posPointer[level] = lastPosPointer;
      posBufferUpto[level] = lastPosBufferUpto;
      if (endOffset != null) {
        endOffset[level] = lastEndOffset;
      }
      if (payloadByteUpto != null) {
        payloadByteUpto[level] = lastPayloadByteUpto;
      }
      if (payPointer != null) {
        payPointer[level] = lastPayPointer;
      }
    }
  }

  @Override
  protected void setLastSkipData(int level) {
    super.setLastSkipData(level);
    lastDocPointer = docPointer[level];
    if (DEBUG) {
      System.out.println("setLastSkipData level=" + level);
      System.out.println(" lastDocPointer=" + lastDocPointer);
    }
    if (posPointer != null) {
      lastPosPointer = posPointer[level];
      lastPosBufferUpto = posBufferUpto[level];
      if (DEBUG) {
        System.out.println(" lastPosPointer=" + lastPosPointer + " lastPosBufferUpto=" + lastPosBufferUpto);
      }
      if (payPointer != null) {
        lastPayPointer = payPointer[level];
      }
      if (endOffset != null) {
        lastEndOffset = endOffset[level];
      }
      if (payloadByteUpto != null) {
        lastPayloadByteUpto = payloadByteUpto[level];
      }
    }
  }

  @Override
  protected int readSkipData(int level, IndexInput skipStream) throws IOException {
    if (DEBUG) {
      System.out.println("readSkipData level=" + level);
    }
    int delta = skipStream.readVInt();
    if (DEBUG) {
      System.out.println(" delta=" + delta);
    }
    docPointer[level] += skipStream.readVInt();
    if (DEBUG) {
      System.out.println(" docFP=" + docPointer[level]);
    }

    if (posPointer != null) {
      posPointer[level] += skipStream.readVInt();
      if (DEBUG) {
        System.out.println(" posFP=" + posPointer[level]);
      }
      posBufferUpto[level] = skipStream.readVInt();
      if (DEBUG) {
        System.out.println(" posBufferUpto=" + posBufferUpto[level]);
      }

      if (payloadByteUpto != null) {
        payloadByteUpto[level] = skipStream.readVInt();
      }

      if (endOffset != null) {
        endOffset[level] += skipStream.readVInt();
      }

      if (payPointer != null) {
        payPointer[level] += skipStream.readVInt();
      }
    }
    return delta;
  }
}
@@ -0,0 +1,147 @@
package org.apache.lucene.codecs.blockpacked;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.codecs.MultiLevelSkipListWriter;

// nocommit do we need more frequent skips at level > 0?
// 128*128 is immense? may need to decouple
// baseSkipInterval & theRestSkipInterval?

final class BlockPackedSkipWriter extends MultiLevelSkipListWriter {
  private boolean DEBUG = BlockPackedPostingsReader.DEBUG;

  private int[] lastSkipDoc;
  private long[] lastSkipDocPointer;
  private long[] lastSkipPosPointer;
  private long[] lastSkipPayPointer;
  private int[] lastEndOffset;
  private int[] lastPayloadByteUpto;

  private final IndexOutput docOut;
  private final IndexOutput posOut;
  private final IndexOutput payOut;

  private int curDoc;
  private long curDocPointer;
  private long curPosPointer;
  private long curPayPointer;
  private int curPosBufferUpto;
  private int curEndOffset;
  private int curPayloadByteUpto;
  private boolean fieldHasPositions;
  private boolean fieldHasOffsets;
  private boolean fieldHasPayloads;

  public BlockPackedSkipWriter(int skipInterval, int maxSkipLevels, int docCount, IndexOutput docOut, IndexOutput posOut, IndexOutput payOut) {
    super(skipInterval, maxSkipLevels, docCount);
    this.docOut = docOut;
    this.posOut = posOut;
    this.payOut = payOut;

    lastSkipDoc = new int[maxSkipLevels];
    lastSkipDocPointer = new long[maxSkipLevels];
    if (posOut != null) {
      lastSkipPosPointer = new long[maxSkipLevels];
      if (payOut != null) {
        lastSkipPayPointer = new long[maxSkipLevels];
      }
      lastEndOffset = new int[maxSkipLevels];
      lastPayloadByteUpto = new int[maxSkipLevels];
    }
  }

  public void setField(boolean fieldHasPositions, boolean fieldHasOffsets, boolean fieldHasPayloads) {
    this.fieldHasPositions = fieldHasPositions;
    this.fieldHasOffsets = fieldHasOffsets;
    this.fieldHasPayloads = fieldHasPayloads;
  }

  @Override
  public void resetSkip() {
    super.resetSkip();
    Arrays.fill(lastSkipDoc, 0);
    Arrays.fill(lastSkipDocPointer, docOut.getFilePointer());
    if (fieldHasPositions) {
      Arrays.fill(lastSkipPosPointer, posOut.getFilePointer());
      if (fieldHasOffsets) {
        Arrays.fill(lastEndOffset, 0);
      }
      if (fieldHasPayloads) {
        Arrays.fill(lastPayloadByteUpto, 0);
      }
      if (fieldHasOffsets || fieldHasPayloads) {
        Arrays.fill(lastSkipPayPointer, payOut.getFilePointer());
      }
    }
  }

  /**
   * Sets the values for the current skip data.
   */
  public void bufferSkip(int doc, int numDocs, long posFP, long payFP, int posBufferUpto, int endOffset, int payloadByteUpto) throws IOException {
    this.curDoc = doc;
    this.curDocPointer = docOut.getFilePointer();
    this.curPosPointer = posFP;
    this.curPayPointer = payFP;
    this.curPosBufferUpto = posBufferUpto;
    this.curPayloadByteUpto = payloadByteUpto;
    this.curEndOffset = endOffset;
    bufferSkip(numDocs);
  }

  @Override
  protected void writeSkipData(int level, IndexOutput skipBuffer) throws IOException {
    int delta = curDoc - lastSkipDoc[level];
    if (DEBUG) {
      System.out.println("writeSkipData level=" + level + " lastDoc=" + curDoc + " delta=" + delta + " curDocPointer=" + curDocPointer);
    }
    skipBuffer.writeVInt(delta);
    lastSkipDoc[level] = curDoc;

    skipBuffer.writeVInt((int) (curDocPointer - lastSkipDocPointer[level]));
    lastSkipDocPointer[level] = curDocPointer;

    if (fieldHasPositions) {
      if (DEBUG) {
        System.out.println(" curPosPointer=" + curPosPointer + " curPosBufferUpto=" + curPosBufferUpto);
      }
      skipBuffer.writeVInt((int) (curPosPointer - lastSkipPosPointer[level]));
      lastSkipPosPointer[level] = curPosPointer;
      skipBuffer.writeVInt(curPosBufferUpto);

      if (fieldHasPayloads) {
        skipBuffer.writeVInt(curPayloadByteUpto);
      }

      if (fieldHasOffsets) {
        skipBuffer.writeVInt(curEndOffset - lastEndOffset[level]);
        lastEndOffset[level] = curEndOffset;
      }

      if (fieldHasOffsets || fieldHasPayloads) {
        skipBuffer.writeVInt((int) (curPayPointer - lastSkipPayPointer[level]));
        lastSkipPayPointer[level] = curPayPointer;
      }
    }
  }
}
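A minimal standalone sketch (not part of this commit) of the skip-entry delta scheme used by writeSkipData above and readSkipData in the skip reader: each level stores the doc and file-pointer values as deltas against the previous entry on the same level, so the reader reconstructs absolutes by accumulation (docPointer[level] += readVInt()). The doc IDs and file pointers here are hypothetical.

public class SkipDeltaDemo {
  public static void main(String[] args) {
    int[] skipDocs = {127, 255, 383};     // last doc of each 128-doc block (illustrative)
    long[] docFPs  = {40L, 95L, 151L};    // hypothetical .doc file pointers at each block end

    // writer side: emit deltas against the previous entry on this level
    int lastDoc = 0;
    long lastFP = 0;
    int[] docDeltas = new int[skipDocs.length];
    long[] fpDeltas = new long[skipDocs.length];
    for (int i = 0; i < skipDocs.length; i++) {
      docDeltas[i] = skipDocs[i] - lastDoc;  lastDoc = skipDocs[i];
      fpDeltas[i]  = docFPs[i]   - lastFP;   lastFP  = docFPs[i];
    }

    // reader side: accumulate deltas back into absolute values
    int doc = 0;
    long fp = 0;
    for (int i = 0; i < docDeltas.length; i++) {
      doc += docDeltas[i];
      fp  += fpDeltas[i];
      System.out.println("skip entry " + i + ": doc=" + doc + " docFP=" + fp);
    }
  }
}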
@@ -0,0 +1,130 @@
package org.apache.lucene.codecs.blockpacked;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.nio.LongBuffer;
import java.nio.IntBuffer;
import java.nio.ByteBuffer;
import java.util.Arrays;

import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedInts.Reader;
import org.apache.lucene.util.packed.PackedInts.Writer;
import org.apache.lucene.util.packed.PackedInts.Mutable;
import org.apache.lucene.util.packed.PackedInts.Encoder;
import org.apache.lucene.util.packed.PackedInts.Decoder;

/**
 * Encode all values in normal area with fixed bit width,
 * which is determined by the max value in this block.
 */
public class ForUtil {
  protected static final int[] MASK = { 0x00000000,
    0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f,
    0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
    0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff, 0x0003ffff,
    0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
    0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff,
    0x7fffffff, 0xffffffff};

  /** Compress given int[] into output stream, with For format
   */
  public static int compress(final LongBuffer data, LongBuffer packed) throws IOException {
    int numBits=getNumBits(data.array());

    if (numBits == 0) { // when all values in the block are equal, save the value once
      packed.put(0, data.get(0)<<32); // java uses big endian for LongBuffer impl
      return (getHeader(1,numBits));
    }

    PackedInts.Format format = PackedInts.fastestFormatAndBits(128, numBits, PackedInts.FASTEST).format;
    PackedInts.Encoder encoder = PackedInts.getEncoder(format, PackedInts.VERSION_CURRENT, numBits);
    int perIter = encoder.values();
    int iters = 128/perIter;
    int nblocks = encoder.blocks()*iters;
    assert 128 % perIter == 0;

    packed.rewind();
    data.rewind();

    encoder.encode(data, packed, iters);

    int encodedSize = nblocks*2;
    return getHeader(encodedSize,numBits);
  }

  /** Decompress given output stream into int array.
   */
  public static void decompress(LongBuffer data, LongBuffer packed, int header) throws IOException {
    // nocommit assert header isn't "malformed", ie besides
    // numBytes / bit-width there is nothing else!

    packed.rewind();
    data.rewind();
    int numBits = ((header >> 8) & MASK[6]);

    if (numBits == 0) {
      Arrays.fill(data.array(), (int)(packed.get(0)>>>32));
      return;
    }

    PackedInts.Format format = PackedInts.fastestFormatAndBits(128, numBits, PackedInts.FASTEST).format;
    PackedInts.Decoder decoder = PackedInts.getDecoder(format, PackedInts.VERSION_CURRENT, numBits);
    int perIter = decoder.values();
    int iters = 128/perIter;
    int nblocks = decoder.blocks()*iters;
    assert 128 % perIter == 0;

    decoder.decode(packed, data, iters);
  }

  static int getNumBits(final long[] data) {
    if (isAllEqual(data)) {
      return 0;
    }
    int size=data.length;
    int optBits=1;
    for (int i=0; i<size; ++i) {
      while ((data[i] & ~MASK[optBits]) != 0) {
        optBits++;
      }
    }
    return optBits;
  }

  protected static boolean isAllEqual(final long[] data) {
    int len = data.length;
    long v = data[0];
    for (int i=1; i<len; i++) {
      if (data[i] != v) {
        return false;
      }
    }
    return true;
  }
  static int getHeader(int encodedSize, int numBits) {
    return (encodedSize)
      | ((numBits) << 8);
  }
  public static int getEncodedSize(int header) {
    return ((header & MASK[8]))*4;
  }
  public static int getNumBits(int header) {
    return ((header >> 8) & MASK[6]);
  }
}
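A standalone sketch (not part of this commit) of ForUtil's block header layout above: the low 8 bits carry the encoded size in 4-byte units (hence the *4 in getEncodedSize), bits 8..13 carry the per-value bit width, and a bit width of 0 flags an all-equal block whose single value is stored once. The concrete numbers are illustrative.

public class ForHeaderDemo {
  public static void main(String[] args) {
    int encodedSize = 22;  // e.g. 11 longs written as 22 int-sized (4-byte) units
    int numBits = 11;      // bit width chosen from the block's max value
    int header = encodedSize | (numBits << 8);  // same packing as ForUtil.getHeader
    System.out.println((header & 0xFF) * 4);    // getEncodedSize: 88 bytes to read
    System.out.println((header >> 8) & 0x3F);   // getNumBits: 11
  }
}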
@@ -1,128 +0,0 @@
package org.apache.lucene.codecs.pfor;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.IntBuffer;

import org.apache.lucene.util.IOUtils;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.codecs.sep.IntStreamFactory;
import org.apache.lucene.codecs.sep.IntIndexInput;
import org.apache.lucene.codecs.sep.IntIndexOutput;
import org.apache.lucene.codecs.intblock.FixedIntBlockIndexInput;
import org.apache.lucene.codecs.intblock.FixedIntBlockIndexOutput;

/**
 * Used to plug to PostingsReader/WriterBase.
 * Encoder and decoder in lower layers are called by
 * flushBlock() and readBlock()
 */

public final class ForFactory extends IntStreamFactory {

  public ForFactory() {
  }

  @Override
  public IntIndexOutput createOutput(Directory dir, String fileName, IOContext context) throws IOException {
    boolean success = false;
    IndexOutput out = dir.createOutput(fileName, context);
    try {
      IntIndexOutput ret = new ForIndexOutput(out);
      success = true;
      return ret;
    } finally {
      if (!success) {
        // For some cases (e.g. disk full), the IntIndexOutput may not be
        // properly created. So we should close those opened files.
        IOUtils.closeWhileHandlingException(out);
      }
    }
  }

  @Override
  public IntIndexInput openInput(Directory dir, String fileName, IOContext context) throws IOException {
    return new ForIndexInput(dir.openInput(fileName, context));
  }

  /**
   * Here we'll hold both input buffer and output buffer for
   * encoder/decoder.
   */
  private class ForIndexInput extends FixedIntBlockIndexInput {

    ForIndexInput(final IndexInput in) throws IOException {
      super(in);
    }

    class ForBlockReader implements FixedIntBlockIndexInput.BlockReader {
      private final byte[] encoded;
      private final int[] buffer;
      private final IndexInput in;
      private final IntBuffer encodedBuffer;

      ForBlockReader(final IndexInput in, final int[] buffer) {
        // upperbound for encoded value should include(here header is not buffered):
        // blockSize of normal value when numFrameBits=32(4x bytes);
        this.encoded = new byte[ForPostingsFormat.DEFAULT_BLOCK_SIZE*4];
        this.in = in;
        this.buffer = buffer;
        this.encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer();
      }

      // TODO: implement public void skipBlock() {} ?
      @Override
      public void readBlock() throws IOException {
        final int header = in.readInt();
        final int numBytes = ForUtil.getEncodedSize(header);
        assert numBytes <= ForPostingsFormat.DEFAULT_BLOCK_SIZE*4;
        in.readBytes(encoded,0,numBytes);
        ForUtil.decompress(encodedBuffer,buffer,header);
      }
    }

    @Override
    protected BlockReader getBlockReader(final IndexInput in, final int[] buffer) throws IOException {
      return new ForBlockReader(in,buffer);
    }
  }

  private class ForIndexOutput extends FixedIntBlockIndexOutput {
    private final byte[] encoded;
    private final IntBuffer encodedBuffer;

    ForIndexOutput(IndexOutput out) throws IOException {
      super(out,ForPostingsFormat.DEFAULT_BLOCK_SIZE);
      this.encoded = new byte[ForPostingsFormat.DEFAULT_BLOCK_SIZE*4];
      this.encodedBuffer=ByteBuffer.wrap(encoded).asIntBuffer();
    }

    @Override
    protected void flushBlock() throws IOException {
      final int header = ForUtil.compress(buffer,encodedBuffer);
      final int numBytes = ForUtil.getEncodedSize(header);
      // nocommit writeVInt instead?
      out.writeInt(header);
      out.writeBytes(encoded, numBytes);
    }
  }
}
@@ -1,129 +0,0 @@
package org.apache.lucene.codecs.pfor;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.IntBuffer;

import org.apache.lucene.util.IOUtils;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.codecs.sep.IntStreamFactory;
import org.apache.lucene.codecs.sep.IntIndexInput;
import org.apache.lucene.codecs.sep.IntIndexOutput;
import org.apache.lucene.codecs.intblock.FixedIntBlockIndexInput;
import org.apache.lucene.codecs.intblock.FixedIntBlockIndexOutput;

/**
 * Used to plug to PostingsReader/WriterBase.
 * Encoder and decoder in lower layers are called by
 * flushBlock() and readBlock()
 */

public final class PForFactory extends IntStreamFactory {

  public PForFactory() {
  }

  @Override
  public IntIndexOutput createOutput(Directory dir, String fileName, IOContext context) throws IOException {
    boolean success = false;
    IndexOutput out = dir.createOutput(fileName, context);
    try {
      IntIndexOutput ret = new PForIndexOutput(out);
      success = true;
      return ret;
    } finally {
      if (!success) {
        // For some cases (e.g. disk full), the IntIndexOutput may not be
        // properly created. So we should close those opened files.
        IOUtils.closeWhileHandlingException(out);
      }
    }
  }

  @Override
  public IntIndexInput openInput(Directory dir, String fileName, IOContext context) throws IOException {
    return new PForIndexInput(dir.openInput(fileName, context));
  }

  /**
   * Here we'll hold both input buffer and output buffer for
   * encoder/decoder.
   */
  private class PForIndexInput extends FixedIntBlockIndexInput {

    PForIndexInput(final IndexInput in) throws IOException {
      super(in);
    }

    class PForBlockReader implements FixedIntBlockIndexInput.BlockReader {
      private final byte[] encoded;
      private final int[] buffer;
      private final IndexInput in;
      private final IntBuffer encodedBuffer;

      PForBlockReader(final IndexInput in, final int[] buffer) {
        // upperbound for encoded value should include(here header is not buffered):
        // 1. blockSize of normal value (4x bytes);
        // 2. blockSize of exception value (4x bytes);
        this.encoded = new byte[PForPostingsFormat.DEFAULT_BLOCK_SIZE*8];
        this.in = in;
        this.buffer = buffer;
        this.encodedBuffer = ByteBuffer.wrap(encoded).asIntBuffer();
      }

      // TODO: implement public void skipBlock() {} ?
      @Override
      public void readBlock() throws IOException {
        final int header = in.readInt();
        final int numBytes = PForUtil.getEncodedSize(header);
        assert numBytes <= PForPostingsFormat.DEFAULT_BLOCK_SIZE*8;
        in.readBytes(encoded,0,numBytes);
        PForUtil.decompress(encodedBuffer,buffer,header);
      }
    }

    @Override
    protected BlockReader getBlockReader(final IndexInput in, final int[] buffer) throws IOException {
      return new PForBlockReader(in,buffer);
    }
  }

  private class PForIndexOutput extends FixedIntBlockIndexOutput {
    private final byte[] encoded;
    private final IntBuffer encodedBuffer;

    PForIndexOutput(IndexOutput out) throws IOException {
      super(out, PForPostingsFormat.DEFAULT_BLOCK_SIZE);
      this.encoded = new byte[PForPostingsFormat.DEFAULT_BLOCK_SIZE*8];
      this.encodedBuffer=ByteBuffer.wrap(encoded).asIntBuffer();
    }

    @Override
    protected void flushBlock() throws IOException {
      final int header = PForUtil.compress(buffer,encodedBuffer);
      final int numBytes = PForUtil.getEncodedSize(header);
      // nocommit writeVInt instead?
      out.writeInt(header);
      out.writeBytes(encoded, numBytes);
    }
  }
}
@@ -1,115 +0,0 @@
package org.apache.lucene.codecs.pfor;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Set;

import org.apache.lucene.codecs.BlockTreeTermsReader;
import org.apache.lucene.codecs.BlockTreeTermsWriter;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.FixedGapTermsIndexReader;
import org.apache.lucene.codecs.FixedGapTermsIndexWriter;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.TermsIndexReaderBase;
import org.apache.lucene.codecs.TermsIndexWriterBase;
import org.apache.lucene.codecs.sep.SepPostingsReader;
import org.apache.lucene.codecs.sep.SepPostingsWriter;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;

/**
 * Pass PForFactory to a PostingsWriter/ReaderBase, and get
 * customized postings format plugged.
 */
public final class PForPostingsFormat extends PostingsFormat {
  private final int minBlockSize;
  private final int maxBlockSize;
  public final static int DEFAULT_BLOCK_SIZE = 128;

  public PForPostingsFormat() {
    super("PFor");
    this.minBlockSize = BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE;
    this.maxBlockSize = BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE;
  }
  public PForPostingsFormat(int minBlockSize, int maxBlockSize) {
    super("PFor");
    this.minBlockSize = minBlockSize;
    assert minBlockSize > 1;
    this.maxBlockSize = maxBlockSize;
    assert minBlockSize <= maxBlockSize;
  }

  @Override
  public String toString() {
    return getName() + "(blocksize=" + DEFAULT_BLOCK_SIZE+ ")";
  }

  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    // TODO: implement a new PostingsWriterBase to improve skip-settings
    PostingsWriterBase postingsWriter = new SepPostingsWriter(state, new PForFactory());
    boolean success = false;
    try {
      FieldsConsumer ret = new BlockTreeTermsWriter(state,
                                                    postingsWriter,
                                                    minBlockSize,
                                                    maxBlockSize);
      success = true;
      return ret;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(postingsWriter);
      }
    }
  }

  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
    PostingsReaderBase postingsReader = new SepPostingsReader(state.dir,
                                                              state.fieldInfos,
                                                              state.segmentInfo,
                                                              state.context,
                                                              new PForFactory(),
                                                              state.segmentSuffix);
|
||||
|
||||
boolean success = false;
|
||||
try {
|
||||
FieldsProducer ret = new BlockTreeTermsReader(state.dir,
|
||||
state.fieldInfos,
|
||||
state.segmentInfo.name,
|
||||
postingsReader,
|
||||
state.context,
|
||||
state.segmentSuffix,
|
||||
state.termsIndexDivisor);
|
||||
success = true;
|
||||
return ret;
|
||||
} finally {
|
||||
if (!success) {
|
||||
IOUtils.closeWhileHandlingException(postingsReader);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
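
Both fieldsConsumer and fieldsProducer above use the same resource-safety idiom: construct the wrapper, set a success flag only on the last line of the try block, and close the underlying resource on any failure path so nothing leaks. A self-contained sketch of the idiom (openWrapped/wrap are hypothetical stand-ins for the constructors above):

import java.io.Closeable;
import java.io.IOException;

import org.apache.lucene.util.IOUtils;

public class SuccessFlagSketch {
  // 'wrap' plays the role of the BlockTreeTermsWriter/BlockTreeTermsReader
  // constructors above, which may throw mid-construction (disk full, corrupt data).
  static Closeable openWrapped(Closeable resource) throws IOException {
    boolean success = false;
    try {
      Closeable ret = wrap(resource);
      success = true;              // reached only once 'ret' fully owns the resource
      return ret;
    } finally {
      if (!success) {
        // failure path: close the half-opened resource, swallowing any
        // secondary exception so the original one propagates
        IOUtils.closeWhileHandlingException(resource);
      }
    }
  }

  static Closeable wrap(Closeable resource) throws IOException {
    return resource;               // placeholder for the real wrapper
  }
}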

@ -1,343 +0,0 @@

package org.apache.lucene.codecs.pfor;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.nio.IntBuffer;
import java.nio.ByteBuffer;
import java.util.Arrays;

/**
 * Encodes all small values and exception pointers in the normal area,
 * and large values in the exception area.
 * The size per exception is variable: 1, 2, or 4 bytes.
 */
public final class PForUtil extends ForUtil {

  protected static final int[] PER_EXCEPTION_SIZE = {1,2,4};

  /** Compress the given int[] into the integer buffer, in PFor format.
   *
   * @param data uncompressed data
   * @param intBuffer integer buffer to hold compressed data
   * @return block header
   */
  public static int compress(final int[] data, IntBuffer intBuffer) {
    // estimate the minimum compressed size to determine numFrameBits
    int numBits=getNumBits(data);
    if (numBits == 0) {
      return compressDuplicateBlock(data,intBuffer);
    }

    int size = data.length;
    int[] excValues = new int[size];
    int excNum = 0, excLastPos = -1, excFirstPos = -1, excLastNonForcePos = -1;

    // number of exceptions until the last non-forced exception
    int excNumBase = 0;

    // bytes per exception
    int excBytes = 1;

    // bytes before the exception area, i.e. header and normal area
    int excByteOffset = 0;

    // the max value possible for the current exception pointer;
    // the value of the first pointer is limited by the header to 254
    // (the first exception ranges from -1 to 254)
    long maxChainFirst = 254;
    long maxChain = maxChainFirst + 1;

    boolean conValue, conForce, conEnd;
    int i=0;

    // estimate exceptions
    for (i=0; i<size; ++i) {
      conValue = ((data[i] & MASK[numBits]) != data[i]); // value exception
      conForce = (i >= maxChain + excLastPos);           // forced exception
      if (conValue || conForce) {
        excValues[excNum++] = data[i];
        if (excLastPos == -1) {
          maxChain = 1L<<numBits;
          excFirstPos = i;
        }
        if (conValue) {
          excLastNonForcePos = i;
          excNumBase = excNum;
        }
        excLastPos = i;
      }
    }

    // encode the normal area, recording exception positions
    excNum = 0;
    if (excFirstPos < 0) { // no exception
      for (i=0; i<size; ++i) {
        encodeNormalValue(intBuffer,i,data[i], numBits);
      }
      excLastPos = -1;
    } else {
      for (i=0; i<excFirstPos; ++i) {
        encodeNormalValue(intBuffer,i,data[i], numBits);
      }
      maxChain = 1L<<numBits;
      excLastPos = excFirstPos;
      excNum = i<size? 1:0;
      for (i=excFirstPos+1; i<size; ++i) {
        conValue = ((data[i] & MASK[numBits]) != data[i]); // value exception
        conForce = (i >= maxChain + excLastPos);           // forced exception
        conEnd = (excNum == excNumBase);                   // trailing forced ones are ignored
        if ((!conValue && !conForce) || conEnd) {
          encodeNormalValue(intBuffer,i,data[i], numBits);
        } else {
          encodeNormalValue(intBuffer, excLastPos, i-excLastPos-1, numBits);
          excNum++;
          excLastPos = i;
        }
      }
    }

    // encode the exception area
    for (i=0; i<excNum; ++i) {
      if (excBytes < 2 && (excValues[i] & ~MASK[8]) != 0) {
        excBytes=2;
      }
      if (excBytes < 4 && (excValues[i] & ~MASK[16]) != 0) {
        excBytes=4;
      }
    }
    excByteOffset = (size*numBits + 7)/8;
    encodeExcValues(intBuffer, excValues, excNum, excBytes, excByteOffset);

    // encode the header
    int encodedSize = (excByteOffset + excBytes*excNum + 3)/4;

    return getHeader(encodedSize, numBits, excNum, excFirstPos, excBytes);
  }

  /** Decompress the given integer buffer into an int array.
   *
   * @param intBuffer integer buffer holding the compressed data
   * @param data int array to hold the uncompressed data
   */
  public static void decompress(IntBuffer intBuffer, int[] data, int header) {
    // since this buffer is reused at the upper level, rewind first
    intBuffer.rewind();

    int excNum = ((header >> 8) & MASK[8]) + 1;
    int excFirstPos = ((header >> 16) & MASK[8]) - 1;
    int excBytes = PER_EXCEPTION_SIZE[(header >> 30) & MASK[2]];
    int numBits = ((header >> 24) & MASK[6]);

    decompressCore(intBuffer, data, numBits);

    patchException(intBuffer,data,excNum,excFirstPos,excBytes);
  }

  /**
   * Encode exception values into the exception area.
   * The width of each exception is fixed at 1, 2, or 4 byte(s).
   */
  static void encodeExcValues(IntBuffer intBuffer, int[] values, int num, int perbytes, int byteOffset) {
    if (num == 0)
      return;
    if (perbytes == 1) {
      int curBytePos = byteOffset;
      for (int i=0; i<num; ++i) {
        int curIntPos = curBytePos / 4;
        setBufferIntBits(intBuffer, curIntPos, (curBytePos & 3)*8, 8, values[i]);
        curBytePos++;
      }
    } else if (perbytes == 2) {
      int shortOffset = (byteOffset+1)/2;
      int curIntPos = shortOffset/2;
      int i=0;
      if ((shortOffset & 1) == 1) { // cut the head so the remainder fits whole ints
        setBufferIntBits(intBuffer, curIntPos++, 16, 16, values[i++]);
      }
      for (; i<num-1; i+=2) {
        intBuffer.put(curIntPos++, (values[i+1]<<16) | values[i]);
      }
      if (i<num) {
        intBuffer.put(curIntPos, values[i]); // cut the tail, also clearing the high 16 bits
      }
    } else if (perbytes == 4) {
      int curIntPos = (byteOffset+3) / 4;
      for (int i=0; i<num; ++i) {
        intBuffer.put(curIntPos++, values[i]);
      }
    }
  }

  /**
   * Save only the header plus a single value when all values of the block are equal.
   */
  static int compressDuplicateBlock(final int[] data, IntBuffer intBuffer) {
    intBuffer.put(0,data[0]);
    return getHeader(1, 0, 0, -1, 0);
  }

  /**
   * Decode exception values based on the exception pointers in the normal area
   * and the values in the exception area.
   * In the current implementation numInts is hardwired to 128, so the
   * tail of the normal area is naturally aligned to 32 bits and we don't need
   * to rewind intBuffer here.
   * However, the normal area may share an int with the exception area
   * when numFrameBits * numInts % 32 != 0; in that case several heading
   * exceptions must be patched in a preprocessing step before calling this method.
   */
  public static void patchException(IntBuffer intBuffer, int[] data, int excNum, int excFirstPos, int excBytes) {
    if (excFirstPos == -1) {
      return;
    }
    int curPos=excFirstPos;
    int i,j;

    if (excBytes == 1) { // each exception consumes 1 byte
      for (i=0; i+3<excNum; i+=4) {
        final int curInt = intBuffer.get();
        curPos = patch(data, curPos, (curInt) & MASK[8]);
        curPos = patch(data, curPos, (curInt >>> 8) & MASK[8]);
        curPos = patch(data, curPos, (curInt >>> 16) & MASK[8]);
        curPos = patch(data, curPos, (curInt >>> 24) & MASK[8]);
      }
      if (i<excNum) {
        final int curInt = intBuffer.get();
        for (j=0; j<32 && i<excNum; j+=8,i++) {
          curPos = patch(data, curPos, (curInt >>> j) & MASK[8]);
        }
      }
    } else if (excBytes == 2) { // each exception consumes 2 bytes
      for (i=0; i+1<excNum; i+=2) {
        final int curInt = intBuffer.get();
        curPos = patch(data, curPos, (curInt) & MASK[16]);
        curPos = patch(data, curPos, (curInt >>> 16) & MASK[16]);
      }
      if (i<excNum) {
        final int curInt = intBuffer.get();
        curPos = patch(data, curPos, (curInt) & MASK[16]);
      }
    } else if (excBytes == 4) { // each exception consumes 4 bytes
      for (i=0; i<excNum; i++) {
        curPos = patch(data, curPos, intBuffer.get());
      }
    }
  }
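
  // How the chain above works: before patching, each exception slot in the
  // normal area holds the distance to the next exception minus one, and
  // patch() overwrites it with the real value while following the chain.
  // A tiny worked example (hypothetical): exceptions at positions 3 and 7,
  // so excFirstPos = 3 and data[3] holds 3 (7 - 3 - 1):
  //   patch(data, 3, bigValue0)  -> data[3] = bigValue0, returns 3 + 3 + 1 = 7
  //   patch(data, 7, bigValue1)  -> data[7] = bigValue1, chain ends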

  static int patch(int[]data, int pos, int value) {
    int nextPos = data[pos] + pos + 1;
    data[pos] = value;
    assert nextPos > pos;
    return nextPos;
  }

  /**
   * Estimate the best number of frame bits according to the minimum compressed size.
   * It will run 32 times.
   */
  static int getNumBits(final int[] data) {
    if (isAllEqual(data)) {
      return 0;
    }
    int optBits=1;
    int optSize=estimateCompressedSize(data,optBits);
    for (int i=2; i<=32; ++i) {
      int curSize=estimateCompressedSize(data,i);
      if (curSize<optSize) {
        optSize=curSize;
        optBits=i;
      }
    }
    return optBits;
  }

  /**
   * Iterate the whole block to get the maximum exception bits,
   * and estimate the compressed size without forced exceptions.
   * TODO: foresee forced exceptions for a better estimate
   */
  static int estimateCompressedSize(final int[] data, int numBits) {
    int size=data.length;
    int totalBytes=(numBits*size+7)/8; // always round up to whole bytes
    int excNum=0;
    int curExcBytes=1;
    for (int i=0; i<size; ++i) {
      if ((data[i] & ~MASK[numBits]) != 0) { // exception
        excNum++;
        if (curExcBytes<2 && (data[i] & ~MASK[8]) != 0) { // exceeds a 1-byte exception
          curExcBytes=2;
        }
        if (curExcBytes<4 && (data[i] & ~MASK[16]) != 0) { // exceeds a 2-byte exception
          curExcBytes=4;
        }
      }
    }
    if (curExcBytes==2) {
      totalBytes=((totalBytes+1)/2)*2; // round up to 2x bytes before filling exceptions
    } else if (curExcBytes==4) {
      totalBytes=((totalBytes+3)/4)*4; // round up to 4x bytes
    }
    totalBytes+=excNum*curExcBytes;

    return totalBytes/4*4; // truncate down to whole ints
  }
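
  // Worked example (hypothetical block): size = 128 and numBits = 5 give a
  // 128*5/8 = 80 byte normal area; with 3 exceptions needing 2 bytes each,
  // 80 is already 2-byte aligned, so totalBytes = 80 + 3*2 = 86, and the
  // method returns 86/4*4 = 84 bytes.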

  /**
   * Generate the 4-byte header, which contains (from lsb to msb):
   *
   * 8 bits for the encoded block size in ints (excluding the header; this limits DEFAULT_BLOCK_SIZE <= 2^(8-1))
   *
   * 8 bits for the exception count - 1 (undefined when there are no exceptions)
   *
   * 8 bits for the index of the first exception + 1 (0 when there is no exception)
   *
   * 6 bits for the number of frame bits (when 0, all values in this block are the same)
   * 2 bits for the exception code: 00: byte, 01: short, 10: int
   */
  static int getHeader(int encodedSize, int numBits, int excNum, int excFirstPos, int excBytes) {
    return (encodedSize)
         | (((excNum-1) & MASK[8]) << 8)
         | ((excFirstPos+1) << 16)
         | ((numBits) << 24)
         | ((excBytes/2) << 30);
  }
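
  // Worked example (hypothetical values): getHeader(32, 5, 3, 0, 2), i.e. a
  // 32-int encoded block, 5 frame bits, 3 exceptions of 2 bytes starting at
  // position 0, packs as
  //   0x20 | (2 << 8) | (1 << 16) | (5 << 24) | (1 << 30) = 0x45010220
  // and getEncodedSize(0x45010220) below recovers 32*4 = 128 payload bytes.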

  /**
   * Expert: get metadata from the header.
   */
  public static int getEncodedSize(int header) {
    return ((header & MASK[8]))*4;
  }
  public static int getExcNum(int header) {
    return ((header >> 8) & MASK[8]) + 1;
  }
  public static int getFirstPos(int header) {
    return ((header >> 16) & MASK[8]) - 1;
  }
  public static int getExcBytes(int header) {
    return PER_EXCEPTION_SIZE[(header >> 30) & MASK[2]];
  }
  public static int getNumBits(int header) {
    return ((header >> 24) & MASK[6]);
  }
}
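
For reference, a minimal round trip through the utility above, mirroring what TestPForUtil's tryCompressAndDecompress (below) does; this is a sketch against the pre-removal pfor package, with buffer sizing following the blockSize*8 upper bound used by PForIndexOutput:

import java.nio.ByteBuffer;
import java.nio.IntBuffer;
import java.util.Arrays;

import org.apache.lucene.codecs.pfor.PForUtil;

public class PForRoundTrip {
  public static void main(String[] args) {
    final int blockSize = 128;                 // PForPostingsFormat.DEFAULT_BLOCK_SIZE
    int[] data = new int[blockSize];
    Arrays.fill(data, 3);                      // mostly small values...
    data[10] = 1 << 20;                        // ...plus one outlier -> one exception

    byte[] encoded = new byte[blockSize * 8];  // loose upper bound, as in PForIndexOutput
    IntBuffer buf = ByteBuffer.wrap(encoded).asIntBuffer();

    int header = PForUtil.compress(data, buf); // header carries size/exception metadata
    int numBytes = PForUtil.getEncodedSize(header);

    int[] restored = new int[blockSize];
    PForUtil.decompress(buf, restored, header); // rewinds buf itself, then patches

    System.out.println("128 ints -> " + numBytes + " bytes, round-trips: "
                       + Arrays.equals(data, restored));
  }
}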

@ -17,8 +17,6 @@ org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat

org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat
org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat
org.apache.lucene.codecs.memory.MemoryPostingsFormat
org.apache.lucene.codecs.pfor.ForPostingsFormat
org.apache.lucene.codecs.pfor.PForPostingsFormat
org.apache.lucene.codecs.bulkvint.BulkVIntPostingsFormat
org.apache.lucene.codecs.block.BlockPostingsFormat
org.apache.lucene.codecs.memory.DirectPostingsFormat
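
The file above is Lucene's META-INF/services registry: PostingsFormat implementations are discovered by name through Java's SPI, so deleting the two pfor entries is what actually unregisters the For/PFor formats. A minimal lookup sketch (assuming the surviving block format registers itself under the name "Block"):

import org.apache.lucene.codecs.PostingsFormat;

public class FormatLookup {
  public static void main(String[] args) {
    // Resolved through the service entries listed above.
    PostingsFormat block = PostingsFormat.forName("Block");
    System.out.println(block);
    // PostingsFormat.forName("PFor") would now fail: its entry is gone.
  }
}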

@ -1,293 +0,0 @@

package org.apache.lucene.codecs.pfor;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.nio.ByteBuffer;
import java.nio.IntBuffer;
import java.util.Arrays;
import java.util.Collections;
import java.util.Locale;
import java.util.Random;

import org.apache.lucene.codecs.pfor.ForPostingsFormat;
import org.apache.lucene.codecs.pfor.PForUtil;
import org.apache.lucene.util.LuceneTestCase;

/**
 * Test the core utility for PFor compression and decompression.
 * We don't provide a separate test case for the For encoder/decoder, since
 * PFor is an extended version of For and most methods are reused here.
 */
public class TestPForUtil extends LuceneTestCase {
  static final int[] MASK={ 0x00000000,
    0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f,
    0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
    0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff, 0x0003ffff,
    0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
    0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff,
    0x7fffffff, 0xffffffff};
  Random gen;
  public void initRandom() {
    this.gen = random();
  }

  /**
   * Should not encode extra information other than a single int
   */
  public void testAllEqual() throws Exception {
    initRandom();
    int sz=ForPostingsFormat.DEFAULT_BLOCK_SIZE;
    int[] data=new int[sz];
    byte[] res = new byte[sz*8];
    int[] copy = new int[sz];
    IntBuffer resBuffer = ByteBuffer.wrap(res).asIntBuffer();
    int ensz;
    int header;

    Arrays.fill(data,gen.nextInt());
    header = ForUtil.compress(data,resBuffer); // test For
    ensz = ForUtil.getEncodedSize(header);
    assert ensz == 4;

    ForUtil.decompress(resBuffer,copy,header);
    assert cmp(data,sz,copy,sz)==true;

    Arrays.fill(data,gen.nextInt());
    header = PForUtil.compress(data,resBuffer); // test PFor
    ensz = PForUtil.getEncodedSize(header);
    assert ensz == 4;

    PForUtil.decompress(resBuffer,copy,header);
    assert cmp(data,sz,copy,sz)==true;
  }

  /**
   * Test correctness of forced exceptions:
   * the forced ones should exactly fit the max chain length.
   */
  public void testForcedExceptionDistance() throws Exception {
    initRandom();
    int sz=ForPostingsFormat.DEFAULT_BLOCK_SIZE;
    int[] data=new int[sz];
    byte[] res = new byte[sz*8];
    int[] copy = new int[sz];
    IntBuffer resBuffer = ByteBuffer.wrap(res).asIntBuffer();
    int numBits = gen.nextInt(5)+1;

    int i,j;
    int pace, ensz, header;
    int expect, got;

    // fill exception values at a constant pace; there should
    // be no forced exceptions.
    createDistribution(data, sz, 1, MASK[numBits], MASK[numBits]);
    pace = 1<<numBits;
    for (i=0,j=0; i<sz; i+=pace) {
      int exc = gen.nextInt();
      data[i] = (exc & 0xffff0000) == 0 ? exc | 0xffff0000 : exc;
      j++;
    }
    header = PForUtil.compress(data,resBuffer);
    ensz = PForUtil.getEncodedSize(header);
    expect = j;
    got = PForUtil.getExcNum(header);
    assert expect == got: expect+" expected but got "+got;

    // there should be exactly one forced exception before each
    // value exception when i>0
    createDistribution(data, sz, 1, MASK[numBits], MASK[numBits]);
    pace = (1<<numBits)+1;
    for (i=0,j=0; i<sz; i+=pace) {
      int exc = gen.nextInt();
      data[i] = (exc & 0xffff0000) == 0 ? exc | 0xffff0000 : exc;
      j++;
    }
    header = PForUtil.compress(data,resBuffer);
    ensz = PForUtil.getEncodedSize(header);
    expect = 2*(j-1)+1;
    got = PForUtil.getExcNum(header);
    assert expect == got: expect+" expected but got "+got;

    // two forced exceptions per gap
    createDistribution(data, sz, 1, MASK[numBits], MASK[numBits]);
    pace = (1<<numBits)*2+1;
    for (i=0,j=0; i<sz; i+=pace) {
      int exc = gen.nextInt();
      data[i] = (exc & 0xffff0000) == 0 ? exc | 0xffff0000 : exc;
      j++;
    }
    header = PForUtil.compress(data,resBuffer);
    ensz = PForUtil.getEncodedSize(header);
    expect = 3*(j-1)+1;
    got = PForUtil.getExcNum(header);
    assert expect == got: expect+" expected but got "+got;
  }
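
  // Why the expected counts above hold: the pointer stored per exception is
  // capped, so two exceptions can be at most 1<<numBits apart. With
  // pace = (1<<numBits)+1 each gap needs exactly one forced exception, and
  // with pace = (1<<numBits)*2+1 it needs two; j value exceptions have j-1
  // gaps, giving j + k*(j-1) total exceptions for k forced per gap,
  // i.e. 2*(j-1)+1 and 3*(j-1)+1 respectively.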

  /**
   * Test correctness of ignored forced exceptions.
   * The trailing forced exceptions should always be reverted,
   * since they're not necessary.
   */
  public void testTrailingForcedException() throws Exception {
    initRandom();
    int sz=ForPostingsFormat.DEFAULT_BLOCK_SIZE;
    assert sz % 32 == 0;
    Integer[] buff= new Integer[sz];
    int[] data = new int[sz];
    int[] copy = new int[sz];
    byte[] res = new byte[sz*8];
    IntBuffer resBuffer = ByteBuffer.wrap(res).asIntBuffer();

    int excIndex = gen.nextInt(sz/2);
    int excValue = gen.nextInt();
    if ((excValue & 0xffff0000) == 0) {
      excValue |= 0xffff0000; // always prepare a 4-byte exception
    }

    // keep numFrameBits small,
    // so forced exceptions are easy to trigger
    for (int i=0; i<sz; ++i) {
      buff[i]=gen.nextInt() & 1;
    }
    // create only one value exception
    buff[excIndex]=excValue;

    for (int i=0; i<sz; ++i)
      data[i] = buff[i];

    int header = PForUtil.compress(data,resBuffer);
    int ensz = PForUtil.getEncodedSize(header);

    assert (ensz <= sz*8): ensz+" > "+sz*8; // must not exceed the loose upper bound
    assert (ensz >= 4); // at least we have an exception, right?

    PForUtil.decompress(resBuffer,copy,header);

    // println(getHex(data,sz)+"\n");
    // println(getHex(res,ensz)+"\n");
    // println(getHex(copy,sz)+"\n");

    // fetch the last int, i.e. the last exception.
    int lastExc = (res[ensz-4] << 24) |
                  ((0xff & res[ensz-3]) << 16) |
                  ((0xff & res[ensz-2]) << 8 ) |
                   (0xff & res[ensz-1]);

    // trailing forced exceptions are suppressed,
    // so the last exception should be what we assigned.
    assert lastExc==excValue;
    assert cmp(data,sz,copy,sz)==true;
  }

  /**
   * Test correctness of compressing and decompressing.
   * Here we randomly assign a rate of exceptions (i.e. 1-alpha),
   * and test different scales of normal/exception values.
   */
  public void testAllDistribution() throws Exception {
    initRandom();
    int sz = ForPostingsFormat.DEFAULT_BLOCK_SIZE;
    int[] data = new int[sz];
    for (int i=0; i<=32; ++i) { // try to test every kind of distribution
      double alpha=gen.nextDouble(); // rate of normal values
      for (int j=i; j<=32; ++j) {
        createDistribution(data,sz,alpha,MASK[i],MASK[j]);
        tryCompressAndDecompress(data, sz);
      }
    }
  }
  public void createDistribution(int[] data, int sz, double alpha, int masknorm, int maskexc) {
    Integer[] buff= new Integer[sz];
    int i=0;
    for (; i<sz*alpha; ++i)
      buff[i]=gen.nextInt() & masknorm;
    for (; i<sz; ++i)
      buff[i]=gen.nextInt() & maskexc;
    Collections.shuffle(Arrays.asList(buff),gen);
    for (i=0; i<sz; ++i)
      data[i] = buff[i];
  }
  public void tryCompressAndDecompress(final int[] data, int sz) throws Exception {
    byte[] res = new byte[sz*8]; // loose upper bound
    IntBuffer resBuffer = ByteBuffer.wrap(res).asIntBuffer();

    int header = PForUtil.compress(data,resBuffer);
    int ensz = PForUtil.getEncodedSize(header);

    assert (ensz <= sz*8); // must not exceed the loose upper bound

    int[] copy = new int[sz];
    PForUtil.decompress(resBuffer,copy,header);

    // println(getHex(data,sz)+"\n");
    // println(getHex(res,ensz)+"\n");
    // println(getHex(copy,sz)+"\n");

    assert cmp(data,sz,copy,sz)==true;
  }
  public boolean cmp(int[] a, int sza, int[] b, int szb) {
    if (sza!=szb)
      return false;
    for (int i=0; i<sza; ++i) {
      if (a[i]!=b[i]) {
        System.err.println(String.format(Locale.ENGLISH, "! %08x != %08x in %d",a[i],b[i],i));
        return false;
      }
    }
    return true;
  }
  public static String getHex( byte [] raw, int sz ) {
    final String HEXES = "0123456789ABCDEF";
    if ( raw == null ) {
      return null;
    }
    final StringBuilder hex = new StringBuilder( 2 * raw.length );
    for ( int i=0; i<sz; i++ ) {
      if (i>0 && (i)%16 == 0)
        hex.append("\n");
      byte b=raw[i];
      hex.append(HEXES.charAt((b & 0xF0) >> 4))
         .append(HEXES.charAt((b & 0x0F)))
         .append(" ");
    }
    return hex.toString();
  }
  public static String getHex( int [] raw, int sz ) {
    if ( raw == null ) {
      return null;
    }
    final StringBuilder hex = new StringBuilder( 4 * raw.length );
    for ( int i=0; i<sz; i++ ) {
      if (i>0 && i%8 == 0)
        hex.append("\n");
      hex.append(String.format(Locale.ENGLISH, "%08x ",raw[i]));
    }
    return hex.toString();
  }
  static void eprintln(String format, Object... args) {
    System.err.println(String.format(Locale.ENGLISH, format,args));
  }
  static void println(String format, Object... args) {
    System.out.println(String.format(Locale.ENGLISH, format,args));
  }
  static void print(String format, Object... args) {
    System.out.print(String.format(Locale.ENGLISH, format,args));
  }
}

@ -61,7 +61,6 @@ import org.apache.lucene.store.IndexOutput;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.codecs.pfor.*;

/**
 * Randomly combines terms index impl w/ postings impls.
@ -103,8 +102,6 @@ public class MockRandomPostingsFormat extends PostingsFormat {

      final int baseBlockSize = _TestUtil.nextInt(random, 1, 127);
      delegates.add(new MockVariableIntBlockPostingsFormat.MockIntFactory(baseBlockSize));
      // TODO: others
      delegates.add(new ForFactory());
      delegates.add(new PForFactory());
    }

    private static String getExtension(String fileName) {
@ -282,9 +282,7 @@ public abstract class LuceneTestCase extends Assert {

    "MockFixedIntBlock",
    "MockVariableIntBlock",
    "MockSep",
    "MockRandom",
    "For",
    "PFor"
    "MockRandom"
  ));

  // -----------------------------------------------------------------