LUCENE-3892: add PForPostingsFormat

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/pforcodec_3892@1356531 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2012-07-03 00:43:06 +00:00
parent c272f90605
commit 5b4e1aea06
14 changed files with 594 additions and 55 deletions

View File

@@ -36,7 +36,7 @@ import org.apache.lucene.codecs.intblock.FixedIntBlockIndexOutput;
 * Things really make sense are: flushBlock() and readBlock()
 */
-public class ForFactory extends IntStreamFactory {
+public final class ForFactory extends IntStreamFactory {
 private final int blockSize;
 public ForFactory() {

View File

@@ -42,7 +42,7 @@ import org.apache.lucene.codecs.sep.SepPostingsWriter;
 * to a PostingsWriter/ReaderBase, and get customized
 * format plugged.
 */
-public class ForPostingsFormat extends PostingsFormat {
+public final class ForPostingsFormat extends PostingsFormat {
 private final int blockSize;
 private final int minBlockSize;
 private final int maxBlockSize;

View File

@@ -21,16 +21,15 @@ import java.nio.ByteBuffer;
 import java.util.Arrays;
 // Encode all values in normal area, based on the bit size for max value
-public final class ForUtil {
+public class ForUtil {
 public static final int HEADER_INT_SIZE=1;
-private static final int[] MASK = { 0x00000000,
+protected static final int[] MASK = { 0x00000000,
 0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f,
 0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff, 0x0003ffff,
 0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff,
 0x7fffffff, 0xffffffff};
-private static final int[] PER_EXCEPTION_SIZE = {1,2,4};
 public static int compress(final int[] data, int size, IntBuffer intBuffer) {
 int numBits=getNumBits(data,size);
@@ -51,40 +50,40 @@ public final class ForUtil {
 int numInts = (header & MASK[8]) + 1;
 int numBits = ((header >> 8) & MASK[5]) + 1;
-// TODO: ForDecompressImpl is hardewired to size==128 only
+// TODO: PackedIntsDecompress is hardewired to size==128 only
 switch(numBits) {
-case 1: ForDecompressImpl.decode1(intBuffer, data); break;
-case 2: ForDecompressImpl.decode2(intBuffer, data); break;
-case 3: ForDecompressImpl.decode3(intBuffer, data); break;
-case 4: ForDecompressImpl.decode4(intBuffer, data); break;
-case 5: ForDecompressImpl.decode5(intBuffer, data); break;
-case 6: ForDecompressImpl.decode6(intBuffer, data); break;
-case 7: ForDecompressImpl.decode7(intBuffer, data); break;
-case 8: ForDecompressImpl.decode8(intBuffer, data); break;
-case 9: ForDecompressImpl.decode9(intBuffer, data); break;
-case 10: ForDecompressImpl.decode10(intBuffer, data); break;
-case 11: ForDecompressImpl.decode11(intBuffer, data); break;
-case 12: ForDecompressImpl.decode12(intBuffer, data); break;
-case 13: ForDecompressImpl.decode13(intBuffer, data); break;
-case 14: ForDecompressImpl.decode14(intBuffer, data); break;
-case 15: ForDecompressImpl.decode15(intBuffer, data); break;
-case 16: ForDecompressImpl.decode16(intBuffer, data); break;
-case 17: ForDecompressImpl.decode17(intBuffer, data); break;
-case 18: ForDecompressImpl.decode18(intBuffer, data); break;
-case 19: ForDecompressImpl.decode19(intBuffer, data); break;
-case 20: ForDecompressImpl.decode20(intBuffer, data); break;
-case 21: ForDecompressImpl.decode21(intBuffer, data); break;
-case 22: ForDecompressImpl.decode22(intBuffer, data); break;
-case 23: ForDecompressImpl.decode23(intBuffer, data); break;
-case 24: ForDecompressImpl.decode24(intBuffer, data); break;
-case 25: ForDecompressImpl.decode25(intBuffer, data); break;
-case 26: ForDecompressImpl.decode26(intBuffer, data); break;
-case 27: ForDecompressImpl.decode27(intBuffer, data); break;
-case 28: ForDecompressImpl.decode28(intBuffer, data); break;
-case 29: ForDecompressImpl.decode29(intBuffer, data); break;
-case 30: ForDecompressImpl.decode30(intBuffer, data); break;
-case 31: ForDecompressImpl.decode31(intBuffer, data); break;
-case 32: ForDecompressImpl.decode32(intBuffer, data); break;
+case 1: PackedIntsDecompress.decode1(intBuffer, data); break;
+case 2: PackedIntsDecompress.decode2(intBuffer, data); break;
+case 3: PackedIntsDecompress.decode3(intBuffer, data); break;
+case 4: PackedIntsDecompress.decode4(intBuffer, data); break;
+case 5: PackedIntsDecompress.decode5(intBuffer, data); break;
+case 6: PackedIntsDecompress.decode6(intBuffer, data); break;
+case 7: PackedIntsDecompress.decode7(intBuffer, data); break;
+case 8: PackedIntsDecompress.decode8(intBuffer, data); break;
+case 9: PackedIntsDecompress.decode9(intBuffer, data); break;
+case 10: PackedIntsDecompress.decode10(intBuffer, data); break;
+case 11: PackedIntsDecompress.decode11(intBuffer, data); break;
+case 12: PackedIntsDecompress.decode12(intBuffer, data); break;
+case 13: PackedIntsDecompress.decode13(intBuffer, data); break;
+case 14: PackedIntsDecompress.decode14(intBuffer, data); break;
+case 15: PackedIntsDecompress.decode15(intBuffer, data); break;
+case 16: PackedIntsDecompress.decode16(intBuffer, data); break;
+case 17: PackedIntsDecompress.decode17(intBuffer, data); break;
+case 18: PackedIntsDecompress.decode18(intBuffer, data); break;
+case 19: PackedIntsDecompress.decode19(intBuffer, data); break;
+case 20: PackedIntsDecompress.decode20(intBuffer, data); break;
+case 21: PackedIntsDecompress.decode21(intBuffer, data); break;
+case 22: PackedIntsDecompress.decode22(intBuffer, data); break;
+case 23: PackedIntsDecompress.decode23(intBuffer, data); break;
+case 24: PackedIntsDecompress.decode24(intBuffer, data); break;
+case 25: PackedIntsDecompress.decode25(intBuffer, data); break;
+case 26: PackedIntsDecompress.decode26(intBuffer, data); break;
+case 27: PackedIntsDecompress.decode27(intBuffer, data); break;
+case 28: PackedIntsDecompress.decode28(intBuffer, data); break;
+case 29: PackedIntsDecompress.decode29(intBuffer, data); break;
+case 30: PackedIntsDecompress.decode30(intBuffer, data); break;
+case 31: PackedIntsDecompress.decode31(intBuffer, data); break;
+case 32: PackedIntsDecompress.decode32(intBuffer, data); break;
 default:
 throw new IllegalStateException("Unknown numFrameBits " + numBits);
 }
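For orientation (an illustrative aside, not part of this commit): ForUtil picks one bit width per 128-int block from the largest value it contains, and the decodeN methods above are just unpackers specialized for each width. Below is a minimal, generic sketch of that packing/unpacking idea; the bit order is illustrative and is not claimed to match the exact layout the generated PackedIntsDecompress code uses.

// Illustrative frame-of-reference bit packing: 'size' non-negative ints of
// 'numBits' bits each, LSB-first within 32-bit words.
static int[] pack(int[] values, int size, int numBits) {
  int[] packed = new int[(size * numBits + 31) / 32];
  for (int i = 0; i < size; i++) {
    int bitPos = i * numBits;
    int word = bitPos >>> 5, offset = bitPos & 31;
    packed[word] |= values[i] << offset;
    if (offset + numBits > 32) {               // value straddles a word boundary
      packed[word + 1] |= values[i] >>> (32 - offset);
    }
  }
  return packed;
}

// The matching unpacker; a specialized decodeN simply hard-codes numBits.
static void unpack(int[] packed, int[] values, int size, int numBits) {
  int mask = (numBits == 32) ? -1 : (1 << numBits) - 1;
  for (int i = 0; i < size; i++) {
    int bitPos = i * numBits;
    int word = bitPos >>> 5, offset = bitPos & 31;
    int value = packed[word] >>> offset;
    if (offset + numBits > 32) {               // pull the spilled high bits
      value |= packed[word + 1] << (32 - offset);
    }
    values[i] = value & mask;
  }
}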

View File

@@ -0,0 +1,114 @@
package org.apache.lucene.codecs.pfor;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.IntBuffer;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.codecs.sep.IntStreamFactory;
import org.apache.lucene.codecs.sep.IntIndexInput;
import org.apache.lucene.codecs.sep.IntIndexOutput;
import org.apache.lucene.codecs.intblock.FixedIntBlockIndexInput;
import org.apache.lucene.codecs.intblock.FixedIntBlockIndexOutput;
/**
* Factory for the int-block streams handed to PostingsReader/WriterBase.
* The parts that really matter here are flushBlock() and readBlock().
*/
public final class PForFactory extends IntStreamFactory {
private final int blockSize;
public PForFactory() {
this.blockSize=PForPostingsFormat.DEFAULT_BLOCK_SIZE;
}
@Override
public IntIndexOutput createOutput(Directory dir, String fileName, IOContext context) throws IOException {
IndexOutput out = dir.createOutput(fileName, context);
boolean success = false;
try {
FixedIntBlockIndexOutput ret = new PForIndexOutput(out, blockSize);
success = true;
return ret;
} finally {
if (!success) {
// TODO: why handle exception like this?
// and why not use similar codes for read part?
IOUtils.closeWhileHandlingException(out);
}
}
}
@Override
public IntIndexInput openInput(Directory dir, String fileName, IOContext context) throws IOException {
FixedIntBlockIndexInput ret = new PForIndexInput(dir.openInput(fileName, context));
return ret;
}
// wrap input and output with buffer support
private class PForIndexInput extends FixedIntBlockIndexInput {
PForIndexInput(final IndexInput in) throws IOException {
super(in);
}
class PForBlockReader implements FixedIntBlockIndexInput.BlockReader {
byte[] encoded;
int[] buffer;
IndexInput in;
IntBuffer encodedBuffer;
PForBlockReader(final IndexInput in, final int[] buffer) {
this.encoded = new byte[blockSize*8+4];
this.in=in;
this.buffer=buffer;
this.encodedBuffer=ByteBuffer.wrap(encoded).asIntBuffer();
}
public void seek(long pos) {}
// TODO: implement public void skipBlock() {} ?
public void readBlock() throws IOException {
final int numBytes = in.readInt();
assert numBytes <= blockSize*8+4;
in.readBytes(encoded,0,numBytes);
PForUtil.decompress(encodedBuffer,buffer);
}
}
@Override
protected BlockReader getBlockReader(final IndexInput in, final int[] buffer) throws IOException {
return new PForBlockReader(in,buffer);
}
}
private class PForIndexOutput extends FixedIntBlockIndexOutput {
private byte[] encoded;
private IntBuffer encodedBuffer;
PForIndexOutput(IndexOutput out, int blockSize) throws IOException {
super(out,blockSize);
this.encoded = new byte[blockSize*8+4];
this.encodedBuffer=ByteBuffer.wrap(encoded).asIntBuffer();
}
@Override
protected void flushBlock() throws IOException {
final int numBytes = PForUtil.compress(buffer,buffer.length,encodedBuffer);
out.writeInt(numBytes);
out.writeBytes(encoded, numBytes);
}
}
}
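A detail worth calling out (illustrative, not part of the commit): the reader and writer above never copy between the int view and the byte array. ByteBuffer.wrap(encoded).asIntBuffer() is an int-sized view over the very byte[] that readBytes()/writeBytes() operate on, sized generously at blockSize*8+4 bytes per block. A tiny sketch of that aliasing:

byte[] encoded = new byte[128 * 8 + 4];                  // worst-case block buffer
IntBuffer view = ByteBuffer.wrap(encoded).asIntBuffer(); // int view over the same memory
view.put(0, 0x01020304);                                 // write through the int view...
// ...and the bytes show up in the backing array (big-endian, the ByteBuffer default):
// encoded[0]==0x01, encoded[1]==0x02, encoded[2]==0x03, encoded[3]==0x04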

View File

@@ -0,0 +1,117 @@
package org.apache.lucene.codecs.pfor;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Set;
import java.io.IOException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.BlockTreeTermsWriter;
import org.apache.lucene.codecs.BlockTreeTermsReader;
import org.apache.lucene.codecs.TermsIndexReaderBase;
import org.apache.lucene.codecs.TermsIndexWriterBase;
import org.apache.lucene.codecs.FixedGapTermsIndexReader;
import org.apache.lucene.codecs.FixedGapTermsIndexWriter;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.sep.SepPostingsReader;
import org.apache.lucene.codecs.sep.SepPostingsWriter;
/**
* This class simply passes a PForFactory to a
* PostingsWriter/ReaderBase so that the customized
* int-block format gets plugged in.
*/
public final class PForPostingsFormat extends PostingsFormat {
private final int blockSize;
private final int minBlockSize;
private final int maxBlockSize;
protected final static int DEFAULT_BLOCK_SIZE = 128;
protected final static int DEFAULT_TERM_CACHED_SIZE = 1024;
public PForPostingsFormat() {
super("PFor");
this.blockSize = DEFAULT_BLOCK_SIZE;
this.minBlockSize = BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE;
this.maxBlockSize = BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE;
}
public PForPostingsFormat(int minBlockSize, int maxBlockSize) {
super("PFor");
this.blockSize = DEFAULT_BLOCK_SIZE;
this.minBlockSize = minBlockSize;
assert minBlockSize > 1;
this.maxBlockSize = maxBlockSize;
assert minBlockSize <= maxBlockSize;
}
@Override
public String toString() {
return getName() + "(blocksize=" + blockSize + ")";
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
// TODO: implement a new PostingsWriterBase to improve skip-settings
PostingsWriterBase postingsWriter = new SepPostingsWriter(state, new PForFactory());
boolean success = false;
try {
FieldsConsumer ret = new BlockTreeTermsWriter(state,
postingsWriter,
minBlockSize,
maxBlockSize);
success = true;
return ret;
} finally {
if (!success) {
postingsWriter.close();
}
}
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new SepPostingsReader(state.dir,
state.fieldInfos,
state.segmentInfo,
state.context,
new PForFactory(),
state.segmentSuffix);
boolean success = false;
try {
FieldsProducer ret = new BlockTreeTermsReader(state.dir,
state.fieldInfos,
state.segmentInfo.name,
postingsReader,
state.context,
state.segmentSuffix,
state.termsIndexDivisor);
success = true;
return ret;
} finally {
if (!success) {
postingsReader.close();
}
}
}
}
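A quick usage sketch (illustrative, not part of the commit): the no-arg constructor keeps the 128-int block size and BlockTreeTermsWriter's default terms-dictionary block range, while the two-arg constructor only overrides that range; the int-block size itself stays at DEFAULT_BLOCK_SIZE.

import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.pfor.PForPostingsFormat;

public class PForFormatDemo {                              // hypothetical demo class
  public static void main(String[] args) {
    PostingsFormat pfor  = new PForPostingsFormat();       // defaults from BlockTreeTermsWriter
    PostingsFormat tuned = new PForPostingsFormat(25, 48); // custom min/max terms-dict block sizes
    System.out.println(pfor);                              // "PFor(blocksize=128)"
    System.out.println(tuned);                             // same block size, different terms-dict tuning
  }
}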

View File

@@ -0,0 +1,308 @@
package org.apache.lucene.codecs.pfor;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.nio.IntBuffer;
import java.nio.ByteBuffer;
import java.util.Arrays;
// Encode all small values and exception pointers in normal area,
// Encode large values in exception area.
// Size per exception is variable: 1, 2, or 4 bytes
public final class PForUtil extends ForUtil {
protected static final int[] PER_EXCEPTION_SIZE = {1,2,4};
public static int compress(final int[] data, int size, IntBuffer intBuffer) {
int numBits=getNumBits(data,size);
int[] excValues = new int[size];
int excNum = 0, excLastPos = -1, excFirstPos = -1;
int excLastNonForcePos = -1;
int excNumBase = 0; // num of exception until the last non-force exception
int excBytes = 1; // bytes per exception
int excByteOffset = 0; // bytes of preceding data: header and normal area
long maxChain = (1<<8) - 2; // header bits limits this to 254
boolean conValue, conForce, conEnd;
int i=0;
// estimate exceptions
for (i=0; i<size; ++i) {
conValue = ((data[i] & MASK[numBits]) != data[i]); // value exception
conForce = (i >= maxChain + excLastPos); // force exception
if (conValue || conForce) {
excValues[excNum++] = data[i];
if (excLastPos == -1) {
maxChain = 1L<<numBits;
excFirstPos = i;
}
if (conValue) {
excLastNonForcePos = i;
excNumBase = excNum;
}
excLastPos = i;
}
}
// encode normal area, record exception positions
i=0;
excNum = 0;
if (excFirstPos < 0) { // no exception
for (; i<size; ++i) {
encodeNormalValue(intBuffer,i,data[i], numBits);
}
excLastPos = -1;
} else {
for (; i<excFirstPos; ++i) {
encodeNormalValue(intBuffer,i,data[i], numBits);
}
maxChain = 1L<<numBits;
excLastPos = -1;
for (; i<size; ++i) {
conValue = ((data[i] & MASK[numBits]) != data[i]); // value exception
conForce = (i >= maxChain + excLastPos); // force exception
conEnd = (excNum == excNumBase); // following forced ignored
if ((!conValue && !conForce) || conEnd) {
encodeNormalValue(intBuffer,i,data[i], numBits);
} else {
if (excLastPos >= 0) {
encodeNormalValue(intBuffer, excLastPos, i-excLastPos-1, numBits);
}
excNum++;
excLastPos = i;
}
}
if (excLastPos >= 0) {
encodeNormalValue(intBuffer, excLastPos, (i-excLastPos-1)&MASK[numBits], numBits); // mask out suppressed force exception
}
}
// encode exception area
i=0;
for (; i<excNum; ++i) {
if (excBytes < 2 && (excValues[i] & ~MASK[8]) != 0) {
excBytes=2;
}
if (excBytes < 4 && (excValues[i] & ~MASK[16]) != 0) {
excBytes=4;
}
}
excByteOffset = HEADER_INT_SIZE*4 + (size*numBits + 7)/8;
encodeExcValues(intBuffer, excValues, excNum, excBytes, excByteOffset);
// encode header
encodeHeader(intBuffer, size, numBits, excNum, excFirstPos, excBytes);
return (excByteOffset + excBytes*excNum + 3)/4*4;
}
public static int decompress(IntBuffer intBuffer, int[] data) {
intBuffer.rewind();
int header = intBuffer.get();
int numInts = (header & MASK[8]) + 1;
int excNum = ((header >> 8) & MASK[8]) + 1;
int excFirstPos = ((header >> 16) & MASK[8]) - 1;
int excBytes = PER_EXCEPTION_SIZE[(header >> 29) & MASK[2]];
int numBits = ((header >> 24) & MASK[5]) + 1;
// TODO: PackedIntsDecompress is hardwired to size==128 only
switch(numBits) {
case 1: PackedIntsDecompress.decode1(intBuffer, data); break;
case 2: PackedIntsDecompress.decode2(intBuffer, data); break;
case 3: PackedIntsDecompress.decode3(intBuffer, data); break;
case 4: PackedIntsDecompress.decode4(intBuffer, data); break;
case 5: PackedIntsDecompress.decode5(intBuffer, data); break;
case 6: PackedIntsDecompress.decode6(intBuffer, data); break;
case 7: PackedIntsDecompress.decode7(intBuffer, data); break;
case 8: PackedIntsDecompress.decode8(intBuffer, data); break;
case 9: PackedIntsDecompress.decode9(intBuffer, data); break;
case 10: PackedIntsDecompress.decode10(intBuffer, data); break;
case 11: PackedIntsDecompress.decode11(intBuffer, data); break;
case 12: PackedIntsDecompress.decode12(intBuffer, data); break;
case 13: PackedIntsDecompress.decode13(intBuffer, data); break;
case 14: PackedIntsDecompress.decode14(intBuffer, data); break;
case 15: PackedIntsDecompress.decode15(intBuffer, data); break;
case 16: PackedIntsDecompress.decode16(intBuffer, data); break;
case 17: PackedIntsDecompress.decode17(intBuffer, data); break;
case 18: PackedIntsDecompress.decode18(intBuffer, data); break;
case 19: PackedIntsDecompress.decode19(intBuffer, data); break;
case 20: PackedIntsDecompress.decode20(intBuffer, data); break;
case 21: PackedIntsDecompress.decode21(intBuffer, data); break;
case 22: PackedIntsDecompress.decode22(intBuffer, data); break;
case 23: PackedIntsDecompress.decode23(intBuffer, data); break;
case 24: PackedIntsDecompress.decode24(intBuffer, data); break;
case 25: PackedIntsDecompress.decode25(intBuffer, data); break;
case 26: PackedIntsDecompress.decode26(intBuffer, data); break;
case 27: PackedIntsDecompress.decode27(intBuffer, data); break;
case 28: PackedIntsDecompress.decode28(intBuffer, data); break;
case 29: PackedIntsDecompress.decode29(intBuffer, data); break;
case 30: PackedIntsDecompress.decode30(intBuffer, data); break;
case 31: PackedIntsDecompress.decode31(intBuffer, data); break;
case 32: PackedIntsDecompress.decode32(intBuffer, data); break;
default:
throw new IllegalStateException("Unknown numFrameBits " + numBits);
}
patchException(intBuffer,data,excNum,excFirstPos,excBytes);
return numInts;
}
static void encodeHeader(IntBuffer intBuffer, int numInts, int numBits, int excNum, int excFirstPos, int excBytes) {
int header = getHeader(numInts,numBits,excNum,excFirstPos,excBytes);
intBuffer.put(0, header);
}
static void encodeExcValues(IntBuffer intBuffer, int[] values, int num, int perbytes, int byteOffset) {
if (num == 0)
return;
if (perbytes == 1) {
int curBytePos = byteOffset;
for (int i=0; i<num; ++i) {
int curIntPos = curBytePos / 4;
setBufferIntBits(intBuffer, curIntPos, (curBytePos & 3)*8, 8, values[i]);
curBytePos++;
}
} else if (perbytes == 2) {
int shortOffset = (byteOffset+1)/2;
int curIntPos = shortOffset/2;
int i=0;
if ((shortOffset & 1) == 1) { // cut head to ensure remaining fit ints
setBufferIntBits(intBuffer, curIntPos++, 16, 16, values[i++]);
}
for (; i<num-1; i+=2) {
intBuffer.put(curIntPos++, (values[i+1]<<16) | values[i]);
}
if (i<num) {
intBuffer.put(curIntPos, values[i]); // cut tail, also clear high 16 bits
}
} else if (perbytes == 4) {
int curIntPos = (byteOffset+3) / 4;
for (int i=0; i<num; ++i) {
intBuffer.put(curIntPos++, values[i]);
}
}
}
// TODO: since numInts == 128, we don't need to rewind intBuffer.
// However, the tail of the normal area may share an int with the head of the exception area,
// which means patchException may lose the leading exceptions.
public static void patchException(IntBuffer intBuffer, int[] data, int excNum, int excFirstPos, int excBytes) {
if (excFirstPos == -1) {
return;
}
int curPos=excFirstPos;
int i,j;
if (excBytes == 1) {
for (i=0; i+3<excNum; i+=4) {
final int curInt = intBuffer.get();
curPos = patch(data, curPos, (curInt) & MASK[8]);
curPos = patch(data, curPos, (curInt >>> 8) & MASK[8]);
curPos = patch(data, curPos, (curInt >>> 16) & MASK[8]);
curPos = patch(data, curPos, (curInt >>> 24) & MASK[8]);
}
if (i<excNum) {
final int curInt = intBuffer.get();
for (j=0; j<32 && i<excNum; j+=8,i++) {
curPos = patch(data, curPos, (curInt >>> j) & MASK[8]);
}
}
} else if (excBytes == 2) {
for (i=0; i+1<excNum; i+=2) {
final int curInt = intBuffer.get();
curPos = patch(data, curPos, (curInt) & MASK[16]);
curPos = patch(data, curPos, (curInt >>> 16) & MASK[16]);
}
if (i<excNum) {
final int curInt = intBuffer.get();
curPos = patch(data, curPos, (curInt) & MASK[16]);
}
} else if (excBytes == 4) {
for (i=0; i<excNum; i++) {
curPos = patch(data, curPos, intBuffer.get());
}
}
}
static int patch(int[]data, int pos, int value) {
int nextPos = data[pos] + pos + 1;
data[pos] = value;
assert nextPos > pos;
return nextPos;
}
// TODO: shall we use 32 NumBits directly if it exceeds 28 bits?
static int getNumBits(final int[] data, int size) {
int optBits=1;
int optSize=estimateCompressedSize(data,size,1);
for (int i=2; i<=32; ++i) {
int curSize=estimateCompressedSize(data,size,i);
if (curSize<optSize) {
optSize=curSize;
optBits=i;
}
}
return optBits;
}
// loosely estimate the compressed size (in ints) of a block for a candidate bit width numBits;
// force exceptions are ignored
static int estimateCompressedSize(final int[] data, int size, int numBits) {
int totalBytes=(numBits*size+7)/8; // always round to byte
int excNum=0;
int curExcBytes=1;
for (int i=0; i<size; ++i) {
if ((data[i] & ~MASK[numBits]) != 0) { // exception
excNum++;
if (curExcBytes<2 && (data[i] & ~MASK[8]) != 0) { // exceed 1 byte exception
curExcBytes=2;
}
if (curExcBytes<4 && (data[i] & ~MASK[16]) != 0) { // exceed 2 byte exception
curExcBytes=4;
}
}
}
if (curExcBytes==2) {
totalBytes=((totalBytes+1)/2)*2; // round up to 2x bytes before filling exceptions
}
else if (curExcBytes==4) {
totalBytes=((totalBytes+3)/4)*4; // round up to 4x bytes
}
totalBytes+=excNum*curExcBytes;
return totalBytes/4*4+HEADER_INT_SIZE; // round up to ints
}
/** The 4 byte header (32 bits) contains (from lsb to msb):
*
* - 8 bits for the uncompressed int count - 1 (only 7 bits are actually needed, since the count is at most 128)
*
* - 8 bits for exception num - 1 (when no exceptions, this is undefined)
*
* - 8 bits for the index of the first exception + 1 (when no exception, this is 0)
*
* - 5 bits for num of frame bits - 1
* - 2 bits for the exception code: 00: byte, 01: short, 10: int
* - 1 bit unused
*
*/
static int getHeader(int numInts, int numBits, int excNum, int excFirstPos, int excBytes) {
return (numInts-1)
| (((excNum-1) & MASK[8]) << 8)
| ((excFirstPos+1) << 16)
| ((numBits-1) << 24)
| ((excBytes/2) << 29);
}
}
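To see the whole block pipeline in one place, here is a compress()/decompress() round trip that mirrors what TestPForUtil does later in this commit (an illustrative sketch, not part of the commit itself):

import java.nio.ByteBuffer;
import java.nio.IntBuffer;
import org.apache.lucene.codecs.pfor.PForUtil;

public class PForRoundTripDemo {                      // hypothetical demo class
  public static void main(String[] args) {
    int size = 128;                                   // the block size the decoders are wired for
    int[] original = new int[size];
    for (int i = 0; i < size; ++i) {
      original[i] = i & 7;                            // mostly small values -> few bits per value
    }
    original[77] = 1 << 20;                           // one large value becomes an exception

    byte[] bytes = new byte[size * 8 + 4];            // generous worst-case buffer, as in the test
    IntBuffer buffer = ByteBuffer.wrap(bytes).asIntBuffer();
    int numBytes = PForUtil.compress(original, size, buffer);

    int[] restored = new int[size];
    buffer.rewind();
    int numInts = PForUtil.decompress(buffer, restored);
    // If the codec round-trips as its test asserts, numInts == 128 and restored
    // now equals original; numBytes is the value PForIndexOutput.flushBlock()
    // writes ahead of the encoded bytes.
    System.out.println(numInts + " ints in " + numBytes + " bytes");
  }
}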

View File

@@ -19,7 +19,7 @@ package org.apache.lucene.codecs.pfor;
 import java.nio.IntBuffer;
-final class ForDecompressImpl {
+final class PackedIntsDecompress {
 // nocommit: assess perf of this to see if specializing is really needed

View File

@@ -21,7 +21,6 @@ Generate source code for java classes for FOR decompression.
 """
 USE_SCRATCH = False
-#USE_SCRATCH = True
 def bitsExpr(i, numFrameBits):
 framePos = i * numFrameBits
@@ -51,7 +50,7 @@ def bitsExpr(i, numFrameBits):
 def genDecompress():
-className = "ForDecompressImpl"
+className = "PackedIntsDecompress"
 fileName = className + ".java"
 imports = "import java.nio.IntBuffer;\n"
 f = open(fileName, 'w')
@@ -80,7 +79,7 @@ def genDecompress():
 w("import java.nio.IntBuffer;\n\n")
-w("final class ForDecompressImpl {\n")
+w("final class PackedIntsDecompress {\n")
 w('\n // nocommit: assess perf of this to see if specializing is really needed\n')
@@ -118,7 +117,7 @@ def genDecompress():
 def genSwitch():
 for numFrameBits in xrange(1, 33):
-print ' case %d: ForDecompressImpl.decode%d(compressedBuffer, encoded); break;' % (numFrameBits, numFrameBits)
+print ' case %d: PackedIntsDecompress.decode%d(compressedBuffer, encoded); break;' % (numFrameBits, numFrameBits)
 if __name__ == "__main__":
 genDecompress()

View File

@@ -18,3 +18,4 @@ org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat
 org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat
 org.apache.lucene.codecs.memory.MemoryPostingsFormat
 org.apache.lucene.codecs.pfor.ForPostingsFormat
+org.apache.lucene.codecs.pfor.PForPostingsFormat
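This new services entry is what lets the format be resolved by the name it registers in its constructor, "PFor" (illustrative sketch, not part of the commit):

import org.apache.lucene.codecs.PostingsFormat;

public class PForLookupDemo {                       // hypothetical demo class
  public static void main(String[] args) {
    // Resolved via META-INF/services/org.apache.lucene.codecs.PostingsFormat
    PostingsFormat pfor = PostingsFormat.forName("PFor");
    System.out.println(pfor.getName());             // "PFor"
  }
}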

View File

@@ -23,7 +23,7 @@ import java.nio.*;
 import org.apache.lucene.codecs.pfor.*;
 import org.apache.lucene.util.LuceneTestCase;
-public class TestForUtil extends LuceneTestCase {
+public class TestPForUtil extends LuceneTestCase {
 static final int[] MASK={ 0x00000000,
 0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f,
 0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
@@ -53,25 +53,25 @@ public class TestForUtil extends LuceneTestCase {
 byte[] res = new byte[4+sz*8];
 IntBuffer resBuffer = ByteBuffer.wrap(res).asIntBuffer();
 for (int i=0; i<sz-1; ++i)
-buff[i]=gen.nextInt() & 0;
+buff[i]=gen.nextInt() & 1;
 buff[sz-1]=gen.nextInt() & 0xffffffff; // create only one exception
 Collections.shuffle(Arrays.asList(buff),new Random(seed));
 for (int i=0; i<sz; ++i)
 data[i] = buff[i];
-int ensz = ForUtil.compress(data,sz,resBuffer);
+int ensz = PForUtil.compress(data,sz,resBuffer);
 if (ensz > sz*8+4) {
 println("Excceed? "+ensz+">"+(sz*8+4));
 ensz=sz*8+4;
 }
 resBuffer.rewind();
-ForUtil.decompress(resBuffer,copy);
-// println(getHex(data,sz)+"\n");
-// println(getHex(res,ensz)+"\n");
-// println(getHex(copy,sz)+"\n");
+PForUtil.decompress(resBuffer,copy);
+//println(getHex(data,sz)+"\n");
+//println(getHex(res,ensz)+"\n");
+//println(getHex(copy,sz)+"\n");
 assert cmp(data,sz,copy,sz)==true;
 }
@@ -99,15 +99,14 @@ public class TestForUtil extends LuceneTestCase {
 for (i=0; i<sz; ++i)
 data[i] = buff[i];
-int ensz = ForUtil.compress(data,sz,resBuffer);
+int ensz = PForUtil.compress(data,sz,resBuffer);
 if (ensz > sz*8+4) {
 println("Excceed? "+ensz+">"+(sz*8+4));
 ensz=sz*8+4;
 }
 int[] copy = new int[sz];
-ForUtil.decompress(resBuffer,copy);
+PForUtil.decompress(resBuffer,copy);
 // println(getHex(data,sz)+"\n");
 // println(getHex(res,ensz)+"\n");

View File

@@ -76,7 +76,7 @@ import org.junit.BeforeClass;
 // we won't even be running the actual code, only the impostor
 // @SuppressCodecs("Lucene4x")
 // Sep codec cannot yet handle the offsets in our 4.x index!
-@SuppressCodecs({"MockFixedIntBlock", "MockVariableIntBlock", "MockSep", "MockRandom","For"})
+@SuppressCodecs({"MockFixedIntBlock", "MockVariableIntBlock", "MockSep", "MockRandom","For","PFor"})
 public class TestBackwardsCompatibility extends LuceneTestCase {
 // Uncomment these cases & run them on an older Lucene

View File

@@ -49,7 +49,7 @@ import org.apache.lucene.util._TestUtil;
 // TODO: we really need to test indexingoffsets, but then getting only docs / docs + freqs.
 // not all codecs store prx separate...
 // TODO: fix sep codec to index offsets so we can greatly reduce this list!
-@SuppressCodecs({"MockFixedIntBlock", "MockVariableIntBlock", "MockSep", "MockRandom","For"})
+@SuppressCodecs({"MockFixedIntBlock", "MockVariableIntBlock", "MockSep", "MockRandom","For","PFor"})
 public class TestPostingsOffsets extends LuceneTestCase {
 IndexWriterConfig iwc;

View File

@@ -487,6 +487,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
 add("MockSep");
 add("MockRandom");
 add("For");
+add("PFor");
 }};
 private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect, RandomIndexWriter iw) throws IOException {

View File

@@ -95,6 +95,7 @@ public class MockRandomPostingsFormat extends PostingsFormat {
 delegates.add(new MockVariableIntBlockPostingsFormat.MockIntFactory(baseBlockSize));
 // TODO: others
 delegates.add(new ForFactory());
+delegates.add(new PForFactory());
 }
 private static String getExtension(String fileName) {