mirror of https://github.com/apache/lucene.git
LUCENE-4620: inline encoding/decoding
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1433926 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
48a5004fe6
commit
800d2f322b
|
@ -49,20 +49,20 @@ import org.apache.lucene.util.IntsRef;
|
|||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class ChunksIntEncoder extends IntEncoder {
|
||||
|
||||
|
||||
/** Holds the values which must be encoded, outside the indicator. */
|
||||
protected final IntsRef encodeQueue;
|
||||
|
||||
|
||||
/** Holds the bit flags that are written as the chunk's indicator byte. */
|
||||
protected int indicator = 0;
|
||||
|
||||
|
||||
/** Counts the current ordinal of the encoded value. */
|
||||
protected byte ordinal = 0;
|
||||
|
||||
|
||||
protected ChunksIntEncoder(int chunkSize) {
|
||||
encodeQueue = new IntsRef(chunkSize);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Encodes the values of the current chunk. First it writes the indicator, and
|
||||
* then it encodes the values outside the indicator.
|
||||
|
@ -76,17 +76,40 @@ public abstract class ChunksIntEncoder extends IntEncoder {
|
|||
|
||||
buf.bytes[buf.length++] = ((byte) indicator);
|
||||
for (int i = 0; i < encodeQueue.length; i++) {
|
||||
VInt8.encode(encodeQueue.ints[i], buf);
|
||||
// it is better if the encoding is inlined like so, and not e.g.
|
||||
// in a utility method
|
||||
int value = encodeQueue.ints[i];
|
||||
if ((value & ~0x7F) == 0) {
|
||||
buf.bytes[buf.length] = (byte) value;
|
||||
buf.length++;
|
||||
} else if ((value & ~0x3FFF) == 0) {
|
||||
buf.bytes[buf.length] = (byte) (0x80 | ((value & 0x3F80) >> 7));
|
||||
buf.bytes[buf.length + 1] = (byte) (value & 0x7F);
|
||||
buf.length += 2;
|
||||
} else if ((value & ~0x1FFFFF) == 0) {
|
||||
buf.bytes[buf.length] = (byte) (0x80 | ((value & 0x1FC000) >> 14));
|
||||
buf.bytes[buf.length + 1] = (byte) (0x80 | ((value & 0x3F80) >> 7));
|
||||
buf.bytes[buf.length + 2] = (byte) (value & 0x7F);
|
||||
buf.length += 3;
|
||||
} else if ((value & ~0xFFFFFFF) == 0) {
|
||||
buf.bytes[buf.length] = (byte) (0x80 | ((value & 0xFE00000) >> 21));
|
||||
buf.bytes[buf.length + 1] = (byte) (0x80 | ((value & 0x1FC000) >> 14));
|
||||
buf.bytes[buf.length + 2] = (byte) (0x80 | ((value & 0x3F80) >> 7));
|
||||
buf.bytes[buf.length + 3] = (byte) (value & 0x7F);
|
||||
buf.length += 4;
|
||||
} else {
|
||||
buf.bytes[buf.length] = (byte) (0x80 | ((value & 0xF0000000) >> 28));
|
||||
buf.bytes[buf.length + 1] = (byte) (0x80 | ((value & 0xFE00000) >> 21));
|
||||
buf.bytes[buf.length + 2] = (byte) (0x80 | ((value & 0x1FC000) >> 14));
|
||||
buf.bytes[buf.length + 3] = (byte) (0x80 | ((value & 0x3F80) >> 7));
|
||||
buf.bytes[buf.length + 4] = (byte) (value & 0x7F);
|
||||
buf.length += 5;
|
||||
}
|
||||
}
|
||||
|
||||
reset();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void reset() {
|
||||
ordinal = 0;
|
||||
indicator = 0;
|
||||
encodeQueue.length = 0;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -26,7 +26,7 @@ import org.apache.lucene.util.IntsRef;
|
|||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class DGapIntDecoder extends IntDecoder {
|
||||
public final class DGapIntDecoder extends IntDecoder {
|
||||
|
||||
private final IntDecoder decoder;
|
||||
|
||||
|
@ -35,13 +35,8 @@ public class DGapIntDecoder extends IntDecoder {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void reset() {
|
||||
decoder.reset();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void doDecode(BytesRef buf, IntsRef values, int upto) {
|
||||
decoder.doDecode(buf, values, upto);
|
||||
public void decode(BytesRef buf, IntsRef values) {
|
||||
decoder.decode(buf, values);
|
||||
int prev = 0;
|
||||
for (int i = 0; i < values.length; i++) {
|
||||
values.ints[i] += prev;
|
||||
|
@ -51,7 +46,7 @@ public class DGapIntDecoder extends IntDecoder {
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "DGap (" + decoder.toString() + ")";
|
||||
return "DGap(" + decoder.toString() + ")";
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -35,7 +35,7 @@ import org.apache.lucene.util.IntsRef;
|
|||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class DGapIntEncoder extends IntEncoderFilter {
|
||||
public final class DGapIntEncoder extends IntEncoderFilter {
|
||||
|
||||
/** Initializes with the given encoder. */
|
||||
public DGapIntEncoder(IntEncoder encoder) {
|
||||
|
@ -43,14 +43,15 @@ public class DGapIntEncoder extends IntEncoderFilter {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void doEncode(IntsRef values, BytesRef buf, int upto) {
|
||||
public void encode(IntsRef values, BytesRef buf) {
|
||||
int prev = 0;
|
||||
int upto = values.offset + values.length;
|
||||
for (int i = values.offset; i < upto; i++) {
|
||||
int tmp = values.ints[i];
|
||||
values.ints[i] -= prev;
|
||||
prev = tmp;
|
||||
}
|
||||
encoder.doEncode(values, buf, upto);
|
||||
encoder.encode(values, buf);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -60,7 +61,7 @@ public class DGapIntEncoder extends IntEncoderFilter {
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "DGap (" + encoder.toString() + ")";
|
||||
return "DGap(" + encoder.toString() + ")";
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -45,10 +45,13 @@ public class EightFlagsIntDecoder extends IntDecoder {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void doDecode(BytesRef buf, IntsRef values, int upto) {
|
||||
while (buf.offset < upto) {
|
||||
public void decode(BytesRef buf, IntsRef values) {
|
||||
values.offset = values.length = 0;
|
||||
int upto = buf.offset + buf.length;
|
||||
int offset = buf.offset;
|
||||
while (offset < upto) {
|
||||
// read indicator
|
||||
int indicator = buf.bytes[buf.offset++] & 0xFF;
|
||||
int indicator = buf.bytes[offset++] & 0xFF;
|
||||
int ordinal = 0;
|
||||
|
||||
int capacityNeeded = values.length + 8;
|
||||
|
@ -59,11 +62,21 @@ public class EightFlagsIntDecoder extends IntDecoder {
|
|||
// process indicator, until we read 8 values, or end-of-buffer
|
||||
while (ordinal != 8) {
|
||||
if (DECODE_TABLE[indicator][ordinal++] == 0) {
|
||||
if (buf.offset == upto) { // end of buffer
|
||||
if (offset == upto) { // end of buffer
|
||||
return;
|
||||
}
|
||||
// decode the value from the stream.
|
||||
values.ints[values.length++] = VInt8.decode(buf) + 2;
|
||||
// it is better if the decoding is inlined like so, and not e.g.
|
||||
// in a utility method
|
||||
int value = 0;
|
||||
while (true) {
|
||||
byte b = buf.bytes[offset++];
|
||||
if (b >= 0) {
|
||||
values.ints[values.length++] = ((value << 7) | b) + 2;
|
||||
break;
|
||||
} else {
|
||||
value = (value << 7) | (b & 0x7F);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
values.ints[values.length++] = 1;
|
||||
}
|
||||
|
@ -73,7 +86,7 @@ public class EightFlagsIntDecoder extends IntDecoder {
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "EightFlags (VInt8)";
|
||||
return "EightFlags(VInt8)";
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -59,7 +59,9 @@ public class EightFlagsIntEncoder extends ChunksIntEncoder {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void doEncode(IntsRef values, BytesRef buf, int upto) {
|
||||
public void encode(IntsRef values, BytesRef buf) {
|
||||
buf.offset = buf.length = 0;
|
||||
int upto = values.offset + values.length;
|
||||
for (int i = values.offset; i < upto; i++) {
|
||||
int value = values.ints[i];
|
||||
if (value == 1) {
|
||||
|
@ -88,7 +90,7 @@ public class EightFlagsIntEncoder extends ChunksIntEncoder {
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "EightFlags (VInt)";
|
||||
return "EightFlags(VInt)";
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -45,10 +45,13 @@ public class FourFlagsIntDecoder extends IntDecoder {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void doDecode(BytesRef buf, IntsRef values, int upto) {
|
||||
while (buf.offset < upto) {
|
||||
public void decode(BytesRef buf, IntsRef values) {
|
||||
values.offset = values.length = 0;
|
||||
int upto = buf.offset + buf.length;
|
||||
int offset = buf.offset;
|
||||
while (offset < upto) {
|
||||
// read indicator
|
||||
int indicator = buf.bytes[buf.offset++] & 0xFF;
|
||||
int indicator = buf.bytes[offset++] & 0xFF;
|
||||
int ordinal = 0;
|
||||
|
||||
int capacityNeeded = values.length + 4;
|
||||
|
@ -59,11 +62,21 @@ public class FourFlagsIntDecoder extends IntDecoder {
|
|||
while (ordinal != 4) {
|
||||
byte decodeVal = DECODE_TABLE[indicator][ordinal++];
|
||||
if (decodeVal == 0) {
|
||||
if (buf.offset == upto) { // end of buffer
|
||||
if (offset == upto) { // end of buffer
|
||||
return;
|
||||
}
|
||||
// decode the value from the stream.
|
||||
values.ints[values.length++] = VInt8.decode(buf) + 4;
|
||||
// it is better if the decoding is inlined like so, and not e.g.
|
||||
// in a utility method
|
||||
int value = 0;
|
||||
while (true) {
|
||||
byte b = buf.bytes[offset++];
|
||||
if (b >= 0) {
|
||||
values.ints[values.length++] = ((value << 7) | b) + 4;
|
||||
break;
|
||||
} else {
|
||||
value = (value << 7) | (b & 0x7F);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
values.ints[values.length++] = decodeVal;
|
||||
}
|
||||
|
@ -73,7 +86,7 @@ public class FourFlagsIntDecoder extends IntDecoder {
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "FourFlags (VInt8)";
|
||||
return "FourFlags(VInt8)";
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -65,7 +65,9 @@ public class FourFlagsIntEncoder extends ChunksIntEncoder {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void doEncode(IntsRef values, BytesRef buf, int upto) {
|
||||
public void encode(IntsRef values, BytesRef buf) {
|
||||
buf.offset = buf.length = 0;
|
||||
int upto = values.offset + values.length;
|
||||
for (int i = values.offset; i < upto; i++) {
|
||||
int value = values.ints[i];
|
||||
if (value <= 3) {
|
||||
|
@ -94,7 +96,7 @@ public class FourFlagsIntEncoder extends ChunksIntEncoder {
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "FourFlags (VInt)";
|
||||
return "FourFlags(VInt)";
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -27,44 +27,10 @@ import org.apache.lucene.util.IntsRef;
|
|||
*/
|
||||
public abstract class IntDecoder {
|
||||
|
||||
/**
|
||||
* Performs the actual decoding. Values should be read from
|
||||
* {@link BytesRef#offset} up to {@code upto}. Also, {@code values} offset and
|
||||
* length are set to 0 and the decoder is expected to update
|
||||
* {@link IntsRef#length}, but not {@link IntsRef#offset}.
|
||||
*
|
||||
* <p>
|
||||
* <b>NOTE:</b> it is ok to use the buffer's offset as the current position in
|
||||
* the buffer (and modify it), it will be reset by
|
||||
* {@link #decode(BytesRef, IntsRef)}.
|
||||
*/
|
||||
protected abstract void doDecode(BytesRef buf, IntsRef values, int upto);
|
||||
|
||||
/**
|
||||
* Called before {@link #doDecode(BytesRef, IntsRef, int)} so that decoders
|
||||
* can reset their state.
|
||||
*/
|
||||
protected void reset() {
|
||||
// do nothing by default
|
||||
}
|
||||
|
||||
/**
|
||||
* Decodes the values from the buffer into the given {@link IntsRef}. Note
|
||||
* that {@code values.offset} and {@code values.length} are set to 0.
|
||||
*/
|
||||
public final void decode(BytesRef buf, IntsRef values) {
|
||||
values.offset = values.length = 0; // must do that because we cannot grow() them otherwise
|
||||
|
||||
// some decoders may use the buffer's offset as a position index, so save
|
||||
// current offset.
|
||||
int bufOffset = buf.offset;
|
||||
|
||||
reset();
|
||||
doDecode(buf, values, buf.offset + buf.length);
|
||||
assert values.offset == 0 : "offset should not have been modified by the decoder.";
|
||||
|
||||
// fix offset
|
||||
buf.offset = bufOffset;
|
||||
}
|
||||
public abstract void decode(BytesRef buf, IntsRef values);
|
||||
|
||||
}
|
||||
|
|
|
@ -31,32 +31,11 @@ public abstract class IntEncoder {
|
|||
|
||||
public IntEncoder() {}
|
||||
|
||||
/**
|
||||
* Performs the actual encoding. Values should be read from
|
||||
* {@link IntsRef#offset} up to {@code upto}. Also, it is guaranteed that
|
||||
* {@code buf's} offset and length are set to 0 and the encoder is expected to
|
||||
* update {@link BytesRef#length}, but not {@link BytesRef#offset}.
|
||||
*/
|
||||
protected abstract void doEncode(IntsRef values, BytesRef buf, int upto);
|
||||
|
||||
/**
|
||||
* Called before {@link #doEncode(IntsRef, BytesRef, int)} so that encoders
|
||||
* can reset their state.
|
||||
*/
|
||||
protected void reset() {
|
||||
// do nothing by default
|
||||
}
|
||||
|
||||
/**
|
||||
* Encodes the values to the given buffer. Note that the buffer's offset and
|
||||
* length are set to 0.
|
||||
*/
|
||||
public final void encode(IntsRef values, BytesRef buf) {
|
||||
buf.offset = buf.length = 0;
|
||||
reset();
|
||||
doEncode(values, buf, values.offset + values.length);
|
||||
assert buf.offset == 0 : "offset should not have been modified by the encoder.";
|
||||
}
|
||||
public abstract void encode(IntsRef values, BytesRef buf);
|
||||
|
||||
/**
|
||||
* Returns an {@link IntDecoder} which can decode the values that were encoded
|
||||
|
|
|
@ -31,9 +31,4 @@ public abstract class IntEncoderFilter extends IntEncoder {
|
|||
this.encoder = encoder;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() {
|
||||
encoder.reset();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -42,14 +42,10 @@ public class NOnesIntDecoder extends FourFlagsIntDecoder {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void reset() {
|
||||
public void decode(BytesRef buf, IntsRef values) {
|
||||
values.offset = values.length = 0;
|
||||
internalBuffer.length = 0;
|
||||
super.reset();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void doDecode(BytesRef buf, IntsRef values, int upto) {
|
||||
super.doDecode(buf, internalBuffer, upto);
|
||||
super.decode(buf, internalBuffer);
|
||||
if (values.ints.length < internalBuffer.length) {
|
||||
// need space for internalBuffer.length to internalBuffer.length*N,
|
||||
// grow mildly at first
|
||||
|
@ -84,7 +80,7 @@ public class NOnesIntDecoder extends FourFlagsIntDecoder {
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "NOnes (" + n + ") (" + super.toString() + ")";
|
||||
return "NOnes(" + n + ") (" + super.toString() + ")";
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -65,19 +65,15 @@ public class NOnesIntEncoder extends FourFlagsIntEncoder {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void reset() {
|
||||
public void encode(IntsRef values, BytesRef buf) {
|
||||
internalBuffer.length = 0;
|
||||
super.reset();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void doEncode(IntsRef values, BytesRef buf, int upto) {
|
||||
// make sure the internal buffer is large enough
|
||||
if (values.length > internalBuffer.ints.length) {
|
||||
internalBuffer.grow(values.length);
|
||||
}
|
||||
|
||||
int onesCounter = 0;
|
||||
int upto = values.offset + values.length;
|
||||
for (int i = values.offset; i < upto; i++) {
|
||||
int value = values.ints[i];
|
||||
if (value == 1) {
|
||||
|
@ -102,7 +98,7 @@ public class NOnesIntEncoder extends FourFlagsIntEncoder {
|
|||
--onesCounter;
|
||||
internalBuffer.ints[internalBuffer.length++] = 1;
|
||||
}
|
||||
super.doEncode(internalBuffer, buf, internalBuffer.length);
|
||||
super.encode(internalBuffer, buf);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -112,7 +108,7 @@ public class NOnesIntEncoder extends FourFlagsIntEncoder {
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "NOnes (" + n + ") (" + super.toString() + ")";
|
||||
return "NOnes(" + n + ") (" + super.toString() + ")";
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
package org.apache.lucene.util.encoding;
|
||||
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
|
@ -25,19 +27,24 @@ import org.apache.lucene.util.IntsRef;
|
|||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class SimpleIntDecoder extends IntDecoder {
|
||||
public final class SimpleIntDecoder extends IntDecoder {
|
||||
|
||||
@Override
|
||||
protected void doDecode(BytesRef buf, IntsRef values, int upto) {
|
||||
while (buf.offset < upto) {
|
||||
if (values.length == values.ints.length) {
|
||||
values.grow(values.length + 10); // grow by few items, however not too many
|
||||
}
|
||||
public void decode(BytesRef buf, IntsRef values) {
|
||||
values.offset = values.length = 0;
|
||||
int numValues = buf.length / 4; // every value is 4 bytes
|
||||
if (values.ints.length < numValues) { // offset and length are 0
|
||||
values.ints = new int[ArrayUtil.oversize(numValues, RamUsageEstimator.NUM_BYTES_INT)];
|
||||
}
|
||||
|
||||
int offset = buf.offset;
|
||||
int upto = buf.offset + buf.length;
|
||||
while (offset < upto) {
|
||||
values.ints[values.length++] =
|
||||
((buf.bytes[buf.offset++] & 0xFF) << 24) |
|
||||
((buf.bytes[buf.offset++] & 0xFF) << 16) |
|
||||
((buf.bytes[buf.offset++] & 0xFF) << 8) |
|
||||
(buf.bytes[buf.offset++] & 0xFF);
|
||||
((buf.bytes[offset++] & 0xFF) << 24) |
|
||||
((buf.bytes[offset++] & 0xFF) << 16) |
|
||||
((buf.bytes[offset++] & 0xFF) << 8) |
|
||||
(buf.bytes[offset++] & 0xFF);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -25,16 +25,18 @@ import org.apache.lucene.util.IntsRef;
|
|||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class SimpleIntEncoder extends IntEncoder {
|
||||
public final class SimpleIntEncoder extends IntEncoder {
|
||||
|
||||
@Override
|
||||
protected void doEncode(IntsRef values, BytesRef buf, int upto) {
|
||||
public void encode(IntsRef values, BytesRef buf) {
|
||||
buf.offset = buf.length = 0;
|
||||
// ensure there's enough room in the buffer
|
||||
int bytesNeeded = values.length * 4;
|
||||
if (buf.bytes.length < bytesNeeded) {
|
||||
buf.grow(bytesNeeded);
|
||||
}
|
||||
|
||||
int upto = values.offset + values.length;
|
||||
for (int i = values.offset; i < upto; i++) {
|
||||
int value = values.ints[i];
|
||||
buf.bytes[buf.length++] = (byte) (value >>> 24);
|
||||
|
|
|
@ -28,7 +28,7 @@ import org.apache.lucene.util.IntsRef;
|
|||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class SortingIntEncoder extends IntEncoderFilter {
|
||||
public final class SortingIntEncoder extends IntEncoderFilter {
|
||||
|
||||
/** Initializes with the given encoder. */
|
||||
public SortingIntEncoder(IntEncoder encoder) {
|
||||
|
@ -36,9 +36,9 @@ public class SortingIntEncoder extends IntEncoderFilter {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void doEncode(IntsRef values, BytesRef buf, int upto) {
|
||||
Arrays.sort(values.ints, values.offset, upto);
|
||||
encoder.doEncode(values, buf, upto);
|
||||
public void encode(IntsRef values, BytesRef buf) {
|
||||
Arrays.sort(values.ints, values.offset, values.offset + values.length);
|
||||
encoder.encode(values, buf);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -48,7 +48,7 @@ public class SortingIntEncoder extends IntEncoderFilter {
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Sorting (" + encoder.toString() + ")";
|
||||
return "Sorting(" + encoder.toString() + ")";
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -36,9 +36,10 @@ public final class UniqueValuesIntEncoder extends IntEncoderFilter {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void doEncode(IntsRef values, BytesRef buf, int upto) {
|
||||
public void encode(IntsRef values, BytesRef buf) {
|
||||
int prev = values.ints[values.offset];
|
||||
int idx = values.offset + 1;
|
||||
int upto = values.offset + values.length;
|
||||
for (int i = idx; i < upto; i++) {
|
||||
if (values.ints[i] != prev) {
|
||||
values.ints[idx++] = values.ints[i];
|
||||
|
@ -46,7 +47,7 @@ public final class UniqueValuesIntEncoder extends IntEncoderFilter {
|
|||
}
|
||||
}
|
||||
values.length = idx - values.offset;
|
||||
encoder.doEncode(values, buf, idx);
|
||||
encoder.encode(values, buf);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -56,7 +57,7 @@ public final class UniqueValuesIntEncoder extends IntEncoderFilter {
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Unique (" + encoder.toString() + ")";
|
||||
return "Unique(" + encoder.toString() + ")";
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,138 +0,0 @@
|
|||
package org.apache.lucene.util.encoding;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Variable-length encoding of 32-bit integers, into 8-bit bytes. A number is
|
||||
* encoded as follows:
|
||||
* <ul>
|
||||
* <li>If it is non-negative and at most 127 (i.e., if the number uses only 7
|
||||
* bits), it is encoded as a single byte: 0bbbbbbb.
|
||||
* <li>If its highest nonzero bit is greater than bit 6 (0x40), it is
|
||||
* represented as a series of bytes, each byte's 7 LSB containing bits from the
|
||||
* original value, with the MSB set for all but the last byte. The first encoded
|
||||
* byte contains the highest nonzero bits from the original; the second byte
|
||||
* contains the next 7 MSB; and so on, with the last byte containing the 7 LSB
|
||||
* of the original.
|
||||
* </ul>
|
||||
* Examples:
|
||||
* <ol>
|
||||
* <li>n = 117 = 1110101: This has fewer than 8 significant bits, and so is
|
||||
* encoded as 01110101 = 0x75.
|
||||
* <li>n = 100000 = (binary) 11000011010100000. This has 17 significant bits,
|
||||
* and so needs three Vint8 bytes. Left-zero-pad it to a multiple of 7 bits,
|
||||
* then split it into chunks of 7 and add an MSB, 0 for the last byte, 1 for the
|
||||
* others: 1|0000110 1|0001101 0|0100000 = 0x86 0x8D 0x20.
|
||||
* </ol>
|
||||
* {@link #encode(int, BytesRef)} and {@link #decode(BytesRef)} will correctly
|
||||
* handle any 32-bit integer, but for negative numbers, and positive numbers
|
||||
* with more than 28 significant bits, encoding requires 5 bytes; this is not an
|
||||
* efficient encoding scheme for large positive numbers or any negative number.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class VInt8 {
|
||||
|
||||
/** The maximum number of bytes needed to encode an integer. */
|
||||
public static final int MAXIMUM_BYTES_NEEDED = 5;
|
||||
|
||||
/**
|
||||
* Decodes an int from the given bytes, starting at {@link BytesRef#offset}.
|
||||
* Returns the decoded value and advances {@link BytesRef#offset} past the bytes that were read.
|
||||
*/
|
||||
public static int decode(BytesRef bytes) {
|
||||
/*
|
||||
This is the original code of this method, but a Hotspot bug
|
||||
corrupted the for-loop of DataInput.readVInt() (see LUCENE-2975)
|
||||
so the loop was unrolled here too, to be on the safe side
|
||||
int value = 0;
|
||||
while (true) {
|
||||
byte first = bytes.bytes[bytes.offset++];
|
||||
value |= first & 0x7F;
|
||||
if ((first & 0x80) == 0) {
|
||||
return value;
|
||||
}
|
||||
value <<= 7;
|
||||
}
|
||||
*/
|
||||
|
||||
// byte 1
|
||||
byte b = bytes.bytes[bytes.offset++];
|
||||
if (b >= 0) return b;
|
||||
|
||||
// byte 2
|
||||
int value = b & 0x7F;
|
||||
b = bytes.bytes[bytes.offset++];
|
||||
value = (value << 7) | b & 0x7F;
|
||||
if (b >= 0) return value;
|
||||
|
||||
// byte 3
|
||||
b = bytes.bytes[bytes.offset++];
|
||||
value = (value << 7) | b & 0x7F;
|
||||
if (b >= 0) return value;
|
||||
|
||||
// byte 4
|
||||
b = bytes.bytes[bytes.offset++];
|
||||
value = (value << 7) | b & 0x7F;
|
||||
if (b >= 0) return value;
|
||||
|
||||
// byte 5
|
||||
b = bytes.bytes[bytes.offset++];
|
||||
return (value << 7) | b & 0x7F;
|
||||
}
|
||||
|
||||
/**
|
||||
* Encodes the given number into bytes, starting at {@link BytesRef#length}.
|
||||
* Assumes that the array is large enough.
|
||||
*/
|
||||
public static void encode(int value, BytesRef bytes) {
|
||||
if ((value & ~0x7F) == 0) {
|
||||
bytes.bytes[bytes.length] = (byte) value;
|
||||
bytes.length++;
|
||||
} else if ((value & ~0x3FFF) == 0) {
|
||||
bytes.bytes[bytes.length] = (byte) (0x80 | ((value & 0x3F80) >> 7));
|
||||
bytes.bytes[bytes.length + 1] = (byte) (value & 0x7F);
|
||||
bytes.length += 2;
|
||||
} else if ((value & ~0x1FFFFF) == 0) {
|
||||
bytes.bytes[bytes.length] = (byte) (0x80 | ((value & 0x1FC000) >> 14));
|
||||
bytes.bytes[bytes.length + 1] = (byte) (0x80 | ((value & 0x3F80) >> 7));
|
||||
bytes.bytes[bytes.length + 2] = (byte) (value & 0x7F);
|
||||
bytes.length += 3;
|
||||
} else if ((value & ~0xFFFFFFF) == 0) {
|
||||
bytes.bytes[bytes.length] = (byte) (0x80 | ((value & 0xFE00000) >> 21));
|
||||
bytes.bytes[bytes.length + 1] = (byte) (0x80 | ((value & 0x1FC000) >> 14));
|
||||
bytes.bytes[bytes.length + 2] = (byte) (0x80 | ((value & 0x3F80) >> 7));
|
||||
bytes.bytes[bytes.length + 3] = (byte) (value & 0x7F);
|
||||
bytes.length += 4;
|
||||
} else {
|
||||
bytes.bytes[bytes.length] = (byte) (0x80 | ((value & 0xF0000000) >> 28));
|
||||
bytes.bytes[bytes.length + 1] = (byte) (0x80 | ((value & 0xFE00000) >> 21));
|
||||
bytes.bytes[bytes.length + 2] = (byte) (0x80 | ((value & 0x1FC000) >> 14));
|
||||
bytes.bytes[bytes.length + 3] = (byte) (0x80 | ((value & 0x3F80) >> 7));
|
||||
bytes.bytes[bytes.length + 4] = (byte) (value & 0x7F);
|
||||
bytes.length += 5;
|
||||
}
|
||||
}
|
||||
|
||||
private VInt8() {
|
||||
// Just making it impossible to instantiate.
|
||||
}
|
||||
|
||||
}
|
|
@ -1,7 +1,9 @@
|
|||
package org.apache.lucene.util.encoding;
|
||||
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
|
@ -25,15 +27,32 @@ import org.apache.lucene.util.IntsRef;
|
|||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class VInt8IntDecoder extends IntDecoder {
|
||||
public final class VInt8IntDecoder extends IntDecoder {
|
||||
|
||||
@Override
|
||||
protected void doDecode(BytesRef buf, IntsRef values, int upto) {
|
||||
while (buf.offset < upto) {
|
||||
if (values.length == values.ints.length) {
|
||||
values.grow(values.length + 10); // grow by few items, however not too many
|
||||
public void decode(BytesRef buf, IntsRef values) {
|
||||
values.offset = values.length = 0;
|
||||
|
||||
// grow the buffer up front, even if by a large number of values (buf.length)
|
||||
// that saves the need to check inside the loop for every decoded value if
|
||||
// the buffer needs to grow.
|
||||
if (values.ints.length < buf.length) {
|
||||
values.ints = new int[ArrayUtil.oversize(buf.length, RamUsageEstimator.NUM_BYTES_INT)];
|
||||
}
|
||||
|
||||
// it is better if the decoding is inlined like so, and not e.g.
|
||||
// in a utility method
|
||||
int upto = buf.offset + buf.length;
|
||||
int value = 0;
|
||||
int offset = buf.offset;
|
||||
while (offset < upto) {
|
||||
byte b = buf.bytes[offset++];
|
||||
if (b >= 0) {
|
||||
values.ints[values.length++] = (value << 7) | b;
|
||||
value = 0;
|
||||
} else {
|
||||
value = (value << 7) | (b & 0x7F);
|
||||
}
|
||||
values.ints[values.length++] = VInt8.decode(buf);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -47,17 +47,47 @@ import org.apache.lucene.util.IntsRef;
|
|||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class VInt8IntEncoder extends IntEncoder {
|
||||
public final class VInt8IntEncoder extends IntEncoder {
|
||||
|
||||
@Override
|
||||
protected void doEncode(IntsRef values, BytesRef buf, int upto) {
|
||||
public void encode(IntsRef values, BytesRef buf) {
|
||||
buf.offset = buf.length = 0;
|
||||
int maxBytesNeeded = 5 * values.length; // at most 5 bytes per VInt
|
||||
if (buf.bytes.length < maxBytesNeeded) {
|
||||
buf.grow(maxBytesNeeded);
|
||||
}
|
||||
|
||||
int upto = values.offset + values.length;
|
||||
for (int i = values.offset; i < upto; i++) {
|
||||
VInt8.encode(values.ints[i], buf);
|
||||
// it is better if the encoding is inlined like so, and not e.g.
|
||||
// in a utility method
|
||||
int value = values.ints[i];
|
||||
if ((value & ~0x7F) == 0) {
|
||||
buf.bytes[buf.length] = (byte) value;
|
||||
buf.length++;
|
||||
} else if ((value & ~0x3FFF) == 0) {
|
||||
buf.bytes[buf.length] = (byte) (0x80 | ((value & 0x3F80) >> 7));
|
||||
buf.bytes[buf.length + 1] = (byte) (value & 0x7F);
|
||||
buf.length += 2;
|
||||
} else if ((value & ~0x1FFFFF) == 0) {
|
||||
buf.bytes[buf.length] = (byte) (0x80 | ((value & 0x1FC000) >> 14));
|
||||
buf.bytes[buf.length + 1] = (byte) (0x80 | ((value & 0x3F80) >> 7));
|
||||
buf.bytes[buf.length + 2] = (byte) (value & 0x7F);
|
||||
buf.length += 3;
|
||||
} else if ((value & ~0xFFFFFFF) == 0) {
|
||||
buf.bytes[buf.length] = (byte) (0x80 | ((value & 0xFE00000) >> 21));
|
||||
buf.bytes[buf.length + 1] = (byte) (0x80 | ((value & 0x1FC000) >> 14));
|
||||
buf.bytes[buf.length + 2] = (byte) (0x80 | ((value & 0x3F80) >> 7));
|
||||
buf.bytes[buf.length + 3] = (byte) (value & 0x7F);
|
||||
buf.length += 4;
|
||||
} else {
|
||||
buf.bytes[buf.length] = (byte) (0x80 | ((value & 0xF0000000) >> 28));
|
||||
buf.bytes[buf.length + 1] = (byte) (0x80 | ((value & 0xFE00000) >> 21));
|
||||
buf.bytes[buf.length + 2] = (byte) (0x80 | ((value & 0x1FC000) >> 14));
|
||||
buf.bytes[buf.length + 3] = (byte) (0x80 | ((value & 0x3F80) >> 7));
|
||||
buf.bytes[buf.length + 4] = (byte) (value & 0x7F);
|
||||
buf.length += 5;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,12 @@
|
|||
package org.apache.lucene.facet.index.params;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.encoding.DGapIntEncoder;
|
||||
import org.apache.lucene.util.encoding.IntDecoder;
|
||||
import org.apache.lucene.util.encoding.IntEncoder;
|
||||
import org.apache.lucene.util.encoding.SortingIntEncoder;
|
||||
import org.apache.lucene.util.encoding.UniqueValuesIntEncoder;
|
||||
import org.apache.lucene.util.encoding.VInt8IntEncoder;
|
||||
import org.junit.Test;
|
||||
|
||||
/*
|
||||
|
@ -26,8 +32,10 @@ public class CategoryListParamsTest extends LuceneTestCase {
|
|||
public void testDefaultSettings() {
|
||||
CategoryListParams clp = new CategoryListParams();
|
||||
assertEquals("wrong default field", "$facets", clp.field);
|
||||
assertEquals("unexpected default encoder", "Sorting (Unique (DGap (VInt8)))", clp.createEncoder().toString());
|
||||
assertEquals("unexpected default decoder", "DGap (VInt8)", clp.createEncoder().createMatchingDecoder().toString());
|
||||
IntEncoder encoder = new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder())));
|
||||
IntDecoder decoder = encoder.createMatchingDecoder();
|
||||
assertEquals("unexpected default encoder", encoder.toString(), clp.createEncoder().toString());
|
||||
assertEquals("unexpected default decoder", decoder.toString(), clp.createEncoder().createMatchingDecoder().toString());
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -64,9 +64,12 @@ public class EncodingTest extends LuceneTestCase {
|
|||
|
||||
BytesRef bytes = new BytesRef(100); // some initial capacity - encoders should grow the byte[]
|
||||
IntsRef values = new IntsRef(100); // some initial capacity - decoders should grow the int[]
|
||||
encoding(encoder, data, bytes);
|
||||
decoding(bytes, values, encoder.createMatchingDecoder());
|
||||
assertTrue(expected.intsEquals(values));
|
||||
for (int i = 0; i < 2; i++) {
|
||||
// run 2 iterations to catch encoders/decoders which don't reset properly
|
||||
encoding(encoder, data, bytes);
|
||||
decoding(bytes, values, encoder.createMatchingDecoder());
|
||||
assertTrue(expected.intsEquals(values));
|
||||
}
|
||||
}
|
||||
|
||||
private static void encoding(IntEncoder encoder, IntsRef data, BytesRef bytes) throws IOException {
|
||||
|
|
|
@ -1,54 +0,0 @@
|
|||
package org.apache.lucene.util.encoding;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.junit.Test;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Tests the {@link VInt8} class.
|
||||
*/
|
||||
public class Vint8Test extends LuceneTestCase {
|
||||
|
||||
private static final int[] TEST_VALUES = {
|
||||
-1000000000,
|
||||
-1, 0, (1 << 7) - 1, 1 << 7, (1 << 14) - 1, 1 << 14,
|
||||
(1 << 21) - 1, 1 << 21, (1 << 28) - 1, 1 << 28
|
||||
};
|
||||
private static int[] BYTES_NEEDED_TEST_VALUES = {
|
||||
5, 5, 1, 1, 2, 2, 3, 3, 4, 4, 5
|
||||
};
|
||||
|
||||
@Test
|
||||
public void testBytesRef() throws Exception {
|
||||
BytesRef bytes = new BytesRef(256);
|
||||
int expectedSize = 0;
|
||||
for (int j = 0; j < TEST_VALUES.length; j++) {
|
||||
VInt8.encode(TEST_VALUES[j], bytes);
|
||||
expectedSize += BYTES_NEEDED_TEST_VALUES[j];
|
||||
}
|
||||
assertEquals(expectedSize, bytes.length);
|
||||
|
||||
for (int j = 0; j < TEST_VALUES.length; j++) {
|
||||
assertEquals(TEST_VALUES[j], VInt8.decode(bytes));
|
||||
}
|
||||
assertEquals(bytes.offset, bytes.length);
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue