Add variable-length long encoding
This commit adds a method of encoding longs using a variable-length representation. This encoding is an implementation of the zig-zag encoding from protocol buffers. Numbers that have a small absolute value will use a small number of bytes. This is achieved by zig-zagging through the space of longs in order of increasing absolute value (0, -1, 1, -2, 2, …, Long.MAX_VALUE, Long.MIN_VALUE) -> (0, 1, 2, 3, 4, …, -2, -1). The resulting values are then encoded as if they represent unsigned numbers.
This commit is contained in:
parent
d082868436
commit
9a11e75cf5
|
@ -24,6 +24,7 @@ import org.apache.lucene.index.IndexFormatTooNewException;
|
|||
import org.apache.lucene.index.IndexFormatTooOldException;
|
||||
import org.apache.lucene.store.AlreadyClosedException;
|
||||
import org.apache.lucene.store.LockObtainFailedException;
|
||||
import org.apache.lucene.util.BitUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.CharsRefBuilder;
|
||||
import org.elasticsearch.Version;
|
||||
|
@ -234,6 +235,20 @@ public abstract class StreamInput extends InputStream {
|
|||
return i | ((b & 0x7FL) << 56);
|
||||
}
|
||||
|
||||
public long readZLong() throws IOException {
|
||||
long accumulator = 0L;
|
||||
int i = 0;
|
||||
long currentByte;
|
||||
while (((currentByte = readByte()) & 0x80L) != 0) {
|
||||
accumulator |= (currentByte & 0x7F) << i;
|
||||
i += 7;
|
||||
if (i > 63) {
|
||||
throw new IOException("variable-length stream is too long");
|
||||
}
|
||||
}
|
||||
return BitUtil.zigZagDecode(accumulator | (currentByte << i));
|
||||
}
|
||||
|
||||
@Nullable
|
||||
public Text readOptionalText() throws IOException {
|
||||
int length = readInt();
|
||||
|
|
|
@ -24,6 +24,7 @@ import org.apache.lucene.index.IndexFormatTooNewException;
|
|||
import org.apache.lucene.index.IndexFormatTooOldException;
|
||||
import org.apache.lucene.store.AlreadyClosedException;
|
||||
import org.apache.lucene.store.LockObtainFailedException;
|
||||
import org.apache.lucene.util.BitUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefBuilder;
|
||||
import org.elasticsearch.ElasticsearchException;
|
||||
|
@ -172,9 +173,9 @@ public abstract class StreamOutput extends OutputStream {
|
|||
}
|
||||
|
||||
/**
|
||||
* Writes an long in a variable-length format. Writes between one and nine
|
||||
* bytes. Smaller values take fewer bytes. Negative numbers are not
|
||||
* supported.
|
||||
* Writes a non-negative long in a variable-length format.
|
||||
* Writes between one and nine bytes. Smaller values take fewer bytes.
|
||||
* Negative numbers are not supported.
|
||||
*/
|
||||
public void writeVLong(long i) throws IOException {
|
||||
assert i >= 0;
|
||||
|
@ -185,6 +186,23 @@ public abstract class StreamOutput extends OutputStream {
|
|||
writeByte((byte) i);
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes a long in a variable-length format. Writes between one and ten bytes.
|
||||
* Values are remapped by sliding the sign bit into the lsb and then encoded as an unsigned number
|
||||
* e.g., 0 -;> 0, -1 -;> 1, 1 -;> 2, ..., Long.MIN_VALUE -;> -1, Long.MAX_VALUE -;> -2
|
||||
* Numbers with small absolute value will have a small encoding
|
||||
* If the numbers are known to be non-negative, use {@link #writeVLong(long)}
|
||||
*/
|
||||
public void writeZLong(long i) throws IOException {
|
||||
// zig-zag encoding cf. https://developers.google.com/protocol-buffers/docs/encoding?hl=en
|
||||
long value = BitUtil.zigZagEncode(i);
|
||||
while ((value & 0xFFFFFFFFFFFFFF80L) != 0L) {
|
||||
writeByte((byte)((value & 0x7F) | 0x80));
|
||||
value >>>= 7;
|
||||
}
|
||||
writeByte((byte) (value & 0x7F));
|
||||
}
|
||||
|
||||
public void writeOptionalString(@Nullable String str) throws IOException {
|
||||
if (str == null) {
|
||||
writeBoolean(false);
|
||||
|
|
|
@ -0,0 +1,43 @@
|
|||
package org.elasticsearch.common.io.stream;
|
||||
|
||||
import org.elasticsearch.common.bytes.ByteBufferBytesReference;
|
||||
import org.elasticsearch.common.collect.Tuple;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
public class StreamTests extends ESTestCase {
|
||||
public void testRandomVLongSerialization() throws IOException {
|
||||
for (int i = 0; i < 1024; i++) {
|
||||
long write = randomLong();
|
||||
BytesStreamOutput out = new BytesStreamOutput();
|
||||
out.writeZLong(write);
|
||||
long read = out.bytes().streamInput().readZLong();
|
||||
assertEquals(write, read);
|
||||
}
|
||||
}
|
||||
|
||||
public void testSpecificVLongSerialization() throws IOException {
|
||||
List<Tuple<Long, byte[]>> values =
|
||||
Arrays.asList(
|
||||
new Tuple<>(0L, new byte[]{0}),
|
||||
new Tuple<>(-1L, new byte[]{1}),
|
||||
new Tuple<>(1L, new byte[]{2}),
|
||||
new Tuple<>(-2L, new byte[]{3}),
|
||||
new Tuple<>(2L, new byte[]{4}),
|
||||
new Tuple<>(Long.MIN_VALUE, new byte[]{-1, -1, -1, -1, -1, -1, -1, -1, -1, 1}),
|
||||
new Tuple<>(Long.MAX_VALUE, new byte[]{-2, -1, -1, -1, -1, -1, -1, -1, -1, 1})
|
||||
|
||||
);
|
||||
for (Tuple<Long, byte[]> value : values) {
|
||||
BytesStreamOutput out = new BytesStreamOutput();
|
||||
out.writeZLong(value.v1());
|
||||
assertArrayEquals(Long.toString(value.v1()), value.v2(), out.bytes().toBytes());
|
||||
ByteBufferBytesReference bytes = new ByteBufferBytesReference(ByteBuffer.wrap(value.v2()));
|
||||
assertEquals(Arrays.toString(value.v2()), (long)value.v1(), bytes.streamInput().readZLong());
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue