Add variable-length long encoding

This commit adds a method of encoding longs using a variable-length
representation. This encoding is an implementation of the zig-zag
encoding from protocol buffers. Numbers that have a small absolute value
will use a small number of bytes. This is achieved by zig-zagging
through the space of longs in order of increasing absolute value (0, -1,
1, -2, 2, …, Long.MAX_VALUE, Long.MIN_VALUE) -> (0, 1, 2, 3, 4, …, -2,
-1). The resulting values are then encoded as if they represent unsigned
numbers.
This commit is contained in:
Jason Tedor 2015-11-16 11:49:52 -06:00
parent d082868436
commit 9a11e75cf5
3 changed files with 79 additions and 3 deletions

View File

@ -24,6 +24,7 @@ import org.apache.lucene.index.IndexFormatTooNewException;
import org.apache.lucene.index.IndexFormatTooOldException; import org.apache.lucene.index.IndexFormatTooOldException;
import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder; import org.apache.lucene.util.CharsRefBuilder;
import org.elasticsearch.Version; import org.elasticsearch.Version;
@ -234,6 +235,20 @@ public abstract class StreamInput extends InputStream {
return i | ((b & 0x7FL) << 56); return i | ((b & 0x7FL) << 56);
} }
/**
 * Reads a long written in the variable-length zig-zag format. Bytes are consumed one at a
 * time: the low seven bits of each byte are payload (least-significant group first) and a
 * set high bit means another byte follows. The reassembled value is then passed through
 * {@code BitUtil.zigZagDecode} to recover the signed long.
 *
 * @return the decoded value
 * @throws IOException if more than nine bytes in a row carry the continuation bit; any
 *                     exception raised by {@code readByte()} is propagated as well
 */
public long readZLong() throws IOException {
    long encoded = 0L;
    int shift = 0;
    long current = readByte();
    while ((current & 0x80L) != 0) {
        // continuation bit set: keep only the seven payload bits of this byte
        encoded |= (current & 0x7F) << shift;
        shift += 7;
        if (shift > 63) {
            throw new IOException("variable-length stream is too long");
        }
        current = readByte();
    }
    // the terminating byte has its high bit clear, so it can be shifted in without masking
    final long lastGroup = current << shift;
    return BitUtil.zigZagDecode(encoded | lastGroup);
}
@Nullable @Nullable
public Text readOptionalText() throws IOException { public Text readOptionalText() throws IOException {
int length = readInt(); int length = readInt();

View File

@ -24,6 +24,7 @@ import org.apache.lucene.index.IndexFormatTooNewException;
import org.apache.lucene.index.IndexFormatTooOldException; import org.apache.lucene.index.IndexFormatTooOldException;
import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.BytesRefBuilder;
import org.elasticsearch.ElasticsearchException; import org.elasticsearch.ElasticsearchException;
@ -172,9 +173,9 @@ public abstract class StreamOutput extends OutputStream {
} }
/** /**
* Writes an long in a variable-length format. Writes between one and nine * Writes a non-negative long in a variable-length format.
* bytes. Smaller values take fewer bytes. Negative numbers are not * Writes between one and nine bytes. Smaller values take fewer bytes.
* supported. * Negative numbers are not supported.
*/ */
public void writeVLong(long i) throws IOException { public void writeVLong(long i) throws IOException {
assert i >= 0; assert i >= 0;
@ -185,6 +186,23 @@ public abstract class StreamOutput extends OutputStream {
writeByte((byte) i); writeByte((byte) i);
} }
/**
 * Writes a long in a variable-length format, using between one and ten bytes.
 * The value is first zig-zag encoded (the sign bit is moved into the least-significant bit)
 * and the result is then written as if it were an unsigned number, seven bits per byte,
 * e.g., 0 -&gt; 0, -1 -&gt; 1, 1 -&gt; 2, ..., Long.MIN_VALUE -&gt; -1, Long.MAX_VALUE -&gt; -2.
 * Numbers with a small absolute value therefore get a short encoding.
 * If the numbers are known to be non-negative, use {@link #writeVLong(long)}.
 *
 * @param i the value to write; any long is accepted
 */
public void writeZLong(long i) throws IOException {
    // zig-zag encoding cf. https://developers.google.com/protocol-buffers/docs/encoding?hl=en
    long remaining = BitUtil.zigZagEncode(i);
    for (;;) {
        final int lowSeven = (int) (remaining & 0x7F);
        // unsigned shift: the zig-zag encoded value is treated as an unsigned quantity
        remaining >>>= 7;
        if (remaining == 0L) {
            // last group: high bit left clear to mark the end of the value
            writeByte((byte) lowSeven);
            return;
        }
        // more groups follow: set the continuation bit
        writeByte((byte) (lowSeven | 0x80));
    }
}
public void writeOptionalString(@Nullable String str) throws IOException { public void writeOptionalString(@Nullable String str) throws IOException {
if (str == null) { if (str == null) {
writeBoolean(false); writeBoolean(false);

View File

@ -0,0 +1,43 @@
package org.elasticsearch.common.io.stream;
import org.elasticsearch.common.bytes.ByteBufferBytesReference;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.test.ESTestCase;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.List;
/**
 * Tests for the zig-zag variable-length long encoding: values written with
 * {@code writeZLong} must read back unchanged through {@code readZLong}, and a handful of
 * known values must produce exactly the expected bytes.
 */
public class StreamTests extends ESTestCase {

    /**
     * Writes 1024 random longs with {@code writeZLong} and checks that {@code readZLong}
     * returns each one unchanged. (The method name says "VLong", but the ZLong methods are
     * what is exercised here.)
     */
    public void testRandomVLongSerialization() throws IOException {
        for (int round = 0; round < 1024; round++) {
            final long expected = randomLong();
            final BytesStreamOutput output = new BytesStreamOutput();
            output.writeZLong(expected);
            final long actual = output.bytes().streamInput().readZLong();
            assertEquals(expected, actual);
        }
    }

    /**
     * Checks fixed value/encoding pairs in both directions: writing the value must yield
     * the listed bytes, and reading the listed bytes must yield the value. (As above, the
     * "VLong" in the name notwithstanding, this covers the ZLong methods.)
     */
    public void testSpecificVLongSerialization() throws IOException {
        final List<Tuple<Long, byte[]>> cases = Arrays.asList(
                new Tuple<>(0L, new byte[]{0}),
                new Tuple<>(-1L, new byte[]{1}),
                new Tuple<>(1L, new byte[]{2}),
                new Tuple<>(-2L, new byte[]{3}),
                new Tuple<>(2L, new byte[]{4}),
                // the two extremes need all ten bytes
                new Tuple<>(Long.MIN_VALUE, new byte[]{-1, -1, -1, -1, -1, -1, -1, -1, -1, 1}),
                new Tuple<>(Long.MAX_VALUE, new byte[]{-2, -1, -1, -1, -1, -1, -1, -1, -1, 1})
        );
        for (Tuple<Long, byte[]> testCase : cases) {
            final long number = testCase.v1();
            final byte[] expectedBytes = testCase.v2();

            // value -> bytes
            final BytesStreamOutput output = new BytesStreamOutput();
            output.writeZLong(number);
            assertArrayEquals(Long.toString(number), expectedBytes, output.bytes().toBytes());

            // bytes -> value
            final ByteBufferBytesReference wrapped = new ByteBufferBytesReference(ByteBuffer.wrap(expectedBytes));
            assertEquals(Arrays.toString(expectedBytes), number, wrapped.streamInput().readZLong());
        }
    }
}