From 9a11e75cf55e2b89a07f8492535d10aa45491338 Mon Sep 17 00:00:00 2001 From: Jason Tedor Date: Mon, 16 Nov 2015 11:49:52 -0600 Subject: [PATCH] Add variable-length long encoding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit adds a method of encoding longs using a variable-length representation. This encoding is an implementation of the zig-zag encoding from protocol buffers. Numbers that have a small absolute value will use a small number of bytes. This is achieved by zig-zagging through the space of longs in order of increasing absolute value (0, -1, 1, -2, 2, …, Long.MAX_VALUE, Long.MIN_VALUE) -> (0, 1, 2, 3, 4, …, -2, -1). The resulting values are then encoded as if they represent unsigned numbers. --- .../common/io/stream/StreamInput.java | 15 +++++++ .../common/io/stream/StreamOutput.java | 24 +++++++++-- .../common/io/stream/StreamTests.java | 43 +++++++++++++++++++ 3 files changed, 79 insertions(+), 3 deletions(-) create mode 100644 core/src/test/java/org/elasticsearch/common/io/stream/StreamTests.java diff --git a/core/src/main/java/org/elasticsearch/common/io/stream/StreamInput.java b/core/src/main/java/org/elasticsearch/common/io/stream/StreamInput.java index 93da5ac8f0c..b151daf1bf0 100644 --- a/core/src/main/java/org/elasticsearch/common/io/stream/StreamInput.java +++ b/core/src/main/java/org/elasticsearch/common/io/stream/StreamInput.java @@ -24,6 +24,7 @@ import org.apache.lucene.index.IndexFormatTooNewException; import org.apache.lucene.index.IndexFormatTooOldException; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.LockObtainFailedException; +import org.apache.lucene.util.BitUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRefBuilder; import org.elasticsearch.Version; @@ -234,6 +235,20 @@ public abstract class StreamInput extends InputStream { return i | ((b & 0x7FL) << 56); } + public long readZLong() throws IOException { + 
long accumulator = 0L; + int i = 0; + long currentByte; + while (((currentByte = readByte()) & 0x80L) != 0) { + accumulator |= (currentByte & 0x7F) << i; + i += 7; + if (i > 63) { + throw new IOException("variable-length stream is too long"); + } + } + return BitUtil.zigZagDecode(accumulator | (currentByte << i)); + } + @Nullable public Text readOptionalText() throws IOException { int length = readInt(); diff --git a/core/src/main/java/org/elasticsearch/common/io/stream/StreamOutput.java b/core/src/main/java/org/elasticsearch/common/io/stream/StreamOutput.java index 71558ff49c4..5f1e7623d28 100644 --- a/core/src/main/java/org/elasticsearch/common/io/stream/StreamOutput.java +++ b/core/src/main/java/org/elasticsearch/common/io/stream/StreamOutput.java @@ -24,6 +24,7 @@ import org.apache.lucene.index.IndexFormatTooNewException; import org.apache.lucene.index.IndexFormatTooOldException; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.LockObtainFailedException; +import org.apache.lucene.util.BitUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import org.elasticsearch.ElasticsearchException; @@ -172,9 +173,9 @@ public abstract class StreamOutput extends OutputStream { } /** - * Writes an long in a variable-length format. Writes between one and nine - * bytes. Smaller values take fewer bytes. Negative numbers are not - * supported. + * Writes a non-negative long in a variable-length format. + * Writes between one and nine bytes. Smaller values take fewer bytes. + * Negative numbers are not supported. */ public void writeVLong(long i) throws IOException { assert i >= 0; @@ -185,6 +186,23 @@ public abstract class StreamOutput extends OutputStream { writeByte((byte) i); } + /** + * Writes a long in a variable-length format. Writes between one and ten bytes. 
+ * Values are remapped by sliding the sign bit into the lsb and then encoded as an unsigned number + * e.g., 0 -&gt; 0, -1 -&gt; 1, 1 -&gt; 2, ..., Long.MIN_VALUE -&gt; -1, Long.MAX_VALUE -&gt; -2 + * Numbers with small absolute value will have a small encoding + * If the numbers are known to be non-negative, use {@link #writeVLong(long)} + */ + public void writeZLong(long i) throws IOException { + // zig-zag encoding cf. https://developers.google.com/protocol-buffers/docs/encoding?hl=en + long value = BitUtil.zigZagEncode(i); + while ((value & 0xFFFFFFFFFFFFFF80L) != 0L) { + writeByte((byte)((value & 0x7F) | 0x80)); + value >>>= 7; + } + writeByte((byte) (value & 0x7F)); + } + public void writeOptionalString(@Nullable String str) throws IOException { if (str == null) { writeBoolean(false); diff --git a/core/src/test/java/org/elasticsearch/common/io/stream/StreamTests.java b/core/src/test/java/org/elasticsearch/common/io/stream/StreamTests.java new file mode 100644 index 00000000000..b4d60136e69 --- /dev/null +++ b/core/src/test/java/org/elasticsearch/common/io/stream/StreamTests.java @@ -0,0 +1,43 @@ +package org.elasticsearch.common.io.stream; + +import org.elasticsearch.common.bytes.ByteBufferBytesReference; +import org.elasticsearch.common.collect.Tuple; +import org.elasticsearch.test.ESTestCase; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.List; + +public class StreamTests extends ESTestCase { + public void testRandomVLongSerialization() throws IOException { + for (int i = 0; i < 1024; i++) { + long write = randomLong(); + BytesStreamOutput out = new BytesStreamOutput(); + out.writeZLong(write); + long read = out.bytes().streamInput().readZLong(); + assertEquals(write, read); + } + } + + public void testSpecificVLongSerialization() throws IOException { + List<Tuple<Long, byte[]>> values = + Arrays.asList( + new Tuple<>(0L, new byte[]{0}), + new Tuple<>(-1L, new byte[]{1}), + new Tuple<>(1L, new byte[]{2}), + new Tuple<>(-2L, new 
byte[]{3}), + new Tuple<>(2L, new byte[]{4}), + new Tuple<>(Long.MIN_VALUE, new byte[]{-1, -1, -1, -1, -1, -1, -1, -1, -1, 1}), + new Tuple<>(Long.MAX_VALUE, new byte[]{-2, -1, -1, -1, -1, -1, -1, -1, -1, 1}) + + ); + for (Tuple<Long, byte[]> value : values) { + BytesStreamOutput out = new BytesStreamOutput(); + out.writeZLong(value.v1()); + assertArrayEquals(Long.toString(value.v1()), value.v2(), out.bytes().toBytes()); + ByteBufferBytesReference bytes = new ByteBufferBytesReference(ByteBuffer.wrap(value.v2())); + assertEquals(Arrays.toString(value.v2()), (long)value.v1(), bytes.streamInput().readZLong()); + } + } +}