From fd1954d74f79311e203315d62cdc5a25c9b9faab Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Tue, 19 May 2015 18:38:23 +0200 Subject: [PATCH] Internal: Fix CompressedString.equals. CompressedString relied on the assumption that two CompressedString instanes are equal if there compressed representation are equal. Unfortunately this is not always true because the compressed representation also depends on when flush() was called on the output stream or on the size of the hash table that has been used at compression time. --- .../common/compress/CompressedString.java | 85 +++++++++---------- .../compress/CompressedStringTests.java | 50 ++++++++++- 2 files changed, 86 insertions(+), 49 deletions(-) diff --git a/src/main/java/org/elasticsearch/common/compress/CompressedString.java b/src/main/java/org/elasticsearch/common/compress/CompressedString.java index 2596a4d420e..784c57c527e 100644 --- a/src/main/java/org/elasticsearch/common/compress/CompressedString.java +++ b/src/main/java/org/elasticsearch/common/compress/CompressedString.java @@ -24,28 +24,20 @@ import org.elasticsearch.common.bytes.BytesArray; import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; -import org.elasticsearch.common.io.stream.Streamable; import java.io.IOException; import java.util.Arrays; /** - * + * Similar class to the {@link String} class except that it internally stores + * data using a compressed representation in order to require less permanent + * memory. Note that the compressed string might still sometimes need to be + * decompressed in order to perform equality checks or to compute hash codes. */ -public class CompressedString implements Streamable { +public final class CompressedString { - private byte[] bytes; - - CompressedString() { - } - - /** - * Constructor assuming the data provided is compressed (UTF8). It uses the provided - * array without copying it. - */ - public CompressedString(byte[] compressed) { - this.bytes = compressed; - } + private final byte[] bytes; + private int hashCode; public CompressedString(BytesReference data) throws IOException { Compressor compressor = CompressorFactory.compressor(data); @@ -55,40 +47,37 @@ public class CompressedString implements Streamable { } else { BytesArray bytesArray = data.toBytesArray(); this.bytes = CompressorFactory.defaultCompressor().compress(bytesArray.array(), bytesArray.arrayOffset(), bytesArray.length()); + assert CompressorFactory.compressor(bytes) == CompressorFactory.defaultCompressor(); } + } - /** - * Constructs a new compressed string, assuming the bytes are UTF8, by copying it over. - * - * @param data The byte array - * @param offset Offset into the byte array - * @param length The length of the data - * @throws IOException - */ public CompressedString(byte[] data, int offset, int length) throws IOException { - Compressor compressor = CompressorFactory.compressor(data, offset, length); - if (compressor != null) { - // already compressed... - this.bytes = Arrays.copyOfRange(data, offset, offset + length); - } else { - // default to LZF - this.bytes = CompressorFactory.defaultCompressor().compress(data, offset, length); - } + this(new BytesArray(data, offset, length)); + } + + public CompressedString(byte[] data) throws IOException { + this(data, 0, data.length); } public CompressedString(String str) throws IOException { - BytesRef result = new BytesRef(str); - this.bytes = CompressorFactory.defaultCompressor().compress(result.bytes, result.offset, result.length); + this(new BytesArray(new BytesRef(str))); } + /** Return the compressed bytes. */ public byte[] compressed() { return this.bytes; } - public byte[] uncompressed() throws IOException { + /** Return the uncompressed bytes. */ + public byte[] uncompressed() { Compressor compressor = CompressorFactory.compressor(bytes); - return compressor.uncompress(bytes, 0, bytes.length); + assert compressor != null; + try { + return compressor.uncompress(bytes, 0, bytes.length); + } catch (IOException e) { + throw new IllegalStateException("Cannot decompress compressed string", e); + } } public String string() throws IOException { @@ -96,18 +85,11 @@ public class CompressedString implements Streamable { } public static CompressedString readCompressedString(StreamInput in) throws IOException { - CompressedString compressedString = new CompressedString(); - compressedString.readFrom(in); - return compressedString; - } - - @Override - public void readFrom(StreamInput in) throws IOException { - bytes = new byte[in.readVInt()]; + byte[] bytes = new byte[in.readVInt()]; in.readBytes(bytes, 0, bytes.length); + return new CompressedString(bytes); } - @Override public void writeTo(StreamOutput out) throws IOException { out.writeVInt(bytes.length); out.writeBytes(bytes); @@ -120,14 +102,23 @@ public class CompressedString implements Streamable { CompressedString that = (CompressedString) o; - if (!Arrays.equals(bytes, that.bytes)) return false; + if (Arrays.equals(compressed(), that.compressed())) { + return true; + } - return true; + return Arrays.equals(uncompressed(), that.uncompressed()); } @Override public int hashCode() { - return bytes != null ? Arrays.hashCode(bytes) : 0; + if (hashCode == 0) { + int h = Arrays.hashCode(uncompressed()); + if (h == 0) { + h = 1; + } + hashCode = h; + } + return hashCode; } @Override diff --git a/src/test/java/org/elasticsearch/common/compress/CompressedStringTests.java b/src/test/java/org/elasticsearch/common/compress/CompressedStringTests.java index f1fb90f56f8..1e1718ffa1c 100644 --- a/src/test/java/org/elasticsearch/common/compress/CompressedStringTests.java +++ b/src/test/java/org/elasticsearch/common/compress/CompressedStringTests.java @@ -20,8 +20,12 @@ package org.elasticsearch.common.compress; import org.apache.lucene.util.TestUtil; +import org.elasticsearch.common.bytes.BytesReference; +import org.elasticsearch.common.io.stream.BytesStreamOutput; +import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.test.ElasticsearchTestCase; +import org.junit.Assert; import org.junit.Test; import java.io.IOException; @@ -40,6 +44,12 @@ public class CompressedStringTests extends ElasticsearchTestCase { simpleTests("lzf"); } + private void assertEquals(CompressedString s1, CompressedString s2) { + Assert.assertEquals(s1, s2); + assertArrayEquals(s1.uncompressed(), s2.uncompressed()); + assertEquals(s1.hashCode(), s2.hashCode()); + } + public void simpleTests(String compressor) throws IOException { CompressorFactory.configure(ImmutableSettings.settingsBuilder().put("compress.default.type", compressor).build()); String str = "this is a simple string"; @@ -51,9 +61,9 @@ public class CompressedStringTests extends ElasticsearchTestCase { CompressedString cstr2 = new CompressedString(str2); assertThat(cstr2.string(), not(equalTo(str))); assertThat(new CompressedString(str2), not(equalTo(cstr))); - assertThat(new CompressedString(str2), equalTo(cstr2)); + assertEquals(new CompressedString(str2), cstr2); } - + public void testRandom() throws IOException { String compressor = "lzf"; CompressorFactory.configure(ImmutableSettings.settingsBuilder().put("compress.default.type", compressor).build()); @@ -64,4 +74,40 @@ public class CompressedStringTests extends ElasticsearchTestCase { assertThat(compressedString.string(), equalTo(string)); } } + + public void testDifferentCompressedRepresentation() throws Exception { + byte[] b = "abcdefghijabcdefghij".getBytes("UTF-8"); + CompressorFactory.defaultCompressor(); + + Compressor compressor = CompressorFactory.defaultCompressor(); + BytesStreamOutput bout = new BytesStreamOutput(); + StreamOutput out = compressor.streamOutput(bout); + out.writeBytes(b); + out.flush(); + out.writeBytes(b); + out.close(); + final BytesReference b1 = bout.bytes(); + + bout = new BytesStreamOutput(); + out = compressor.streamOutput(bout); + out.writeBytes(b); + out.writeBytes(b); + out.close(); + final BytesReference b2 = bout.bytes(); + + // because of the intermediate flush, the two compressed representations + // are different. It can also happen for other reasons like if hash tables + // of different size are being used + assertFalse(b1.equals(b2)); + // we used the compressed representation directly and did not recompress + assertArrayEquals(b1.toBytes(), new CompressedString(b1).compressed()); + assertArrayEquals(b2.toBytes(), new CompressedString(b2).compressed()); + // but compressedstring instances are still equal + assertEquals(new CompressedString(b1), new CompressedString(b2)); + } + + public void testHashCode() throws IOException { + assertFalse(new CompressedString("a").hashCode() == new CompressedString("b").hashCode()); + } + }