diff --git a/src/main/java/org/elasticsearch/common/compress/CompressedString.java b/src/main/java/org/elasticsearch/common/compress/CompressedString.java index 2596a4d420e..784c57c527e 100644 --- a/src/main/java/org/elasticsearch/common/compress/CompressedString.java +++ b/src/main/java/org/elasticsearch/common/compress/CompressedString.java @@ -24,28 +24,20 @@ import org.elasticsearch.common.bytes.BytesArray; import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; -import org.elasticsearch.common.io.stream.Streamable; import java.io.IOException; import java.util.Arrays; /** - * + * Similar class to the {@link String} class except that it internally stores + * data using a compressed representation in order to require less permanent + * memory. Note that the compressed string might still sometimes need to be + * decompressed in order to perform equality checks or to compute hash codes. */ -public class CompressedString implements Streamable { +public final class CompressedString { - private byte[] bytes; - - CompressedString() { - } - - /** - * Constructor assuming the data provided is compressed (UTF8). It uses the provided - * array without copying it. - */ - public CompressedString(byte[] compressed) { - this.bytes = compressed; - } + private final byte[] bytes; + private int hashCode; public CompressedString(BytesReference data) throws IOException { Compressor compressor = CompressorFactory.compressor(data); @@ -55,40 +47,37 @@ public class CompressedString implements Streamable { } else { BytesArray bytesArray = data.toBytesArray(); this.bytes = CompressorFactory.defaultCompressor().compress(bytesArray.array(), bytesArray.arrayOffset(), bytesArray.length()); + assert CompressorFactory.compressor(bytes) == CompressorFactory.defaultCompressor(); } + } - /** - * Constructs a new compressed string, assuming the bytes are UTF8, by copying it over. - * - * @param data The byte array - * @param offset Offset into the byte array - * @param length The length of the data - * @throws IOException - */ public CompressedString(byte[] data, int offset, int length) throws IOException { - Compressor compressor = CompressorFactory.compressor(data, offset, length); - if (compressor != null) { - // already compressed... - this.bytes = Arrays.copyOfRange(data, offset, offset + length); - } else { - // default to LZF - this.bytes = CompressorFactory.defaultCompressor().compress(data, offset, length); - } + this(new BytesArray(data, offset, length)); + } + + public CompressedString(byte[] data) throws IOException { + this(data, 0, data.length); } public CompressedString(String str) throws IOException { - BytesRef result = new BytesRef(str); - this.bytes = CompressorFactory.defaultCompressor().compress(result.bytes, result.offset, result.length); + this(new BytesArray(new BytesRef(str))); } + /** Return the compressed bytes. */ public byte[] compressed() { return this.bytes; } - public byte[] uncompressed() throws IOException { + /** Return the uncompressed bytes. */ + public byte[] uncompressed() { Compressor compressor = CompressorFactory.compressor(bytes); - return compressor.uncompress(bytes, 0, bytes.length); + assert compressor != null; + try { + return compressor.uncompress(bytes, 0, bytes.length); + } catch (IOException e) { + throw new IllegalStateException("Cannot decompress compressed string", e); + } } public String string() throws IOException { @@ -96,18 +85,11 @@ public class CompressedString implements Streamable { } public static CompressedString readCompressedString(StreamInput in) throws IOException { - CompressedString compressedString = new CompressedString(); - compressedString.readFrom(in); - return compressedString; - } - - @Override - public void readFrom(StreamInput in) throws IOException { - bytes = new byte[in.readVInt()]; + byte[] bytes = new byte[in.readVInt()]; in.readBytes(bytes, 0, bytes.length); + return new CompressedString(bytes); } - @Override public void writeTo(StreamOutput out) throws IOException { out.writeVInt(bytes.length); out.writeBytes(bytes); @@ -120,14 +102,23 @@ public class CompressedString implements Streamable { CompressedString that = (CompressedString) o; - if (!Arrays.equals(bytes, that.bytes)) return false; + if (Arrays.equals(compressed(), that.compressed())) { + return true; + } - return true; + return Arrays.equals(uncompressed(), that.uncompressed()); } @Override public int hashCode() { - return bytes != null ? Arrays.hashCode(bytes) : 0; + if (hashCode == 0) { + int h = Arrays.hashCode(uncompressed()); + if (h == 0) { + h = 1; + } + hashCode = h; + } + return hashCode; } @Override diff --git a/src/test/java/org/elasticsearch/common/compress/CompressedStringTests.java b/src/test/java/org/elasticsearch/common/compress/CompressedStringTests.java index f1fb90f56f8..1e1718ffa1c 100644 --- a/src/test/java/org/elasticsearch/common/compress/CompressedStringTests.java +++ b/src/test/java/org/elasticsearch/common/compress/CompressedStringTests.java @@ -20,8 +20,12 @@ package org.elasticsearch.common.compress; import org.apache.lucene.util.TestUtil; +import org.elasticsearch.common.bytes.BytesReference; +import org.elasticsearch.common.io.stream.BytesStreamOutput; +import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.test.ElasticsearchTestCase; +import org.junit.Assert; import org.junit.Test; import java.io.IOException; @@ -40,6 +44,12 @@ public class CompressedStringTests extends ElasticsearchTestCase { simpleTests("lzf"); } + private void assertEquals(CompressedString s1, CompressedString s2) { + Assert.assertEquals(s1, s2); + assertArrayEquals(s1.uncompressed(), s2.uncompressed()); + assertEquals(s1.hashCode(), s2.hashCode()); + } + public void simpleTests(String compressor) throws IOException { CompressorFactory.configure(ImmutableSettings.settingsBuilder().put("compress.default.type", compressor).build()); String str = "this is a simple string"; @@ -51,9 +61,9 @@ public class CompressedStringTests extends ElasticsearchTestCase { CompressedString cstr2 = new CompressedString(str2); assertThat(cstr2.string(), not(equalTo(str))); assertThat(new CompressedString(str2), not(equalTo(cstr))); - assertThat(new CompressedString(str2), equalTo(cstr2)); + assertEquals(new CompressedString(str2), cstr2); } - + public void testRandom() throws IOException { String compressor = "lzf"; CompressorFactory.configure(ImmutableSettings.settingsBuilder().put("compress.default.type", compressor).build()); @@ -64,4 +74,40 @@ public class CompressedStringTests extends ElasticsearchTestCase { assertThat(compressedString.string(), equalTo(string)); } } + + public void testDifferentCompressedRepresentation() throws Exception { + byte[] b = "abcdefghijabcdefghij".getBytes("UTF-8"); + CompressorFactory.defaultCompressor(); + + Compressor compressor = CompressorFactory.defaultCompressor(); + BytesStreamOutput bout = new BytesStreamOutput(); + StreamOutput out = compressor.streamOutput(bout); + out.writeBytes(b); + out.flush(); + out.writeBytes(b); + out.close(); + final BytesReference b1 = bout.bytes(); + + bout = new BytesStreamOutput(); + out = compressor.streamOutput(bout); + out.writeBytes(b); + out.writeBytes(b); + out.close(); + final BytesReference b2 = bout.bytes(); + + // because of the intermediate flush, the two compressed representations + // are different. It can also happen for other reasons like if hash tables + // of different size are being used + assertFalse(b1.equals(b2)); + // we used the compressed representation directly and did not recompress + assertArrayEquals(b1.toBytes(), new CompressedString(b1).compressed()); + assertArrayEquals(b2.toBytes(), new CompressedString(b2).compressed()); + // but compressedstring instances are still equal + assertEquals(new CompressedString(b1), new CompressedString(b2)); + } + + public void testHashCode() throws IOException { + assertFalse(new CompressedString("a").hashCode() == new CompressedString("b").hashCode()); + } + }