Internal: Fix CompressedString.equals.

CompressedString relied on the assumption that two CompressedString instanes
are equal if there compressed representation are equal. Unfortunately this is
not always true because the compressed representation also depends on when
flush() was called on the output stream or on the size of the hash table that
has been used at compression time.
This commit is contained in:
Adrien Grand 2015-05-19 18:38:23 +02:00
parent 5fbb6a714d
commit fd1954d74f
2 changed files with 86 additions and 49 deletions

View File

@ -24,28 +24,20 @@ import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Streamable;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays; import java.util.Arrays;
/** /**
* * Similar class to the {@link String} class except that it internally stores
* data using a compressed representation in order to require less permanent
* memory. Note that the compressed string might still sometimes need to be
* decompressed in order to perform equality checks or to compute hash codes.
*/ */
public class CompressedString implements Streamable { public final class CompressedString {
private byte[] bytes; private final byte[] bytes;
private int hashCode;
CompressedString() {
}
/**
* Constructor assuming the data provided is compressed (UTF8). It uses the provided
* array without copying it.
*/
public CompressedString(byte[] compressed) {
this.bytes = compressed;
}
public CompressedString(BytesReference data) throws IOException { public CompressedString(BytesReference data) throws IOException {
Compressor compressor = CompressorFactory.compressor(data); Compressor compressor = CompressorFactory.compressor(data);
@ -55,40 +47,37 @@ public class CompressedString implements Streamable {
} else { } else {
BytesArray bytesArray = data.toBytesArray(); BytesArray bytesArray = data.toBytesArray();
this.bytes = CompressorFactory.defaultCompressor().compress(bytesArray.array(), bytesArray.arrayOffset(), bytesArray.length()); this.bytes = CompressorFactory.defaultCompressor().compress(bytesArray.array(), bytesArray.arrayOffset(), bytesArray.length());
} assert CompressorFactory.compressor(bytes) == CompressorFactory.defaultCompressor();
} }
/**
* Constructs a new compressed string, assuming the bytes are UTF8, by copying it over.
*
* @param data The byte array
* @param offset Offset into the byte array
* @param length The length of the data
* @throws IOException
*/
public CompressedString(byte[] data, int offset, int length) throws IOException {
Compressor compressor = CompressorFactory.compressor(data, offset, length);
if (compressor != null) {
// already compressed...
this.bytes = Arrays.copyOfRange(data, offset, offset + length);
} else {
// default to LZF
this.bytes = CompressorFactory.defaultCompressor().compress(data, offset, length);
} }
public CompressedString(byte[] data, int offset, int length) throws IOException {
this(new BytesArray(data, offset, length));
}
public CompressedString(byte[] data) throws IOException {
this(data, 0, data.length);
} }
public CompressedString(String str) throws IOException { public CompressedString(String str) throws IOException {
BytesRef result = new BytesRef(str); this(new BytesArray(new BytesRef(str)));
this.bytes = CompressorFactory.defaultCompressor().compress(result.bytes, result.offset, result.length);
} }
/** Return the compressed bytes. */
public byte[] compressed() { public byte[] compressed() {
return this.bytes; return this.bytes;
} }
public byte[] uncompressed() throws IOException { /** Return the uncompressed bytes. */
public byte[] uncompressed() {
Compressor compressor = CompressorFactory.compressor(bytes); Compressor compressor = CompressorFactory.compressor(bytes);
assert compressor != null;
try {
return compressor.uncompress(bytes, 0, bytes.length); return compressor.uncompress(bytes, 0, bytes.length);
} catch (IOException e) {
throw new IllegalStateException("Cannot decompress compressed string", e);
}
} }
public String string() throws IOException { public String string() throws IOException {
@ -96,18 +85,11 @@ public class CompressedString implements Streamable {
} }
public static CompressedString readCompressedString(StreamInput in) throws IOException { public static CompressedString readCompressedString(StreamInput in) throws IOException {
CompressedString compressedString = new CompressedString(); byte[] bytes = new byte[in.readVInt()];
compressedString.readFrom(in);
return compressedString;
}
@Override
public void readFrom(StreamInput in) throws IOException {
bytes = new byte[in.readVInt()];
in.readBytes(bytes, 0, bytes.length); in.readBytes(bytes, 0, bytes.length);
return new CompressedString(bytes);
} }
@Override
public void writeTo(StreamOutput out) throws IOException { public void writeTo(StreamOutput out) throws IOException {
out.writeVInt(bytes.length); out.writeVInt(bytes.length);
out.writeBytes(bytes); out.writeBytes(bytes);
@ -120,14 +102,23 @@ public class CompressedString implements Streamable {
CompressedString that = (CompressedString) o; CompressedString that = (CompressedString) o;
if (!Arrays.equals(bytes, that.bytes)) return false; if (Arrays.equals(compressed(), that.compressed())) {
return true; return true;
} }
return Arrays.equals(uncompressed(), that.uncompressed());
}
@Override @Override
public int hashCode() { public int hashCode() {
return bytes != null ? Arrays.hashCode(bytes) : 0; if (hashCode == 0) {
int h = Arrays.hashCode(uncompressed());
if (h == 0) {
h = 1;
}
hashCode = h;
}
return hashCode;
} }
@Override @Override

View File

@ -20,8 +20,12 @@
package org.elasticsearch.common.compress; package org.elasticsearch.common.compress;
import org.apache.lucene.util.TestUtil; import org.apache.lucene.util.TestUtil;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.test.ElasticsearchTestCase; import org.elasticsearch.test.ElasticsearchTestCase;
import org.junit.Assert;
import org.junit.Test; import org.junit.Test;
import java.io.IOException; import java.io.IOException;
@ -40,6 +44,12 @@ public class CompressedStringTests extends ElasticsearchTestCase {
simpleTests("lzf"); simpleTests("lzf");
} }
private void assertEquals(CompressedString s1, CompressedString s2) {
Assert.assertEquals(s1, s2);
assertArrayEquals(s1.uncompressed(), s2.uncompressed());
assertEquals(s1.hashCode(), s2.hashCode());
}
public void simpleTests(String compressor) throws IOException { public void simpleTests(String compressor) throws IOException {
CompressorFactory.configure(ImmutableSettings.settingsBuilder().put("compress.default.type", compressor).build()); CompressorFactory.configure(ImmutableSettings.settingsBuilder().put("compress.default.type", compressor).build());
String str = "this is a simple string"; String str = "this is a simple string";
@ -51,7 +61,7 @@ public class CompressedStringTests extends ElasticsearchTestCase {
CompressedString cstr2 = new CompressedString(str2); CompressedString cstr2 = new CompressedString(str2);
assertThat(cstr2.string(), not(equalTo(str))); assertThat(cstr2.string(), not(equalTo(str)));
assertThat(new CompressedString(str2), not(equalTo(cstr))); assertThat(new CompressedString(str2), not(equalTo(cstr)));
assertThat(new CompressedString(str2), equalTo(cstr2)); assertEquals(new CompressedString(str2), cstr2);
} }
public void testRandom() throws IOException { public void testRandom() throws IOException {
@ -64,4 +74,40 @@ public class CompressedStringTests extends ElasticsearchTestCase {
assertThat(compressedString.string(), equalTo(string)); assertThat(compressedString.string(), equalTo(string));
} }
} }
public void testDifferentCompressedRepresentation() throws Exception {
byte[] b = "abcdefghijabcdefghij".getBytes("UTF-8");
CompressorFactory.defaultCompressor();
Compressor compressor = CompressorFactory.defaultCompressor();
BytesStreamOutput bout = new BytesStreamOutput();
StreamOutput out = compressor.streamOutput(bout);
out.writeBytes(b);
out.flush();
out.writeBytes(b);
out.close();
final BytesReference b1 = bout.bytes();
bout = new BytesStreamOutput();
out = compressor.streamOutput(bout);
out.writeBytes(b);
out.writeBytes(b);
out.close();
final BytesReference b2 = bout.bytes();
// because of the intermediate flush, the two compressed representations
// are different. It can also happen for other reasons like if hash tables
// of different size are being used
assertFalse(b1.equals(b2));
// we used the compressed representation directly and did not recompress
assertArrayEquals(b1.toBytes(), new CompressedString(b1).compressed());
assertArrayEquals(b2.toBytes(), new CompressedString(b2).compressed());
// but compressedstring instances are still equal
assertEquals(new CompressedString(b1), new CompressedString(b2));
}
public void testHashCode() throws IOException {
assertFalse(new CompressedString("a").hashCode() == new CompressedString("b").hashCode());
}
} }