From b7a20caf84cd753569edffcb84cdba00824dacf4 Mon Sep 17 00:00:00 2001 From: Adrien Grand <jpountz@apache.org> Date: Thu, 11 Dec 2014 15:01:00 +0000 Subject: [PATCH] LUCENE-6086: Space optimizations for numeric stored fields. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1644661 13f79535-47bb-0310-9956-ffa450edef68 --- .../CompressingStoredFieldsReader.java | 103 +++++++++- .../CompressingStoredFieldsWriter.java | 164 +++++++++++++++- .../TestCompressingStoredFieldsFormat.java | 183 +++++++++++++++++- 3 files changed, 432 insertions(+), 18 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java index 32d9b3ff186..f749d0a668c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java @@ -20,17 +20,23 @@ package org.apache.lucene.codecs.compressing; import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.BYTE_ARR; import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.CODEC_SFX_DAT; import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.CODEC_SFX_IDX; +import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.DAY; +import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.DAY_ENCODING; +import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.FIELDS_EXTENSION; +import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.FIELDS_INDEX_EXTENSION; +import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.HOUR; +import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.HOUR_ENCODING; import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_DOUBLE; import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_FLOAT; import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_INT; import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_LONG; +import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.SECOND; +import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.SECOND_ENCODING; import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.STRING; import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.TYPE_BITS; import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.TYPE_MASK; import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_CURRENT; import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_START; -import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.FIELDS_EXTENSION; -import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.FIELDS_INDEX_EXTENSION; import java.io.EOFException; import java.io.IOException; @@ -57,6 +63,7 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.Accountable; import org.apache.lucene.util.Accountables; import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.packed.PackedInts; @@ -201,16 +208,16 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader { visitor.stringField(info, new String(data, StandardCharsets.UTF_8)); break; case NUMERIC_INT: - visitor.intField(info, in.readInt()); + visitor.intField(info, in.readZInt()); break; case NUMERIC_FLOAT: - visitor.floatField(info, Float.intBitsToFloat(in.readInt())); + visitor.floatField(info, readZFloat(in)); break; case NUMERIC_LONG: - visitor.longField(info, in.readLong()); + visitor.longField(info, readTLong(in)); break; case NUMERIC_DOUBLE: - visitor.doubleField(info, Double.longBitsToDouble(in.readLong())); + visitor.doubleField(info, readZDouble(in)); break; default: throw new AssertionError("Unknown type flag: " + Integer.toHexString(bits)); @@ -225,18 +232,98 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader { in.skipBytes(length); break; case NUMERIC_INT: + in.readZInt(); + break; case NUMERIC_FLOAT: - in.readInt(); + readZFloat(in); break; case NUMERIC_LONG: + readTLong(in); + break; case NUMERIC_DOUBLE: - in.readLong(); + readZDouble(in); break; default: throw new AssertionError("Unknown type flag: " + Integer.toHexString(bits)); } } + /** + * Reads a float in a variable-length format. Reads between one and + * five bytes. Small integral values typically take fewer bytes. + */ + static float readZFloat(DataInput in) throws IOException { + int b = in.readByte() & 0xFF; + if (b == 0xFF) { + // negative value + return Float.intBitsToFloat(in.readInt()); + } else if ((b & 0x80) != 0) { + // small integer [-1..125] + return (b & 0x7f) - 1; + } else { + // positive float + int bits = b << 24 | ((in.readShort() & 0xFFFF) << 8) | (in.readByte() & 0xFF); + return Float.intBitsToFloat(bits); + } + } + + /** + * Reads a double in a variable-length format. Reads between one and + * nine bytes. Small integral values typically take fewer bytes. + */ + static double readZDouble(DataInput in) throws IOException { + int b = in.readByte() & 0xFF; + if (b == 0xFF) { + // negative value + return Double.longBitsToDouble(in.readLong()); + } else if (b == 0xFE) { + // float + return Float.intBitsToFloat(in.readInt()); + } else if ((b & 0x80) != 0) { + // small integer [-1..124] + return (b & 0x7f) - 1; + } else { + // positive double + long bits = ((long) b) << 56 | ((in.readInt() & 0xFFFFFFFFL) << 24) | ((in.readShort() & 0xFFFFL) << 8) | (in.readByte() & 0xFFL); + return Double.longBitsToDouble(bits); + } + } + + /** + * Reads a long in a variable-length format. Reads between one and + * nine bytes. Small values typically take fewer bytes. + */ + static long readTLong(DataInput in) throws IOException { + int header = in.readByte() & 0xFF; + + long bits = header & 0x1F; + if ((header & 0x20) != 0) { + // continuation bit + bits |= in.readVLong() << 5; + } + + long l = BitUtil.zigZagDecode(bits); + + switch (header & DAY_ENCODING) { + case SECOND_ENCODING: + l *= SECOND; + break; + case HOUR_ENCODING: + l *= HOUR; + break; + case DAY_ENCODING: + l *= DAY; + break; + case 0: + // uncompressed + break; + default: + throw new AssertionError(); + } + + return l; + } + @Override public void visitDocument(int docID, StoredFieldVisitor visitor) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java index 2aa507ef171..c197498a54a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java @@ -37,6 +37,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitUtil; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.GrowableByteArrayDataOutput; @@ -70,9 +71,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { static final int VERSION_START = 0; static final int VERSION_CURRENT = VERSION_START; - private final Directory directory; private final String segment; - private final String segmentSuffix; private CompressingStoredFieldsIndexWriter indexWriter; private IndexOutput fieldsStream; @@ -91,9 +90,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { public CompressingStoredFieldsWriter(Directory directory, SegmentInfo si, String segmentSuffix, IOContext context, String formatName, CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) throws IOException { assert directory != null; - this.directory = directory; this.segment = si.name; - this.segmentSuffix = segmentSuffix; this.compressionMode = compressionMode; this.compressor = compressionMode.newCompressor(); this.chunkSize = chunkSize; @@ -288,19 +285,170 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { bufferedDocs.writeString(field.stringValue()); } else { if (number instanceof Byte || number instanceof Short || number instanceof Integer) { - bufferedDocs.writeInt(number.intValue()); + bufferedDocs.writeZInt(number.intValue()); } else if (number instanceof Long) { - bufferedDocs.writeLong(number.longValue()); + writeTLong(bufferedDocs, number.longValue()); } else if (number instanceof Float) { - bufferedDocs.writeInt(Float.floatToIntBits(number.floatValue())); + writeZFloat(bufferedDocs, number.floatValue()); } else if (number instanceof Double) { - bufferedDocs.writeLong(Double.doubleToLongBits(number.doubleValue())); + writeZDouble(bufferedDocs, number.doubleValue()); } else { throw new AssertionError("Cannot get here"); } } } + // -0 isn't compressed. + static final int NEGATIVE_ZERO_FLOAT = Float.floatToIntBits(-0f); + static final long NEGATIVE_ZERO_DOUBLE = Double.doubleToLongBits(-0d); + + // for compression of timestamps + static final long SECOND = 1000L; + static final long HOUR = 60 * 60 * SECOND; + static final long DAY = 24 * HOUR; + static final int SECOND_ENCODING = 0x40; + static final int HOUR_ENCODING = 0x80; + static final int DAY_ENCODING = 0xC0; + + /** + * Writes a float in a variable-length format. Writes between one and + * five bytes. Small integral values typically take fewer bytes. + * <p> + * ZFloat --> Header, Bytes*? + * <ul> + * <li>Header --> {@link DataOutput#writeByte Uint8}. When it is + * equal to 0xFF then the value is negative and stored in the next + * 4 bytes. Otherwise if the first bit is set then the other bits + * in the header encode the value plus one and no other + * bytes are read. Otherwise, the value is a positive float value + * whose first byte is the header, and 3 bytes need to be read to + * complete it. + * <li>Bytes --> Potential additional bytes to read depending on the + * header. + * </ul> + * <p> + */ + static void writeZFloat(DataOutput out, float f) throws IOException { + int intVal = (int) f; + final int floatBits = Float.floatToIntBits(f); + + if (f == intVal + && intVal >= -1 + && intVal <= 0x7D + && floatBits != NEGATIVE_ZERO_FLOAT) { + // small integer value [-1..125]: single byte + out.writeByte((byte) (0x80 | (1 + intVal))); + } else if ((floatBits >>> 31) == 0) { + // other positive floats: 4 bytes + out.writeInt(floatBits); + } else { + // other negative float: 5 bytes + out.writeByte((byte) 0xFF); + out.writeInt(floatBits); + } + } + + /** + * Writes a float in a variable-length format. Writes between one and + * five bytes. Small integral values typically take fewer bytes. + * <p> + * ZFloat --> Header, Bytes*? + * <ul> + * <li>Header --> {@link DataOutput#writeByte Uint8}. When it is + * equal to 0xFF then the value is negative and stored in the next + * 8 bytes. When it is equal to 0xFE then the value is stored as a + * float in the next 4 bytes. Otherwise if the first bit is set + * then the other bits in the header encode the value plus one and + * no other bytes are read. Otherwise, the value is a positive float + * value whose first byte is the header, and 7 bytes need to be read + * to complete it. + * <li>Bytes --> Potential additional bytes to read depending on the + * header. + * </ul> + * <p> + */ + static void writeZDouble(DataOutput out, double d) throws IOException { + int intVal = (int) d; + final long doubleBits = Double.doubleToLongBits(d); + + if (d == intVal && + intVal >= -1 && + intVal <= 0x7C && + doubleBits != NEGATIVE_ZERO_DOUBLE) { + // small integer value [-1..124]: single byte + out.writeByte((byte) (0x80 | (intVal + 1))); + return; + } else if (d == (float) d) { + // d has an accurate float representation: 5 bytes + out.writeByte((byte) 0xFE); + out.writeInt(Float.floatToIntBits((float) d)); + } else if ((doubleBits >>> 63) == 0) { + // other positive doubles: 8 bytes + out.writeLong(doubleBits); + } else { + // other negative doubles: 9 bytes + out.writeByte((byte) 0xFF); + out.writeLong(doubleBits); + } + } + + /** + * Writes a long in a variable-length format. Writes between one and + * ten bytes. Small values or values representing timestamps with day, + * hour or second precision typically require fewer bytes. + * <p> + * ZLong --> Header, Bytes*? + * <ul> + * <li>Header --> The first two bits indicate the compression scheme: + * <ul> + * <li>00 - uncompressed + * <li>01 - multiple of 1000 (second) + * <li>10 - multiple of 3600000 (hour) + * <li>11 - multiple of 86400000 (day) + * </ul> + * Then the next bit is a continuation bit, indicating whether more + * bytes need to be read, and the last 5 bits are the lower bits of + * the encoded value. In order to reconstruct the value, you need to + * combine the 5 lower bits of the header with a vLong in the next + * bytes (if the continuation bit is set to 1). Then + * {@link BitUtil#zigZagDecode(int) zigzag-decode} it and finally + * multiply by the multiple corresponding to the compression scheme. + * <li>Bytes --> Potential additional bytes to read depending on the + * header. + * </ul> + * <p> + */ + // T for "timestamp" + static void writeTLong(DataOutput out, long l) throws IOException { + int header; + if (l % SECOND != 0) { + header = 0; + } else if (l % DAY == 0) { + // timestamp with day precision + header = DAY_ENCODING; + l /= DAY; + } else if (l % HOUR == 0) { + // timestamp with hour precision, or day precision with a timezone + header = HOUR_ENCODING; + l /= HOUR; + } else { + // timestamp with second precision + header = SECOND_ENCODING; + l /= SECOND; + } + + final long zigZagL = BitUtil.zigZagEncode(l); + header |= (zigZagL & 0x1F); // last 5 bits + final long upperBits = zigZagL >>> 5; + if (upperBits != 0) { + header |= 0x20; + } + out.writeByte((byte) header); + if (upperBits != 0) { + out.writeVLong(upperBits); + } + } + @Override public void finish(FieldInfos fis, int numDocs) throws IOException { if (numBufferedDocs > 0) { diff --git a/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestCompressingStoredFieldsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestCompressingStoredFieldsFormat.java index 313dda9a45c..d1ea80a8066 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestCompressingStoredFieldsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestCompressingStoredFieldsFormat.java @@ -18,25 +18,32 @@ package org.apache.lucene.codecs.compressing; */ import java.io.IOException; +import java.util.Random; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.IntField; import org.apache.lucene.index.BaseStoredFieldsFormatTestCase; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.store.AlreadyClosedException; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ByteArrayDataOutput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.MockDirectoryWrapper; import org.junit.Test; + import com.carrotsearch.randomizedtesting.generators.RandomInts; public class TestCompressingStoredFieldsFormat extends BaseStoredFieldsFormatTestCase { + static final long SECOND = 1000L; + static final long HOUR = 60 * 60 * SECOND; + static final long DAY = 24 * HOUR; + @Override protected Codec getCodec() { return CompressingCodec.randomInstance(random()); @@ -88,4 +95,176 @@ public class TestCompressingStoredFieldsFormat extends BaseStoredFieldsFormatTes dir.close(); } } + + public void testZFloat() throws Exception { + byte buffer[] = new byte[5]; // we never need more than 5 bytes + ByteArrayDataOutput out = new ByteArrayDataOutput(buffer); + ByteArrayDataInput in = new ByteArrayDataInput(buffer); + + // round-trip small integer values + for (int i = Short.MIN_VALUE; i < Short.MAX_VALUE; i++) { + float f = (float) i; + CompressingStoredFieldsWriter.writeZFloat(out, f); + in.reset(buffer, 0, out.getPosition()); + float g = CompressingStoredFieldsReader.readZFloat(in); + assertTrue(in.eof()); + assertEquals(Float.floatToIntBits(f), Float.floatToIntBits(g)); + + // check that compression actually works + if (i >= -1 && i <= 123) { + assertEquals(1, out.getPosition()); // single byte compression + } + out.reset(buffer); + } + + // round-trip special values + float special[] = { + -0.0f, + +0.0f, + Float.NEGATIVE_INFINITY, + Float.POSITIVE_INFINITY, + Float.MIN_VALUE, + Float.MAX_VALUE, + Float.NaN, + }; + + for (float f : special) { + CompressingStoredFieldsWriter.writeZFloat(out, f); + in.reset(buffer, 0, out.getPosition()); + float g = CompressingStoredFieldsReader.readZFloat(in); + assertTrue(in.eof()); + assertEquals(Float.floatToIntBits(f), Float.floatToIntBits(g)); + out.reset(buffer); + } + + // round-trip random values + Random r = random(); + for (int i = 0; i < 100000; i++) { + float f = r.nextFloat() * (random().nextInt(100) - 50); + CompressingStoredFieldsWriter.writeZFloat(out, f); + assertTrue("length=" + out.getPosition() + ", f=" + f, out.getPosition() <= ((Float.floatToIntBits(f) >>> 31) == 1 ? 5 : 4)); + in.reset(buffer, 0, out.getPosition()); + float g = CompressingStoredFieldsReader.readZFloat(in); + assertTrue(in.eof()); + assertEquals(Float.floatToIntBits(f), Float.floatToIntBits(g)); + out.reset(buffer); + } + } + + public void testZDouble() throws Exception { + byte buffer[] = new byte[9]; // we never need more than 9 bytes + ByteArrayDataOutput out = new ByteArrayDataOutput(buffer); + ByteArrayDataInput in = new ByteArrayDataInput(buffer); + + // round-trip small integer values + for (int i = Short.MIN_VALUE; i < Short.MAX_VALUE; i++) { + double x = (double) i; + CompressingStoredFieldsWriter.writeZDouble(out, x); + in.reset(buffer, 0, out.getPosition()); + double y = CompressingStoredFieldsReader.readZDouble(in); + assertTrue(in.eof()); + assertEquals(Double.doubleToLongBits(x), Double.doubleToLongBits(y)); + + // check that compression actually works + if (i >= -1 && i <= 124) { + assertEquals(1, out.getPosition()); // single byte compression + } + out.reset(buffer); + } + + // round-trip special values + double special[] = { + -0.0d, + +0.0d, + Double.NEGATIVE_INFINITY, + Double.POSITIVE_INFINITY, + Double.MIN_VALUE, + Double.MAX_VALUE, + Double.NaN + }; + + for (double x : special) { + CompressingStoredFieldsWriter.writeZDouble(out, x); + in.reset(buffer, 0, out.getPosition()); + double y = CompressingStoredFieldsReader.readZDouble(in); + assertTrue(in.eof()); + assertEquals(Double.doubleToLongBits(x), Double.doubleToLongBits(y)); + out.reset(buffer); + } + + // round-trip random values + Random r = random(); + for (int i = 0; i < 100000; i++) { + double x = r.nextDouble() * (random().nextInt(100) - 50); + CompressingStoredFieldsWriter.writeZDouble(out, x); + assertTrue("length=" + out.getPosition() + ", d=" + x, out.getPosition() <= (x < 0 ? 9 : 8)); + in.reset(buffer, 0, out.getPosition()); + double y = CompressingStoredFieldsReader.readZDouble(in); + assertTrue(in.eof()); + assertEquals(Double.doubleToLongBits(x), Double.doubleToLongBits(y)); + out.reset(buffer); + } + + // same with floats + for (int i = 0; i < 100000; i++) { + double x = (double) (r.nextFloat() * (random().nextInt(100) - 50)); + CompressingStoredFieldsWriter.writeZDouble(out, x); + assertTrue("length=" + out.getPosition() + ", d=" + x, out.getPosition() <= 5); + in.reset(buffer, 0, out.getPosition()); + double y = CompressingStoredFieldsReader.readZDouble(in); + assertTrue(in.eof()); + assertEquals(Double.doubleToLongBits(x), Double.doubleToLongBits(y)); + out.reset(buffer); + } + } + + public void testTLong() throws Exception { + byte buffer[] = new byte[10]; // we never need more than 10 bytes + ByteArrayDataOutput out = new ByteArrayDataOutput(buffer); + ByteArrayDataInput in = new ByteArrayDataInput(buffer); + + // round-trip small integer values + for (int i = Short.MIN_VALUE; i < Short.MAX_VALUE; i++) { + for (long mul : new long[] {SECOND, HOUR, DAY}) { + long l1 = (long) i * mul; + CompressingStoredFieldsWriter.writeTLong(out, l1); + in.reset(buffer, 0, out.getPosition()); + long l2 = CompressingStoredFieldsReader.readTLong(in); + assertTrue(in.eof()); + assertEquals(l1, l2); + + // check that compression actually works + if (i >= -16 && i <= 15) { + assertEquals(1, out.getPosition()); // single byte compression + } + out.reset(buffer); + } + } + + // round-trip random values + Random r = random(); + for (int i = 0; i < 100000; i++) { + final int numBits = r.nextInt(65); + long l1 = r.nextLong() & ((1L << numBits) - 1); + switch (r.nextInt(4)) { + case 0: + l1 *= SECOND; + break; + case 1: + l1 *= HOUR; + break; + case 2: + l1 *= DAY; + break; + default: + break; + } + CompressingStoredFieldsWriter.writeTLong(out, l1); + in.reset(buffer, 0, out.getPosition()); + long l2 = CompressingStoredFieldsReader.readTLong(in); + assertTrue(in.eof()); + assertEquals(l1, l2); + out.reset(buffer); + } + } }