LUCENE-6086: Space optimizations for numeric stored fields.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1644661 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Adrien Grand 2014-12-11 15:01:00 +00:00
parent 16337aba86
commit b7a20caf84
3 changed files with 432 additions and 18 deletions

View File

@ -20,17 +20,23 @@ package org.apache.lucene.codecs.compressing;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.BYTE_ARR;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.CODEC_SFX_DAT;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.CODEC_SFX_IDX;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.DAY;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.DAY_ENCODING;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.FIELDS_EXTENSION;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.FIELDS_INDEX_EXTENSION;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.HOUR;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.HOUR_ENCODING;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_DOUBLE;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_FLOAT;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_INT;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_LONG;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.SECOND;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.SECOND_ENCODING;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.STRING;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.TYPE_BITS;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.TYPE_MASK;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_CURRENT;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_START;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.FIELDS_EXTENSION;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.FIELDS_INDEX_EXTENSION;
import java.io.EOFException;
import java.io.IOException;
@ -57,6 +63,7 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;
@ -201,16 +208,16 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
visitor.stringField(info, new String(data, StandardCharsets.UTF_8));
break;
case NUMERIC_INT:
visitor.intField(info, in.readInt());
visitor.intField(info, in.readZInt());
break;
case NUMERIC_FLOAT:
visitor.floatField(info, Float.intBitsToFloat(in.readInt()));
visitor.floatField(info, readZFloat(in));
break;
case NUMERIC_LONG:
visitor.longField(info, in.readLong());
visitor.longField(info, readTLong(in));
break;
case NUMERIC_DOUBLE:
visitor.doubleField(info, Double.longBitsToDouble(in.readLong()));
visitor.doubleField(info, readZDouble(in));
break;
default:
throw new AssertionError("Unknown type flag: " + Integer.toHexString(bits));
@ -225,18 +232,98 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
in.skipBytes(length);
break;
case NUMERIC_INT:
in.readZInt();
break;
case NUMERIC_FLOAT:
in.readInt();
readZFloat(in);
break;
case NUMERIC_LONG:
readTLong(in);
break;
case NUMERIC_DOUBLE:
in.readLong();
readZDouble(in);
break;
default:
throw new AssertionError("Unknown type flag: " + Integer.toHexString(bits));
}
}
/**
* Reads a float in a variable-length format. Reads between one and
* five bytes. Small integral values typically take fewer bytes.
*/
static float readZFloat(DataInput in) throws IOException {
int b = in.readByte() & 0xFF;
if (b == 0xFF) {
// negative value
return Float.intBitsToFloat(in.readInt());
} else if ((b & 0x80) != 0) {
// small integer [-1..125]
return (b & 0x7f) - 1;
} else {
// positive float
int bits = b << 24 | ((in.readShort() & 0xFFFF) << 8) | (in.readByte() & 0xFF);
return Float.intBitsToFloat(bits);
}
}
/**
* Reads a double in a variable-length format. Reads between one and
* nine bytes. Small integral values typically take fewer bytes.
*/
static double readZDouble(DataInput in) throws IOException {
int b = in.readByte() & 0xFF;
if (b == 0xFF) {
// negative value
return Double.longBitsToDouble(in.readLong());
} else if (b == 0xFE) {
// float
return Float.intBitsToFloat(in.readInt());
} else if ((b & 0x80) != 0) {
// small integer [-1..124]
return (b & 0x7f) - 1;
} else {
// positive double
long bits = ((long) b) << 56 | ((in.readInt() & 0xFFFFFFFFL) << 24) | ((in.readShort() & 0xFFFFL) << 8) | (in.readByte() & 0xFFL);
return Double.longBitsToDouble(bits);
}
}
/**
* Reads a long in a variable-length format. Reads between one and
* nine bytes. Small values typically take fewer bytes.
*/
static long readTLong(DataInput in) throws IOException {
int header = in.readByte() & 0xFF;
long bits = header & 0x1F;
if ((header & 0x20) != 0) {
// continuation bit
bits |= in.readVLong() << 5;
}
long l = BitUtil.zigZagDecode(bits);
switch (header & DAY_ENCODING) {
case SECOND_ENCODING:
l *= SECOND;
break;
case HOUR_ENCODING:
l *= HOUR;
break;
case DAY_ENCODING:
l *= DAY;
break;
case 0:
// uncompressed
break;
default:
throw new AssertionError();
}
return l;
}
@Override
public void visitDocument(int docID, StoredFieldVisitor visitor)
throws IOException {

View File

@ -37,6 +37,7 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.GrowableByteArrayDataOutput;
@ -70,9 +71,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
private final Directory directory;
private final String segment;
private final String segmentSuffix;
private CompressingStoredFieldsIndexWriter indexWriter;
private IndexOutput fieldsStream;
@ -91,9 +90,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
public CompressingStoredFieldsWriter(Directory directory, SegmentInfo si, String segmentSuffix, IOContext context,
String formatName, CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) throws IOException {
assert directory != null;
this.directory = directory;
this.segment = si.name;
this.segmentSuffix = segmentSuffix;
this.compressionMode = compressionMode;
this.compressor = compressionMode.newCompressor();
this.chunkSize = chunkSize;
@ -288,19 +285,170 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
bufferedDocs.writeString(field.stringValue());
} else {
if (number instanceof Byte || number instanceof Short || number instanceof Integer) {
bufferedDocs.writeInt(number.intValue());
bufferedDocs.writeZInt(number.intValue());
} else if (number instanceof Long) {
bufferedDocs.writeLong(number.longValue());
writeTLong(bufferedDocs, number.longValue());
} else if (number instanceof Float) {
bufferedDocs.writeInt(Float.floatToIntBits(number.floatValue()));
writeZFloat(bufferedDocs, number.floatValue());
} else if (number instanceof Double) {
bufferedDocs.writeLong(Double.doubleToLongBits(number.doubleValue()));
writeZDouble(bufferedDocs, number.doubleValue());
} else {
throw new AssertionError("Cannot get here");
}
}
}
// -0 isn't compressed.
static final int NEGATIVE_ZERO_FLOAT = Float.floatToIntBits(-0f);
static final long NEGATIVE_ZERO_DOUBLE = Double.doubleToLongBits(-0d);
// for compression of timestamps
static final long SECOND = 1000L;
static final long HOUR = 60 * 60 * SECOND;
static final long DAY = 24 * HOUR;
static final int SECOND_ENCODING = 0x40;
static final int HOUR_ENCODING = 0x80;
static final int DAY_ENCODING = 0xC0;
/**
* Writes a float in a variable-length format. Writes between one and
* five bytes. Small integral values typically take fewer bytes.
* <p>
* ZFloat --&gt; Header, Bytes*?
* <ul>
* <li>Header --&gt; {@link DataOutput#writeByte Uint8}. When it is
* equal to 0xFF then the value is negative and stored in the next
* 4 bytes. Otherwise if the first bit is set then the other bits
* in the header encode the value plus one and no other
* bytes are read. Otherwise, the value is a positive float value
* whose first byte is the header, and 3 bytes need to be read to
* complete it.
* <li>Bytes --&gt; Potential additional bytes to read depending on the
* header.
* </ul>
* <p>
*/
static void writeZFloat(DataOutput out, float f) throws IOException {
int intVal = (int) f;
final int floatBits = Float.floatToIntBits(f);
if (f == intVal
&& intVal >= -1
&& intVal <= 0x7D
&& floatBits != NEGATIVE_ZERO_FLOAT) {
// small integer value [-1..125]: single byte
out.writeByte((byte) (0x80 | (1 + intVal)));
} else if ((floatBits >>> 31) == 0) {
// other positive floats: 4 bytes
out.writeInt(floatBits);
} else {
// other negative float: 5 bytes
out.writeByte((byte) 0xFF);
out.writeInt(floatBits);
}
}
/**
* Writes a float in a variable-length format. Writes between one and
* five bytes. Small integral values typically take fewer bytes.
* <p>
* ZFloat --&gt; Header, Bytes*?
* <ul>
* <li>Header --&gt; {@link DataOutput#writeByte Uint8}. When it is
* equal to 0xFF then the value is negative and stored in the next
* 8 bytes. When it is equal to 0xFE then the value is stored as a
* float in the next 4 bytes. Otherwise if the first bit is set
* then the other bits in the header encode the value plus one and
* no other bytes are read. Otherwise, the value is a positive float
* value whose first byte is the header, and 7 bytes need to be read
* to complete it.
* <li>Bytes --&gt; Potential additional bytes to read depending on the
* header.
* </ul>
* <p>
*/
static void writeZDouble(DataOutput out, double d) throws IOException {
int intVal = (int) d;
final long doubleBits = Double.doubleToLongBits(d);
if (d == intVal &&
intVal >= -1 &&
intVal <= 0x7C &&
doubleBits != NEGATIVE_ZERO_DOUBLE) {
// small integer value [-1..124]: single byte
out.writeByte((byte) (0x80 | (intVal + 1)));
return;
} else if (d == (float) d) {
// d has an accurate float representation: 5 bytes
out.writeByte((byte) 0xFE);
out.writeInt(Float.floatToIntBits((float) d));
} else if ((doubleBits >>> 63) == 0) {
// other positive doubles: 8 bytes
out.writeLong(doubleBits);
} else {
// other negative doubles: 9 bytes
out.writeByte((byte) 0xFF);
out.writeLong(doubleBits);
}
}
/**
* Writes a long in a variable-length format. Writes between one and
* ten bytes. Small values or values representing timestamps with day,
* hour or second precision typically require fewer bytes.
* <p>
* ZLong --&gt; Header, Bytes*?
* <ul>
* <li>Header --&gt; The first two bits indicate the compression scheme:
* <ul>
* <li>00 - uncompressed
* <li>01 - multiple of 1000 (second)
* <li>10 - multiple of 3600000 (hour)
* <li>11 - multiple of 86400000 (day)
* </ul>
* Then the next bit is a continuation bit, indicating whether more
* bytes need to be read, and the last 5 bits are the lower bits of
* the encoded value. In order to reconstruct the value, you need to
* combine the 5 lower bits of the header with a vLong in the next
* bytes (if the continuation bit is set to 1). Then
* {@link BitUtil#zigZagDecode(int) zigzag-decode} it and finally
* multiply by the multiple corresponding to the compression scheme.
* <li>Bytes --&gt; Potential additional bytes to read depending on the
* header.
* </ul>
* <p>
*/
// T for "timestamp"
static void writeTLong(DataOutput out, long l) throws IOException {
int header;
if (l % SECOND != 0) {
header = 0;
} else if (l % DAY == 0) {
// timestamp with day precision
header = DAY_ENCODING;
l /= DAY;
} else if (l % HOUR == 0) {
// timestamp with hour precision, or day precision with a timezone
header = HOUR_ENCODING;
l /= HOUR;
} else {
// timestamp with second precision
header = SECOND_ENCODING;
l /= SECOND;
}
final long zigZagL = BitUtil.zigZagEncode(l);
header |= (zigZagL & 0x1F); // last 5 bits
final long upperBits = zigZagL >>> 5;
if (upperBits != 0) {
header |= 0x20;
}
out.writeByte((byte) header);
if (upperBits != 0) {
out.writeVLong(upperBits);
}
}
@Override
public void finish(FieldInfos fis, int numDocs) throws IOException {
if (numBufferedDocs > 0) {

View File

@ -18,25 +18,32 @@ package org.apache.lucene.codecs.compressing;
*/
import java.io.IOException;
import java.util.Random;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.IntField;
import org.apache.lucene.index.BaseStoredFieldsFormatTestCase;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.junit.Test;
import com.carrotsearch.randomizedtesting.generators.RandomInts;
public class TestCompressingStoredFieldsFormat extends BaseStoredFieldsFormatTestCase {
static final long SECOND = 1000L;
static final long HOUR = 60 * 60 * SECOND;
static final long DAY = 24 * HOUR;
@Override
protected Codec getCodec() {
return CompressingCodec.randomInstance(random());
@ -88,4 +95,176 @@ public class TestCompressingStoredFieldsFormat extends BaseStoredFieldsFormatTes
dir.close();
}
}
public void testZFloat() throws Exception {
byte buffer[] = new byte[5]; // we never need more than 5 bytes
ByteArrayDataOutput out = new ByteArrayDataOutput(buffer);
ByteArrayDataInput in = new ByteArrayDataInput(buffer);
// round-trip small integer values
for (int i = Short.MIN_VALUE; i < Short.MAX_VALUE; i++) {
float f = (float) i;
CompressingStoredFieldsWriter.writeZFloat(out, f);
in.reset(buffer, 0, out.getPosition());
float g = CompressingStoredFieldsReader.readZFloat(in);
assertTrue(in.eof());
assertEquals(Float.floatToIntBits(f), Float.floatToIntBits(g));
// check that compression actually works
if (i >= -1 && i <= 123) {
assertEquals(1, out.getPosition()); // single byte compression
}
out.reset(buffer);
}
// round-trip special values
float special[] = {
-0.0f,
+0.0f,
Float.NEGATIVE_INFINITY,
Float.POSITIVE_INFINITY,
Float.MIN_VALUE,
Float.MAX_VALUE,
Float.NaN,
};
for (float f : special) {
CompressingStoredFieldsWriter.writeZFloat(out, f);
in.reset(buffer, 0, out.getPosition());
float g = CompressingStoredFieldsReader.readZFloat(in);
assertTrue(in.eof());
assertEquals(Float.floatToIntBits(f), Float.floatToIntBits(g));
out.reset(buffer);
}
// round-trip random values
Random r = random();
for (int i = 0; i < 100000; i++) {
float f = r.nextFloat() * (random().nextInt(100) - 50);
CompressingStoredFieldsWriter.writeZFloat(out, f);
assertTrue("length=" + out.getPosition() + ", f=" + f, out.getPosition() <= ((Float.floatToIntBits(f) >>> 31) == 1 ? 5 : 4));
in.reset(buffer, 0, out.getPosition());
float g = CompressingStoredFieldsReader.readZFloat(in);
assertTrue(in.eof());
assertEquals(Float.floatToIntBits(f), Float.floatToIntBits(g));
out.reset(buffer);
}
}
public void testZDouble() throws Exception {
byte buffer[] = new byte[9]; // we never need more than 9 bytes
ByteArrayDataOutput out = new ByteArrayDataOutput(buffer);
ByteArrayDataInput in = new ByteArrayDataInput(buffer);
// round-trip small integer values
for (int i = Short.MIN_VALUE; i < Short.MAX_VALUE; i++) {
double x = (double) i;
CompressingStoredFieldsWriter.writeZDouble(out, x);
in.reset(buffer, 0, out.getPosition());
double y = CompressingStoredFieldsReader.readZDouble(in);
assertTrue(in.eof());
assertEquals(Double.doubleToLongBits(x), Double.doubleToLongBits(y));
// check that compression actually works
if (i >= -1 && i <= 124) {
assertEquals(1, out.getPosition()); // single byte compression
}
out.reset(buffer);
}
// round-trip special values
double special[] = {
-0.0d,
+0.0d,
Double.NEGATIVE_INFINITY,
Double.POSITIVE_INFINITY,
Double.MIN_VALUE,
Double.MAX_VALUE,
Double.NaN
};
for (double x : special) {
CompressingStoredFieldsWriter.writeZDouble(out, x);
in.reset(buffer, 0, out.getPosition());
double y = CompressingStoredFieldsReader.readZDouble(in);
assertTrue(in.eof());
assertEquals(Double.doubleToLongBits(x), Double.doubleToLongBits(y));
out.reset(buffer);
}
// round-trip random values
Random r = random();
for (int i = 0; i < 100000; i++) {
double x = r.nextDouble() * (random().nextInt(100) - 50);
CompressingStoredFieldsWriter.writeZDouble(out, x);
assertTrue("length=" + out.getPosition() + ", d=" + x, out.getPosition() <= (x < 0 ? 9 : 8));
in.reset(buffer, 0, out.getPosition());
double y = CompressingStoredFieldsReader.readZDouble(in);
assertTrue(in.eof());
assertEquals(Double.doubleToLongBits(x), Double.doubleToLongBits(y));
out.reset(buffer);
}
// same with floats
for (int i = 0; i < 100000; i++) {
double x = (double) (r.nextFloat() * (random().nextInt(100) - 50));
CompressingStoredFieldsWriter.writeZDouble(out, x);
assertTrue("length=" + out.getPosition() + ", d=" + x, out.getPosition() <= 5);
in.reset(buffer, 0, out.getPosition());
double y = CompressingStoredFieldsReader.readZDouble(in);
assertTrue(in.eof());
assertEquals(Double.doubleToLongBits(x), Double.doubleToLongBits(y));
out.reset(buffer);
}
}
public void testTLong() throws Exception {
byte buffer[] = new byte[10]; // we never need more than 10 bytes
ByteArrayDataOutput out = new ByteArrayDataOutput(buffer);
ByteArrayDataInput in = new ByteArrayDataInput(buffer);
// round-trip small integer values
for (int i = Short.MIN_VALUE; i < Short.MAX_VALUE; i++) {
for (long mul : new long[] {SECOND, HOUR, DAY}) {
long l1 = (long) i * mul;
CompressingStoredFieldsWriter.writeTLong(out, l1);
in.reset(buffer, 0, out.getPosition());
long l2 = CompressingStoredFieldsReader.readTLong(in);
assertTrue(in.eof());
assertEquals(l1, l2);
// check that compression actually works
if (i >= -16 && i <= 15) {
assertEquals(1, out.getPosition()); // single byte compression
}
out.reset(buffer);
}
}
// round-trip random values
Random r = random();
for (int i = 0; i < 100000; i++) {
final int numBits = r.nextInt(65);
long l1 = r.nextLong() & ((1L << numBits) - 1);
switch (r.nextInt(4)) {
case 0:
l1 *= SECOND;
break;
case 1:
l1 *= HOUR;
break;
case 2:
l1 *= DAY;
break;
default:
break;
}
CompressingStoredFieldsWriter.writeTLong(out, l1);
in.reset(buffer, 0, out.getPosition());
long l2 = CompressingStoredFieldsReader.readTLong(in);
assertTrue(in.eof());
assertEquals(l1, l2);
out.reset(buffer);
}
}
}