mirror of https://github.com/apache/lucene.git
LUCENE-5720: Optimize DirectPackedReader's decompression
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1599180 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
6ff06b0856
commit
e83229fd20
|
@ -219,6 +219,8 @@ Optimizations
|
||||||
* LUCENE-5694: Don't score() subscorers in DisjunctionSumScorer or
|
* LUCENE-5694: Don't score() subscorers in DisjunctionSumScorer or
|
||||||
DisjunctionMaxScorer unless score() is called. (Robert Muir)
|
DisjunctionMaxScorer unless score() is called. (Robert Muir)
|
||||||
|
|
||||||
|
* LUCENE-5720: Optimize DirectPackedReader's decompression. (Robert Muir)
|
||||||
|
|
||||||
Bug fixes
|
Bug fixes
|
||||||
|
|
||||||
* LUCENE-5673: MMapDirectory: Work around a "bug" in the JDK that throws
|
* LUCENE-5673: MMapDirectory: Work around a "bug" in the JDK that throws
|
||||||
|
|
|
@ -182,7 +182,7 @@ class MemoryDocValuesConsumer extends DocValuesConsumer {
|
||||||
data.writeLong(gcd);
|
data.writeLong(gcd);
|
||||||
data.writeVInt(BLOCK_SIZE);
|
data.writeVInt(BLOCK_SIZE);
|
||||||
|
|
||||||
final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
|
final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE, PackedInts.DEFAULT);
|
||||||
for (Number nv : values) {
|
for (Number nv : values) {
|
||||||
long value = nv == null ? 0 : nv.longValue();
|
long value = nv == null ? 0 : nv.longValue();
|
||||||
writer.add((value - minValue) / gcd);
|
writer.add((value - minValue) / gcd);
|
||||||
|
@ -194,7 +194,7 @@ class MemoryDocValuesConsumer extends DocValuesConsumer {
|
||||||
meta.writeVInt(PackedInts.VERSION_CURRENT);
|
meta.writeVInt(PackedInts.VERSION_CURRENT);
|
||||||
data.writeVInt(BLOCK_SIZE);
|
data.writeVInt(BLOCK_SIZE);
|
||||||
|
|
||||||
final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
|
final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE, PackedInts.DEFAULT);
|
||||||
for (Number nv : values) {
|
for (Number nv : values) {
|
||||||
writer.add(nv == null ? 0 : nv.longValue());
|
writer.add(nv == null ? 0 : nv.longValue());
|
||||||
}
|
}
|
||||||
|
|
|
@ -241,7 +241,7 @@ public final class CompressingTermVectorsWriter extends TermVectorsWriter {
|
||||||
|
|
||||||
vectorsStream.writeVInt(PackedInts.VERSION_CURRENT);
|
vectorsStream.writeVInt(PackedInts.VERSION_CURRENT);
|
||||||
vectorsStream.writeVInt(chunkSize);
|
vectorsStream.writeVInt(chunkSize);
|
||||||
writer = new BlockPackedWriter(vectorsStream, BLOCK_SIZE);
|
writer = new BlockPackedWriter(vectorsStream, BLOCK_SIZE, PackedInts.COMPACT);
|
||||||
|
|
||||||
positionsBuf = new int[1024];
|
positionsBuf = new int[1024];
|
||||||
startOffsetsBuf = new int[1024];
|
startOffsetsBuf = new int[1024];
|
||||||
|
|
|
@ -153,7 +153,7 @@ class Lucene42NormsConsumer extends DocValuesConsumer {
|
||||||
data.writeLong(gcd);
|
data.writeLong(gcd);
|
||||||
data.writeVInt(BLOCK_SIZE);
|
data.writeVInt(BLOCK_SIZE);
|
||||||
|
|
||||||
final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
|
final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE, PackedInts.DEFAULT);
|
||||||
for (Number nv : values) {
|
for (Number nv : values) {
|
||||||
long value = nv == null ? 0 : nv.longValue();
|
long value = nv == null ? 0 : nv.longValue();
|
||||||
writer.add((value - minValue) / gcd);
|
writer.add((value - minValue) / gcd);
|
||||||
|
@ -165,7 +165,7 @@ class Lucene42NormsConsumer extends DocValuesConsumer {
|
||||||
meta.writeVInt(PackedInts.VERSION_CURRENT);
|
meta.writeVInt(PackedInts.VERSION_CURRENT);
|
||||||
data.writeVInt(BLOCK_SIZE);
|
data.writeVInt(BLOCK_SIZE);
|
||||||
|
|
||||||
final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
|
final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE, PackedInts.DEFAULT);
|
||||||
for (Number nv : values) {
|
for (Number nv : values) {
|
||||||
writer.add(nv == null ? 0 : nv.longValue());
|
writer.add(nv == null ? 0 : nv.longValue());
|
||||||
}
|
}
|
||||||
|
|
|
@ -51,6 +51,8 @@ public class Lucene45DocValuesConsumer extends DocValuesConsumer implements Clos
|
||||||
public static final int GCD_COMPRESSED = 1;
|
public static final int GCD_COMPRESSED = 1;
|
||||||
/** Compressed by giving IDs to unique values. */
|
/** Compressed by giving IDs to unique values. */
|
||||||
public static final int TABLE_COMPRESSED = 2;
|
public static final int TABLE_COMPRESSED = 2;
|
||||||
|
/** Compressed using just bitpacked integers */
|
||||||
|
public static final int BITPACK_COMPRESSED = 3;
|
||||||
|
|
||||||
/** Uncompressed binary, written directly (fixed length). */
|
/** Uncompressed binary, written directly (fixed length). */
|
||||||
public static final int BINARY_FIXED_UNCOMPRESSED = 0;
|
public static final int BINARY_FIXED_UNCOMPRESSED = 0;
|
||||||
|
@ -99,6 +101,7 @@ public class Lucene45DocValuesConsumer extends DocValuesConsumer implements Clos
|
||||||
long maxValue = Long.MIN_VALUE;
|
long maxValue = Long.MIN_VALUE;
|
||||||
long gcd = 0;
|
long gcd = 0;
|
||||||
boolean missing = false;
|
boolean missing = false;
|
||||||
|
boolean block = true;
|
||||||
// TODO: more efficient?
|
// TODO: more efficient?
|
||||||
HashSet<Long> uniqueValues = null;
|
HashSet<Long> uniqueValues = null;
|
||||||
if (optimizeStorage) {
|
if (optimizeStorage) {
|
||||||
|
@ -138,9 +141,19 @@ public class Lucene45DocValuesConsumer extends DocValuesConsumer implements Clos
|
||||||
++count;
|
++count;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (@SuppressWarnings("unused") Number nv : values) {
|
for (Number nv : values) {
|
||||||
|
long value = nv.longValue();
|
||||||
|
assert value >= -1;
|
||||||
|
minValue = Math.min(minValue, value);
|
||||||
|
maxValue = Math.max(maxValue, value);
|
||||||
++count;
|
++count;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// packed ints doesn't support valueCount > maxValue, and
|
||||||
|
// we must represent missing ordinal (-1)
|
||||||
|
if (count < Integer.MAX_VALUE && maxValue < Long.MAX_VALUE) {
|
||||||
|
block = false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
final long delta = maxValue - minValue;
|
final long delta = maxValue - minValue;
|
||||||
|
@ -152,6 +165,8 @@ public class Lucene45DocValuesConsumer extends DocValuesConsumer implements Clos
|
||||||
format = TABLE_COMPRESSED;
|
format = TABLE_COMPRESSED;
|
||||||
} else if (gcd != 0 && gcd != 1) {
|
} else if (gcd != 0 && gcd != 1) {
|
||||||
format = GCD_COMPRESSED;
|
format = GCD_COMPRESSED;
|
||||||
|
} else if (block == false) {
|
||||||
|
format = BITPACK_COMPRESSED;
|
||||||
} else {
|
} else {
|
||||||
format = DELTA_COMPRESSED;
|
format = DELTA_COMPRESSED;
|
||||||
}
|
}
|
||||||
|
@ -173,7 +188,7 @@ public class Lucene45DocValuesConsumer extends DocValuesConsumer implements Clos
|
||||||
case GCD_COMPRESSED:
|
case GCD_COMPRESSED:
|
||||||
meta.writeLong(minValue);
|
meta.writeLong(minValue);
|
||||||
meta.writeLong(gcd);
|
meta.writeLong(gcd);
|
||||||
final BlockPackedWriter quotientWriter = new BlockPackedWriter(data, BLOCK_SIZE);
|
final BlockPackedWriter quotientWriter = new BlockPackedWriter(data, BLOCK_SIZE, PackedInts.DEFAULT);
|
||||||
for (Number nv : values) {
|
for (Number nv : values) {
|
||||||
long value = nv == null ? 0 : nv.longValue();
|
long value = nv == null ? 0 : nv.longValue();
|
||||||
quotientWriter.add((value - minValue) / gcd);
|
quotientWriter.add((value - minValue) / gcd);
|
||||||
|
@ -181,7 +196,7 @@ public class Lucene45DocValuesConsumer extends DocValuesConsumer implements Clos
|
||||||
quotientWriter.finish();
|
quotientWriter.finish();
|
||||||
break;
|
break;
|
||||||
case DELTA_COMPRESSED:
|
case DELTA_COMPRESSED:
|
||||||
final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
|
final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE, PackedInts.DEFAULT);
|
||||||
for (Number nv : values) {
|
for (Number nv : values) {
|
||||||
writer.add(nv == null ? 0 : nv.longValue());
|
writer.add(nv == null ? 0 : nv.longValue());
|
||||||
}
|
}
|
||||||
|
@ -202,6 +217,18 @@ public class Lucene45DocValuesConsumer extends DocValuesConsumer implements Clos
|
||||||
}
|
}
|
||||||
ordsWriter.finish();
|
ordsWriter.finish();
|
||||||
break;
|
break;
|
||||||
|
case BITPACK_COMPRESSED:
|
||||||
|
assert count > 0 && count < Integer.MAX_VALUE;
|
||||||
|
assert maxValue >= -1 && maxValue < Long.MAX_VALUE : maxValue;
|
||||||
|
int bpv = PackedInts.bitsRequired(maxValue+1);
|
||||||
|
bpv = PackedInts.fastestDirectBits(bpv, PackedInts.DEFAULT);
|
||||||
|
meta.writeVInt(bpv);
|
||||||
|
final PackedInts.Writer bitWriter = PackedInts.getWriterNoHeader(data, PackedInts.Format.PACKED, (int) count, bpv, PackedInts.DEFAULT_BUFFER_SIZE);
|
||||||
|
for (Number nv : values) {
|
||||||
|
bitWriter.add(nv.longValue()+1);
|
||||||
|
}
|
||||||
|
bitWriter.finish();
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
throw new AssertionError();
|
throw new AssertionError();
|
||||||
}
|
}
|
||||||
|
|
|
@ -50,6 +50,9 @@ import org.apache.lucene.util.packed.PackedInts;
|
||||||
* to this table, and those ordinals are compressed with bitpacking ({@link PackedInts}).
|
* to this table, and those ordinals are compressed with bitpacking ({@link PackedInts}).
|
||||||
* <li>GCD-compressed: when all numbers share a common divisor, such as dates, the greatest
|
* <li>GCD-compressed: when all numbers share a common divisor, such as dates, the greatest
|
||||||
* common denominator (GCD) is computed, and quotients are stored using Delta-compressed Numerics.
|
* common denominator (GCD) is computed, and quotients are stored using Delta-compressed Numerics.
|
||||||
|
* <li>Bitpack-compressed: per-document integers written as a block for the entire segment.
|
||||||
|
* This technique will only be used when numbers range from {@code -1 .. Long.MAX_VALUE-1},
|
||||||
|
* when the blocking for the delta-compressed method would not provide additional compression.
|
||||||
* </ul>
|
* </ul>
|
||||||
* <p>
|
* <p>
|
||||||
* {@link DocValuesType#BINARY BINARY}:
|
* {@link DocValuesType#BINARY BINARY}:
|
||||||
|
@ -96,6 +99,7 @@ import org.apache.lucene.util.packed.PackedInts;
|
||||||
* <li>GCDNumericEntry --> NumericHeader,MinValue,GCD</li>
|
* <li>GCDNumericEntry --> NumericHeader,MinValue,GCD</li>
|
||||||
* <li>TableNumericEntry --> NumericHeader,TableSize,{@link DataOutput#writeLong Int64}<sup>TableSize</sup></li>
|
* <li>TableNumericEntry --> NumericHeader,TableSize,{@link DataOutput#writeLong Int64}<sup>TableSize</sup></li>
|
||||||
* <li>DeltaNumericEntry --> NumericHeader</li>
|
* <li>DeltaNumericEntry --> NumericHeader</li>
|
||||||
|
* <li>BitpackNumericEntry --> NumericHeader,BitsPerValue</li>
|
||||||
* <li>NumericHeader --> FieldNumber,EntryType,NumericType,MissingOffset,PackedVersion,DataOffset,Count,BlockSize</li>
|
* <li>NumericHeader --> FieldNumber,EntryType,NumericType,MissingOffset,PackedVersion,DataOffset,Count,BlockSize</li>
|
||||||
* <li>BinaryEntry --> FixedBinaryEntry | VariableBinaryEntry | PrefixBinaryEntry</li>
|
* <li>BinaryEntry --> FixedBinaryEntry | VariableBinaryEntry | PrefixBinaryEntry</li>
|
||||||
* <li>FixedBinaryEntry --> BinaryHeader</li>
|
* <li>FixedBinaryEntry --> BinaryHeader</li>
|
||||||
|
@ -108,7 +112,7 @@ import org.apache.lucene.util.packed.PackedInts;
|
||||||
* <li>EntryType,CompressionType --> {@link DataOutput#writeByte Byte}</li>
|
* <li>EntryType,CompressionType --> {@link DataOutput#writeByte Byte}</li>
|
||||||
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
|
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
|
||||||
* <li>MinValue,GCD,MissingOffset,AddressOffset,DataOffset --> {@link DataOutput#writeLong Int64}</li>
|
* <li>MinValue,GCD,MissingOffset,AddressOffset,DataOffset --> {@link DataOutput#writeLong Int64}</li>
|
||||||
* <li>TableSize --> {@link DataOutput#writeVInt vInt}</li>
|
* <li>BitsPerValue,TableSize --> {@link DataOutput#writeVInt vInt}</li>
|
||||||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
|
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
|
||||||
* </ul>
|
* </ul>
|
||||||
* <p>Sorted fields have two entries: a BinaryEntry with the value metadata,
|
* <p>Sorted fields have two entries: a BinaryEntry with the value metadata,
|
||||||
|
@ -122,15 +126,17 @@ import org.apache.lucene.util.packed.PackedInts;
|
||||||
* <ul>
|
* <ul>
|
||||||
* <li>0 --> delta-compressed. For each block of 16k integers, every integer is delta-encoded
|
* <li>0 --> delta-compressed. For each block of 16k integers, every integer is delta-encoded
|
||||||
* from the minimum value within the block.
|
* from the minimum value within the block.
|
||||||
* <li>1 -->, gcd-compressed. When all integers share a common divisor, only quotients are stored
|
* <li>1 --> gcd-compressed. When all integers share a common divisor, only quotients are stored
|
||||||
* using blocks of delta-encoded ints.
|
* using blocks of delta-encoded ints.
|
||||||
* <li>2 --> table-compressed. When the number of unique numeric values is small and it would save space,
|
* <li>2 --> table-compressed. When the number of unique numeric values is small and it would save space,
|
||||||
* a lookup table of unique values is written, followed by the ordinal for each document.
|
* a lookup table of unique values is written, followed by the ordinal for each document.
|
||||||
|
* <li>3 --> bitpack-compressed. When the delta method would not save space, every integer is
|
||||||
|
* delta encoded from {@code -1} for the entire segment.
|
||||||
* </ul>
|
* </ul>
|
||||||
* <p>BinaryType indicates how Binary values will be stored:
|
* <p>BinaryType indicates how Binary values will be stored:
|
||||||
* <ul>
|
* <ul>
|
||||||
* <li>0 --> fixed-width. All values have the same length, addressing by multiplication.
|
* <li>0 --> fixed-width. All values have the same length, addressing by multiplication.
|
||||||
* <li>1 -->, variable-width. An address for each value is stored.
|
* <li>1 --> variable-width. An address for each value is stored.
|
||||||
* <li>2 --> prefix-compressed. An address to the start of every interval'th value is stored.
|
* <li>2 --> prefix-compressed. An address to the start of every interval'th value is stored.
|
||||||
* </ul>
|
* </ul>
|
||||||
* <p>MinLength and MaxLength represent the min and max byte[] value lengths for Binary values.
|
* <p>MinLength and MaxLength represent the min and max byte[] value lengths for Binary values.
|
||||||
|
@ -185,7 +191,8 @@ public final class Lucene45DocValuesFormat extends DocValuesFormat {
|
||||||
static final int VERSION_START = 0;
|
static final int VERSION_START = 0;
|
||||||
static final int VERSION_SORTED_SET_SINGLE_VALUE_OPTIMIZED = 1;
|
static final int VERSION_SORTED_SET_SINGLE_VALUE_OPTIMIZED = 1;
|
||||||
static final int VERSION_CHECKSUM = 2;
|
static final int VERSION_CHECKSUM = 2;
|
||||||
static final int VERSION_CURRENT = VERSION_CHECKSUM;
|
static final int VERSION_BITPACK_COMPRESSED = 3;
|
||||||
|
static final int VERSION_CURRENT = VERSION_BITPACK_COMPRESSED;
|
||||||
static final byte NUMERIC = 0;
|
static final byte NUMERIC = 0;
|
||||||
static final byte BINARY = 1;
|
static final byte BINARY = 1;
|
||||||
static final byte SORTED = 2;
|
static final byte SORTED = 2;
|
||||||
|
|
|
@ -25,6 +25,7 @@ import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.GCD_CO
|
||||||
import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.SORTED_SET_SINGLE_VALUED_SORTED;
|
import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.SORTED_SET_SINGLE_VALUED_SORTED;
|
||||||
import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.SORTED_SET_WITH_ADDRESSES;
|
import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.SORTED_SET_WITH_ADDRESSES;
|
||||||
import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.TABLE_COMPRESSED;
|
import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.TABLE_COMPRESSED;
|
||||||
|
import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.BITPACK_COMPRESSED;
|
||||||
import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesFormat.VERSION_SORTED_SET_SINGLE_VALUE_OPTIMIZED;
|
import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesFormat.VERSION_SORTED_SET_SINGLE_VALUE_OPTIMIZED;
|
||||||
|
|
||||||
import java.io.Closeable; // javadocs
|
import java.io.Closeable; // javadocs
|
||||||
|
@ -264,6 +265,9 @@ public class Lucene45DocValuesProducer extends DocValuesProducer implements Clos
|
||||||
entry.table[i] = meta.readLong();
|
entry.table[i] = meta.readLong();
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case BITPACK_COMPRESSED:
|
||||||
|
entry.bitsRequired = meta.readVInt();
|
||||||
|
break;
|
||||||
case DELTA_COMPRESSED:
|
case DELTA_COMPRESSED:
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
|
@ -339,6 +343,14 @@ public class Lucene45DocValuesProducer extends DocValuesProducer implements Clos
|
||||||
case DELTA_COMPRESSED:
|
case DELTA_COMPRESSED:
|
||||||
final BlockPackedReader reader = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true);
|
final BlockPackedReader reader = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true);
|
||||||
return reader;
|
return reader;
|
||||||
|
case BITPACK_COMPRESSED:
|
||||||
|
final PackedInts.Reader bits = PackedInts.getDirectReaderNoHeader(data, PackedInts.Format.PACKED, entry.packedIntsVersion, (int) entry.count, entry.bitsRequired);
|
||||||
|
return new LongValues() {
|
||||||
|
@Override
|
||||||
|
public long get(long id) {
|
||||||
|
return bits.get((int) id) - 1;
|
||||||
|
}
|
||||||
|
};
|
||||||
case GCD_COMPRESSED:
|
case GCD_COMPRESSED:
|
||||||
final long min = entry.minValue;
|
final long min = entry.minValue;
|
||||||
final long mult = entry.gcd;
|
final long mult = entry.gcd;
|
||||||
|
@ -484,10 +496,7 @@ public class Lucene45DocValuesProducer extends DocValuesProducer implements Clos
|
||||||
final int valueCount = (int) binaries.get(field.number).count;
|
final int valueCount = (int) binaries.get(field.number).count;
|
||||||
final BinaryDocValues binary = getBinary(field);
|
final BinaryDocValues binary = getBinary(field);
|
||||||
NumericEntry entry = ords.get(field.number);
|
NumericEntry entry = ords.get(field.number);
|
||||||
IndexInput data = this.data.clone();
|
final LongValues ordinals = getNumeric(entry);
|
||||||
data.seek(entry.offset);
|
|
||||||
final BlockPackedReader ordinals = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true);
|
|
||||||
|
|
||||||
return new SortedDocValues() {
|
return new SortedDocValues() {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -686,6 +695,8 @@ public class Lucene45DocValuesProducer extends DocValuesProducer implements Clos
|
||||||
/** packed ints blocksize */
|
/** packed ints blocksize */
|
||||||
public int blockSize;
|
public int blockSize;
|
||||||
|
|
||||||
|
int bitsRequired;
|
||||||
|
|
||||||
long minValue;
|
long minValue;
|
||||||
long gcd;
|
long gcd;
|
||||||
long table[];
|
long table[];
|
||||||
|
|
|
@ -58,13 +58,16 @@ import org.apache.lucene.store.DataOutput;
|
||||||
* @lucene.internal
|
* @lucene.internal
|
||||||
*/
|
*/
|
||||||
public final class BlockPackedWriter extends AbstractBlockPackedWriter {
|
public final class BlockPackedWriter extends AbstractBlockPackedWriter {
|
||||||
|
final float acceptableOverheadRatio;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sole constructor.
|
* Sole constructor.
|
||||||
* @param blockSize the number of values of a single block, must be a power of 2
|
* @param blockSize the number of values of a single block, must be a power of 2
|
||||||
|
* @param acceptableOverheadRatio an acceptable overhead ratio per value
|
||||||
*/
|
*/
|
||||||
public BlockPackedWriter(DataOutput out, int blockSize) {
|
public BlockPackedWriter(DataOutput out, int blockSize, float acceptableOverheadRatio) {
|
||||||
super(out, blockSize);
|
super(out, blockSize);
|
||||||
|
this.acceptableOverheadRatio = acceptableOverheadRatio;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void flush() throws IOException {
|
protected void flush() throws IOException {
|
||||||
|
@ -76,7 +79,8 @@ public final class BlockPackedWriter extends AbstractBlockPackedWriter {
|
||||||
}
|
}
|
||||||
|
|
||||||
final long delta = max - min;
|
final long delta = max - min;
|
||||||
final int bitsRequired = delta < 0 ? 64 : delta == 0L ? 0 : PackedInts.bitsRequired(delta);
|
int bitsRequired = delta < 0 ? 64 : delta == 0L ? 0 : PackedInts.bitsRequired(delta);
|
||||||
|
bitsRequired = PackedInts.fastestDirectBits(bitsRequired, acceptableOverheadRatio);
|
||||||
if (bitsRequired == 64) {
|
if (bitsRequired == 64) {
|
||||||
// no need to delta-encode
|
// no need to delta-encode
|
||||||
min = 0L;
|
min = 0L;
|
||||||
|
|
|
@ -23,11 +23,11 @@ import java.io.IOException;
|
||||||
|
|
||||||
/* Reads directly from disk on each get */
|
/* Reads directly from disk on each get */
|
||||||
class DirectPackedReader extends PackedInts.ReaderImpl {
|
class DirectPackedReader extends PackedInts.ReaderImpl {
|
||||||
private final IndexInput in;
|
final IndexInput in;
|
||||||
private final long startPointer;
|
final long startPointer;
|
||||||
private final long valueMask;
|
final long valueMask;
|
||||||
|
|
||||||
public DirectPackedReader(int bitsPerValue, int valueCount, IndexInput in) {
|
DirectPackedReader(int bitsPerValue, int valueCount, IndexInput in) {
|
||||||
super(valueCount, bitsPerValue);
|
super(valueCount, bitsPerValue);
|
||||||
this.in = in;
|
this.in = in;
|
||||||
|
|
||||||
|
@ -90,7 +90,7 @@ class DirectPackedReader extends PackedInts.ReaderImpl {
|
||||||
return (rawValue >>> shiftRightBits) & valueMask;
|
return (rawValue >>> shiftRightBits) & valueMask;
|
||||||
|
|
||||||
} catch (IOException ioe) {
|
} catch (IOException ioe) {
|
||||||
throw new IllegalStateException("failed", ioe);
|
throw new RuntimeException(ioe);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -98,4 +98,258 @@ class DirectPackedReader extends PackedInts.ReaderImpl {
|
||||||
public long ramBytesUsed() {
|
public long ramBytesUsed() {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static class DirectPackedReader1 extends DirectPackedReader {
|
||||||
|
DirectPackedReader1(int valueCount, IndexInput in) {
|
||||||
|
super(1, valueCount, in);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long get(int index) {
|
||||||
|
try {
|
||||||
|
in.seek(startPointer + (index >>> 3));
|
||||||
|
int shift = 7 - (index & 7);
|
||||||
|
return (in.readByte() >>> shift) & 0x1;
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static class DirectPackedReader2 extends DirectPackedReader {
|
||||||
|
DirectPackedReader2(int valueCount, IndexInput in) {
|
||||||
|
super(2, valueCount, in);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long get(int index) {
|
||||||
|
try {
|
||||||
|
in.seek(startPointer + (index >>> 2));
|
||||||
|
int shift = (3 - (index & 3)) << 1;
|
||||||
|
return (in.readByte() >>> shift) & 0x3;
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static class DirectPackedReader4 extends DirectPackedReader {
|
||||||
|
DirectPackedReader4(int valueCount, IndexInput in) {
|
||||||
|
super(4, valueCount, in);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long get(int index) {
|
||||||
|
try {
|
||||||
|
in.seek(startPointer + (index >>> 1));
|
||||||
|
int shift = ((index + 1) & 1) << 2;
|
||||||
|
return (in.readByte() >>> shift) & 0xF;
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static class DirectPackedReader8 extends DirectPackedReader {
|
||||||
|
DirectPackedReader8(int valueCount, IndexInput in) {
|
||||||
|
super(8, valueCount, in);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long get(int index) {
|
||||||
|
try {
|
||||||
|
in.seek(startPointer + index);
|
||||||
|
return in.readByte() & 0xFF;
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static class DirectPackedReader12 extends DirectPackedReader {
|
||||||
|
DirectPackedReader12(int valueCount, IndexInput in) {
|
||||||
|
super(12, valueCount, in);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long get(int index) {
|
||||||
|
try {
|
||||||
|
long offset = (index * 12L) >>> 3;
|
||||||
|
in.seek(startPointer + offset);
|
||||||
|
int shift = ((index + 1) & 1) << 2;
|
||||||
|
return (in.readShort() >>> shift) & 0xFFF;
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static class DirectPackedReader16 extends DirectPackedReader {
|
||||||
|
DirectPackedReader16(int valueCount, IndexInput in) {
|
||||||
|
super(16, valueCount, in);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long get(int index) {
|
||||||
|
try {
|
||||||
|
in.seek(startPointer + (index<<1));
|
||||||
|
return in.readShort() & 0xFFFF;
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static class DirectPackedReader20 extends DirectPackedReader {
|
||||||
|
DirectPackedReader20(int valueCount, IndexInput in) {
|
||||||
|
super(20, valueCount, in);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long get(int index) {
|
||||||
|
try {
|
||||||
|
long offset = (index * 20L) >>> 3;
|
||||||
|
in.seek(startPointer + offset);
|
||||||
|
int v = in.readShort() << 8 | (in.readByte() & 0xFF);
|
||||||
|
int shift = ((index + 1) & 1) << 2;
|
||||||
|
return (v >>> shift) & 0xFFFFF;
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static class DirectPackedReader24 extends DirectPackedReader {
|
||||||
|
DirectPackedReader24(int valueCount, IndexInput in) {
|
||||||
|
super(24, valueCount, in);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long get(int index) {
|
||||||
|
try {
|
||||||
|
in.seek(startPointer + (index*3));
|
||||||
|
return (in.readShort() & 0xFFFF) << 8 | (in.readByte() & 0xFF);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static class DirectPackedReader28 extends DirectPackedReader {
|
||||||
|
DirectPackedReader28(int valueCount, IndexInput in) {
|
||||||
|
super(28, valueCount, in);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long get(int index) {
|
||||||
|
try {
|
||||||
|
long offset = (index * 28L) >>> 3;
|
||||||
|
in.seek(startPointer + offset);
|
||||||
|
int shift = ((index + 1) & 1) << 2;
|
||||||
|
return (in.readInt() >>> shift) & 0xFFFFFFFL;
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static class DirectPackedReader32 extends DirectPackedReader {
|
||||||
|
DirectPackedReader32(int valueCount, IndexInput in) {
|
||||||
|
super(32, valueCount, in);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long get(int index) {
|
||||||
|
try {
|
||||||
|
in.seek(startPointer + (index<<2));
|
||||||
|
return in.readInt() & 0xFFFFFFFFL;
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static class DirectPackedReader40 extends DirectPackedReader {
|
||||||
|
DirectPackedReader40(int valueCount, IndexInput in) {
|
||||||
|
super(40, valueCount, in);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long get(int index) {
|
||||||
|
try {
|
||||||
|
in.seek(startPointer + (index*5));
|
||||||
|
return (in.readInt() & 0xFFFFFFFFL) << 8 | (in.readByte() & 0xFF);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static class DirectPackedReader48 extends DirectPackedReader {
|
||||||
|
DirectPackedReader48(int valueCount, IndexInput in) {
|
||||||
|
super(48, valueCount, in);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long get(int index) {
|
||||||
|
try {
|
||||||
|
in.seek(startPointer + (index*6));
|
||||||
|
return (in.readInt() & 0xFFFFFFFFL) << 16 | (in.readShort() & 0xFFFF);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static class DirectPackedReader56 extends DirectPackedReader {
|
||||||
|
DirectPackedReader56(int valueCount, IndexInput in) {
|
||||||
|
super(56, valueCount, in);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long get(int index) {
|
||||||
|
try {
|
||||||
|
in.seek(startPointer + (index*7));
|
||||||
|
return (in.readInt() & 0xFFFFFFFFL) << 24 | (in.readShort() & 0xFFFF) << 8 | (in.readByte() & 0xFF);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static class DirectPackedReader64 extends DirectPackedReader {
|
||||||
|
DirectPackedReader64(int valueCount, IndexInput in) {
|
||||||
|
super(64, valueCount, in);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long get(int index) {
|
||||||
|
try {
|
||||||
|
in.seek(startPointer + (index<<3));
|
||||||
|
return in.readLong();
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static DirectPackedReader getInstance(int bitsPerValue, int valueCount, IndexInput in) {
|
||||||
|
switch(bitsPerValue) {
|
||||||
|
case 1: return new DirectPackedReader1(valueCount, in);
|
||||||
|
case 2: return new DirectPackedReader2(valueCount, in);
|
||||||
|
case 4: return new DirectPackedReader4(valueCount, in);
|
||||||
|
case 8: return new DirectPackedReader8(valueCount, in);
|
||||||
|
case 12: return new DirectPackedReader12(valueCount, in);
|
||||||
|
case 16: return new DirectPackedReader16(valueCount, in);
|
||||||
|
case 20: return new DirectPackedReader20(valueCount, in);
|
||||||
|
case 24: return new DirectPackedReader24(valueCount, in);
|
||||||
|
case 28: return new DirectPackedReader28(valueCount, in);
|
||||||
|
case 32: return new DirectPackedReader32(valueCount, in);
|
||||||
|
case 40: return new DirectPackedReader40(valueCount, in);
|
||||||
|
case 48: return new DirectPackedReader48(valueCount, in);
|
||||||
|
case 56: return new DirectPackedReader56(valueCount, in);
|
||||||
|
case 64: return new DirectPackedReader64(valueCount, in);
|
||||||
|
default: return new DirectPackedReader(bitsPerValue, valueCount, in);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -50,9 +50,9 @@ public class PackedInts {
|
||||||
public static final float FAST = 0.5f;
|
public static final float FAST = 0.5f;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* At most 20% memory overhead.
|
* At most 25% memory overhead.
|
||||||
*/
|
*/
|
||||||
public static final float DEFAULT = 0.2f;
|
public static final float DEFAULT = 0.25f;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* No memory overhead at all, but the returned implementation may be slow.
|
* No memory overhead at all, but the returned implementation may be slow.
|
||||||
|
@ -282,6 +282,39 @@ public class PackedInts {
|
||||||
|
|
||||||
return new FormatAndBits(format, actualBitsPerValue);
|
return new FormatAndBits(format, actualBitsPerValue);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Try to find the number of bits per value that gives the fastest
|
||||||
|
* reader from disk whose overhead does not exceed
|
||||||
|
* <code>acceptableOverheadRatio</code>.
|
||||||
|
* </p><p>
|
||||||
|
* The <code>acceptableOverheadRatio</code> parameter makes sense for
|
||||||
|
* random-access {@link Reader}s. In case you only plan to perform
|
||||||
|
* sequential access on this stream later on, you should probably use
|
||||||
|
* {@link PackedInts#COMPACT}.
|
||||||
|
* </p>
|
||||||
|
* @param bitsPerValue the number of bits each value needs
* @param acceptableOverheadRatio the accepted overhead ratio; it is clamped to the
*        range from {@link PackedInts#COMPACT} to {@link PackedInts#FASTEST}
* @return <code>bitsPerValue</code> rounded up to a multiple of 8 when that stays
*         within the accepted overhead; otherwise, for <code>bitsPerValue</code>
*         below 32, rounded up to a multiple of 4 when that stays within it;
*         otherwise <code>bitsPerValue</code> unchanged
*/
|
||||||
|
public static int fastestDirectBits(int bitsPerValue, float acceptableOverheadRatio) {
|
||||||
|
// clamp the requested ratio so it never falls below COMPACT or exceeds FASTEST
acceptableOverheadRatio = Math.max(COMPACT, acceptableOverheadRatio);
|
||||||
|
acceptableOverheadRatio = Math.min(FASTEST, acceptableOverheadRatio);
|
||||||
|
float acceptableOverheadPerValue = acceptableOverheadRatio * bitsPerValue; // in bits
|
||||||
|
|
||||||
|
// the (int) cast drops any fractional bit of the accepted overhead
int maxBitsPerValue = bitsPerValue + (int) acceptableOverheadPerValue;
|
||||||
|
|
||||||
|
// first see if we can upgrade to byte
|
||||||
|
// (bitsPerValue + 7) & 0xF8 rounds up to the next multiple of 8
int byteAlign = (bitsPerValue + 7) & 0xF8;
|
||||||
|
if (byteAlign <= maxBitsPerValue) {
|
||||||
|
return byteAlign;
|
||||||
|
}
|
||||||
|
|
||||||
|
// otherwise try to upgrade to a nibble boundary (for numbers < 32)
|
||||||
|
// (bitsPerValue + 3) & 0xFC rounds up to the next multiple of 4
int nibbleAlign = (bitsPerValue + 3) & 0xFC;
|
||||||
|
if (bitsPerValue < 32 && nibbleAlign <= maxBitsPerValue) {
|
||||||
|
return nibbleAlign;
|
||||||
|
}
|
||||||
|
|
||||||
|
// no aligned width fits within the accepted overhead: keep the requested width
return bitsPerValue;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A decoder for packed integers.
|
* A decoder for packed integers.
|
||||||
|
@ -964,7 +997,7 @@ public class PackedInts {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
} else {
|
} else {
|
||||||
return new DirectPackedReader(bitsPerValue, valueCount, in);
|
return DirectPackedReader.getInstance(bitsPerValue, valueCount, in);
|
||||||
}
|
}
|
||||||
case PACKED_SINGLE_BLOCK:
|
case PACKED_SINGLE_BLOCK:
|
||||||
return new DirectPacked64SingleBlockReader(bitsPerValue, valueCount, in);
|
return new DirectPacked64SingleBlockReader(bitsPerValue, valueCount, in);
|
||||||
|
|
|
@ -1123,7 +1123,7 @@ public class TestPackedInts extends LuceneTestCase {
|
||||||
|
|
||||||
final Directory dir = newDirectory();
|
final Directory dir = newDirectory();
|
||||||
final IndexOutput out = dir.createOutput("out.bin", IOContext.DEFAULT);
|
final IndexOutput out = dir.createOutput("out.bin", IOContext.DEFAULT);
|
||||||
final BlockPackedWriter writer = new BlockPackedWriter(out, blockSize);
|
final BlockPackedWriter writer = new BlockPackedWriter(out, blockSize, PackedInts.COMPACT);
|
||||||
for (int i = 0; i < valueCount; ++i) {
|
for (int i = 0; i < valueCount; ++i) {
|
||||||
assertEquals(i, writer.ord());
|
assertEquals(i, writer.ord());
|
||||||
writer.add(values[i]);
|
writer.add(values[i]);
|
||||||
|
@ -1247,7 +1247,7 @@ public class TestPackedInts extends LuceneTestCase {
|
||||||
final int blockSize = 1 << TestUtil.nextInt(random(), 20, 22);
|
final int blockSize = 1 << TestUtil.nextInt(random(), 20, 22);
|
||||||
final Directory dir = newDirectory();
|
final Directory dir = newDirectory();
|
||||||
final IndexOutput out = dir.createOutput("out.bin", IOContext.DEFAULT);
|
final IndexOutput out = dir.createOutput("out.bin", IOContext.DEFAULT);
|
||||||
final BlockPackedWriter writer = new BlockPackedWriter(out, blockSize);
|
final BlockPackedWriter writer = new BlockPackedWriter(out, blockSize, PackedInts.COMPACT);
|
||||||
long value = random().nextInt() & 0xFFFFFFFFL;
|
long value = random().nextInt() & 0xFFFFFFFFL;
|
||||||
long valueOffset = TestUtil.nextLong(random(), 0, valueCount - 1);
|
long valueOffset = TestUtil.nextLong(random(), 0, valueCount - 1);
|
||||||
for (long i = 0; i < valueCount; ) {
|
for (long i = 0; i < valueCount; ) {
|
||||||
|
|
|
@ -169,7 +169,7 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer {
|
||||||
data.writeLong(gcd);
|
data.writeLong(gcd);
|
||||||
data.writeVInt(BLOCK_SIZE);
|
data.writeVInt(BLOCK_SIZE);
|
||||||
|
|
||||||
final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
|
final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE, PackedInts.COMPACT);
|
||||||
for (Number nv : values) {
|
for (Number nv : values) {
|
||||||
long value = nv == null ? 0 : nv.longValue();
|
long value = nv == null ? 0 : nv.longValue();
|
||||||
writer.add((value - minValue) / gcd);
|
writer.add((value - minValue) / gcd);
|
||||||
|
@ -181,7 +181,7 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer {
|
||||||
meta.writeVInt(PackedInts.VERSION_CURRENT);
|
meta.writeVInt(PackedInts.VERSION_CURRENT);
|
||||||
data.writeVInt(BLOCK_SIZE);
|
data.writeVInt(BLOCK_SIZE);
|
||||||
|
|
||||||
final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
|
final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE, PackedInts.COMPACT);
|
||||||
for (Number nv : values) {
|
for (Number nv : values) {
|
||||||
writer.add(nv == null ? 0 : nv.longValue());
|
writer.add(nv == null ? 0 : nv.longValue());
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue