mirror of https://github.com/apache/lucene.git
LUCENE-5751: Bring MemoryDocValues up to speed
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1601929 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4f09e4d781
commit
0aa0bb19e3
|
@ -263,6 +263,8 @@ Optimizations
|
|||
* LUCENE-5750: Speed up monotonic addressing for BINARY and SORTED_SET
|
||||
docvalues. (Robert Muir)
|
||||
|
||||
* LUCENE-5751: Speed up MemoryDocValues. (Adrien Grand, Robert Muir)
|
||||
|
||||
Bug fixes
|
||||
|
||||
* LUCENE-5738: Ensure NativeFSLock prevents opening the file channel for the
|
||||
|
|
|
@ -51,9 +51,9 @@ import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.BYTES;
|
|||
import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.NUMBER;
|
||||
import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.FST;
|
||||
import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.DELTA_COMPRESSED;
|
||||
import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.BLOCK_COMPRESSED;
|
||||
import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.GCD_COMPRESSED;
|
||||
import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.TABLE_COMPRESSED;
|
||||
import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.UNCOMPRESSED;
|
||||
|
||||
/**
|
||||
* Writer for {@link MemoryDocValuesFormat}
|
||||
|
@ -93,6 +93,7 @@ class MemoryDocValuesConsumer extends DocValuesConsumer {
|
|||
meta.writeLong(data.getFilePointer());
|
||||
long minValue = Long.MAX_VALUE;
|
||||
long maxValue = Long.MIN_VALUE;
|
||||
long blockSum = 0;
|
||||
long gcd = 0;
|
||||
boolean missing = false;
|
||||
// TODO: more efficient?
|
||||
|
@ -101,6 +102,8 @@ class MemoryDocValuesConsumer extends DocValuesConsumer {
|
|||
uniqueValues = new HashSet<>();
|
||||
|
||||
long count = 0;
|
||||
long currentBlockMin = Long.MAX_VALUE;
|
||||
long currentBlockMax = Long.MIN_VALUE;
|
||||
for (Number nv : values) {
|
||||
final long v;
|
||||
if (nv == null) {
|
||||
|
@ -121,6 +124,9 @@ class MemoryDocValuesConsumer extends DocValuesConsumer {
|
|||
}
|
||||
}
|
||||
|
||||
currentBlockMin = Math.min(minValue, v);
|
||||
currentBlockMax = Math.max(maxValue, v);
|
||||
|
||||
minValue = Math.min(minValue, v);
|
||||
maxValue = Math.max(maxValue, v);
|
||||
|
||||
|
@ -133,8 +139,22 @@ class MemoryDocValuesConsumer extends DocValuesConsumer {
|
|||
}
|
||||
|
||||
++count;
|
||||
if (count % BLOCK_SIZE == 0) {
|
||||
final long blockDelta = currentBlockMax - currentBlockMin;
|
||||
final int blockDeltaRequired = blockDelta < 0 ? 64 : PackedInts.bitsRequired(blockDelta);
|
||||
final int blockBPV = PackedInts.fastestFormatAndBits(BLOCK_SIZE, blockDeltaRequired, acceptableOverheadRatio).bitsPerValue;
|
||||
blockSum += blockBPV;
|
||||
currentBlockMax = Long.MIN_VALUE;
|
||||
currentBlockMin = Long.MAX_VALUE;
|
||||
}
|
||||
}
|
||||
assert count == maxDoc;
|
||||
} else {
|
||||
for (Number nv : values) {
|
||||
long v = nv.longValue();
|
||||
maxValue = Math.max(v, maxValue);
|
||||
minValue = Math.min(v, minValue);
|
||||
}
|
||||
}
|
||||
|
||||
if (missing) {
|
||||
|
@ -145,60 +165,99 @@ class MemoryDocValuesConsumer extends DocValuesConsumer {
|
|||
} else {
|
||||
meta.writeLong(-1L);
|
||||
}
|
||||
|
||||
|
||||
final long delta = maxValue - minValue;
|
||||
final int deltaRequired = delta < 0 ? 64 : PackedInts.bitsRequired(delta);
|
||||
final FormatAndBits deltaBPV = PackedInts.fastestFormatAndBits(maxDoc, deltaRequired, acceptableOverheadRatio);
|
||||
|
||||
final FormatAndBits tableBPV;
|
||||
if (uniqueValues != null) {
|
||||
// small number of unique values
|
||||
final int bitsPerValue = PackedInts.bitsRequired(uniqueValues.size()-1);
|
||||
FormatAndBits formatAndBits = PackedInts.fastestFormatAndBits(maxDoc, bitsPerValue, acceptableOverheadRatio);
|
||||
if (formatAndBits.bitsPerValue == 8 && minValue >= Byte.MIN_VALUE && maxValue <= Byte.MAX_VALUE) {
|
||||
meta.writeByte(UNCOMPRESSED); // uncompressed
|
||||
for (Number nv : values) {
|
||||
data.writeByte(nv == null ? 0 : (byte) nv.longValue());
|
||||
}
|
||||
} else {
|
||||
meta.writeByte(TABLE_COMPRESSED); // table-compressed
|
||||
Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]);
|
||||
final HashMap<Long,Integer> encode = new HashMap<>();
|
||||
data.writeVInt(decode.length);
|
||||
for (int i = 0; i < decode.length; i++) {
|
||||
data.writeLong(decode[i]);
|
||||
encode.put(decode[i], i);
|
||||
}
|
||||
|
||||
meta.writeVInt(PackedInts.VERSION_CURRENT);
|
||||
data.writeVInt(formatAndBits.format.getId());
|
||||
data.writeVInt(formatAndBits.bitsPerValue);
|
||||
|
||||
final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, formatAndBits.format, maxDoc, formatAndBits.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
|
||||
for(Number nv : values) {
|
||||
writer.add(encode.get(nv == null ? 0 : nv.longValue()));
|
||||
}
|
||||
writer.finish();
|
||||
tableBPV = PackedInts.fastestFormatAndBits(maxDoc, PackedInts.bitsRequired(uniqueValues.size()-1), acceptableOverheadRatio);
|
||||
} else {
|
||||
tableBPV = null;
|
||||
}
|
||||
|
||||
final FormatAndBits gcdBPV;
|
||||
if (gcd != 0 && gcd != 1) {
|
||||
final long gcdDelta = (maxValue - minValue) / gcd;
|
||||
final int gcdRequired = gcdDelta < 0 ? 64 : PackedInts.bitsRequired(gcdDelta);
|
||||
gcdBPV = PackedInts.fastestFormatAndBits(maxDoc, gcdRequired, acceptableOverheadRatio);
|
||||
} else {
|
||||
gcdBPV = null;
|
||||
}
|
||||
|
||||
boolean doBlock = false;
|
||||
if (blockSum != 0) {
|
||||
int numBlocks = maxDoc / BLOCK_SIZE;
|
||||
float avgBPV = blockSum / (float)numBlocks;
|
||||
// just a heuristic, with tiny amounts of blocks our estimate is skewed as we ignore the final "incomplete" block.
|
||||
// with at least 4 blocks its pretty accurate. The difference must also be significant (according to acceptable overhead).
|
||||
if (numBlocks >= 4 && (avgBPV+avgBPV*acceptableOverheadRatio) < deltaBPV.bitsPerValue) {
|
||||
doBlock = true;
|
||||
}
|
||||
} else if (gcd != 0 && gcd != 1) {
|
||||
}
|
||||
|
||||
if (tableBPV != null && (tableBPV.bitsPerValue+tableBPV.bitsPerValue*acceptableOverheadRatio) < deltaBPV.bitsPerValue) {
|
||||
// small number of unique values
|
||||
meta.writeByte(TABLE_COMPRESSED); // table-compressed
|
||||
Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]);
|
||||
final HashMap<Long,Integer> encode = new HashMap<>();
|
||||
int length = 1 << tableBPV.bitsPerValue;
|
||||
data.writeVInt(length);
|
||||
for (int i = 0; i < decode.length; i++) {
|
||||
data.writeLong(decode[i]);
|
||||
encode.put(decode[i], i);
|
||||
}
|
||||
for (int i = decode.length; i < length; i++) {
|
||||
data.writeLong(0);
|
||||
}
|
||||
|
||||
meta.writeVInt(PackedInts.VERSION_CURRENT);
|
||||
data.writeVInt(tableBPV.format.getId());
|
||||
data.writeVInt(tableBPV.bitsPerValue);
|
||||
|
||||
final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, tableBPV.format, maxDoc, tableBPV.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
|
||||
for(Number nv : values) {
|
||||
writer.add(encode.get(nv == null ? 0 : nv.longValue()));
|
||||
}
|
||||
writer.finish();
|
||||
} else if (gcdBPV != null && (gcdBPV.bitsPerValue+gcdBPV.bitsPerValue*acceptableOverheadRatio) < deltaBPV.bitsPerValue) {
|
||||
meta.writeByte(GCD_COMPRESSED);
|
||||
meta.writeVInt(PackedInts.VERSION_CURRENT);
|
||||
data.writeLong(minValue);
|
||||
data.writeLong(gcd);
|
||||
data.writeVInt(BLOCK_SIZE);
|
||||
data.writeVInt(gcdBPV.format.getId());
|
||||
data.writeVInt(gcdBPV.bitsPerValue);
|
||||
|
||||
final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
|
||||
final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, gcdBPV.format, maxDoc, gcdBPV.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
|
||||
for (Number nv : values) {
|
||||
long value = nv == null ? 0 : nv.longValue();
|
||||
writer.add((value - minValue) / gcd);
|
||||
}
|
||||
writer.finish();
|
||||
} else {
|
||||
meta.writeByte(DELTA_COMPRESSED); // delta-compressed
|
||||
|
||||
} else if (doBlock) {
|
||||
meta.writeByte(BLOCK_COMPRESSED); // block delta-compressed
|
||||
meta.writeVInt(PackedInts.VERSION_CURRENT);
|
||||
data.writeVInt(BLOCK_SIZE);
|
||||
|
||||
final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
|
||||
for (Number nv : values) {
|
||||
writer.add(nv == null ? 0 : nv.longValue());
|
||||
}
|
||||
writer.finish();
|
||||
} else {
|
||||
meta.writeByte(DELTA_COMPRESSED); // delta-compressed
|
||||
meta.writeVInt(PackedInts.VERSION_CURRENT);
|
||||
final long minDelta = deltaBPV.bitsPerValue == 64 ? 0 : minValue;
|
||||
data.writeLong(minDelta);
|
||||
data.writeVInt(deltaBPV.format.getId());
|
||||
data.writeVInt(deltaBPV.bitsPerValue);
|
||||
|
||||
final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, deltaBPV.format, maxDoc, deltaBPV.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
|
||||
for (Number nv : values) {
|
||||
long v = nv == null ? 0 : nv.longValue();
|
||||
writer.add(v - minDelta);
|
||||
}
|
||||
writer.finish();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -89,13 +89,14 @@ class MemoryDocValuesProducer extends DocValuesProducer {
|
|||
|
||||
static final byte DELTA_COMPRESSED = 0;
|
||||
static final byte TABLE_COMPRESSED = 1;
|
||||
static final byte UNCOMPRESSED = 2;
|
||||
static final byte BLOCK_COMPRESSED = 2;
|
||||
static final byte GCD_COMPRESSED = 3;
|
||||
|
||||
static final int VERSION_START = 0;
|
||||
static final int VERSION_GCD_COMPRESSION = 1;
|
||||
static final int VERSION_CHECKSUM = 2;
|
||||
static final int VERSION_CURRENT = VERSION_CHECKSUM;
|
||||
static final int VERSION_BLOCKDETECTION = 3;
|
||||
static final int VERSION_CURRENT = VERSION_BLOCKDETECTION;
|
||||
|
||||
MemoryDocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
|
||||
maxDoc = state.segmentInfo.getDocCount();
|
||||
|
@ -162,15 +163,13 @@ class MemoryDocValuesProducer extends DocValuesProducer {
|
|||
switch(entry.format) {
|
||||
case DELTA_COMPRESSED:
|
||||
case TABLE_COMPRESSED:
|
||||
case BLOCK_COMPRESSED:
|
||||
case GCD_COMPRESSED:
|
||||
case UNCOMPRESSED:
|
||||
break;
|
||||
default:
|
||||
throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta);
|
||||
}
|
||||
if (entry.format != UNCOMPRESSED) {
|
||||
entry.packedIntsVersion = meta.readVInt();
|
||||
}
|
||||
entry.packedIntsVersion = meta.readVInt();
|
||||
numerics.put(fieldNumber, entry);
|
||||
} else if (fieldType == BYTES) {
|
||||
BinaryEntry entry = new BinaryEntry();
|
||||
|
@ -247,25 +246,28 @@ class MemoryDocValuesProducer extends DocValuesProducer {
|
|||
}
|
||||
};
|
||||
case DELTA_COMPRESSED:
|
||||
final long minDelta = data.readLong();
|
||||
final int formatIDDelta = data.readVInt();
|
||||
final int bitsPerValueDelta = data.readVInt();
|
||||
final PackedInts.Reader deltaReader = PackedInts.getReaderNoHeader(data, PackedInts.Format.byId(formatIDDelta), entry.packedIntsVersion, maxDoc, bitsPerValueDelta);
|
||||
ramBytesUsed.addAndGet(deltaReader.ramBytesUsed());
|
||||
return new NumericDocValues() {
|
||||
@Override
|
||||
public long get(int docID) {
|
||||
return minDelta + deltaReader.get(docID);
|
||||
}
|
||||
};
|
||||
case BLOCK_COMPRESSED:
|
||||
final int blockSize = data.readVInt();
|
||||
final BlockPackedReader reader = new BlockPackedReader(data, entry.packedIntsVersion, blockSize, maxDoc, false);
|
||||
ramBytesUsed.addAndGet(reader.ramBytesUsed());
|
||||
return reader;
|
||||
case UNCOMPRESSED:
|
||||
final byte bytes[] = new byte[maxDoc];
|
||||
data.readBytes(bytes, 0, bytes.length);
|
||||
ramBytesUsed.addAndGet(RamUsageEstimator.sizeOf(bytes));
|
||||
return new NumericDocValues() {
|
||||
@Override
|
||||
public long get(int docID) {
|
||||
return bytes[docID];
|
||||
}
|
||||
};
|
||||
case GCD_COMPRESSED:
|
||||
final long min = data.readLong();
|
||||
final long mult = data.readLong();
|
||||
final int quotientBlockSize = data.readVInt();
|
||||
final BlockPackedReader quotientReader = new BlockPackedReader(data, entry.packedIntsVersion, quotientBlockSize, maxDoc, false);
|
||||
final int formatIDGCD = data.readVInt();
|
||||
final int bitsPerValueGCD = data.readVInt();
|
||||
final PackedInts.Reader quotientReader = PackedInts.getReaderNoHeader(data, PackedInts.Format.byId(formatIDGCD), entry.packedIntsVersion, maxDoc, bitsPerValueGCD);
|
||||
ramBytesUsed.addAndGet(quotientReader.ramBytesUsed());
|
||||
return new NumericDocValues() {
|
||||
@Override
|
||||
|
|
|
@ -150,13 +150,15 @@ public class Lucene49DocValuesConsumer extends DocValuesConsumer implements Clos
|
|||
}
|
||||
|
||||
final long delta = maxValue - minValue;
|
||||
final int deltaBitsRequired = delta < 0 ? 64 : DirectWriter.bitsRequired(delta);
|
||||
|
||||
final int format;
|
||||
if (uniqueValues != null
|
||||
&& (delta < 0L || PackedInts.bitsRequired(uniqueValues.size() - 1) < PackedInts.bitsRequired(delta))) {
|
||||
if (uniqueValues != null && DirectWriter.bitsRequired(uniqueValues.size() - 1) < deltaBitsRequired) {
|
||||
format = TABLE_COMPRESSED;
|
||||
} else if (gcd != 0 && gcd != 1) {
|
||||
format = GCD_COMPRESSED;
|
||||
final long gcdDelta = (maxValue - minValue) / gcd;
|
||||
final long gcdBitsRequired = gcdDelta < 0 ? 64 : DirectWriter.bitsRequired(gcdDelta);
|
||||
format = gcdBitsRequired < deltaBitsRequired ? GCD_COMPRESSED : DELTA_COMPRESSED;
|
||||
} else {
|
||||
format = DELTA_COMPRESSED;
|
||||
}
|
||||
|
@ -189,9 +191,8 @@ public class Lucene49DocValuesConsumer extends DocValuesConsumer implements Clos
|
|||
case DELTA_COMPRESSED:
|
||||
final long minDelta = delta < 0 ? 0 : minValue;
|
||||
meta.writeLong(minDelta);
|
||||
final int bpv = delta < 0 ? 64 : DirectWriter.bitsRequired(delta);
|
||||
meta.writeVInt(bpv);
|
||||
final DirectWriter writer = DirectWriter.getInstance(data, count, bpv);
|
||||
meta.writeVInt(deltaBitsRequired);
|
||||
final DirectWriter writer = DirectWriter.getInstance(data, count, deltaBitsRequired);
|
||||
for (Number nv : values) {
|
||||
long v = nv == null ? 0 : nv.longValue();
|
||||
writer.add(v - minDelta);
|
||||
|
|
Loading…
Reference in New Issue