LUCENE-5751: Bring MemoryDocValues up to speed

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1601929 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2014-06-11 15:35:25 +00:00
parent 4f09e4d781
commit 0aa0bb19e3
4 changed files with 124 additions and 60 deletions

lucene/CHANGES.txt

@@ -263,6 +263,8 @@ Optimizations
 * LUCENE-5750: Speed up monotonic addressing for BINARY and SORTED_SET
   docvalues. (Robert Muir)
 
+* LUCENE-5751: Speed up MemoryDocValues. (Adrien Grand, Robert Muir)
+
 Bug fixes
 
 * LUCENE-5738: Ensure NativeFSLock prevents opening the file channel for the

lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesConsumer.java

@@ -51,9 +51,9 @@ import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.BYTES;
 import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.NUMBER;
 import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.FST;
 import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.DELTA_COMPRESSED;
+import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.BLOCK_COMPRESSED;
 import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.GCD_COMPRESSED;
 import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.TABLE_COMPRESSED;
-import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.UNCOMPRESSED;
 
 /**
  * Writer for {@link MemoryDocValuesFormat}
@@ -93,6 +93,7 @@ class MemoryDocValuesConsumer extends DocValuesConsumer {
     meta.writeLong(data.getFilePointer());
     long minValue = Long.MAX_VALUE;
     long maxValue = Long.MIN_VALUE;
+    long blockSum = 0;
     long gcd = 0;
     boolean missing = false;
     // TODO: more efficient?
@@ -101,6 +102,8 @@ class MemoryDocValuesConsumer extends DocValuesConsumer {
       uniqueValues = new HashSet<>();
 
       long count = 0;
+      long currentBlockMin = Long.MAX_VALUE;
+      long currentBlockMax = Long.MIN_VALUE;
       for (Number nv : values) {
         final long v;
         if (nv == null) {
@@ -121,6 +124,9 @@ class MemoryDocValuesConsumer extends DocValuesConsumer {
           }
         }
+        currentBlockMin = Math.min(currentBlockMin, v);
+        currentBlockMax = Math.max(currentBlockMax, v);
+
         minValue = Math.min(minValue, v);
         maxValue = Math.max(maxValue, v);
@@ -133,8 +139,22 @@ class MemoryDocValuesConsumer extends DocValuesConsumer {
         }
         ++count;
+        if (count % BLOCK_SIZE == 0) {
+          final long blockDelta = currentBlockMax - currentBlockMin;
+          final int blockDeltaRequired = blockDelta < 0 ? 64 : PackedInts.bitsRequired(blockDelta);
+          final int blockBPV = PackedInts.fastestFormatAndBits(BLOCK_SIZE, blockDeltaRequired, acceptableOverheadRatio).bitsPerValue;
+          blockSum += blockBPV;
+          currentBlockMax = Long.MIN_VALUE;
+          currentBlockMin = Long.MAX_VALUE;
+        }
       }
       assert count == maxDoc;
+    } else {
+      for (Number nv : values) {
+        long v = nv.longValue();
+        maxValue = Math.max(v, maxValue);
+        minValue = Math.min(v, minValue);
+      }
     }
 
     if (missing) {
@@ -145,60 +165,99 @@ class MemoryDocValuesConsumer extends DocValuesConsumer {
     } else {
       meta.writeLong(-1L);
     }
 
+    final long delta = maxValue - minValue;
+    final int deltaRequired = delta < 0 ? 64 : PackedInts.bitsRequired(delta);
+    final FormatAndBits deltaBPV = PackedInts.fastestFormatAndBits(maxDoc, deltaRequired, acceptableOverheadRatio);
+
+    final FormatAndBits tableBPV;
     if (uniqueValues != null) {
-      // small number of unique values
-      final int bitsPerValue = PackedInts.bitsRequired(uniqueValues.size()-1);
-      FormatAndBits formatAndBits = PackedInts.fastestFormatAndBits(maxDoc, bitsPerValue, acceptableOverheadRatio);
-      if (formatAndBits.bitsPerValue == 8 && minValue >= Byte.MIN_VALUE && maxValue <= Byte.MAX_VALUE) {
-        meta.writeByte(UNCOMPRESSED); // uncompressed
-        for (Number nv : values) {
-          data.writeByte(nv == null ? 0 : (byte) nv.longValue());
-        }
-      } else {
-        meta.writeByte(TABLE_COMPRESSED); // table-compressed
-        Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]);
-        final HashMap<Long,Integer> encode = new HashMap<>();
-        data.writeVInt(decode.length);
-        for (int i = 0; i < decode.length; i++) {
-          data.writeLong(decode[i]);
-          encode.put(decode[i], i);
-        }
-        meta.writeVInt(PackedInts.VERSION_CURRENT);
-        data.writeVInt(formatAndBits.format.getId());
-        data.writeVInt(formatAndBits.bitsPerValue);
-        final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, formatAndBits.format, maxDoc, formatAndBits.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
-        for(Number nv : values) {
-          writer.add(encode.get(nv == null ? 0 : nv.longValue()));
-        }
-        writer.finish();
+      tableBPV = PackedInts.fastestFormatAndBits(maxDoc, PackedInts.bitsRequired(uniqueValues.size()-1), acceptableOverheadRatio);
+    } else {
+      tableBPV = null;
+    }
+
+    final FormatAndBits gcdBPV;
+    if (gcd != 0 && gcd != 1) {
+      final long gcdDelta = (maxValue - minValue) / gcd;
+      final int gcdRequired = gcdDelta < 0 ? 64 : PackedInts.bitsRequired(gcdDelta);
+      gcdBPV = PackedInts.fastestFormatAndBits(maxDoc, gcdRequired, acceptableOverheadRatio);
+    } else {
+      gcdBPV = null;
+    }
+
+    boolean doBlock = false;
+    if (blockSum != 0) {
+      int numBlocks = maxDoc / BLOCK_SIZE;
+      float avgBPV = blockSum / (float)numBlocks;
+      // just a heuristic: with a tiny number of blocks our estimate is skewed, as we ignore the final "incomplete" block.
+      // with at least 4 blocks it's pretty accurate. The difference must also be significant (according to the acceptable overhead).
+      if (numBlocks >= 4 && (avgBPV+avgBPV*acceptableOverheadRatio) < deltaBPV.bitsPerValue) {
+        doBlock = true;
       }
-    } else if (gcd != 0 && gcd != 1) {
+    }
+
+    if (tableBPV != null && (tableBPV.bitsPerValue+tableBPV.bitsPerValue*acceptableOverheadRatio) < deltaBPV.bitsPerValue) {
+      // small number of unique values
+      meta.writeByte(TABLE_COMPRESSED); // table-compressed
+      Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]);
+      final HashMap<Long,Integer> encode = new HashMap<>();
+      int length = 1 << tableBPV.bitsPerValue;
+      data.writeVInt(length);
+      for (int i = 0; i < decode.length; i++) {
+        data.writeLong(decode[i]);
+        encode.put(decode[i], i);
+      }
+      for (int i = decode.length; i < length; i++) {
+        data.writeLong(0);
+      }
+      meta.writeVInt(PackedInts.VERSION_CURRENT);
+      data.writeVInt(tableBPV.format.getId());
+      data.writeVInt(tableBPV.bitsPerValue);
+      final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, tableBPV.format, maxDoc, tableBPV.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
+      for(Number nv : values) {
+        writer.add(encode.get(nv == null ? 0 : nv.longValue()));
+      }
+      writer.finish();
+    } else if (gcdBPV != null && (gcdBPV.bitsPerValue+gcdBPV.bitsPerValue*acceptableOverheadRatio) < deltaBPV.bitsPerValue) {
       meta.writeByte(GCD_COMPRESSED);
       meta.writeVInt(PackedInts.VERSION_CURRENT);
       data.writeLong(minValue);
       data.writeLong(gcd);
-      data.writeVInt(BLOCK_SIZE);
+      data.writeVInt(gcdBPV.format.getId());
+      data.writeVInt(gcdBPV.bitsPerValue);
-      final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
+      final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, gcdBPV.format, maxDoc, gcdBPV.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
       for (Number nv : values) {
         long value = nv == null ? 0 : nv.longValue();
         writer.add((value - minValue) / gcd);
       }
       writer.finish();
-    } else {
-      meta.writeByte(DELTA_COMPRESSED); // delta-compressed
+    } else if (doBlock) {
+      meta.writeByte(BLOCK_COMPRESSED); // block delta-compressed
       meta.writeVInt(PackedInts.VERSION_CURRENT);
       data.writeVInt(BLOCK_SIZE);
       final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
       for (Number nv : values) {
         writer.add(nv == null ? 0 : nv.longValue());
       }
       writer.finish();
+    } else {
+      meta.writeByte(DELTA_COMPRESSED); // delta-compressed
+      meta.writeVInt(PackedInts.VERSION_CURRENT);
+      final long minDelta = deltaBPV.bitsPerValue == 64 ? 0 : minValue;
+      data.writeLong(minDelta);
+      data.writeVInt(deltaBPV.format.getId());
+      data.writeVInt(deltaBPV.bitsPerValue);
+      final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, deltaBPV.format, maxDoc, deltaBPV.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
+      for (Number nv : values) {
+        long v = nv == null ? 0 : nv.longValue();
+        writer.add(v - minDelta);
+      }
+      writer.finish();
     }
   }
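The hunk above is the heart of the change: while scanning, the writer tracks a per-block min/max alongside the global min/max, sums the bits each complete block would need, and only picks per-block encoding when the average is clearly cheaper than one segment-wide delta. Below is a standalone sketch of that decision, not the patch itself: it assumes a plain long[] input, a hand-rolled bitsRequired in place of PackedInts.fastestFormatAndBits, and an illustrative BLOCK_SIZE of 4096 (the patch takes BLOCK_SIZE from the format), so names and numbers are approximations.

    // Sketch of the block-detection heuristic under the assumptions above.
    class BlockDetectionSketch {
      static final int BLOCK_SIZE = 4096; // assumed value, for illustration only

      static int bitsRequired(long maxValue) {
        return Math.max(1, 64 - Long.numberOfLeadingZeros(maxValue));
      }

      static boolean shouldUseBlocks(long[] values, float acceptableOverheadRatio) {
        long min = Long.MAX_VALUE, max = Long.MIN_VALUE;
        long blockMin = Long.MAX_VALUE, blockMax = Long.MIN_VALUE;
        long blockSum = 0; // total bits-per-value over all complete blocks
        int count = 0;
        for (long v : values) {
          blockMin = Math.min(blockMin, v);
          blockMax = Math.max(blockMax, v);
          min = Math.min(min, v);
          max = Math.max(max, v);
          if (++count % BLOCK_SIZE == 0) {
            long blockDelta = blockMax - blockMin;
            blockSum += blockDelta < 0 ? 64 : bitsRequired(blockDelta);
            blockMin = Long.MAX_VALUE; // reset the per-block bounds
            blockMax = Long.MIN_VALUE;
          }
        }
        long delta = max - min;
        int deltaBPV = delta < 0 ? 64 : bitsRequired(delta); // delta < 0 means overflow
        int numBlocks = values.length / BLOCK_SIZE;
        // the estimate ignores the trailing incomplete block, so require at
        // least 4 complete blocks before trusting it
        if (numBlocks < 4) {
          return false;
        }
        float avgBPV = blockSum / (float) numBlocks;
        // the win must also clear the acceptable overhead to be worth it
        return avgBPV + avgBPV * acceptableOverheadRatio < deltaBPV;
      }
    }

On data like steadily increasing timestamps this fires readily: the segment-wide delta can need 40+ bits while each block of 4096 adjacent values spans only a small range.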

lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java

@@ -89,13 +89,14 @@ class MemoryDocValuesProducer extends DocValuesProducer {
   static final byte DELTA_COMPRESSED = 0;
   static final byte TABLE_COMPRESSED = 1;
-  static final byte UNCOMPRESSED = 2;
+  static final byte BLOCK_COMPRESSED = 2;
   static final byte GCD_COMPRESSED = 3;
 
   static final int VERSION_START = 0;
   static final int VERSION_GCD_COMPRESSION = 1;
   static final int VERSION_CHECKSUM = 2;
-  static final int VERSION_CURRENT = VERSION_CHECKSUM;
+  static final int VERSION_BLOCKDETECTION = 3;
+  static final int VERSION_CURRENT = VERSION_BLOCKDETECTION;
 
   MemoryDocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
     maxDoc = state.segmentInfo.getDocCount();
@@ -162,15 +163,13 @@ class MemoryDocValuesProducer extends DocValuesProducer {
         switch(entry.format) {
           case DELTA_COMPRESSED:
           case TABLE_COMPRESSED:
+          case BLOCK_COMPRESSED:
           case GCD_COMPRESSED:
-          case UNCOMPRESSED:
             break;
           default:
             throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta);
         }
-        if (entry.format != UNCOMPRESSED) {
-          entry.packedIntsVersion = meta.readVInt();
-        }
+        entry.packedIntsVersion = meta.readVInt();
         numerics.put(fieldNumber, entry);
       } else if (fieldType == BYTES) {
         BinaryEntry entry = new BinaryEntry();
@@ -247,25 +246,28 @@ class MemoryDocValuesProducer extends DocValuesProducer {
           }
         };
       case DELTA_COMPRESSED:
+        final long minDelta = data.readLong();
+        final int formatIDDelta = data.readVInt();
+        final int bitsPerValueDelta = data.readVInt();
+        final PackedInts.Reader deltaReader = PackedInts.getReaderNoHeader(data, PackedInts.Format.byId(formatIDDelta), entry.packedIntsVersion, maxDoc, bitsPerValueDelta);
+        ramBytesUsed.addAndGet(deltaReader.ramBytesUsed());
+        return new NumericDocValues() {
+          @Override
+          public long get(int docID) {
+            return minDelta + deltaReader.get(docID);
+          }
+        };
+      case BLOCK_COMPRESSED:
         final int blockSize = data.readVInt();
         final BlockPackedReader reader = new BlockPackedReader(data, entry.packedIntsVersion, blockSize, maxDoc, false);
         ramBytesUsed.addAndGet(reader.ramBytesUsed());
         return reader;
-      case UNCOMPRESSED:
-        final byte bytes[] = new byte[maxDoc];
-        data.readBytes(bytes, 0, bytes.length);
-        ramBytesUsed.addAndGet(RamUsageEstimator.sizeOf(bytes));
-        return new NumericDocValues() {
-          @Override
-          public long get(int docID) {
-            return bytes[docID];
-          }
-        };
       case GCD_COMPRESSED:
         final long min = data.readLong();
         final long mult = data.readLong();
-        final int quotientBlockSize = data.readVInt();
-        final BlockPackedReader quotientReader = new BlockPackedReader(data, entry.packedIntsVersion, quotientBlockSize, maxDoc, false);
+        final int formatIDGCD = data.readVInt();
+        final int bitsPerValueGCD = data.readVInt();
+        final PackedInts.Reader quotientReader = PackedInts.getReaderNoHeader(data, PackedInts.Format.byId(formatIDGCD), entry.packedIntsVersion, maxDoc, bitsPerValueGCD);
         ramBytesUsed.addAndGet(quotientReader.ramBytesUsed());
         return new NumericDocValues() {
           @Override
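This is the read side of the same change: DELTA_COMPRESSED becomes a random-access packed array offset by minDelta, the old BlockPackedReader path lives on under BLOCK_COMPRESSED, and GCD_COMPRESSED scales packed quotients back up by the common divisor. A simplified model of the two rewritten decode paths follows, with plain arrays standing in for PackedInts.Reader; the class and method names are illustrative, not Lucene's.

    // Simplified model of the rewritten numeric decode paths.
    final class NumericDecodeSketch {
      // DELTA_COMPRESSED: random-access packed ints, offset by minDelta
      static long deltaDecode(long minDelta, long[] packed, int docID) {
        return minDelta + packed[docID];
      }

      // GCD_COMPRESSED: packed quotients scaled back up by the common divisor
      static long gcdDecode(long min, long gcd, long[] quotients, int docID) {
        return min + gcd * quotients[docID];
      }

      public static void main(String[] args) {
        // e.g. day-granularity timestamps share the divisor 86_400_000 (ms per day)
        long[] days = {0, 1, 2, 7};
        System.out.println(gcdDecode(1_000_000_000_000L, 86_400_000L, days, 3));
        // prints 1000604800000
      }
    }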

lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49DocValuesConsumer.java

@@ -150,13 +150,15 @@ public class Lucene49DocValuesConsumer extends DocValuesConsumer implements Closeable {
     }
 
     final long delta = maxValue - minValue;
+    final int deltaBitsRequired = delta < 0 ? 64 : DirectWriter.bitsRequired(delta);
     final int format;
-    if (uniqueValues != null
-        && (delta < 0L || PackedInts.bitsRequired(uniqueValues.size() - 1) < PackedInts.bitsRequired(delta))) {
+    if (uniqueValues != null && DirectWriter.bitsRequired(uniqueValues.size() - 1) < deltaBitsRequired) {
       format = TABLE_COMPRESSED;
     } else if (gcd != 0 && gcd != 1) {
-      format = GCD_COMPRESSED;
+      final long gcdDelta = (maxValue - minValue) / gcd;
+      final long gcdBitsRequired = gcdDelta < 0 ? 64 : DirectWriter.bitsRequired(gcdDelta);
+      format = gcdBitsRequired < deltaBitsRequired ? GCD_COMPRESSED : DELTA_COMPRESSED;
     } else {
       format = DELTA_COMPRESSED;
     }
@@ -189,9 +191,8 @@ public class Lucene49DocValuesConsumer extends DocValuesConsumer implements Closeable {
       case DELTA_COMPRESSED:
         final long minDelta = delta < 0 ? 0 : minValue;
         meta.writeLong(minDelta);
-        final int bpv = delta < 0 ? 64 : DirectWriter.bitsRequired(delta);
-        meta.writeVInt(bpv);
-        final DirectWriter writer = DirectWriter.getInstance(data, count, bpv);
+        meta.writeVInt(deltaBitsRequired);
+        final DirectWriter writer = DirectWriter.getInstance(data, count, deltaBitsRequired);
         for (Number nv : values) {
           long v = nv == null ? 0 : nv.longValue();
           writer.add(v - minDelta);
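The last two hunks tighten format selection in the 4.9 consumer the same way: TABLE_COMPRESSED and GCD_COMPRESSED must now actually beat plain delta packing in bits per value, instead of being chosen whenever they apply. A sketch of that selection logic under stated assumptions: the constant values and bitsRequired below are stand-ins for illustration, not the actual Lucene49 constants or DirectWriter.bitsRequired.

    // Sketch of the tightened format choice; constants are illustrative only.
    final class FormatChoiceSketch {
      static final int DELTA_COMPRESSED = 0, TABLE_COMPRESSED = 1, GCD_COMPRESSED = 2;

      static int bitsRequired(long v) {
        return Math.max(1, 64 - Long.numberOfLeadingZeros(v));
      }

      static int pickFormat(long minValue, long maxValue, int uniqueCount, long gcd) {
        final long delta = maxValue - minValue;
        final int deltaBits = delta < 0 ? 64 : bitsRequired(delta); // delta < 0: overflow
        if (uniqueCount > 0 && bitsRequired(uniqueCount - 1) < deltaBits) {
          return TABLE_COMPRESSED; // table ordinals are strictly narrower
        }
        if (gcd != 0 && gcd != 1) {
          final long gcdDelta = delta / gcd;
          final int gcdBits = gcdDelta < 0 ? 64 : bitsRequired(gcdDelta);
          if (gcdBits < deltaBits) {
            return GCD_COMPRESSED; // dividing out the gcd really saves bits
          }
        }
        return DELTA_COMPRESSED;
      }
    }

Worked numbers: for 365 day-granularity timestamps in milliseconds, delta is 364 * 86,400,000, about 3.1e10, needing 35 bits, while the gcd quotient delta is 364, needing 9 bits, so GCD_COMPRESSED wins; and with at most 256 unique values the table ordinals need at most 8 bits.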