mirror of https://github.com/apache/lucene.git
LUCENE-5797: Optimize norms merging
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1607074 13f79535-47bb-0310-9956-ffa450edef68
commit f01d562581
parent b0b4c711e8
@@ -117,6 +117,8 @@ Optimizations

 * LUCENE-5799: Optimize numeric docvalues merging. (Robert Muir)

+* LUCENE-5797: Optimize norms merging (Adrien Grand, Robert Muir)
+
 Test Framework

 * LUCENE-5786: Unflushed/ truncated events file (hung testing subprocess).
@@ -20,7 +20,7 @@ package org.apache.lucene.codecs.lucene49;
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.HashMap;
-import java.util.HashSet;
+import java.util.Map;

 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.DocValuesConsumer;
@@ -79,8 +79,7 @@ class Lucene49NormsConsumer extends DocValuesConsumer {
     long minValue = Long.MAX_VALUE;
     long maxValue = Long.MIN_VALUE;
     // TODO: more efficient?
-    HashSet<Long> uniqueValues = null;
-    uniqueValues = new HashSet<>();
+    NormMap uniqueValues = new NormMap();

     long count = 0;
     for (Number nv : values) {
@@ -94,7 +93,7 @@ class Lucene49NormsConsumer extends DocValuesConsumer {

       if (uniqueValues != null) {
         if (uniqueValues.add(v)) {
-          if (uniqueValues.size() > 256) {
+          if (uniqueValues.size > 256) {
             uniqueValues = null;
           }
         }
@@ -106,7 +105,7 @@ class Lucene49NormsConsumer extends DocValuesConsumer {
       throw new IllegalStateException("illegal norms data for field " + field.name + ", expected " + maxDoc + " values, got " + count);
     }

-    if (uniqueValues != null && uniqueValues.size() == 1) {
+    if (uniqueValues != null && uniqueValues.size == 1) {
       // 0 bpv
       meta.writeByte(CONST_COMPRESSED);
       meta.writeLong(minValue);
@@ -114,7 +113,7 @@ class Lucene49NormsConsumer extends DocValuesConsumer {
       // small number of unique values: this is the typical case:
       // we only use bpv=1,2,4,8
       PackedInts.Format format = PackedInts.Format.PACKED_SINGLE_BLOCK;
-      int bitsPerValue = PackedInts.bitsRequired(uniqueValues.size()-1);
+      int bitsPerValue = PackedInts.bitsRequired(uniqueValues.size-1);
       if (bitsPerValue == 3) {
         bitsPerValue = 4;
       } else if (bitsPerValue > 4) {
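Side note, not part of the diff: the consumer restricts itself to 1, 2, 4 or 8 bits per value, so the width returned by bitsRequired is rounded up. A minimal sketch of that selection; the widening of anything above 4 bits to a full byte is an assumption about the branch the hunk cuts off:

// Illustrative sketch only; assumes the elided else-branch widens >4 bpv to a full byte.
import org.apache.lucene.util.packed.PackedInts;

class BpvSelectionSketch {
  static int selectBitsPerValue(int uniqueCount) {      // uniqueCount <= 256 at this point
    int bpv = PackedInts.bitsRequired(uniqueCount - 1); // ordinals run 0..uniqueCount-1
    if (bpv == 3) {
      bpv = 4;       // the consumer only uses widths 1, 2, 4, 8
    } else if (bpv > 4) {
      bpv = 8;       // assumption: anything wider becomes a full byte
    }
    return bpv;      // e.g. 5 unique values -> 4 bpv, 200 unique values -> 8 bpv
  }
}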
@@ -132,15 +131,12 @@ class Lucene49NormsConsumer extends DocValuesConsumer {
       meta.writeLong(data.getFilePointer());
       data.writeVInt(PackedInts.VERSION_CURRENT);

-      Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]);
-      Arrays.sort(decode);
-      final HashMap<Long,Integer> encode = new HashMap<>();
+      long[] decode = uniqueValues.getDecodeTable();
       // upgrade to power of two sized array
       int size = 1 << bitsPerValue;
       data.writeVInt(size);
       for (int i = 0; i < decode.length; i++) {
         data.writeLong(decode[i]);
-        encode.put(decode[i], i);
       }
       for (int i = decode.length; i < size; i++) {
         data.writeLong(0);
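For concreteness, a toy walk-through of the resulting table layout (the values below are made up for illustration, not taken from the commit): with three unique norm values and bitsPerValue = 2, the data file gets the slot count as a vInt, then the real values in NormMap ordinal order, then zero padding up to a power-of-two number of slots:

// Toy illustration only, not part of the commit.
long[] decode = {0L, 120L, -3L};   // NormMap ordinals 0, 1, 2
int bitsPerValue = 2;              // PackedInts.bitsRequired(3 - 1) == 2
int size = 1 << bitsPerValue;      // 4 on-disk slots
// stream: writeVInt(4), writeLong(0), writeLong(120), writeLong(-3), writeLong(0)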
@@ -151,7 +147,7 @@ class Lucene49NormsConsumer extends DocValuesConsumer {

       final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, format, maxDoc, bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
       for(Number nv : values) {
-        writer.add(encode.get(nv.longValue()));
+        writer.add(uniqueValues.getOrd(nv.longValue()));
       }
       writer.finish();
     }
@@ -210,4 +206,66 @@ class Lucene49NormsConsumer extends DocValuesConsumer {
   public void addSortedNumericField(FieldInfo field, Iterable<Number> docToValueCount, Iterable<Number> values) throws IOException {
     throw new UnsupportedOperationException();
   }
+
+  // specialized deduplication of long->ord for norms: 99.99999% of the time this will be a single-byte range.
+  static class NormMap {
+    // we use short: at most we will add 257 values to this map before its rejected as too big above.
+    short[] singleByteRange = new short[256];
+    Map<Long,Short> other = new HashMap<Long,Short>();
+    int size;
+
+    {
+      Arrays.fill(singleByteRange, (short)-1);
+    }
+
+    /** adds an item to the mapping. returns true if actually added */
+    public boolean add(long l) {
+      assert size <= 256; // once we add > 256 values, we nullify the map in addNumericField and don't use this strategy
+      if (l >= Byte.MIN_VALUE && l <= Byte.MAX_VALUE) {
+        int index = (int) (l + 128);
+        short previous = singleByteRange[index];
+        if (previous < 0) {
+          singleByteRange[index] = (short) size;
+          size++;
+          return true;
+        } else {
+          return false;
+        }
+      } else {
+        if (!other.containsKey(l)) {
+          other.put(l, (short)size);
+          size++;
+          return true;
+        } else {
+          return false;
+        }
+      }
+    }
+
+    /** gets the ordinal for a previously added item */
+    public int getOrd(long l) {
+      if (l >= Byte.MIN_VALUE && l <= Byte.MAX_VALUE) {
+        int index = (int) (l + 128);
+        return singleByteRange[index];
+      } else {
+        // NPE if something is screwed up
+        return other.get(l);
+      }
+    }
+
+    /** retrieves the ordinal table for previously added items */
+    public long[] getDecodeTable() {
+      long decode[] = new long[size];
+      for (int i = 0; i < singleByteRange.length; i++) {
+        short s = singleByteRange[i];
+        if (s >= 0) {
+          decode[s] = i - 128;
+        }
+      }
+      for (Map.Entry<Long,Short> entry : other.entrySet()) {
+        decode[entry.getValue()] = entry.getKey();
+      }
+      return decode;
+    }
+  }
 }
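Taken together, an illustrative sketch of how addNumericField drives NormMap (not part of the commit and not runnable outside the consumer, since NormMap is a package-private nested class; the norm values are made up):

NormMap map = new NormMap();
map.add(0);       // true: first occurrence, assigned ord 0
map.add(120);     // true: ord 1
map.add(0);       // false: already mapped
map.add(-3);      // true: ord 2
long[] decode = map.getDecodeTable();  // {0, 120, -3}, written once as the lookup table
int ord = map.getOrd(120);             // 1, what the packed writer stores per document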