LUCENE-5797: Optimize norms merging

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1607074 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2014-07-01 13:37:15 +00:00
parent b0b4c711e8
commit f01d562581
2 changed files with 71 additions and 11 deletions

lucene/CHANGES.txt

@@ -117,6 +117,8 @@ Optimizations
 
 * LUCENE-5799: Optimize numeric docvalues merging. (Robert Muir)
 
+* LUCENE-5797: Optimize norms merging (Adrien Grand, Robert Muir)
+
 Test Framework
 
 * LUCENE-5786: Unflushed/ truncated events file (hung testing subprocess).

lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49NormsConsumer.java

@@ -20,7 +20,7 @@ package org.apache.lucene.codecs.lucene49;
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.HashMap;
-import java.util.HashSet;
+import java.util.Map;
 
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.DocValuesConsumer;
@@ -79,8 +79,7 @@ class Lucene49NormsConsumer extends DocValuesConsumer {
     long minValue = Long.MAX_VALUE;
     long maxValue = Long.MIN_VALUE;
     // TODO: more efficient?
-    HashSet<Long> uniqueValues = null;
-    uniqueValues = new HashSet<>();
+    NormMap uniqueValues = new NormMap();
 
     long count = 0;
     for (Number nv : values) {
@@ -94,7 +93,7 @@ class Lucene49NormsConsumer extends DocValuesConsumer {
 
       if (uniqueValues != null) {
         if (uniqueValues.add(v)) {
-          if (uniqueValues.size() > 256) {
+          if (uniqueValues.size > 256) {
             uniqueValues = null;
           }
         }
@@ -106,7 +105,7 @@ class Lucene49NormsConsumer extends DocValuesConsumer {
       throw new IllegalStateException("illegal norms data for field " + field.name + ", expected " + maxDoc + " values, got " + count);
     }
 
-    if (uniqueValues != null && uniqueValues.size() == 1) {
+    if (uniqueValues != null && uniqueValues.size == 1) {
       // 0 bpv
       meta.writeByte(CONST_COMPRESSED);
       meta.writeLong(minValue);
@@ -114,7 +113,7 @@ class Lucene49NormsConsumer extends DocValuesConsumer {
       // small number of unique values: this is the typical case:
       // we only use bpv=1,2,4,8
       PackedInts.Format format = PackedInts.Format.PACKED_SINGLE_BLOCK;
-      int bitsPerValue = PackedInts.bitsRequired(uniqueValues.size()-1);
+      int bitsPerValue = PackedInts.bitsRequired(uniqueValues.size-1);
       if (bitsPerValue == 3) {
         bitsPerValue = 4;
       } else if (bitsPerValue > 4) {
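
A note on the rounding above: ordinals 0..size-1 need PackedInts.bitsRequired(size-1) bits, and the writer then rounds that up so only bpv = 1, 2, 4, or 8 is ever used with PACKED_SINGLE_BLOCK (power-of-two widths that divide a 64-bit block evenly). A standalone sketch of the same arithmetic, with PackedInts.bitsRequired inlined and selectBitsPerValue a hypothetical name, not part of this patch:

    // mirrors the bpv selection above; assumes 2 <= uniqueCount <= 256
    static int selectBitsPerValue(int uniqueCount) {
      // PackedInts.bitsRequired(uniqueCount - 1): bits to hold the largest ordinal, at least 1
      int bpv = Math.max(1, 64 - Long.numberOfLeadingZeros(uniqueCount - 1));
      if (bpv == 3) {
        bpv = 4;   // 3 bpv is skipped: round up to 4
      } else if (bpv > 4) {
        bpv = 8;   // 5..8 bpv all widen to a full byte
      }
      return bpv;
    }

For example, 10 unique norm values give bitsRequired(9) = 4, while 200 unique values give bitsRequired(199) = 8; past 256 unique values, uniqueValues was already nulled out above and a different encoding path is taken.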
@@ -132,15 +131,12 @@ class Lucene49NormsConsumer extends DocValuesConsumer {
         meta.writeLong(data.getFilePointer());
         data.writeVInt(PackedInts.VERSION_CURRENT);
 
-        Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]);
-        Arrays.sort(decode);
-        final HashMap<Long,Integer> encode = new HashMap<>();
+        long[] decode = uniqueValues.getDecodeTable();
         // upgrade to power of two sized array
         int size = 1 << bitsPerValue;
         data.writeVInt(size);
         for (int i = 0; i < decode.length; i++) {
           data.writeLong(decode[i]);
-          encode.put(decode[i], i);
         }
         for (int i = decode.length; i < size; i++) {
           data.writeLong(0);
@@ -151,7 +147,7 @@ class Lucene49NormsConsumer extends DocValuesConsumer {
 
         final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, format, maxDoc, bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
         for(Number nv : values) {
-          writer.add(encode.get(nv.longValue()));
+          writer.add(uniqueValues.getOrd(nv.longValue()));
         }
         writer.finish();
       }
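
On the read side, the table-compressed layout written here is decoded by looking each document's packed ordinal back up in the table; conceptually (a hypothetical reader-side sketch, not code from this patch):

    // per-document packed ordinal -> norm value
    long norm = decode[(int) ordReader.get(docID)];

where ordReader stands for the PackedInts reader over the maxDoc ordinals written above and decode is the table of size 1 << bitsPerValue.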
@@ -210,4 +206,66 @@ class Lucene49NormsConsumer extends DocValuesConsumer {
   public void addSortedNumericField(FieldInfo field, Iterable<Number> docToValueCount, Iterable<Number> values) throws IOException {
     throw new UnsupportedOperationException();
   }
+
+  // specialized deduplication of long->ord for norms: 99.99999% of the time this will be a single-byte range.
+  static class NormMap {
+    // we use short: at most we will add 257 values to this map before its rejected as too big above.
+    short[] singleByteRange = new short[256];
+    Map<Long,Short> other = new HashMap<Long,Short>();
+    int size;
+
+    {
+      Arrays.fill(singleByteRange, (short)-1);
+    }
+
+    /** adds an item to the mapping. returns true if actually added */
+    public boolean add(long l) {
+      assert size <= 256; // once we add > 256 values, we nullify the map in addNumericField and don't use this strategy
+      if (l >= Byte.MIN_VALUE && l <= Byte.MAX_VALUE) {
+        int index = (int) (l + 128);
+        short previous = singleByteRange[index];
+        if (previous < 0) {
+          singleByteRange[index] = (short) size;
+          size++;
+          return true;
+        } else {
+          return false;
+        }
+      } else {
+        if (!other.containsKey(l)) {
+          other.put(l, (short)size);
+          size++;
+          return true;
+        } else {
+          return false;
+        }
+      }
+    }
+
+    /** gets the ordinal for a previously added item */
+    public int getOrd(long l) {
+      if (l >= Byte.MIN_VALUE && l <= Byte.MAX_VALUE) {
+        int index = (int) (l + 128);
+        return singleByteRange[index];
+      } else {
+        // NPE if something is screwed up
+        return other.get(l);
+      }
+    }
+
+    /** retrieves the ordinal table for previously added items */
+    public long[] getDecodeTable() {
+      long decode[] = new long[size];
+      for (int i = 0; i < singleByteRange.length; i++) {
+        short s = singleByteRange[i];
+        if (s >= 0) {
+          decode[s] = i - 128;
+        }
+      }
+      for (Map.Entry<Long,Short> entry : other.entrySet()) {
+        decode[entry.getValue()] = entry.getKey();
+      }
+      return decode;
+    }
+  }
 }
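
To make the NormMap contract concrete, a hypothetical driver (not part of the patch) exercising the class exactly as added above:

    NormMap map = new NormMap();
    System.out.println(map.add(42L));     // true:  ord 0, single-byte fast path (array lookup, no boxing)
    System.out.println(map.add(42L));     // false: duplicate, ord already assigned
    System.out.println(map.add(-1L));     // true:  ord 1
    System.out.println(map.add(1000L));   // true:  ord 2, HashMap fallback (outside [-128, 127])
    System.out.println(map.getOrd(1000L));                      // 2
    System.out.println(Arrays.toString(map.getDecodeTable()));  // [42, -1, 1000]

Unlike the HashSet<Long> it replaces, NormMap assigns ordinals at insertion time, so the write path no longer needs the Arrays.sort call and the separate encode HashMap, and the overwhelmingly common single-byte case never boxes; the decode table comes back indexed by ordinal in first-seen order rather than sorted by value.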