mirror of https://github.com/apache/lucene.git
LUCENE-7081: prefix-compress compressible fixed-width data (like InetAddress/BigInteger)
This commit is contained in:
parent
89cc676f2b
commit
162636bf05
|
@@ -411,17 +411,32 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
|
||||||
|
|
||||||
/** expert: writes a value dictionary for a sorted/sortedset field */
|
/** expert: writes a value dictionary for a sorted/sortedset field */
|
||||||
private void addTermsDict(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
|
private void addTermsDict(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
|
||||||
// first check if it's a "fixed-length" terms dict
|
// first check if it's a "fixed-length" terms dict, and compressibility if so
|
||||||
int minLength = Integer.MAX_VALUE;
|
int minLength = Integer.MAX_VALUE;
|
||||||
int maxLength = Integer.MIN_VALUE;
|
int maxLength = Integer.MIN_VALUE;
|
||||||
long numValues = 0;
|
long numValues = 0;
|
||||||
|
BytesRefBuilder previousValue = new BytesRefBuilder();
|
||||||
|
long prefixSum = 0; // only valid for fixed-width data, as we have a choice there
|
||||||
for (BytesRef v : values) {
|
for (BytesRef v : values) {
|
||||||
minLength = Math.min(minLength, v.length);
|
minLength = Math.min(minLength, v.length);
|
||||||
maxLength = Math.max(maxLength, v.length);
|
maxLength = Math.max(maxLength, v.length);
|
||||||
|
if (minLength == maxLength) {
|
||||||
|
int termPosition = (int) (numValues & INTERVAL_MASK);
|
||||||
|
if (termPosition == 0) {
|
||||||
|
// first term in block, save it away to compare against the last term later
|
||||||
|
previousValue.copyBytes(v);
|
||||||
|
} else if (termPosition == INTERVAL_COUNT - 1) {
|
||||||
|
// last term in block, accumulate shared prefix against first term
|
||||||
|
prefixSum += StringHelper.bytesDifference(previousValue.get(), v);
|
||||||
|
}
|
||||||
|
}
|
||||||
numValues++;
|
numValues++;
|
||||||
}
|
}
|
||||||
if (minLength == maxLength) {
|
// for fixed width data, look at the avg(shared prefix) before deciding how to encode:
|
||||||
// no index needed: direct addressing by mult
|
// prefix compression "costs" worst case 2 bytes per term because we must store suffix lengths.
|
||||||
|
// so if we share at least 3 bytes on average, always compress.
|
||||||
|
if (minLength == maxLength && prefixSum <= 3*(numValues >> INTERVAL_SHIFT)) {
|
||||||
|
// no index needed: not very compressible, direct addressing by mult
|
||||||
addBinaryField(field, values);
|
addBinaryField(field, values);
|
||||||
} else if (numValues < REVERSE_INTERVAL_COUNT) {
|
} else if (numValues < REVERSE_INTERVAL_COUNT) {
|
||||||
// low cardinality: waste a few KB of ram, but can't really use fancy index etc
|
// low cardinality: waste a few KB of ram, but can't really use fancy index etc
|
||||||
|
|
Loading…
Reference in New Issue