From 162636bf05b5b6b35a79bacd2e7440830b05960f Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 10 Mar 2016 07:25:48 -0500 Subject: [PATCH] LUCENE-7081: prefix-compress compressible fixed-width data (like InetAddress/BigInteger) --- .../lucene54/Lucene54DocValuesConsumer.java | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesConsumer.java index 858c54b362f..96acfd25b5a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesConsumer.java @@ -411,17 +411,32 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close /** expert: writes a value dictionary for a sorted/sortedset field */ private void addTermsDict(FieldInfo field, final Iterable values) throws IOException { - // first check if it's a "fixed-length" terms dict + // first check if it's a "fixed-length" terms dict, and compressibility if so int minLength = Integer.MAX_VALUE; int maxLength = Integer.MIN_VALUE; long numValues = 0; + BytesRefBuilder previousValue = new BytesRefBuilder(); + long prefixSum = 0; // only valid for fixed-width data, as we have a choice there for (BytesRef v : values) { minLength = Math.min(minLength, v.length); maxLength = Math.max(maxLength, v.length); + if (minLength == maxLength) { + int termPosition = (int) (numValues & INTERVAL_MASK); + if (termPosition == 0) { + // first term in block, save it away to compare against the last term later + previousValue.copyBytes(v); + } else if (termPosition == INTERVAL_COUNT - 1) { + // last term in block, accumulate shared prefix against first term + prefixSum += StringHelper.bytesDifference(previousValue.get(), v); + } + } numValues++; } - if (minLength == maxLength) { - // no index needed: direct addressing by mult + // for fixed width data, look at the avg(shared prefix) before deciding how to encode: + // prefix compression "costs" worst case 2 bytes per term because we must store suffix lengths. + // so if we share at least 3 bytes on average, always compress. + if (minLength == maxLength && prefixSum <= 3*(numValues >> INTERVAL_SHIFT)) { + // no index needed: not very compressible, direct addressing by mult addBinaryField(field, values); } else if (numValues < REVERSE_INTERVAL_COUNT) { // low cardinality: waste a few KB of ram, but can't really use fancy index etc