mirror of https://github.com/apache/lucene.git
LUCENE-5721: Monotonic compression doesn't use zig-zag encoding anymore.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1600694 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
8f267c8560
commit
b31ae698e5
|
@ -254,6 +254,9 @@ Optimizations
|
|||
* LUCENE-5703: BinaryDocValues producers don't allocate or copy bytes on
|
||||
each access anymore. (Adrien Grand)
|
||||
|
||||
* LUCENE-5721: Monotonic compression doesn't use zig-zag encoding anymore.
|
||||
(Robert Muir, Adrien Grand)
|
||||
|
||||
Bug fixes
|
||||
|
||||
* LUCENE-5673: MMapDirectory: Work around a "bug" in the JDK that throws
|
||||
|
|
|
@ -256,10 +256,10 @@ public class FixedGapTermsIndexReader extends TermsIndexReaderBase {
|
|||
termBytes.copy(clone, numTermBytes);
|
||||
|
||||
// records offsets into main terms dict file
|
||||
termsDictOffsets = new MonotonicBlockPackedReader(clone, packedIntsVersion, blocksize, numIndexTerms, false);
|
||||
termsDictOffsets = MonotonicBlockPackedReader.of(clone, packedIntsVersion, blocksize, numIndexTerms, false);
|
||||
|
||||
// records offsets into byte[] term data
|
||||
termOffsets = new MonotonicBlockPackedReader(clone, packedIntsVersion, blocksize, 1+numIndexTerms, false);
|
||||
termOffsets = MonotonicBlockPackedReader.of(clone, packedIntsVersion, blocksize, 1+numIndexTerms, false);
|
||||
} finally {
|
||||
clone.close();
|
||||
}
|
||||
|
|
|
@ -34,7 +34,7 @@ class DiskDocValuesProducer extends Lucene49DocValuesProducer {
|
|||
@Override
|
||||
protected MonotonicBlockPackedReader getAddressInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) throws IOException {
|
||||
data.seek(bytes.addressesOffset);
|
||||
return new MonotonicBlockPackedReader(data.clone(), bytes.packedIntsVersion, bytes.blockSize, bytes.count, true);
|
||||
return MonotonicBlockPackedReader.of(data.clone(), bytes.packedIntsVersion, bytes.blockSize, bytes.count, true);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -45,6 +45,6 @@ class DiskDocValuesProducer extends Lucene49DocValuesProducer {
|
|||
@Override
|
||||
protected MonotonicBlockPackedReader getOrdIndexInstance(IndexInput data, FieldInfo field, NumericEntry entry) throws IOException {
|
||||
data.seek(entry.offset);
|
||||
return new MonotonicBlockPackedReader(data.clone(), entry.packedIntsVersion, entry.blockSize, entry.count, true);
|
||||
return MonotonicBlockPackedReader.of(data.clone(), entry.packedIntsVersion, entry.blockSize, entry.count, true);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -330,7 +330,7 @@ class MemoryDocValuesProducer extends DocValuesProducer {
|
|||
ramBytesUsed.addAndGet(bytesAndAddresses.reader.ramBytesUsed());
|
||||
if (entry.minLength != entry.maxLength) {
|
||||
data.seek(data.getFilePointer() + entry.missingBytes);
|
||||
bytesAndAddresses.addresses = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, maxDoc, false);
|
||||
bytesAndAddresses.addresses = MonotonicBlockPackedReader.of(data, entry.packedIntsVersion, entry.blockSize, maxDoc, false);
|
||||
ramBytesUsed.addAndGet(bytesAndAddresses.addresses.ramBytesUsed());
|
||||
}
|
||||
return bytesAndAddresses;
|
||||
|
|
|
@ -300,7 +300,7 @@ class Lucene42DocValuesProducer extends DocValuesProducer {
|
|||
}
|
||||
};
|
||||
} else {
|
||||
final MonotonicBlockPackedReader addresses = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, maxDoc, false);
|
||||
final MonotonicBlockPackedReader addresses = MonotonicBlockPackedReader.of(data, entry.packedIntsVersion, entry.blockSize, maxDoc, false);
|
||||
ramBytesUsed.addAndGet(bytes.ramBytesUsed() + addresses.ramBytesUsed());
|
||||
return new BinaryDocValues() {
|
||||
|
||||
|
|
|
@ -412,7 +412,7 @@ public class Lucene45DocValuesProducer extends DocValuesProducer implements Clos
|
|||
MonotonicBlockPackedReader addrInstance = addressInstances.get(field.number);
|
||||
if (addrInstance == null) {
|
||||
data.seek(bytes.addressesOffset);
|
||||
addrInstance = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, bytes.count, false);
|
||||
addrInstance = MonotonicBlockPackedReader.of(data, bytes.packedIntsVersion, bytes.blockSize, bytes.count, false);
|
||||
addressInstances.put(field.number, addrInstance);
|
||||
ramBytesUsed.addAndGet(addrInstance.ramBytesUsed() + RamUsageEstimator.NUM_BYTES_INT);
|
||||
}
|
||||
|
@ -461,7 +461,7 @@ public class Lucene45DocValuesProducer extends DocValuesProducer implements Clos
|
|||
} else {
|
||||
size = 1L + bytes.count / interval;
|
||||
}
|
||||
addrInstance = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, size, false);
|
||||
addrInstance = MonotonicBlockPackedReader.of(data, bytes.packedIntsVersion, bytes.blockSize, size, false);
|
||||
addressInstances.put(field.number, addrInstance);
|
||||
ramBytesUsed.addAndGet(addrInstance.ramBytesUsed() + RamUsageEstimator.NUM_BYTES_INT);
|
||||
}
|
||||
|
@ -533,7 +533,7 @@ public class Lucene45DocValuesProducer extends DocValuesProducer implements Clos
|
|||
MonotonicBlockPackedReader ordIndexInstance = ordIndexInstances.get(field.number);
|
||||
if (ordIndexInstance == null) {
|
||||
data.seek(entry.offset);
|
||||
ordIndexInstance = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, false);
|
||||
ordIndexInstance = MonotonicBlockPackedReader.of(data, entry.packedIntsVersion, entry.blockSize, entry.count, false);
|
||||
ordIndexInstances.put(field.number, ordIndexInstance);
|
||||
ramBytesUsed.addAndGet(ordIndexInstance.ramBytesUsed() + RamUsageEstimator.NUM_BYTES_INT);
|
||||
}
|
||||
|
|
|
@ -392,7 +392,7 @@ public class Lucene49DocValuesProducer extends DocValuesProducer implements Clos
|
|||
MonotonicBlockPackedReader addrInstance = addressInstances.get(field.number);
|
||||
if (addrInstance == null) {
|
||||
data.seek(bytes.addressesOffset);
|
||||
addrInstance = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, bytes.count, false);
|
||||
addrInstance = MonotonicBlockPackedReader.of(data, bytes.packedIntsVersion, bytes.blockSize, bytes.count, false);
|
||||
addressInstances.put(field.number, addrInstance);
|
||||
ramBytesUsed.addAndGet(addrInstance.ramBytesUsed() + RamUsageEstimator.NUM_BYTES_INT);
|
||||
}
|
||||
|
@ -441,7 +441,7 @@ public class Lucene49DocValuesProducer extends DocValuesProducer implements Clos
|
|||
} else {
|
||||
size = 1L + bytes.count / interval;
|
||||
}
|
||||
addrInstance = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, size, false);
|
||||
addrInstance = MonotonicBlockPackedReader.of(data, bytes.packedIntsVersion, bytes.blockSize, size, false);
|
||||
addressInstances.put(field.number, addrInstance);
|
||||
ramBytesUsed.addAndGet(addrInstance.ramBytesUsed() + RamUsageEstimator.NUM_BYTES_INT);
|
||||
}
|
||||
|
@ -511,7 +511,7 @@ public class Lucene49DocValuesProducer extends DocValuesProducer implements Clos
|
|||
MonotonicBlockPackedReader ordIndexInstance = ordIndexInstances.get(field.number);
|
||||
if (ordIndexInstance == null) {
|
||||
data.seek(entry.offset);
|
||||
ordIndexInstance = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, false);
|
||||
ordIndexInstance = MonotonicBlockPackedReader.of(data, entry.packedIntsVersion, entry.blockSize, entry.count, false);
|
||||
ordIndexInstances.put(field.number, ordIndexInstance);
|
||||
ramBytesUsed.addAndGet(ordIndexInstance.ramBytesUsed() + RamUsageEstimator.NUM_BYTES_INT);
|
||||
}
|
||||
|
|
|
@ -17,10 +17,12 @@ package org.apache.lucene.util.packed;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import static org.apache.lucene.util.packed.MonotonicBlockPackedReader.expected;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
||||
/**
|
||||
* Utility class to buffer signed longs in memory, which is optimized for the
|
||||
* case where the sequence is monotonic, although it can encode any sequence of
|
||||
|
@ -30,14 +32,6 @@ import java.util.Arrays;
|
|||
*/
|
||||
public final class MonotonicAppendingLongBuffer extends AbstractAppendingLongBuffer {
|
||||
|
||||
static long zigZagDecode(long n) {
|
||||
return ((n >>> 1) ^ -(n & 1));
|
||||
}
|
||||
|
||||
static long zigZagEncode(long n) {
|
||||
return (n >> 63) ^ (n << 1);
|
||||
}
|
||||
|
||||
float[] averages;
|
||||
long[] minValues;
|
||||
|
||||
|
@ -68,18 +62,12 @@ public final class MonotonicAppendingLongBuffer extends AbstractAppendingLongBuf
|
|||
this(16, 1024, acceptableOverheadRatio);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
long get(int block, int element) {
|
||||
if (block == valuesOff) {
|
||||
return pending[element];
|
||||
} else {
|
||||
final long base = minValues[block] + (long) (averages[block] * (long) element);
|
||||
if (values[block] == null) {
|
||||
return base;
|
||||
} else {
|
||||
return base + zigZagDecode(values[block].get(element));
|
||||
}
|
||||
return expected(minValues[block], averages[block], element) + values[block].get(element);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -90,21 +78,11 @@ public final class MonotonicAppendingLongBuffer extends AbstractAppendingLongBuf
|
|||
System.arraycopy(pending, element, arr, off, sysCopyToRead);
|
||||
return sysCopyToRead;
|
||||
} else {
|
||||
if (values[block] == null) {
|
||||
int toFill = Math.min(len, pending.length - element);
|
||||
for (int r = 0; r < toFill; r++, off++, element++) {
|
||||
arr[off] = minValues[block] + (long) (averages[block] * (long) element);
|
||||
}
|
||||
return toFill;
|
||||
} else {
|
||||
|
||||
/* packed block */
|
||||
int read = values[block].get(element, arr, off, len);
|
||||
for (int r = 0; r < read; r++, off++, element++) {
|
||||
arr[off] = minValues[block] + (long) (averages[block] * (long) element) + zigZagDecode(arr[off]);
|
||||
}
|
||||
return read;
|
||||
int read = values[block].get(element, arr, off, len);
|
||||
for (int r = 0; r < read; r++, off++, element++) {
|
||||
arr[off] += expected(minValues[block], averages[block], element);
|
||||
}
|
||||
return read;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -118,11 +96,22 @@ public final class MonotonicAppendingLongBuffer extends AbstractAppendingLongBuf
|
|||
@Override
|
||||
void packPendingValues() {
|
||||
assert pendingOff > 0;
|
||||
minValues[valuesOff] = pending[0];
|
||||
averages[valuesOff] = pendingOff == 1 ? 0 : (float) (pending[pendingOff - 1] - pending[0]) / (pendingOff - 1);
|
||||
final float average = pendingOff == 1 ? 0 : (float) (pending[pendingOff - 1] - pending[0]) / (pendingOff - 1);
|
||||
long minValue = pending[0];
|
||||
// adjust minValue so that all deltas will be positive
|
||||
for (int i = 1; i < pendingOff; ++i) {
|
||||
final long actual = pending[i];
|
||||
final long expected = expected(minValue, average, i);
|
||||
if (expected > actual) {
|
||||
minValue -= (expected - actual);
|
||||
}
|
||||
}
|
||||
|
||||
minValues[valuesOff] = minValue;
|
||||
averages[valuesOff] = average;
|
||||
|
||||
for (int i = 0; i < pendingOff; ++i) {
|
||||
pending[i] = zigZagEncode(pending[i] - minValues[valuesOff] - (long) (averages[valuesOff] * (long) i));
|
||||
pending[i] = pending[i] - expected(minValue, average, i);
|
||||
}
|
||||
long maxDelta = 0;
|
||||
for (int i = 0; i < pendingOff; ++i) {
|
||||
|
|
|
@ -35,16 +35,32 @@ import org.apache.lucene.util.RamUsageEstimator;
|
|||
* {@link MonotonicBlockPackedWriter}.
|
||||
* @lucene.internal
|
||||
*/
|
||||
public final class MonotonicBlockPackedReader extends LongValues implements Accountable {
|
||||
public class MonotonicBlockPackedReader extends LongValues implements Accountable {
|
||||
|
||||
private final int blockShift, blockMask;
|
||||
private final long valueCount;
|
||||
private final long[] minValues;
|
||||
private final float[] averages;
|
||||
private final PackedInts.Reader[] subReaders;
|
||||
static long expected(long origin, float average, int index) {
|
||||
return origin + (long) (average * (long) index);
|
||||
}
|
||||
|
||||
final int blockShift, blockMask;
|
||||
final long valueCount;
|
||||
final long[] minValues;
|
||||
final float[] averages;
|
||||
final PackedInts.Reader[] subReaders;
|
||||
|
||||
/** Sole constructor. */
|
||||
public MonotonicBlockPackedReader(IndexInput in, int packedIntsVersion, int blockSize, long valueCount, boolean direct) throws IOException {
|
||||
public static MonotonicBlockPackedReader of(IndexInput in, int packedIntsVersion, int blockSize, long valueCount, boolean direct) throws IOException {
|
||||
if (packedIntsVersion < PackedInts.VERSION_MONOTONIC_WITHOUT_ZIGZAG) {
|
||||
return new MonotonicBlockPackedReader(in, packedIntsVersion, blockSize, valueCount, direct) {
|
||||
@Override
|
||||
protected long decodeDelta(long delta) {
|
||||
return zigZagDecode(delta);
|
||||
}
|
||||
};
|
||||
}
|
||||
return new MonotonicBlockPackedReader(in, packedIntsVersion, blockSize, valueCount, direct);
|
||||
}
|
||||
|
||||
private MonotonicBlockPackedReader(IndexInput in, int packedIntsVersion, int blockSize, long valueCount, boolean direct) throws IOException {
|
||||
this.valueCount = valueCount;
|
||||
blockShift = checkBlockSize(blockSize, MIN_BLOCK_SIZE, MAX_BLOCK_SIZE);
|
||||
blockMask = blockSize - 1;
|
||||
|
@ -53,7 +69,11 @@ public final class MonotonicBlockPackedReader extends LongValues implements Acco
|
|||
averages = new float[numBlocks];
|
||||
subReaders = new PackedInts.Reader[numBlocks];
|
||||
for (int i = 0; i < numBlocks; ++i) {
|
||||
minValues[i] = in.readVLong();
|
||||
if (packedIntsVersion < PackedInts.VERSION_MONOTONIC_WITHOUT_ZIGZAG) {
|
||||
minValues[i] = in.readVLong();
|
||||
} else {
|
||||
minValues[i] = zigZagDecode(in.readVLong());
|
||||
}
|
||||
averages[i] = Float.intBitsToFloat(in.readInt());
|
||||
final int bitsPerValue = in.readVInt();
|
||||
if (bitsPerValue > 64) {
|
||||
|
@ -79,7 +99,11 @@ public final class MonotonicBlockPackedReader extends LongValues implements Acco
|
|||
assert index >= 0 && index < valueCount;
|
||||
final int block = (int) (index >>> blockShift);
|
||||
final int idx = (int) (index & blockMask);
|
||||
return minValues[block] + (long) (idx * averages[block]) + zigZagDecode(subReaders[block].get(idx));
|
||||
return expected(minValues[block], averages[block], idx) + decodeDelta(subReaders[block].get(idx));
|
||||
}
|
||||
|
||||
protected long decodeDelta(long delta) {
|
||||
return delta;
|
||||
}
|
||||
|
||||
/** Returns the number of values */
|
||||
|
|
|
@ -18,10 +18,12 @@ package org.apache.lucene.util.packed;
|
|||
*/
|
||||
|
||||
import static org.apache.lucene.util.BitUtil.zigZagEncode;
|
||||
import static org.apache.lucene.util.packed.MonotonicBlockPackedReader.expected;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.util.BitUtil;
|
||||
|
||||
/**
|
||||
* A writer for large monotonically increasing sequences of positive longs.
|
||||
|
@ -29,7 +31,7 @@ import org.apache.lucene.store.DataOutput;
|
|||
* The sequence is divided into fixed-size blocks and for each block, values
|
||||
* are modeled after a linear function f: x → A × x + B. The block
|
||||
* encodes deltas from the expected values computed from this function using as
|
||||
* few bits as possible. Each block has an overhead between 6 and 14 bytes.
|
||||
* few bits as possible.
|
||||
* <p>
|
||||
* Format:
|
||||
* <ul>
|
||||
|
@ -38,17 +40,16 @@ import org.apache.lucene.store.DataOutput;
|
|||
* <li>Block: <Header, (Ints)>
|
||||
* <li>Header: <B, A, BitsPerValue>
|
||||
* <li>B: the B from f: x → A × x + B using a
|
||||
* {@link DataOutput#writeVLong(long) variable-length long}
|
||||
* {@link BitUtil#zigZagEncode(long) zig-zag encoded}
|
||||
* {@link DataOutput#writeVLong(long) vLong}
|
||||
* <li>A: the A from f: x → A × x + B encoded using
|
||||
* {@link Float#floatToIntBits(float)} on
|
||||
* {@link DataOutput#writeInt(int) 4 bytes}
|
||||
* <li>BitsPerValue: a {@link DataOutput#writeVInt(int) variable-length int}
|
||||
* <li>Ints: if BitsPerValue is <tt>0</tt>, then there is nothing to read and
|
||||
* all values perfectly match the result of the function. Otherwise, these
|
||||
* are the
|
||||
* <a href="https://developers.google.com/protocol-buffers/docs/encoding#types">zigzag-encoded</a>
|
||||
* {@link PackedInts packed} deltas from the expected value (computed from
|
||||
* the function) using exactly BitsPerValue bits per value
|
||||
* are the {@link PackedInts packed} deltas from the expected value
|
||||
* (computed from the function) using exactly BitsPerValue bits per value.
|
||||
* </ul>
|
||||
* @see MonotonicBlockPackedReader
|
||||
* @lucene.internal
|
||||
|
@ -72,22 +73,29 @@ public final class MonotonicBlockPackedWriter extends AbstractBlockPackedWriter
|
|||
protected void flush() throws IOException {
|
||||
assert off > 0;
|
||||
|
||||
// TODO: perform a true linear regression?
|
||||
final long min = values[0];
|
||||
final float avg = off == 1 ? 0f : (float) (values[off - 1] - min) / (off - 1);
|
||||
|
||||
long maxZigZagDelta = 0;
|
||||
for (int i = 0; i < off; ++i) {
|
||||
values[i] = zigZagEncode(values[i] - min - (long) (avg * i));
|
||||
maxZigZagDelta = Math.max(maxZigZagDelta, values[i]);
|
||||
final float avg = off == 1 ? 0f : (float) (values[off - 1] - values[0]) / (off - 1);
|
||||
long min = values[0];
|
||||
// adjust min so that all deltas will be positive
|
||||
for (int i = 1; i < off; ++i) {
|
||||
final long actual = values[i];
|
||||
final long expected = expected(min, avg, i);
|
||||
if (expected > actual) {
|
||||
min -= (expected - actual);
|
||||
}
|
||||
}
|
||||
|
||||
out.writeVLong(min);
|
||||
long maxDelta = 0;
|
||||
for (int i = 0; i < off; ++i) {
|
||||
values[i] = values[i] - expected(min, avg, i);
|
||||
maxDelta = Math.max(maxDelta, values[i]);
|
||||
}
|
||||
|
||||
out.writeVLong(zigZagEncode(min));
|
||||
out.writeInt(Float.floatToIntBits(avg));
|
||||
if (maxZigZagDelta == 0) {
|
||||
if (maxDelta == 0) {
|
||||
out.writeVInt(0);
|
||||
} else {
|
||||
final int bitsRequired = PackedInts.bitsRequired(maxZigZagDelta);
|
||||
final int bitsRequired = PackedInts.bitsRequired(maxDelta);
|
||||
out.writeVInt(bitsRequired);
|
||||
writeValues(bitsRequired);
|
||||
}
|
||||
|
|
|
@ -67,7 +67,8 @@ public class PackedInts {
|
|||
public final static String CODEC_NAME = "PackedInts";
|
||||
public final static int VERSION_START = 0; // PackedInts were long-aligned
|
||||
public final static int VERSION_BYTE_ALIGNED = 1;
|
||||
public final static int VERSION_CURRENT = VERSION_BYTE_ALIGNED;
|
||||
public static final int VERSION_MONOTONIC_WITHOUT_ZIGZAG = 2;
|
||||
public final static int VERSION_CURRENT = VERSION_MONOTONIC_WITHOUT_ZIGZAG;
|
||||
|
||||
/**
|
||||
* Check the validity of a version number.
|
||||
|
|
|
@ -1230,7 +1230,7 @@ public class TestPackedInts extends LuceneTestCase {
|
|||
out.close();
|
||||
|
||||
final IndexInput in = dir.openInput("out.bin", IOContext.DEFAULT);
|
||||
final MonotonicBlockPackedReader reader = new MonotonicBlockPackedReader(in, PackedInts.VERSION_CURRENT, blockSize, valueCount, random().nextBoolean());
|
||||
final MonotonicBlockPackedReader reader = MonotonicBlockPackedReader.of(in, PackedInts.VERSION_CURRENT, blockSize, valueCount, random().nextBoolean());
|
||||
assertEquals(fp, in.getFilePointer());
|
||||
for (int i = 0; i < valueCount; ++i) {
|
||||
assertEquals("i=" +i, values[i], reader.get(i));
|
||||
|
|
Loading…
Reference in New Issue