LUCENE-5721: Monotonic compression doesn't use zig-zag encoding anymore.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1600694 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Adrien Grand 2014-06-05 16:30:04 +00:00
parent 8f267c8560
commit b31ae698e5
12 changed files with 98 additions and 73 deletions

View File

@ -254,6 +254,9 @@ Optimizations
* LUCENE-5703: BinaryDocValues producers don't allocate or copy bytes on
each access anymore. (Adrien Grand)
* LUCENE-5721: Monotonic compression doesn't use zig-zag encoding anymore.
(Robert Muir, Adrien Grand)
Bug fixes
* LUCENE-5673: MMapDirectory: Work around a "bug" in the JDK that throws

View File

@ -256,10 +256,10 @@ public class FixedGapTermsIndexReader extends TermsIndexReaderBase {
termBytes.copy(clone, numTermBytes);
// records offsets into main terms dict file
termsDictOffsets = new MonotonicBlockPackedReader(clone, packedIntsVersion, blocksize, numIndexTerms, false);
termsDictOffsets = MonotonicBlockPackedReader.of(clone, packedIntsVersion, blocksize, numIndexTerms, false);
// records offsets into byte[] term data
termOffsets = new MonotonicBlockPackedReader(clone, packedIntsVersion, blocksize, 1+numIndexTerms, false);
termOffsets = MonotonicBlockPackedReader.of(clone, packedIntsVersion, blocksize, 1+numIndexTerms, false);
} finally {
clone.close();
}

View File

@ -34,7 +34,7 @@ class DiskDocValuesProducer extends Lucene49DocValuesProducer {
@Override
protected MonotonicBlockPackedReader getAddressInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) throws IOException {
data.seek(bytes.addressesOffset);
return new MonotonicBlockPackedReader(data.clone(), bytes.packedIntsVersion, bytes.blockSize, bytes.count, true);
return MonotonicBlockPackedReader.of(data.clone(), bytes.packedIntsVersion, bytes.blockSize, bytes.count, true);
}
@Override
@ -45,6 +45,6 @@ class DiskDocValuesProducer extends Lucene49DocValuesProducer {
@Override
protected MonotonicBlockPackedReader getOrdIndexInstance(IndexInput data, FieldInfo field, NumericEntry entry) throws IOException {
data.seek(entry.offset);
return new MonotonicBlockPackedReader(data.clone(), entry.packedIntsVersion, entry.blockSize, entry.count, true);
return MonotonicBlockPackedReader.of(data.clone(), entry.packedIntsVersion, entry.blockSize, entry.count, true);
}
}

View File

@ -330,7 +330,7 @@ class MemoryDocValuesProducer extends DocValuesProducer {
ramBytesUsed.addAndGet(bytesAndAddresses.reader.ramBytesUsed());
if (entry.minLength != entry.maxLength) {
data.seek(data.getFilePointer() + entry.missingBytes);
bytesAndAddresses.addresses = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, maxDoc, false);
bytesAndAddresses.addresses = MonotonicBlockPackedReader.of(data, entry.packedIntsVersion, entry.blockSize, maxDoc, false);
ramBytesUsed.addAndGet(bytesAndAddresses.addresses.ramBytesUsed());
}
return bytesAndAddresses;

View File

@ -300,7 +300,7 @@ class Lucene42DocValuesProducer extends DocValuesProducer {
}
};
} else {
final MonotonicBlockPackedReader addresses = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, maxDoc, false);
final MonotonicBlockPackedReader addresses = MonotonicBlockPackedReader.of(data, entry.packedIntsVersion, entry.blockSize, maxDoc, false);
ramBytesUsed.addAndGet(bytes.ramBytesUsed() + addresses.ramBytesUsed());
return new BinaryDocValues() {

View File

@ -412,7 +412,7 @@ public class Lucene45DocValuesProducer extends DocValuesProducer implements Clos
MonotonicBlockPackedReader addrInstance = addressInstances.get(field.number);
if (addrInstance == null) {
data.seek(bytes.addressesOffset);
addrInstance = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, bytes.count, false);
addrInstance = MonotonicBlockPackedReader.of(data, bytes.packedIntsVersion, bytes.blockSize, bytes.count, false);
addressInstances.put(field.number, addrInstance);
ramBytesUsed.addAndGet(addrInstance.ramBytesUsed() + RamUsageEstimator.NUM_BYTES_INT);
}
@ -461,7 +461,7 @@ public class Lucene45DocValuesProducer extends DocValuesProducer implements Clos
} else {
size = 1L + bytes.count / interval;
}
addrInstance = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, size, false);
addrInstance = MonotonicBlockPackedReader.of(data, bytes.packedIntsVersion, bytes.blockSize, size, false);
addressInstances.put(field.number, addrInstance);
ramBytesUsed.addAndGet(addrInstance.ramBytesUsed() + RamUsageEstimator.NUM_BYTES_INT);
}
@ -533,7 +533,7 @@ public class Lucene45DocValuesProducer extends DocValuesProducer implements Clos
MonotonicBlockPackedReader ordIndexInstance = ordIndexInstances.get(field.number);
if (ordIndexInstance == null) {
data.seek(entry.offset);
ordIndexInstance = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, false);
ordIndexInstance = MonotonicBlockPackedReader.of(data, entry.packedIntsVersion, entry.blockSize, entry.count, false);
ordIndexInstances.put(field.number, ordIndexInstance);
ramBytesUsed.addAndGet(ordIndexInstance.ramBytesUsed() + RamUsageEstimator.NUM_BYTES_INT);
}

View File

@ -392,7 +392,7 @@ public class Lucene49DocValuesProducer extends DocValuesProducer implements Clos
MonotonicBlockPackedReader addrInstance = addressInstances.get(field.number);
if (addrInstance == null) {
data.seek(bytes.addressesOffset);
addrInstance = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, bytes.count, false);
addrInstance = MonotonicBlockPackedReader.of(data, bytes.packedIntsVersion, bytes.blockSize, bytes.count, false);
addressInstances.put(field.number, addrInstance);
ramBytesUsed.addAndGet(addrInstance.ramBytesUsed() + RamUsageEstimator.NUM_BYTES_INT);
}
@ -441,7 +441,7 @@ public class Lucene49DocValuesProducer extends DocValuesProducer implements Clos
} else {
size = 1L + bytes.count / interval;
}
addrInstance = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, size, false);
addrInstance = MonotonicBlockPackedReader.of(data, bytes.packedIntsVersion, bytes.blockSize, size, false);
addressInstances.put(field.number, addrInstance);
ramBytesUsed.addAndGet(addrInstance.ramBytesUsed() + RamUsageEstimator.NUM_BYTES_INT);
}
@ -511,7 +511,7 @@ public class Lucene49DocValuesProducer extends DocValuesProducer implements Clos
MonotonicBlockPackedReader ordIndexInstance = ordIndexInstances.get(field.number);
if (ordIndexInstance == null) {
data.seek(entry.offset);
ordIndexInstance = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, false);
ordIndexInstance = MonotonicBlockPackedReader.of(data, entry.packedIntsVersion, entry.blockSize, entry.count, false);
ordIndexInstances.put(field.number, ordIndexInstance);
ramBytesUsed.addAndGet(ordIndexInstance.ramBytesUsed() + RamUsageEstimator.NUM_BYTES_INT);
}

View File

@ -17,10 +17,12 @@ package org.apache.lucene.util.packed;
* limitations under the License.
*/
import org.apache.lucene.util.RamUsageEstimator;
import static org.apache.lucene.util.packed.MonotonicBlockPackedReader.expected;
import java.util.Arrays;
import org.apache.lucene.util.RamUsageEstimator;
/**
* Utility class to buffer signed longs in memory, which is optimized for the
* case where the sequence is monotonic, although it can encode any sequence of
@ -30,14 +32,6 @@ import java.util.Arrays;
*/
public final class MonotonicAppendingLongBuffer extends AbstractAppendingLongBuffer {
static long zigZagDecode(long n) {
return ((n >>> 1) ^ -(n & 1));
}
static long zigZagEncode(long n) {
return (n >> 63) ^ (n << 1);
}
float[] averages;
long[] minValues;
@ -68,18 +62,12 @@ public final class MonotonicAppendingLongBuffer extends AbstractAppendingLongBuf
this(16, 1024, acceptableOverheadRatio);
}
@Override
long get(int block, int element) {
if (block == valuesOff) {
return pending[element];
} else {
final long base = minValues[block] + (long) (averages[block] * (long) element);
if (values[block] == null) {
return base;
} else {
return base + zigZagDecode(values[block].get(element));
}
return expected(minValues[block], averages[block], element) + values[block].get(element);
}
}
@ -90,21 +78,11 @@ public final class MonotonicAppendingLongBuffer extends AbstractAppendingLongBuf
System.arraycopy(pending, element, arr, off, sysCopyToRead);
return sysCopyToRead;
} else {
if (values[block] == null) {
int toFill = Math.min(len, pending.length - element);
for (int r = 0; r < toFill; r++, off++, element++) {
arr[off] = minValues[block] + (long) (averages[block] * (long) element);
}
return toFill;
} else {
/* packed block */
int read = values[block].get(element, arr, off, len);
for (int r = 0; r < read; r++, off++, element++) {
arr[off] = minValues[block] + (long) (averages[block] * (long) element) + zigZagDecode(arr[off]);
}
return read;
int read = values[block].get(element, arr, off, len);
for (int r = 0; r < read; r++, off++, element++) {
arr[off] += expected(minValues[block], averages[block], element);
}
return read;
}
}
@ -118,11 +96,22 @@ public final class MonotonicAppendingLongBuffer extends AbstractAppendingLongBuf
@Override
void packPendingValues() {
assert pendingOff > 0;
minValues[valuesOff] = pending[0];
averages[valuesOff] = pendingOff == 1 ? 0 : (float) (pending[pendingOff - 1] - pending[0]) / (pendingOff - 1);
final float average = pendingOff == 1 ? 0 : (float) (pending[pendingOff - 1] - pending[0]) / (pendingOff - 1);
long minValue = pending[0];
// adjust minValue so that all deltas will be positive
for (int i = 1; i < pendingOff; ++i) {
final long actual = pending[i];
final long expected = expected(minValue, average, i);
if (expected > actual) {
minValue -= (expected - actual);
}
}
minValues[valuesOff] = minValue;
averages[valuesOff] = average;
for (int i = 0; i < pendingOff; ++i) {
pending[i] = zigZagEncode(pending[i] - minValues[valuesOff] - (long) (averages[valuesOff] * (long) i));
pending[i] = pending[i] - expected(minValue, average, i);
}
long maxDelta = 0;
for (int i = 0; i < pendingOff; ++i) {

View File

@ -35,16 +35,32 @@ import org.apache.lucene.util.RamUsageEstimator;
* {@link MonotonicBlockPackedWriter}.
* @lucene.internal
*/
public final class MonotonicBlockPackedReader extends LongValues implements Accountable {
public class MonotonicBlockPackedReader extends LongValues implements Accountable {
private final int blockShift, blockMask;
private final long valueCount;
private final long[] minValues;
private final float[] averages;
private final PackedInts.Reader[] subReaders;
static long expected(long origin, float average, int index) {
return origin + (long) (average * (long) index);
}
final int blockShift, blockMask;
final long valueCount;
final long[] minValues;
final float[] averages;
final PackedInts.Reader[] subReaders;
/** Sole constructor. */
public MonotonicBlockPackedReader(IndexInput in, int packedIntsVersion, int blockSize, long valueCount, boolean direct) throws IOException {
public static MonotonicBlockPackedReader of(IndexInput in, int packedIntsVersion, int blockSize, long valueCount, boolean direct) throws IOException {
if (packedIntsVersion < PackedInts.VERSION_MONOTONIC_WITHOUT_ZIGZAG) {
return new MonotonicBlockPackedReader(in, packedIntsVersion, blockSize, valueCount, direct) {
@Override
protected long decodeDelta(long delta) {
return zigZagDecode(delta);
}
};
}
return new MonotonicBlockPackedReader(in, packedIntsVersion, blockSize, valueCount, direct);
}
private MonotonicBlockPackedReader(IndexInput in, int packedIntsVersion, int blockSize, long valueCount, boolean direct) throws IOException {
this.valueCount = valueCount;
blockShift = checkBlockSize(blockSize, MIN_BLOCK_SIZE, MAX_BLOCK_SIZE);
blockMask = blockSize - 1;
@ -53,7 +69,11 @@ public final class MonotonicBlockPackedReader extends LongValues implements Acco
averages = new float[numBlocks];
subReaders = new PackedInts.Reader[numBlocks];
for (int i = 0; i < numBlocks; ++i) {
minValues[i] = in.readVLong();
if (packedIntsVersion < PackedInts.VERSION_MONOTONIC_WITHOUT_ZIGZAG) {
minValues[i] = in.readVLong();
} else {
minValues[i] = zigZagDecode(in.readVLong());
}
averages[i] = Float.intBitsToFloat(in.readInt());
final int bitsPerValue = in.readVInt();
if (bitsPerValue > 64) {
@ -79,7 +99,11 @@ public final class MonotonicBlockPackedReader extends LongValues implements Acco
assert index >= 0 && index < valueCount;
final int block = (int) (index >>> blockShift);
final int idx = (int) (index & blockMask);
return minValues[block] + (long) (idx * averages[block]) + zigZagDecode(subReaders[block].get(idx));
return expected(minValues[block], averages[block], idx) + decodeDelta(subReaders[block].get(idx));
}
protected long decodeDelta(long delta) {
return delta;
}
/** Returns the number of values */

View File

@ -18,10 +18,12 @@ package org.apache.lucene.util.packed;
*/
import static org.apache.lucene.util.BitUtil.zigZagEncode;
import static org.apache.lucene.util.packed.MonotonicBlockPackedReader.expected;
import java.io.IOException;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BitUtil;
/**
* A writer for large monotonically increasing sequences of positive longs.
@ -29,7 +31,7 @@ import org.apache.lucene.store.DataOutput;
* The sequence is divided into fixed-size blocks and for each block, values
* are modeled after a linear function f: x &rarr; A &times; x + B. The block
* encodes deltas from the expected values computed from this function using as
* few bits as possible. Each block has an overhead between 6 and 14 bytes.
* few bits as possible.
* <p>
* Format:
* <ul>
@ -38,17 +40,16 @@ import org.apache.lucene.store.DataOutput;
* <li>Block: &lt;Header, (Ints)&gt;
* <li>Header: &lt;B, A, BitsPerValue&gt;
* <li>B: the B from f: x &rarr; A &times; x + B using a
* {@link DataOutput#writeVLong(long) variable-length long}
* {@link BitUtil#zigZagEncode(long) zig-zag encoded}
* {@link DataOutput#writeVLong(long) vLong}
* <li>A: the A from f: x &rarr; A &times; x + B encoded using
* {@link Float#floatToIntBits(float)} on
* {@link DataOutput#writeInt(int) 4 bytes}
* <li>BitsPerValue: a {@link DataOutput#writeVInt(int) variable-length int}
* <li>Ints: if BitsPerValue is <tt>0</tt>, then there is nothing to read and
* all values perfectly match the result of the function. Otherwise, these
* are the
* <a href="https://developers.google.com/protocol-buffers/docs/encoding#types">zigzag-encoded</a>
* {@link PackedInts packed} deltas from the expected value (computed from
* the function) using exaclty BitsPerValue bits per value
* are the {@link PackedInts packed} deltas from the expected value
* (computed from the function) using exaclty BitsPerValue bits per value.
* </ul>
* @see MonotonicBlockPackedReader
* @lucene.internal
@ -72,22 +73,29 @@ public final class MonotonicBlockPackedWriter extends AbstractBlockPackedWriter
protected void flush() throws IOException {
assert off > 0;
// TODO: perform a true linear regression?
final long min = values[0];
final float avg = off == 1 ? 0f : (float) (values[off - 1] - min) / (off - 1);
long maxZigZagDelta = 0;
for (int i = 0; i < off; ++i) {
values[i] = zigZagEncode(values[i] - min - (long) (avg * i));
maxZigZagDelta = Math.max(maxZigZagDelta, values[i]);
final float avg = off == 1 ? 0f : (float) (values[off - 1] - values[0]) / (off - 1);
long min = values[0];
// adjust min so that all deltas will be positive
for (int i = 1; i < off; ++i) {
final long actual = values[i];
final long expected = expected(min, avg, i);
if (expected > actual) {
min -= (expected - actual);
}
}
out.writeVLong(min);
long maxDelta = 0;
for (int i = 0; i < off; ++i) {
values[i] = values[i] - expected(min, avg, i);
maxDelta = Math.max(maxDelta, values[i]);
}
out.writeVLong(zigZagEncode(min));
out.writeInt(Float.floatToIntBits(avg));
if (maxZigZagDelta == 0) {
if (maxDelta == 0) {
out.writeVInt(0);
} else {
final int bitsRequired = PackedInts.bitsRequired(maxZigZagDelta);
final int bitsRequired = PackedInts.bitsRequired(maxDelta);
out.writeVInt(bitsRequired);
writeValues(bitsRequired);
}

View File

@ -67,7 +67,8 @@ public class PackedInts {
public final static String CODEC_NAME = "PackedInts";
public final static int VERSION_START = 0; // PackedInts were long-aligned
public final static int VERSION_BYTE_ALIGNED = 1;
public final static int VERSION_CURRENT = VERSION_BYTE_ALIGNED;
public static final int VERSION_MONOTONIC_WITHOUT_ZIGZAG = 2;
public final static int VERSION_CURRENT = VERSION_MONOTONIC_WITHOUT_ZIGZAG;
/**
* Check the validity of a version number.

View File

@ -1230,7 +1230,7 @@ public class TestPackedInts extends LuceneTestCase {
out.close();
final IndexInput in = dir.openInput("out.bin", IOContext.DEFAULT);
final MonotonicBlockPackedReader reader = new MonotonicBlockPackedReader(in, PackedInts.VERSION_CURRENT, blockSize, valueCount, random().nextBoolean());
final MonotonicBlockPackedReader reader = MonotonicBlockPackedReader.of(in, PackedInts.VERSION_CURRENT, blockSize, valueCount, random().nextBoolean());
assertEquals(fp, in.getFilePointer());
for (int i = 0; i < valueCount; ++i) {
assertEquals("i=" +i, values[i], reader.get(i));