From 156e036022b6b1cde359a992d74e04554d82f94a Mon Sep 17 00:00:00 2001
From: Adrien Grand
Date: Sat, 16 Feb 2013 14:02:42 +0000
Subject: [PATCH] LUCENE-4780: MonotonicAppendingLongBuffer.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4765@1446896 13f79535-47bb-0310-9956-ffa450edef68
---
 .../apache/lucene/index/MultiDocValues.java    |  11 +-
 .../packed/AbstractAppendingLongBuffer.java    | 150 ++++++++++++++++++
 .../util/packed/AppendingLongBuffer.java       | 124 ++-------------
 .../packed/MonotonicAppendingLongBuffer.java   | 139 ++++++++++++++++
 .../lucene/util/packed/TestPackedInts.java     |  73 +++++----
 5 files changed, 353 insertions(+), 144 deletions(-)
 create mode 100644 lucene/core/src/java/org/apache/lucene/util/packed/AbstractAppendingLongBuffer.java
 create mode 100644 lucene/core/src/java/org/apache/lucene/util/packed/MonotonicAppendingLongBuffer.java

diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java b/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java
index 8685559a140..97f2eecf9ff 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java
@@ -24,6 +24,7 @@ import org.apache.lucene.index.MultiTermsEnum.TermsEnumIndex;
 import org.apache.lucene.index.MultiTermsEnum.TermsEnumWithSlice;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.packed.AppendingLongBuffer;
+import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer;
 
 /**
  * A wrapper for CompositeIndexReader providing access to DocValues.
@@ -274,11 +275,11 @@ public class MultiDocValues {
     // cache key of whoever asked for this aweful thing
     final Object owner;
     // globalOrd -> (globalOrd - segmentOrd)
-    final AppendingLongBuffer globalOrdDeltas;
+    final MonotonicAppendingLongBuffer globalOrdDeltas;
     // globalOrd -> sub index
     final AppendingLongBuffer subIndexes;
     // segmentOrd -> (globalOrd - segmentOrd)
-    final AppendingLongBuffer ordDeltas[];
+    final MonotonicAppendingLongBuffer ordDeltas[];
 
     /**
      * Creates an ordinal map that allows mapping ords to/from a merged
@@ -292,11 +293,11 @@ public class MultiDocValues {
       // create the ordinal mappings by pulling a termsenum over each sub's
       // unique terms, and walking a multitermsenum over those
       this.owner = owner;
-      globalOrdDeltas = new AppendingLongBuffer();
+      globalOrdDeltas = new MonotonicAppendingLongBuffer();
       subIndexes = new AppendingLongBuffer();
-      ordDeltas = new AppendingLongBuffer[subs.length];
+      ordDeltas = new MonotonicAppendingLongBuffer[subs.length];
       for (int i = 0; i < ordDeltas.length; i++) {
-        ordDeltas[i] = new AppendingLongBuffer();
+        ordDeltas[i] = new MonotonicAppendingLongBuffer();
       }
       long segmentOrds[] = new long[subs.length];
       ReaderSlice slices[] = new ReaderSlice[subs.length];
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/AbstractAppendingLongBuffer.java b/lucene/core/src/java/org/apache/lucene/util/packed/AbstractAppendingLongBuffer.java
new file mode 100644
index 00000000000..087154d2193
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/AbstractAppendingLongBuffer.java
@@ -0,0 +1,150 @@
+package org.apache.lucene.util.packed;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Arrays;
+
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.RamUsageEstimator;
+
+/** Common functionality shared by {@link AppendingLongBuffer} and {@link MonotonicAppendingLongBuffer}. */
+abstract class AbstractAppendingLongBuffer {
+
+  static final int BLOCK_BITS = 10;
+  static final int MAX_PENDING_COUNT = 1 << BLOCK_BITS;
+  static final int BLOCK_MASK = MAX_PENDING_COUNT - 1;
+
+  long[] minValues;
+  PackedInts.Reader[] deltas;
+  private long deltasBytes;
+  int valuesOff;
+  long[] pending;
+  int pendingOff;
+
+  AbstractAppendingLongBuffer(int initialBlockCount) {
+    minValues = new long[16];
+    deltas = new PackedInts.Reader[16];
+    pending = new long[MAX_PENDING_COUNT];
+    valuesOff = 0;
+    pendingOff = 0;
+  }
+
+  /** Get the number of values that have been added to the buffer. */
+  public final long size() {
+    return valuesOff * (long) MAX_PENDING_COUNT + pendingOff;
+  }
+
+  /** Append a value to this buffer. */
+  public final void add(long l) {
+    if (pendingOff == MAX_PENDING_COUNT) {
+      // check size
+      if (deltas.length == valuesOff) {
+        final int newLength = ArrayUtil.oversize(valuesOff + 1, 8);
+        grow(newLength);
+      }
+      packPendingValues();
+      if (deltas[valuesOff] != null) {
+        deltasBytes += deltas[valuesOff].ramBytesUsed();
+      }
+      ++valuesOff;
+      // reset pending buffer
+      pendingOff = 0;
+    }
+    pending[pendingOff++] = l;
+  }
+
+  void grow(int newBlockCount) {
+    minValues = Arrays.copyOf(minValues, newBlockCount);
+    deltas = Arrays.copyOf(deltas, newBlockCount);
+  }
+
+  abstract void packPendingValues();
+
+  /** Get a value from this buffer. */
+  public final long get(long index) {
+    if (index < 0 || index >= size()) {
+      throw new IndexOutOfBoundsException("" + index);
+    }
+    int block = (int) (index >> BLOCK_BITS);
+    int element = (int) (index & BLOCK_MASK);
+    return get(block, element);
+  }
+
+  abstract long get(int block, int element);
+
+  abstract Iterator iterator();
+
+  abstract class Iterator {
+
+    long[] currentValues;
+    int vOff, pOff;
+
+    Iterator() {
+      vOff = pOff = 0;
+      if (valuesOff == 0) {
+        currentValues = pending;
+      } else {
+        currentValues = new long[MAX_PENDING_COUNT];
+        fillValues();
+      }
+    }
+
+    abstract void fillValues();
+
+    /** Whether or not there are remaining values. */
+    public final boolean hasNext() {
+      return vOff < valuesOff || (vOff == valuesOff && pOff < pendingOff);
+    }
+
+    /** Return the next long in the buffer. */
+    public final long next() {
+      assert hasNext();
+      long result = currentValues[pOff++];
+      if (pOff == MAX_PENDING_COUNT) {
+        vOff += 1;
+        pOff = 0;
+        if (vOff <= valuesOff) {
+          fillValues();
+        }
+      }
+      return result;
+    }
+
+  }
+
+  long baseRamBytesUsed() {
+    return RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
+        + 3 * RamUsageEstimator.NUM_BYTES_OBJECT_REF // the 3 arrays
+        + 2 * RamUsageEstimator.NUM_BYTES_INT; // the 2 offsets
+  }
+
+  /**
+   * Return the number of bytes used by this instance.
+   */
+  public long ramBytesUsed() {
+    // TODO: this is called per-doc-per-norms/dv-field, can we optimize this?
+    long bytesUsed = RamUsageEstimator.alignObjectSize(baseRamBytesUsed())
+        + RamUsageEstimator.NUM_BYTES_LONG // valuesBytes
+        + RamUsageEstimator.sizeOf(pending)
+        + RamUsageEstimator.sizeOf(minValues)
+        + RamUsageEstimator.alignObjectSize(RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + (long) RamUsageEstimator.NUM_BYTES_OBJECT_REF * deltas.length); // values
+
+    return bytesUsed + deltasBytes;
+  }
+
+}
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/AppendingLongBuffer.java b/lucene/core/src/java/org/apache/lucene/util/packed/AppendingLongBuffer.java
index 2b1dd77d8dc..978fc32c71d 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/AppendingLongBuffer.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/AppendingLongBuffer.java
@@ -19,72 +19,33 @@ package org.apache.lucene.util.packed;
 
 import java.util.Arrays;
 
-import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.RamUsageEstimator;
-
 /**
  * Utility class to buffer a list of signed longs in memory. This class only
- * supports appending.
+ * supports appending and is optimized for the case where values are close to
+ * each other.
  * @lucene.internal
  */
-public class AppendingLongBuffer {
-
-  private static final int BLOCK_BITS = 10;
-  private static final int MAX_PENDING_COUNT = 1 << BLOCK_BITS;
-  private static final int BLOCK_MASK = MAX_PENDING_COUNT - 1;
-
-  private long[] minValues;
-  private PackedInts.Reader[] values;
-  private long valuesBytes;
-  private int valuesOff;
-  private long[] pending;
-  private int pendingOff;
+public final class AppendingLongBuffer extends AbstractAppendingLongBuffer {
 
   /** Sole constructor. */
   public AppendingLongBuffer() {
-    minValues = new long[16];
-    values = new PackedInts.Reader[16];
-    pending = new long[MAX_PENDING_COUNT];
-    valuesOff = 0;
-    pendingOff = 0;
+    super(16);
   }
 
-  /** Append a value to this buffer. */
-  public void add(long l) {
-    if (pendingOff == MAX_PENDING_COUNT) {
-      packPendingValues();
-    }
-    pending[pendingOff++] = l;
-  }
-
-  /** Get a value from this buffer.
-   *  <p>
-   *  NOTE: This class is not really designed for random access!
-   *  You will likely get better performance by using packed ints in another way! */
-  public long get(long index) {
-    assert index < size() : "index=" + index + ",size=" + size(); // TODO: do a better check, and throw IndexOutOfBoundsException?
-    // This class is currently only used by the indexer.
-    int block = (int) (index >> BLOCK_BITS);
-    int element = (int) (index & BLOCK_MASK);
+  @Override
+  long get(int block, int element) {
     if (block == valuesOff) {
       return pending[element];
-    } else if (values[block] == null) {
+    } else if (deltas[block] == null) {
      return minValues[block];
    } else {
-      return minValues[block] + values[block].get(element);
+      return minValues[block] + deltas[block].get(element);
     }
   }
 
-  private void packPendingValues() {
+  void packPendingValues() {
     assert pendingOff == MAX_PENDING_COUNT;
 
-    // check size
-    if (values.length == valuesOff) {
-      final int newLength = ArrayUtil.oversize(valuesOff + 1, 8);
-      minValues = Arrays.copyOf(minValues, newLength);
-      values = Arrays.copyOf(values, newLength);
-    }
-
     // compute max delta
     long minValue = pending[0];
     long maxValue = pending[0];
@@ -105,18 +66,8 @@ public class AppendingLongBuffer {
       for (int i = 0; i < pendingOff; ) {
         i += mutable.set(i, pending, i, pendingOff - i);
       }
-      values[valuesOff] = mutable;
-      valuesBytes += mutable.ramBytesUsed();
+      deltas[valuesOff] = mutable;
     }
-    ++valuesOff;
-
-    // reset pending buffer
-    pendingOff = 0;
-  }
-
-  /** Get the number of values that have been added to the buffer. */
-  public long size() {
-    return valuesOff * (long)MAX_PENDING_COUNT + pendingOff;
-  }
 
   /** Return an iterator over the values of this buffer. */
@@ -125,29 +76,20 @@ public class AppendingLongBuffer {
   }
 
   /** A long iterator. */
-  public class Iterator {
-
-    long[] currentValues;
-    int vOff, pOff;
+  public final class Iterator extends AbstractAppendingLongBuffer.Iterator {
 
     private Iterator() {
-      vOff = pOff = 0;
-      if (valuesOff == 0) {
-        currentValues = pending;
-      } else {
-        currentValues = new long[MAX_PENDING_COUNT];
-        fillValues();
-      }
+      super();
     }
 
-    private void fillValues() {
+    void fillValues() {
       if (vOff == valuesOff) {
         currentValues = pending;
-      } else if (values[vOff] == null) {
+      } else if (deltas[vOff] == null) {
         Arrays.fill(currentValues, minValues[vOff]);
       } else {
         for (int k = 0; k < MAX_PENDING_COUNT; ) {
-          k += values[vOff].get(k, currentValues, k, MAX_PENDING_COUNT - k);
+          k += deltas[vOff].get(k, currentValues, k, MAX_PENDING_COUNT - k);
         }
         for (int k = 0; k < MAX_PENDING_COUNT; ++k) {
           currentValues[k] += minValues[vOff];
@@ -155,42 +97,6 @@ public class AppendingLongBuffer {
       }
     }
 
-    /** Whether or not there are remaining values. */
-    public boolean hasNext() {
-      return vOff < valuesOff || (vOff == valuesOff && pOff < pendingOff);
-    }
-
-    /** Return the next long in the buffer. */
-    public long next() {
-      assert hasNext();
-      long result = currentValues[pOff++];
-      if (pOff == MAX_PENDING_COUNT) {
-        vOff += 1;
-        pOff = 0;
-        if (vOff <= valuesOff) {
-          fillValues();
-        }
-      }
-      return result;
-    }
-
-  }
-
-  /**
-   * Return the number of bytes used by this instance.
-   */
-  public long ramBytesUsed() {
-    // TODO: this is called per-doc-per-norms/dv-field, can we optimize this?
-    long bytesUsed = RamUsageEstimator.alignObjectSize(
-        RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
-        + 3 * RamUsageEstimator.NUM_BYTES_OBJECT_REF // the 3 arrays
-        + 2 * RamUsageEstimator.NUM_BYTES_INT) // the 2 offsets
-        + RamUsageEstimator.NUM_BYTES_LONG // valuesBytes
-        + RamUsageEstimator.sizeOf(pending)
-        + RamUsageEstimator.sizeOf(minValues)
-        + RamUsageEstimator.alignObjectSize(RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + (long) RamUsageEstimator.NUM_BYTES_OBJECT_REF * values.length); // values
-
-    return bytesUsed + valuesBytes;
-  }
 
   }
 
 }
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/MonotonicAppendingLongBuffer.java b/lucene/core/src/java/org/apache/lucene/util/packed/MonotonicAppendingLongBuffer.java
new file mode 100644
index 00000000000..4b0099407d1
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/MonotonicAppendingLongBuffer.java
@@ -0,0 +1,139 @@
+package org.apache.lucene.util.packed;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Arrays;
+
+import org.apache.lucene.util.RamUsageEstimator;
+
+/**
+ * Utility class to buffer signed longs in memory, which is optimized for the
+ * case where the sequence is monotonic, although it can encode any sequence of
+ * arbitrary longs. It only supports appending.
+ * @lucene.internal
+ */
+public final class MonotonicAppendingLongBuffer extends AbstractAppendingLongBuffer {
+
+  static long zigZagDecode(long n) {
+    return ((n >>> 1) ^ -(n & 1));
+  }
+
+  static long zigZagEncode(long n) {
+    return (n >> 63) ^ (n << 1);
+  }
+
+  private float[] averages;
+
+  /** Sole constructor. */
+  public MonotonicAppendingLongBuffer() {
+    super(16);
+    averages = new float[16];
+  }
+
+  long get(int block, int element) {
+    if (block == valuesOff) {
+      return pending[element];
+    } else {
+      final long base = minValues[block] + (long) (averages[block] * (long) element);
+      if (deltas[block] == null) {
+        return base;
+      } else {
+        return base + zigZagDecode(deltas[block].get(element));
+      }
+    }
+  }
+
+  @Override
+  void grow(int newBlockCount) {
+    super.grow(newBlockCount);
+    this.averages = Arrays.copyOf(averages, newBlockCount);
+  }
+
+  @Override
+  void packPendingValues() {
+    assert pendingOff == MAX_PENDING_COUNT;
+
+    minValues[valuesOff] = pending[0];
+    averages[valuesOff] = (float) (pending[BLOCK_MASK] - pending[0]) / BLOCK_MASK;
+
+    for (int i = 0; i < MAX_PENDING_COUNT; ++i) {
+      pending[i] = zigZagEncode(pending[i] - minValues[valuesOff] - (long) (averages[valuesOff] * (long) i));
+    }
+    long maxDelta = 0;
+    for (int i = 0; i < MAX_PENDING_COUNT; ++i) {
+      if (pending[i] < 0) {
+        maxDelta = -1;
+        break;
+      } else {
+        maxDelta = Math.max(maxDelta, pending[i]);
+      }
+    }
+    if (maxDelta != 0) {
+      final int bitsRequired = maxDelta < 0 ? 64 : PackedInts.bitsRequired(maxDelta);
+      final PackedInts.Mutable mutable = PackedInts.getMutable(pendingOff, bitsRequired, PackedInts.COMPACT);
+      for (int i = 0; i < pendingOff; ) {
+        i += mutable.set(i, pending, i, pendingOff - i);
+      }
+      deltas[valuesOff] = mutable;
+    }
+  }
+
+  /** Return an iterator over the values of this buffer. */
+  public Iterator iterator() {
+    return new Iterator();
+  }
+
+  /** A long iterator. */
+  public final class Iterator extends AbstractAppendingLongBuffer.Iterator {
+
+    Iterator() {
+      super();
+    }
+
+    void fillValues() {
+      if (vOff == valuesOff) {
+        currentValues = pending;
+      } else if (deltas[vOff] == null) {
+        for (int k = 0; k < MAX_PENDING_COUNT; ++k) {
+          currentValues[k] = minValues[vOff] + (long) (averages[vOff] * (long) k);
+        }
+      } else {
+        for (int k = 0; k < MAX_PENDING_COUNT; ) {
+          k += deltas[vOff].get(k, currentValues, k, MAX_PENDING_COUNT - k);
+        }
+        for (int k = 0; k < MAX_PENDING_COUNT; ++k) {
+          currentValues[k] = minValues[vOff] + (long) (averages[vOff] * (long) k) + zigZagDecode(currentValues[k]);
+        }
+      }
+    }
+
+  }
+
+  @Override
+  long baseRamBytesUsed() {
+    return super.baseRamBytesUsed()
+        + RamUsageEstimator.NUM_BYTES_OBJECT_REF; // the additional array
+  }
+
+  @Override
+  public long ramBytesUsed() {
+    return super.ramBytesUsed()
+        + RamUsageEstimator.sizeOf(averages);
+  }
+
+}
diff --git a/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java b/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java
index 260b114dbfa..10e7b01893e 100644
--- a/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java
+++ b/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java
@@ -805,42 +805,55 @@ public class TestPackedInts extends LuceneTestCase {
   }
 
   public void testAppendingLongBuffer() {
-    final long[] arr = new long[RandomInts.randomIntBetween(random(), 1, 2000000)];
-    for (int bpv : new int[] {0, 1, 63, 64, RandomInts.randomIntBetween(random(), 2, 61)}) {
-      if (bpv == 0) {
-        Arrays.fill(arr, random().nextLong());
-      } else if (bpv == 64) {
+    final long[] arr = new long[RandomInts.randomIntBetween(random(), 1, 1000000)];
+    for (int bpv : new int[] {0, 1, 63, 64, RandomInts.randomIntBetween(random(), 2, 62)}) {
+      for (boolean monotonic : new boolean[] {true, false}) {
+        AbstractAppendingLongBuffer buf;
+        final int inc;
+        if (monotonic) {
+          buf = new MonotonicAppendingLongBuffer();
+          inc = _TestUtil.nextInt(random(), -1000, 1000);
+        } else {
+          buf = new AppendingLongBuffer();
+          inc = 0;
+        }
+        if (bpv == 0) {
+          arr[0] = random().nextLong();
+          for (int i = 1; i < arr.length; ++i) {
+            arr[i] = arr[i-1] + inc;
+          }
+        } else if (bpv == 64) {
+          for (int i = 0; i < arr.length; ++i) {
+            arr[i] = random().nextLong();
+          }
+        } else {
+          final long minValue = _TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE - PackedInts.maxValue(bpv));
+          for (int i = 0; i < arr.length; ++i) {
+            arr[i] = minValue + inc * i + random().nextLong() & PackedInts.maxValue(bpv); // _TestUtil.nextLong is too slow
+          }
+        }
         for (int i = 0; i < arr.length; ++i) {
-          arr[i] = random().nextLong();
+          buf.add(arr[i]);
         }
-      } else {
-        final long minValue = _TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE - PackedInts.maxValue(bpv));
+        assertEquals(arr.length, buf.size());
+        final AbstractAppendingLongBuffer.Iterator it = buf.iterator();
         for (int i = 0; i < arr.length; ++i) {
-          arr[i] = minValue + random().nextLong() & PackedInts.maxValue(bpv); // _TestUtil.nextLong is too slow
+          if (random().nextBoolean()) {
+            assertTrue(it.hasNext());
+          }
+          assertEquals(arr[i], it.next());
         }
-      }
-      AppendingLongBuffer buf = new AppendingLongBuffer();
-      for (int i = 0; i < arr.length; ++i) {
-        buf.add(arr[i]);
-      }
-      assertEquals(arr.length, buf.size());
-      final AppendingLongBuffer.Iterator it = buf.iterator();
-      for (int i = 0; i < arr.length; ++i) {
-        if (random().nextBoolean()) {
-          assertTrue(it.hasNext());
+        assertFalse(it.hasNext());
+
+        for (int i = 0; i < arr.length; ++i) {
+          assertEquals(arr[i], buf.get(i));
         }
-        assertEquals(arr[i], it.next());
+
+        final long expectedBytesUsed = RamUsageEstimator.sizeOf(buf);
+        final long computedBytesUsed = buf.ramBytesUsed();
+        assertEquals("got " + computedBytesUsed + ", expected: " + expectedBytesUsed,
+            expectedBytesUsed, computedBytesUsed);
       }
-      assertFalse(it.hasNext());
-
-      for (int i = 0; i < arr.length; ++i) {
-        assertEquals(arr[i], buf.get(i));
-      }
-
-      final long expectedBytesUsed = RamUsageEstimator.sizeOf(buf);
-      final long computedBytesUsed = buf.ramBytesUsed();
-      assertEquals("got " + computedBytesUsed + ", expected: " + expectedBytesUsed,
-          expectedBytesUsed, computedBytesUsed);
     }
   }
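
Not part of the patch: a minimal, self-contained sketch of how the MonotonicAppendingLongBuffer API introduced above might be exercised. The class and method names come from the diff; the demo class name and the sample values are invented for illustration, and the sketch assumes a Lucene build that already contains this change.

// Illustration only -- not part of the patch. Exercises the API added above;
// the demo class name and sample values are hypothetical.
import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer;

public class MonotonicAppendingLongBufferDemo {
  public static void main(String[] args) {
    MonotonicAppendingLongBuffer buf = new MonotonicAppendingLongBuffer();
    // Append a non-decreasing sequence; each 1024-value block is stored as
    // minValue + average * index plus a zig-zag-encoded, packed delta.
    for (long i = 0; i < 10000; ++i) {
      buf.add(3 * i + (i & 3));
    }
    // Random access.
    System.out.println("value at 1234: " + buf.get(1234));
    // Sequential access through the iterator.
    MonotonicAppendingLongBuffer.Iterator it = buf.iterator();
    long sum = 0;
    while (it.hasNext()) {
      sum += it.next();
    }
    System.out.println("sum=" + sum + ", ~" + buf.ramBytesUsed() + " bytes used");
  }
}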