LUCENE-4780: MonotonicAppendingLongBuffer.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4765@1446896 13f79535-47bb-0310-9956-ffa450edef68
Adrien Grand 2013-02-16 14:02:42 +00:00
parent 8b4207e278
commit 156e036022
5 changed files with 353 additions and 144 deletions
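
For context: the new class buffers longs in blocks of 1024 values and encodes each block as a start value, an average slope, and zig-zag-encoded residuals packed with PackedInts. A minimal usage sketch, based only on the API visible in the diffs below (the demo class is hypothetical, not part of the commit):

import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer;

public class MonotonicBufferDemo {           // hypothetical demo class
  public static void main(String[] args) {
    MonotonicAppendingLongBuffer buf = new MonotonicAppendingLongBuffer();
    for (long i = 0; i < 10000; ++i) {
      buf.add(3 * i);                        // mostly-increasing values compress well
    }
    long v = buf.get(4567);                  // random access: 3 * 4567 = 13701
    MonotonicAppendingLongBuffer.Iterator it = buf.iterator();
    long sum = 0;
    while (it.hasNext()) {
      sum += it.next();                      // sequential access, one 1024-value block at a time
    }
    System.out.println(v + " " + sum + " " + buf.ramBytesUsed());
  }
}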

View File

@@ -24,6 +24,7 @@ import org.apache.lucene.index.MultiTermsEnum.TermsEnumIndex;
 import org.apache.lucene.index.MultiTermsEnum.TermsEnumWithSlice;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.packed.AppendingLongBuffer;
+import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer;
 
 /**
  * A wrapper for CompositeIndexReader providing access to DocValues.
@@ -274,11 +275,11 @@ public class MultiDocValues {
     // cache key of whoever asked for this awful thing
     final Object owner;
     // globalOrd -> (globalOrd - segmentOrd)
-    final AppendingLongBuffer globalOrdDeltas;
+    final MonotonicAppendingLongBuffer globalOrdDeltas;
     // globalOrd -> sub index
     final AppendingLongBuffer subIndexes;
     // segmentOrd -> (globalOrd - segmentOrd)
-    final AppendingLongBuffer ordDeltas[];
+    final MonotonicAppendingLongBuffer ordDeltas[];
 
     /**
      * Creates an ordinal map that allows mapping ords to/from a merged
@@ -292,11 +293,11 @@ public class MultiDocValues {
       // create the ordinal mappings by pulling a termsenum over each sub's
      // unique terms, and walking a multitermsenum over those
       this.owner = owner;
-      globalOrdDeltas = new AppendingLongBuffer();
+      globalOrdDeltas = new MonotonicAppendingLongBuffer();
       subIndexes = new AppendingLongBuffer();
-      ordDeltas = new AppendingLongBuffer[subs.length];
+      ordDeltas = new MonotonicAppendingLongBuffer[subs.length];
       for (int i = 0; i < ordDeltas.length; i++) {
-        ordDeltas[i] = new AppendingLongBuffer();
+        ordDeltas[i] = new MonotonicAppendingLongBuffer();
       }
       long segmentOrds[] = new long[subs.length];
       ReaderSlice slices[] = new ReaderSlice[subs.length];
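
The switch to MonotonicAppendingLongBuffer is safe here because both delta tables are non-decreasing: each step of the global ord advances a segment ord by at most one, so globalOrd - segmentOrd never shrinks. A sketch of how such a table is read back, following the comments above (the helper name is hypothetical, not part of the commit):

// Hypothetical helper, illustration only: globalOrdDeltas stores
// (globalOrd - segmentOrd) for the first segment containing the term,
// so the segment ord is recovered by subtraction.
long firstSegmentOrd(long globalOrd) {
  return globalOrd - globalOrdDeltas.get(globalOrd);
}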

View File

@@ -0,0 +1,150 @@
package org.apache.lucene.util.packed;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Arrays;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;
/** Common functionality shared by {@link AppendingLongBuffer} and {@link MonotonicAppendingLongBuffer}. */
abstract class AbstractAppendingLongBuffer {
static final int BLOCK_BITS = 10;
static final int MAX_PENDING_COUNT = 1 << BLOCK_BITS;
static final int BLOCK_MASK = MAX_PENDING_COUNT - 1;
long[] minValues;
PackedInts.Reader[] deltas;
private long deltasBytes;
int valuesOff;
long[] pending;
int pendingOff;
AbstractAppendingLongBuffer(int initialBlockCount) {
minValues = new long[16];
deltas = new PackedInts.Reader[16];
pending = new long[MAX_PENDING_COUNT];
valuesOff = 0;
pendingOff = 0;
}
/** Get the number of values that have been added to the buffer. */
public final long size() {
return valuesOff * (long) MAX_PENDING_COUNT + pendingOff;
}
/** Append a value to this buffer. */
public final void add(long l) {
if (pendingOff == MAX_PENDING_COUNT) {
// check size
if (deltas.length == valuesOff) {
final int newLength = ArrayUtil.oversize(valuesOff + 1, 8);
grow(newLength);
}
packPendingValues();
if (deltas[valuesOff] != null) {
deltasBytes += deltas[valuesOff].ramBytesUsed();
}
++valuesOff;
// reset pending buffer
pendingOff = 0;
}
pending[pendingOff++] = l;
}
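  // Editorial note, not part of the commit: add() buffers values in pending[]
  // and hands each full block of MAX_PENDING_COUNT = 1024 values to the
  // subclass' packPendingValues(). For example, after 2500 add() calls, two
  // blocks have been packed into deltas[0] and deltas[1] and 452 values
  // remain pending.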
void grow(int newBlockCount) {
minValues = Arrays.copyOf(minValues, newBlockCount);
deltas = Arrays.copyOf(deltas, newBlockCount);
}
abstract void packPendingValues();
/** Get a value from this buffer. */
public final long get(long index) {
if (index < 0 || index >= size()) {
throw new IndexOutOfBoundsException("" + index);
}
int block = (int) (index >> BLOCK_BITS);
int element = (int) (index & BLOCK_MASK);
return get(block, element);
}
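  // Editorial note, not part of the commit: the index is split with shift and
  // mask, e.g. index 3000 maps to block = 3000 >> 10 = 2 and
  // element = 3000 & 1023 = 952.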
abstract long get(int block, int element);
abstract Iterator iterator();
abstract class Iterator {
long[] currentValues;
int vOff, pOff;
Iterator() {
vOff = pOff = 0;
if (valuesOff == 0) {
currentValues = pending;
} else {
currentValues = new long[MAX_PENDING_COUNT];
fillValues();
}
}
abstract void fillValues();
/** Whether or not there are remaining values. */
public final boolean hasNext() {
return vOff < valuesOff || (vOff == valuesOff && pOff < pendingOff);
}
/** Return the next long in the buffer. */
public final long next() {
assert hasNext();
long result = currentValues[pOff++];
if (pOff == MAX_PENDING_COUNT) {
vOff += 1;
pOff = 0;
if (vOff <= valuesOff) {
fillValues();
}
}
return result;
}
}
long baseRamBytesUsed() {
return RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
+ 3 * RamUsageEstimator.NUM_BYTES_OBJECT_REF // the 3 arrays
+ 2 * RamUsageEstimator.NUM_BYTES_INT; // the 2 offsets
}
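  // Editorial note, not part of the commit: this accounts only for the object
  // header, the three array references (minValues, deltas, pending) and the
  // two int offsets; the arrays themselves are added in ramBytesUsed() below.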
/**
* Return the number of bytes used by this instance.
*/
public long ramBytesUsed() {
// TODO: this is called per-doc-per-norms/dv-field, can we optimize this?
long bytesUsed = RamUsageEstimator.alignObjectSize(baseRamBytesUsed())
+ RamUsageEstimator.NUM_BYTES_LONG // valuesBytes
+ RamUsageEstimator.sizeOf(pending)
+ RamUsageEstimator.sizeOf(minValues)
+ RamUsageEstimator.alignObjectSize(RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + (long) RamUsageEstimator.NUM_BYTES_OBJECT_REF * deltas.length); // values
return bytesUsed + deltasBytes;
}
}
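
The Iterator above bulk-decodes one 1024-value block at a time into currentValues, so sequential reads avoid the per-call block arithmetic of get(). A short sketch of the intended read pattern (illustration only, not part of the commit):

AppendingLongBuffer buf = new AppendingLongBuffer();
for (long i = 0; i < 5000; ++i) {
  buf.add(i % 7);                 // small deltas within each block
}
AppendingLongBuffer.Iterator it = buf.iterator();
long sum = 0;
while (it.hasNext()) {
  sum += it.next();               // decoded block by block, not value by value
}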

View File

@@ -19,72 +19,33 @@ package org.apache.lucene.util.packed;
 
 import java.util.Arrays;
 
-import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.RamUsageEstimator;
-
 /**
  * Utility class to buffer a list of signed longs in memory. This class only
- * supports appending.
+ * supports appending and is optimized for the case where values are close to
+ * each other.
  * @lucene.internal
  */
-public class AppendingLongBuffer {
-
-  private static final int BLOCK_BITS = 10;
-  private static final int MAX_PENDING_COUNT = 1 << BLOCK_BITS;
-  private static final int BLOCK_MASK = MAX_PENDING_COUNT - 1;
-
-  private long[] minValues;
-  private PackedInts.Reader[] values;
-  private long valuesBytes;
-  private int valuesOff;
-  private long[] pending;
-  private int pendingOff;
+public final class AppendingLongBuffer extends AbstractAppendingLongBuffer {
 
   /** Sole constructor. */
   public AppendingLongBuffer() {
-    minValues = new long[16];
-    values = new PackedInts.Reader[16];
-    pending = new long[MAX_PENDING_COUNT];
-    valuesOff = 0;
-    pendingOff = 0;
+    super(16);
   }
 
-  /** Append a value to this buffer. */
-  public void add(long l) {
-    if (pendingOff == MAX_PENDING_COUNT) {
-      packPendingValues();
-    }
-    pending[pendingOff++] = l;
-  }
-
-  /** Get a value from this buffer.
-   * <p>
-   * <b>NOTE</b>: This class is not really designed for random access!
-   * You will likely get better performance by using packed ints in another way! */
-  public long get(long index) {
-    assert index < size() : "index=" + index + ",size=" + size(); // TODO: do a better check, and throw IndexOutOfBoundsException?
-    // This class is currently only used by the indexer.
-    int block = (int) (index >> BLOCK_BITS);
-    int element = (int) (index & BLOCK_MASK);
+  @Override
+  long get(int block, int element) {
     if (block == valuesOff) {
       return pending[element];
-    } else if (values[block] == null) {
+    } else if (deltas[block] == null) {
       return minValues[block];
     } else {
-      return minValues[block] + values[block].get(element);
+      return minValues[block] + deltas[block].get(element);
     }
   }
 
-  private void packPendingValues() {
+  void packPendingValues() {
     assert pendingOff == MAX_PENDING_COUNT;
-
-    // check size
-    if (values.length == valuesOff) {
-      final int newLength = ArrayUtil.oversize(valuesOff + 1, 8);
-      minValues = Arrays.copyOf(minValues, newLength);
-      values = Arrays.copyOf(values, newLength);
-    }
 
     // compute max delta
     long minValue = pending[0];
     long maxValue = pending[0];
@@ -105,18 +66,8 @@ public class AppendingLongBuffer {
       for (int i = 0; i < pendingOff; ) {
         i += mutable.set(i, pending, i, pendingOff - i);
       }
-      values[valuesOff] = mutable;
-      valuesBytes += mutable.ramBytesUsed();
+      deltas[valuesOff] = mutable;
     }
-
-    ++valuesOff;
-    // reset pending buffer
-    pendingOff = 0;
   }
-
-  /** Get the number of values that have been added to the buffer. */
-  public long size() {
-    return valuesOff * (long)MAX_PENDING_COUNT + pendingOff;
-  }
 
   /** Return an iterator over the values of this buffer. */
@@ -125,29 +76,20 @@ public class AppendingLongBuffer {
   }
 
   /** A long iterator. */
-  public class Iterator {
-
-    long[] currentValues;
-    int vOff, pOff;
+  public final class Iterator extends AbstractAppendingLongBuffer.Iterator {
 
     private Iterator() {
-      vOff = pOff = 0;
-      if (valuesOff == 0) {
-        currentValues = pending;
-      } else {
-        currentValues = new long[MAX_PENDING_COUNT];
-        fillValues();
-      }
+      super();
     }
 
-    private void fillValues() {
+    void fillValues() {
       if (vOff == valuesOff) {
         currentValues = pending;
-      } else if (values[vOff] == null) {
+      } else if (deltas[vOff] == null) {
         Arrays.fill(currentValues, minValues[vOff]);
       } else {
         for (int k = 0; k < MAX_PENDING_COUNT; ) {
-          k += values[vOff].get(k, currentValues, k, MAX_PENDING_COUNT - k);
+          k += deltas[vOff].get(k, currentValues, k, MAX_PENDING_COUNT - k);
         }
         for (int k = 0; k < MAX_PENDING_COUNT; ++k) {
           currentValues[k] += minValues[vOff];
@@ -155,42 +97,6 @@ public class AppendingLongBuffer {
       }
     }
 
-    /** Whether or not there are remaining values. */
-    public boolean hasNext() {
-      return vOff < valuesOff || (vOff == valuesOff && pOff < pendingOff);
-    }
-
-    /** Return the next long in the buffer. */
-    public long next() {
-      assert hasNext();
-      long result = currentValues[pOff++];
-      if (pOff == MAX_PENDING_COUNT) {
-        vOff += 1;
-        pOff = 0;
-        if (vOff <= valuesOff) {
-          fillValues();
-        }
-      }
-      return result;
-    }
-
   }
 
-  /**
-   * Return the number of bytes used by this instance.
-   */
-  public long ramBytesUsed() {
-    // TODO: this is called per-doc-per-norms/dv-field, can we optimize this?
-    long bytesUsed = RamUsageEstimator.alignObjectSize(
-        RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
-        + 3 * RamUsageEstimator.NUM_BYTES_OBJECT_REF // the 3 arrays
-        + 2 * RamUsageEstimator.NUM_BYTES_INT) // the 2 offsets
-        + RamUsageEstimator.NUM_BYTES_LONG // valuesBytes
-        + RamUsageEstimator.sizeOf(pending)
-        + RamUsageEstimator.sizeOf(minValues)
-        + RamUsageEstimator.alignObjectSize(RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + (long) RamUsageEstimator.NUM_BYTES_OBJECT_REF * values.length); // values
-    return bytesUsed + valuesBytes;
-  }
-
 }
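
For intuition about the (unchanged) packing scheme: each 1024-value block stores its minimum plus per-value deltas at bitsRequired(maxDelta) bits. An editorial back-of-the-envelope, assuming a block whose values span a range of 100:

long maxDelta = 100;                          // largest (value - blockMin) in the block
int bits = PackedInts.bitsRequired(maxDelta); // 7 bits, since 100 < 2^7
long packed = 1024L * bits / 8;               // ~896 bytes per block vs 8192 raw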

View File

@@ -0,0 +1,139 @@
package org.apache.lucene.util.packed;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Arrays;
import org.apache.lucene.util.RamUsageEstimator;
/**
* Utility class to buffer signed longs in memory, which is optimized for the
* case where the sequence is monotonic, although it can encode any sequence of
* arbitrary longs. It only supports appending.
* @lucene.internal
*/
public final class MonotonicAppendingLongBuffer extends AbstractAppendingLongBuffer {
static long zigZagDecode(long n) {
return ((n >>> 1) ^ -(n & 1));
}
static long zigZagEncode(long n) {
return (n >> 63) ^ (n << 1);
}
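  // Editorial note, not part of the commit: zig-zag encoding folds the sign
  // bit into the low bit so that residuals of small magnitude, positive or
  // negative, pack into few bits: 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, ...
  // zigZagDecode inverts it exactly, e.g. zigZagDecode(zigZagEncode(-3)) == -3.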
private float[] averages;
/** Sole constructor. */
public MonotonicAppendingLongBuffer() {
super(16);
averages = new float[16];
}
long get(int block, int element) {
if (block == valuesOff) {
return pending[element];
} else {
final long base = minValues[block] + (long) (averages[block] * (long) element);
if (deltas[block] == null) {
return base;
} else {
return base + zigZagDecode(deltas[block].get(element));
}
}
}
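  // Editorial example, not part of the commit: with minValues[block] = 100 and
  // averages[block] = 2.0f, element 10 reconstructs as
  // base = 100 + (long) (2.0f * 10) = 120, plus the zig-zag-decoded residual
  // from deltas[block] (zero when the block is perfectly linear and
  // deltas[block] is null).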
@Override
void grow(int newBlockCount) {
super.grow(newBlockCount);
this.averages = Arrays.copyOf(averages, newBlockCount);
}
@Override
void packPendingValues() {
assert pendingOff == MAX_PENDING_COUNT;
minValues[valuesOff] = pending[0];
averages[valuesOff] = (float) (pending[BLOCK_MASK] - pending[0]) / BLOCK_MASK;
for (int i = 0; i < MAX_PENDING_COUNT; ++i) {
pending[i] = zigZagEncode(pending[i] - minValues[valuesOff] - (long) (averages[valuesOff] * (long) i));
}
long maxDelta = 0;
for (int i = 0; i < MAX_PENDING_COUNT; ++i) {
if (pending[i] < 0) {
maxDelta = -1;
break;
} else {
maxDelta = Math.max(maxDelta, pending[i]);
}
}
if (maxDelta != 0) {
final int bitsRequired = maxDelta < 0 ? 64 : PackedInts.bitsRequired(maxDelta);
final PackedInts.Mutable mutable = PackedInts.getMutable(pendingOff, bitsRequired, PackedInts.COMPACT);
for (int i = 0; i < pendingOff; ) {
i += mutable.set(i, pending, i, pendingOff - i);
}
deltas[valuesOff] = mutable;
}
}
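  // Editorial example, not part of the commit: pending = {100, 102, ..., 2146}
  // (step 2) yields minValues[valuesOff] = 100 and averages[valuesOff]
  // = (2146 - 100) / 1023f = 2.0f; every residual zig-zag-encodes to 0, so
  // maxDelta stays 0 and no deltas block is allocated for this perfectly
  // linear sequence.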
/** Return an iterator over the values of this buffer. */
public Iterator iterator() {
return new Iterator();
}
/** A long iterator. */
public final class Iterator extends AbstractAppendingLongBuffer.Iterator {
Iterator() {
super();
}
void fillValues() {
if (vOff == valuesOff) {
currentValues = pending;
} else if (deltas[vOff] == null) {
for (int k = 0; k < MAX_PENDING_COUNT; ++k) {
currentValues[k] = minValues[vOff] + (long) (averages[vOff] * (long) k);
}
} else {
for (int k = 0; k < MAX_PENDING_COUNT; ) {
k += deltas[vOff].get(k, currentValues, k, MAX_PENDING_COUNT - k);
}
for (int k = 0; k < MAX_PENDING_COUNT; ++k) {
currentValues[k] = minValues[vOff] + (long) (averages[vOff] * (long) k) + zigZagDecode(currentValues[k]);
}
}
}
}
@Override
long baseRamBytesUsed() {
return super.baseRamBytesUsed()
+ RamUsageEstimator.NUM_BYTES_OBJECT_REF; // the additional array
}
@Override
public long ramBytesUsed() {
return super.ramBytesUsed()
+ RamUsageEstimator.sizeOf(averages);
}
}
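
To see the monotonic specialization pay off over the plain min-plus-delta encoding, a comparison sketch (illustration only, not part of the commit; exact byte counts depend on the PackedInts format chosen):

MonotonicAppendingLongBuffer mono = new MonotonicAppendingLongBuffer();
AppendingLongBuffer plain = new AppendingLongBuffer();
for (long i = 0; i < (1 << 20); ++i) {
  mono.add(5 * i);   // slope 5 is captured by averages[]; all residuals are 0
  plain.add(5 * i);  // within a block, deltas span 5 * 1023, i.e. 13 bits/value
}
System.out.println(mono.ramBytesUsed() + " <= " + plain.ramBytesUsed());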

View File

@@ -805,42 +805,55 @@ public class TestPackedInts extends LuceneTestCase {
   }
 
   public void testAppendingLongBuffer() {
-    final long[] arr = new long[RandomInts.randomIntBetween(random(), 1, 2000000)];
-    for (int bpv : new int[] {0, 1, 63, 64, RandomInts.randomIntBetween(random(), 2, 61)}) {
-      if (bpv == 0) {
-        Arrays.fill(arr, random().nextLong());
-      } else if (bpv == 64) {
-        for (int i = 0; i < arr.length; ++i) {
-          arr[i] = random().nextLong();
-        }
-      } else {
-        final long minValue = _TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE - PackedInts.maxValue(bpv));
-        for (int i = 0; i < arr.length; ++i) {
-          arr[i] = minValue + random().nextLong() & PackedInts.maxValue(bpv); // _TestUtil.nextLong is too slow
-        }
-      }
-      AppendingLongBuffer buf = new AppendingLongBuffer();
-      for (int i = 0; i < arr.length; ++i) {
-        buf.add(arr[i]);
-      }
-      assertEquals(arr.length, buf.size());
-      final AppendingLongBuffer.Iterator it = buf.iterator();
-      for (int i = 0; i < arr.length; ++i) {
-        if (random().nextBoolean()) {
-          assertTrue(it.hasNext());
-        }
-        assertEquals(arr[i], it.next());
-      }
-      assertFalse(it.hasNext());
-      for (int i = 0; i < arr.length; ++i) {
-        assertEquals(arr[i], buf.get(i));
-      }
-      final long expectedBytesUsed = RamUsageEstimator.sizeOf(buf);
-      final long computedBytesUsed = buf.ramBytesUsed();
-      assertEquals("got " + computedBytesUsed + ", expected: " + expectedBytesUsed,
-          expectedBytesUsed, computedBytesUsed);
+    final long[] arr = new long[RandomInts.randomIntBetween(random(), 1, 1000000)];
+    for (int bpv : new int[] {0, 1, 63, 64, RandomInts.randomIntBetween(random(), 2, 62)}) {
+      for (boolean monotonic : new boolean[] {true, false}) {
+        AbstractAppendingLongBuffer buf;
+        final int inc;
+        if (monotonic) {
+          buf = new MonotonicAppendingLongBuffer();
+          inc = _TestUtil.nextInt(random(), -1000, 1000);
+        } else {
+          buf = new AppendingLongBuffer();
+          inc = 0;
+        }
+        if (bpv == 0) {
+          arr[0] = random().nextLong();
+          for (int i = 1; i < arr.length; ++i) {
+            arr[i] = arr[i-1] + inc;
+          }
+        } else if (bpv == 64) {
+          for (int i = 0; i < arr.length; ++i) {
+            arr[i] = random().nextLong();
+          }
+        } else {
+          final long minValue = _TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE - PackedInts.maxValue(bpv));
+          for (int i = 0; i < arr.length; ++i) {
+            arr[i] = minValue + inc * i + random().nextLong() & PackedInts.maxValue(bpv); // _TestUtil.nextLong is too slow
+          }
+        }
+        for (int i = 0; i < arr.length; ++i) {
+          buf.add(arr[i]);
+        }
+        assertEquals(arr.length, buf.size());
+        final AbstractAppendingLongBuffer.Iterator it = buf.iterator();
+        for (int i = 0; i < arr.length; ++i) {
+          if (random().nextBoolean()) {
+            assertTrue(it.hasNext());
+          }
+          assertEquals(arr[i], it.next());
+        }
+        assertFalse(it.hasNext());
+        for (int i = 0; i < arr.length; ++i) {
+          assertEquals(arr[i], buf.get(i));
+        }
+        final long expectedBytesUsed = RamUsageEstimator.sizeOf(buf);
+        final long computedBytesUsed = buf.ramBytesUsed();
+        assertEquals("got " + computedBytesUsed + ", expected: " + expectedBytesUsed,
+            expectedBytesUsed, computedBytesUsed);
+      }
     }
   }