LUCENE-4780: MonotonicAppendingLongBuffer.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4765@1446896 13f79535-47bb-0310-9956-ffa450edef68
Adrien Grand 2013-02-16 14:02:42 +00:00
parent 8b4207e278
commit 156e036022
5 changed files with 353 additions and 144 deletions
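
For context: the new class buffers longs in blocks of 1024 values and encodes each block as a start value, an average slope, and zig-zag-encoded residuals packed with PackedInts. A minimal usage sketch, based only on the API visible in the diffs below (the demo class is hypothetical, not part of the commit):

import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer;

public class MonotonicBufferDemo {           // hypothetical demo class
  public static void main(String[] args) {
    MonotonicAppendingLongBuffer buf = new MonotonicAppendingLongBuffer();
    for (long i = 0; i < 10000; ++i) {
      buf.add(3 * i);                        // mostly-increasing values compress well
    }
    long v = buf.get(4567);                  // random access: 3 * 4567 = 13701
    MonotonicAppendingLongBuffer.Iterator it = buf.iterator();
    long sum = 0;
    while (it.hasNext()) {
      sum += it.next();                      // sequential access, one 1024-value block at a time
    }
    System.out.println(v + " " + sum + " " + buf.ramBytesUsed());
  }
}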

View File

@@ -24,6 +24,7 @@ import org.apache.lucene.index.MultiTermsEnum.TermsEnumIndex;
 import org.apache.lucene.index.MultiTermsEnum.TermsEnumWithSlice;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.packed.AppendingLongBuffer;
+import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer;
 
 /**
  * A wrapper for CompositeIndexReader providing access to DocValues.
@@ -274,11 +275,11 @@ public class MultiDocValues {
     // cache key of whoever asked for this awful thing
     final Object owner;
     // globalOrd -> (globalOrd - segmentOrd)
-    final AppendingLongBuffer globalOrdDeltas;
+    final MonotonicAppendingLongBuffer globalOrdDeltas;
     // globalOrd -> sub index
     final AppendingLongBuffer subIndexes;
     // segmentOrd -> (globalOrd - segmentOrd)
-    final AppendingLongBuffer ordDeltas[];
+    final MonotonicAppendingLongBuffer ordDeltas[];
 
     /**
      * Creates an ordinal map that allows mapping ords to/from a merged
@@ -292,11 +293,11 @@ public class MultiDocValues {
       // create the ordinal mappings by pulling a termsenum over each sub's
      // unique terms, and walking a multitermsenum over those
       this.owner = owner;
-      globalOrdDeltas = new AppendingLongBuffer();
+      globalOrdDeltas = new MonotonicAppendingLongBuffer();
       subIndexes = new AppendingLongBuffer();
-      ordDeltas = new AppendingLongBuffer[subs.length];
+      ordDeltas = new MonotonicAppendingLongBuffer[subs.length];
       for (int i = 0; i < ordDeltas.length; i++) {
-        ordDeltas[i] = new AppendingLongBuffer();
+        ordDeltas[i] = new MonotonicAppendingLongBuffer();
       }
       long segmentOrds[] = new long[subs.length];
       ReaderSlice slices[] = new ReaderSlice[subs.length];
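
The switch to MonotonicAppendingLongBuffer is safe here because both delta tables are non-decreasing: each step of the global ord advances a segment ord by at most one, so globalOrd - segmentOrd never shrinks. A sketch of how such a table is read back, following the comments above (the helper name is hypothetical, not part of the commit):

// Hypothetical helper, illustration only: globalOrdDeltas stores
// (globalOrd - segmentOrd) for the first segment containing the term,
// so the segment ord is recovered by subtraction.
long firstSegmentOrd(long globalOrd) {
  return globalOrd - globalOrdDeltas.get(globalOrd);
}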

View File

@@ -0,0 +1,150 @@
package org.apache.lucene.util.packed;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Arrays;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;
/** Common functionality shared by {@link AppendingLongBuffer} and {@link MonotonicAppendingLongBuffer}. */
abstract class AbstractAppendingLongBuffer {
static final int BLOCK_BITS = 10;
static final int MAX_PENDING_COUNT = 1 << BLOCK_BITS;
static final int BLOCK_MASK = MAX_PENDING_COUNT - 1;
long[] minValues;
PackedInts.Reader[] deltas;
private long deltasBytes;
int valuesOff;
long[] pending;
int pendingOff;
AbstractAppendingLongBuffer(int initialBlockCount) {
minValues = new long[16];
deltas = new PackedInts.Reader[16];
pending = new long[MAX_PENDING_COUNT];
valuesOff = 0;
pendingOff = 0;
}
/** Get the number of values that have been added to the buffer. */
public final long size() {
return valuesOff * (long) MAX_PENDING_COUNT + pendingOff;
}
/** Append a value to this buffer. */
public final void add(long l) {
if (pendingOff == MAX_PENDING_COUNT) {
// check size
if (deltas.length == valuesOff) {
final int newLength = ArrayUtil.oversize(valuesOff + 1, 8);
grow(newLength);
}
packPendingValues();
if (deltas[valuesOff] != null) {
deltasBytes += deltas[valuesOff].ramBytesUsed();
}
++valuesOff;
// reset pending buffer
pendingOff = 0;
}
pending[pendingOff++] = l;
}
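  // Editorial note, not part of the commit: add() buffers values in pending[]
  // and hands each full block of MAX_PENDING_COUNT = 1024 values to the
  // subclass' packPendingValues(). For example, after 2500 add() calls, two
  // blocks have been packed into deltas[0] and deltas[1] and 452 values
  // remain pending.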
void grow(int newBlockCount) {
minValues = Arrays.copyOf(minValues, newBlockCount);
deltas = Arrays.copyOf(deltas, newBlockCount);
}
abstract void packPendingValues();
/** Get a value from this buffer. */
public final long get(long index) {
if (index < 0 || index >= size()) {
throw new IndexOutOfBoundsException("" + index);
}
int block = (int) (index >> BLOCK_BITS);
int element = (int) (index & BLOCK_MASK);
return get(block, element);
}
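  // Editorial note, not part of the commit: the index is split with shift and
  // mask, e.g. index 3000 maps to block = 3000 >> 10 = 2 and
  // element = 3000 & 1023 = 952.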
abstract long get(int block, int element);
abstract Iterator iterator();
abstract class Iterator {
long[] currentValues;
int vOff, pOff;
Iterator() {
vOff = pOff = 0;
if (valuesOff == 0) {
currentValues = pending;
} else {
currentValues = new long[MAX_PENDING_COUNT];
fillValues();
}
}
abstract void fillValues();
/** Whether or not there are remaining values. */
public final boolean hasNext() {
return vOff < valuesOff || (vOff == valuesOff && pOff < pendingOff);
}
/** Return the next long in the buffer. */
public final long next() {
assert hasNext();
long result = currentValues[pOff++];
if (pOff == MAX_PENDING_COUNT) {
vOff += 1;
pOff = 0;
if (vOff <= valuesOff) {
fillValues();
}
}
return result;
}
}
long baseRamBytesUsed() {
return RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
+ 3 * RamUsageEstimator.NUM_BYTES_OBJECT_REF // the 3 arrays
+ 2 * RamUsageEstimator.NUM_BYTES_INT; // the 2 offsets
}
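  // Editorial note, not part of the commit: this accounts only for the object
  // header, the three array references (minValues, deltas, pending) and the
  // two int offsets; the arrays themselves are added in ramBytesUsed() below.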
/**
* Return the number of bytes used by this instance.
*/
public long ramBytesUsed() {
// TODO: this is called per-doc-per-norms/dv-field, can we optimize this?
long bytesUsed = RamUsageEstimator.alignObjectSize(baseRamBytesUsed())
+ RamUsageEstimator.NUM_BYTES_LONG // valuesBytes
+ RamUsageEstimator.sizeOf(pending)
+ RamUsageEstimator.sizeOf(minValues)
+ RamUsageEstimator.alignObjectSize(RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + (long) RamUsageEstimator.NUM_BYTES_OBJECT_REF * deltas.length); // values
return bytesUsed + deltasBytes;
}
}
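
The Iterator above bulk-decodes one 1024-value block at a time into currentValues, so sequential reads avoid the per-call block arithmetic of get(). A short sketch of the intended read pattern (illustration only, not part of the commit):

AppendingLongBuffer buf = new AppendingLongBuffer();
for (long i = 0; i < 5000; ++i) {
  buf.add(i % 7);                 // small deltas within each block
}
AppendingLongBuffer.Iterator it = buf.iterator();
long sum = 0;
while (it.hasNext()) {
  sum += it.next();               // decoded block by block, not value by value
}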

View File

@@ -19,72 +19,33 @@ package org.apache.lucene.util.packed;
 
 import java.util.Arrays;
 
-import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.RamUsageEstimator;
-
 /**
  * Utility class to buffer a list of signed longs in memory. This class only
- * supports appending.
+ * supports appending and is optimized for the case where values are close to
+ * each other.
  * @lucene.internal
  */
-public class AppendingLongBuffer {
-
-  private static final int BLOCK_BITS = 10;
-  private static final int MAX_PENDING_COUNT = 1 << BLOCK_BITS;
-  private static final int BLOCK_MASK = MAX_PENDING_COUNT - 1;
-
-  private long[] minValues;
-  private PackedInts.Reader[] values;
-  private long valuesBytes;
-  private int valuesOff;
-  private long[] pending;
-  private int pendingOff;
+public final class AppendingLongBuffer extends AbstractAppendingLongBuffer {
 
   /** Sole constructor. */
   public AppendingLongBuffer() {
-    minValues = new long[16];
-    values = new PackedInts.Reader[16];
-    pending = new long[MAX_PENDING_COUNT];
-    valuesOff = 0;
-    pendingOff = 0;
+    super(16);
   }
 
-  /** Append a value to this buffer. */
-  public void add(long l) {
-    if (pendingOff == MAX_PENDING_COUNT) {
-      packPendingValues();
-    }
-    pending[pendingOff++] = l;
-  }
-
-  /** Get a value from this buffer.
-   * <p>
-   * <b>NOTE</b>: This class is not really designed for random access!
-   * You will likely get better performance by using packed ints in another way! */
-  public long get(long index) {
-    assert index < size() : "index=" + index + ",size=" + size(); // TODO: do a better check, and throw IndexOutOfBoundsException?
-    // This class is currently only used by the indexer.
-    int block = (int) (index >> BLOCK_BITS);
-    int element = (int) (index & BLOCK_MASK);
+  @Override
+  long get(int block, int element) {
     if (block == valuesOff) {
       return pending[element];
-    } else if (values[block] == null) {
+    } else if (deltas[block] == null) {
       return minValues[block];
     } else {
-      return minValues[block] + values[block].get(element);
+      return minValues[block] + deltas[block].get(element);
     }
   }
 
-  private void packPendingValues() {
+  void packPendingValues() {
     assert pendingOff == MAX_PENDING_COUNT;
-
-    // check size
-    if (values.length == valuesOff) {
-      final int newLength = ArrayUtil.oversize(valuesOff + 1, 8);
-      minValues = Arrays.copyOf(minValues, newLength);
-      values = Arrays.copyOf(values, newLength);
-    }
 
     // compute max delta
     long minValue = pending[0];
     long maxValue = pending[0];
@@ -105,18 +66,8 @@ public class AppendingLongBuffer {
       for (int i = 0; i < pendingOff; ) {
         i += mutable.set(i, pending, i, pendingOff - i);
       }
-      values[valuesOff] = mutable;
-      valuesBytes += mutable.ramBytesUsed();
+      deltas[valuesOff] = mutable;
     }
-
-    ++valuesOff;
-    // reset pending buffer
-    pendingOff = 0;
   }
-
-  /** Get the number of values that have been added to the buffer. */
-  public long size() {
-    return valuesOff * (long)MAX_PENDING_COUNT + pendingOff;
-  }
 
   /** Return an iterator over the values of this buffer. */
@@ -125,29 +76,20 @@ public class AppendingLongBuffer {
   }
 
   /** A long iterator. */
-  public class Iterator {
-
-    long[] currentValues;
-    int vOff, pOff;
+  public final class Iterator extends AbstractAppendingLongBuffer.Iterator {
 
     private Iterator() {
-      vOff = pOff = 0;
-      if (valuesOff == 0) {
-        currentValues = pending;
-      } else {
-        currentValues = new long[MAX_PENDING_COUNT];
-        fillValues();
-      }
+      super();
     }
 
-    private void fillValues() {
+    void fillValues() {
       if (vOff == valuesOff) {
         currentValues = pending;
-      } else if (values[vOff] == null) {
+      } else if (deltas[vOff] == null) {
         Arrays.fill(currentValues, minValues[vOff]);
       } else {
         for (int k = 0; k < MAX_PENDING_COUNT; ) {
-          k += values[vOff].get(k, currentValues, k, MAX_PENDING_COUNT - k);
+          k += deltas[vOff].get(k, currentValues, k, MAX_PENDING_COUNT - k);
         }
         for (int k = 0; k < MAX_PENDING_COUNT; ++k) {
           currentValues[k] += minValues[vOff];
@@ -155,42 +97,6 @@ public class AppendingLongBuffer {
       }
     }
 
-    /** Whether or not there are remaining values. */
-    public boolean hasNext() {
-      return vOff < valuesOff || (vOff == valuesOff && pOff < pendingOff);
-    }
-
-    /** Return the next long in the buffer. */
-    public long next() {
-      assert hasNext();
-      long result = currentValues[pOff++];
-      if (pOff == MAX_PENDING_COUNT) {
-        vOff += 1;
-        pOff = 0;
-        if (vOff <= valuesOff) {
-          fillValues();
-        }
-      }
-      return result;
-    }
-
   }
 
-  /**
-   * Return the number of bytes used by this instance.
-   */
-  public long ramBytesUsed() {
-    // TODO: this is called per-doc-per-norms/dv-field, can we optimize this?
-    long bytesUsed = RamUsageEstimator.alignObjectSize(
-        RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
-        + 3 * RamUsageEstimator.NUM_BYTES_OBJECT_REF // the 3 arrays
-        + 2 * RamUsageEstimator.NUM_BYTES_INT) // the 2 offsets
-        + RamUsageEstimator.NUM_BYTES_LONG // valuesBytes
-        + RamUsageEstimator.sizeOf(pending)
-        + RamUsageEstimator.sizeOf(minValues)
-        + RamUsageEstimator.alignObjectSize(RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + (long) RamUsageEstimator.NUM_BYTES_OBJECT_REF * values.length); // values
-    return bytesUsed + valuesBytes;
-  }
-
 }
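
For intuition about the (unchanged) packing scheme: each 1024-value block stores its minimum plus per-value deltas at bitsRequired(maxDelta) bits. An editorial back-of-the-envelope, assuming a block whose values span a range of 100:

long maxDelta = 100;                          // largest (value - blockMin) in the block
int bits = PackedInts.bitsRequired(maxDelta); // 7 bits, since 100 < 2^7
long packed = 1024L * bits / 8;               // ~896 bytes per block vs 8192 raw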

View File

@@ -0,0 +1,139 @@
package org.apache.lucene.util.packed;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Arrays;
import org.apache.lucene.util.RamUsageEstimator;
/**
* Utility class to buffer signed longs in memory, which is optimized for the
* case where the sequence is monotonic, although it can encode any sequence of
* arbitrary longs. It only supports appending.
* @lucene.internal
*/
public final class MonotonicAppendingLongBuffer extends AbstractAppendingLongBuffer {
static long zigZagDecode(long n) {
return ((n >>> 1) ^ -(n & 1));
}
static long zigZagEncode(long n) {
return (n >> 63) ^ (n << 1);
}
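  // Editorial note, not part of the commit: zig-zag encoding folds the sign
  // bit into the low bit so that residuals of small magnitude, positive or
  // negative, pack into few bits: 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, ...
  // zigZagDecode inverts it exactly, e.g. zigZagDecode(zigZagEncode(-3)) == -3.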
private float[] averages;
/** Sole constructor. */
public MonotonicAppendingLongBuffer() {
super(16);
averages = new float[16];
}
long get(int block, int element) {
if (block == valuesOff) {
return pending[element];
} else {
final long base = minValues[block] + (long) (averages[block] * (long) element);
if (deltas[block] == null) {
return base;
} else {
return base + zigZagDecode(deltas[block].get(element));
}
}
}
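  // Editorial example, not part of the commit: with minValues[block] = 100 and
  // averages[block] = 2.0f, element 10 reconstructs as
  // base = 100 + (long) (2.0f * 10) = 120, plus the zig-zag-decoded residual
  // from deltas[block] (zero when the block is perfectly linear and
  // deltas[block] is null).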
@Override
void grow(int newBlockCount) {
super.grow(newBlockCount);
this.averages = Arrays.copyOf(averages, newBlockCount);
}
@Override
void packPendingValues() {
assert pendingOff == MAX_PENDING_COUNT;
minValues[valuesOff] = pending[0];
averages[valuesOff] = (float) (pending[BLOCK_MASK] - pending[0]) / BLOCK_MASK;
for (int i = 0; i < MAX_PENDING_COUNT; ++i) {
pending[i] = zigZagEncode(pending[i] - minValues[valuesOff] - (long) (averages[valuesOff] * (long) i));
}
long maxDelta = 0;
for (int i = 0; i < MAX_PENDING_COUNT; ++i) {
if (pending[i] < 0) {
maxDelta = -1;
break;
} else {
maxDelta = Math.max(maxDelta, pending[i]);
}
}
if (maxDelta != 0) {
final int bitsRequired = maxDelta < 0 ? 64 : PackedInts.bitsRequired(maxDelta);
final PackedInts.Mutable mutable = PackedInts.getMutable(pendingOff, bitsRequired, PackedInts.COMPACT);
for (int i = 0; i < pendingOff; ) {
i += mutable.set(i, pending, i, pendingOff - i);
}
deltas[valuesOff] = mutable;
}
}
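  // Editorial example, not part of the commit: pending = {100, 102, ..., 2146}
  // (step 2) yields minValues[valuesOff] = 100 and averages[valuesOff]
  // = (2146 - 100) / 1023f = 2.0f; every residual zig-zag-encodes to 0, so
  // maxDelta stays 0 and no deltas block is allocated for this perfectly
  // linear sequence.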
/** Return an iterator over the values of this buffer. */
public Iterator iterator() {
return new Iterator();
}
/** A long iterator. */
public final class Iterator extends AbstractAppendingLongBuffer.Iterator {
Iterator() {
super();
}
void fillValues() {
if (vOff == valuesOff) {
currentValues = pending;
} else if (deltas[vOff] == null) {
for (int k = 0; k < MAX_PENDING_COUNT; ++k) {
currentValues[k] = minValues[vOff] + (long) (averages[vOff] * (long) k);
}
} else {
for (int k = 0; k < MAX_PENDING_COUNT; ) {
k += deltas[vOff].get(k, currentValues, k, MAX_PENDING_COUNT - k);
}
for (int k = 0; k < MAX_PENDING_COUNT; ++k) {
currentValues[k] = minValues[vOff] + (long) (averages[vOff] * (long) k) + zigZagDecode(currentValues[k]);
}
}
}
}
@Override
long baseRamBytesUsed() {
return super.baseRamBytesUsed()
+ RamUsageEstimator.NUM_BYTES_OBJECT_REF; // the additional array
}
@Override
public long ramBytesUsed() {
return super.ramBytesUsed()
+ RamUsageEstimator.sizeOf(averages);
}
}
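
To see the monotonic specialization pay off over the plain min-plus-delta encoding, a comparison sketch (illustration only, not part of the commit; exact byte counts depend on the PackedInts format chosen):

MonotonicAppendingLongBuffer mono = new MonotonicAppendingLongBuffer();
AppendingLongBuffer plain = new AppendingLongBuffer();
for (long i = 0; i < (1 << 20); ++i) {
  mono.add(5 * i);   // slope 5 is captured by averages[]; all residuals are 0
  plain.add(5 * i);  // within a block, deltas span 5 * 1023, i.e. 13 bits/value
}
System.out.println(mono.ramBytesUsed() + " <= " + plain.ramBytesUsed());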

View File

@@ -805,42 +805,55 @@ public class TestPackedInts extends LuceneTestCase {
   }
 
   public void testAppendingLongBuffer() {
-    final long[] arr = new long[RandomInts.randomIntBetween(random(), 1, 2000000)];
-    for (int bpv : new int[] {0, 1, 63, 64, RandomInts.randomIntBetween(random(), 2, 61)}) {
-      if (bpv == 0) {
-        Arrays.fill(arr, random().nextLong());
-      } else if (bpv == 64) {
-        for (int i = 0; i < arr.length; ++i) {
-          arr[i] = random().nextLong();
-        }
-      } else {
-        final long minValue = _TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE - PackedInts.maxValue(bpv));
-        for (int i = 0; i < arr.length; ++i) {
-          arr[i] = minValue + random().nextLong() & PackedInts.maxValue(bpv); // _TestUtil.nextLong is too slow
-        }
-      }
-      AppendingLongBuffer buf = new AppendingLongBuffer();
-      for (int i = 0; i < arr.length; ++i) {
-        buf.add(arr[i]);
-      }
-      assertEquals(arr.length, buf.size());
-      final AppendingLongBuffer.Iterator it = buf.iterator();
-      for (int i = 0; i < arr.length; ++i) {
-        if (random().nextBoolean()) {
-          assertTrue(it.hasNext());
-        }
-        assertEquals(arr[i], it.next());
-      }
-      assertFalse(it.hasNext());
-      for (int i = 0; i < arr.length; ++i) {
-        assertEquals(arr[i], buf.get(i));
-      }
-      final long expectedBytesUsed = RamUsageEstimator.sizeOf(buf);
-      final long computedBytesUsed = buf.ramBytesUsed();
-      assertEquals("got " + computedBytesUsed + ", expected: " + expectedBytesUsed,
-          expectedBytesUsed, computedBytesUsed);
+    final long[] arr = new long[RandomInts.randomIntBetween(random(), 1, 1000000)];
+    for (int bpv : new int[] {0, 1, 63, 64, RandomInts.randomIntBetween(random(), 2, 62)}) {
+      for (boolean monotonic : new boolean[] {true, false}) {
+        AbstractAppendingLongBuffer buf;
+        final int inc;
+        if (monotonic) {
+          buf = new MonotonicAppendingLongBuffer();
+          inc = _TestUtil.nextInt(random(), -1000, 1000);
+        } else {
+          buf = new AppendingLongBuffer();
+          inc = 0;
+        }
+        if (bpv == 0) {
+          arr[0] = random().nextLong();
+          for (int i = 1; i < arr.length; ++i) {
+            arr[i] = arr[i-1] + inc;
+          }
+        } else if (bpv == 64) {
+          for (int i = 0; i < arr.length; ++i) {
+            arr[i] = random().nextLong();
+          }
+        } else {
+          final long minValue = _TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE - PackedInts.maxValue(bpv));
+          for (int i = 0; i < arr.length; ++i) {
+            arr[i] = minValue + inc * i + random().nextLong() & PackedInts.maxValue(bpv); // _TestUtil.nextLong is too slow
+          }
+        }
+        for (int i = 0; i < arr.length; ++i) {
+          buf.add(arr[i]);
+        }
+        assertEquals(arr.length, buf.size());
+        final AbstractAppendingLongBuffer.Iterator it = buf.iterator();
+        for (int i = 0; i < arr.length; ++i) {
+          if (random().nextBoolean()) {
+            assertTrue(it.hasNext());
+          }
+          assertEquals(arr[i], it.next());
+        }
+        assertFalse(it.hasNext());
+        for (int i = 0; i < arr.length; ++i) {
+          assertEquals(arr[i], buf.get(i));
+        }
+        final long expectedBytesUsed = RamUsageEstimator.sizeOf(buf);
+        final long computedBytesUsed = buf.ramBytesUsed();
+        assertEquals("got " + computedBytesUsed + ", expected: " + expectedBytesUsed,
+            expectedBytesUsed, computedBytesUsed);
+      }
     }
   }