LUCENE-5064: Added PagedMutable.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1494233 13f79535-47bb-0310-9956-ffa450edef68
2013-06-18 18:19:07 +00:00 · 2013-06-18 18:19:07 +00:00 · 036381290d
parent e1005a0645
commit 036381290d
6 changed files with 324 additions and 104 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -187,6 +187,9 @@ Optimizations

 New Features

+* LUCENE-5064: Added PagedMutable (internal), a paged extension of
+  PackedInts.Mutable which allows for storing more than 2B values. (Adrien Grand)
+
 * LUCENE-4766: Added a PatternCaptureGroupTokenFilter that uses Java regexes to 
  emit multiple tokens one for each capture group in one or more patterns.
  (Simon Willnauer, Clinton Gormley)
--- a/lucene/core/src/java/org/apache/lucene/util/packed/AbstractPagedMutable.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/AbstractPagedMutable.java
@ -0,0 +1,163 @@
+package org.apache.lucene.util.packed;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static org.apache.lucene.util.packed.PackedInts.checkBlockSize;
+import static org.apache.lucene.util.packed.PackedInts.numBlocks;
+
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.RamUsageEstimator;
+
+/**
+ * Base implementation for {@link PagedMutable} and {@link PagedGrowableWriter}.
+ * @lucene.internal
+ */
+abstract class AbstractPagedMutable<T extends AbstractPagedMutable<T>> {
+
+  static final int MIN_BLOCK_SIZE = 1 << 6;
+  static final int MAX_BLOCK_SIZE = 1 << 30;
+
+  final long size;
+  final int pageShift;
+  final int pageMask;
+  final PackedInts.Mutable[] subMutables;
+  final int bitsPerValue;
+
+  AbstractPagedMutable(int bitsPerValue, long size, int pageSize) {
+    this.bitsPerValue = bitsPerValue;
+    this.size = size;
+    pageShift = checkBlockSize(pageSize, MIN_BLOCK_SIZE, MAX_BLOCK_SIZE);
+    pageMask = pageSize - 1;
+    final int numPages = numBlocks(size, pageSize);
+    subMutables = new PackedInts.Mutable[numPages];
+  }
+
+  protected final void fillPages() {
+    final int numPages = numBlocks(size, pageSize());
+    for (int i = 0; i < numPages; ++i) {
+      // do not allocate for more entries than necessary on the last page
+      final int valueCount = i == numPages - 1 ? lastPageSize(size) : pageSize();
+      subMutables[i] = newMutable(valueCount, bitsPerValue);
+    }
+  }
+
+  protected abstract PackedInts.Mutable newMutable(int valueCount, int bitsPerValue);
+
+  final int lastPageSize(long size) {
+    final int sz = indexInPage(size);
+    return sz == 0 ? pageSize() : sz;
+  }
+
+  final int pageSize() {
+    return pageMask + 1;
+  }
+
+  /** The number of values. */
+  public final long size() {
+    return size;
+  }
+
+  final int pageIndex(long index) {
+    return (int) (index >>> pageShift);
+  }
+
+  final int indexInPage(long index) {
+    return (int) index & pageMask;
+  }
+
+  /** Get value at <code>index</code>. */
+  public final long get(long index) {
+    assert index >= 0 && index < size;
+    final int pageIndex = pageIndex(index);
+    final int indexInPage = indexInPage(index);
+    return subMutables[pageIndex].get(indexInPage);
+  }
+
+  /** Set value at <code>index</code>. */
+  public final void set(long index, long value) {
+    assert index >= 0 && index < size;
+    final int pageIndex = pageIndex(index);
+    final int indexInPage = indexInPage(index);
+    subMutables[pageIndex].set(indexInPage, value);
+  }
+
+  protected long baseRamBytesUsed() {
+    return RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
+        + RamUsageEstimator.NUM_BYTES_OBJECT_REF
+        + RamUsageEstimator.NUM_BYTES_LONG
+        + 3 * RamUsageEstimator.NUM_BYTES_INT;
+  }
+
+  /** Return the number of bytes used by this object. */
+  public long ramBytesUsed() {
+    long bytesUsed = RamUsageEstimator.alignObjectSize(baseRamBytesUsed());
+    bytesUsed += RamUsageEstimator.alignObjectSize(RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + (long) RamUsageEstimator.NUM_BYTES_OBJECT_REF * subMutables.length);
+    for (PackedInts.Mutable gw : subMutables) {
+      bytesUsed += gw.ramBytesUsed();
+    }
+    return bytesUsed;
+  }
+
+  protected abstract T newUnfilledCopy(long newSize);
+
+  /** Create a new copy of size <code>newSize</code> based on the content of
+   *  this buffer. This method is much more efficient than creating a new
+   *  instance and copying values one by one. */
+  public final T resize(long newSize) {
+    final T copy = newUnfilledCopy(newSize);
+    final int numCommonPages = Math.min(copy.subMutables.length, subMutables.length);
+    final long[] copyBuffer = new long[1024];
+    for (int i = 0; i < copy.subMutables.length; ++i) {
+      final int valueCount = i == copy.subMutables.length - 1 ? lastPageSize(newSize) : pageSize();
+      final int bpv = i < numCommonPages ? subMutables[i].getBitsPerValue() : this.bitsPerValue;
+      copy.subMutables[i] = newMutable(valueCount, bpv);
+      if (i < numCommonPages) {
+        final int copyLength = Math.min(valueCount, subMutables[i].size());
+        PackedInts.copy(subMutables[i], 0, copy.subMutables[i], 0, copyLength, copyBuffer);
+      }
+    }
+    return copy;
+  }
+
+  /** Similar to {@link ArrayUtil#grow(long[], int)}. */
+  public final T grow(long minSize) {
+    assert minSize >= 0;
+    if (minSize <= size()) {
+      @SuppressWarnings("unchecked")
+      final T result = (T) this;
+      return result;
+    }
+    long extra = minSize >>> 3;
+    if (extra < 3) {
+      extra = 3;
+    }
+    final long newSize = minSize + extra;
+    return resize(newSize);
+  }
+
+  /** Similar to {@link ArrayUtil#grow(long[])}. */
+  public final T grow() {
+    return grow(size() + 1);
+  }
+
+  @Override
+  public final String toString() {
+    return getClass().getSimpleName() + "(size=" + size() + ",pageSize=" + pageSize() + ")";
+  }
+
+}
--- a/lucene/core/src/java/org/apache/lucene/util/packed/PackedInts.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/PackedInts.java
@ -1041,14 +1041,21 @@ public class PackedInts {
   */
  public static Mutable getMutable(int valueCount,
      int bitsPerValue, float acceptableOverheadRatio) {
-    assert valueCount >= 0;
-
    final FormatAndBits formatAndBits = fastestFormatAndBits(valueCount, bitsPerValue, acceptableOverheadRatio);
-    switch (formatAndBits.format) {
+    return getMutable(valueCount, formatAndBits.bitsPerValue, formatAndBits.format);
+  }
+
+  /** Same as {@link #getMutable(int, int, float)} with a pre-computed number
+   *  of bits per value and format.
+   *  @lucene.internal */
+  public static Mutable getMutable(int valueCount,
+      int bitsPerValue, PackedInts.Format format) {
+    assert valueCount >= 0;
+    switch (format) {
      case PACKED_SINGLE_BLOCK:
-        return Packed64SingleBlock.create(valueCount, formatAndBits.bitsPerValue);
+        return Packed64SingleBlock.create(valueCount, bitsPerValue);
      case PACKED:
-        switch (formatAndBits.bitsPerValue) {
+        switch (bitsPerValue) {
          case 8:
            return new Direct8(valueCount);
          case 16:
@ -1068,7 +1075,7 @@ public class PackedInts {
            }
            break;
        }
-        return new Packed64(valueCount, formatAndBits.bitsPerValue);
+        return new Packed64(valueCount, bitsPerValue);
      default:
        throw new AssertionError();
    }
--- a/lucene/core/src/java/org/apache/lucene/util/packed/PagedGrowableWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/PagedGrowableWriter.java
@ -17,10 +17,8 @@ package org.apache.lucene.util.packed;
 * limitations under the License.
 */

-import static org.apache.lucene.util.packed.PackedInts.checkBlockSize;
-import static org.apache.lucene.util.packed.PackedInts.numBlocks;
-
 import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.packed.PackedInts.Mutable;

 /**
 * A {@link PagedGrowableWriter}. This class slices data into fixed-size blocks
@ -30,16 +28,8 @@ import org.apache.lucene.util.RamUsageEstimator;
 * less memory-efficient.
 * @lucene.internal
 */
-public final class PagedGrowableWriter {
+public final class PagedGrowableWriter extends AbstractPagedMutable<PagedGrowableWriter> {

-  static final int MIN_BLOCK_SIZE = 1 << 6;
-  static final int MAX_BLOCK_SIZE = 1 << 30;
-
-  final long size;
-  final int pageShift;
-  final int pageMask;
-  final GrowableWriter[] subWriters;
-  final int startBitsPerValue;
  final float acceptableOverheadRatio;

  /**
@ -56,98 +46,26 @@ public final class PagedGrowableWriter {
  }

  PagedGrowableWriter(long size, int pageSize,int startBitsPerValue, float acceptableOverheadRatio, boolean fillPages) {
-    this.size = size;
-    this.startBitsPerValue = startBitsPerValue;
+    super(startBitsPerValue, size, pageSize);
    this.acceptableOverheadRatio = acceptableOverheadRatio;
-    pageShift = checkBlockSize(pageSize, MIN_BLOCK_SIZE, MAX_BLOCK_SIZE);
-    pageMask = pageSize - 1;
-    final int numPages = numBlocks(size, pageSize);
-    subWriters = new GrowableWriter[numPages];
    if (fillPages) {
-      for (int i = 0; i < numPages; ++i) {
-        // do not allocate for more entries than necessary on the last page
-        final int valueCount = i == numPages - 1 ? lastPageSize(size) : pageSize;
-        subWriters[i] = new GrowableWriter(startBitsPerValue, valueCount, acceptableOverheadRatio);
-      }
+      fillPages();
    }
  }

-  private int lastPageSize(long size) {
-    final int sz = indexInPage(size);
-    return sz == 0 ? pageSize() : sz;
-  }
-
-  private int pageSize() {
-    return pageMask + 1;
-  }
-
-  /** The number of values. */
-  public long size() {
-    return size;
-  }
-
-  int pageIndex(long index) {
-    return (int) (index >>> pageShift);
-  }
-
-  int indexInPage(long index) {
-    return (int) index & pageMask;
-  }
-
-  /** Get value at <code>index</code>. */
-  public long get(long index) {
-    assert index >= 0 && index < size;
-    final int pageIndex = pageIndex(index);
-    final int indexInPage = indexInPage(index);
-    return subWriters[pageIndex].get(indexInPage);
-  }
-
-  /** Set value at <code>index</code>. */
-  public void set(long index, long value) {
-    assert index >= 0 && index < size;
-    final int pageIndex = pageIndex(index);
-    final int indexInPage = indexInPage(index);
-    subWriters[pageIndex].set(indexInPage, value);
-  }
-
-  /** Create a new {@link PagedGrowableWriter} of size <code>newSize</code>
-   *  based on the content of this buffer. This method is much more efficient
-   *  than creating a new {@link PagedGrowableWriter} and copying values one by
-   *  one. */
-  public PagedGrowableWriter resize(long newSize) {
-    final PagedGrowableWriter newWriter = new PagedGrowableWriter(newSize, pageSize(), startBitsPerValue, acceptableOverheadRatio, false);
-    final int numCommonPages = Math.min(newWriter.subWriters.length, subWriters.length);
-    final long[] copyBuffer = new long[1024];
-    for (int i = 0; i < newWriter.subWriters.length; ++i) {
-      final int valueCount = i == newWriter.subWriters.length - 1 ? lastPageSize(newSize) : pageSize();
-      final int bpv = i < numCommonPages ? subWriters[i].getBitsPerValue() : startBitsPerValue;
-      newWriter.subWriters[i] = new GrowableWriter(bpv, valueCount, acceptableOverheadRatio);
-      if (i < numCommonPages) {
-        final int copyLength = Math.min(valueCount, subWriters[i].size());
-        PackedInts.copy(subWriters[i], 0, newWriter.subWriters[i].getMutable(), 0, copyLength, copyBuffer);
-      }
-    }
-    return newWriter;
-  }
-
-  /** Return the number of bytes used by this object. */
-  public long ramBytesUsed() {
-    long bytesUsed = RamUsageEstimator.alignObjectSize(
-        RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
-        + RamUsageEstimator.NUM_BYTES_OBJECT_REF
-        + RamUsageEstimator.NUM_BYTES_LONG
-        + 3 * RamUsageEstimator.NUM_BYTES_INT
-        + RamUsageEstimator.NUM_BYTES_FLOAT);
-    bytesUsed += RamUsageEstimator.alignObjectSize(RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + (long) RamUsageEstimator.NUM_BYTES_OBJECT_REF * subWriters.length);
-    for (GrowableWriter gw : subWriters) {
-      bytesUsed += gw.ramBytesUsed();
-    }
-    return bytesUsed;
-  }
-
  @Override
-  public String toString() {
-    return getClass().getSimpleName() + "(size=" + size() + ",pageSize=" + pageSize() + ")";
+  protected Mutable newMutable(int valueCount, int bitsPerValue) {
+    return new GrowableWriter(bitsPerValue, valueCount, acceptableOverheadRatio);
+  }
+
+  @Override
+  protected PagedGrowableWriter newUnfilledCopy(long newSize) {
+    return new PagedGrowableWriter(newSize, pageSize(), bitsPerValue, acceptableOverheadRatio, false);
+  }
+
+  @Override
+  protected long baseRamBytesUsed() {
+    return super.baseRamBytesUsed() + RamUsageEstimator.NUM_BYTES_FLOAT;
  }

 }
--- a/lucene/core/src/java/org/apache/lucene/util/packed/PagedMutable.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/PagedMutable.java
@ -0,0 +1,71 @@
+package org.apache.lucene.util.packed;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.packed.PackedInts.Mutable;
+
+/**
+ * A {@link PagedMutable}. This class slices data into fixed-size blocks
+ * which have the same number of bits per value. It can be a useful replacement
+ * for {@link PackedInts.Mutable} to store more than 2B values.
+ * @lucene.internal
+ */
+public final class PagedMutable extends AbstractPagedMutable<PagedMutable> {
+
+  final PackedInts.Format format;
+
+  /**
+   * Create a new {@link PagedMutable} instance.
+   *
+   * @param size the number of values to store.
+   * @param pageSize the number of values per page
+   * @param bitsPerValue the number of bits per value
+   * @param acceptableOverheadRatio an acceptable overhead ratio
+   */
+  public PagedMutable(long size, int pageSize, int bitsPerValue, float acceptableOverheadRatio) {
+    this(size, pageSize, PackedInts.fastestFormatAndBits(pageSize, bitsPerValue, acceptableOverheadRatio));
+    fillPages();
+  }
+
+  PagedMutable(long size, int pageSize, PackedInts.FormatAndBits formatAndBits) {
+    this(size, pageSize, formatAndBits.bitsPerValue, formatAndBits.format);
+  }
+
+  PagedMutable(long size, int pageSize, int bitsPerValue, PackedInts.Format format) {
+    super(bitsPerValue, size, pageSize);
+    this.format = format;
+  }
+
+  @Override
+  protected Mutable newMutable(int valueCount, int bitsPerValue) {
+    assert this.bitsPerValue >= bitsPerValue;
+    return PackedInts.getMutable(valueCount, this.bitsPerValue, format);
+  }
+
+  @Override
+  protected PagedMutable newUnfilledCopy(long newSize) {
+    return new PagedMutable(newSize, pageSize(), bitsPerValue, format);
+  }
+
+  @Override
+  protected long baseRamBytesUsed() {
+    return super.baseRamBytesUsed() + RamUsageEstimator.NUM_BYTES_OBJECT_REF;
+  }
+
+}
--- a/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java
+++ b/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java
@ -697,6 +697,64 @@ public class TestPackedInts extends LuceneTestCase {
        assertEquals(0, copy.get(i));
      }
    }
+
+    // test grow
+    PagedGrowableWriter grow = writer.grow(_TestUtil.nextLong(random(), writer.size() / 2, writer.size() * 3 / 2));
+    for (long i = 0; i < grow.size(); ++i) {
+      if (i < writer.size()) {
+        assertEquals(writer.get(i), grow.get(i));
+      } else {
+        assertEquals(0, grow.get(i));
+      }
+    }
+  }
+
+  public void testPagedMutable() {
+    final int bitsPerValue = _TestUtil.nextInt(random(), 1, 64);
+    final long max = PackedInts.maxValue(bitsPerValue);
+    int pageSize = 1 << (_TestUtil.nextInt(random(), 6, 30));
+    // supports 0 values?
+    PagedMutable writer = new PagedMutable(0, pageSize, bitsPerValue, random().nextFloat() / 2);
+    assertEquals(0, writer.size());
+
+    // compare against AppendingLongBuffer
+    AppendingLongBuffer buf = new AppendingLongBuffer();
+    int size = random().nextInt(1000000);
+    
+    for (int i = 0; i < size; ++i) {
+      buf.add(bitsPerValue == 64 ? random().nextLong() : _TestUtil.nextLong(random(), 0, max));
+    }
+    writer = new PagedMutable(size, pageSize, bitsPerValue, random().nextFloat());
+    assertEquals(size, writer.size());
+    for (int i = size - 1; i >= 0; --i) {
+      writer.set(i, buf.get(i));
+    }
+    for (int i = 0; i < size; ++i) {
+      assertEquals(buf.get(i), writer.get(i));
+    }
+
+    // test ramBytesUsed
+    assertEquals(RamUsageEstimator.sizeOf(writer) - RamUsageEstimator.sizeOf(writer.format), writer.ramBytesUsed());
+
+    // test copy
+    PagedMutable copy = writer.resize(_TestUtil.nextLong(random(), writer.size() / 2, writer.size() * 3 / 2));
+    for (long i = 0; i < copy.size(); ++i) {
+      if (i < writer.size()) {
+        assertEquals(writer.get(i), copy.get(i));
+      } else {
+        assertEquals(0, copy.get(i));
+      }
+    }
+
+    // test grow
+    PagedMutable grow = writer.grow(_TestUtil.nextLong(random(), writer.size() / 2, writer.size() * 3 / 2));
+    for (long i = 0; i < grow.size(); ++i) {
+      if (i < writer.size()) {
+        assertEquals(writer.get(i), grow.get(i));
+      } else {
+        assertEquals(0, grow.get(i));
+      }
+    }
  }

  // memory hole