LUCENE-5064: Added PagedMutable.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1494233 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Adrien Grand 2013-06-18 18:19:07 +00:00
parent e1005a0645
commit 036381290d
6 changed files with 324 additions and 104 deletions

View File

@ -187,6 +187,9 @@ Optimizations
New Features
* LUCENE-5064: Added PagedMutable (internal), a paged extension of
PackedInts.Mutable which allows for storing more than 2B values. (Adrien Grand)
* LUCENE-4766: Added a PatternCaptureGroupTokenFilter that uses Java regexes to
emit multiple tokens, one for each capture group in one or more patterns.
(Simon Willnauer, Clinton Gormley)

View File

@ -0,0 +1,163 @@
package org.apache.lucene.util.packed;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.lucene.util.packed.PackedInts.checkBlockSize;
import static org.apache.lucene.util.packed.PackedInts.numBlocks;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;
/**
 * Base implementation for {@link PagedMutable} and {@link PagedGrowableWriter}.
 * Values are addressed with a <code>long</code> index and spread over an array
 * of {@link PackedInts.Mutable} pages. Pages allocated by {@link #fillPages()}
 * or {@link #resize(long)} all hold {@link #pageSize()} values, except the last
 * one which is only as large as needed.
 * @lucene.internal
 */
abstract class AbstractPagedMutable<T extends AbstractPagedMutable<T>> {

  static final int MIN_BLOCK_SIZE = 1 << 6;
  static final int MAX_BLOCK_SIZE = 1 << 30;

  final long size;
  final int pageShift;
  final int pageMask;
  final PackedInts.Mutable[] subMutables;
  final int bitsPerValue;

  /**
   * Records the dimensions of the structure and allocates the (still empty)
   * page array. The pages themselves are not created here; see
   * {@link #fillPages()}.
   *
   * @param bitsPerValue the number of bits per value handed to {@link #newMutable(int, int)}
   * @param size         the total number of values
   * @param pageSize     the number of values per page, validated by
   *                     <code>checkBlockSize</code> against {@link #MIN_BLOCK_SIZE}
   *                     and {@link #MAX_BLOCK_SIZE}
   */
  AbstractPagedMutable(int bitsPerValue, long size, int pageSize) {
    this.bitsPerValue = bitsPerValue;
    this.size = size;
    // the value returned by checkBlockSize is used as the shift that maps an index to its page
    this.pageShift = checkBlockSize(pageSize, MIN_BLOCK_SIZE, MAX_BLOCK_SIZE);
    this.pageMask = pageSize - 1;
    this.subMutables = new PackedInts.Mutable[numBlocks(size, pageSize)];
  }

  /** Allocate every page of this instance through {@link #newMutable(int, int)}. */
  protected final void fillPages() {
    final int pageCount = numBlocks(size, pageSize());
    final int lastPage = pageCount - 1;
    for (int page = 0; page < pageCount; ++page) {
      final int valueCount;
      if (page == lastPage) {
        // do not allocate for more entries than necessary on the last page
        valueCount = lastPageSize(size);
      } else {
        valueCount = pageSize();
      }
      subMutables[page] = newMutable(valueCount, bitsPerValue);
    }
  }

  /** Create a single page able to hold <code>valueCount</code> values. */
  protected abstract PackedInts.Mutable newMutable(int valueCount, int bitsPerValue);

  /**
   * The number of values on the last page of a structure holding
   * <code>size</code> values: a full page when <code>size</code> falls exactly
   * on a page boundary, the remainder otherwise.
   */
  final int lastPageSize(long size) {
    final int remainder = indexInPage(size);
    if (remainder == 0) {
      return pageSize();
    }
    return remainder;
  }

  /** The number of values per page. */
  final int pageSize() {
    return pageMask + 1;
  }

  /** The number of values. */
  public final long size() {
    return size;
  }

  /** The index of the page which contains <code>index</code>. */
  final int pageIndex(long index) {
    return (int) (index >>> pageShift);
  }

  /** The offset of <code>index</code> inside its page. */
  final int indexInPage(long index) {
    // pageMask is a non-negative int, so masking before or after the narrowing cast gives the same bits
    return (int) (index & pageMask);
  }

  /** Get value at <code>index</code>. */
  public final long get(long index) {
    assert index >= 0 && index < size;
    final PackedInts.Mutable page = subMutables[pageIndex(index)];
    return page.get(indexInPage(index));
  }

  /** Set value at <code>index</code>. */
  public final void set(long index, long value) {
    assert index >= 0 && index < size;
    final PackedInts.Mutable page = subMutables[pageIndex(index)];
    page.set(indexInPage(index), value);
  }

  /**
   * The unaligned shallow size of the fields declared by this class.
   * Sub-classes which declare more fields are expected to add their own
   * contribution on top of this value.
   */
  protected long baseRamBytesUsed() {
    final long header = RamUsageEstimator.NUM_BYTES_OBJECT_HEADER;
    final long pagesRef = RamUsageEstimator.NUM_BYTES_OBJECT_REF;  // subMutables
    final long sizeField = RamUsageEstimator.NUM_BYTES_LONG;       // size
    final long intFields = 3 * RamUsageEstimator.NUM_BYTES_INT;    // pageShift, pageMask, bitsPerValue
    return header + pagesRef + sizeField + intFields;
  }

  /** Return the number of bytes used by this object. */
  public long ramBytesUsed() {
    final long pageArrayBytes = RamUsageEstimator.NUM_BYTES_ARRAY_HEADER
        + (long) RamUsageEstimator.NUM_BYTES_OBJECT_REF * subMutables.length;
    long total = RamUsageEstimator.alignObjectSize(baseRamBytesUsed());
    total += RamUsageEstimator.alignObjectSize(pageArrayBytes);
    for (int page = 0; page < subMutables.length; ++page) {
      total += subMutables[page].ramBytesUsed();
    }
    return total;
  }

  /**
   * Create an instance of size <code>newSize</code> configured like this one
   * but whose pages have not been allocated yet.
   */
  protected abstract T newUnfilledCopy(long newSize);

  /** Create a new copy of size <code>newSize</code> based on the content of
   *  this buffer. This method is much more efficient than creating a new
   *  instance and copying values one by one. */
  public final T resize(long newSize) {
    final T resized = newUnfilledCopy(newSize);
    final PackedInts.Mutable[] targetPages = resized.subMutables;
    final int sharedPages = Math.min(targetPages.length, subMutables.length);
    final int lastTargetPage = targetPages.length - 1;
    final long[] buffer = new long[1024];
    for (int page = 0; page < targetPages.length; ++page) {
      final int valueCount;
      if (page == lastTargetPage) {
        valueCount = lastPageSize(newSize);
      } else {
        valueCount = pageSize();
      }
      if (page < sharedPages) {
        // the page exists on both sides: reuse its current width and bulk-copy its content
        final PackedInts.Mutable source = subMutables[page];
        final PackedInts.Mutable target = newMutable(valueCount, source.getBitsPerValue());
        targetPages[page] = target;
        final int copyLength = Math.min(valueCount, source.size());
        PackedInts.copy(source, 0, target, 0, copyLength, buffer);
      } else {
        // brand new page, nothing to copy
        targetPages[page] = newMutable(valueCount, this.bitsPerValue);
      }
    }
    return resized;
  }

  /** Similar to {@link ArrayUtil#grow(long[], int)}. */
  public final T grow(long minSize) {
    assert minSize >= 0;
    if (minSize > size()) {
      // over-allocate by 1/8th of the requested size, and by at least 3 values
      final long extra = Math.max(3L, minSize >>> 3);
      return resize(minSize + extra);
    }
    // already large enough
    @SuppressWarnings("unchecked")
    final T self = (T) this;
    return self;
  }

  /** Similar to {@link ArrayUtil#grow(long[])}. */
  public final T grow() {
    return grow(size() + 1);
  }

  @Override
  public final String toString() {
    final StringBuilder description = new StringBuilder(getClass().getSimpleName());
    description.append("(size=").append(size());
    description.append(",pageSize=").append(pageSize());
    description.append(")");
    return description.toString();
  }

}

View File

@ -1041,14 +1041,21 @@ public class PackedInts {
*/
public static Mutable getMutable(int valueCount,
int bitsPerValue, float acceptableOverheadRatio) {
assert valueCount >= 0;
final FormatAndBits formatAndBits = fastestFormatAndBits(valueCount, bitsPerValue, acceptableOverheadRatio);
switch (formatAndBits.format) {
return getMutable(valueCount, formatAndBits.bitsPerValue, formatAndBits.format);
}
/** Same as {@link #getMutable(int, int, float)} with a pre-computed number
* of bits per value and format.
* @lucene.internal */
public static Mutable getMutable(int valueCount,
int bitsPerValue, PackedInts.Format format) {
assert valueCount >= 0;
switch (format) {
case PACKED_SINGLE_BLOCK:
return Packed64SingleBlock.create(valueCount, formatAndBits.bitsPerValue);
return Packed64SingleBlock.create(valueCount, bitsPerValue);
case PACKED:
switch (formatAndBits.bitsPerValue) {
switch (bitsPerValue) {
case 8:
return new Direct8(valueCount);
case 16:
@ -1068,7 +1075,7 @@ public class PackedInts {
}
break;
}
return new Packed64(valueCount, formatAndBits.bitsPerValue);
return new Packed64(valueCount, bitsPerValue);
default:
throw new AssertionError();
}

View File

@ -17,10 +17,8 @@ package org.apache.lucene.util.packed;
* limitations under the License.
*/
import static org.apache.lucene.util.packed.PackedInts.checkBlockSize;
import static org.apache.lucene.util.packed.PackedInts.numBlocks;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.PackedInts.Mutable;
/**
* A {@link PagedGrowableWriter}. This class slices data into fixed-size blocks
@ -30,16 +28,8 @@ import org.apache.lucene.util.RamUsageEstimator;
* less memory-efficient.
* @lucene.internal
*/
public final class PagedGrowableWriter {
public final class PagedGrowableWriter extends AbstractPagedMutable<PagedGrowableWriter> {
static final int MIN_BLOCK_SIZE = 1 << 6;
static final int MAX_BLOCK_SIZE = 1 << 30;
final long size;
final int pageShift;
final int pageMask;
final GrowableWriter[] subWriters;
final int startBitsPerValue;
final float acceptableOverheadRatio;
/**
@ -56,98 +46,26 @@ public final class PagedGrowableWriter {
}
PagedGrowableWriter(long size, int pageSize,int startBitsPerValue, float acceptableOverheadRatio, boolean fillPages) {
this.size = size;
this.startBitsPerValue = startBitsPerValue;
super(startBitsPerValue, size, pageSize);
this.acceptableOverheadRatio = acceptableOverheadRatio;
pageShift = checkBlockSize(pageSize, MIN_BLOCK_SIZE, MAX_BLOCK_SIZE);
pageMask = pageSize - 1;
final int numPages = numBlocks(size, pageSize);
subWriters = new GrowableWriter[numPages];
if (fillPages) {
for (int i = 0; i < numPages; ++i) {
// do not allocate for more entries than necessary on the last page
final int valueCount = i == numPages - 1 ? lastPageSize(size) : pageSize;
subWriters[i] = new GrowableWriter(startBitsPerValue, valueCount, acceptableOverheadRatio);
fillPages();
}
}
}
private int lastPageSize(long size) {
final int sz = indexInPage(size);
return sz == 0 ? pageSize() : sz;
}
private int pageSize() {
return pageMask + 1;
}
/** The number of values. */
public long size() {
return size;
}
int pageIndex(long index) {
return (int) (index >>> pageShift);
}
int indexInPage(long index) {
return (int) index & pageMask;
}
/** Get value at <code>index</code>. */
public long get(long index) {
assert index >= 0 && index < size;
final int pageIndex = pageIndex(index);
final int indexInPage = indexInPage(index);
return subWriters[pageIndex].get(indexInPage);
}
/** Set value at <code>index</code>. */
public void set(long index, long value) {
assert index >= 0 && index < size;
final int pageIndex = pageIndex(index);
final int indexInPage = indexInPage(index);
subWriters[pageIndex].set(indexInPage, value);
}
/** Create a new {@link PagedGrowableWriter} of size <code>newSize</code>
* based on the content of this buffer. This method is much more efficient
* than creating a new {@link PagedGrowableWriter} and copying values one by
* one. */
public PagedGrowableWriter resize(long newSize) {
final PagedGrowableWriter newWriter = new PagedGrowableWriter(newSize, pageSize(), startBitsPerValue, acceptableOverheadRatio, false);
final int numCommonPages = Math.min(newWriter.subWriters.length, subWriters.length);
final long[] copyBuffer = new long[1024];
for (int i = 0; i < newWriter.subWriters.length; ++i) {
final int valueCount = i == newWriter.subWriters.length - 1 ? lastPageSize(newSize) : pageSize();
final int bpv = i < numCommonPages ? subWriters[i].getBitsPerValue() : startBitsPerValue;
newWriter.subWriters[i] = new GrowableWriter(bpv, valueCount, acceptableOverheadRatio);
if (i < numCommonPages) {
final int copyLength = Math.min(valueCount, subWriters[i].size());
PackedInts.copy(subWriters[i], 0, newWriter.subWriters[i].getMutable(), 0, copyLength, copyBuffer);
}
}
return newWriter;
}
/** Return the number of bytes used by this object. */
public long ramBytesUsed() {
long bytesUsed = RamUsageEstimator.alignObjectSize(
RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
+ RamUsageEstimator.NUM_BYTES_OBJECT_REF
+ RamUsageEstimator.NUM_BYTES_LONG
+ 3 * RamUsageEstimator.NUM_BYTES_INT
+ RamUsageEstimator.NUM_BYTES_FLOAT);
bytesUsed += RamUsageEstimator.alignObjectSize(RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + (long) RamUsageEstimator.NUM_BYTES_OBJECT_REF * subWriters.length);
for (GrowableWriter gw : subWriters) {
bytesUsed += gw.ramBytesUsed();
}
return bytesUsed;
}
@Override
public String toString() {
return getClass().getSimpleName() + "(size=" + size() + ",pageSize=" + pageSize() + ")";
protected Mutable newMutable(int valueCount, int bitsPerValue) {
return new GrowableWriter(bitsPerValue, valueCount, acceptableOverheadRatio);
}
@Override
protected PagedGrowableWriter newUnfilledCopy(long newSize) {
return new PagedGrowableWriter(newSize, pageSize(), bitsPerValue, acceptableOverheadRatio, false);
}
@Override
protected long baseRamBytesUsed() {
return super.baseRamBytesUsed() + RamUsageEstimator.NUM_BYTES_FLOAT;
}
}

View File

@ -0,0 +1,71 @@
package org.apache.lucene.util.packed;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.PackedInts.Mutable;
/**
 * A paged structure whose pages are all allocated with the same
 * {@link PackedInts.Format} and the same number of bits per value. Since it is
 * addressed with <code>long</code> indices, it can be a useful replacement
 * for {@link PackedInts.Mutable} to store more than 2B values.
 * @lucene.internal
 */
public final class PagedMutable extends AbstractPagedMutable<PagedMutable> {

  final PackedInts.Format format;

  /**
   * Create a new {@link PagedMutable} instance with all of its pages allocated.
   *
   * @param size the number of values to store.
   * @param pageSize the number of values per page
   * @param bitsPerValue the number of bits per value
   * @param acceptableOverheadRatio an acceptable overhead ratio
   */
  public PagedMutable(long size, int pageSize, int bitsPerValue, float acceptableOverheadRatio) {
    // the format is selected for a full page of pageSize values
    this(size, pageSize, PackedInts.fastestFormatAndBits(pageSize, bitsPerValue, acceptableOverheadRatio));
    fillPages();
  }

  /** Unpacks <code>formatAndBits</code>; does not allocate the pages. */
  PagedMutable(long size, int pageSize, PackedInts.FormatAndBits formatAndBits) {
    this(size, pageSize, formatAndBits.bitsPerValue, formatAndBits.format);
  }

  /** Creates an instance with a fixed format whose pages are not allocated. */
  PagedMutable(long size, int pageSize, int bitsPerValue, PackedInts.Format format) {
    super(bitsPerValue, size, pageSize);
    this.format = format;
  }

  @Override
  protected Mutable newMutable(int valueCount, int bitsPerValue) {
    // the requested width is only checked; the page is always created with
    // the width and format of this instance
    assert bitsPerValue <= this.bitsPerValue;
    final int pageBitsPerValue = this.bitsPerValue;
    return PackedInts.getMutable(valueCount, pageBitsPerValue, format);
  }

  @Override
  protected PagedMutable newUnfilledCopy(long newSize) {
    final int samePageSize = pageSize();
    return new PagedMutable(newSize, samePageSize, bitsPerValue, format);
  }

  @Override
  protected long baseRamBytesUsed() {
    // one more reference on top of the parent's fields: format
    final long formatRef = RamUsageEstimator.NUM_BYTES_OBJECT_REF;
    return super.baseRamBytesUsed() + formatRef;
  }

}

View File

@ -697,6 +697,64 @@ public class TestPackedInts extends LuceneTestCase {
assertEquals(0, copy.get(i));
}
}
// test grow
PagedGrowableWriter grow = writer.grow(_TestUtil.nextLong(random(), writer.size() / 2, writer.size() * 3 / 2));
for (long i = 0; i < grow.size(); ++i) {
if (i < writer.size()) {
assertEquals(writer.get(i), grow.get(i));
} else {
assertEquals(0, grow.get(i));
}
}
}
/**
 * Randomized test of {@link PagedMutable}: checks the reported size, that
 * values written with <code>set</code> are read back by <code>get</code>,
 * the value of <code>ramBytesUsed</code>, and that <code>resize</code> and
 * <code>grow</code> preserve existing values and leave new entries at 0.
 */
public void testPagedMutable() {
final int bitsPerValue = _TestUtil.nextInt(random(), 1, 64);
final long max = PackedInts.maxValue(bitsPerValue);
// page size is a random power of two taken from exponents 6 to 30
int pageSize = 1 << (_TestUtil.nextInt(random(), 6, 30));
// supports 0 values?
PagedMutable writer = new PagedMutable(0, pageSize, bitsPerValue, random().nextFloat() / 2);
assertEquals(0, writer.size());
// compare against AppendingLongBuffer
AppendingLongBuffer buf = new AppendingLongBuffer();
int size = random().nextInt(1000000);
for (int i = 0; i < size; ++i) {
// with 64 bits per value any long is acceptable, otherwise values are drawn between 0 and max
buf.add(bitsPerValue == 64 ? random().nextLong() : _TestUtil.nextLong(random(), 0, max));
}
writer = new PagedMutable(size, pageSize, bitsPerValue, random().nextFloat());
assertEquals(size, writer.size());
// fill from the last index down to the first one
for (int i = size - 1; i >= 0; --i) {
writer.set(i, buf.get(i));
}
for (int i = 0; i < size; ++i) {
assertEquals(buf.get(i), writer.get(i));
}
// test ramBytesUsed
// the size of the object referenced by writer.format is subtracted from the expected value
// NOTE(review): presumably because the format is a shared constant that ramBytesUsed does not count -- confirm
assertEquals(RamUsageEstimator.sizeOf(writer) - RamUsageEstimator.sizeOf(writer.format), writer.ramBytesUsed());
// test copy
// the new size is drawn between half and one and a half times the current size, so the copy may shrink or expand
PagedMutable copy = writer.resize(_TestUtil.nextLong(random(), writer.size() / 2, writer.size() * 3 / 2));
for (long i = 0; i < copy.size(); ++i) {
if (i < writer.size()) {
assertEquals(writer.get(i), copy.get(i));
} else {
// entries past the original size must read as 0
assertEquals(0, copy.get(i));
}
}
// test grow
// the requested minimum size is drawn from the same range as above, so it may be smaller than the current size
PagedMutable grow = writer.grow(_TestUtil.nextLong(random(), writer.size() / 2, writer.size() * 3 / 2));
for (long i = 0; i < grow.size(); ++i) {
if (i < writer.size()) {
assertEquals(writer.get(i), grow.get(i));
} else {
// entries past the original size must read as 0
assertEquals(0, grow.get(i));
}
}
}
// memory hole