mirror of https://github.com/apache/lucene.git
LUCENE-5064: Added PagedMutable.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1494233 13f79535-47bb-0310-9956-ffa450edef68
parent e1005a0645
commit 036381290d
@@ -187,6 +187,9 @@ Optimizations

New Features

+* LUCENE-5064: Added PagedMutable (internal), a paged extension of
+  PackedInts.Mutable which allows for storing more than 2B values. (Adrien Grand)
+
* LUCENE-4766: Added a PatternCaptureGroupTokenFilter that uses Java regexes to
  emit multiple tokens one for each capture group in one or more patterns.
  (Simon Willnauer, Clinton Gormley)
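As a rough illustration of the CHANGES entry above: PackedInts.Mutable is indexed by int, so it tops out around 2^31 values, while PagedMutable takes a long size and long indexes. The sketch below is not part of this commit; it only assumes a Lucene core jar containing the new class, and the demo class name is invented. A small size keeps it cheap to run, but the same code works for sizes beyond Integer.MAX_VALUE.

import org.apache.lucene.util.packed.PagedMutable;

public class PagedMutableDemo {
  public static void main(String[] args) {
    long size = 1L << 20;        // a long: may exceed Integer.MAX_VALUE, unlike PackedInts.Mutable's int valueCount
    int pageSize = 1 << 16;      // values per page, a power of two in [2^6, 2^30]
    int bitsPerValue = 7;        // every stored value must fit in 7 bits
    float acceptableOverheadRatio = 0.2f;

    PagedMutable values = new PagedMutable(size, pageSize, bitsPerValue, acceptableOverheadRatio);
    values.set(123456L, 42L);
    System.out.println(values.get(123456L)); // 42
    System.out.println(values.size());       // 1048576
  }
}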
@@ -0,0 +1,163 @@
package org.apache.lucene.util.packed;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static org.apache.lucene.util.packed.PackedInts.checkBlockSize;
import static org.apache.lucene.util.packed.PackedInts.numBlocks;

import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;

/**
 * Base implementation for {@link PagedMutable} and {@link PagedGrowableWriter}.
 * @lucene.internal
 */
abstract class AbstractPagedMutable<T extends AbstractPagedMutable<T>> {

  static final int MIN_BLOCK_SIZE = 1 << 6;
  static final int MAX_BLOCK_SIZE = 1 << 30;

  final long size;
  final int pageShift;
  final int pageMask;
  final PackedInts.Mutable[] subMutables;
  final int bitsPerValue;

  AbstractPagedMutable(int bitsPerValue, long size, int pageSize) {
    this.bitsPerValue = bitsPerValue;
    this.size = size;
    pageShift = checkBlockSize(pageSize, MIN_BLOCK_SIZE, MAX_BLOCK_SIZE);
    pageMask = pageSize - 1;
    final int numPages = numBlocks(size, pageSize);
    subMutables = new PackedInts.Mutable[numPages];
  }

  protected final void fillPages() {
    final int numPages = numBlocks(size, pageSize());
    for (int i = 0; i < numPages; ++i) {
      // do not allocate for more entries than necessary on the last page
      final int valueCount = i == numPages - 1 ? lastPageSize(size) : pageSize();
      subMutables[i] = newMutable(valueCount, bitsPerValue);
    }
  }

  protected abstract PackedInts.Mutable newMutable(int valueCount, int bitsPerValue);

  final int lastPageSize(long size) {
    final int sz = indexInPage(size);
    return sz == 0 ? pageSize() : sz;
  }

  final int pageSize() {
    return pageMask + 1;
  }

  /** The number of values. */
  public final long size() {
    return size;
  }

  final int pageIndex(long index) {
    return (int) (index >>> pageShift);
  }

  final int indexInPage(long index) {
    return (int) index & pageMask;
  }

  /** Get value at <code>index</code>. */
  public final long get(long index) {
    assert index >= 0 && index < size;
    final int pageIndex = pageIndex(index);
    final int indexInPage = indexInPage(index);
    return subMutables[pageIndex].get(indexInPage);
  }

  /** Set value at <code>index</code>. */
  public final void set(long index, long value) {
    assert index >= 0 && index < size;
    final int pageIndex = pageIndex(index);
    final int indexInPage = indexInPage(index);
    subMutables[pageIndex].set(indexInPage, value);
  }

  protected long baseRamBytesUsed() {
    return RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
        + RamUsageEstimator.NUM_BYTES_OBJECT_REF
        + RamUsageEstimator.NUM_BYTES_LONG
        + 3 * RamUsageEstimator.NUM_BYTES_INT;
  }

  /** Return the number of bytes used by this object. */
  public long ramBytesUsed() {
    long bytesUsed = RamUsageEstimator.alignObjectSize(baseRamBytesUsed());
    bytesUsed += RamUsageEstimator.alignObjectSize(RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + (long) RamUsageEstimator.NUM_BYTES_OBJECT_REF * subMutables.length);
    for (PackedInts.Mutable gw : subMutables) {
      bytesUsed += gw.ramBytesUsed();
    }
    return bytesUsed;
  }

  protected abstract T newUnfilledCopy(long newSize);

  /** Create a new copy of size <code>newSize</code> based on the content of
   *  this buffer. This method is much more efficient than creating a new
   *  instance and copying values one by one. */
  public final T resize(long newSize) {
    final T copy = newUnfilledCopy(newSize);
    final int numCommonPages = Math.min(copy.subMutables.length, subMutables.length);
    final long[] copyBuffer = new long[1024];
    for (int i = 0; i < copy.subMutables.length; ++i) {
      final int valueCount = i == copy.subMutables.length - 1 ? lastPageSize(newSize) : pageSize();
      final int bpv = i < numCommonPages ? subMutables[i].getBitsPerValue() : this.bitsPerValue;
      copy.subMutables[i] = newMutable(valueCount, bpv);
      if (i < numCommonPages) {
        final int copyLength = Math.min(valueCount, subMutables[i].size());
        PackedInts.copy(subMutables[i], 0, copy.subMutables[i], 0, copyLength, copyBuffer);
      }
    }
    return copy;
  }

  /** Similar to {@link ArrayUtil#grow(long[], int)}. */
  public final T grow(long minSize) {
    assert minSize >= 0;
    if (minSize <= size()) {
      @SuppressWarnings("unchecked")
      final T result = (T) this;
      return result;
    }
    long extra = minSize >>> 3;
    if (extra < 3) {
      extra = 3;
    }
    final long newSize = minSize + extra;
    return resize(newSize);
  }

  /** Similar to {@link ArrayUtil#grow(long[])}. */
  public final T grow() {
    return grow(size() + 1);
  }

  @Override
  public final String toString() {
    return getClass().getSimpleName() + "(size=" + size() + ",pageSize=" + pageSize() + ")";
  }

}
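The page arithmetic above relies on pageSize being a power of two: pageIndex is the high bits of the long index (index >>> pageShift) and indexInPage is the low bits (index & pageMask), so no division is needed even for indexes past Integer.MAX_VALUE. A self-contained sketch of that decomposition (plain JDK, hypothetical class name):

public class PageMathDemo {
  public static void main(String[] args) {
    int pageSize = 1 << 10;                              // power of two, as checkBlockSize enforces
    int pageShift = Integer.numberOfTrailingZeros(pageSize);
    int pageMask = pageSize - 1;

    long index = 3_000_000_000L;                         // larger than Integer.MAX_VALUE
    int pageIndex = (int) (index >>> pageShift);         // which page
    int indexInPage = (int) index & pageMask;            // offset inside that page

    System.out.println(pageIndex);                       // 2929687
    System.out.println(indexInPage);                     // 512
    System.out.println((long) pageIndex * pageSize + indexInPage == index); // true
  }
}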
@@ -1041,14 +1041,21 @@ public class PackedInts {
   */
  public static Mutable getMutable(int valueCount,
      int bitsPerValue, float acceptableOverheadRatio) {
    assert valueCount >= 0;

    final FormatAndBits formatAndBits = fastestFormatAndBits(valueCount, bitsPerValue, acceptableOverheadRatio);
-    switch (formatAndBits.format) {
+    return getMutable(valueCount, formatAndBits.bitsPerValue, formatAndBits.format);
+  }
+
+  /** Same as {@link #getMutable(int, int, float)} with a pre-computed number
+   *  of bits per value and format.
+   *  @lucene.internal */
+  public static Mutable getMutable(int valueCount,
+      int bitsPerValue, PackedInts.Format format) {
+    assert valueCount >= 0;
+    switch (format) {
      case PACKED_SINGLE_BLOCK:
-        return Packed64SingleBlock.create(valueCount, formatAndBits.bitsPerValue);
+        return Packed64SingleBlock.create(valueCount, bitsPerValue);
      case PACKED:
-        switch (formatAndBits.bitsPerValue) {
+        switch (bitsPerValue) {
          case 8:
            return new Direct8(valueCount);
          case 16:

@@ -1068,7 +1075,7 @@ public class PackedInts {
          }
          break;
        }
-        return new Packed64(valueCount, formatAndBits.bitsPerValue);
+        return new Packed64(valueCount, bitsPerValue);
      default:
        throw new AssertionError();
    }
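The refactoring above splits getMutable in two: the float-ratio overload still calls fastestFormatAndBits, but the actual construction moves into a new overload keyed on a pre-computed Format, which PagedMutable uses to build every page with the same format. A hedged sketch of how the two overloads relate (demo class name is invented; the Format-based overload is marked @lucene.internal):

import org.apache.lucene.util.packed.PackedInts;

public class GetMutableDemo {
  public static void main(String[] args) {
    int valueCount = 1000;
    int bitsPerValue = 5;
    float acceptableOverheadRatio = PackedInts.FAST;

    // what the float overload computes internally ...
    PackedInts.FormatAndBits formatAndBits =
        PackedInts.fastestFormatAndBits(valueCount, bitsPerValue, acceptableOverheadRatio);

    // ... and then hands to the new Format-based overload
    PackedInts.Mutable viaFormat =
        PackedInts.getMutable(valueCount, formatAndBits.bitsPerValue, formatAndBits.format);

    PackedInts.Mutable viaRatio =
        PackedInts.getMutable(valueCount, bitsPerValue, acceptableOverheadRatio);

    // both paths end up with the same storage parameters
    System.out.println(viaFormat.getBitsPerValue() == viaRatio.getBitsPerValue()); // true
  }
}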
@@ -17,10 +17,8 @@ package org.apache.lucene.util.packed;
 * limitations under the License.
 */

-import static org.apache.lucene.util.packed.PackedInts.checkBlockSize;
-import static org.apache.lucene.util.packed.PackedInts.numBlocks;
-
import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.packed.PackedInts.Mutable;

/**
 * A {@link PagedGrowableWriter}. This class slices data into fixed-size blocks

@@ -30,16 +28,8 @@ import org.apache.lucene.util.RamUsageEstimator;
 * less memory-efficient.
 * @lucene.internal
 */
-public final class PagedGrowableWriter {
+public final class PagedGrowableWriter extends AbstractPagedMutable<PagedGrowableWriter> {

-  static final int MIN_BLOCK_SIZE = 1 << 6;
-  static final int MAX_BLOCK_SIZE = 1 << 30;
-
-  final long size;
-  final int pageShift;
-  final int pageMask;
-  final GrowableWriter[] subWriters;
-  final int startBitsPerValue;
  final float acceptableOverheadRatio;

  /**

@@ -56,98 +46,26 @@ public final class PagedGrowableWriter {
  }

  PagedGrowableWriter(long size, int pageSize,int startBitsPerValue, float acceptableOverheadRatio, boolean fillPages) {
-    this.size = size;
-    this.startBitsPerValue = startBitsPerValue;
+    super(startBitsPerValue, size, pageSize);
    this.acceptableOverheadRatio = acceptableOverheadRatio;
-    pageShift = checkBlockSize(pageSize, MIN_BLOCK_SIZE, MAX_BLOCK_SIZE);
-    pageMask = pageSize - 1;
-    final int numPages = numBlocks(size, pageSize);
-    subWriters = new GrowableWriter[numPages];
    if (fillPages) {
-      for (int i = 0; i < numPages; ++i) {
-        // do not allocate for more entries than necessary on the last page
-        final int valueCount = i == numPages - 1 ? lastPageSize(size) : pageSize;
-        subWriters[i] = new GrowableWriter(startBitsPerValue, valueCount, acceptableOverheadRatio);
-      }
+      fillPages();
    }
  }

-  private int lastPageSize(long size) {
-    final int sz = indexInPage(size);
-    return sz == 0 ? pageSize() : sz;
-  }
-
-  private int pageSize() {
-    return pageMask + 1;
-  }
-
-  /** The number of values. */
-  public long size() {
-    return size;
-  }
-
-  int pageIndex(long index) {
-    return (int) (index >>> pageShift);
-  }
-
-  int indexInPage(long index) {
-    return (int) index & pageMask;
-  }
-
-  /** Get value at <code>index</code>. */
-  public long get(long index) {
-    assert index >= 0 && index < size;
-    final int pageIndex = pageIndex(index);
-    final int indexInPage = indexInPage(index);
-    return subWriters[pageIndex].get(indexInPage);
-  }
-
-  /** Set value at <code>index</code>. */
-  public void set(long index, long value) {
-    assert index >= 0 && index < size;
-    final int pageIndex = pageIndex(index);
-    final int indexInPage = indexInPage(index);
-    subWriters[pageIndex].set(indexInPage, value);
-  }
-
-  /** Create a new {@link PagedGrowableWriter} of size <code>newSize</code>
-   * based on the content of this buffer. This method is much more efficient
-   * than creating a new {@link PagedGrowableWriter} and copying values one by
-   * one. */
-  public PagedGrowableWriter resize(long newSize) {
-    final PagedGrowableWriter newWriter = new PagedGrowableWriter(newSize, pageSize(), startBitsPerValue, acceptableOverheadRatio, false);
-    final int numCommonPages = Math.min(newWriter.subWriters.length, subWriters.length);
-    final long[] copyBuffer = new long[1024];
-    for (int i = 0; i < newWriter.subWriters.length; ++i) {
-      final int valueCount = i == newWriter.subWriters.length - 1 ? lastPageSize(newSize) : pageSize();
-      final int bpv = i < numCommonPages ? subWriters[i].getBitsPerValue() : startBitsPerValue;
-      newWriter.subWriters[i] = new GrowableWriter(bpv, valueCount, acceptableOverheadRatio);
-      if (i < numCommonPages) {
-        final int copyLength = Math.min(valueCount, subWriters[i].size());
-        PackedInts.copy(subWriters[i], 0, newWriter.subWriters[i].getMutable(), 0, copyLength, copyBuffer);
-      }
-    }
-    return newWriter;
-  }
-
-  /** Return the number of bytes used by this object. */
-  public long ramBytesUsed() {
-    long bytesUsed = RamUsageEstimator.alignObjectSize(
-        RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
-        + RamUsageEstimator.NUM_BYTES_OBJECT_REF
-        + RamUsageEstimator.NUM_BYTES_LONG
-        + 3 * RamUsageEstimator.NUM_BYTES_INT
-        + RamUsageEstimator.NUM_BYTES_FLOAT);
-    bytesUsed += RamUsageEstimator.alignObjectSize(RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + (long) RamUsageEstimator.NUM_BYTES_OBJECT_REF * subWriters.length);
-    for (GrowableWriter gw : subWriters) {
-      bytesUsed += gw.ramBytesUsed();
-    }
-    return bytesUsed;
-  }
-
  @Override
-  public String toString() {
-    return getClass().getSimpleName() + "(size=" + size() + ",pageSize=" + pageSize() + ")";
+  protected Mutable newMutable(int valueCount, int bitsPerValue) {
+    return new GrowableWriter(bitsPerValue, valueCount, acceptableOverheadRatio);
  }

+  @Override
+  protected PagedGrowableWriter newUnfilledCopy(long newSize) {
+    return new PagedGrowableWriter(newSize, pageSize(), bitsPerValue, acceptableOverheadRatio, false);
+  }
+
+  @Override
+  protected long baseRamBytesUsed() {
+    return super.baseRamBytesUsed() + RamUsageEstimator.NUM_BYTES_FLOAT;
+  }
+
}
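With the shared logic hoisted into AbstractPagedMutable, PagedGrowableWriter keeps only what is specific to it: pages are GrowableWriter instances, so each page widens its bits per value independently. It also inherits resize and the new grow methods, which over-allocate by roughly one eighth (minSize + max(minSize >>> 3, 3)). A small usage sketch under those assumptions (demo class name is invented):

import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PagedGrowableWriter;

public class PagedGrowDemo {
  public static void main(String[] args) {
    PagedGrowableWriter writer =
        new PagedGrowableWriter(100, 1 << 10, 2, PackedInts.DEFAULT);

    // grow(minSize) returns this writer when it is already large enough,
    // otherwise it resizes with headroom: 1000 + (1000 >>> 3) = 1125
    PagedGrowableWriter grown = writer.grow(1000);
    System.out.println(grown.size());   // 1125

    grown.set(999, 123456L);            // only the page holding index 999 widens its bits per value
    System.out.println(grown.get(999)); // 123456
  }
}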
@@ -0,0 +1,71 @@
package org.apache.lucene.util.packed;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.PackedInts.Mutable;

/**
 * A {@link PagedMutable}. This class slices data into fixed-size blocks
 * which have the same number of bits per value. It can be a useful replacement
 * for {@link PackedInts.Mutable} to store more than 2B values.
 * @lucene.internal
 */
public final class PagedMutable extends AbstractPagedMutable<PagedMutable> {

  final PackedInts.Format format;

  /**
   * Create a new {@link PagedMutable} instance.
   *
   * @param size the number of values to store.
   * @param pageSize the number of values per page
   * @param bitsPerValue the number of bits per value
   * @param acceptableOverheadRatio an acceptable overhead ratio
   */
  public PagedMutable(long size, int pageSize, int bitsPerValue, float acceptableOverheadRatio) {
    this(size, pageSize, PackedInts.fastestFormatAndBits(pageSize, bitsPerValue, acceptableOverheadRatio));
    fillPages();
  }

  PagedMutable(long size, int pageSize, PackedInts.FormatAndBits formatAndBits) {
    this(size, pageSize, formatAndBits.bitsPerValue, formatAndBits.format);
  }

  PagedMutable(long size, int pageSize, int bitsPerValue, PackedInts.Format format) {
    super(bitsPerValue, size, pageSize);
    this.format = format;
  }

  @Override
  protected Mutable newMutable(int valueCount, int bitsPerValue) {
    assert this.bitsPerValue >= bitsPerValue;
    return PackedInts.getMutable(valueCount, this.bitsPerValue, format);
  }

  @Override
  protected PagedMutable newUnfilledCopy(long newSize) {
    return new PagedMutable(newSize, pageSize(), bitsPerValue, format);
  }

  @Override
  protected long baseRamBytesUsed() {
    return super.baseRamBytesUsed() + RamUsageEstimator.NUM_BYTES_OBJECT_REF;
  }

}
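The two concrete subclasses make different trade-offs: PagedMutable fixes one format and bit width for every page up front (cheap and predictable), while PagedGrowableWriter lets each page start narrow and widen on demand (smaller when large values are rare). A hedged side-by-side sketch, again with an invented demo class name:

import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PagedGrowableWriter;
import org.apache.lucene.util.packed.PagedMutable;

public class PagedContrastDemo {
  public static void main(String[] args) {
    // every page uses the same 12-bit storage, chosen at construction time
    PagedMutable fixed = new PagedMutable(1 << 20, 1 << 16, 12, PackedInts.COMPACT);
    fixed.set(0, 4095L);                      // any value up to 2^12 - 1

    // pages start at 1 bit per value and widen independently as values require
    PagedGrowableWriter growable = new PagedGrowableWriter(1 << 20, 1 << 16, 1, PackedInts.COMPACT);
    growable.set(12345L, 1L << 40);           // only the page holding index 12345 grows to 41 bits

    System.out.println(fixed.get(0));         // 4095
    System.out.println(growable.get(12345L)); // 1099511627776
  }
}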
@@ -697,6 +697,64 @@ public class TestPackedInts extends LuceneTestCase {
        assertEquals(0, copy.get(i));
      }
    }

    // test grow
    PagedGrowableWriter grow = writer.grow(_TestUtil.nextLong(random(), writer.size() / 2, writer.size() * 3 / 2));
    for (long i = 0; i < grow.size(); ++i) {
      if (i < writer.size()) {
        assertEquals(writer.get(i), grow.get(i));
      } else {
        assertEquals(0, grow.get(i));
      }
    }
  }

  public void testPagedMutable() {
    final int bitsPerValue = _TestUtil.nextInt(random(), 1, 64);
    final long max = PackedInts.maxValue(bitsPerValue);
    int pageSize = 1 << (_TestUtil.nextInt(random(), 6, 30));
    // supports 0 values?
    PagedMutable writer = new PagedMutable(0, pageSize, bitsPerValue, random().nextFloat() / 2);
    assertEquals(0, writer.size());

    // compare against AppendingLongBuffer
    AppendingLongBuffer buf = new AppendingLongBuffer();
    int size = random().nextInt(1000000);

    for (int i = 0; i < size; ++i) {
      buf.add(bitsPerValue == 64 ? random().nextLong() : _TestUtil.nextLong(random(), 0, max));
    }
    writer = new PagedMutable(size, pageSize, bitsPerValue, random().nextFloat());
    assertEquals(size, writer.size());
    for (int i = size - 1; i >= 0; --i) {
      writer.set(i, buf.get(i));
    }
    for (int i = 0; i < size; ++i) {
      assertEquals(buf.get(i), writer.get(i));
    }

    // test ramBytesUsed
    assertEquals(RamUsageEstimator.sizeOf(writer) - RamUsageEstimator.sizeOf(writer.format), writer.ramBytesUsed());

    // test copy
    PagedMutable copy = writer.resize(_TestUtil.nextLong(random(), writer.size() / 2, writer.size() * 3 / 2));
    for (long i = 0; i < copy.size(); ++i) {
      if (i < writer.size()) {
        assertEquals(writer.get(i), copy.get(i));
      } else {
        assertEquals(0, copy.get(i));
      }
    }

    // test grow
    PagedMutable grow = writer.grow(_TestUtil.nextLong(random(), writer.size() / 2, writer.size() * 3 / 2));
    for (long i = 0; i < grow.size(); ++i) {
      if (i < writer.size()) {
        assertEquals(writer.get(i), grow.get(i));
      } else {
        assertEquals(0, grow.get(i));
      }
    }
  }

  // memory hole