From 28567c2327f011e467a3b4f9ed0a19853f1829fd Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Mon, 21 Jan 2013 23:47:31 +0000 Subject: [PATCH] NumericDocValues: Prevent rare extreme values from raising the number of bits per value for everyone. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1436692 13f79535-47bb-0310-9956-ffa450edef68 --- .../CompressingTermVectorsReader.java | 8 +- .../lucene42/Lucene42DocValuesConsumer.java | 58 ++--- .../lucene42/Lucene42DocValuesProducer.java | 16 +- .../lucene/util/packed/BlockPackedReader.java | 229 +++-------------- .../packed/BlockPackedReaderIterator.java | 241 ++++++++++++++++++ .../lucene/util/packed/BlockPackedWriter.java | 9 +- .../lucene/util/packed/TestPackedInts.java | 44 ++-- 7 files changed, 347 insertions(+), 258 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedReaderIterator.java diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java index 7074c6b50f2..20b35e004b9 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java @@ -57,7 +57,7 @@ import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.LongsRef; -import org.apache.lucene.util.packed.BlockPackedReader; +import org.apache.lucene.util.packed.BlockPackedReaderIterator; import org.apache.lucene.util.packed.PackedInts; @@ -76,7 +76,7 @@ public final class CompressingTermVectorsReader extends TermVectorsReader implem private final int chunkSize; private final int numDocs; private boolean closed; - private final BlockPackedReader reader; + private final BlockPackedReaderIterator reader; // used by clone private CompressingTermVectorsReader(CompressingTermVectorsReader reader) { @@ -88,7 +88,7 @@ public final class CompressingTermVectorsReader extends TermVectorsReader implem this.decompressor = reader.decompressor.clone(); this.chunkSize = reader.chunkSize; this.numDocs = reader.numDocs; - this.reader = new BlockPackedReader(vectorsStream, packedIntsVersion, BLOCK_SIZE, 0); + this.reader = new BlockPackedReaderIterator(vectorsStream, packedIntsVersion, BLOCK_SIZE, 0); this.closed = false; } @@ -119,7 +119,7 @@ public final class CompressingTermVectorsReader extends TermVectorsReader implem packedIntsVersion = vectorsStream.readVInt(); chunkSize = vectorsStream.readVInt(); decompressor = compressionMode.newDecompressor(); - this.reader = new BlockPackedReader(vectorsStream, packedIntsVersion, BLOCK_SIZE, 0); + this.reader = new BlockPackedReaderIterator(vectorsStream, packedIntsVersion, BLOCK_SIZE, 0); success = true; } finally { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java index 502ceea16d5..9724511e530 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java @@ -36,6 +36,7 @@ import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST.INPUT_TYPE; import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.Util; +import org.apache.lucene.util.packed.BlockPackedWriter; import org.apache.lucene.util.packed.PackedInts; /** @@ -53,7 +54,9 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer { static final byte NUMBER = 0; static final byte BYTES = 1; static final byte FST = 2; - + + static final int BLOCK_SIZE = 4096; + final IndexOutput data, meta; Lucene42DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { @@ -97,15 +100,10 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer { } } - long delta = maxValue - minValue; - final int bitsPerValue; - if (delta < 0) { - bitsPerValue = 64; - meta.writeByte((byte)0); // delta-compressed - } else if (uniqueValues != null && PackedInts.bitsRequired(uniqueValues.size()-1) < PackedInts.bitsRequired(delta)) { + final long delta = maxValue - minValue; + if (uniqueValues != null && (delta < 0 || PackedInts.bitsRequired(uniqueValues.size()-1) < PackedInts.bitsRequired(delta))) { // smaller to tableize - bitsPerValue = PackedInts.bitsRequired(uniqueValues.size()-1); - minValue = 0; // we will write indexes into the table instead of values + final int bitsPerValue = PackedInts.bitsRequired(uniqueValues.size()-1); meta.writeByte((byte)1); // table-compressed Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]); final HashMap encode = new HashMap(); @@ -114,39 +112,29 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer { data.writeLong(decode[i]); encode.put(decode[i], i); } - final Iterable original = values; - values = new Iterable() { - @Override - public Iterator iterator() { - final Iterator inner = original.iterator(); - return new Iterator() { - @Override - public boolean hasNext() { - return inner.hasNext(); - } - @Override - public Number next() { - return encode.get(inner.next()); - } + data.writeVInt(PackedInts.VERSION_CURRENT); + data.writeVInt(count); + data.writeVInt(bitsPerValue); - @Override - public void remove() { throw new UnsupportedOperationException(); } - }; - } - }; + final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, PackedInts.Format.PACKED, count, bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE); + for(Number nv : values) { + writer.add(encode.get(nv)); + } + writer.finish(); } else { - bitsPerValue = PackedInts.bitsRequired(delta); meta.writeByte((byte)0); // delta-compressed - } - data.writeLong(minValue); + data.writeVInt(PackedInts.VERSION_CURRENT); + data.writeVInt(count); + data.writeVInt(BLOCK_SIZE); - final PackedInts.Writer writer = PackedInts.getWriter(data, count, bitsPerValue, PackedInts.COMPACT); - for(Number nv : values) { - writer.add(nv.longValue() - minValue); + final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE); + for (Number nv : values) { + writer.add(nv.longValue()); + } + writer.finish(); } - writer.finish(); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java index 93516173a34..9cd9ee2ac77 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java @@ -42,6 +42,7 @@ import org.apache.lucene.util.fst.FST.Arc; import org.apache.lucene.util.fst.FST.BytesReader; import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.Util; +import org.apache.lucene.util.packed.BlockPackedReader; import org.apache.lucene.util.packed.PackedInts; class Lucene42DocValuesProducer extends DocValuesProducer { @@ -139,9 +140,10 @@ class Lucene42DocValuesProducer extends DocValuesProducer { for (int i = 0; i < decode.length; i++) { decode[i] = data.readLong(); } - final long minValue = data.readLong(); - assert minValue == 0; - final PackedInts.Reader reader = PackedInts.getReader(data); + final int packedIntsVersion = data.readVInt(); + final int count = data.readVInt(); + final int bitsPerValue = data.readVInt(); + final PackedInts.Reader reader = PackedInts.getReaderNoHeader(data, PackedInts.Format.PACKED, packedIntsVersion, count, bitsPerValue); return new NumericDocValues() { @Override public long get(int docID) { @@ -149,12 +151,14 @@ class Lucene42DocValuesProducer extends DocValuesProducer { } }; } else { - final long minValue = data.readLong(); - final PackedInts.Reader reader = PackedInts.getReader(data); + final int packedIntsVersion = data.readVInt(); + final int count = data.readVInt(); + final int blockSize = data.readVInt(); + final BlockPackedReader reader = new BlockPackedReader(data, packedIntsVersion, blockSize, count, false); return new NumericDocValues() { @Override public long get(int docID) { - return minValue + reader.get(docID); + return reader.get(docID); } }; } diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedReader.java b/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedReader.java index 4682c5d7f89..c2e8a8ef2e1 100644 --- a/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedReader.java +++ b/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedReader.java @@ -17,225 +17,74 @@ package org.apache.lucene.util.packed; * limitations under the License. */ +import static org.apache.lucene.util.packed.BlockPackedReaderIterator.readVLong; +import static org.apache.lucene.util.packed.BlockPackedReaderIterator.zigZagDecode; import static org.apache.lucene.util.packed.BlockPackedWriter.BPV_SHIFT; import static org.apache.lucene.util.packed.BlockPackedWriter.MIN_VALUE_EQUALS_0; import static org.apache.lucene.util.packed.BlockPackedWriter.checkBlockSize; -import java.io.EOFException; import java.io.IOException; -import java.util.Arrays; -import org.apache.lucene.store.DataInput; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.util.LongsRef; /** - * Reader for sequences of longs written with {@link BlockPackedWriter}. - * @see BlockPackedWriter + * Provides random access to a stream written with {@link BlockPackedWriter}. * @lucene.internal */ public final class BlockPackedReader { - static long zigZagDecode(long n) { - return ((n >>> 1) ^ -(n & 1)); - } + private final int blockShift, blockMask; + private final long valueCount; + private final long[] minValues; + private final PackedInts.Reader[] subReaders; - // same as DataInput.readVLong but supports negative values - static long readVLong(DataInput in) throws IOException { - byte b = in.readByte(); - if (b >= 0) return b; - long i = b & 0x7FL; - b = in.readByte(); - i |= (b & 0x7FL) << 7; - if (b >= 0) return i; - b = in.readByte(); - i |= (b & 0x7FL) << 14; - if (b >= 0) return i; - b = in.readByte(); - i |= (b & 0x7FL) << 21; - if (b >= 0) return i; - b = in.readByte(); - i |= (b & 0x7FL) << 28; - if (b >= 0) return i; - b = in.readByte(); - i |= (b & 0x7FL) << 35; - if (b >= 0) return i; - b = in.readByte(); - i |= (b & 0x7FL) << 42; - if (b >= 0) return i; - b = in.readByte(); - i |= (b & 0x7FL) << 49; - if (b >= 0) return i; - b = in.readByte(); - i |= (b & 0xFFL) << 56; - return i; - } - - DataInput in; - final int packedIntsVersion; - long valueCount; - final int blockSize; - final long[] values; - final LongsRef valuesRef; - byte[] blocks; - int off; - long ord; - - /** Sole constructor. - * @param blockSize the number of values of a block, must be equal to the - * block size of the {@link BlockPackedWriter} which has - * been used to write the stream - */ - public BlockPackedReader(DataInput in, int packedIntsVersion, int blockSize, long valueCount) { + /** Sole constructor. */ + public BlockPackedReader(IndexInput in, int packedIntsVersion, int blockSize, long valueCount, boolean direct) throws IOException { checkBlockSize(blockSize); - this.packedIntsVersion = packedIntsVersion; - this.blockSize = blockSize; - this.values = new long[blockSize]; - this.valuesRef = new LongsRef(this.values, 0, 0); - reset(in, valueCount); - } - - /** Reset the current reader to wrap a stream of valueCount - * values contained in in. The block size remains unchanged. */ - public void reset(DataInput in, long valueCount) { - this.in = in; - assert valueCount >= 0; this.valueCount = valueCount; - off = blockSize; - ord = 0; - } - - /** Skip exactly count values. */ - public void skip(long count) throws IOException { - assert count >= 0; - if (ord + count > valueCount || ord + count < 0) { - throw new EOFException(); + blockShift = Long.numberOfTrailingZeros(blockSize); + blockMask = blockSize - 1; + final int numBlocks = (int) (valueCount / blockSize) + (valueCount % blockSize == 0 ? 0 : 1); + if (numBlocks * blockSize < valueCount) { + throw new IllegalArgumentException("valueCount is too large for this block size"); } - - // 1. skip buffered values - final int skipBuffer = (int) Math.min(count, blockSize - off); - off += skipBuffer; - ord += skipBuffer; - count -= skipBuffer; - if (count == 0L) { - return; - } - - // 2. skip as many blocks as necessary - assert off == blockSize; - while (count >= blockSize) { + long[] minValues = null; + subReaders = new PackedInts.Reader[numBlocks]; + for (int i = 0; i < numBlocks; ++i) { final int token = in.readByte() & 0xFF; final int bitsPerValue = token >>> BPV_SHIFT; if (bitsPerValue > 64) { throw new IOException("Corrupted"); } if ((token & MIN_VALUE_EQUALS_0) == 0) { - readVLong(in); + if (minValues == null) { + minValues = new long[numBlocks]; + } + minValues[i] = zigZagDecode(1L + readVLong(in)); } - final long blockBytes = PackedInts.Format.PACKED.byteCount(packedIntsVersion, blockSize, bitsPerValue); - skipBytes(blockBytes); - ord += blockSize; - count -= blockSize; - } - if (count == 0L) { - return; - } - - // 3. skip last values - assert count < blockSize; - refill(); - ord += count; - off += count; - } - - private void skipBytes(long count) throws IOException { - if (in instanceof IndexInput) { - final IndexInput iin = (IndexInput) in; - iin.seek(iin.getFilePointer() + count); - } else { - if (blocks == null) { - blocks = new byte[blockSize]; - } - long skipped = 0; - while (skipped < count) { - final int toSkip = (int) Math.min(blocks.length, count - skipped); - in.readBytes(blocks, 0, toSkip); - skipped += toSkip; - } - } - } - - /** Read the next value. */ - public long next() throws IOException { - if (ord == valueCount) { - throw new EOFException(); - } - if (off == blockSize) { - refill(); - } - final long value = values[off++]; - ++ord; - return value; - } - - /** Read between 1 and count values. */ - public LongsRef next(int count) throws IOException { - assert count > 0; - if (ord == valueCount) { - throw new EOFException(); - } - if (off == blockSize) { - refill(); - } - - count = Math.min(count, blockSize - off); - count = (int) Math.min(count, valueCount - ord); - - valuesRef.offset = off; - valuesRef.length = count; - off += count; - ord += count; - return valuesRef; - } - - private void refill() throws IOException { - final int token = in.readByte() & 0xFF; - final boolean minEquals0 = (token & MIN_VALUE_EQUALS_0) != 0; - final int bitsPerValue = token >>> BPV_SHIFT; - if (bitsPerValue > 64) { - throw new IOException("Corrupted"); - } - final long minValue = minEquals0 ? 0L : zigZagDecode(1L + readVLong(in)); - assert minEquals0 || minValue != 0; - - if (bitsPerValue == 0) { - Arrays.fill(values, minValue); - } else { - final PackedInts.Decoder decoder = PackedInts.getDecoder(PackedInts.Format.PACKED, packedIntsVersion, bitsPerValue); - final int iterations = blockSize / decoder.valueCount(); - final int blocksSize = iterations * 8 * decoder.blockCount(); - if (blocks == null || blocks.length < blocksSize) { - blocks = new byte[blocksSize]; - } - - final int valueCount = (int) Math.min(this.valueCount - ord, blockSize); - final int blocksCount = (int) PackedInts.Format.PACKED.byteCount(packedIntsVersion, valueCount, bitsPerValue); - in.readBytes(blocks, 0, blocksCount); - - decoder.decode(blocks, 0, values, 0, iterations); - - if (minValue != 0) { - for (int i = 0; i < valueCount; ++i) { - values[i] += minValue; + if (bitsPerValue != 0) { + final int size = (int) Math.min(blockSize, valueCount - (long) i * blockSize); + if (direct) { + final long pointer = in.getFilePointer(); + subReaders[i] = PackedInts.getDirectReaderNoHeader(in, PackedInts.Format.PACKED, packedIntsVersion, size, bitsPerValue); + in.seek(pointer + PackedInts.Format.PACKED.byteCount(packedIntsVersion, size, bitsPerValue)); + } else { + subReaders[i] = PackedInts.getReaderNoHeader(in, PackedInts.Format.PACKED, packedIntsVersion, size, bitsPerValue); } } } - off = 0; + this.minValues = minValues; } - /** Return the offset of the next value to read. */ - public long ord() { - return ord; + /** Get value at index. */ + public long get(long index) { + assert index >= 0 && index < valueCount; + final int block = (int) (index >>> blockShift); + if (subReaders[block] == null) { + return minValues == null ? 0 : minValues[block]; + } + final int idx = (int) (index & blockMask); + return (minValues == null ? 0 : minValues[block]) + subReaders[block].get(idx); } } diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedReaderIterator.java b/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedReaderIterator.java new file mode 100644 index 00000000000..0b59f32c4d2 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedReaderIterator.java @@ -0,0 +1,241 @@ +package org.apache.lucene.util.packed; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import static org.apache.lucene.util.packed.BlockPackedWriter.BPV_SHIFT; +import static org.apache.lucene.util.packed.BlockPackedWriter.MIN_VALUE_EQUALS_0; +import static org.apache.lucene.util.packed.BlockPackedWriter.checkBlockSize; + +import java.io.EOFException; +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.LongsRef; + +/** + * Reader for sequences of longs written with {@link BlockPackedWriter}. + * @see BlockPackedWriter + * @lucene.internal + */ +public final class BlockPackedReaderIterator { + + static long zigZagDecode(long n) { + return ((n >>> 1) ^ -(n & 1)); + } + + // same as DataInput.readVLong but supports negative values + static long readVLong(DataInput in) throws IOException { + byte b = in.readByte(); + if (b >= 0) return b; + long i = b & 0x7FL; + b = in.readByte(); + i |= (b & 0x7FL) << 7; + if (b >= 0) return i; + b = in.readByte(); + i |= (b & 0x7FL) << 14; + if (b >= 0) return i; + b = in.readByte(); + i |= (b & 0x7FL) << 21; + if (b >= 0) return i; + b = in.readByte(); + i |= (b & 0x7FL) << 28; + if (b >= 0) return i; + b = in.readByte(); + i |= (b & 0x7FL) << 35; + if (b >= 0) return i; + b = in.readByte(); + i |= (b & 0x7FL) << 42; + if (b >= 0) return i; + b = in.readByte(); + i |= (b & 0x7FL) << 49; + if (b >= 0) return i; + b = in.readByte(); + i |= (b & 0xFFL) << 56; + return i; + } + + DataInput in; + final int packedIntsVersion; + long valueCount; + final int blockSize; + final long[] values; + final LongsRef valuesRef; + byte[] blocks; + int off; + long ord; + + /** Sole constructor. + * @param blockSize the number of values of a block, must be equal to the + * block size of the {@link BlockPackedWriter} which has + * been used to write the stream + */ + public BlockPackedReaderIterator(DataInput in, int packedIntsVersion, int blockSize, long valueCount) { + checkBlockSize(blockSize); + this.packedIntsVersion = packedIntsVersion; + this.blockSize = blockSize; + this.values = new long[blockSize]; + this.valuesRef = new LongsRef(this.values, 0, 0); + reset(in, valueCount); + } + + /** Reset the current reader to wrap a stream of valueCount + * values contained in in. The block size remains unchanged. */ + public void reset(DataInput in, long valueCount) { + this.in = in; + assert valueCount >= 0; + this.valueCount = valueCount; + off = blockSize; + ord = 0; + } + + /** Skip exactly count values. */ + public void skip(long count) throws IOException { + assert count >= 0; + if (ord + count > valueCount || ord + count < 0) { + throw new EOFException(); + } + + // 1. skip buffered values + final int skipBuffer = (int) Math.min(count, blockSize - off); + off += skipBuffer; + ord += skipBuffer; + count -= skipBuffer; + if (count == 0L) { + return; + } + + // 2. skip as many blocks as necessary + assert off == blockSize; + while (count >= blockSize) { + final int token = in.readByte() & 0xFF; + final int bitsPerValue = token >>> BPV_SHIFT; + if (bitsPerValue > 64) { + throw new IOException("Corrupted"); + } + if ((token & MIN_VALUE_EQUALS_0) == 0) { + readVLong(in); + } + final long blockBytes = PackedInts.Format.PACKED.byteCount(packedIntsVersion, blockSize, bitsPerValue); + skipBytes(blockBytes); + ord += blockSize; + count -= blockSize; + } + if (count == 0L) { + return; + } + + // 3. skip last values + assert count < blockSize; + refill(); + ord += count; + off += count; + } + + private void skipBytes(long count) throws IOException { + if (in instanceof IndexInput) { + final IndexInput iin = (IndexInput) in; + iin.seek(iin.getFilePointer() + count); + } else { + if (blocks == null) { + blocks = new byte[blockSize]; + } + long skipped = 0; + while (skipped < count) { + final int toSkip = (int) Math.min(blocks.length, count - skipped); + in.readBytes(blocks, 0, toSkip); + skipped += toSkip; + } + } + } + + /** Read the next value. */ + public long next() throws IOException { + if (ord == valueCount) { + throw new EOFException(); + } + if (off == blockSize) { + refill(); + } + final long value = values[off++]; + ++ord; + return value; + } + + /** Read between 1 and count values. */ + public LongsRef next(int count) throws IOException { + assert count > 0; + if (ord == valueCount) { + throw new EOFException(); + } + if (off == blockSize) { + refill(); + } + + count = Math.min(count, blockSize - off); + count = (int) Math.min(count, valueCount - ord); + + valuesRef.offset = off; + valuesRef.length = count; + off += count; + ord += count; + return valuesRef; + } + + private void refill() throws IOException { + final int token = in.readByte() & 0xFF; + final boolean minEquals0 = (token & MIN_VALUE_EQUALS_0) != 0; + final int bitsPerValue = token >>> BPV_SHIFT; + if (bitsPerValue > 64) { + throw new IOException("Corrupted"); + } + final long minValue = minEquals0 ? 0L : zigZagDecode(1L + readVLong(in)); + assert minEquals0 || minValue != 0; + + if (bitsPerValue == 0) { + Arrays.fill(values, minValue); + } else { + final PackedInts.Decoder decoder = PackedInts.getDecoder(PackedInts.Format.PACKED, packedIntsVersion, bitsPerValue); + final int iterations = blockSize / decoder.valueCount(); + final int blocksSize = iterations * 8 * decoder.blockCount(); + if (blocks == null || blocks.length < blocksSize) { + blocks = new byte[blocksSize]; + } + + final int valueCount = (int) Math.min(this.valueCount - ord, blockSize); + final int blocksCount = (int) PackedInts.Format.PACKED.byteCount(packedIntsVersion, valueCount, bitsPerValue); + in.readBytes(blocks, 0, blocksCount); + + decoder.decode(blocks, 0, values, 0, iterations); + + if (minValue != 0) { + for (int i = 0; i < valueCount; ++i) { + values[i] += minValue; + } + } + } + off = 0; + } + + /** Return the offset of the next value to read. */ + public long ord() { + return ord; + } + +} diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedWriter.java b/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedWriter.java index a8c74a2a4f9..89cf5e94cef 100644 --- a/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedWriter.java @@ -30,7 +30,7 @@ import org.apache.lucene.store.DataOutput; * using as few bits as possible. Memory usage of this class is proportional to * the block size. Each block has an overhead between 1 and 10 bytes to store * the minimum value and the number of bits per value of the block. - * @see BlockPackedReader + * @see BlockPackedReaderIterator * @lucene.internal */ public final class BlockPackedWriter { @@ -43,8 +43,11 @@ public final class BlockPackedWriter { if (blockSize <= 0 || blockSize > MAX_BLOCK_SIZE) { throw new IllegalArgumentException("blockSize must be > 0 and < " + MAX_BLOCK_SIZE + ", got " + blockSize); } - if (blockSize % 64 != 0) { - throw new IllegalArgumentException("blockSize must be a multiple of 64, got " + blockSize); + if (blockSize < 64) { + throw new IllegalArgumentException("blockSize must be >= 64, got " + blockSize); + } + if ((blockSize & (blockSize - 1)) != 0) { + throw new IllegalArgumentException("blockSize must be a power of two, got " + blockSize); } } diff --git a/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java b/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java index 18629bc22c7..aed199b3565 100644 --- a/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java +++ b/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java @@ -877,10 +877,11 @@ public class TestPackedInts extends LuceneTestCase { in.close(); dir.close(); } + public void testBlockPackedReaderWriter() throws IOException { final int iters = atLeast(2); for (int iter = 0; iter < iters; ++iter) { - final int blockSize = 64 * _TestUtil.nextInt(random(), 1, 1 << 12); + final int blockSize = 1 << _TestUtil.nextInt(random(), 6, 18); final int valueCount = random().nextInt(1 << 18); final long[] values = new long[valueCount]; long minValue = 0; @@ -912,30 +913,29 @@ public class TestPackedInts extends LuceneTestCase { final long fp = out.getFilePointer(); out.close(); - DataInput in = dir.openInput("out.bin", IOContext.DEFAULT); - if (random().nextBoolean()) { - byte[] buf = new byte[(int) fp]; - in.readBytes(buf, 0, (int) fp); - ((IndexInput) in).close(); - in = new ByteArrayDataInput(buf); - } - final BlockPackedReader reader = new BlockPackedReader(in, PackedInts.VERSION_CURRENT, blockSize, valueCount); + IndexInput in1 = dir.openInput("out.bin", IOContext.DEFAULT); + byte[] buf = new byte[(int) fp]; + in1.readBytes(buf, 0, (int) fp); + in1.seek(0L); + ByteArrayDataInput in2 = new ByteArrayDataInput(buf); + final DataInput in = random().nextBoolean() ? in1 : in2; + final BlockPackedReaderIterator it = new BlockPackedReaderIterator(in, PackedInts.VERSION_CURRENT, blockSize, valueCount); for (int i = 0; i < valueCount; ) { if (random().nextBoolean()) { - assertEquals("" + i, values[i], reader.next()); + assertEquals("" + i, values[i], it.next()); ++i; } else { - final LongsRef nextValues = reader.next(_TestUtil.nextInt(random(), 1, 1024)); + final LongsRef nextValues = it.next(_TestUtil.nextInt(random(), 1, 1024)); for (int j = 0; j < nextValues.length; ++j) { assertEquals("" + (i + j), values[i + j], nextValues.longs[nextValues.offset + j]); } i += nextValues.length; } - assertEquals(i, reader.ord()); + assertEquals(i, it.ord()); } assertEquals(fp, in instanceof ByteArrayDataInput ? ((ByteArrayDataInput) in).getPosition() : ((IndexInput) in).getFilePointer()); try { - reader.next(); + it.next(); assertTrue(false); } catch (IOException e) { // OK @@ -946,31 +946,35 @@ public class TestPackedInts extends LuceneTestCase { } else { ((IndexInput) in).seek(0L); } - final BlockPackedReader reader2 = new BlockPackedReader(in, PackedInts.VERSION_CURRENT, blockSize, valueCount); + final BlockPackedReaderIterator it2 = new BlockPackedReaderIterator(in, PackedInts.VERSION_CURRENT, blockSize, valueCount); int i = 0; while (true) { final int skip = _TestUtil.nextInt(random(), 0, valueCount - i); - reader2.skip(skip); + it2.skip(skip); i += skip; - assertEquals(i, reader2.ord()); + assertEquals(i, it2.ord()); if (i == valueCount) { break; } else { - assertEquals(values[i], reader2.next()); + assertEquals(values[i], it2.next()); ++i; } } assertEquals(fp, in instanceof ByteArrayDataInput ? ((ByteArrayDataInput) in).getPosition() : ((IndexInput) in).getFilePointer()); try { - reader2.skip(1); + it2.skip(1); assertTrue(false); } catch (IOException e) { // OK } - if (in instanceof IndexInput) { - ((IndexInput) in).close(); + in1.seek(0L); + final BlockPackedReader reader = new BlockPackedReader(in1, PackedInts.VERSION_CURRENT, blockSize, valueCount, random().nextBoolean()); + for (i = 0; i < valueCount; ++i) { + assertEquals("i=" + i, values[i], reader.get(i)); } + assertEquals(in1.getFilePointer(), in1.length()); + in1.close(); dir.close(); } }