mirror of https://github.com/apache/lucene.git
NumericDocValues: Prevent rare extreme values from raising the number of bits per value for everyone.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1436692 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
36ebc71124
commit
28567c2327
|
@ -57,7 +57,7 @@ import org.apache.lucene.util.Bits;
|
|||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.LongsRef;
|
||||
import org.apache.lucene.util.packed.BlockPackedReader;
|
||||
import org.apache.lucene.util.packed.BlockPackedReaderIterator;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
|
||||
|
@ -76,7 +76,7 @@ public final class CompressingTermVectorsReader extends TermVectorsReader implem
|
|||
private final int chunkSize;
|
||||
private final int numDocs;
|
||||
private boolean closed;
|
||||
private final BlockPackedReader reader;
|
||||
private final BlockPackedReaderIterator reader;
|
||||
|
||||
// used by clone
|
||||
private CompressingTermVectorsReader(CompressingTermVectorsReader reader) {
|
||||
|
@ -88,7 +88,7 @@ public final class CompressingTermVectorsReader extends TermVectorsReader implem
|
|||
this.decompressor = reader.decompressor.clone();
|
||||
this.chunkSize = reader.chunkSize;
|
||||
this.numDocs = reader.numDocs;
|
||||
this.reader = new BlockPackedReader(vectorsStream, packedIntsVersion, BLOCK_SIZE, 0);
|
||||
this.reader = new BlockPackedReaderIterator(vectorsStream, packedIntsVersion, BLOCK_SIZE, 0);
|
||||
this.closed = false;
|
||||
}
|
||||
|
||||
|
@ -119,7 +119,7 @@ public final class CompressingTermVectorsReader extends TermVectorsReader implem
|
|||
packedIntsVersion = vectorsStream.readVInt();
|
||||
chunkSize = vectorsStream.readVInt();
|
||||
decompressor = compressionMode.newDecompressor();
|
||||
this.reader = new BlockPackedReader(vectorsStream, packedIntsVersion, BLOCK_SIZE, 0);
|
||||
this.reader = new BlockPackedReaderIterator(vectorsStream, packedIntsVersion, BLOCK_SIZE, 0);
|
||||
|
||||
success = true;
|
||||
} finally {
|
||||
|
|
|
@ -36,6 +36,7 @@ import org.apache.lucene.util.fst.FST;
|
|||
import org.apache.lucene.util.fst.FST.INPUT_TYPE;
|
||||
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
||||
import org.apache.lucene.util.fst.Util;
|
||||
import org.apache.lucene.util.packed.BlockPackedWriter;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/**
|
||||
|
@ -54,6 +55,8 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer {
|
|||
static final byte BYTES = 1;
|
||||
static final byte FST = 2;
|
||||
|
||||
static final int BLOCK_SIZE = 4096;
|
||||
|
||||
final IndexOutput data, meta;
|
||||
|
||||
Lucene42DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
|
||||
|
@ -97,15 +100,10 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer {
|
|||
}
|
||||
}
|
||||
|
||||
long delta = maxValue - minValue;
|
||||
final int bitsPerValue;
|
||||
if (delta < 0) {
|
||||
bitsPerValue = 64;
|
||||
meta.writeByte((byte)0); // delta-compressed
|
||||
} else if (uniqueValues != null && PackedInts.bitsRequired(uniqueValues.size()-1) < PackedInts.bitsRequired(delta)) {
|
||||
final long delta = maxValue - minValue;
|
||||
if (uniqueValues != null && (delta < 0 || PackedInts.bitsRequired(uniqueValues.size()-1) < PackedInts.bitsRequired(delta))) {
|
||||
// smaller to tableize
|
||||
bitsPerValue = PackedInts.bitsRequired(uniqueValues.size()-1);
|
||||
minValue = 0; // we will write indexes into the table instead of values
|
||||
final int bitsPerValue = PackedInts.bitsRequired(uniqueValues.size()-1);
|
||||
meta.writeByte((byte)1); // table-compressed
|
||||
Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]);
|
||||
final HashMap<Long,Integer> encode = new HashMap<Long,Integer>();
|
||||
|
@ -114,39 +112,29 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer {
|
|||
data.writeLong(decode[i]);
|
||||
encode.put(decode[i], i);
|
||||
}
|
||||
final Iterable<Number> original = values;
|
||||
values = new Iterable<Number>() {
|
||||
@Override
|
||||
public Iterator<Number> iterator() {
|
||||
final Iterator<Number> inner = original.iterator();
|
||||
return new Iterator<Number>() {
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return inner.hasNext();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Number next() {
|
||||
return encode.get(inner.next());
|
||||
}
|
||||
data.writeVInt(PackedInts.VERSION_CURRENT);
|
||||
data.writeVInt(count);
|
||||
data.writeVInt(bitsPerValue);
|
||||
|
||||
@Override
|
||||
public void remove() { throw new UnsupportedOperationException(); }
|
||||
};
|
||||
}
|
||||
};
|
||||
final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, PackedInts.Format.PACKED, count, bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
|
||||
for(Number nv : values) {
|
||||
writer.add(encode.get(nv));
|
||||
}
|
||||
writer.finish();
|
||||
} else {
|
||||
bitsPerValue = PackedInts.bitsRequired(delta);
|
||||
meta.writeByte((byte)0); // delta-compressed
|
||||
}
|
||||
|
||||
data.writeLong(minValue);
|
||||
data.writeVInt(PackedInts.VERSION_CURRENT);
|
||||
data.writeVInt(count);
|
||||
data.writeVInt(BLOCK_SIZE);
|
||||
|
||||
final PackedInts.Writer writer = PackedInts.getWriter(data, count, bitsPerValue, PackedInts.COMPACT);
|
||||
for(Number nv : values) {
|
||||
writer.add(nv.longValue() - minValue);
|
||||
final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
|
||||
for (Number nv : values) {
|
||||
writer.add(nv.longValue());
|
||||
}
|
||||
writer.finish();
|
||||
}
|
||||
writer.finish();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -42,6 +42,7 @@ import org.apache.lucene.util.fst.FST.Arc;
|
|||
import org.apache.lucene.util.fst.FST.BytesReader;
|
||||
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
||||
import org.apache.lucene.util.fst.Util;
|
||||
import org.apache.lucene.util.packed.BlockPackedReader;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
class Lucene42DocValuesProducer extends DocValuesProducer {
|
||||
|
@ -139,9 +140,10 @@ class Lucene42DocValuesProducer extends DocValuesProducer {
|
|||
for (int i = 0; i < decode.length; i++) {
|
||||
decode[i] = data.readLong();
|
||||
}
|
||||
final long minValue = data.readLong();
|
||||
assert minValue == 0;
|
||||
final PackedInts.Reader reader = PackedInts.getReader(data);
|
||||
final int packedIntsVersion = data.readVInt();
|
||||
final int count = data.readVInt();
|
||||
final int bitsPerValue = data.readVInt();
|
||||
final PackedInts.Reader reader = PackedInts.getReaderNoHeader(data, PackedInts.Format.PACKED, packedIntsVersion, count, bitsPerValue);
|
||||
return new NumericDocValues() {
|
||||
@Override
|
||||
public long get(int docID) {
|
||||
|
@ -149,12 +151,14 @@ class Lucene42DocValuesProducer extends DocValuesProducer {
|
|||
}
|
||||
};
|
||||
} else {
|
||||
final long minValue = data.readLong();
|
||||
final PackedInts.Reader reader = PackedInts.getReader(data);
|
||||
final int packedIntsVersion = data.readVInt();
|
||||
final int count = data.readVInt();
|
||||
final int blockSize = data.readVInt();
|
||||
final BlockPackedReader reader = new BlockPackedReader(data, packedIntsVersion, blockSize, count, false);
|
||||
return new NumericDocValues() {
|
||||
@Override
|
||||
public long get(int docID) {
|
||||
return minValue + reader.get(docID);
|
||||
return reader.get(docID);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
|
@ -17,225 +17,74 @@ package org.apache.lucene.util.packed;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import static org.apache.lucene.util.packed.BlockPackedReaderIterator.readVLong;
|
||||
import static org.apache.lucene.util.packed.BlockPackedReaderIterator.zigZagDecode;
|
||||
import static org.apache.lucene.util.packed.BlockPackedWriter.BPV_SHIFT;
|
||||
import static org.apache.lucene.util.packed.BlockPackedWriter.MIN_VALUE_EQUALS_0;
|
||||
import static org.apache.lucene.util.packed.BlockPackedWriter.checkBlockSize;
|
||||
|
||||
import java.io.EOFException;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.util.LongsRef;
|
||||
|
||||
/**
|
||||
* Reader for sequences of longs written with {@link BlockPackedWriter}.
|
||||
* @see BlockPackedWriter
|
||||
* Provides random access to a stream written with {@link BlockPackedWriter}.
|
||||
* @lucene.internal
|
||||
*/
|
||||
public final class BlockPackedReader {
|
||||
|
||||
static long zigZagDecode(long n) {
|
||||
return ((n >>> 1) ^ -(n & 1));
|
||||
}
|
||||
private final int blockShift, blockMask;
|
||||
private final long valueCount;
|
||||
private final long[] minValues;
|
||||
private final PackedInts.Reader[] subReaders;
|
||||
|
||||
// same as DataInput.readVLong but supports negative values
|
||||
static long readVLong(DataInput in) throws IOException {
|
||||
byte b = in.readByte();
|
||||
if (b >= 0) return b;
|
||||
long i = b & 0x7FL;
|
||||
b = in.readByte();
|
||||
i |= (b & 0x7FL) << 7;
|
||||
if (b >= 0) return i;
|
||||
b = in.readByte();
|
||||
i |= (b & 0x7FL) << 14;
|
||||
if (b >= 0) return i;
|
||||
b = in.readByte();
|
||||
i |= (b & 0x7FL) << 21;
|
||||
if (b >= 0) return i;
|
||||
b = in.readByte();
|
||||
i |= (b & 0x7FL) << 28;
|
||||
if (b >= 0) return i;
|
||||
b = in.readByte();
|
||||
i |= (b & 0x7FL) << 35;
|
||||
if (b >= 0) return i;
|
||||
b = in.readByte();
|
||||
i |= (b & 0x7FL) << 42;
|
||||
if (b >= 0) return i;
|
||||
b = in.readByte();
|
||||
i |= (b & 0x7FL) << 49;
|
||||
if (b >= 0) return i;
|
||||
b = in.readByte();
|
||||
i |= (b & 0xFFL) << 56;
|
||||
return i;
|
||||
}
|
||||
|
||||
DataInput in;
|
||||
final int packedIntsVersion;
|
||||
long valueCount;
|
||||
final int blockSize;
|
||||
final long[] values;
|
||||
final LongsRef valuesRef;
|
||||
byte[] blocks;
|
||||
int off;
|
||||
long ord;
|
||||
|
||||
/** Sole constructor.
|
||||
* @param blockSize the number of values of a block, must be equal to the
|
||||
* block size of the {@link BlockPackedWriter} which has
|
||||
* been used to write the stream
|
||||
*/
|
||||
public BlockPackedReader(DataInput in, int packedIntsVersion, int blockSize, long valueCount) {
|
||||
/** Sole constructor. */
|
||||
public BlockPackedReader(IndexInput in, int packedIntsVersion, int blockSize, long valueCount, boolean direct) throws IOException {
|
||||
checkBlockSize(blockSize);
|
||||
this.packedIntsVersion = packedIntsVersion;
|
||||
this.blockSize = blockSize;
|
||||
this.values = new long[blockSize];
|
||||
this.valuesRef = new LongsRef(this.values, 0, 0);
|
||||
reset(in, valueCount);
|
||||
}
|
||||
|
||||
/** Reset the current reader to wrap a stream of <code>valueCount</code>
|
||||
* values contained in <code>in</code>. The block size remains unchanged. */
|
||||
public void reset(DataInput in, long valueCount) {
|
||||
this.in = in;
|
||||
assert valueCount >= 0;
|
||||
this.valueCount = valueCount;
|
||||
off = blockSize;
|
||||
ord = 0;
|
||||
}
|
||||
|
||||
/** Skip exactly <code>count</code> values. */
|
||||
public void skip(long count) throws IOException {
|
||||
assert count >= 0;
|
||||
if (ord + count > valueCount || ord + count < 0) {
|
||||
throw new EOFException();
|
||||
blockShift = Long.numberOfTrailingZeros(blockSize);
|
||||
blockMask = blockSize - 1;
|
||||
final int numBlocks = (int) (valueCount / blockSize) + (valueCount % blockSize == 0 ? 0 : 1);
|
||||
if (numBlocks * blockSize < valueCount) {
|
||||
throw new IllegalArgumentException("valueCount is too large for this block size");
|
||||
}
|
||||
|
||||
// 1. skip buffered values
|
||||
final int skipBuffer = (int) Math.min(count, blockSize - off);
|
||||
off += skipBuffer;
|
||||
ord += skipBuffer;
|
||||
count -= skipBuffer;
|
||||
if (count == 0L) {
|
||||
return;
|
||||
}
|
||||
|
||||
// 2. skip as many blocks as necessary
|
||||
assert off == blockSize;
|
||||
while (count >= blockSize) {
|
||||
long[] minValues = null;
|
||||
subReaders = new PackedInts.Reader[numBlocks];
|
||||
for (int i = 0; i < numBlocks; ++i) {
|
||||
final int token = in.readByte() & 0xFF;
|
||||
final int bitsPerValue = token >>> BPV_SHIFT;
|
||||
if (bitsPerValue > 64) {
|
||||
throw new IOException("Corrupted");
|
||||
}
|
||||
if ((token & MIN_VALUE_EQUALS_0) == 0) {
|
||||
readVLong(in);
|
||||
if (minValues == null) {
|
||||
minValues = new long[numBlocks];
|
||||
}
|
||||
minValues[i] = zigZagDecode(1L + readVLong(in));
|
||||
}
|
||||
final long blockBytes = PackedInts.Format.PACKED.byteCount(packedIntsVersion, blockSize, bitsPerValue);
|
||||
skipBytes(blockBytes);
|
||||
ord += blockSize;
|
||||
count -= blockSize;
|
||||
}
|
||||
if (count == 0L) {
|
||||
return;
|
||||
}
|
||||
|
||||
// 3. skip last values
|
||||
assert count < blockSize;
|
||||
refill();
|
||||
ord += count;
|
||||
off += count;
|
||||
}
|
||||
|
||||
private void skipBytes(long count) throws IOException {
|
||||
if (in instanceof IndexInput) {
|
||||
final IndexInput iin = (IndexInput) in;
|
||||
iin.seek(iin.getFilePointer() + count);
|
||||
} else {
|
||||
if (blocks == null) {
|
||||
blocks = new byte[blockSize];
|
||||
}
|
||||
long skipped = 0;
|
||||
while (skipped < count) {
|
||||
final int toSkip = (int) Math.min(blocks.length, count - skipped);
|
||||
in.readBytes(blocks, 0, toSkip);
|
||||
skipped += toSkip;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Read the next value. */
|
||||
public long next() throws IOException {
|
||||
if (ord == valueCount) {
|
||||
throw new EOFException();
|
||||
}
|
||||
if (off == blockSize) {
|
||||
refill();
|
||||
}
|
||||
final long value = values[off++];
|
||||
++ord;
|
||||
return value;
|
||||
}
|
||||
|
||||
/** Read between <tt>1</tt> and <code>count</code> values. */
|
||||
public LongsRef next(int count) throws IOException {
|
||||
assert count > 0;
|
||||
if (ord == valueCount) {
|
||||
throw new EOFException();
|
||||
}
|
||||
if (off == blockSize) {
|
||||
refill();
|
||||
}
|
||||
|
||||
count = Math.min(count, blockSize - off);
|
||||
count = (int) Math.min(count, valueCount - ord);
|
||||
|
||||
valuesRef.offset = off;
|
||||
valuesRef.length = count;
|
||||
off += count;
|
||||
ord += count;
|
||||
return valuesRef;
|
||||
}
|
||||
|
||||
private void refill() throws IOException {
|
||||
final int token = in.readByte() & 0xFF;
|
||||
final boolean minEquals0 = (token & MIN_VALUE_EQUALS_0) != 0;
|
||||
final int bitsPerValue = token >>> BPV_SHIFT;
|
||||
if (bitsPerValue > 64) {
|
||||
throw new IOException("Corrupted");
|
||||
}
|
||||
final long minValue = minEquals0 ? 0L : zigZagDecode(1L + readVLong(in));
|
||||
assert minEquals0 || minValue != 0;
|
||||
|
||||
if (bitsPerValue == 0) {
|
||||
Arrays.fill(values, minValue);
|
||||
} else {
|
||||
final PackedInts.Decoder decoder = PackedInts.getDecoder(PackedInts.Format.PACKED, packedIntsVersion, bitsPerValue);
|
||||
final int iterations = blockSize / decoder.valueCount();
|
||||
final int blocksSize = iterations * 8 * decoder.blockCount();
|
||||
if (blocks == null || blocks.length < blocksSize) {
|
||||
blocks = new byte[blocksSize];
|
||||
}
|
||||
|
||||
final int valueCount = (int) Math.min(this.valueCount - ord, blockSize);
|
||||
final int blocksCount = (int) PackedInts.Format.PACKED.byteCount(packedIntsVersion, valueCount, bitsPerValue);
|
||||
in.readBytes(blocks, 0, blocksCount);
|
||||
|
||||
decoder.decode(blocks, 0, values, 0, iterations);
|
||||
|
||||
if (minValue != 0) {
|
||||
for (int i = 0; i < valueCount; ++i) {
|
||||
values[i] += minValue;
|
||||
if (bitsPerValue != 0) {
|
||||
final int size = (int) Math.min(blockSize, valueCount - (long) i * blockSize);
|
||||
if (direct) {
|
||||
final long pointer = in.getFilePointer();
|
||||
subReaders[i] = PackedInts.getDirectReaderNoHeader(in, PackedInts.Format.PACKED, packedIntsVersion, size, bitsPerValue);
|
||||
in.seek(pointer + PackedInts.Format.PACKED.byteCount(packedIntsVersion, size, bitsPerValue));
|
||||
} else {
|
||||
subReaders[i] = PackedInts.getReaderNoHeader(in, PackedInts.Format.PACKED, packedIntsVersion, size, bitsPerValue);
|
||||
}
|
||||
}
|
||||
}
|
||||
off = 0;
|
||||
this.minValues = minValues;
|
||||
}
|
||||
|
||||
/** Return the offset of the next value to read. */
|
||||
public long ord() {
|
||||
return ord;
|
||||
/** Get value at <code>index</code>. */
|
||||
public long get(long index) {
|
||||
assert index >= 0 && index < valueCount;
|
||||
final int block = (int) (index >>> blockShift);
|
||||
if (subReaders[block] == null) {
|
||||
return minValues == null ? 0 : minValues[block];
|
||||
}
|
||||
final int idx = (int) (index & blockMask);
|
||||
return (minValues == null ? 0 : minValues[block]) + subReaders[block].get(idx);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,241 @@
|
|||
package org.apache.lucene.util.packed;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import static org.apache.lucene.util.packed.BlockPackedWriter.BPV_SHIFT;
|
||||
import static org.apache.lucene.util.packed.BlockPackedWriter.MIN_VALUE_EQUALS_0;
|
||||
import static org.apache.lucene.util.packed.BlockPackedWriter.checkBlockSize;
|
||||
|
||||
import java.io.EOFException;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.util.LongsRef;
|
||||
|
||||
/**
|
||||
* Reader for sequences of longs written with {@link BlockPackedWriter}.
|
||||
* @see BlockPackedWriter
|
||||
* @lucene.internal
|
||||
*/
|
||||
public final class BlockPackedReaderIterator {
|
||||
|
||||
static long zigZagDecode(long n) {
|
||||
return ((n >>> 1) ^ -(n & 1));
|
||||
}
|
||||
|
||||
// same as DataInput.readVLong but supports negative values
|
||||
static long readVLong(DataInput in) throws IOException {
|
||||
byte b = in.readByte();
|
||||
if (b >= 0) return b;
|
||||
long i = b & 0x7FL;
|
||||
b = in.readByte();
|
||||
i |= (b & 0x7FL) << 7;
|
||||
if (b >= 0) return i;
|
||||
b = in.readByte();
|
||||
i |= (b & 0x7FL) << 14;
|
||||
if (b >= 0) return i;
|
||||
b = in.readByte();
|
||||
i |= (b & 0x7FL) << 21;
|
||||
if (b >= 0) return i;
|
||||
b = in.readByte();
|
||||
i |= (b & 0x7FL) << 28;
|
||||
if (b >= 0) return i;
|
||||
b = in.readByte();
|
||||
i |= (b & 0x7FL) << 35;
|
||||
if (b >= 0) return i;
|
||||
b = in.readByte();
|
||||
i |= (b & 0x7FL) << 42;
|
||||
if (b >= 0) return i;
|
||||
b = in.readByte();
|
||||
i |= (b & 0x7FL) << 49;
|
||||
if (b >= 0) return i;
|
||||
b = in.readByte();
|
||||
i |= (b & 0xFFL) << 56;
|
||||
return i;
|
||||
}
|
||||
|
||||
DataInput in;
|
||||
final int packedIntsVersion;
|
||||
long valueCount;
|
||||
final int blockSize;
|
||||
final long[] values;
|
||||
final LongsRef valuesRef;
|
||||
byte[] blocks;
|
||||
int off;
|
||||
long ord;
|
||||
|
||||
/** Sole constructor.
|
||||
* @param blockSize the number of values of a block, must be equal to the
|
||||
* block size of the {@link BlockPackedWriter} which has
|
||||
* been used to write the stream
|
||||
*/
|
||||
public BlockPackedReaderIterator(DataInput in, int packedIntsVersion, int blockSize, long valueCount) {
|
||||
checkBlockSize(blockSize);
|
||||
this.packedIntsVersion = packedIntsVersion;
|
||||
this.blockSize = blockSize;
|
||||
this.values = new long[blockSize];
|
||||
this.valuesRef = new LongsRef(this.values, 0, 0);
|
||||
reset(in, valueCount);
|
||||
}
|
||||
|
||||
/** Reset the current reader to wrap a stream of <code>valueCount</code>
|
||||
* values contained in <code>in</code>. The block size remains unchanged. */
|
||||
public void reset(DataInput in, long valueCount) {
|
||||
this.in = in;
|
||||
assert valueCount >= 0;
|
||||
this.valueCount = valueCount;
|
||||
off = blockSize;
|
||||
ord = 0;
|
||||
}
|
||||
|
||||
/** Skip exactly <code>count</code> values. */
|
||||
public void skip(long count) throws IOException {
|
||||
assert count >= 0;
|
||||
if (ord + count > valueCount || ord + count < 0) {
|
||||
throw new EOFException();
|
||||
}
|
||||
|
||||
// 1. skip buffered values
|
||||
final int skipBuffer = (int) Math.min(count, blockSize - off);
|
||||
off += skipBuffer;
|
||||
ord += skipBuffer;
|
||||
count -= skipBuffer;
|
||||
if (count == 0L) {
|
||||
return;
|
||||
}
|
||||
|
||||
// 2. skip as many blocks as necessary
|
||||
assert off == blockSize;
|
||||
while (count >= blockSize) {
|
||||
final int token = in.readByte() & 0xFF;
|
||||
final int bitsPerValue = token >>> BPV_SHIFT;
|
||||
if (bitsPerValue > 64) {
|
||||
throw new IOException("Corrupted");
|
||||
}
|
||||
if ((token & MIN_VALUE_EQUALS_0) == 0) {
|
||||
readVLong(in);
|
||||
}
|
||||
final long blockBytes = PackedInts.Format.PACKED.byteCount(packedIntsVersion, blockSize, bitsPerValue);
|
||||
skipBytes(blockBytes);
|
||||
ord += blockSize;
|
||||
count -= blockSize;
|
||||
}
|
||||
if (count == 0L) {
|
||||
return;
|
||||
}
|
||||
|
||||
// 3. skip last values
|
||||
assert count < blockSize;
|
||||
refill();
|
||||
ord += count;
|
||||
off += count;
|
||||
}
|
||||
|
||||
private void skipBytes(long count) throws IOException {
|
||||
if (in instanceof IndexInput) {
|
||||
final IndexInput iin = (IndexInput) in;
|
||||
iin.seek(iin.getFilePointer() + count);
|
||||
} else {
|
||||
if (blocks == null) {
|
||||
blocks = new byte[blockSize];
|
||||
}
|
||||
long skipped = 0;
|
||||
while (skipped < count) {
|
||||
final int toSkip = (int) Math.min(blocks.length, count - skipped);
|
||||
in.readBytes(blocks, 0, toSkip);
|
||||
skipped += toSkip;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Read the next value. */
|
||||
public long next() throws IOException {
|
||||
if (ord == valueCount) {
|
||||
throw new EOFException();
|
||||
}
|
||||
if (off == blockSize) {
|
||||
refill();
|
||||
}
|
||||
final long value = values[off++];
|
||||
++ord;
|
||||
return value;
|
||||
}
|
||||
|
||||
/** Read between <tt>1</tt> and <code>count</code> values. */
|
||||
public LongsRef next(int count) throws IOException {
|
||||
assert count > 0;
|
||||
if (ord == valueCount) {
|
||||
throw new EOFException();
|
||||
}
|
||||
if (off == blockSize) {
|
||||
refill();
|
||||
}
|
||||
|
||||
count = Math.min(count, blockSize - off);
|
||||
count = (int) Math.min(count, valueCount - ord);
|
||||
|
||||
valuesRef.offset = off;
|
||||
valuesRef.length = count;
|
||||
off += count;
|
||||
ord += count;
|
||||
return valuesRef;
|
||||
}
|
||||
|
||||
private void refill() throws IOException {
|
||||
final int token = in.readByte() & 0xFF;
|
||||
final boolean minEquals0 = (token & MIN_VALUE_EQUALS_0) != 0;
|
||||
final int bitsPerValue = token >>> BPV_SHIFT;
|
||||
if (bitsPerValue > 64) {
|
||||
throw new IOException("Corrupted");
|
||||
}
|
||||
final long minValue = minEquals0 ? 0L : zigZagDecode(1L + readVLong(in));
|
||||
assert minEquals0 || minValue != 0;
|
||||
|
||||
if (bitsPerValue == 0) {
|
||||
Arrays.fill(values, minValue);
|
||||
} else {
|
||||
final PackedInts.Decoder decoder = PackedInts.getDecoder(PackedInts.Format.PACKED, packedIntsVersion, bitsPerValue);
|
||||
final int iterations = blockSize / decoder.valueCount();
|
||||
final int blocksSize = iterations * 8 * decoder.blockCount();
|
||||
if (blocks == null || blocks.length < blocksSize) {
|
||||
blocks = new byte[blocksSize];
|
||||
}
|
||||
|
||||
final int valueCount = (int) Math.min(this.valueCount - ord, blockSize);
|
||||
final int blocksCount = (int) PackedInts.Format.PACKED.byteCount(packedIntsVersion, valueCount, bitsPerValue);
|
||||
in.readBytes(blocks, 0, blocksCount);
|
||||
|
||||
decoder.decode(blocks, 0, values, 0, iterations);
|
||||
|
||||
if (minValue != 0) {
|
||||
for (int i = 0; i < valueCount; ++i) {
|
||||
values[i] += minValue;
|
||||
}
|
||||
}
|
||||
}
|
||||
off = 0;
|
||||
}
|
||||
|
||||
/** Return the offset of the next value to read. */
|
||||
public long ord() {
|
||||
return ord;
|
||||
}
|
||||
|
||||
}
|
|
@ -30,7 +30,7 @@ import org.apache.lucene.store.DataOutput;
|
|||
* using as few bits as possible. Memory usage of this class is proportional to
|
||||
* the block size. Each block has an overhead between 1 and 10 bytes to store
|
||||
* the minimum value and the number of bits per value of the block.
|
||||
* @see BlockPackedReader
|
||||
* @see BlockPackedReaderIterator
|
||||
* @lucene.internal
|
||||
*/
|
||||
public final class BlockPackedWriter {
|
||||
|
@ -43,8 +43,11 @@ public final class BlockPackedWriter {
|
|||
if (blockSize <= 0 || blockSize > MAX_BLOCK_SIZE) {
|
||||
throw new IllegalArgumentException("blockSize must be > 0 and < " + MAX_BLOCK_SIZE + ", got " + blockSize);
|
||||
}
|
||||
if (blockSize % 64 != 0) {
|
||||
throw new IllegalArgumentException("blockSize must be a multiple of 64, got " + blockSize);
|
||||
if (blockSize < 64) {
|
||||
throw new IllegalArgumentException("blockSize must be >= 64, got " + blockSize);
|
||||
}
|
||||
if ((blockSize & (blockSize - 1)) != 0) {
|
||||
throw new IllegalArgumentException("blockSize must be a power of two, got " + blockSize);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -877,10 +877,11 @@ public class TestPackedInts extends LuceneTestCase {
|
|||
in.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testBlockPackedReaderWriter() throws IOException {
|
||||
final int iters = atLeast(2);
|
||||
for (int iter = 0; iter < iters; ++iter) {
|
||||
final int blockSize = 64 * _TestUtil.nextInt(random(), 1, 1 << 12);
|
||||
final int blockSize = 1 << _TestUtil.nextInt(random(), 6, 18);
|
||||
final int valueCount = random().nextInt(1 << 18);
|
||||
final long[] values = new long[valueCount];
|
||||
long minValue = 0;
|
||||
|
@ -912,30 +913,29 @@ public class TestPackedInts extends LuceneTestCase {
|
|||
final long fp = out.getFilePointer();
|
||||
out.close();
|
||||
|
||||
DataInput in = dir.openInput("out.bin", IOContext.DEFAULT);
|
||||
if (random().nextBoolean()) {
|
||||
byte[] buf = new byte[(int) fp];
|
||||
in.readBytes(buf, 0, (int) fp);
|
||||
((IndexInput) in).close();
|
||||
in = new ByteArrayDataInput(buf);
|
||||
}
|
||||
final BlockPackedReader reader = new BlockPackedReader(in, PackedInts.VERSION_CURRENT, blockSize, valueCount);
|
||||
IndexInput in1 = dir.openInput("out.bin", IOContext.DEFAULT);
|
||||
byte[] buf = new byte[(int) fp];
|
||||
in1.readBytes(buf, 0, (int) fp);
|
||||
in1.seek(0L);
|
||||
ByteArrayDataInput in2 = new ByteArrayDataInput(buf);
|
||||
final DataInput in = random().nextBoolean() ? in1 : in2;
|
||||
final BlockPackedReaderIterator it = new BlockPackedReaderIterator(in, PackedInts.VERSION_CURRENT, blockSize, valueCount);
|
||||
for (int i = 0; i < valueCount; ) {
|
||||
if (random().nextBoolean()) {
|
||||
assertEquals("" + i, values[i], reader.next());
|
||||
assertEquals("" + i, values[i], it.next());
|
||||
++i;
|
||||
} else {
|
||||
final LongsRef nextValues = reader.next(_TestUtil.nextInt(random(), 1, 1024));
|
||||
final LongsRef nextValues = it.next(_TestUtil.nextInt(random(), 1, 1024));
|
||||
for (int j = 0; j < nextValues.length; ++j) {
|
||||
assertEquals("" + (i + j), values[i + j], nextValues.longs[nextValues.offset + j]);
|
||||
}
|
||||
i += nextValues.length;
|
||||
}
|
||||
assertEquals(i, reader.ord());
|
||||
assertEquals(i, it.ord());
|
||||
}
|
||||
assertEquals(fp, in instanceof ByteArrayDataInput ? ((ByteArrayDataInput) in).getPosition() : ((IndexInput) in).getFilePointer());
|
||||
try {
|
||||
reader.next();
|
||||
it.next();
|
||||
assertTrue(false);
|
||||
} catch (IOException e) {
|
||||
// OK
|
||||
|
@ -946,31 +946,35 @@ public class TestPackedInts extends LuceneTestCase {
|
|||
} else {
|
||||
((IndexInput) in).seek(0L);
|
||||
}
|
||||
final BlockPackedReader reader2 = new BlockPackedReader(in, PackedInts.VERSION_CURRENT, blockSize, valueCount);
|
||||
final BlockPackedReaderIterator it2 = new BlockPackedReaderIterator(in, PackedInts.VERSION_CURRENT, blockSize, valueCount);
|
||||
int i = 0;
|
||||
while (true) {
|
||||
final int skip = _TestUtil.nextInt(random(), 0, valueCount - i);
|
||||
reader2.skip(skip);
|
||||
it2.skip(skip);
|
||||
i += skip;
|
||||
assertEquals(i, reader2.ord());
|
||||
assertEquals(i, it2.ord());
|
||||
if (i == valueCount) {
|
||||
break;
|
||||
} else {
|
||||
assertEquals(values[i], reader2.next());
|
||||
assertEquals(values[i], it2.next());
|
||||
++i;
|
||||
}
|
||||
}
|
||||
assertEquals(fp, in instanceof ByteArrayDataInput ? ((ByteArrayDataInput) in).getPosition() : ((IndexInput) in).getFilePointer());
|
||||
try {
|
||||
reader2.skip(1);
|
||||
it2.skip(1);
|
||||
assertTrue(false);
|
||||
} catch (IOException e) {
|
||||
// OK
|
||||
}
|
||||
|
||||
if (in instanceof IndexInput) {
|
||||
((IndexInput) in).close();
|
||||
in1.seek(0L);
|
||||
final BlockPackedReader reader = new BlockPackedReader(in1, PackedInts.VERSION_CURRENT, blockSize, valueCount, random().nextBoolean());
|
||||
for (i = 0; i < valueCount; ++i) {
|
||||
assertEquals("i=" + i, values[i], reader.get(i));
|
||||
}
|
||||
assertEquals(in1.getFilePointer(), in1.length());
|
||||
in1.close();
|
||||
dir.close();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue