Move group-varint encoding/decoding logic to DataOutput/DataInput (#12841)

This commit is contained in:
Zhang Chao 2023-12-23 20:18:34 +08:00 committed by GitHub
parent 9359a9dcff
commit dc9f154aa5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
18 changed files with 451 additions and 194 deletions

View File

@ -120,6 +120,8 @@ Optimizations
* GITHUB#12552: Make FSTPostingsFormat load FSTs off-heap. (Tony X)
* GITHUB#12841: Move group-varint encoding/decoding logic to DataOutput/DataInput. (Adrien Grand, Zhang Chao, Uwe Schindler)
Bug Fixes
---------------------

View File

@ -21,15 +21,18 @@ import java.nio.file.Files;
import java.util.Arrays;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.codecs.lucene99.GroupVIntReader;
import org.apache.lucene.codecs.lucene99.GroupVIntWriter;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.ByteBuffersDataInput;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.GroupVIntUtil;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
@ -86,12 +89,13 @@ public class GroupVIntBenchmark {
final long[] values = new long[maxSize];
IndexInput byteBufferGVIntIn;
IndexInput nioGVIntIn;
IndexInput byteBufferVIntIn;
ByteBuffersDataInput byteBuffersGVIntIn;
ByteArrayDataInput byteArrayVIntIn;
ByteArrayDataInput byteArrayGVIntIn;
// @Param({"16", "32", "64", "128", "248"})
@Param({"64"})
public int size;
@ -99,8 +103,8 @@ public class GroupVIntBenchmark {
byte[] gVIntBytes = new byte[Integer.BYTES * maxSize * 2];
byte[] vIntBytes = new byte[Integer.BYTES * maxSize * 2];
ByteArrayDataOutput vIntOut = new ByteArrayDataOutput(vIntBytes);
GroupVIntWriter w = new GroupVIntWriter();
w.writeValues(new ByteArrayDataOutput(gVIntBytes), docs, docs.length);
ByteArrayDataOutput out = new ByteArrayDataOutput(gVIntBytes);
out.writeGroupVInts(docs, docs.length);
for (long v : docs) {
vIntOut.writeVInt((int) v);
}
@ -108,13 +112,26 @@ public class GroupVIntBenchmark {
byteArrayGVIntIn = new ByteArrayDataInput(gVIntBytes);
}
/**
 * Writes {@code docs} as group-varints to a file named "gvint" in a fresh temporary {@link
 * NIOFSDirectory} and opens it for reading into {@code nioGVIntIn}.
 *
 * @param docs the values to encode; all of them are written
 * @throws Exception if the temporary directory or the file cannot be created, written or opened
 */
void initNioInput(long[] docs) throws Exception {
  // NOTE(review): the directory is never closed here; presumably it has to outlive this method
  // because nioGVIntIn keeps reading from it for the whole trial.
  Directory dir = new NIOFSDirectory(Files.createTempDirectory("groupvintdata"));
  // try-with-resources guarantees the output is closed even if encoding fails, and that it is
  // closed before the file is opened for reading.
  try (IndexOutput out = dir.createOutput("gvint", IOContext.DEFAULT)) {
    out.writeGroupVInts(docs, docs.length);
  }
  nioGVIntIn = dir.openInput("gvint", IOContext.DEFAULT);
}
/**
 * Encodes {@code docs} as group-varints into an in-memory {@link ByteBuffersDataOutput} and
 * stores the resulting {@link ByteBuffersDataInput} in {@code byteBuffersGVIntIn}.
 *
 * @param docs the values to encode; all of them are written
 */
void initByteBuffersInput(long[] docs) throws Exception {
  final ByteBuffersDataOutput encoded = new ByteBuffersDataOutput();
  final int count = docs.length;
  encoded.writeGroupVInts(docs, count);
  byteBuffersGVIntIn = encoded.toDataInput();
}
void initByteBufferInput(long[] docs) throws Exception {
Directory dir = MMapDirectory.open(Files.createTempDirectory("groupvintdata"));
Directory dir = new MMapDirectory(Files.createTempDirectory("groupvintdata"));
IndexOutput vintOut = dir.createOutput("vint", IOContext.DEFAULT);
IndexOutput gvintOut = dir.createOutput("gvint", IOContext.DEFAULT);
GroupVIntWriter w = new GroupVIntWriter();
w.writeValues(gvintOut, docs, docs.length);
gvintOut.writeGroupVInts(docs, docs.length);
for (long v : docs) {
vintOut.writeVInt((int) v);
}
@ -124,6 +141,16 @@ public class GroupVIntBenchmark {
byteBufferVIntIn = dir.openInput("vint", IOContext.DEFAULT);
}
/**
 * Baseline decoder used for comparison: decodes every full group of four values through the
 * generic {@link GroupVIntUtil#readGroupVInt(DataInput, long[], int)} and the remaining tail
 * values as plain vints.
 *
 * @param in the input to decode from
 * @param dst the array receiving the decoded values
 * @param limit the number of values to decode
 */
private void readGroupVIntsBaseline(DataInput in, long[] dst, int limit) throws IOException {
  int upto = 0;
  // last index at which a complete group of four still fits
  final int lastGroupStart = limit - 4;
  while (upto <= lastGroupStart) {
    GroupVIntUtil.readGroupVInt(in, dst, upto);
    upto += 4;
  }
  // fewer than four values left: they were written as individual vints
  while (upto < limit) {
    dst[upto++] = in.readVInt();
  }
}
@Setup(Level.Trial)
public void init() throws Exception {
long[] docs = new long[maxSize];
@ -140,10 +167,12 @@ public class GroupVIntBenchmark {
}
initByteBufferInput(docs);
initArrayInput(docs);
initNioInput(docs);
initByteBuffersInput(docs);
}
@Benchmark
public void byteBufferReadVInt(Blackhole bh) throws IOException {
public void benchMMapDirectoryInputs_readVInt(Blackhole bh) throws IOException {
byteBufferVIntIn.seek(0);
for (int i = 0; i < size; i++) {
values[i] = byteBufferVIntIn.readVInt();
@ -152,14 +181,21 @@ public class GroupVIntBenchmark {
}
@Benchmark
public void byteBufferReadGroupVInt(Blackhole bh) throws IOException {
public void benchMMapDirectoryInputs_readGroupVInt(Blackhole bh) throws IOException {
byteBufferGVIntIn.seek(0);
GroupVIntReader.readValues(byteBufferGVIntIn, values, size);
byteBufferGVIntIn.readGroupVInts(values, size);
bh.consume(values);
}
@Benchmark
public void byteArrayReadVInt(Blackhole bh) {
public void benchMMapDirectoryInputs_readGroupVIntBaseline(Blackhole bh) throws IOException {
byteBufferGVIntIn.seek(0);
this.readGroupVIntsBaseline(byteBufferGVIntIn, values, size);
bh.consume(values);
}
@Benchmark
public void benchByteArrayDataInput_readVInt(Blackhole bh) {
byteArrayVIntIn.rewind();
for (int i = 0; i < size; i++) {
values[i] = byteArrayVIntIn.readVInt();
@ -168,9 +204,37 @@ public class GroupVIntBenchmark {
}
@Benchmark
public void byteArrayReadGroupVInt(Blackhole bh) throws IOException {
public void benchByteArrayDataInput_readGroupVInt(Blackhole bh) throws IOException {
byteArrayGVIntIn.rewind();
GroupVIntReader.readValues(byteArrayGVIntIn, values, size);
byteArrayGVIntIn.readGroupVInts(values, size);
bh.consume(values);
}
/**
* Rewinds the NIOFSDirectory-backed input and decodes {@code size} values through {@code
* readGroupVInts}; the decoded array is handed to the Blackhole.
*/
@Benchmark
public void benchNIOFSDirectoryInputs_readGroupVInt(Blackhole bh) throws IOException {
nioGVIntIn.seek(0);
nioGVIntIn.readGroupVInts(values, size);
bh.consume(values);
}
/**
* Same data as the NIOFSDirectory readGroupVInt benchmark, but decoded through {@code
* readGroupVIntsBaseline}, which uses the generic GroupVIntUtil decoder for each group.
*/
@Benchmark
public void benchNIOFSDirectoryInputs_readGroupVIntBaseline(Blackhole bh) throws IOException {
nioGVIntIn.seek(0);
this.readGroupVIntsBaseline(nioGVIntIn, values, size);
bh.consume(values);
}
/**
* Rewinds the in-memory input and decodes {@code size} values through {@code readGroupVInts}.
* Despite the method name, the input read here ({@code byteBuffersGVIntIn}) is a
* ByteBuffersDataInput.
*/
@Benchmark
public void benchByteBuffersIndexInput_readGroupVInt(Blackhole bh) throws IOException {
byteBuffersGVIntIn.seek(0);
byteBuffersGVIntIn.readGroupVInts(values, size);
bh.consume(values);
}
/**
* Same in-memory data as benchByteBuffersIndexInput_readGroupVInt, but decoded through {@code
* readGroupVIntsBaseline}, which uses the generic GroupVIntUtil decoder for each group.
*/
@Benchmark
public void benchByteBuffersIndexInput_readGroupVIntBaseline(Blackhole bh) throws IOException {
byteBuffersGVIntIn.seek(0);
this.readGroupVIntsBaseline(byteBuffersGVIntIn, values, size);
bh.consume(values);
}
}

View File

@ -1,57 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
import java.io.IOException;
import org.apache.lucene.store.DataInput;
/** Decodes integers that were written with the group-varint encoding. */
public class GroupVIntReader {

  /**
   * Reads {@code limit} values into {@code docs}. Complete groups of four values are decoded from
   * one flag byte followed by the four payloads; the remaining tail values are read as vints.
   *
   * @param in the input to decode from
   * @param docs the array receiving the decoded values
   * @param limit the number of values to decode
   */
  public static void readValues(DataInput in, long[] docs, int limit) throws IOException {
    int upto = 0;
    while (upto <= limit - 4) {
      final int flag = in.readByte() & 0xFF;
      // The flag holds four 2-bit "byte count minus one" fields, first value in the top bits.
      for (int k = 0, shift = 6; k < 4; k++, shift -= 2) {
        docs[upto + k] = readLong(in, (flag >> shift) & 0x03);
      }
      upto += 4;
    }
    while (upto < limit) {
      docs[upto++] = in.readVInt();
    }
  }

  /** Reads one value stored on {@code numBytesMinus1 + 1} bytes as an unsigned number. */
  private static long readLong(DataInput in, int numBytesMinus1) throws IOException {
    if (numBytesMinus1 == 0) {
      return in.readByte() & 0xFFL;
    }
    if (numBytesMinus1 == 1) {
      return in.readShort() & 0xFFFFL;
    }
    if (numBytesMinus1 == 2) {
      // three bytes: a short for the low 16 bits, then one byte for bits 16..23
      final long low = in.readShort() & 0xFFFFL;
      final long high = in.readByte() & 0xFFL;
      return low | (high << 16);
    }
    return in.readInt() & 0xFFFFFFFFL;
  }
}

View File

@ -1,63 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
import java.io.IOException;
import org.apache.lucene.store.DataOutput;
/**
 * Writes integers with the group-varint encoding. Values are written in groups of four; values
 * left over at the end that do not fill a group are written as VInts.
 */
public class GroupVIntWriter {

  // the maximum size of one group is 4 integers + 1 byte flag.
  private final byte[] bytes = new byte[17];
  private int byteOffset = 0;

  public GroupVIntWriter() {}

  /**
   * Appends {@code v} to the scratch buffer, lowest byte first, using as few bytes as possible
   * (at least one).
   *
   * @return the number of bytes appended
   */
  private int encodeValue(int v) {
    final int start = byteOffset;
    bytes[byteOffset++] = (byte) v;
    for (int rest = v >>> 8; rest != 0; rest >>>= 8) {
      bytes[byteOffset++] = (byte) rest;
    }
    return byteOffset - start;
  }

  /**
   * Writes the first {@code limit} entries of {@code values} to {@code out}.
   *
   * @param out the output to write to
   * @param values the values to write; each one is narrowed to an int
   * @param limit the number of values to write
   */
  public void writeValues(DataOutput out, long[] values, int limit) throws IOException {
    int next = 0;
    // encode each group
    while (limit - next >= 4) {
      // slot 0 is reserved for the flag byte, payload starts at 1
      byteOffset = 1;
      int flag = 0;
      for (int shift = 6; shift >= 0; shift -= 2) {
        flag |= (encodeValue((int) values[next++]) - 1) << shift;
      }
      bytes[0] = (byte) flag;
      out.writeBytes(bytes, byteOffset);
    }
    // tail vints
    while (next < limit) {
      out.writeVInt((int) values[next++]);
    }
  }
}

View File

@ -149,7 +149,7 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
boolean indexHasFreq,
boolean decodeFreq)
throws IOException {
GroupVIntReader.readValues(docIn, docBuffer, num);
docIn.readGroupVInts(docBuffer, num);
if (indexHasFreq && decodeFreq) {
for (int i = 0; i < num; ++i) {
freqBuffer[i] = docBuffer[i] & 0x01;

View File

@ -92,7 +92,6 @@ public final class Lucene99PostingsWriter extends PushPostingsWriterBase {
private final PForUtil pforUtil;
private final ForDeltaUtil forDeltaUtil;
private final Lucene99SkipWriter skipWriter;
private final GroupVIntWriter docGroupVIntWriter;
private boolean fieldHasNorms;
private NumericDocValues norms;
@ -173,7 +172,6 @@ public final class Lucene99PostingsWriter extends PushPostingsWriterBase {
skipWriter =
new Lucene99SkipWriter(
MAX_SKIP_LEVELS, BLOCK_SIZE, state.segmentInfo.maxDoc(), docOut, posOut, payOut);
docGroupVIntWriter = new GroupVIntWriter();
}
@Override
@ -378,7 +376,7 @@ public final class Lucene99PostingsWriter extends PushPostingsWriterBase {
docDeltaBuffer[i] = (docDeltaBuffer[i] << 1) | (freqBuffer[i] == 1 ? 1 : 0);
}
}
docGroupVIntWriter.writeValues(docOut, docDeltaBuffer, docBufferUpto);
docOut.writeGroupVInts(docDeltaBuffer, docBufferUpto);
if (writeFreqs) {
for (int i = 0; i < docBufferUpto; i++) {
final int freq = (int) freqBuffer[i];

View File

@ -20,6 +20,7 @@ import java.io.EOFException;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import org.apache.lucene.util.GroupVIntUtil;
/** Base implementation class for buffered {@link IndexInput}. */
public abstract class BufferedIndexInput extends IndexInput implements RandomAccessInput {
@ -149,6 +150,16 @@ public abstract class BufferedIndexInput extends IndexInput implements RandomAcc
}
}
/**
 * Decodes one group of four values into {@code dst}, reading directly from the internal buffer
 * when the decoder reports that it did so.
 */
@Override
protected void readGroupVInt(long[] dst, int offset) throws IOException {
  final int available = buffer.remaining();
  final int start = buffer.position();
  final int consumed =
      GroupVIntUtil.readGroupVInt(
          this, available, p -> buffer.getInt((int) p), start, dst, offset);
  if (consumed <= 0) {
    // The decoder went through the regular read methods, which already moved the position.
    return;
  }
  // The flag byte was consumed via readByte(); skip the payload that was read with absolute gets.
  buffer.position(buffer.position() + consumed);
}
@Override
public final long readLong() throws IOException {
if (Long.BYTES <= buffer.remaining()) {

View File

@ -29,6 +29,7 @@ import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.GroupVIntUtil;
import org.apache.lucene.util.RamUsageEstimator;
/**
@ -212,6 +213,25 @@ public final class ByteBuffersDataInput extends DataInput
}
}
/**
* Decodes one group of four values into {@code dst} starting at {@code offset}, using the block
* that contains the current position.
*/
@Override
protected void readGroupVInt(long[] dst, int offset) throws IOException {
final ByteBuffer block = blocks[blockIndex(pos)];
final int blockOffset = blockOffset(pos);
// The return value MUST be stored in a local first; do not write `pos += readGroupVInt(...)`.
// A compound assignment reads the current value of `pos` before evaluating the right-hand
// side, but the call itself advances `pos` (via readByte()). `pos +=` would then add the
// returned length to the stale value and lose the byte(s) consumed inside the call.
// block.limit() - blockOffset is the number of bytes left in this block; the decoder uses it
// to decide whether it may read the group directly from the block.
final int len =
GroupVIntUtil.readGroupVInt(
this,
block.limit() - blockOffset,
p -> block.getInt((int) p),
blockOffset,
dst,
offset);
// len excludes the flag byte, which readByte() already accounted for inside the call.
pos += len;
}
@Override
public long length() {
return length;

View File

@ -205,6 +205,12 @@ public final class ByteBuffersIndexInput extends IndexInput implements RandomAcc
in.readLongs(dst, offset, length);
}
/**
* Decodes one group of four values by delegating to the wrapped input, after checking that this
* input is still open.
*/
@Override
protected void readGroupVInt(long[] dst, int offset) throws IOException {
ensureOpen();
in.readGroupVInt(dst, offset);
}
@Override
public IndexInput clone() {
ensureOpen();

View File

@ -27,6 +27,7 @@ import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.GroupVIntUtil;
/**
* Abstract base class for performing read operations of Lucene's low-level data types.
@ -98,6 +99,32 @@ public abstract class DataInput implements Cloneable {
return ((b4 & 0xFF) << 24) | ((b3 & 0xFF) << 16) | ((b2 & 0xFF) << 8) | (b1 & 0xFF);
}
/**
 * Reads {@code limit} values that were encoded as group-varints, including the trailing values
 * that were written as plain vints. The destination is a long[] because that is what postings
 * use; every decoded value is expected to fit in an int.
 *
 * @param dst the array to read ints into.
 * @param limit the number of int values to read.
 * @lucene.experimental
 */
public final void readGroupVInts(long[] dst, int limit) throws IOException {
  int upto = 0;
  // last index at which a complete group of four still fits
  final int lastGroupStart = limit - 4;
  while (upto <= lastGroupStart) {
    readGroupVInt(dst, upto);
    upto += 4;
  }
  // tail vints
  while (upto < limit) {
    dst[upto++] = readVInt();
  }
}
/**
* Reads a single group of four values into {@code dst}, starting at index {@code offset}. This
* default implementation delegates to the generic decoder in {@link GroupVIntUtil}.
*
* <p>Override if you have an efficient implementation. In general this is when the input
* supports random access.
*
* @param dst the array to read ints into.
* @param offset the index in {@code dst} at which the first of the four values is stored.
*/
protected void readGroupVInt(long[] dst, int offset) throws IOException {
GroupVIntUtil.readGroupVInt(this, dst, offset);
}
/**
* Reads an int stored in variable-length format. Reads between one and five bytes. Smaller values
* take fewer bytes. Negative numbers are supported, but should be avoided.

View File

@ -21,6 +21,7 @@ import java.util.Map;
import java.util.Set;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
/**
* Abstract base class for performing write operations of Lucene's low-level data types.
@ -29,6 +30,7 @@ import org.apache.lucene.util.BytesRef;
* internal state like file position).
*/
public abstract class DataOutput {
private final BytesRefBuilder groupVIntBytes = new BytesRefBuilder();
/**
* Writes a single byte.
@ -322,4 +324,43 @@ public abstract class DataOutput {
writeString(value);
}
}
/**
 * Writes integers with the group-varint encoding. Values are written in groups of four, each
 * group preceded by a flag byte; values left over at the end that do not fill a group are
 * written with {@link DataOutput#writeVInt VInt}. The source is a long[] because that is what
 * postings use; every value must fit in an int.
 *
 * @param values the values to write
 * @param limit the number of values to write.
 * @lucene.experimental
 */
public void writeGroupVInts(long[] values, int limit) throws IOException {
  int next = 0;
  // encode each group
  while (limit - next >= 4) {
    // slot 0 is reserved for the flag byte, payload is appended after it
    groupVIntBytes.setLength(1);
    int flag = 0;
    // four 2-bit "byte count minus one" fields, first value in the top bits
    for (int shift = 6; shift >= 0; shift -= 2) {
      flag |= (encodeGroupValue(Math.toIntExact(values[next++])) - 1) << shift;
    }
    groupVIntBytes.setByteAt(0, (byte) flag);
    writeBytes(groupVIntBytes.bytes(), groupVIntBytes.length());
  }
  // tail vints
  while (next < limit) {
    writeVInt(Math.toIntExact(values[next]));
    next++;
  }
}
/**
 * Appends {@code v} to the group scratch buffer, lowest byte first, using as few bytes as
 * possible (at least one).
 *
 * @return the number of bytes appended
 */
private int encodeGroupValue(int v) {
  final int start = groupVIntBytes.length();
  groupVIntBytes.append((byte) v);
  for (int rest = v >>> 8; rest != 0; rest >>>= 8) {
    groupVIntBytes.append((byte) rest);
  }
  return groupVIntBytes.length() - start;
}
}

View File

@ -0,0 +1,114 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util;
import java.io.IOException;
import org.apache.lucene.store.DataInput;
/**
 * This class contains utility methods and constants for group varint
 *
 * @lucene.internal
 */
public final class GroupVIntUtil {
  /** The maximum length in bytes of a single group-varint: 4 integers + 1 byte flag. */
  public static final int MAX_LENGTH_PER_GROUP = 17;

  // Masks are longs on purpose. With int masks, "int & 0xFFFFFFFF" stays an int and a 4-byte
  // value with its high bit set would be sign-extended when stored into the long[] destination,
  // whereas readLongInGroup zero-extends it. Using long masks promotes the int to long before
  // masking, so both code paths decode the same bytes to the same value.
  private static final long[] MASKS = new long[] {0xFFL, 0xFFFFL, 0xFFFFFFL, 0xFFFFFFFFL};

  /**
   * Default implementation for reading a single group. For optimal performance, you should use
   * {@link DataInput#readGroupVInts(long[], int)} instead.
   *
   * @param in the input to read from.
   * @param dst the array to read ints into.
   * @param offset the offset in the array to start storing ints.
   */
  public static void readGroupVInt(DataInput in, long[] dst, int offset) throws IOException {
    final int flag = in.readByte() & 0xFF;

    // The flag holds four 2-bit "byte count minus one" fields, first value in the top bits.
    final int n1Minus1 = flag >> 6;
    final int n2Minus1 = (flag >> 4) & 0x03;
    final int n3Minus1 = (flag >> 2) & 0x03;
    final int n4Minus1 = flag & 0x03;

    dst[offset] = readLongInGroup(in, n1Minus1);
    dst[offset + 1] = readLongInGroup(in, n2Minus1);
    dst[offset + 2] = readLongInGroup(in, n3Minus1);
    dst[offset + 3] = readLongInGroup(in, n4Minus1);
  }

  /** Reads one value stored on {@code numBytesMinus1 + 1} bytes as an unsigned number. */
  private static long readLongInGroup(DataInput in, int numBytesMinus1) throws IOException {
    switch (numBytesMinus1) {
      case 0:
        return in.readByte() & 0xFFL;
      case 1:
        return in.readShort() & 0xFFFFL;
      case 2:
        return (in.readShort() & 0xFFFFL) | ((in.readByte() & 0xFFL) << 16);
      default:
        return in.readInt() & 0xFFFFFFFFL;
    }
  }

  /**
   * Provides an abstraction for reading int values at a given position, so that the decoding
   * logic can be reused by different DataInput implementations.
   */
  @FunctionalInterface
  public static interface IntReader {
    int read(long v);
  }

  /**
   * Faster implementation for reading a single group. When at least {@link
   * #MAX_LENGTH_PER_GROUP} bytes remain, the four values are read through {@code reader} at
   * explicit positions, which is safe because a group can never extend past that many bytes.
   * Otherwise it falls back to {@link #readGroupVInt(DataInput, long[], int)}.
   *
   * @param in the input to use to read data.
   * @param remaining the number of remaining bytes allowed to read for current block/segment.
   * @param reader the supplier of read int.
   * @param pos the start pos to read from the reader.
   * @param dst the array to read ints into.
   * @param offset the offset in the array to start storing ints.
   * @return the number of bytes read excluding the flag. This is the amount by which the caller
   *     must advance its position; it is 0 (when the fallback was used and the input advanced
   *     itself) or a positive number less than {@link #MAX_LENGTH_PER_GROUP}.
   */
  public static int readGroupVInt(
      DataInput in, long remaining, IntReader reader, long pos, long[] dst, int offset)
      throws IOException {
    if (remaining < MAX_LENGTH_PER_GROUP) {
      readGroupVInt(in, dst, offset);
      return 0;
    }
    final int flag = in.readByte() & 0xFF;
    final long posStart = ++pos; // exclude the flag bytes, the position has updated via readByte().
    final int n1Minus1 = flag >> 6;
    final int n2Minus1 = (flag >> 4) & 0x03;
    final int n3Minus1 = (flag >> 2) & 0x03;
    final int n4Minus1 = flag & 0x03;

    // This code path has fewer conditionals and tends to be significantly faster in benchmarks.
    // Each read fetches a full int; the mask keeps only the bytes that belong to the value.
    dst[offset] = reader.read(pos) & MASKS[n1Minus1];
    pos += 1 + n1Minus1;
    dst[offset + 1] = reader.read(pos) & MASKS[n2Minus1];
    pos += 1 + n2Minus1;
    dst[offset + 2] = reader.read(pos) & MASKS[n3Minus1];
    pos += 1 + n3Minus1;
    dst[offset + 3] = reader.read(pos) & MASKS[n4Minus1];
    pos += 1 + n4Minus1;
    return (int) (pos - posStart);
  }
}

View File

@ -25,6 +25,7 @@ import java.nio.ByteOrder;
import java.util.Arrays;
import java.util.Objects;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.GroupVIntUtil;
/**
* Base IndexInput implementation that uses an array of MemorySegments to represent a file.
@ -305,6 +306,23 @@ abstract class MemorySegmentIndexInput extends IndexInput implements RandomAcces
}
}
/**
* Decodes one group of four values into {@code dst} starting at {@code offset}, reading from
* the current segment.
*/
@Override
protected void readGroupVInt(long[] dst, int offset) throws IOException {
try {
// curSegment.byteSize() - curPosition is the number of bytes left in the current segment;
// the decoder uses it to decide whether it may read the group directly via the lambda.
final int len =
GroupVIntUtil.readGroupVInt(
this,
curSegment.byteSize() - curPosition,
p -> curSegment.get(LAYOUT_LE_INT, p),
curPosition,
dst,
offset);
// len excludes the flag byte, which the decoder consumes through readByte().
curPosition += len;
} catch (NullPointerException | IllegalStateException e) {
// NOTE(review): presumably these are raised when the input has been closed - confirm.
throw alreadyClosed(e);
}
}
@Override
public void readBytes(long pos, byte[] b, int offset, int len) throws IOException {
try {

View File

@ -25,6 +25,7 @@ import java.nio.ByteOrder;
import java.util.Arrays;
import java.util.Objects;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.GroupVIntUtil;
/**
* Base IndexInput implementation that uses an array of MemorySegments to represent a file.
@ -303,6 +304,23 @@ abstract class MemorySegmentIndexInput extends IndexInput implements RandomAcces
}
}
/**
* Decodes one group of four values into {@code dst} starting at {@code offset}, reading from
* the current segment.
*/
@Override
protected void readGroupVInt(long[] dst, int offset) throws IOException {
try {
// curSegment.byteSize() - curPosition is the number of bytes left in the current segment;
// the decoder uses it to decide whether it may read the group directly via the lambda.
final int len =
GroupVIntUtil.readGroupVInt(
this,
curSegment.byteSize() - curPosition,
p -> curSegment.get(LAYOUT_LE_INT, p),
curPosition,
dst,
offset);
// len excludes the flag byte, which the decoder consumes through readByte().
curPosition += len;
} catch (NullPointerException | IllegalStateException e) {
// NOTE(review): presumably these are raised when the input has been closed - confirm.
throw alreadyClosed(e);
}
}
@Override
public void readBytes(long pos, byte[] b, int offset, int len) throws IOException {
try {

View File

@ -25,6 +25,7 @@ import java.nio.ByteOrder;
import java.util.Arrays;
import java.util.Objects;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.GroupVIntUtil;
/**
* Base IndexInput implementation that uses an array of MemorySegments to represent a file.
@ -303,6 +304,23 @@ abstract class MemorySegmentIndexInput extends IndexInput implements RandomAcces
}
}
/**
* Decodes one group of four values into {@code dst} starting at {@code offset}, reading from
* the current segment.
*/
@Override
protected void readGroupVInt(long[] dst, int offset) throws IOException {
try {
// curSegment.byteSize() - curPosition is the number of bytes left in the current segment;
// the decoder uses it to decide whether it may read the group directly via the lambda.
final int len =
GroupVIntUtil.readGroupVInt(
this,
curSegment.byteSize() - curPosition,
p -> curSegment.get(LAYOUT_LE_INT, p),
curPosition,
dst,
offset);
// len excludes the flag byte, which the decoder consumes through readByte().
curPosition += len;
} catch (NullPointerException | IllegalStateException e) {
// NOTE(review): presumably these are raised when the input has been closed - confirm.
throw alreadyClosed(e);
}
}
@Override
public void readBytes(long pos, byte[] b, int offset, int len) throws IOException {
try {

View File

@ -1,55 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import java.io.IOException;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.packed.PackedInts;
/** Round-trip test for GroupVIntWriter and GroupVIntReader over an in-memory byte array. */
public class TestGroupVInt extends LuceneTestCase {
/**
* Encodes a random number of random values with a random bit width per iteration, decodes
* them again and checks that the decoded prefix equals the input.
*/
public void testEncodeDecode() throws IOException {
long[] values = new long[ForUtil.BLOCK_SIZE];
long[] restored = new long[ForUtil.BLOCK_SIZE];
final int iterations = atLeast(100);
final GroupVIntWriter w = new GroupVIntWriter();
// Scratch buffer sized at 1.25 bytes-per-int-byte: 4 payload bytes per value plus one flag
// byte per group of four values.
byte[] encoded = new byte[(int) (Integer.BYTES * ForUtil.BLOCK_SIZE * 1.25)];
for (int i = 0; i < iterations; i++) {
final int bpv = TestUtil.nextInt(random(), 1, 31);
final int numValues = TestUtil.nextInt(random(), 1, ForUtil.BLOCK_SIZE);
// encode
for (int j = 0; j < numValues; j++) {
values[j] = RandomNumbers.randomIntBetween(random(), 0, (int) PackedInts.maxValue(bpv));
}
w.writeValues(new ByteArrayDataOutput(encoded), values, numValues);
// decode
GroupVIntReader.readValues(new ByteArrayDataInput(encoded), restored, numValues);
// only the first numValues entries are meaningful; the rest is left over from earlier runs
assertArrayEquals(
ArrayUtil.copyOfSubArray(values, 0, numValues),
ArrayUtil.copyOfSubArray(restored, 0, numValues));
}
}
}

View File

@ -50,10 +50,19 @@ public abstract class BaseChunkedDirectoryTestCase extends BaseDirectoryTestCase
/** Creates a new directory with the specified max chunk size */
protected abstract Directory getDirectory(Path path, int maxChunkSize) throws IOException;
/**
* Runs the group-varint round-trip check on a directory created with a small, randomly chosen
* max chunk size, with up to 1024 values per iteration.
*/
public void testGroupVIntMultiBlocks() throws IOException {
final int maxChunkSize = random().nextInt(64, 512);
try (Directory dir = getDirectory(createTempDir(), maxChunkSize)) {
// 10 iterations, 1 to 31 bits per value, at most 1024 values per iteration
doTestGroupVInt(dir, 10, 1, 31, 1024);
}
}
public void testCloneClose() throws Exception {
Directory dir = getDirectory(createTempDir("testCloneClose"));
IndexOutput io = dir.createOutput("bytes", newIOContext(random()));
final long[] values = new long[] {0, 7, 11, 9};
io.writeVInt(5);
io.writeGroupVInts(values, values.length);
io.close();
IndexInput one = dir.openInput("bytes", IOContext.DEFAULT);
IndexInput two = one.clone();
@ -65,6 +74,11 @@ public abstract class BaseChunkedDirectoryTestCase extends BaseDirectoryTestCase
() -> {
two.readVInt();
});
expectThrows(
AlreadyClosedException.class,
() -> {
two.readGroupVInts(values, values.length);
});
assertEquals(5, three.readVInt());
one.close();
three.close();
@ -74,11 +88,13 @@ public abstract class BaseChunkedDirectoryTestCase extends BaseDirectoryTestCase
public void testCloneSliceClose() throws Exception {
Directory dir = getDirectory(createTempDir("testCloneSliceClose"));
IndexOutput io = dir.createOutput("bytes", newIOContext(random()));
final long[] values = new long[] {0, 7, 11, 9};
io.writeInt(1);
io.writeInt(2);
io.writeGroupVInts(values, values.length); // will write 5 bytes
io.close();
IndexInput slicer = dir.openInput("bytes", newIOContext(random()));
IndexInput one = slicer.slice("first int", 0, 4);
IndexInput one = slicer.slice("first int", 0, 4 + 5);
IndexInput two = slicer.slice("second int", 4, 4);
one.close();
expectThrows(
@ -86,6 +102,11 @@ public abstract class BaseChunkedDirectoryTestCase extends BaseDirectoryTestCase
() -> {
one.readInt();
});
expectThrows(
AlreadyClosedException.class,
() -> {
one.readGroupVInts(values, values.length);
});
assertEquals(2, two.readInt());
// reopen a new slice "another":
IndexInput another = slicer.slice("first int", 0, 4);

View File

@ -18,6 +18,7 @@ package org.apache.lucene.tests.store;
import com.carrotsearch.randomizedtesting.RandomizedTest;
import com.carrotsearch.randomizedtesting.generators.RandomBytes;
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import java.io.EOFException;
import java.io.FileNotFoundException;
@ -58,6 +59,7 @@ import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;
import org.junit.Assert;
/** Base class for {@link Directory} implementations. */
@ -1438,4 +1440,76 @@ public abstract class BaseDirectoryTestCase extends LuceneTestCase {
assertArrayEquals(expected, actual);
}
}
/**
 * Writes a byte, a short, an int, one group of four group-varints and a long, then reads them
 * back in the same order and checks every value. The four group values need 1, 2, 3 and 4 bytes
 * respectively.
 */
public void testDataTypes() throws IOException {
  final long[] values = new long[] {43, 12345, 123456, 1234567890};
  try (Directory dir = getDirectory(createTempDir("testDataTypes"))) {
    // try-with-resources so the output is closed even if a write fails
    try (IndexOutput out = dir.createOutput("test", IOContext.DEFAULT)) {
      out.writeByte((byte) 43);
      out.writeShort((short) 12345);
      out.writeInt(1234567890);
      out.writeGroupVInts(values, 4);
      out.writeLong(1234567890123456789L);
    }

    long[] restored = new long[4];
    // try-with-resources so the input is closed even if an assertion fails
    try (IndexInput in = dir.openInput("test", IOContext.DEFAULT)) {
      assertEquals(43, in.readByte());
      assertEquals(12345, in.readShort());
      assertEquals(1234567890, in.readInt());
      in.readGroupVInts(restored, 4);
      assertArrayEquals(values, restored);
      assertEquals(1234567890123456789L, in.readLong());
    }
  }
}
/** Round-trips random values through writeGroupVInts/readGroupVInts on a single directory. */
public void testGroupVInt() throws IOException {
try (Directory dir = getDirectory(createTempDir("testGroupVInt"))) {
// test the fallback to the default implementation of readGroupVInt
doTestGroupVInt(dir, 5, 1, 6, 8);
// use more iterations to cover all bpv values
doTestGroupVInt(dir, atLeast(100), 1, 31, 128);
// BaseChunkedDirectoryTestCase#testGroupVIntMultiBlocks covers reads that span multiple
// blocks for ByteBuffersDataInput and MMapDirectory
}
}
/**
 * Writes the same random values to two files in {@code dir}, once as group-varints and once as
 * plain vints, then reads both back and checks that they decode to the same values and that both
 * files were consumed completely. The two files are deleted at the end.
 *
 * @param dir the directory to create the temporary files in
 * @param iterations the number of independent batches to write and read
 * @param minBpv the minimum number of bits per value for a batch (inclusive)
 * @param maxBpv the maximum number of bits per value for a batch (inclusive)
 * @param maxNumValues the maximum number of values in a batch
 */
protected void doTestGroupVInt(
    Directory dir, int iterations, int minBpv, int maxBpv, int maxNumValues) throws IOException {
  long[] values = new long[maxNumValues];
  int[] numValuesArray = new int[iterations];

  // encode; try-with-resources so both outputs are closed even if a write fails
  try (IndexOutput groupVIntOut = dir.createOutput("group-varint", IOContext.DEFAULT);
      IndexOutput vIntOut = dir.createOutput("vint", IOContext.DEFAULT)) {
    for (int iter = 0; iter < iterations; iter++) {
      final int bpv = TestUtil.nextInt(random(), minBpv, maxBpv);
      numValuesArray[iter] = TestUtil.nextInt(random(), 1, maxNumValues);
      for (int j = 0; j < numValuesArray[iter]; j++) {
        values[j] = RandomNumbers.randomIntBetween(random(), 0, (int) PackedInts.maxValue(bpv));
        vIntOut.writeVInt((int) values[j]);
      }
      groupVIntOut.writeGroupVInts(values, numValuesArray[iter]);
    }
  }

  // decode; try-with-resources so both inputs are closed even if an assertion fails
  try (IndexInput groupVIntIn = dir.openInput("group-varint", IOContext.DEFAULT);
      IndexInput vIntIn = dir.openInput("vint", IOContext.DEFAULT)) {
    for (int iter = 0; iter < iterations; iter++) {
      groupVIntIn.readGroupVInts(values, numValuesArray[iter]);
      for (int j = 0; j < numValuesArray[iter]; j++) {
        assertEquals(vIntIn.readVInt(), values[j]);
      }
    }
    // The decoder must have consumed exactly what was written; this catches a position that
    // was advanced too little or too much after the last group.
    assertEquals(groupVIntIn.length(), groupVIntIn.getFilePointer());
    assertEquals(vIntIn.length(), vIntIn.getFilePointer());
  }

  dir.deleteFile("group-varint");
  dir.deleteFile("vint");
}
}