mirror of https://github.com/apache/lucene.git
Use group-varint encoding for the tail of postings (#12782)
Co-authored-by: Adrien Grand <jpountz@gmail.com>
This commit is contained in:
parent
194a500323
commit
d0f63ec530
|
@ -285,12 +285,14 @@ Optimizations
|
|||
|
||||
* GITHUB#12381: Skip docs with DocValues in NumericLeafComparator. (Lu Xugang, Adrien Grand)
|
||||
|
||||
* GITHUB#12748: Specialize arc store for continuous label in FST. (Guo Feng, Chao Zhang)
|
||||
* GITHUB#12748: Specialize arc store for continuous label in FST. (Guo Feng, Zhang Chao)
|
||||
|
||||
* GITHUB#12784: Cache buckets to speed up BytesRefHash#sort. (Guo Feng)
|
||||
|
||||
* GITHUB#12806: Utilize exact kNN search when gathering k >= numVectors in a segment (Ben Trent)
|
||||
|
||||
* GITHUB#12782: Use group-varint encoding for the tail of postings. (Adrien Grand, Zhang Chao)
|
||||
|
||||
Changes in runtime behavior
|
||||
---------------------
|
||||
|
||||
|
@ -402,7 +404,7 @@ Improvements
|
|||
Optimizations
|
||||
---------------------
|
||||
|
||||
* GITHUB#12377: Avoid redundant loop for compute min value in DirectMonotonicWriter. (Chao Zhang)
|
||||
* GITHUB#12377: Avoid redundant loop for compute min value in DirectMonotonicWriter. (Zhang Chao)
|
||||
|
||||
* GITHUB#12361: Faster top-level disjunctions sorted by descending score.
|
||||
(Adrien Grand)
|
||||
|
@ -417,7 +419,7 @@ Optimizations
|
|||
|
||||
* GITHUB#12385: Restore parallel knn query rewrite across segments rather than slices (Luca Cavanna)
|
||||
|
||||
* GITHUB#12381: Speed up NumericDocValuesWriter with index sorting. (Chao Zhang)
|
||||
* GITHUB#12381: Speed up NumericDocValuesWriter with index sorting. (Zhang Chao)
|
||||
|
||||
* GITHUB#12453: Faster bulk numeric reads from BufferedIndexInput (Armin Braun)
|
||||
|
||||
|
@ -484,7 +486,7 @@ Other
|
|||
* GITHUB#12428: Replace consecutive close() calls and close() calls with null checks with IOUtils.close().
|
||||
(Shubham Chaudhary)
|
||||
|
||||
* GITHUB#12512: Remove unused variable in BKDWriter. (Chao Zhang)
|
||||
* GITHUB#12512: Remove unused variable in BKDWriter. (Zhang Chao)
|
||||
|
||||
======================== Lucene 9.7.0 =======================
|
||||
|
||||
|
|
|
@ -0,0 +1,147 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.benchmark.jmh;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.util.concurrent.ThreadLocalRandom;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import org.apache.lucene.codecs.lucene99.GroupVIntReader;
|
||||
import org.apache.lucene.codecs.lucene99.GroupVIntWriter;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.store.MMapDirectory;
|
||||
import org.openjdk.jmh.annotations.Benchmark;
|
||||
import org.openjdk.jmh.annotations.BenchmarkMode;
|
||||
import org.openjdk.jmh.annotations.Fork;
|
||||
import org.openjdk.jmh.annotations.Level;
|
||||
import org.openjdk.jmh.annotations.Measurement;
|
||||
import org.openjdk.jmh.annotations.Mode;
|
||||
import org.openjdk.jmh.annotations.OutputTimeUnit;
|
||||
import org.openjdk.jmh.annotations.Param;
|
||||
import org.openjdk.jmh.annotations.Scope;
|
||||
import org.openjdk.jmh.annotations.Setup;
|
||||
import org.openjdk.jmh.annotations.State;
|
||||
import org.openjdk.jmh.annotations.Warmup;
|
||||
import org.openjdk.jmh.infra.Blackhole;
|
||||
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
@OutputTimeUnit(TimeUnit.MICROSECONDS)
|
||||
@State(Scope.Benchmark)
|
||||
@Warmup(iterations = 3, time = 3)
|
||||
@Measurement(iterations = 5, time = 5)
|
||||
@Fork(
|
||||
value = 1,
|
||||
jvmArgsPrepend = {"--add-modules=jdk.unsupported"})
|
||||
public class GroupVIntBenchmark {
|
||||
|
||||
final int maxSize = 256;
|
||||
final long[] values = new long[maxSize];
|
||||
|
||||
IndexInput byteBufferGVIntIn;
|
||||
IndexInput byteBufferVIntIn;
|
||||
|
||||
ByteArrayDataInput byteArrayVIntIn;
|
||||
ByteArrayDataInput byteArrayGVIntIn;
|
||||
|
||||
// @Param({"16", "32", "64", "128", "248"})
|
||||
@Param({"64"})
|
||||
public int size;
|
||||
|
||||
@Param({"1", "2", "3", "4"})
|
||||
public int numBytesPerInt;
|
||||
|
||||
private final int[] maxValues = new int[] {0, 1 << 4, 1 << 12, 1 << 18, 1 << 25};
|
||||
|
||||
void initArrayInput(long[] docs) throws Exception {
|
||||
byte[] gVIntBytes = new byte[Integer.BYTES * maxSize * 2];
|
||||
byte[] vIntBytes = new byte[Integer.BYTES * maxSize * 2];
|
||||
ByteArrayDataOutput vIntOut = new ByteArrayDataOutput(vIntBytes);
|
||||
GroupVIntWriter w = new GroupVIntWriter();
|
||||
w.writeValues(new ByteArrayDataOutput(gVIntBytes), docs, docs.length);
|
||||
for (long v : docs) {
|
||||
vIntOut.writeVInt((int) v);
|
||||
}
|
||||
byteArrayVIntIn = new ByteArrayDataInput(vIntBytes);
|
||||
byteArrayGVIntIn = new ByteArrayDataInput(gVIntBytes);
|
||||
}
|
||||
|
||||
void initByteBufferInput(long[] docs) throws Exception {
|
||||
Directory dir = MMapDirectory.open(Files.createTempDirectory("groupvintdata"));
|
||||
IndexOutput vintOut = dir.createOutput("vint", IOContext.DEFAULT);
|
||||
IndexOutput gvintOut = dir.createOutput("gvint", IOContext.DEFAULT);
|
||||
|
||||
GroupVIntWriter w = new GroupVIntWriter();
|
||||
w.writeValues(gvintOut, docs, docs.length);
|
||||
for (long v : docs) {
|
||||
vintOut.writeVInt((int) v);
|
||||
}
|
||||
vintOut.close();
|
||||
gvintOut.close();
|
||||
byteBufferGVIntIn = dir.openInput("gvint", IOContext.DEFAULT);
|
||||
byteBufferVIntIn = dir.openInput("vint", IOContext.DEFAULT);
|
||||
}
|
||||
|
||||
@Setup(Level.Trial)
|
||||
public void init() throws Exception {
|
||||
long[] docs = new long[maxSize];
|
||||
int max = maxValues[numBytesPerInt];
|
||||
int min = max >> 1;
|
||||
for (int i = 0; i < maxSize; i++) {
|
||||
long v = ThreadLocalRandom.current().nextInt(min, max);
|
||||
docs[i] = v;
|
||||
}
|
||||
initByteBufferInput(docs);
|
||||
initArrayInput(docs);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void byteBufferReadVInt(Blackhole bh) throws IOException {
|
||||
byteBufferVIntIn.seek(0);
|
||||
for (int i = 0; i < size; i++) {
|
||||
values[i] = byteBufferVIntIn.readVInt();
|
||||
}
|
||||
bh.consume(values);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void byteBufferReadGroupVInt(Blackhole bh) throws IOException {
|
||||
byteBufferGVIntIn.seek(0);
|
||||
GroupVIntReader.readValues(byteBufferGVIntIn, values, size);
|
||||
bh.consume(values);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void byteArrayReadVInt(Blackhole bh) {
|
||||
byteArrayVIntIn.rewind();
|
||||
for (int i = 0; i < size; i++) {
|
||||
values[i] = byteArrayVIntIn.readVInt();
|
||||
}
|
||||
bh.consume(values);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void byteArrayReadGroupVInt(Blackhole bh) throws IOException {
|
||||
byteArrayGVIntIn.rewind();
|
||||
GroupVIntReader.readValues(byteArrayGVIntIn, values, size);
|
||||
bh.consume(values);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,57 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
|
||||
/** Decode integers using group-varint. */
|
||||
public class GroupVIntReader {
|
||||
|
||||
public static void readValues(DataInput in, long[] docs, int limit) throws IOException {
|
||||
int i;
|
||||
for (i = 0; i <= limit - 4; i += 4) {
|
||||
final int flag = in.readByte() & 0xFF;
|
||||
|
||||
final int n1Minus1 = flag >> 6;
|
||||
final int n2Minus1 = (flag >> 4) & 0x03;
|
||||
final int n3Minus1 = (flag >> 2) & 0x03;
|
||||
final int n4Minus1 = flag & 0x03;
|
||||
|
||||
docs[i] = readLong(in, n1Minus1);
|
||||
docs[i + 1] = readLong(in, n2Minus1);
|
||||
docs[i + 2] = readLong(in, n3Minus1);
|
||||
docs[i + 3] = readLong(in, n4Minus1);
|
||||
}
|
||||
for (; i < limit; ++i) {
|
||||
docs[i] = in.readVInt();
|
||||
}
|
||||
}
|
||||
|
||||
private static long readLong(DataInput in, int numBytesMinus1) throws IOException {
|
||||
switch (numBytesMinus1) {
|
||||
case 0:
|
||||
return in.readByte() & 0xFFL;
|
||||
case 1:
|
||||
return in.readShort() & 0xFFFFL;
|
||||
case 2:
|
||||
return (in.readShort() & 0xFFFFL) | ((in.readByte() & 0xFFL) << 16);
|
||||
default:
|
||||
return in.readInt() & 0xFFFFFFFFL;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,63 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
|
||||
/**
|
||||
* Encode integers using group-varint. It uses VInt to encode tail values that are not enough for a
|
||||
* group
|
||||
*/
|
||||
public class GroupVIntWriter {
|
||||
|
||||
// the maximum size of one group is 4 integers + 1 byte flag.
|
||||
private byte[] bytes = new byte[17];
|
||||
private int byteOffset = 0;
|
||||
|
||||
public GroupVIntWriter() {}
|
||||
|
||||
private int encodeValue(int v) {
|
||||
int lastOff = byteOffset;
|
||||
do {
|
||||
bytes[byteOffset++] = (byte) (v & 0xFF);
|
||||
v >>>= 8;
|
||||
} while (v != 0);
|
||||
return byteOffset - lastOff;
|
||||
}
|
||||
|
||||
public void writeValues(DataOutput out, long[] values, int limit) throws IOException {
|
||||
int off = 0;
|
||||
|
||||
// encode each group
|
||||
while ((limit - off) >= 4) {
|
||||
byte flag = 0;
|
||||
byteOffset = 1;
|
||||
flag |= (encodeValue((int) values[off++]) - 1) << 6;
|
||||
flag |= (encodeValue((int) values[off++]) - 1) << 4;
|
||||
flag |= (encodeValue((int) values[off++]) - 1) << 2;
|
||||
flag |= (encodeValue((int) values[off++]) - 1);
|
||||
bytes[0] = flag;
|
||||
out.writeBytes(bytes, byteOffset);
|
||||
}
|
||||
|
||||
// tail vints
|
||||
for (; off < limit; off++) {
|
||||
out.writeVInt((int) values[off]);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -144,20 +144,15 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
|||
static void readVIntBlock(
|
||||
IndexInput docIn, long[] docBuffer, long[] freqBuffer, int num, boolean indexHasFreq)
|
||||
throws IOException {
|
||||
GroupVIntReader.readValues(docIn, docBuffer, num);
|
||||
if (indexHasFreq) {
|
||||
for (int i = 0; i < num; i++) {
|
||||
final int code = docIn.readVInt();
|
||||
docBuffer[i] = code >>> 1;
|
||||
if ((code & 1) != 0) {
|
||||
freqBuffer[i] = 1;
|
||||
} else {
|
||||
for (int i = 0; i < num; ++i) {
|
||||
freqBuffer[i] = docBuffer[i] & 0x01;
|
||||
docBuffer[i] >>= 1;
|
||||
if (freqBuffer[i] == 0) {
|
||||
freqBuffer[i] = docIn.readVInt();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < num; i++) {
|
||||
docBuffer[i] = docIn.readVInt();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -92,6 +92,7 @@ public final class Lucene99PostingsWriter extends PushPostingsWriterBase {
|
|||
private final PForUtil pforUtil;
|
||||
private final ForDeltaUtil forDeltaUtil;
|
||||
private final Lucene99SkipWriter skipWriter;
|
||||
private final GroupVIntWriter docGroupVIntWriter;
|
||||
|
||||
private boolean fieldHasNorms;
|
||||
private NumericDocValues norms;
|
||||
|
@ -172,6 +173,7 @@ public final class Lucene99PostingsWriter extends PushPostingsWriterBase {
|
|||
skipWriter =
|
||||
new Lucene99SkipWriter(
|
||||
MAX_SKIP_LEVELS, BLOCK_SIZE, state.segmentInfo.maxDoc(), docOut, posOut, payOut);
|
||||
docGroupVIntWriter = new GroupVIntWriter();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -370,17 +372,19 @@ public final class Lucene99PostingsWriter extends PushPostingsWriterBase {
|
|||
singletonDocID = (int) docDeltaBuffer[0];
|
||||
} else {
|
||||
singletonDocID = -1;
|
||||
// vInt encode the remaining doc deltas and freqs:
|
||||
for (int i = 0; i < docBufferUpto; i++) {
|
||||
final int docDelta = (int) docDeltaBuffer[i];
|
||||
final int freq = (int) freqBuffer[i];
|
||||
if (!writeFreqs) {
|
||||
docOut.writeVInt(docDelta);
|
||||
} else if (freq == 1) {
|
||||
docOut.writeVInt((docDelta << 1) | 1);
|
||||
} else {
|
||||
docOut.writeVInt(docDelta << 1);
|
||||
docOut.writeVInt(freq);
|
||||
// Group vInt encode the remaining doc deltas and freqs:
|
||||
if (writeFreqs) {
|
||||
for (int i = 0; i < docBufferUpto; i++) {
|
||||
docDeltaBuffer[i] = (docDeltaBuffer[i] << 1) | (freqBuffer[i] == 1 ? 1 : 0);
|
||||
}
|
||||
}
|
||||
docGroupVIntWriter.writeValues(docOut, docDeltaBuffer, docBufferUpto);
|
||||
if (writeFreqs) {
|
||||
for (int i = 0; i < docBufferUpto; i++) {
|
||||
final int freq = (int) freqBuffer[i];
|
||||
if (freq != 1) {
|
||||
docOut.writeVInt(freq);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,55 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||
import org.apache.lucene.tests.util.TestUtil;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
public class TestGroupVInt extends LuceneTestCase {
|
||||
|
||||
public void testEncodeDecode() throws IOException {
|
||||
long[] values = new long[ForUtil.BLOCK_SIZE];
|
||||
long[] restored = new long[ForUtil.BLOCK_SIZE];
|
||||
final int iterations = atLeast(100);
|
||||
|
||||
final GroupVIntWriter w = new GroupVIntWriter();
|
||||
byte[] encoded = new byte[(int) (Integer.BYTES * ForUtil.BLOCK_SIZE * 1.25)];
|
||||
|
||||
for (int i = 0; i < iterations; i++) {
|
||||
final int bpv = TestUtil.nextInt(random(), 1, 31);
|
||||
final int numValues = TestUtil.nextInt(random(), 1, ForUtil.BLOCK_SIZE);
|
||||
|
||||
// encode
|
||||
for (int j = 0; j < numValues; j++) {
|
||||
values[j] = RandomNumbers.randomIntBetween(random(), 0, (int) PackedInts.maxValue(bpv));
|
||||
}
|
||||
w.writeValues(new ByteArrayDataOutput(encoded), values, numValues);
|
||||
|
||||
// decode
|
||||
GroupVIntReader.readValues(new ByteArrayDataInput(encoded), restored, numValues);
|
||||
assertArrayEquals(
|
||||
ArrayUtil.copyOfSubArray(values, 0, numValues),
|
||||
ArrayUtil.copyOfSubArray(restored, 0, numValues));
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue