Use group-varint encoding for the tail of postings (#12782)

Co-authored-by: Adrien Grand <jpountz@gmail.com>
Zhang Chao 2023-11-20 18:06:13 +08:00 committed by GitHub
parent 194a500323
commit d0f63ec530
7 changed files with 348 additions and 25 deletions

lucene/CHANGES.txt

@@ -285,12 +285,14 @@ Optimizations
* GITHUB#12381: Skip docs with DocValues in NumericLeafComparator. (Lu Xugang, Adrien Grand)
-* GITHUB#12748: Specialize arc store for continuous label in FST. (Guo Feng, Chao Zhang)
+* GITHUB#12748: Specialize arc store for continuous label in FST. (Guo Feng, Zhang Chao)
* GITHUB#12784: Cache buckets to speed up BytesRefHash#sort. (Guo Feng)
* GITHUB#12806: Utilize exact kNN search when gathering k >= numVectors in a segment (Ben Trent)
+* GITHUB#12782: Use group-varint encoding for the tail of postings. (Adrien Grand, Zhang Chao)
Changes in runtime behavior
---------------------
@@ -402,7 +404,7 @@ Improvements
Optimizations
---------------------
-* GITHUB#12377: Avoid redundant loop for compute min value in DirectMonotonicWriter. (Chao Zhang)
+* GITHUB#12377: Avoid redundant loop for compute min value in DirectMonotonicWriter. (Zhang Chao)
* GITHUB#12361: Faster top-level disjunctions sorted by descending score.
(Adrien Grand)
@@ -417,7 +419,7 @@ Optimizations
* GITHUB#12385: Restore parallel knn query rewrite across segments rather than slices (Luca Cavanna)
-* GITHUB#12381: Speed up NumericDocValuesWriter with index sorting. (Chao Zhang)
+* GITHUB#12381: Speed up NumericDocValuesWriter with index sorting. (Zhang Chao)
* GITHUB#12453: Faster bulk numeric reads from BufferedIndexInput (Armin Braun)
@@ -484,7 +486,7 @@ Other
* GITHUB#12428: Replace consecutive close() calls and close() calls with null checks with IOUtils.close().
(Shubham Chaudhary)
-* GITHUB#12512: Remove unused variable in BKDWriter. (Chao Zhang)
+* GITHUB#12512: Remove unused variable in BKDWriter. (Zhang Chao)
======================== Lucene 9.7.0 =======================

org/apache/lucene/benchmark/jmh/GroupVIntBenchmark.java (new file)

@@ -0,0 +1,147 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.jmh;
import java.io.IOException;
import java.nio.file.Files;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.codecs.lucene99.GroupVIntReader;
import org.apache.lucene.codecs.lucene99.GroupVIntWriter;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.MMapDirectory;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
@State(Scope.Benchmark)
@Warmup(iterations = 3, time = 3)
@Measurement(iterations = 5, time = 5)
@Fork(
value = 1,
jvmArgsPrepend = {"--add-modules=jdk.unsupported"})
public class GroupVIntBenchmark {

  final int maxSize = 256;
  final long[] values = new long[maxSize];
  IndexInput byteBufferGVIntIn;
  IndexInput byteBufferVIntIn;
  ByteArrayDataInput byteArrayVIntIn;
  ByteArrayDataInput byteArrayGVIntIn;

  // @Param({"16", "32", "64", "128", "248"})
  @Param({"64"})
  public int size;

  @Param({"1", "2", "3", "4"})
  public int numBytesPerInt;

  private final int[] maxValues = new int[] {0, 1 << 4, 1 << 12, 1 << 18, 1 << 25};

  void initArrayInput(long[] docs) throws Exception {
    byte[] gVIntBytes = new byte[Integer.BYTES * maxSize * 2];
    byte[] vIntBytes = new byte[Integer.BYTES * maxSize * 2];
    ByteArrayDataOutput vIntOut = new ByteArrayDataOutput(vIntBytes);
    GroupVIntWriter w = new GroupVIntWriter();
    w.writeValues(new ByteArrayDataOutput(gVIntBytes), docs, docs.length);
    for (long v : docs) {
      vIntOut.writeVInt((int) v);
    }
    byteArrayVIntIn = new ByteArrayDataInput(vIntBytes);
    byteArrayGVIntIn = new ByteArrayDataInput(gVIntBytes);
  }

  void initByteBufferInput(long[] docs) throws Exception {
    Directory dir = MMapDirectory.open(Files.createTempDirectory("groupvintdata"));
    IndexOutput vintOut = dir.createOutput("vint", IOContext.DEFAULT);
    IndexOutput gvintOut = dir.createOutput("gvint", IOContext.DEFAULT);
    GroupVIntWriter w = new GroupVIntWriter();
    w.writeValues(gvintOut, docs, docs.length);
    for (long v : docs) {
      vintOut.writeVInt((int) v);
    }
    vintOut.close();
    gvintOut.close();
    byteBufferGVIntIn = dir.openInput("gvint", IOContext.DEFAULT);
    byteBufferVIntIn = dir.openInput("vint", IOContext.DEFAULT);
  }

  @Setup(Level.Trial)
  public void init() throws Exception {
    long[] docs = new long[maxSize];
    int max = maxValues[numBytesPerInt];
    int min = max >> 1;
    for (int i = 0; i < maxSize; i++) {
      long v = ThreadLocalRandom.current().nextInt(min, max);
      docs[i] = v;
    }
    initByteBufferInput(docs);
    initArrayInput(docs);
  }

  @Benchmark
  public void byteBufferReadVInt(Blackhole bh) throws IOException {
    byteBufferVIntIn.seek(0);
    for (int i = 0; i < size; i++) {
      values[i] = byteBufferVIntIn.readVInt();
    }
    bh.consume(values);
  }

  @Benchmark
  public void byteBufferReadGroupVInt(Blackhole bh) throws IOException {
    byteBufferGVIntIn.seek(0);
    GroupVIntReader.readValues(byteBufferGVIntIn, values, size);
    bh.consume(values);
  }

  @Benchmark
  public void byteArrayReadVInt(Blackhole bh) {
    byteArrayVIntIn.rewind();
    for (int i = 0; i < size; i++) {
      values[i] = byteArrayVIntIn.readVInt();
    }
    bh.consume(values);
  }

  @Benchmark
  public void byteArrayReadGroupVInt(Blackhole bh) throws IOException {
    byteArrayGVIntIn.rewind();
    GroupVIntReader.readValues(byteArrayGVIntIn, values, size);
    bh.consume(values);
  }
}

org/apache/lucene/codecs/lucene99/GroupVIntReader.java (new file)

@@ -0,0 +1,57 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;

import java.io.IOException;
import org.apache.lucene.store.DataInput;

/** Decode integers using group-varint. */
public class GroupVIntReader {

  public static void readValues(DataInput in, long[] docs, int limit) throws IOException {
    int i;
    for (i = 0; i <= limit - 4; i += 4) {
      final int flag = in.readByte() & 0xFF;
      final int n1Minus1 = flag >> 6;
      final int n2Minus1 = (flag >> 4) & 0x03;
      final int n3Minus1 = (flag >> 2) & 0x03;
      final int n4Minus1 = flag & 0x03;
      docs[i] = readLong(in, n1Minus1);
      docs[i + 1] = readLong(in, n2Minus1);
      docs[i + 2] = readLong(in, n3Minus1);
      docs[i + 3] = readLong(in, n4Minus1);
    }
    for (; i < limit; ++i) {
      docs[i] = in.readVInt();
    }
  }

  private static long readLong(DataInput in, int numBytesMinus1) throws IOException {
    switch (numBytesMinus1) {
      case 0:
        return in.readByte() & 0xFFL;
      case 1:
        return in.readShort() & 0xFFFFL;
      case 2:
        return (in.readShort() & 0xFFFFL) | ((in.readByte() & 0xFFL) << 16);
      default:
        return in.readInt() & 0xFFFFFFFFL;
    }
  }
}
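
For reference, each full group above is laid out as one flag byte holding four 2-bit length fields (bytes per value minus one, with the first value in the two most significant bits), followed by the four values in little-endian byte order. A minimal decoding sketch; the demo class name and byte values are illustrative and not part of this commit, and it assumes Lucene 9.x's little-endian DataInput together with the ByteArrayDataInput class used elsewhere in this change:

import org.apache.lucene.codecs.lucene99.GroupVIntReader;
import org.apache.lucene.store.ByteArrayDataInput;

class GroupVIntFlagByteDemo {
  public static void main(String[] args) throws java.io.IOException {
    // flag 0x48 = 0b01_00_10_00 -> the four values occupy 2, 1, 3 and 1 bytes respectively
    byte[] group = {0x48, 0x2C, 0x01, 0x05, 0x70, 0x11, 0x01, 0x07};
    long[] decoded = new long[4];
    GroupVIntReader.readValues(new ByteArrayDataInput(group), decoded, 4);
    // decoded == {300, 5, 70000, 7}
  }
}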

org/apache/lucene/codecs/lucene99/GroupVIntWriter.java (new file)

@@ -0,0 +1,63 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;

import java.io.IOException;
import org.apache.lucene.store.DataOutput;

/**
 * Encode integers using group-varint. Tail values that do not fill a complete group of four are
 * encoded as plain VInts.
 */
public class GroupVIntWriter {

  // the maximum size of one group is 4 integers + 1 byte flag.
  private byte[] bytes = new byte[17];
  private int byteOffset = 0;

  public GroupVIntWriter() {}

  private int encodeValue(int v) {
    int lastOff = byteOffset;
    do {
      bytes[byteOffset++] = (byte) (v & 0xFF);
      v >>>= 8;
    } while (v != 0);
    return byteOffset - lastOff;
  }

  public void writeValues(DataOutput out, long[] values, int limit) throws IOException {
    int off = 0;
    // encode each group
    while ((limit - off) >= 4) {
      byte flag = 0;
      byteOffset = 1;
      flag |= (encodeValue((int) values[off++]) - 1) << 6;
      flag |= (encodeValue((int) values[off++]) - 1) << 4;
      flag |= (encodeValue((int) values[off++]) - 1) << 2;
      flag |= (encodeValue((int) values[off++]) - 1);
      bytes[0] = flag;
      out.writeBytes(bytes, byteOffset);
    }
    // tail vints
    for (; off < limit; off++) {
      out.writeVInt((int) values[off]);
    }
  }
}
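
A minimal round-trip sketch combining the two new classes; the demo class name, values, and buffer size are illustrative, while ByteArrayDataOutput/ByteArrayDataInput are the same store classes the benchmark and test in this commit rely on:

import org.apache.lucene.codecs.lucene99.GroupVIntReader;
import org.apache.lucene.codecs.lucene99.GroupVIntWriter;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;

class GroupVIntRoundTrip {
  public static void main(String[] args) throws java.io.IOException {
    long[] values = {3, 1024, 7, 500_000, 42, 9}; // one full group of four, then two tail VInts
    byte[] buf = new byte[values.length * 5 + 1]; // loose upper bound, plenty for this sketch
    new GroupVIntWriter().writeValues(new ByteArrayDataOutput(buf), values, values.length);
    long[] restored = new long[values.length];
    GroupVIntReader.readValues(new ByteArrayDataInput(buf), restored, values.length);
    // restored now holds the same six values as the input
  }
}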

org/apache/lucene/codecs/lucene99/Lucene99PostingsReader.java

@@ -144,20 +144,15 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
  static void readVIntBlock(
      IndexInput docIn, long[] docBuffer, long[] freqBuffer, int num, boolean indexHasFreq)
      throws IOException {
+    GroupVIntReader.readValues(docIn, docBuffer, num);
    if (indexHasFreq) {
-      for (int i = 0; i < num; i++) {
-        final int code = docIn.readVInt();
-        docBuffer[i] = code >>> 1;
-        if ((code & 1) != 0) {
-          freqBuffer[i] = 1;
-        } else {
+      for (int i = 0; i < num; ++i) {
+        freqBuffer[i] = docBuffer[i] & 0x01;
+        docBuffer[i] >>= 1;
+        if (freqBuffer[i] == 0) {
          freqBuffer[i] = docIn.readVInt();
        }
      }
-    } else {
-      for (int i = 0; i < num; i++) {
-        docBuffer[i] = docIn.readVInt();
-      }
    }
  }

org/apache/lucene/codecs/lucene99/Lucene99PostingsWriter.java

@@ -92,6 +92,7 @@ public final class Lucene99PostingsWriter extends PushPostingsWriterBase {
  private final PForUtil pforUtil;
  private final ForDeltaUtil forDeltaUtil;
  private final Lucene99SkipWriter skipWriter;
+  private final GroupVIntWriter docGroupVIntWriter;

  private boolean fieldHasNorms;
  private NumericDocValues norms;
@@ -172,6 +173,7 @@
    skipWriter =
        new Lucene99SkipWriter(
            MAX_SKIP_LEVELS, BLOCK_SIZE, state.segmentInfo.maxDoc(), docOut, posOut, payOut);
+    docGroupVIntWriter = new GroupVIntWriter();
  }

  @Override
@@ -370,17 +372,19 @@
        singletonDocID = (int) docDeltaBuffer[0];
      } else {
        singletonDocID = -1;
-        // vInt encode the remaining doc deltas and freqs:
-        for (int i = 0; i < docBufferUpto; i++) {
-          final int docDelta = (int) docDeltaBuffer[i];
-          final int freq = (int) freqBuffer[i];
-          if (!writeFreqs) {
-            docOut.writeVInt(docDelta);
-          } else if (freq == 1) {
-            docOut.writeVInt((docDelta << 1) | 1);
-          } else {
-            docOut.writeVInt(docDelta << 1);
-            docOut.writeVInt(freq);
+        // Group vInt encode the remaining doc deltas and freqs:
+        if (writeFreqs) {
+          for (int i = 0; i < docBufferUpto; i++) {
+            docDeltaBuffer[i] = (docDeltaBuffer[i] << 1) | (freqBuffer[i] == 1 ? 1 : 0);
+          }
+        }
+        docGroupVIntWriter.writeValues(docOut, docDeltaBuffer, docBufferUpto);
+        if (writeFreqs) {
+          for (int i = 0; i < docBufferUpto; i++) {
+            final int freq = (int) freqBuffer[i];
+            if (freq != 1) {
+              docOut.writeVInt(freq);
+            }
          }
        }
      }
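
Taken together, the tail block written here and consumed by readVIntBlock above looks like this: each doc delta is shifted left by one with the low bit flagging a frequency of 1, the resulting codes are group-varint encoded, and only frequencies other than 1 follow as plain VInts. A worked sketch with illustrative values; the demo class is not part of the commit:

import org.apache.lucene.codecs.lucene99.GroupVIntWriter;
import org.apache.lucene.store.ByteArrayDataOutput;

class TailBlockLayoutDemo {
  public static void main(String[] args) throws java.io.IOException {
    // Doc deltas {3, 7, 2, 5} with freqs {1, 4, 1, 2}: pack the freq==1 flag into the low bit.
    long[] codes = {3 << 1 | 1, 7 << 1, 2 << 1 | 1, 5 << 1}; // {7, 14, 5, 10}
    byte[] buf = new byte[32];
    ByteArrayDataOutput out = new ByteArrayDataOutput(buf);
    new GroupVIntWriter().writeValues(out, codes, codes.length); // one group: flag 0x00, then one byte per code
    out.writeVInt(4); // only freqs != 1 follow, as plain VInts, in document order
    out.writeVInt(2);
    // Reading reverses this: group-varint decode the codes; low bit 1 => freq == 1,
    // low bit 0 => freq is the next VInt; doc delta = code >> 1.
  }
}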

org/apache/lucene/codecs/lucene99/TestGroupVInt.java (new file)

@@ -0,0 +1,55 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;

import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import java.io.IOException;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.packed.PackedInts;

public class TestGroupVInt extends LuceneTestCase {

  public void testEncodeDecode() throws IOException {
    long[] values = new long[ForUtil.BLOCK_SIZE];
    long[] restored = new long[ForUtil.BLOCK_SIZE];
    final int iterations = atLeast(100);
    final GroupVIntWriter w = new GroupVIntWriter();
    byte[] encoded = new byte[(int) (Integer.BYTES * ForUtil.BLOCK_SIZE * 1.25)];
    for (int i = 0; i < iterations; i++) {
      final int bpv = TestUtil.nextInt(random(), 1, 31);
      final int numValues = TestUtil.nextInt(random(), 1, ForUtil.BLOCK_SIZE);
      // encode
      for (int j = 0; j < numValues; j++) {
        values[j] = RandomNumbers.randomIntBetween(random(), 0, (int) PackedInts.maxValue(bpv));
      }
      w.writeValues(new ByteArrayDataOutput(encoded), values, numValues);
      // decode
      GroupVIntReader.readValues(new ByteArrayDataInput(encoded), restored, numValues);
      assertArrayEquals(
          ArrayUtil.copyOfSubArray(values, 0, numValues),
          ArrayUtil.copyOfSubArray(restored, 0, numValues));
    }
  }
}