diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index c1d715121f5..23b84075416 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -285,12 +285,14 @@ Optimizations * GITHUB#12381: Skip docs with DocValues in NumericLeafComparator. (Lu Xugang, Adrien Grand) -* GITHUB#12748: Specialize arc store for continuous label in FST. (Guo Feng, Chao Zhang) +* GITHUB#12748: Specialize arc store for continuous label in FST. (Guo Feng, Zhang Chao) * GITHUB#12784: Cache buckets to speed up BytesRefHash#sort. (Guo Feng) * GITHUB#12806: Utilize exact kNN search when gathering k >= numVectors in a segment (Ben Trent) +* GITHUB#12782: Use group-varint encoding for the tail of postings. (Adrien Grand, Zhang Chao) + Changes in runtime behavior --------------------- @@ -402,7 +404,7 @@ Improvements Optimizations --------------------- -* GITHUB#12377: Avoid redundant loop for compute min value in DirectMonotonicWriter. (Chao Zhang) +* GITHUB#12377: Avoid redundant loop for compute min value in DirectMonotonicWriter. (Zhang Chao) * GITHUB#12361: Faster top-level disjunctions sorted by descending score. (Adrien Grand) @@ -417,7 +419,7 @@ Optimizations * GITHUB#12385: Restore parallel knn query rewrite across segments rather than slices (Luca Cavanna) -* GITHUB#12381: Speed up NumericDocValuesWriter with index sorting. (Chao Zhang) +* GITHUB#12381: Speed up NumericDocValuesWriter with index sorting. (Zhang Chao) * GITHUB#12453: Faster bulk numeric reads from BufferedIndexInput (Armin Braun) @@ -484,7 +486,7 @@ Other * GITHUB#12428: Replace consecutive close() calls and close() calls with null checks with IOUtils.close(). (Shubham Chaudhary) -* GITHUB#12512: Remove unused variable in BKDWriter. (Chao Zhang) +* GITHUB#12512: Remove unused variable in BKDWriter. 
(Zhang Chao) ======================== Lucene 9.7.0 ======================= diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/GroupVIntBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/GroupVIntBenchmark.java new file mode 100644 index 00000000000..659ba686544 --- /dev/null +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/GroupVIntBenchmark.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.benchmark.jmh; + +import java.io.IOException; +import java.nio.file.Files; +import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.TimeUnit; +import org.apache.lucene.codecs.lucene99.GroupVIntReader; +import org.apache.lucene.codecs.lucene99.GroupVIntWriter; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.MMapDirectory; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** JMH benchmark that decodes the same random values from a plain VInt encoding and from a group-varint encoding, once through IndexInputs opened on an MMapDirectory and once through ByteArrayDataInputs. */ @BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.MICROSECONDS) +@State(Scope.Benchmark) +@Warmup(iterations = 3, time = 3) +@Measurement(iterations = 5, time = 5) +@Fork( + value = 1, + jvmArgsPrepend = {"--add-modules=jdk.unsupported"}) +public class GroupVIntBenchmark { + + final int maxSize = 256; /* number of values generated and encoded per trial */ + final long[] values = new long[maxSize]; /* decode target shared by all benchmark methods */ + + IndexInput byteBufferGVIntIn; + IndexInput byteBufferVIntIn; + + ByteArrayDataInput byteArrayVIntIn; + ByteArrayDataInput byteArrayGVIntIn; + + // @Param({"16", "32", "64", "128", "248"}) + @Param({"64"}) + public int size; /* number of values decoded per benchmark invocation; only the first size of the maxSize encoded values are read */ + + @Param({"1", "2", "3", "4"}) + public int numBytesPerInt; /* selects the entry of maxValues used as the value bound */ + + /* exclusive upper bound of the generated values, indexed by numBytesPerInt; index 0 is never selected by the declared params */ private final int[] maxValues = new int[] {0, 1 << 4, 1 << 12, 1 << 18, 1 << 25}; + + /** Encodes docs twice into heap byte arrays, once as group-varint and once as plain VInts, and wraps each array in a ByteArrayDataInput. */ void initArrayInput(long[] docs) throws
Exception { + byte[] gVIntBytes = new byte[Integer.BYTES * maxSize * 2]; + byte[] vIntBytes = new byte[Integer.BYTES * maxSize * 2]; + ByteArrayDataOutput vIntOut = new ByteArrayDataOutput(vIntBytes); + GroupVIntWriter w = new GroupVIntWriter(); + w.writeValues(new ByteArrayDataOutput(gVIntBytes), docs, docs.length); + for (long v : docs) { + vIntOut.writeVInt((int) v); + } + byteArrayVIntIn = new ByteArrayDataInput(vIntBytes); + byteArrayGVIntIn = new ByteArrayDataInput(gVIntBytes); + } + + /** Writes docs to two files ("gvint" as group-varint, "vint" as plain VInts) in a fresh temp directory and opens an IndexInput on each. NOTE(review): nothing in this class closes the Directory or the two IndexInputs, and the temp directory is not deleted; consider a trial-level tear-down. */ void initByteBufferInput(long[] docs) throws Exception { + Directory dir = MMapDirectory.open(Files.createTempDirectory("groupvintdata")); + IndexOutput vintOut = dir.createOutput("vint", IOContext.DEFAULT); + IndexOutput gvintOut = dir.createOutput("gvint", IOContext.DEFAULT); + + GroupVIntWriter w = new GroupVIntWriter(); + w.writeValues(gvintOut, docs, docs.length); + for (long v : docs) { + vintOut.writeVInt((int) v); + } + vintOut.close(); + gvintOut.close(); + byteBufferGVIntIn = dir.openInput("gvint", IOContext.DEFAULT); + byteBufferVIntIn = dir.openInput("vint", IOContext.DEFAULT); + } + + /** Runs once per trial: generates maxSize random values and encodes them for both input kinds. */ @Setup(Level.Trial) + public void init() throws Exception { + long[] docs = new long[maxSize]; + int max = maxValues[numBytesPerInt]; + int min = max >> 1; /* draw from the upper half [max/2, max) of the range so all values have the same magnitude */ + for (int i = 0; i < maxSize; i++) { + long v = ThreadLocalRandom.current().nextInt(min, max); + docs[i] = v; + } + initByteBufferInput(docs); + initArrayInput(docs); + } + + /** Reads size values one VInt at a time from the MMapDirectory-backed input. */ @Benchmark + public void byteBufferReadVInt(Blackhole bh) throws IOException { + byteBufferVIntIn.seek(0); + for (int i = 0; i < size; i++) { + values[i] = byteBufferVIntIn.readVInt(); + } + bh.consume(values); + } + + /** Reads size values with GroupVIntReader from the MMapDirectory-backed input. */ @Benchmark + public void byteBufferReadGroupVInt(Blackhole bh) throws IOException { + byteBufferGVIntIn.seek(0); + GroupVIntReader.readValues(byteBufferGVIntIn, values, size); + bh.consume(values); + } + + /** Reads size values one VInt at a time from the heap byte array. */ @Benchmark + public void byteArrayReadVInt(Blackhole bh) { + byteArrayVIntIn.rewind(); + for (int i = 0; i < size; i++) { + values[i] =
byteArrayVIntIn.readVInt(); + } + bh.consume(values); + } + + /** Reads size values with GroupVIntReader from the heap byte array. */ @Benchmark + public void byteArrayReadGroupVInt(Blackhole bh) throws IOException { + byteArrayGVIntIn.rewind(); + GroupVIntReader.readValues(byteArrayGVIntIn, values, size); + bh.consume(values); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/GroupVIntReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/GroupVIntReader.java new file mode 100644 index 00000000000..5fbd2069701 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/GroupVIntReader.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene99; + +import java.io.IOException; +import org.apache.lucene.store.DataInput; + +/** Decode integers using group-varint.
*/ +public class GroupVIntReader { + + /** Reads {@code limit} values into {@code docs[0..limit)}. Complete groups of four are decoded from one flag byte (two bits per value holding its byte length minus one, first value in the top bits) followed by the value bytes; the remaining values, fewer than four, are read as VInts. Every value is returned as an unsigned 32-bit quantity, so no entry is negative. @param in input positioned at the first flag byte @param docs destination array, must hold at least {@code limit} entries @param limit number of values to decode @throws IOException if reading from {@code in} fails */ public static void readValues(DataInput in, long[] docs, int limit) throws IOException { + int i; + for (i = 0; i <= limit - 4; i += 4) { + final int flag = in.readByte() & 0xFF; + + final int n1Minus1 = flag >> 6; + final int n2Minus1 = (flag >> 4) & 0x03; + final int n3Minus1 = (flag >> 2) & 0x03; + final int n4Minus1 = flag & 0x03; + + docs[i] = readLong(in, n1Minus1); + docs[i + 1] = readLong(in, n2Minus1); + docs[i + 2] = readLong(in, n3Minus1); + docs[i + 3] = readLong(in, n4Minus1); + } + for (; i < limit; ++i) { + /* readVInt returns a signed int: mask it so a value with bit 31 set stays non-negative, matching the 4-byte group path in readLong. Lucene99PostingsReader.readVIntBlock shifts these longs with a signed >>, which would otherwise yield a wrong doc delta. */ docs[i] = in.readVInt() & 0xFFFFFFFFL; + } + } + + /** Reads one value stored in {@code numBytesMinus1} plus one bytes and returns it zero-extended. The low-order bytes come first, mirroring GroupVIntWriter#encodeValue; this assumes DataInput#readShort and DataInput#readInt are little-endian (TODO confirm against the DataInput contract). */ private static long readLong(DataInput in, int numBytesMinus1) throws IOException { + switch (numBytesMinus1) { + case 0: + return in.readByte() & 0xFFL; + case 1: + return in.readShort() & 0xFFFFL; + case 2: + return (in.readShort() & 0xFFFFL) | ((in.readByte() & 0xFFL) << 16); + default: + return in.readInt() & 0xFFFFFFFFL; + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/GroupVIntWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/GroupVIntWriter.java new file mode 100644 index 00000000000..905cab22b2b --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/GroupVIntWriter.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene99; + +import java.io.IOException; +import org.apache.lucene.store.DataOutput; + +/** + * Encodes integers using group-varint: values are written four at a time behind one flag byte, + * and the last one to three values that cannot fill a group are written as VInts. + */ +public class GroupVIntWriter { + + // the maximum size of one group is 4 integers + 1 byte flag. + private byte[] bytes = new byte[17]; + private int byteOffset = 0; /* next free slot in bytes; writeValues resets it to 1 at the start of every group */ + + public GroupVIntWriter() {} + + /** Appends v to the scratch buffer low-order byte first, stopping once the remaining high-order bytes are all zero, and returns the number of bytes written (1 to 4). Because the shift is unsigned, a negative int is stored as its full 32-bit pattern. */ private int encodeValue(int v) { + int lastOff = byteOffset; + do { + bytes[byteOffset++] = (byte) (v & 0xFF); + v >>>= 8; + } while (v != 0); + return byteOffset - lastOff; + } + + /** Writes {@code values[0..limit)} to {@code out}: each complete group of four as one flag byte followed by the value bytes, then any remaining values as VInts. Each long is narrowed with an (int) cast before encoding, so only its low 32 bits are stored. @param out destination @param values values to encode @param limit number of leading entries of {@code values} to write @throws IOException if writing to {@code out} fails */ public void writeValues(DataOutput out, long[] values, int limit) throws IOException { + int off = 0; + + // encode each group + while ((limit - off) >= 4) { + byte flag = 0; + byteOffset = 1; /* slot 0 is kept for the flag byte, which is filled in once all four lengths are known */ + flag |= (encodeValue((int) values[off++]) - 1) << 6; /* two bits per value, first value in the top bits, each holding its byte length minus one */ + flag |= (encodeValue((int) values[off++]) - 1) << 4; + flag |= (encodeValue((int) values[off++]) - 1) << 2; + flag |= (encodeValue((int) values[off++]) - 1); + bytes[0] = flag; + out.writeBytes(bytes, byteOffset); + } + + // tail vints + for (; off < limit; off++) { + out.writeVInt((int) values[off]); + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99PostingsReader.java index 5546039ae08..52cb999aafd 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99PostingsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99PostingsReader.java @@ -144,20 +144,15 @@ public final class Lucene99PostingsReader extends PostingsReaderBase { static void readVIntBlock( IndexInput docIn, long[] docBuffer, long[] freqBuffer, int num, boolean indexHasFreq) throws IOException { + GroupVIntReader.readValues(docIn, docBuffer, num); if (indexHasFreq) {
- for (int i = 0; i < num; i++) { - final int code = docIn.readVInt(); - docBuffer[i] = code >>> 1; - if ((code & 1) != 0) { - freqBuffer[i] = 1; - } else { + for (int i = 0; i < num; ++i) { + freqBuffer[i] = docBuffer[i] & 0x01; + docBuffer[i] >>= 1; + if (freqBuffer[i] == 0) { freqBuffer[i] = docIn.readVInt(); } } - } else { - for (int i = 0; i < num; i++) { - docBuffer[i] = docIn.readVInt(); - } } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99PostingsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99PostingsWriter.java index 34b4c9c62a3..2bd562fc3da 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99PostingsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99PostingsWriter.java @@ -92,6 +92,7 @@ public final class Lucene99PostingsWriter extends PushPostingsWriterBase { private final PForUtil pforUtil; private final ForDeltaUtil forDeltaUtil; private final Lucene99SkipWriter skipWriter; + private final GroupVIntWriter docGroupVIntWriter; private boolean fieldHasNorms; private NumericDocValues norms; @@ -172,6 +173,7 @@ public final class Lucene99PostingsWriter extends PushPostingsWriterBase { skipWriter = new Lucene99SkipWriter( MAX_SKIP_LEVELS, BLOCK_SIZE, state.segmentInfo.maxDoc(), docOut, posOut, payOut); + docGroupVIntWriter = new GroupVIntWriter(); } @Override @@ -370,17 +372,19 @@ public final class Lucene99PostingsWriter extends PushPostingsWriterBase { singletonDocID = (int) docDeltaBuffer[0]; } else { singletonDocID = -1; - // vInt encode the remaining doc deltas and freqs: - for (int i = 0; i < docBufferUpto; i++) { - final int docDelta = (int) docDeltaBuffer[i]; - final int freq = (int) freqBuffer[i]; - if (!writeFreqs) { - docOut.writeVInt(docDelta); - } else if (freq == 1) { - docOut.writeVInt((docDelta << 1) | 1); - } else { - docOut.writeVInt(docDelta << 1); - docOut.writeVInt(freq); + // Group vInt encode the remaining doc deltas and 
freqs: + if (writeFreqs) { + for (int i = 0; i < docBufferUpto; i++) { + docDeltaBuffer[i] = (docDeltaBuffer[i] << 1) | (freqBuffer[i] == 1 ? 1 : 0); + } + } + docGroupVIntWriter.writeValues(docOut, docDeltaBuffer, docBufferUpto); + if (writeFreqs) { + for (int i = 0; i < docBufferUpto; i++) { + final int freq = (int) freqBuffer[i]; + if (freq != 1) { + docOut.writeVInt(freq); + } } } } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestGroupVInt.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestGroupVInt.java new file mode 100644 index 00000000000..6c819302ac9 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestGroupVInt.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene99; + +import com.carrotsearch.randomizedtesting.generators.RandomNumbers; +import java.io.IOException; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.packed.PackedInts; + +/** Round-trip test for GroupVIntWriter and GroupVIntReader. */ public class TestGroupVInt extends LuceneTestCase { + + /** Encodes random blocks of 1 to ForUtil.BLOCK_SIZE values with GroupVIntWriter and checks that GroupVIntReader restores exactly the same values. */ public void testEncodeDecode() throws IOException { + long[] values = new long[ForUtil.BLOCK_SIZE]; + long[] restored = new long[ForUtil.BLOCK_SIZE]; + final int iterations = atLeast(100); + + final GroupVIntWriter w = new GroupVIntWriter(); + /* presumably the 1.25 factor is headroom for the per-group flag bytes and the VInt-encoded tail; confirm against the worst-case encoded size */ byte[] encoded = new byte[(int) (Integer.BYTES * ForUtil.BLOCK_SIZE * 1.25)]; + + for (int i = 0; i < iterations; i++) { + /* NOTE(review): bpv is at most 31 here, so values with bit 31 set (which become negative when narrowed to int) are never exercised */ final int bpv = TestUtil.nextInt(random(), 1, 31); + final int numValues = TestUtil.nextInt(random(), 1, ForUtil.BLOCK_SIZE); + + // encode + for (int j = 0; j < numValues; j++) { + values[j] = RandomNumbers.randomIntBetween(random(), 0, (int) PackedInts.maxValue(bpv)); + } + w.writeValues(new ByteArrayDataOutput(encoded), values, numValues); + + // decode + GroupVIntReader.readValues(new ByteArrayDataInput(encoded), restored, numValues); + assertArrayEquals( + ArrayUtil.copyOfSubArray(values, 0, numValues), + ArrayUtil.copyOfSubArray(restored, 0, numValues)); + } + } +}