Write MSB VLong for better outputs sharing in block tree index (#12631)

This commit is contained in:
gf2121 2023-10-10 01:00:21 -05:00 committed by GitHub
parent e20e245f47
commit 4f01de2a2d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 107 additions and 9 deletions

View File

@ -178,6 +178,9 @@ Optimizations
* GITHUB#12623: Use a MergeSorter taking advantage of extra storage for StableMSBRadixSorter. (Guo Feng)
* GITHUB#12631: Write MSB VLong for better outputs sharing in block tree index, decreasing the size
of the .tip file by ~14%. (Guo Feng)
Changes in runtime behavior
---------------------

View File

@ -16,12 +16,15 @@
*/
package org.apache.lucene.codecs.lucene90.blocktree;
import static org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader.VERSION_MSB_VLONG_OUTPUT;
import java.io.IOException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
@ -82,7 +85,7 @@ public final class FieldReader extends Terms {
// + rootCode + " divisor=" + indexDivisor);
// }
rootBlockFP =
(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong()
readVLongOutput(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length))
>>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
// Initialize FST always off-heap.
final IndexInput clone = indexIn.clone();
@ -99,6 +102,32 @@ public final class FieldReader extends Terms {
*/
}
/**
 * Reads the leading long of an output, selecting the decoder from the parent reader's version:
 * {@link #readMSBVLong} when that version is at least {@code VERSION_MSB_VLONG_OUTPUT}, and the
 * input's own {@code readVLong()} otherwise.
 *
 * @param in the input positioned at the start of the encoded long
 * @return the decoded value
 * @throws IOException if the underlying input fails
 */
long readVLongOutput(DataInput in) throws IOException {
  final boolean msbEncoded = parent.version >= VERSION_MSB_VLONG_OUTPUT;
  return msbEncoded ? readMSBVLong(in) : in.readVLong();
}
/**
 * Reads a long stored as a variable number of bytes with the most significant 7-bit group first,
 * the format produced by {@link Lucene90BlockTreeTermsWriter#writeMSBVLong}.
 *
 * <p>Each byte contributes its low 7 bits to the result; a set high bit means another byte
 * follows, and the first byte with a clear high bit ends the value.
 *
 * <p>Package private for testing.
 */
static long readMSBVLong(DataInput in) throws IOException {
  long value = 0L;
  byte current;
  do {
    current = in.readByte();
    // Make room for the next 7-bit group and append it below the bits read so far.
    value = (value << 7) | (current & 0x7FL);
  } while ((current & 0x80) != 0);
  return value;
}
@Override
public BytesRef getMin() throws IOException {
if (minTerm == null) {

View File

@ -146,7 +146,7 @@ final class IntersectTermsEnumFrame {
floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length);
// Skip first long -- has redundant fp, hasTerms
// flag, isFloor flag
final long code = floorDataReader.readVLong();
final long code = ite.fr.readVLongOutput(floorDataReader);
if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
// Floor frame
numFollowFloorBlocks = floorDataReader.readVInt();

View File

@ -81,8 +81,13 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer {
/** Initial terms format. */
public static final int VERSION_START = 0;
/**
* Version that encodes outputs as MSB VLong for better outputs sharing in FST, see GITHUB#12620.
*/
public static final int VERSION_MSB_VLONG_OUTPUT = 1;
/** Current terms format. */
public static final int VERSION_CURRENT = VERSION_START;
public static final int VERSION_CURRENT = VERSION_MSB_VLONG_OUTPUT;
/** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tip";

View File

@ -430,6 +430,25 @@ public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer {
return brToString(new BytesRef(b));
}
/**
 * Writes a non-negative long as a variable number of bytes, most significant 7-bit group first.
 * Every byte except the last has its high bit set. Decode with {@link FieldReader#readMSBVLong}.
 *
 * <p>Package private for testing.
 */
static void writeMSBVLong(long l, DataOutput scratchBytes) throws IOException {
  assert l >= 0;
  // Any padding zero bits go into the leading byte, so values of similar magnitude are more
  // likely to share prefix bytes: 0x7FFF is written as [0x81, 0xFF, 0x7F], not
  // [0xFF, 0xFF, 0x40].
  final int significantBits = Long.SIZE - Long.numberOfLeadingZeros(l);
  final int groupCount = (significantBits - 1) / 7 + 1;
  // Left-align the 7-bit groups so the next group to emit always sits in the top 7 bits.
  long aligned = l << (Long.SIZE - groupCount * 7);
  int continuationBytes = groupCount - 1;
  while (continuationBytes-- > 0) {
    // The unsigned shift by 57 leaves exactly 7 bits; the high bit marks "more bytes follow".
    scratchBytes.writeByte((byte) ((aligned >>> 57) | 0x80L));
    aligned <<= 7;
  }
  // The final group is written with the high bit clear to terminate the value.
  scratchBytes.writeByte((byte) (aligned >>> 57));
}
private static final class PendingBlock extends PendingEntry {
public final BytesRef prefix;
public final long fp;
@ -472,10 +491,8 @@ public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer {
assert scratchBytes.size() == 0;
// TODO: try writing the leading vLong in MSB order
// (opposite of what Lucene does today), for better
// outputs sharing in the FST
scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor));
// write the leading vLong in MSB order for better outputs sharing in the FST
writeMSBVLong(encodeOutput(fp, hasTerms, isFloor), scratchBytes);
if (isFloor) {
scratchBytes.writeVInt(blocks.size() - 1);
for (int i = 1; i < blocks.size(); i++) {

View File

@ -236,7 +236,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length)
throws IOException {
scratchReader.reset(frameData.bytes, frameData.offset, frameData.length);
final long code = scratchReader.readVLong();
final long code = fr.readVLongOutput(scratchReader);
final long fpSeek = code >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord);
f.hasTerms = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0;
@ -980,7 +980,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
} else if (isSeekFrame && !f.isFloor) {
final ByteArrayDataInput reader =
new ByteArrayDataInput(output.bytes, output.offset, output.length);
final long codeOrig = reader.readVLong();
final long codeOrig = fr.readVLongOutput(reader);
final long code =
(f.fp << Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS)
| (f.hasTerms ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0)

View File

@ -0,0 +1,44 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene90.blocktree;
import java.io.IOException;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.util.ArrayUtil;
/**
 * Round-trip tests for the MSB-first variable length long encoding written by {@link
 * Lucene90BlockTreeTermsWriter#writeMSBVLong} and read by {@link FieldReader#readMSBVLong}.
 */
public class TestMSBVLong extends LuceneTestCase {

  /** Checks that encoding followed by decoding returns the original value. */
  public void testMSBVLong() throws IOException {
    assertMSBVLong(Long.MAX_VALUE);

    // Values on both sides of every power of two cover each encoded length (1 to 9 bytes) and
    // the transitions between them, which a run of small consecutive values alone never reaches.
    for (int bits = 0; bits < Long.SIZE - 1; bits++) {
      final long powerOfTwo = 1L << bits;
      assertMSBVLong(powerOfTwo - 1);
      assertMSBVLong(powerOfTwo);
    }

    int iter = atLeast(10000);
    for (long i = 0; i < iter; i++) {
      assertMSBVLong(i);
    }
  }

  /**
   * Encodes {@code l}, decodes exactly the bytes that were written, and asserts that the decoded
   * value equals the input.
   */
  private static void assertMSBVLong(long l) throws IOException {
    // 10 bytes leaves room beyond the 9 bytes needed for Long.MAX_VALUE (63 bits / 7 per byte).
    byte[] bytes = new byte[10];
    ByteArrayDataOutput output = new ByteArrayDataOutput(bytes);
    Lucene90BlockTreeTermsWriter.writeMSBVLong(l, output);
    // Hand the reader only the written bytes so an over-read fails instead of consuming padding.
    ByteArrayDataInput in =
        new ByteArrayDataInput(ArrayUtil.copyOfSubArray(bytes, 0, output.getPosition()));
    long recovered = FieldReader.readMSBVLong(in);
    assertEquals(l + " != " + recovered, l, recovered);
  }
}