mirror of https://github.com/apache/lucene.git
Write MSB VLong for better outputs sharing in block tree index (#12631)
This commit is contained in:
parent
e20e245f47
commit
4f01de2a2d
|
@ -178,6 +178,9 @@ Optimizations
|
|||
|
||||
* GITHUB#12623: Use a MergeSorter taking advantage of extra storage for StableMSBRadixSorter. (Guo Feng)
|
||||
|
||||
* GITHUB#12623: Write MSB VLong for better outputs sharing in block tree index, decreasing ~14% size
|
||||
of .tip file. (Guo Feng)
|
||||
|
||||
Changes in runtime behavior
|
||||
---------------------
|
||||
|
||||
|
|
|
@ -16,12 +16,15 @@
|
|||
*/
|
||||
package org.apache.lucene.codecs.lucene90.blocktree;
|
||||
|
||||
import static org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader.VERSION_MSB_VLONG_OUTPUT;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||
|
@ -82,7 +85,7 @@ public final class FieldReader extends Terms {
|
|||
// + rootCode + " divisor=" + indexDivisor);
|
||||
// }
|
||||
rootBlockFP =
|
||||
(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong()
|
||||
readVLongOutput(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length))
|
||||
>>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
|
||||
// Initialize FST always off-heap.
|
||||
final IndexInput clone = indexIn.clone();
|
||||
|
@ -99,6 +102,32 @@ public final class FieldReader extends Terms {
|
|||
*/
|
||||
}
|
||||
|
||||
long readVLongOutput(DataInput in) throws IOException {
|
||||
if (parent.version >= VERSION_MSB_VLONG_OUTPUT) {
|
||||
return readMSBVLong(in);
|
||||
} else {
|
||||
return in.readVLong();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Decodes a variable length byte[] in MSB order back to long, as written by {@link
|
||||
* Lucene90BlockTreeTermsWriter#writeMSBVLong}.
|
||||
*
|
||||
* <p>Package private for testing.
|
||||
*/
|
||||
static long readMSBVLong(DataInput in) throws IOException {
|
||||
long l = 0L;
|
||||
while (true) {
|
||||
byte b = in.readByte();
|
||||
l = (l << 7) | (b & 0x7FL);
|
||||
if ((b & 0x80) == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return l;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef getMin() throws IOException {
|
||||
if (minTerm == null) {
|
||||
|
|
|
@ -146,7 +146,7 @@ final class IntersectTermsEnumFrame {
|
|||
floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length);
|
||||
// Skip first long -- has redundant fp, hasTerms
|
||||
// flag, isFloor flag
|
||||
final long code = floorDataReader.readVLong();
|
||||
final long code = ite.fr.readVLongOutput(floorDataReader);
|
||||
if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
|
||||
// Floor frame
|
||||
numFollowFloorBlocks = floorDataReader.readVInt();
|
||||
|
|
|
@ -81,8 +81,13 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer {
|
|||
/** Initial terms format. */
|
||||
public static final int VERSION_START = 0;
|
||||
|
||||
/**
|
||||
* Version that encode output as MSB VLong for better outputs sharing in FST, see GITHUB#12620.
|
||||
*/
|
||||
public static final int VERSION_MSB_VLONG_OUTPUT = 1;
|
||||
|
||||
/** Current terms format. */
|
||||
public static final int VERSION_CURRENT = VERSION_START;
|
||||
public static final int VERSION_CURRENT = VERSION_MSB_VLONG_OUTPUT;
|
||||
|
||||
/** Extension of terms index file */
|
||||
static final String TERMS_INDEX_EXTENSION = "tip";
|
||||
|
|
|
@ -430,6 +430,25 @@ public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer {
|
|||
return brToString(new BytesRef(b));
|
||||
}
|
||||
|
||||
/**
|
||||
* Encodes long value to variable length byte[], in MSB order. Use {@link
|
||||
* FieldReader#readMSBVLong} to decode.
|
||||
*
|
||||
* <p>Package private for testing
|
||||
*/
|
||||
static void writeMSBVLong(long l, DataOutput scratchBytes) throws IOException {
|
||||
assert l >= 0;
|
||||
// Keep zero bits on most significant byte to have more chance to get prefix bytes shared.
|
||||
// e.g. we expect 0x7FFF stored as [0x81, 0xFF, 0x7F] but not [0xFF, 0xFF, 0x40]
|
||||
final int bytesNeeded = (Long.SIZE - Long.numberOfLeadingZeros(l) - 1) / 7 + 1;
|
||||
l <<= Long.SIZE - bytesNeeded * 7;
|
||||
for (int i = 1; i < bytesNeeded; i++) {
|
||||
scratchBytes.writeByte((byte) (((l >>> 57) & 0x7FL) | 0x80));
|
||||
l = l << 7;
|
||||
}
|
||||
scratchBytes.writeByte((byte) (((l >>> 57) & 0x7FL)));
|
||||
}
|
||||
|
||||
private static final class PendingBlock extends PendingEntry {
|
||||
public final BytesRef prefix;
|
||||
public final long fp;
|
||||
|
@ -472,10 +491,8 @@ public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer {
|
|||
|
||||
assert scratchBytes.size() == 0;
|
||||
|
||||
// TODO: try writing the leading vLong in MSB order
|
||||
// (opposite of what Lucene does today), for better
|
||||
// outputs sharing in the FST
|
||||
scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor));
|
||||
// write the leading vLong in MSB order for better outputs sharing in the FST
|
||||
writeMSBVLong(encodeOutput(fp, hasTerms, isFloor), scratchBytes);
|
||||
if (isFloor) {
|
||||
scratchBytes.writeVInt(blocks.size() - 1);
|
||||
for (int i = 1; i < blocks.size(); i++) {
|
||||
|
|
|
@ -236,7 +236,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length)
|
||||
throws IOException {
|
||||
scratchReader.reset(frameData.bytes, frameData.offset, frameData.length);
|
||||
final long code = scratchReader.readVLong();
|
||||
final long code = fr.readVLongOutput(scratchReader);
|
||||
final long fpSeek = code >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
|
||||
final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord);
|
||||
f.hasTerms = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0;
|
||||
|
@ -980,7 +980,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
} else if (isSeekFrame && !f.isFloor) {
|
||||
final ByteArrayDataInput reader =
|
||||
new ByteArrayDataInput(output.bytes, output.offset, output.length);
|
||||
final long codeOrig = reader.readVLong();
|
||||
final long codeOrig = fr.readVLongOutput(reader);
|
||||
final long code =
|
||||
(f.fp << Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS)
|
||||
| (f.hasTerms ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0)
|
||||
|
|
|
@ -0,0 +1,44 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene90.blocktree;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
|
||||
public class TestMSBVLong extends LuceneTestCase {
|
||||
|
||||
public void testMSBVLong() throws IOException {
|
||||
assertMSBVLong(Long.MAX_VALUE);
|
||||
int iter = atLeast(10000);
|
||||
for (long i = 0; i < iter; i++) {
|
||||
assertMSBVLong(i);
|
||||
}
|
||||
}
|
||||
|
||||
private static void assertMSBVLong(long l) throws IOException {
|
||||
byte[] bytes = new byte[10];
|
||||
ByteArrayDataOutput output = new ByteArrayDataOutput(bytes);
|
||||
Lucene90BlockTreeTermsWriter.writeMSBVLong(l, output);
|
||||
ByteArrayDataInput in =
|
||||
new ByteArrayDataInput(ArrayUtil.copyOfSubArray(bytes, 0, output.getPosition()));
|
||||
long recovered = FieldReader.readMSBVLong(in);
|
||||
assertEquals(l + " != " + recovered, l, recovered);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue