mirror of https://github.com/apache/lucene.git
Write MSB VLong for better outputs sharing in block tree index (#12631)
This commit is contained in:
parent
e20e245f47
commit
4f01de2a2d
|
@ -178,6 +178,9 @@ Optimizations
|
||||||
|
|
||||||
* GITHUB#12623: Use a MergeSorter taking advantage of extra storage for StableMSBRadixSorter. (Guo Feng)
|
* GITHUB#12623: Use a MergeSorter taking advantage of extra storage for StableMSBRadixSorter. (Guo Feng)
|
||||||
|
|
||||||
|
* GITHUB#12631: Write MSB VLong for better outputs sharing in block tree index, decreasing ~14% size
|
||||||
|
of .tip file. (Guo Feng)
|
||||||
|
|
||||||
Changes in runtime behavior
|
Changes in runtime behavior
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
|
|
|
@ -16,12 +16,15 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.codecs.lucene90.blocktree;
|
package org.apache.lucene.codecs.lucene90.blocktree;
|
||||||
|
|
||||||
|
import static org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader.VERSION_MSB_VLONG_OUTPUT;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import org.apache.lucene.index.FieldInfo;
|
import org.apache.lucene.index.FieldInfo;
|
||||||
import org.apache.lucene.index.IndexOptions;
|
import org.apache.lucene.index.IndexOptions;
|
||||||
import org.apache.lucene.index.Terms;
|
import org.apache.lucene.index.Terms;
|
||||||
import org.apache.lucene.index.TermsEnum;
|
import org.apache.lucene.index.TermsEnum;
|
||||||
import org.apache.lucene.store.ByteArrayDataInput;
|
import org.apache.lucene.store.ByteArrayDataInput;
|
||||||
|
import org.apache.lucene.store.DataInput;
|
||||||
import org.apache.lucene.store.IndexInput;
|
import org.apache.lucene.store.IndexInput;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||||
|
@ -82,7 +85,7 @@ public final class FieldReader extends Terms {
|
||||||
// + rootCode + " divisor=" + indexDivisor);
|
// + rootCode + " divisor=" + indexDivisor);
|
||||||
// }
|
// }
|
||||||
rootBlockFP =
|
rootBlockFP =
|
||||||
(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong()
|
readVLongOutput(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length))
|
||||||
>>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
|
>>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
|
||||||
// Initialize FST always off-heap.
|
// Initialize FST always off-heap.
|
||||||
final IndexInput clone = indexIn.clone();
|
final IndexInput clone = indexIn.clone();
|
||||||
|
@ -99,6 +102,32 @@ public final class FieldReader extends Terms {
|
||||||
*/
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
|
long readVLongOutput(DataInput in) throws IOException {
|
||||||
|
if (parent.version >= VERSION_MSB_VLONG_OUTPUT) {
|
||||||
|
return readMSBVLong(in);
|
||||||
|
} else {
|
||||||
|
return in.readVLong();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Decodes a variable length byte[] in MSB order back to long, as written by {@link
|
||||||
|
* Lucene90BlockTreeTermsWriter#writeMSBVLong}.
|
||||||
|
*
|
||||||
|
* <p>Package private for testing.
|
||||||
|
*/
|
||||||
|
static long readMSBVLong(DataInput in) throws IOException {
|
||||||
|
long l = 0L;
|
||||||
|
while (true) {
|
||||||
|
byte b = in.readByte();
|
||||||
|
l = (l << 7) | (b & 0x7FL);
|
||||||
|
if ((b & 0x80) == 0) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return l;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public BytesRef getMin() throws IOException {
|
public BytesRef getMin() throws IOException {
|
||||||
if (minTerm == null) {
|
if (minTerm == null) {
|
||||||
|
|
|
@ -146,7 +146,7 @@ final class IntersectTermsEnumFrame {
|
||||||
floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length);
|
floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length);
|
||||||
// Skip first long -- has redundant fp, hasTerms
|
// Skip first long -- has redundant fp, hasTerms
|
||||||
// flag, isFloor flag
|
// flag, isFloor flag
|
||||||
final long code = floorDataReader.readVLong();
|
final long code = ite.fr.readVLongOutput(floorDataReader);
|
||||||
if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
|
if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
|
||||||
// Floor frame
|
// Floor frame
|
||||||
numFollowFloorBlocks = floorDataReader.readVInt();
|
numFollowFloorBlocks = floorDataReader.readVInt();
|
||||||
|
|
|
@ -81,8 +81,13 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer {
|
||||||
/** Initial terms format. */
|
/** Initial terms format. */
|
||||||
public static final int VERSION_START = 0;
|
public static final int VERSION_START = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Version that encodes outputs as MSB VLong for better sharing of outputs in the FST, see GITHUB#12620.
|
||||||
|
*/
|
||||||
|
public static final int VERSION_MSB_VLONG_OUTPUT = 1;
|
||||||
|
|
||||||
/** Current terms format. */
|
/** Current terms format. */
|
||||||
public static final int VERSION_CURRENT = VERSION_START;
|
public static final int VERSION_CURRENT = VERSION_MSB_VLONG_OUTPUT;
|
||||||
|
|
||||||
/** Extension of terms index file */
|
/** Extension of terms index file */
|
||||||
static final String TERMS_INDEX_EXTENSION = "tip";
|
static final String TERMS_INDEX_EXTENSION = "tip";
|
||||||
|
|
|
@ -430,6 +430,25 @@ public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer {
|
||||||
return brToString(new BytesRef(b));
|
return brToString(new BytesRef(b));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Encodes long value to variable length byte[], in MSB order. Use {@link
|
||||||
|
* FieldReader#readMSBVLong} to decode.
|
||||||
|
*
|
||||||
|
* <p>Package private for testing
|
||||||
|
*/
|
||||||
|
static void writeMSBVLong(long l, DataOutput scratchBytes) throws IOException {
|
||||||
|
assert l >= 0;
|
||||||
|
// Keep zero bits on most significant byte to have more chance to get prefix bytes shared.
|
||||||
|
// e.g. we expect 0x7FFF stored as [0x81, 0xFF, 0x7F] but not [0xFF, 0xFF, 0x40]
|
||||||
|
final int bytesNeeded = (Long.SIZE - Long.numberOfLeadingZeros(l) - 1) / 7 + 1;
|
||||||
|
l <<= Long.SIZE - bytesNeeded * 7;
|
||||||
|
for (int i = 1; i < bytesNeeded; i++) {
|
||||||
|
scratchBytes.writeByte((byte) (((l >>> 57) & 0x7FL) | 0x80));
|
||||||
|
l = l << 7;
|
||||||
|
}
|
||||||
|
scratchBytes.writeByte((byte) (((l >>> 57) & 0x7FL)));
|
||||||
|
}
|
||||||
|
|
||||||
private static final class PendingBlock extends PendingEntry {
|
private static final class PendingBlock extends PendingEntry {
|
||||||
public final BytesRef prefix;
|
public final BytesRef prefix;
|
||||||
public final long fp;
|
public final long fp;
|
||||||
|
@ -472,10 +491,8 @@ public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer {
|
||||||
|
|
||||||
assert scratchBytes.size() == 0;
|
assert scratchBytes.size() == 0;
|
||||||
|
|
||||||
// TODO: try writing the leading vLong in MSB order
|
// write the leading vLong in MSB order for better outputs sharing in the FST
|
||||||
// (opposite of what Lucene does today), for better
|
writeMSBVLong(encodeOutput(fp, hasTerms, isFloor), scratchBytes);
|
||||||
// outputs sharing in the FST
|
|
||||||
scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor));
|
|
||||||
if (isFloor) {
|
if (isFloor) {
|
||||||
scratchBytes.writeVInt(blocks.size() - 1);
|
scratchBytes.writeVInt(blocks.size() - 1);
|
||||||
for (int i = 1; i < blocks.size(); i++) {
|
for (int i = 1; i < blocks.size(); i++) {
|
||||||
|
|
|
@ -236,7 +236,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
||||||
SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length)
|
SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
scratchReader.reset(frameData.bytes, frameData.offset, frameData.length);
|
scratchReader.reset(frameData.bytes, frameData.offset, frameData.length);
|
||||||
final long code = scratchReader.readVLong();
|
final long code = fr.readVLongOutput(scratchReader);
|
||||||
final long fpSeek = code >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
|
final long fpSeek = code >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
|
||||||
final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord);
|
final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord);
|
||||||
f.hasTerms = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0;
|
f.hasTerms = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0;
|
||||||
|
@ -980,7 +980,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
||||||
} else if (isSeekFrame && !f.isFloor) {
|
} else if (isSeekFrame && !f.isFloor) {
|
||||||
final ByteArrayDataInput reader =
|
final ByteArrayDataInput reader =
|
||||||
new ByteArrayDataInput(output.bytes, output.offset, output.length);
|
new ByteArrayDataInput(output.bytes, output.offset, output.length);
|
||||||
final long codeOrig = reader.readVLong();
|
final long codeOrig = fr.readVLongOutput(reader);
|
||||||
final long code =
|
final long code =
|
||||||
(f.fp << Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS)
|
(f.fp << Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS)
|
||||||
| (f.hasTerms ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0)
|
| (f.hasTerms ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0)
|
||||||
|
|
|
@ -0,0 +1,44 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.codecs.lucene90.blocktree;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import org.apache.lucene.store.ByteArrayDataInput;
|
||||||
|
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||||
|
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||||
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
|
|
||||||
|
public class TestMSBVLong extends LuceneTestCase {
|
||||||
|
|
||||||
|
public void testMSBVLong() throws IOException {
|
||||||
|
assertMSBVLong(Long.MAX_VALUE);
|
||||||
|
int iter = atLeast(10000);
|
||||||
|
for (long i = 0; i < iter; i++) {
|
||||||
|
assertMSBVLong(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void assertMSBVLong(long l) throws IOException {
|
||||||
|
byte[] bytes = new byte[10];
|
||||||
|
ByteArrayDataOutput output = new ByteArrayDataOutput(bytes);
|
||||||
|
Lucene90BlockTreeTermsWriter.writeMSBVLong(l, output);
|
||||||
|
ByteArrayDataInput in =
|
||||||
|
new ByteArrayDataInput(ArrayUtil.copyOfSubArray(bytes, 0, output.getPosition()));
|
||||||
|
long recovered = FieldReader.readMSBVLong(in);
|
||||||
|
assertEquals(l + " != " + recovered, l, recovered);
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue