Reduce overhead for FSTs in FieldReader (#13524)

We don't need to clone the index input we hold on to in OffHeapFSTStore,
since we only use it for slicing from known coordinates anyway.
Therefore, remove the cloning and add the infrastructure to initialize
OffHeapFSTStore without seeking the input to the starting offset.
This commit is contained in:
Armin Braun 2024-06-27 09:52:10 +02:00 committed by GitHub
parent 33a4c1d8ef
commit 126834c09e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 50 additions and 106 deletions

View File

@ -88,21 +88,15 @@ public final class FieldReader extends Terms {
(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong()
>>> Lucene40BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
// Initialize FST always off-heap.
final IndexInput clone = indexIn.clone();
clone.seek(indexStartFP);
final FST.FSTMetadata<BytesRef> fstMetadata;
if (metaIn == indexIn) { // Only true before Lucene 8.6
index =
new FST<>(
readMetadata(clone, ByteSequenceOutputs.getSingleton()),
clone,
new OffHeapFSTStore());
final IndexInput clone = indexIn.clone();
clone.seek(indexStartFP);
fstMetadata = readMetadata(clone, ByteSequenceOutputs.getSingleton());
} else {
index =
new FST<>(
readMetadata(metaIn, ByteSequenceOutputs.getSingleton()),
clone,
new OffHeapFSTStore());
fstMetadata = readMetadata(metaIn, ByteSequenceOutputs.getSingleton());
}
index = FST.fromFSTReader(fstMetadata, new OffHeapFSTStore(indexIn, indexStartFP, fstMetadata));
/*
if (false) {
final String dotFileName = segment + "_" + fieldInfo.name + ".dot";

View File

@ -195,9 +195,10 @@ public class FSTTermsReader extends FieldsProducer {
this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
OffHeapFSTStore offHeapFSTStore = new OffHeapFSTStore();
FSTTermOutputs outputs = new FSTTermOutputs(fieldInfo);
this.dict = new FST<>(FST.readMetadata(in, outputs), in, offHeapFSTStore);
final var fstMetadata = FST.readMetadata(in, outputs);
OffHeapFSTStore offHeapFSTStore = new OffHeapFSTStore(in, in.getFilePointer(), fstMetadata);
this.dict = FST.fromFSTReader(fstMetadata, offHeapFSTStore);
in.skipBytes(offHeapFSTStore.size());
}

View File

@ -90,10 +90,15 @@ public class FSTDictionary implements IndexDictionary {
}
PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton();
FST.FSTMetadata<Long> metadata = FST.readMetadata(fstDataInput, fstOutputs);
FST<Long> fst =
isFSTOnHeap
? new FST<>(metadata, fstDataInput)
: new FST<>(metadata, fstDataInput, new OffHeapFSTStore());
FST<Long> fst;
if (isFSTOnHeap) {
fst = new FST<>(metadata, fstDataInput);
} else {
final IndexInput indexInput = (IndexInput) fstDataInput;
fst =
FST.fromFSTReader(
metadata, new OffHeapFSTStore(indexInput, indexInput.getFilePointer(), metadata));
}
return new FSTDictionary(fst);
}

View File

@ -89,13 +89,8 @@ public final class FieldReader extends Terms {
readVLongOutput(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length))
>>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
// Initialize FST always off-heap.
final IndexInput clone = indexIn.clone();
clone.seek(indexStartFP);
index =
new FST<>(
FST.readMetadata(metaIn, ByteSequenceOutputs.getSingleton()),
clone,
new OffHeapFSTStore());
var metadata = FST.readMetadata(metaIn, ByteSequenceOutputs.getSingleton());
index = FST.fromFSTReader(metadata, new OffHeapFSTStore(indexIn, indexStartFP, metadata));
/*
if (false) {
final String dotFileName = segment + "_" + fieldInfo.name + ".dot";

View File

@ -417,16 +417,7 @@ public final class FST<T> implements Accountable {
* maxBlockBits set to {@link #DEFAULT_MAX_BLOCK_BITS}
*/
public FST(FSTMetadata<T> metadata, DataInput in) throws IOException {
this(metadata, in, new OnHeapFSTStore(DEFAULT_MAX_BLOCK_BITS));
}
/**
* Load a previously saved FST with a metdata object and a FSTStore. If using {@link
* OnHeapFSTStore}, setting maxBlockBits allows you to control the size of the byte[] pages used
* to hold the FST bytes.
*/
public FST(FSTMetadata<T> metadata, DataInput in, FSTStore fstStore) throws IOException {
this(metadata, fstStore.init(in, metadata.numBytes));
this(metadata, new OnHeapFSTStore(DEFAULT_MAX_BLOCK_BITS, in, metadata.numBytes));
}
/** Create the FST with a metadata object and a FSTReader. */

View File

@ -1,34 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.fst;
import java.io.IOException;
import org.apache.lucene.store.DataInput;
/** A type of {@link FSTReader} which needs data to be initialized before use */
public interface FSTStore extends FSTReader {
/**
* Initialize the FSTStore
*
* @param in the DataInput to read from
* @param numBytes the number of bytes to read
* @return this FSTStore
* @throws IOException if exception occurred during reading the DataInput
*/
FSTStore init(DataInput in, long numBytes) throws IOException;
}

View File

@ -17,7 +17,6 @@
package org.apache.lucene.util.fst;
import java.io.IOException;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.RamUsageEstimator;
@ -28,27 +27,19 @@ import org.apache.lucene.util.RamUsageEstimator;
*
* @lucene.experimental
*/
public final class OffHeapFSTStore implements FSTStore {
public final class OffHeapFSTStore implements FSTReader {
private static final long BASE_RAM_BYTES_USED =
RamUsageEstimator.shallowSizeOfInstance(OffHeapFSTStore.class);
private IndexInput in;
private long offset;
private long numBytes;
private final IndexInput in;
private final long offset;
private final long numBytes;
@Override
public FSTStore init(DataInput in, long numBytes) throws IOException {
if (in instanceof IndexInput) {
this.in = (IndexInput) in;
this.numBytes = numBytes;
this.offset = this.in.getFilePointer();
} else {
throw new IllegalArgumentException(
"parameter:in should be an instance of IndexInput for using OffHeapFSTStore, not a "
+ in.getClass().getName());
}
return this;
public OffHeapFSTStore(IndexInput in, long offset, FST.FSTMetadata<?> metadata) {
this.in = in;
this.offset = offset;
this.numBytes = metadata.numBytes;
}
@Override

View File

@ -28,7 +28,7 @@ import org.apache.lucene.util.RamUsageEstimator;
*
* @lucene.experimental
*/
public final class OnHeapFSTStore implements FSTStore {
public final class OnHeapFSTStore implements FSTReader {
private static final long BASE_RAM_BYTES_USED =
RamUsageEstimator.shallowSizeOfInstance(OnHeapFSTStore.class);
@ -40,31 +40,24 @@ public final class OnHeapFSTStore implements FSTStore {
private ReadWriteDataOutput dataOutput;
/** Used at read time when the FST fits into a single byte[]. */
private byte[] bytesArray;
private final byte[] bytesArray;
private final int maxBlockBits;
public OnHeapFSTStore(int maxBlockBits) {
public OnHeapFSTStore(int maxBlockBits, DataInput in, long numBytes) throws IOException {
if (maxBlockBits < 1 || maxBlockBits > 30) {
throw new IllegalArgumentException("maxBlockBits should be 1 .. 30; got " + maxBlockBits);
}
this.maxBlockBits = maxBlockBits;
}
@Override
public FSTStore init(DataInput in, long numBytes) throws IOException {
if (numBytes > 1 << this.maxBlockBits) {
if (numBytes > 1 << maxBlockBits) {
// FST is big: we need multiple pages
dataOutput = (ReadWriteDataOutput) getOnHeapReaderWriter(maxBlockBits);
dataOutput.copyBytes(in, numBytes);
dataOutput.freeze();
bytesArray = null;
} else {
// FST fits into a single block: use ByteArrayBytesStoreReader for less overhead
bytesArray = new byte[(int) numBytes];
in.readBytes(bytesArray, 0, bytesArray.length);
}
return this;
}
@Override

View File

@ -93,7 +93,10 @@ public class Test2BFSTOffHeap extends LuceneTestCase {
FST.FSTMetadata<Object> fstMetadata = fstCompiler.compile();
indexOutput.close();
try (IndexInput indexInput = dir.openInput("fst", IOContext.DEFAULT)) {
FST<Object> fst = new FST<>(fstMetadata, indexInput, new OffHeapFSTStore());
FST<Object> fst =
FST.fromFSTReader(
fstMetadata,
new OffHeapFSTStore(indexInput, indexInput.getFilePointer(), fstMetadata));
for (int verify = 0; verify < 2; verify++) {
System.out.println(
@ -181,7 +184,10 @@ public class Test2BFSTOffHeap extends LuceneTestCase {
FST.FSTMetadata<BytesRef> fstMetadata = fstCompiler.compile();
indexOutput.close();
try (IndexInput indexInput = dir.openInput("fst", IOContext.DEFAULT)) {
FST<BytesRef> fst = new FST<>(fstMetadata, indexInput, new OffHeapFSTStore());
FST<BytesRef> fst =
FST.fromFSTReader(
fstMetadata,
new OffHeapFSTStore(indexInput, indexInput.getFilePointer(), fstMetadata));
for (int verify = 0; verify < 2; verify++) {
System.out.println(
@ -266,7 +272,10 @@ public class Test2BFSTOffHeap extends LuceneTestCase {
FST.FSTMetadata<Long> fstMetadata = fstCompiler.compile();
indexOutput.close();
try (IndexInput indexInput = dir.openInput("fst", IOContext.DEFAULT)) {
FST<Long> fst = new FST<>(fstMetadata, indexInput, new OffHeapFSTStore());
FST<Long> fst =
FST.fromFSTReader(
fstMetadata,
new OffHeapFSTStore(indexInput, indexInput.getFilePointer(), fstMetadata));
for (int verify = 0; verify < 2; verify++) {

View File

@ -341,11 +341,10 @@ public final class NRTSuggester implements Accountable {
PairOutputs<Long, BytesRef> outputs =
new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
if (shouldLoadFSTOffHeap(input, fstLoadMode)) {
OffHeapFSTStore store = new OffHeapFSTStore();
IndexInput clone = input.clone();
clone.seek(input.getFilePointer());
fst = new FST<>(FST.readMetadata(clone, outputs), clone, store);
input.seek(clone.getFilePointer() + store.size());
final FST.FSTMetadata<Pair<Long, BytesRef>> fstMetadata = FST.readMetadata(input, outputs);
OffHeapFSTStore store = new OffHeapFSTStore(input, input.getFilePointer(), fstMetadata);
fst = FST.fromFSTReader(fstMetadata, store);
input.skipBytes(store.size());
} else {
fst = new FST<>(FST.readMetadata(input, outputs), input);
}