From 126834c09e69f0f0a3cfa5b09195f3ebfb5c1c0f Mon Sep 17 00:00:00 2001 From: Armin Braun Date: Thu, 27 Jun 2024 09:52:10 +0200 Subject: [PATCH] Reduce overhead for FSTs in FieldReader (#13524) We don't need to clone the index input we hold on to in OffHeapFSTStore since we only use it for slicing from known coordinates anyway. -> remove the cloning and add the infrastructure to initialize OffHeapFSTStore without seeking the input to the starting offset. --- .../lucene40/blocktree/FieldReader.java | 18 ++++------ .../lucene/codecs/memory/FSTTermsReader.java | 5 +-- .../codecs/uniformsplit/FSTDictionary.java | 13 ++++--- .../lucene90/blocktree/FieldReader.java | 9 ++--- .../java/org/apache/lucene/util/fst/FST.java | 11 +----- .../org/apache/lucene/util/fst/FSTStore.java | 34 ------------------- .../lucene/util/fst/OffHeapFSTStore.java | 25 +++++--------- .../lucene/util/fst/OnHeapFSTStore.java | 17 +++------- .../lucene/util/fst/Test2BFSTOffHeap.java | 15 ++++++-- .../search/suggest/document/NRTSuggester.java | 9 +++-- 10 files changed, 50 insertions(+), 106 deletions(-) delete mode 100644 lucene/core/src/java/org/apache/lucene/util/fst/FSTStore.java diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene40/blocktree/FieldReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene40/blocktree/FieldReader.java index 3387c2c7b5c..18ba067b2bb 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene40/blocktree/FieldReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene40/blocktree/FieldReader.java @@ -88,21 +88,15 @@ public final class FieldReader extends Terms { (new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong() >>> Lucene40BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS; // Initialize FST always off-heap. - final IndexInput clone = indexIn.clone(); - clone.seek(indexStartFP); + final FST.FSTMetadata fstMetadata; if (metaIn == indexIn) { // Only true before Lucene 8.6 - index = - new FST<>( - readMetadata(clone, ByteSequenceOutputs.getSingleton()), - clone, - new OffHeapFSTStore()); + final IndexInput clone = indexIn.clone(); + clone.seek(indexStartFP); + fstMetadata = readMetadata(clone, ByteSequenceOutputs.getSingleton()); } else { - index = - new FST<>( - readMetadata(metaIn, ByteSequenceOutputs.getSingleton()), - clone, - new OffHeapFSTStore()); + fstMetadata = readMetadata(metaIn, ByteSequenceOutputs.getSingleton()); } + index = FST.fromFSTReader(fstMetadata, new OffHeapFSTStore(indexIn, indexStartFP, fstMetadata)); /* if (false) { final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java index 6f4e2db0d6e..d500b75eaab 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java @@ -195,9 +195,10 @@ public class FSTTermsReader extends FieldsProducer { this.sumTotalTermFreq = sumTotalTermFreq; this.sumDocFreq = sumDocFreq; this.docCount = docCount; - OffHeapFSTStore offHeapFSTStore = new OffHeapFSTStore(); FSTTermOutputs outputs = new FSTTermOutputs(fieldInfo); - this.dict = new FST<>(FST.readMetadata(in, outputs), in, offHeapFSTStore); + final var fstMetadata = FST.readMetadata(in, outputs); + OffHeapFSTStore offHeapFSTStore = new OffHeapFSTStore(in, in.getFilePointer(), fstMetadata); + this.dict = FST.fromFSTReader(fstMetadata, offHeapFSTStore); in.skipBytes(offHeapFSTStore.size()); } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java index a73fef410dd..ced1dadab6f 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java @@ -90,10 +90,15 @@ public class FSTDictionary implements IndexDictionary { } PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton(); FST.FSTMetadata metadata = FST.readMetadata(fstDataInput, fstOutputs); - FST fst = - isFSTOnHeap - ? new FST<>(metadata, fstDataInput) - : new FST<>(metadata, fstDataInput, new OffHeapFSTStore()); + FST fst; + if (isFSTOnHeap) { + fst = new FST<>(metadata, fstDataInput); + } else { + final IndexInput indexInput = (IndexInput) fstDataInput; + fst = + FST.fromFSTReader( + metadata, new OffHeapFSTStore(indexInput, indexInput.getFilePointer(), metadata)); + } return new FSTDictionary(fst); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java index 6cef964a6a5..ed3e827c37c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java @@ -89,13 +89,8 @@ public final class FieldReader extends Terms { readVLongOutput(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)) >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS; // Initialize FST always off-heap. - final IndexInput clone = indexIn.clone(); - clone.seek(indexStartFP); - index = - new FST<>( - FST.readMetadata(metaIn, ByteSequenceOutputs.getSingleton()), - clone, - new OffHeapFSTStore()); + var metadata = FST.readMetadata(metaIn, ByteSequenceOutputs.getSingleton()); + index = FST.fromFSTReader(metadata, new OffHeapFSTStore(indexIn, indexStartFP, metadata)); /* if (false) { final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java index 6bb5718d5c7..17201194da4 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java @@ -417,16 +417,7 @@ public final class FST implements Accountable { * maxBlockBits set to {@link #DEFAULT_MAX_BLOCK_BITS} */ public FST(FSTMetadata metadata, DataInput in) throws IOException { - this(metadata, in, new OnHeapFSTStore(DEFAULT_MAX_BLOCK_BITS)); - } - - /** - * Load a previously saved FST with a metdata object and a FSTStore. If using {@link - * OnHeapFSTStore}, setting maxBlockBits allows you to control the size of the byte[] pages used - * to hold the FST bytes. - */ - public FST(FSTMetadata metadata, DataInput in, FSTStore fstStore) throws IOException { - this(metadata, fstStore.init(in, metadata.numBytes)); + this(metadata, new OnHeapFSTStore(DEFAULT_MAX_BLOCK_BITS, in, metadata.numBytes)); } /** Create the FST with a metadata object and a FSTReader. */ diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTStore.java deleted file mode 100644 index 682503b2c44..00000000000 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTStore.java +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.util.fst; - -import java.io.IOException; -import org.apache.lucene.store.DataInput; - -/** A type of {@link FSTReader} which needs data to be initialized before use */ -public interface FSTStore extends FSTReader { - - /** - * Initialize the FSTStore - * - * @param in the DataInput to read from - * @param numBytes the number of bytes to read - * @return this FSTStore - * @throws IOException if exception occurred during reading the DataInput - */ - FSTStore init(DataInput in, long numBytes) throws IOException; -} diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/OffHeapFSTStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/OffHeapFSTStore.java index f88715b191c..e8b00037d15 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/OffHeapFSTStore.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/OffHeapFSTStore.java @@ -17,7 +17,6 @@ package org.apache.lucene.util.fst; import java.io.IOException; -import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.RamUsageEstimator; @@ -28,27 +27,19 @@ import org.apache.lucene.util.RamUsageEstimator; * * @lucene.experimental */ -public final class OffHeapFSTStore implements FSTStore { +public final class OffHeapFSTStore implements FSTReader { private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(OffHeapFSTStore.class); - private IndexInput in; - private long offset; - private long numBytes; + private final IndexInput in; + private final long offset; + private final long numBytes; - @Override - public FSTStore init(DataInput in, long numBytes) throws IOException { - if (in instanceof IndexInput) { - this.in = (IndexInput) in; - this.numBytes = numBytes; - this.offset = this.in.getFilePointer(); - } else { - throw new IllegalArgumentException( - "parameter:in should be an instance of IndexInput for using OffHeapFSTStore, not a " - + in.getClass().getName()); - } - return this; + public OffHeapFSTStore(IndexInput in, long offset, FST.FSTMetadata metadata) { + this.in = in; + this.offset = offset; + this.numBytes = metadata.numBytes; } @Override diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java index 949babdaa88..15bb0bb5f8a 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java @@ -28,7 +28,7 @@ import org.apache.lucene.util.RamUsageEstimator; * * @lucene.experimental */ -public final class OnHeapFSTStore implements FSTStore { +public final class OnHeapFSTStore implements FSTReader { private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(OnHeapFSTStore.class); @@ -40,31 +40,24 @@ public final class OnHeapFSTStore implements FSTStore { private ReadWriteDataOutput dataOutput; /** Used at read time when the FST fits into a single byte[]. */ - private byte[] bytesArray; + private final byte[] bytesArray; - private final int maxBlockBits; - - public OnHeapFSTStore(int maxBlockBits) { + public OnHeapFSTStore(int maxBlockBits, DataInput in, long numBytes) throws IOException { if (maxBlockBits < 1 || maxBlockBits > 30) { throw new IllegalArgumentException("maxBlockBits should be 1 .. 30; got " + maxBlockBits); } - this.maxBlockBits = maxBlockBits; - } - - @Override - public FSTStore init(DataInput in, long numBytes) throws IOException { - if (numBytes > 1 << this.maxBlockBits) { + if (numBytes > 1 << maxBlockBits) { // FST is big: we need multiple pages dataOutput = (ReadWriteDataOutput) getOnHeapReaderWriter(maxBlockBits); dataOutput.copyBytes(in, numBytes); dataOutput.freeze(); + bytesArray = null; } else { // FST fits into a single block: use ByteArrayBytesStoreReader for less overhead bytesArray = new byte[(int) numBytes]; in.readBytes(bytesArray, 0, bytesArray.length); } - return this; } @Override diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFSTOffHeap.java b/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFSTOffHeap.java index 42fd4e1128a..2169c093fca 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFSTOffHeap.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFSTOffHeap.java @@ -93,7 +93,10 @@ public class Test2BFSTOffHeap extends LuceneTestCase { FST.FSTMetadata fstMetadata = fstCompiler.compile(); indexOutput.close(); try (IndexInput indexInput = dir.openInput("fst", IOContext.DEFAULT)) { - FST fst = new FST<>(fstMetadata, indexInput, new OffHeapFSTStore()); + FST fst = + FST.fromFSTReader( + fstMetadata, + new OffHeapFSTStore(indexInput, indexInput.getFilePointer(), fstMetadata)); for (int verify = 0; verify < 2; verify++) { System.out.println( @@ -181,7 +184,10 @@ public class Test2BFSTOffHeap extends LuceneTestCase { FST.FSTMetadata fstMetadata = fstCompiler.compile(); indexOutput.close(); try (IndexInput indexInput = dir.openInput("fst", IOContext.DEFAULT)) { - FST fst = new FST<>(fstMetadata, indexInput, new OffHeapFSTStore()); + FST fst = + FST.fromFSTReader( + fstMetadata, + new OffHeapFSTStore(indexInput, indexInput.getFilePointer(), fstMetadata)); for (int verify = 0; verify < 2; verify++) { System.out.println( @@ -266,7 +272,10 @@ public class Test2BFSTOffHeap extends LuceneTestCase { FST.FSTMetadata fstMetadata = fstCompiler.compile(); indexOutput.close(); try (IndexInput indexInput = dir.openInput("fst", IOContext.DEFAULT)) { - FST fst = new FST<>(fstMetadata, indexInput, new OffHeapFSTStore()); + FST fst = + FST.fromFSTReader( + fstMetadata, + new OffHeapFSTStore(indexInput, indexInput.getFilePointer(), fstMetadata)); for (int verify = 0; verify < 2; verify++) { diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggester.java index b059d3e7356..a56365fed9d 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggester.java @@ -341,11 +341,10 @@ public final class NRTSuggester implements Accountable { PairOutputs outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()); if (shouldLoadFSTOffHeap(input, fstLoadMode)) { - OffHeapFSTStore store = new OffHeapFSTStore(); - IndexInput clone = input.clone(); - clone.seek(input.getFilePointer()); - fst = new FST<>(FST.readMetadata(clone, outputs), clone, store); - input.seek(clone.getFilePointer() + store.size()); + final FST.FSTMetadata> fstMetadata = FST.readMetadata(input, outputs); + OffHeapFSTStore store = new OffHeapFSTStore(input, input.getFilePointer(), fstMetadata); + fst = FST.fromFSTReader(fstMetadata, store); + input.skipBytes(store.size()); } else { fst = new FST<>(FST.readMetadata(input, outputs), input); }