From c7cf9e8e4f9b28493f0b135b6a463c1e15e9a291 Mon Sep 17 00:00:00 2001 From: Bruno Roustant Date: Thu, 27 Feb 2020 12:56:58 +0100 Subject: [PATCH] LUCENE-9254: UniformSplit supports FST off-heap. Closes #1301 --- lucene/CHANGES.txt | 3 +- .../codecs/uniformsplit/FSTDictionary.java | 36 +++++++++++------- .../codecs/uniformsplit/FieldMetadata.java | 23 ++++++++---- .../codecs/uniformsplit/IndexDictionary.java | 3 +- .../UniformSplitPostingsFormat.java | 24 +++++++++--- .../uniformsplit/UniformSplitTerms.java | 9 ----- .../uniformsplit/UniformSplitTermsReader.java | 37 ++++++++++++------- .../STUniformSplitPostingsFormat.java | 21 ++++++++--- .../sharedterms/STUniformSplitTerms.java | 3 +- .../STUniformSplitTermsReader.java | 24 +++++++----- .../UnionFieldMetadataBuilder.java | 8 +++- .../uniformsplit/TestFSTDictionary.java | 3 +- .../TestUniformSplitPostingFormat.java | 26 ++++++++++--- .../TestSTUniformSplitPostingFormat.java | 12 +++--- .../UniformSplitRot13PostingsFormat.java | 8 ++-- .../STUniformSplitRot13PostingsFormat.java | 4 +- 16 files changed, 156 insertions(+), 88 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 495e9af283e..b6f5e1a0192 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -118,7 +118,8 @@ Improvements Optimizations --------------------- -(No changes) + +* LUCENE-9254: UniformSplit keeps FST off-heap. 
(Bruno Roustant) Bug Fixes --------------------- diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java index 08dbacfd08d..026e8724f31 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java @@ -30,6 +30,7 @@ import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.fst.BytesRefFSTEnum; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FSTCompiler; +import org.apache.lucene.util.fst.OffHeapFSTStore; import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.Util; @@ -56,24 +57,24 @@ public class FSTDictionary implements IndexDictionary { private static final long BASE_RAM_USAGE = RamUsageEstimator.shallowSizeOfInstance(FSTDictionary.class); - protected final FST dictionary; + protected final FST fst; - protected FSTDictionary(FST dictionary) { - this.dictionary = dictionary; + protected FSTDictionary(FST fst) { + this.fst = fst; } @Override public long ramBytesUsed() { - return BASE_RAM_USAGE + dictionary.ramBytesUsed(); + return BASE_RAM_USAGE + fst.ramBytesUsed(); } @Override public void write(DataOutput output, BlockEncoder blockEncoder) throws IOException { if (blockEncoder == null) { - dictionary.save(output); + fst.save(output); } else { ByteBuffersDataOutput bytesDataOutput = ByteBuffersDataOutput.newResettableInstance(); - dictionary.save(bytesDataOutput); + fst.save(bytesDataOutput); BlockEncoder.WritableBytes encodedBytes = blockEncoder.encode(bytesDataOutput.toDataInput(), bytesDataOutput.size()); output.writeVLong(encodedBytes.size()); encodedBytes.writeTo(output); @@ -84,7 +85,7 @@ public class FSTDictionary implements IndexDictionary { * Reads a {@link FSTDictionary} from the provided input. 
* @param blockDecoder The {@link BlockDecoder} to use for specific decoding; or null if none. */ - protected static FSTDictionary read(DataInput input, BlockDecoder blockDecoder) throws IOException { + protected static FSTDictionary read(DataInput input, BlockDecoder blockDecoder, boolean isFSTOnHeap) throws IOException { DataInput fstDataInput; if (blockDecoder == null) { fstDataInput = input; @@ -92,10 +93,14 @@ public class FSTDictionary implements IndexDictionary { long numBytes = input.readVLong(); BytesRef decodedBytes = blockDecoder.decode(input, numBytes); fstDataInput = new ByteArrayDataInput(decodedBytes.bytes, 0, decodedBytes.length); + // OffHeapFSTStore.init() requires a DataInput which is an instance of IndexInput. + // When the block is decoded we must load the FST on heap. + isFSTOnHeap = true; } PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton(); - FST dictionary = new FST<>(fstDataInput, fstOutputs); - return new FSTDictionary(dictionary); + FST fst = isFSTOnHeap ? new FST<>(fstDataInput, fstOutputs) + : new FST<>(fstDataInput, fstOutputs, new OffHeapFSTStore()); + return new FSTDictionary(fst); } @Override @@ -109,7 +114,7 @@ public class FSTDictionary implements IndexDictionary { */ protected class Browser implements IndexDictionary.Browser { - protected final BytesRefFSTEnum fstEnum = new BytesRefFSTEnum<>(dictionary); + protected final BytesRefFSTEnum fstEnum = new BytesRefFSTEnum<>(fst); @Override public long seekBlock(BytesRef term) throws IOException { @@ -127,16 +132,19 @@ public class FSTDictionary implements IndexDictionary { protected final IndexInput dictionaryInput; protected final BlockDecoder blockDecoder; + protected final boolean isFSTOnHeap; /** - * Lazy loaded immutable index dictionary (trie hold in RAM). + * Lazy loaded immutable index dictionary FST. + * The FST is either kept off-heap, or held in RAM on-heap.
*/ protected IndexDictionary dictionary; - public BrowserSupplier(IndexInput dictionaryInput, long startFilePointer, BlockDecoder blockDecoder) throws IOException { + public BrowserSupplier(IndexInput dictionaryInput, long dictionaryStartFP, BlockDecoder blockDecoder, boolean isFSTOnHeap) throws IOException { this.dictionaryInput = dictionaryInput.clone(); - this.dictionaryInput.seek(startFilePointer); + this.dictionaryInput.seek(dictionaryStartFP); this.blockDecoder = blockDecoder; + this.isFSTOnHeap = isFSTOnHeap; } @Override @@ -147,7 +155,7 @@ public class FSTDictionary implements IndexDictionary { if (dictionary == null) { synchronized (this) { if (dictionary == null) { - dictionary = read(dictionaryInput, blockDecoder); + dictionary = read(dictionaryInput, blockDecoder, isFSTOnHeap); } } } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FieldMetadata.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FieldMetadata.java index 69bd651602a..80055317139 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FieldMetadata.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FieldMetadata.java @@ -59,31 +59,38 @@ public class FieldMetadata implements Accountable { protected BytesRef lastTerm; /** - * Constructs a {@link FieldMetadata} used for writing the index. This {@link FieldMetadata} is mutable. - * + * Constructs field metadata for writing. * @param maxDoc The total number of documents in the segment being written. */ public FieldMetadata(FieldInfo fieldInfo, int maxDoc) { this(fieldInfo, maxDoc, true); } - public FieldMetadata(FieldInfo fieldInfo, int maxDoc, boolean isMutable) { - this(fieldInfo, maxDoc, isMutable, -1, -1, null); + /** + * Constructs immutable virtual field metadata for reading. 
+ */ + public FieldMetadata(long dictionaryStartFP, long firstBlockStartFP, long lastBlockStartFP, BytesRef lastTerm) { + this(null, 0, false); + this.dictionaryStartFP = dictionaryStartFP; + this.firstBlockStartFP = firstBlockStartFP; + this.lastBlockStartFP = lastBlockStartFP; + this.lastTerm = lastTerm; } /** + * Constructs field metadata for reading or writing. + * @param maxDoc The total number of documents in the segment being written. * @param isMutable Set true if this FieldMetadata is created for writing the index. Set false if it is used for reading the index. */ - public FieldMetadata(FieldInfo fieldInfo, int maxDoc, boolean isMutable, long firstBlockStartFP, long lastBlockStartFP, BytesRef lastTerm) { + protected FieldMetadata(FieldInfo fieldInfo, int maxDoc, boolean isMutable) { assert isMutable || maxDoc == 0; this.fieldInfo = fieldInfo; this.isMutable = isMutable; // docsSeen must not be set if this FieldMetadata is immutable, that means it is used for reading the index. this.docsSeen = isMutable ? new FixedBitSet(maxDoc) : null; this.dictionaryStartFP = -1; - this.firstBlockStartFP = firstBlockStartFP; - this.lastBlockStartFP = lastBlockStartFP; - this.lastTerm = lastTerm; + this.firstBlockStartFP = -1; + this.lastBlockStartFP = -1; } /** diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/IndexDictionary.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/IndexDictionary.java index e59748673d9..2953c1b9598 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/IndexDictionary.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/IndexDictionary.java @@ -112,5 +112,6 @@ public interface IndexDictionary extends Accountable { * {@link org.apache.lucene.index.TermsEnum#seekExact} are called (it is not loaded for a direct * all-terms enumeration). 
*/ - interface BrowserSupplier extends IOSupplier, Accountable {} + interface BrowserSupplier extends IOSupplier, Accountable { + } } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitPostingsFormat.java index f9d147a0814..f982ed3ad2e 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitPostingsFormat.java @@ -55,9 +55,14 @@ public class UniformSplitPostingsFormat extends PostingsFormat { protected final int deltaNumLines; protected final BlockEncoder blockEncoder; protected final BlockDecoder blockDecoder; + protected final boolean dictionaryOnHeap; + /** + * Creates a {@link UniformSplitPostingsFormat} with default settings. + */ public UniformSplitPostingsFormat() { - this(UniformSplitTermsWriter.DEFAULT_TARGET_NUM_BLOCK_LINES, UniformSplitTermsWriter.DEFAULT_DELTA_NUM_LINES, null, null); + this(UniformSplitTermsWriter.DEFAULT_TARGET_NUM_BLOCK_LINES, UniformSplitTermsWriter.DEFAULT_DELTA_NUM_LINES, + null, null, false); } /** @@ -73,12 +78,20 @@ public class UniformSplitPostingsFormat extends PostingsFormat { * It can be used for compression or encryption. * @param blockDecoder Optional block decoder, may be null if none. * It can be used for compression or encryption. + * @param dictionaryOnHeap Whether to force loading the terms dictionary on-heap. By default it is kept off-heap without + * impact on performance. If block encoding/decoding is used, then the dictionary is always + * loaded on-heap whatever this parameter value is. 
*/ - public UniformSplitPostingsFormat(int targetNumBlockLines, int deltaNumLines, BlockEncoder blockEncoder, BlockDecoder blockDecoder) { - this(NAME, targetNumBlockLines, deltaNumLines, blockEncoder, blockDecoder); + public UniformSplitPostingsFormat(int targetNumBlockLines, int deltaNumLines, BlockEncoder blockEncoder, BlockDecoder blockDecoder, + boolean dictionaryOnHeap) { + this(NAME, targetNumBlockLines, deltaNumLines, blockEncoder, blockDecoder, dictionaryOnHeap); } - protected UniformSplitPostingsFormat(String name, int targetNumBlockLines, int deltaNumLines, BlockEncoder blockEncoder, BlockDecoder blockDecoder) { + /** + * @see #UniformSplitPostingsFormat(int, int, BlockEncoder, BlockDecoder, boolean) + */ + protected UniformSplitPostingsFormat(String name, int targetNumBlockLines, int deltaNumLines, BlockEncoder blockEncoder, + BlockDecoder blockDecoder, boolean dictionaryOnHeap) { super(name); UniformSplitTermsWriter.validateSettings(targetNumBlockLines, deltaNumLines); validateBlockEncoder(blockEncoder, blockDecoder); @@ -86,6 +99,7 @@ public class UniformSplitPostingsFormat extends PostingsFormat { this.deltaNumLines = deltaNumLines; this.blockEncoder = blockEncoder; this.blockDecoder = blockDecoder; + this.dictionaryOnHeap = dictionaryOnHeap; } @Override @@ -125,7 +139,7 @@ public class UniformSplitPostingsFormat extends PostingsFormat { protected FieldsProducer createUniformSplitTermsReader(PostingsReaderBase postingsReader, SegmentReadState state, BlockDecoder blockDecoder) throws IOException { - return new UniformSplitTermsReader(postingsReader, state, blockDecoder); + return new UniformSplitTermsReader(postingsReader, state, blockDecoder, dictionaryOnHeap); } private static void validateBlockEncoder(BlockEncoder blockEncoder, BlockDecoder blockDecoder) { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTerms.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTerms.java index 
cc95daa2026..5f0da751833 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTerms.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTerms.java @@ -49,15 +49,6 @@ public class UniformSplitTerms extends Terms implements Accountable { protected final BlockDecoder blockDecoder; protected final IndexDictionary.BrowserSupplier dictionaryBrowserSupplier; - /** - * @param blockDecoder Optional block decoder, may be null if none. It can be used for decompression or decryption. - */ - protected UniformSplitTerms(IndexInput dictionaryInput, IndexInput blockInput, FieldMetadata fieldMetadata, - PostingsReaderBase postingsReader, BlockDecoder blockDecoder) throws IOException { - this(blockInput, fieldMetadata, postingsReader, blockDecoder, - new FSTDictionary.BrowserSupplier(dictionaryInput, fieldMetadata.getDictionaryStartFP(), blockDecoder)); - } - /** * @param blockDecoder Optional block decoder, may be null if none. It can be used for decompression or decryption. */ diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsReader.java index bda04063fee..9b2552b5017 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsReader.java @@ -65,21 +65,24 @@ public class UniformSplitTermsReader extends FieldsProducer { protected final Collection sortedFieldNames; /** - * @param blockDecoder Optional block decoder, may be null if none. - * It can be used for decompression or decryption. + * @param blockDecoder Optional block decoder, may be null if none. + * It can be used for decompression or decryption. + * @param dictionaryOnHeap Whether to force loading the terms dictionary on-heap. By default it is kept off-heap without + * impact on performance. 
If block encoding/decoding is used, then the dictionary is always + * loaded on-heap whatever this parameter value is. */ - public UniformSplitTermsReader(PostingsReaderBase postingsReader, SegmentReadState state, BlockDecoder blockDecoder) throws IOException { - this(postingsReader, state, blockDecoder, FieldMetadata.Serializer.INSTANCE, NAME, VERSION_START, VERSION_CURRENT, + public UniformSplitTermsReader(PostingsReaderBase postingsReader, SegmentReadState state, BlockDecoder blockDecoder, + boolean dictionaryOnHeap) throws IOException { + this(postingsReader, state, blockDecoder, dictionaryOnHeap, FieldMetadata.Serializer.INSTANCE, NAME, VERSION_START, VERSION_CURRENT, TERMS_BLOCKS_EXTENSION, TERMS_DICTIONARY_EXTENSION); } /** - * @param blockDecoder Optional block decoder, may be null if none. - * It can be used for decompression or decryption. + * @see #UniformSplitTermsReader(PostingsReaderBase, SegmentReadState, BlockDecoder, boolean) */ - protected UniformSplitTermsReader(PostingsReaderBase postingsReader, SegmentReadState state, - BlockDecoder blockDecoder, FieldMetadata.Serializer fieldMetadataReader, - String codecName, int versionStart, int versionCurrent, + protected UniformSplitTermsReader(PostingsReaderBase postingsReader, SegmentReadState state, BlockDecoder blockDecoder, + boolean dictionaryOnHeap, FieldMetadata.Serializer fieldMetadataReader, + String codecName, int versionStart, int versionCurrent, String termsBlocksExtension, String dictionaryExtension) throws IOException { IndexInput dictionaryInput = null; IndexInput blockInput = null; @@ -108,7 +111,7 @@ public class UniformSplitTermsReader extends FieldsProducer { this.blockInput = blockInput; this.dictionaryInput = dictionaryInput; - fillFieldMap(postingsReader, blockDecoder, dictionaryInput, blockInput, fieldMetadataCollection, state.fieldInfos); + fillFieldMap(postingsReader, state, blockDecoder, dictionaryOnHeap, dictionaryInput, blockInput, fieldMetadataCollection, state.fieldInfos); 
List fieldNames = new ArrayList<>(fieldToTermsMap.keySet()); Collections.sort(fieldNames); @@ -122,15 +125,21 @@ public class UniformSplitTermsReader extends FieldsProducer { } } - protected void fillFieldMap(PostingsReaderBase postingsReader, BlockDecoder blockDecoder, - IndexInput dictionaryInput, IndexInput blockInput, - Collection fieldMetadataCollection, FieldInfos fieldInfos) throws IOException { + protected void fillFieldMap(PostingsReaderBase postingsReader, SegmentReadState state, BlockDecoder blockDecoder, + boolean dictionaryOnHeap, IndexInput dictionaryInput, IndexInput blockInput, + Collection fieldMetadataCollection, FieldInfos fieldInfos) throws IOException { for (FieldMetadata fieldMetadata : fieldMetadataCollection) { + IndexDictionary.BrowserSupplier dictionaryBrowserSupplier = createDictionaryBrowserSupplier(state, dictionaryInput, fieldMetadata, blockDecoder, dictionaryOnHeap); fieldToTermsMap.put(fieldMetadata.getFieldInfo().name, - new UniformSplitTerms(dictionaryInput, blockInput, fieldMetadata, postingsReader, blockDecoder)); + new UniformSplitTerms(blockInput, fieldMetadata, postingsReader, blockDecoder, dictionaryBrowserSupplier)); } } + protected IndexDictionary.BrowserSupplier createDictionaryBrowserSupplier(SegmentReadState state, IndexInput dictionaryInput, FieldMetadata fieldMetadata, + BlockDecoder blockDecoder, boolean dictionaryOnHeap) throws IOException { + return new FSTDictionary.BrowserSupplier(dictionaryInput, fieldMetadata.getDictionaryStartFP(), blockDecoder, dictionaryOnHeap); + } + /** * @param indexInput {@link IndexInput} must be positioned to the fields metadata * details by calling {@link #seekFieldsMetadata(IndexInput)} before this call. 
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitPostingsFormat.java index 5ffc9100315..57c15409904 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitPostingsFormat.java @@ -58,16 +58,25 @@ public class STUniformSplitPostingsFormat extends UniformSplitPostingsFormat { public static final String NAME = "SharedTermsUniformSplit"; + /** + * Creates a {@link STUniformSplitPostingsFormat} with default settings. + */ public STUniformSplitPostingsFormat() { - this(UniformSplitTermsWriter.DEFAULT_TARGET_NUM_BLOCK_LINES, UniformSplitTermsWriter.DEFAULT_DELTA_NUM_LINES, null, null); + this(UniformSplitTermsWriter.DEFAULT_TARGET_NUM_BLOCK_LINES, UniformSplitTermsWriter.DEFAULT_DELTA_NUM_LINES, + null, null, false); } - public STUniformSplitPostingsFormat(int targetNumBlockLines, int deltaNumLines, BlockEncoder blockEncoder, BlockDecoder blockDecoder) { - this(NAME, targetNumBlockLines, deltaNumLines, blockEncoder, blockDecoder); + /** + * @see UniformSplitPostingsFormat#UniformSplitPostingsFormat(int, int, BlockEncoder, BlockDecoder, boolean) + */ + public STUniformSplitPostingsFormat(int targetNumBlockLines, int deltaNumLines, BlockEncoder blockEncoder, BlockDecoder blockDecoder, + boolean dictionaryOnHeap) { + this(NAME, targetNumBlockLines, deltaNumLines, blockEncoder, blockDecoder, dictionaryOnHeap); } - protected STUniformSplitPostingsFormat(String name, int targetNumBlockLines, int deltaNumLines, BlockEncoder blockEncoder, BlockDecoder blockDecoder) { - super(name, targetNumBlockLines, deltaNumLines, blockEncoder, blockDecoder); + protected STUniformSplitPostingsFormat(String name, int targetNumBlockLines, int deltaNumLines, BlockEncoder blockEncoder, + 
BlockDecoder blockDecoder, boolean dictionaryOnHeap) { + super(name, targetNumBlockLines, deltaNumLines, blockEncoder, blockDecoder, dictionaryOnHeap); } @Override @@ -79,6 +88,6 @@ public class STUniformSplitPostingsFormat extends UniformSplitPostingsFormat { @Override protected FieldsProducer createUniformSplitTermsReader(PostingsReaderBase postingsReader, SegmentReadState state, BlockDecoder blockDecoder) throws IOException { - return new STUniformSplitTermsReader(postingsReader, state, blockDecoder); + return new STUniformSplitTermsReader(postingsReader, state, blockDecoder, dictionaryOnHeap); } } \ No newline at end of file diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTerms.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTerms.java index 01d374c364e..819ff4d4043 100755 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTerms.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTerms.java @@ -43,7 +43,8 @@ public class STUniformSplitTerms extends UniformSplitTerms { protected STUniformSplitTerms(IndexInput blockInput, FieldMetadata fieldMetadata, FieldMetadata unionFieldMetadata, PostingsReaderBase postingsReader, - BlockDecoder blockDecoder, FieldInfos fieldInfos, IndexDictionary.BrowserSupplier dictionaryBrowserSupplier) { + BlockDecoder blockDecoder, FieldInfos fieldInfos, + IndexDictionary.BrowserSupplier dictionaryBrowserSupplier) { super(blockInput, fieldMetadata, postingsReader, blockDecoder, dictionaryBrowserSupplier); this.unionFieldMetadata = unionFieldMetadata; this.fieldInfos = fieldInfos; diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTermsReader.java index 50a17bce8dd..cc25a30cef6 100644 --- 
a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTermsReader.java @@ -22,7 +22,6 @@ import java.util.Collection; import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.uniformsplit.BlockDecoder; -import org.apache.lucene.codecs.uniformsplit.FSTDictionary; import org.apache.lucene.codecs.uniformsplit.FieldMetadata; import org.apache.lucene.codecs.uniformsplit.IndexDictionary; import org.apache.lucene.codecs.uniformsplit.UniformSplitTerms; @@ -46,26 +45,33 @@ import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPo */ public class STUniformSplitTermsReader extends UniformSplitTermsReader { - public STUniformSplitTermsReader(PostingsReaderBase postingsReader, SegmentReadState state, BlockDecoder blockDecoder) throws IOException { - this(postingsReader, state, blockDecoder, FieldMetadata.Serializer.INSTANCE, + /** + * @see UniformSplitTermsReader#UniformSplitTermsReader(PostingsReaderBase, SegmentReadState, BlockDecoder, boolean) + */ + public STUniformSplitTermsReader(PostingsReaderBase postingsReader, SegmentReadState state, BlockDecoder blockDecoder, + boolean dictionaryOnHeap) throws IOException { + this(postingsReader, state, blockDecoder, dictionaryOnHeap, FieldMetadata.Serializer.INSTANCE, NAME, VERSION_START, VERSION_CURRENT, TERMS_BLOCKS_EXTENSION, TERMS_DICTIONARY_EXTENSION); } - protected STUniformSplitTermsReader(PostingsReaderBase postingsReader, SegmentReadState state, - BlockDecoder blockDecoder, FieldMetadata.Serializer fieldMetadataReader, + /** + * @see UniformSplitTermsReader#UniformSplitTermsReader(PostingsReaderBase, SegmentReadState, BlockDecoder, boolean) + */ + protected STUniformSplitTermsReader(PostingsReaderBase postingsReader, SegmentReadState state, BlockDecoder blockDecoder, + boolean dictionaryOnHeap, FieldMetadata.Serializer 
fieldMetadataReader, String codecName, int versionStart, int versionCurrent, String termsBlocksExtension, String dictionaryExtension) throws IOException { - super(postingsReader, state, blockDecoder, fieldMetadataReader, codecName, versionStart, versionCurrent, termsBlocksExtension, dictionaryExtension); + super(postingsReader, state, blockDecoder, dictionaryOnHeap, fieldMetadataReader, codecName, versionStart, versionCurrent, termsBlocksExtension, dictionaryExtension); } @Override - protected void fillFieldMap(PostingsReaderBase postingsReader, BlockDecoder blockDecoder, - IndexInput dictionaryInput, IndexInput blockInput, + protected void fillFieldMap(PostingsReaderBase postingsReader, SegmentReadState state, BlockDecoder blockDecoder, + boolean dictionaryOnHeap, IndexInput dictionaryInput, IndexInput blockInput, Collection fieldMetadataCollection, FieldInfos fieldInfos) throws IOException { if (!fieldMetadataCollection.isEmpty()) { FieldMetadata unionFieldMetadata = createUnionFieldMetadata(fieldMetadataCollection); // Share the same immutable dictionary between all fields. 
- IndexDictionary.BrowserSupplier dictionaryBrowserSupplier = new FSTDictionary.BrowserSupplier(dictionaryInput, fieldMetadataCollection.iterator().next().getDictionaryStartFP(), blockDecoder); + IndexDictionary.BrowserSupplier dictionaryBrowserSupplier = createDictionaryBrowserSupplier(state, dictionaryInput, unionFieldMetadata, blockDecoder, dictionaryOnHeap); for (FieldMetadata fieldMetadata : fieldMetadataCollection) { fieldToTermsMap.put(fieldMetadata.getFieldInfo().name, new STUniformSplitTerms(blockInput, fieldMetadata, unionFieldMetadata, postingsReader, blockDecoder, fieldInfos, dictionaryBrowserSupplier)); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/UnionFieldMetadataBuilder.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/UnionFieldMetadataBuilder.java index 7c06df4e925..85b6a27fd3b 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/UnionFieldMetadataBuilder.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/UnionFieldMetadataBuilder.java @@ -27,6 +27,7 @@ import org.apache.lucene.util.BytesRef; */ public class UnionFieldMetadataBuilder { + private long dictionaryStartFP; private long minStartBlockFP; private long maxEndBlockFP; private BytesRef maxLastTerm; @@ -36,13 +37,16 @@ public class UnionFieldMetadataBuilder { } public UnionFieldMetadataBuilder reset() { - maxEndBlockFP = Long.MIN_VALUE; + dictionaryStartFP = -1; minStartBlockFP = Long.MAX_VALUE; + maxEndBlockFP = Long.MIN_VALUE; maxLastTerm = null; return this; } public UnionFieldMetadataBuilder addFieldMetadata(FieldMetadata fieldMetadata) { + assert dictionaryStartFP == -1 || dictionaryStartFP == fieldMetadata.getDictionaryStartFP(); + dictionaryStartFP = fieldMetadata.getDictionaryStartFP(); minStartBlockFP = Math.min(minStartBlockFP, fieldMetadata.getFirstBlockStartFP()); maxEndBlockFP = Math.max(maxEndBlockFP, fieldMetadata.getLastBlockStartFP()); if 
(maxLastTerm == null || maxLastTerm.compareTo(fieldMetadata.getLastTerm()) < 0) { @@ -55,6 +59,6 @@ public class UnionFieldMetadataBuilder { if (maxLastTerm == null) { throw new IllegalStateException("no field metadata was provided"); } - return new FieldMetadata(null, 0, false, minStartBlockFP, maxEndBlockFP, maxLastTerm); + return new FieldMetadata(dictionaryStartFP, minStartBlockFP, maxEndBlockFP, maxLastTerm); } } diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestFSTDictionary.java b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestFSTDictionary.java index 781af4882a8..8313d9d50e8 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestFSTDictionary.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestFSTDictionary.java @@ -140,6 +140,7 @@ public class TestFSTDictionary extends LuceneTestCase { private static FSTDictionary serializeAndReadDictionary(FSTDictionary srcDictionary, boolean shouldEncrypt) throws IOException { ByteBuffersDataOutput output = ByteBuffersDataOutput.newResettableInstance(); srcDictionary.write(output, shouldEncrypt ? Rot13CypherTestUtil.getBlockEncoder() : null); - return FSTDictionary.read(output.toDataInput(), shouldEncrypt ? Rot13CypherTestUtil.getBlockDecoder() : null); + // We must load the FST on-heap since we use a ByteBuffersDataInput which is not an instance of IndexInput. + return FSTDictionary.read(output.toDataInput(), shouldEncrypt ? 
Rot13CypherTestUtil.getBlockDecoder() : null, true); } } diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestUniformSplitPostingFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestUniformSplitPostingFormat.java index b4dad1befaa..db1d6c12e61 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestUniformSplitPostingFormat.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestUniformSplitPostingFormat.java @@ -18,6 +18,7 @@ package org.apache.lucene.codecs.uniformsplit; import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.index.BasePostingsFormatTestCase; import org.apache.lucene.util.TestUtil; import org.junit.After; @@ -28,10 +29,21 @@ import org.junit.Before; */ public class TestUniformSplitPostingFormat extends BasePostingsFormatTestCase { - private final Codec codec = TestUtil.alwaysPostingsFormat(new UniformSplitRot13PostingsFormat()); - + protected final boolean checkEncoding; + protected final Codec codec; private boolean shouldCheckDecoderWasCalled = true; + public TestUniformSplitPostingFormat() { + checkEncoding = random().nextBoolean(); + codec = TestUtil.alwaysPostingsFormat(getPostingsFormat()); + } + + protected PostingsFormat getPostingsFormat() { + return checkEncoding ? 
new UniformSplitRot13PostingsFormat() + : new UniformSplitPostingsFormat(UniformSplitTermsWriter.DEFAULT_TARGET_NUM_BLOCK_LINES, UniformSplitTermsWriter.DEFAULT_DELTA_NUM_LINES, + null, null, random().nextBoolean()); + } + @Override protected Codec getCodec() { return codec; @@ -44,10 +56,12 @@ public class TestUniformSplitPostingFormat extends BasePostingsFormatTestCase { @After public void checkEncodingCalled() { - assertTrue(UniformSplitRot13PostingsFormat.blocksEncoded); - assertTrue(UniformSplitRot13PostingsFormat.dictionaryEncoded); - if (shouldCheckDecoderWasCalled) { - assertTrue(UniformSplitRot13PostingsFormat.decoderCalled); + if (checkEncoding) { + assertTrue(UniformSplitRot13PostingsFormat.blocksEncoded); + assertTrue(UniformSplitRot13PostingsFormat.dictionaryEncoded); + if (shouldCheckDecoderWasCalled) { + assertTrue(UniformSplitRot13PostingsFormat.decoderCalled); + } } } diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTUniformSplitPostingFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTUniformSplitPostingFormat.java index 76dd26d06df..28c5baa7a7f 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTUniformSplitPostingFormat.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTUniformSplitPostingFormat.java @@ -17,19 +17,19 @@ package org.apache.lucene.codecs.uniformsplit.sharedterms; -import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.uniformsplit.TestUniformSplitPostingFormat; -import org.apache.lucene.util.TestUtil; +import org.apache.lucene.codecs.uniformsplit.UniformSplitTermsWriter; /** * Tests {@link STUniformSplitPostingsFormat} with block encoding using ROT13 cypher. 
*/ public class TestSTUniformSplitPostingFormat extends TestUniformSplitPostingFormat { - private final Codec codec = TestUtil.alwaysPostingsFormat(new STUniformSplitRot13PostingsFormat()); - @Override - protected Codec getCodec() { - return codec; + protected PostingsFormat getPostingsFormat() { + return checkEncoding ? new STUniformSplitRot13PostingsFormat() + : new STUniformSplitPostingsFormat(UniformSplitTermsWriter.DEFAULT_TARGET_NUM_BLOCK_LINES, UniformSplitTermsWriter.DEFAULT_DELTA_NUM_LINES, + null, null, random().nextBoolean()); } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitRot13PostingsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitRot13PostingsFormat.java index 76b6e9cacee..4b3a68034d8 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitRot13PostingsFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitRot13PostingsFormat.java @@ -41,13 +41,15 @@ public class UniformSplitRot13PostingsFormat extends PostingsFormat { public static volatile boolean decoderCalled; public static volatile boolean blocksEncoded; public static volatile boolean dictionaryEncoded; + protected final boolean dictionaryOnHeap; public UniformSplitRot13PostingsFormat() { - this("UniformSplitRot13"); + this("UniformSplitRot13", false); } - protected UniformSplitRot13PostingsFormat(String name) { + protected UniformSplitRot13PostingsFormat(String name, boolean dictionaryOnHeap) { super(name); + this.dictionaryOnHeap = dictionaryOnHeap; } public static void resetEncodingFlags() { @@ -135,7 +137,7 @@ public class UniformSplitRot13PostingsFormat extends PostingsFormat { } protected FieldsProducer createFieldsProducer(SegmentReadState segmentReadState, PostingsReaderBase postingsReader) throws IOException { - return new UniformSplitTermsReader(postingsReader, segmentReadState, getBlockDecoder()); + return 
new UniformSplitTermsReader(postingsReader, segmentReadState, getBlockDecoder(), dictionaryOnHeap); } protected BlockDecoder getBlockDecoder() { diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitRot13PostingsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitRot13PostingsFormat.java index 5252924c014..a300e364237 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitRot13PostingsFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitRot13PostingsFormat.java @@ -35,7 +35,7 @@ import org.apache.lucene.index.SegmentWriteState; public class STUniformSplitRot13PostingsFormat extends UniformSplitRot13PostingsFormat { public STUniformSplitRot13PostingsFormat() { - super("STUniformSplitRot13"); + super("STUniformSplitRot13", false); } protected FieldsConsumer createFieldsConsumer(SegmentWriteState segmentWriteState, PostingsWriterBase postingsWriter) throws IOException { @@ -54,6 +54,6 @@ public class STUniformSplitRot13PostingsFormat extends UniformSplitRot13Postings } protected FieldsProducer createFieldsProducer(SegmentReadState segmentReadState, PostingsReaderBase postingsReader) throws IOException { - return new STUniformSplitTermsReader(postingsReader, segmentReadState, getBlockDecoder()); + return new STUniformSplitTermsReader(postingsReader, segmentReadState, getBlockDecoder(), dictionaryOnHeap); } }