From 75d25ad6779dec194a2e0ef2a3263ce0fb872cf6 Mon Sep 17 00:00:00 2001 From: Bruno Roustant Date: Wed, 10 Jun 2020 16:09:32 +0200 Subject: [PATCH] LUCENE-9397: UniformSplit supports encodable fields metadata. --- lucene/CHANGES.txt | 2 + .../UniformSplitPostingsFormat.java | 12 ++--- .../uniformsplit/UniformSplitTermsReader.java | 44 ++++++++++++++----- .../uniformsplit/UniformSplitTermsWriter.java | 17 ++++++- .../STUniformSplitPostingsFormat.java | 2 +- .../STUniformSplitTermsReader.java | 5 +-- .../UnionFieldMetadataBuilder.java | 6 --- .../TestUniformSplitPostingFormat.java | 19 +++++--- ...ReaderTest.java => TestSTBlockReader.java} | 4 +- .../UniformSplitRot13PostingsFormat.java | 15 +++++++ .../STUniformSplitRot13PostingsFormat.java | 7 +++ 11 files changed, 97 insertions(+), 36 deletions(-) rename lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/{STBlockReaderTest.java => TestSTBlockReader.java} (98%) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 4759341b089..553f16abc17 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -206,6 +206,8 @@ Improvements * LUCENE-9392: Make FacetsConfig.DELIM_CHAR publicly accessible (Ankur Goel)) +* LUCENE-9397: UniformSplit supports encodable fields metadata. (Bruno Roustant) + Optimizations --------------------- diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitPostingsFormat.java index f982ed3ad2e..a58a1de7400 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitPostingsFormat.java @@ -47,7 +47,9 @@ public class UniformSplitPostingsFormat extends PostingsFormat { */ public static final String TERMS_BLOCKS_EXTENSION = "ustb"; - public static final int VERSION_CURRENT = 0; + public static final int VERSION_START = 0; + public static final int VERSION_ENCODABLE_FIELDS_METADATA = 1; + public static final int VERSION_CURRENT = VERSION_ENCODABLE_FIELDS_METADATA; public static final String NAME = "UniformSplit"; @@ -74,10 +76,10 @@ public class UniformSplitPostingsFormat extends PostingsFormat { * Must be greater than or equal to 0 and strictly less than {@code targetNumBlockLines}. * The block size will be {@code targetNumBlockLines}+-{@code deltaNumLines}. * The block size must always be less than or equal to {@link UniformSplitTermsWriter#MAX_NUM_BLOCK_LINES}. - * @param blockEncoder Optional block encoder, may be null if none. - * It can be used for compression or encryption. - * @param blockDecoder Optional block decoder, may be null if none. - * It can be used for compression or encryption. + * @param blockEncoder Optional block encoder, may be null if none. If present, it is used to encode all terms + * blocks, as well as the FST dictionary and the fields metadata. + * @param blockDecoder Optional block decoder, may be null if none. If present, it is used to decode all terms + * blocks, as well as the FST dictionary and the fields metadata. * @param dictionaryOnHeap Whether to force loading the terms dictionary on-heap. By default it is kept off-heap without * impact on performance. If block encoding/decoding is used, then the dictionary is always * loaded on-heap whatever this parameter value is. diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsReader.java index 9b2552b5017..377919dc81b 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsReader.java @@ -34,14 +34,14 @@ import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.Terms; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.DataInput; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.RamUsageEstimator; -import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.NAME; -import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.TERMS_BLOCKS_EXTENSION; -import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.TERMS_DICTIONARY_EXTENSION; -import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.VERSION_CURRENT; +import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.*; /** * A block-based terms index and dictionary based on the Uniform Split technique. @@ -51,12 +51,11 @@ import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.V */ public class UniformSplitTermsReader extends FieldsProducer { - protected static final int VERSION_START = 0; - private static final long BASE_RAM_USAGE = RamUsageEstimator.shallowSizeOfInstance(UniformSplitTermsReader.class) + RamUsageEstimator.shallowSizeOfInstance(IndexInput.class) * 2; protected final PostingsReaderBase postingsReader; + protected final int version; protected final IndexInput blockInput; protected final IndexInput dictionaryInput; @@ -93,7 +92,7 @@ public class UniformSplitTermsReader extends FieldsProducer { String termsName = IndexFileNames.segmentFileName(segmentName, state.segmentSuffix, termsBlocksExtension); blockInput = state.directory.openInput(termsName, state.context); - int version = CodecUtil.checkIndexHeader(blockInput, codecName, versionStart, + version = CodecUtil.checkIndexHeader(blockInput, codecName, versionStart, versionCurrent, state.segmentInfo.getId(), state.segmentSuffix); String indexName = IndexFileNames.segmentFileName(segmentName, state.segmentSuffix, dictionaryExtension); dictionaryInput = state.directory.openInput(indexName, state.context); @@ -105,7 +104,8 @@ public class UniformSplitTermsReader extends FieldsProducer { CodecUtil.retrieveChecksum(blockInput); seekFieldsMetadata(blockInput); - Collection fieldMetadataCollection = parseFieldsMetadata(blockInput, state.fieldInfos, fieldMetadataReader, state.segmentInfo.maxDoc()); + Collection fieldMetadataCollection = + readFieldsMetadata(blockInput, blockDecoder, state.fieldInfos, fieldMetadataReader, state.segmentInfo.maxDoc()); fieldToTermsMap = new HashMap<>(); this.blockInput = blockInput; @@ -143,16 +143,36 @@ public class UniformSplitTermsReader extends FieldsProducer { /** * @param indexInput {@link IndexInput} must be positioned to the fields metadata * details by calling {@link #seekFieldsMetadata(IndexInput)} before this call. + * @param blockDecoder Optional block decoder, may be null if none. */ - protected static Collection parseFieldsMetadata(IndexInput indexInput, FieldInfos fieldInfos, - FieldMetadata.Serializer fieldMetadataReader, int maxNumDocs) throws IOException { + protected Collection readFieldsMetadata(IndexInput indexInput, BlockDecoder blockDecoder, FieldInfos fieldInfos, + FieldMetadata.Serializer fieldMetadataReader, int maxNumDocs) throws IOException { int numFields = indexInput.readVInt(); if (numFields < 0) { throw new CorruptIndexException("Illegal number of fields= " + numFields, indexInput); } + return (blockDecoder != null && version >= VERSION_ENCODABLE_FIELDS_METADATA) ? + readEncodedFieldsMetadata(numFields, indexInput, blockDecoder, fieldInfos, fieldMetadataReader, maxNumDocs) + : readUnencodedFieldsMetadata(numFields, indexInput, fieldInfos, fieldMetadataReader, maxNumDocs); + } + + protected Collection readEncodedFieldsMetadata(int numFields, DataInput metadataInput, BlockDecoder blockDecoder, + FieldInfos fieldInfos, FieldMetadata.Serializer fieldMetadataReader, + int maxNumDocs) throws IOException { + long encodedLength = metadataInput.readVLong(); + if (encodedLength < 0) { + throw new CorruptIndexException("Illegal encoded length: " + encodedLength, metadataInput); + } + BytesRef decodedBytes = blockDecoder.decode(metadataInput, encodedLength); + DataInput decodedMetadataInput = new ByteArrayDataInput(decodedBytes.bytes, 0, decodedBytes.length); + return readUnencodedFieldsMetadata(numFields, decodedMetadataInput, fieldInfos, fieldMetadataReader, maxNumDocs); + } + + protected Collection readUnencodedFieldsMetadata(int numFields, DataInput metadataInput, FieldInfos fieldInfos, + FieldMetadata.Serializer fieldMetadataReader, int maxNumDocs) throws IOException { Collection fieldMetadataCollection = new ArrayList<>(numFields); for (int i = 0; i < numFields; i++) { - fieldMetadataCollection.add(fieldMetadataReader.read(indexInput, fieldInfos, maxNumDocs)); + fieldMetadataCollection.add(fieldMetadataReader.read(metadataInput, fieldInfos, maxNumDocs)); } return fieldMetadataCollection; } @@ -212,7 +232,7 @@ public class UniformSplitTermsReader extends FieldsProducer { /** * Positions the given {@link IndexInput} at the beginning of the fields metadata. */ - protected static void seekFieldsMetadata(IndexInput indexInput) throws IOException { + protected void seekFieldsMetadata(IndexInput indexInput) throws IOException { indexInput.seek(indexInput.length() - CodecUtil.footerLength() - 8); indexInput.seek(indexInput.readLong()); } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsWriter.java index 101b6b5942f..c4e089f5627 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsWriter.java @@ -249,11 +249,26 @@ public class UniformSplitTermsWriter extends FieldsConsumer { protected void writeFieldsMetadata(int fieldsNumber, ByteBuffersDataOutput fieldsOutput) throws IOException { long fieldsStartPosition = blockOutput.getFilePointer(); blockOutput.writeVInt(fieldsNumber); - fieldsOutput.copyTo(blockOutput); + if (blockEncoder == null) { + writeUnencodedFieldsMetadata(fieldsOutput); + } else { + writeEncodedFieldsMetadata(fieldsOutput); + } + // Must be a fixed length. Read by UniformSplitTermsReader when seeking fields metadata. blockOutput.writeLong(fieldsStartPosition); CodecUtil.writeFooter(blockOutput); } + protected void writeUnencodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException { + fieldsOutput.copyTo(blockOutput); + } + + protected void writeEncodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException { + BlockEncoder.WritableBytes encodedBytes = blockEncoder.encode(fieldsOutput.toDataInput(), fieldsOutput.size()); + blockOutput.writeVLong(encodedBytes.size()); + encodedBytes.writeTo(blockOutput); + } + /** * @return 1 if the field was written; 0 otherwise. */ diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitPostingsFormat.java index 57c15409904..730728ba7f4 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitPostingsFormat.java @@ -54,7 +54,7 @@ public class STUniformSplitPostingsFormat extends UniformSplitPostingsFormat { */ public static final String TERMS_BLOCKS_EXTENSION = "stustb"; - public static final int VERSION_CURRENT = 0; + public static final int VERSION_CURRENT = UniformSplitPostingsFormat.VERSION_CURRENT; public static final String NAME = "SharedTermsUniformSplit"; diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTermsReader.java index cc25a30cef6..5c2b24b5fca 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTermsReader.java @@ -30,10 +30,7 @@ import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.store.IndexInput; -import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.NAME; -import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.TERMS_BLOCKS_EXTENSION; -import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.TERMS_DICTIONARY_EXTENSION; -import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.VERSION_CURRENT; +import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.*; /** * A block-based terms index and dictionary based on the Uniform Split technique, diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/UnionFieldMetadataBuilder.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/UnionFieldMetadataBuilder.java index 85b6a27fd3b..4cf5c2623ae 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/UnionFieldMetadataBuilder.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/UnionFieldMetadataBuilder.java @@ -33,15 +33,9 @@ public class UnionFieldMetadataBuilder { private BytesRef maxLastTerm; public UnionFieldMetadataBuilder() { - reset(); - } - - public UnionFieldMetadataBuilder reset() { dictionaryStartFP = -1; minStartBlockFP = Long.MAX_VALUE; maxEndBlockFP = Long.MIN_VALUE; - maxLastTerm = null; - return this; } public UnionFieldMetadataBuilder addFieldMetadata(FieldMetadata fieldMetadata) { diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestUniformSplitPostingFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestUniformSplitPostingFormat.java index db1d6c12e61..9a68a14c21a 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestUniformSplitPostingFormat.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestUniformSplitPostingFormat.java @@ -51,17 +51,26 @@ public class TestUniformSplitPostingFormat extends BasePostingsFormatTestCase { @Before public void initialize() { + initializeInner(); + } + + protected void initializeInner() { UniformSplitRot13PostingsFormat.resetEncodingFlags(); } @After public void checkEncodingCalled() { if (checkEncoding) { - assertTrue(UniformSplitRot13PostingsFormat.blocksEncoded); - assertTrue(UniformSplitRot13PostingsFormat.dictionaryEncoded); - if (shouldCheckDecoderWasCalled) { - assertTrue(UniformSplitRot13PostingsFormat.decoderCalled); - } + checkEncodingCalledInner(); + } + } + + protected void checkEncodingCalledInner() { + assertTrue(UniformSplitRot13PostingsFormat.blocksEncoded); + assertTrue(UniformSplitRot13PostingsFormat.fieldsMetadataEncoded); + assertTrue(UniformSplitRot13PostingsFormat.dictionaryEncoded); + if (shouldCheckDecoderWasCalled) { + assertTrue(UniformSplitRot13PostingsFormat.decoderCalled); } } diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockReaderTest.java b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java similarity index 98% rename from lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockReaderTest.java rename to lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java index 6d09fe36e16..5707fb4f6a0 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockReaderTest.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java @@ -51,9 +51,9 @@ import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; -public class STBlockReaderTest extends LuceneTestCase { +public class TestSTBlockReader extends LuceneTestCase { - private static final String MOCK_BLOCK_OUTPUT_NAME = "STBlockReaderTest.tmp"; + private static final String MOCK_BLOCK_OUTPUT_NAME = "TestSTBlockReader.tmp"; private FieldInfos fieldInfos; private List blockLines; diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitRot13PostingsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitRot13PostingsFormat.java index 4b3a68034d8..26d14adb290 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitRot13PostingsFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitRot13PostingsFormat.java @@ -28,6 +28,7 @@ import org.apache.lucene.codecs.lucene84.Lucene84PostingsReader; import org.apache.lucene.codecs.lucene84.Lucene84PostingsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.store.ByteBuffersDataOutput; import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; @@ -40,6 +41,7 @@ public class UniformSplitRot13PostingsFormat extends PostingsFormat { public static volatile boolean encoderCalled; public static volatile boolean decoderCalled; public static volatile boolean blocksEncoded; + public static volatile boolean fieldsMetadataEncoded; public static volatile boolean dictionaryEncoded; protected final boolean dictionaryOnHeap; @@ -56,6 +58,7 @@ public class UniformSplitRot13PostingsFormat extends PostingsFormat { encoderCalled = false; decoderCalled = false; blocksEncoded = false; + fieldsMetadataEncoded = false; dictionaryEncoded = false; } @@ -86,6 +89,11 @@ public class UniformSplitRot13PostingsFormat extends PostingsFormat { super.writeDictionary(dictionaryBuilder); recordDictionaryEncodingCall(); } + @Override + protected void writeEncodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException { + super.writeEncodedFieldsMetadata(fieldsOutput); + recordFieldsMetadataEncodingCall(); + } }; } @@ -96,6 +104,13 @@ public class UniformSplitRot13PostingsFormat extends PostingsFormat { } } + protected void recordFieldsMetadataEncodingCall() { + if (encoderCalled) { + fieldsMetadataEncoded = true; + encoderCalled = false; + } + } + protected void recordDictionaryEncodingCall() { if (encoderCalled) { dictionaryEncoded = true; diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitRot13PostingsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitRot13PostingsFormat.java index a300e364237..04f3964e337 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitRot13PostingsFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitRot13PostingsFormat.java @@ -28,6 +28,7 @@ import org.apache.lucene.codecs.uniformsplit.UniformSplitRot13PostingsFormat; import org.apache.lucene.codecs.uniformsplit.UniformSplitTermsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.store.ByteBuffersDataOutput; /** * {@link STUniformSplitPostingsFormat} with block encoding using ROT13 cypher. @@ -50,6 +51,12 @@ public class STUniformSplitRot13PostingsFormat extends UniformSplitRot13Postings super.writeDictionary(dictionaryBuilder); recordDictionaryEncodingCall(); } + @Override + protected void writeEncodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException { + recordBlockEncodingCall(); + super.writeEncodedFieldsMetadata(fieldsOutput); + recordFieldsMetadataEncodingCall(); + } }; }