LUCENE-9397: UniformSplit supports encodable fields metadata.

This commit is contained in:
Bruno Roustant 2020-06-10 16:09:32 +02:00
parent 36109ec362
commit 75d25ad677
No known key found for this signature in database
GPG Key ID: CD28DABB95360525
11 changed files with 97 additions and 36 deletions

View File

@ -206,6 +206,8 @@ Improvements
* LUCENE-9392: Make FacetsConfig.DELIM_CHAR publicly accessible (Ankur Goel) * LUCENE-9392: Make FacetsConfig.DELIM_CHAR publicly accessible (Ankur Goel)
* LUCENE-9397: UniformSplit supports encodable fields metadata. (Bruno Roustant)
Optimizations Optimizations
--------------------- ---------------------

View File

@ -47,7 +47,9 @@ public class UniformSplitPostingsFormat extends PostingsFormat {
*/ */
public static final String TERMS_BLOCKS_EXTENSION = "ustb"; public static final String TERMS_BLOCKS_EXTENSION = "ustb";
public static final int VERSION_CURRENT = 0; public static final int VERSION_START = 0;
public static final int VERSION_ENCODABLE_FIELDS_METADATA = 1;
public static final int VERSION_CURRENT = VERSION_ENCODABLE_FIELDS_METADATA;
public static final String NAME = "UniformSplit"; public static final String NAME = "UniformSplit";
@ -74,10 +76,10 @@ public class UniformSplitPostingsFormat extends PostingsFormat {
* Must be greater than or equal to 0 and strictly less than {@code targetNumBlockLines}. * Must be greater than or equal to 0 and strictly less than {@code targetNumBlockLines}.
* The block size will be {@code targetNumBlockLines}+-{@code deltaNumLines}. * The block size will be {@code targetNumBlockLines}+-{@code deltaNumLines}.
* The block size must always be less than or equal to {@link UniformSplitTermsWriter#MAX_NUM_BLOCK_LINES}. * The block size must always be less than or equal to {@link UniformSplitTermsWriter#MAX_NUM_BLOCK_LINES}.
* @param blockEncoder Optional block encoder, may be null if none. * @param blockEncoder Optional block encoder, may be null if none. If present, it is used to encode all terms
* It can be used for compression or encryption. * blocks, as well as the FST dictionary and the fields metadata.
* @param blockDecoder Optional block decoder, may be null if none. * @param blockDecoder Optional block decoder, may be null if none. If present, it is used to decode all terms
* It can be used for compression or encryption. * blocks, as well as the FST dictionary and the fields metadata.
* @param dictionaryOnHeap Whether to force loading the terms dictionary on-heap. By default it is kept off-heap without * @param dictionaryOnHeap Whether to force loading the terms dictionary on-heap. By default it is kept off-heap without
* impact on performance. If block encoding/decoding is used, then the dictionary is always * impact on performance. If block encoding/decoding is used, then the dictionary is always
* loaded on-heap whatever this parameter value is. * loaded on-heap whatever this parameter value is.

View File

@ -34,14 +34,14 @@ import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.Terms; import org.apache.lucene.index.Terms;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.RamUsageEstimator;
import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.NAME; import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.*;
import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.TERMS_BLOCKS_EXTENSION;
import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.TERMS_DICTIONARY_EXTENSION;
import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.VERSION_CURRENT;
/** /**
* A block-based terms index and dictionary based on the Uniform Split technique. * A block-based terms index and dictionary based on the Uniform Split technique.
@ -51,12 +51,11 @@ import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.V
*/ */
public class UniformSplitTermsReader extends FieldsProducer { public class UniformSplitTermsReader extends FieldsProducer {
protected static final int VERSION_START = 0;
private static final long BASE_RAM_USAGE = RamUsageEstimator.shallowSizeOfInstance(UniformSplitTermsReader.class) private static final long BASE_RAM_USAGE = RamUsageEstimator.shallowSizeOfInstance(UniformSplitTermsReader.class)
+ RamUsageEstimator.shallowSizeOfInstance(IndexInput.class) * 2; + RamUsageEstimator.shallowSizeOfInstance(IndexInput.class) * 2;
protected final PostingsReaderBase postingsReader; protected final PostingsReaderBase postingsReader;
protected final int version;
protected final IndexInput blockInput; protected final IndexInput blockInput;
protected final IndexInput dictionaryInput; protected final IndexInput dictionaryInput;
@ -93,7 +92,7 @@ public class UniformSplitTermsReader extends FieldsProducer {
String termsName = IndexFileNames.segmentFileName(segmentName, state.segmentSuffix, termsBlocksExtension); String termsName = IndexFileNames.segmentFileName(segmentName, state.segmentSuffix, termsBlocksExtension);
blockInput = state.directory.openInput(termsName, state.context); blockInput = state.directory.openInput(termsName, state.context);
int version = CodecUtil.checkIndexHeader(blockInput, codecName, versionStart, version = CodecUtil.checkIndexHeader(blockInput, codecName, versionStart,
versionCurrent, state.segmentInfo.getId(), state.segmentSuffix); versionCurrent, state.segmentInfo.getId(), state.segmentSuffix);
String indexName = IndexFileNames.segmentFileName(segmentName, state.segmentSuffix, dictionaryExtension); String indexName = IndexFileNames.segmentFileName(segmentName, state.segmentSuffix, dictionaryExtension);
dictionaryInput = state.directory.openInput(indexName, state.context); dictionaryInput = state.directory.openInput(indexName, state.context);
@ -105,7 +104,8 @@ public class UniformSplitTermsReader extends FieldsProducer {
CodecUtil.retrieveChecksum(blockInput); CodecUtil.retrieveChecksum(blockInput);
seekFieldsMetadata(blockInput); seekFieldsMetadata(blockInput);
Collection<FieldMetadata> fieldMetadataCollection = parseFieldsMetadata(blockInput, state.fieldInfos, fieldMetadataReader, state.segmentInfo.maxDoc()); Collection<FieldMetadata> fieldMetadataCollection =
readFieldsMetadata(blockInput, blockDecoder, state.fieldInfos, fieldMetadataReader, state.segmentInfo.maxDoc());
fieldToTermsMap = new HashMap<>(); fieldToTermsMap = new HashMap<>();
this.blockInput = blockInput; this.blockInput = blockInput;
@ -143,16 +143,36 @@ public class UniformSplitTermsReader extends FieldsProducer {
/** /**
* @param indexInput {@link IndexInput} must be positioned to the fields metadata * @param indexInput {@link IndexInput} must be positioned to the fields metadata
* details by calling {@link #seekFieldsMetadata(IndexInput)} before this call. * details by calling {@link #seekFieldsMetadata(IndexInput)} before this call.
* @param blockDecoder Optional block decoder, may be null if none.
*/ */
protected static Collection<FieldMetadata> parseFieldsMetadata(IndexInput indexInput, FieldInfos fieldInfos, protected Collection<FieldMetadata> readFieldsMetadata(IndexInput indexInput, BlockDecoder blockDecoder, FieldInfos fieldInfos,
FieldMetadata.Serializer fieldMetadataReader, int maxNumDocs) throws IOException { FieldMetadata.Serializer fieldMetadataReader, int maxNumDocs) throws IOException {
int numFields = indexInput.readVInt(); int numFields = indexInput.readVInt();
if (numFields < 0) { if (numFields < 0) {
throw new CorruptIndexException("Illegal number of fields= " + numFields, indexInput); throw new CorruptIndexException("Illegal number of fields= " + numFields, indexInput);
} }
return (blockDecoder != null && version >= VERSION_ENCODABLE_FIELDS_METADATA) ?
readEncodedFieldsMetadata(numFields, indexInput, blockDecoder, fieldInfos, fieldMetadataReader, maxNumDocs)
: readUnencodedFieldsMetadata(numFields, indexInput, fieldInfos, fieldMetadataReader, maxNumDocs);
}
/**
 * Reads fields metadata that was written in encoded form.
 * <p>
 * Reads the encoded length as a VLong from {@code metadataInput}, asks {@code blockDecoder} to decode
 * that many bytes, and then parses the decoded bytes with
 * {@code readUnencodedFieldsMetadata}.
 *
 * @param numFields           Number of field metadata entries to read.
 * @param metadataInput       Input positioned right after the number of fields.
 * @param blockDecoder        Decoder applied to the encoded metadata bytes; must not be null here.
 * @param fieldInfos          Passed through to {@code fieldMetadataReader}.
 * @param fieldMetadataReader Reads each individual field metadata entry.
 * @param maxNumDocs          Passed through to {@code fieldMetadataReader}.
 * @return The field metadata entries, in the order they were read.
 * @throws CorruptIndexException If the encoded length read from the input is negative.
 * @throws IOException           If reading or decoding fails.
 */
protected Collection<FieldMetadata> readEncodedFieldsMetadata(int numFields, DataInput metadataInput, BlockDecoder blockDecoder,
                                                              FieldInfos fieldInfos, FieldMetadata.Serializer fieldMetadataReader,
                                                              int maxNumDocs) throws IOException {
  long encodedLength = metadataInput.readVLong();
  if (encodedLength < 0) {
    throw new CorruptIndexException("Illegal encoded length: " + encodedLength, metadataInput);
  }
  BytesRef decodedBytes = blockDecoder.decode(metadataInput, encodedLength);
  // Honor the BytesRef offset: a decoder is free to return a slice of a larger (possibly reused) buffer,
  // in which case the valid bytes do not start at index 0.
  DataInput decodedMetadataInput = new ByteArrayDataInput(decodedBytes.bytes, decodedBytes.offset, decodedBytes.length);
  return readUnencodedFieldsMetadata(numFields, decodedMetadataInput, fieldInfos, fieldMetadataReader, maxNumDocs);
}
protected Collection<FieldMetadata> readUnencodedFieldsMetadata(int numFields, DataInput metadataInput, FieldInfos fieldInfos,
FieldMetadata.Serializer fieldMetadataReader, int maxNumDocs) throws IOException {
Collection<FieldMetadata> fieldMetadataCollection = new ArrayList<>(numFields); Collection<FieldMetadata> fieldMetadataCollection = new ArrayList<>(numFields);
for (int i = 0; i < numFields; i++) { for (int i = 0; i < numFields; i++) {
fieldMetadataCollection.add(fieldMetadataReader.read(indexInput, fieldInfos, maxNumDocs)); fieldMetadataCollection.add(fieldMetadataReader.read(metadataInput, fieldInfos, maxNumDocs));
} }
return fieldMetadataCollection; return fieldMetadataCollection;
} }
@ -212,7 +232,7 @@ public class UniformSplitTermsReader extends FieldsProducer {
/** /**
* Positions the given {@link IndexInput} at the beginning of the fields metadata. * Positions the given {@link IndexInput} at the beginning of the fields metadata.
*/ */
protected static void seekFieldsMetadata(IndexInput indexInput) throws IOException { protected void seekFieldsMetadata(IndexInput indexInput) throws IOException {
indexInput.seek(indexInput.length() - CodecUtil.footerLength() - 8); indexInput.seek(indexInput.length() - CodecUtil.footerLength() - 8);
indexInput.seek(indexInput.readLong()); indexInput.seek(indexInput.readLong());
} }

View File

@ -249,11 +249,26 @@ public class UniformSplitTermsWriter extends FieldsConsumer {
protected void writeFieldsMetadata(int fieldsNumber, ByteBuffersDataOutput fieldsOutput) throws IOException { protected void writeFieldsMetadata(int fieldsNumber, ByteBuffersDataOutput fieldsOutput) throws IOException {
long fieldsStartPosition = blockOutput.getFilePointer(); long fieldsStartPosition = blockOutput.getFilePointer();
blockOutput.writeVInt(fieldsNumber); blockOutput.writeVInt(fieldsNumber);
fieldsOutput.copyTo(blockOutput); if (blockEncoder == null) {
writeUnencodedFieldsMetadata(fieldsOutput);
} else {
writeEncodedFieldsMetadata(fieldsOutput);
}
// Must be a fixed length. Read by UniformSplitTermsReader when seeking fields metadata.
blockOutput.writeLong(fieldsStartPosition); blockOutput.writeLong(fieldsStartPosition);
CodecUtil.writeFooter(blockOutput); CodecUtil.writeFooter(blockOutput);
} }
/**
 * Writes the fields metadata as-is (no encoding) by copying the buffered
 * {@code fieldsOutput} bytes to the block output.
 *
 * @param fieldsOutput Buffer holding the already serialized fields metadata.
 * @throws IOException If the copy to the block output fails.
 */
protected void writeUnencodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException {
fieldsOutput.copyTo(blockOutput);
}
/**
 * Writes the fields metadata in encoded form.
 * <p>
 * The buffered metadata is run through the block encoder; the size of the encoded bytes is written
 * first as a VLong, followed by the encoded bytes themselves.
 *
 * @param fieldsOutput Buffer holding the already serialized fields metadata.
 * @throws IOException If encoding or writing to the block output fails.
 */
protected void writeEncodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException {
  final BlockEncoder.WritableBytes encodedMetadata =
      blockEncoder.encode(fieldsOutput.toDataInput(), fieldsOutput.size());
  // Length prefix goes before the payload.
  blockOutput.writeVLong(encodedMetadata.size());
  encodedMetadata.writeTo(blockOutput);
}
/** /**
* @return 1 if the field was written; 0 otherwise. * @return 1 if the field was written; 0 otherwise.
*/ */

View File

@ -54,7 +54,7 @@ public class STUniformSplitPostingsFormat extends UniformSplitPostingsFormat {
*/ */
public static final String TERMS_BLOCKS_EXTENSION = "stustb"; public static final String TERMS_BLOCKS_EXTENSION = "stustb";
public static final int VERSION_CURRENT = 0; public static final int VERSION_CURRENT = UniformSplitPostingsFormat.VERSION_CURRENT;
public static final String NAME = "SharedTermsUniformSplit"; public static final String NAME = "SharedTermsUniformSplit";

View File

@ -30,10 +30,7 @@ import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexInput;
import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.NAME; import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.*;
import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.TERMS_BLOCKS_EXTENSION;
import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.TERMS_DICTIONARY_EXTENSION;
import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.VERSION_CURRENT;
/** /**
* A block-based terms index and dictionary based on the Uniform Split technique, * A block-based terms index and dictionary based on the Uniform Split technique,

View File

@ -33,15 +33,9 @@ public class UnionFieldMetadataBuilder {
private BytesRef maxLastTerm; private BytesRef maxLastTerm;
public UnionFieldMetadataBuilder() { public UnionFieldMetadataBuilder() {
reset();
}
public UnionFieldMetadataBuilder reset() {
dictionaryStartFP = -1; dictionaryStartFP = -1;
minStartBlockFP = Long.MAX_VALUE; minStartBlockFP = Long.MAX_VALUE;
maxEndBlockFP = Long.MIN_VALUE; maxEndBlockFP = Long.MIN_VALUE;
maxLastTerm = null;
return this;
} }
public UnionFieldMetadataBuilder addFieldMetadata(FieldMetadata fieldMetadata) { public UnionFieldMetadataBuilder addFieldMetadata(FieldMetadata fieldMetadata) {

View File

@ -51,19 +51,28 @@ public class TestUniformSplitPostingFormat extends BasePostingsFormatTestCase {
@Before @Before
public void initialize() { public void initialize() {
initializeInner();
}
protected void initializeInner() {
UniformSplitRot13PostingsFormat.resetEncodingFlags(); UniformSplitRot13PostingsFormat.resetEncodingFlags();
} }
@After @After
public void checkEncodingCalled() { public void checkEncodingCalled() {
if (checkEncoding) { if (checkEncoding) {
checkEncodingCalledInner();
}
}
protected void checkEncodingCalledInner() {
assertTrue(UniformSplitRot13PostingsFormat.blocksEncoded); assertTrue(UniformSplitRot13PostingsFormat.blocksEncoded);
assertTrue(UniformSplitRot13PostingsFormat.fieldsMetadataEncoded);
assertTrue(UniformSplitRot13PostingsFormat.dictionaryEncoded); assertTrue(UniformSplitRot13PostingsFormat.dictionaryEncoded);
if (shouldCheckDecoderWasCalled) { if (shouldCheckDecoderWasCalled) {
assertTrue(UniformSplitRot13PostingsFormat.decoderCalled); assertTrue(UniformSplitRot13PostingsFormat.decoderCalled);
} }
} }
}
@Override @Override
public void testRandomExceptions() throws Exception { public void testRandomExceptions() throws Exception {

View File

@ -51,9 +51,9 @@ import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
public class STBlockReaderTest extends LuceneTestCase { public class TestSTBlockReader extends LuceneTestCase {
private static final String MOCK_BLOCK_OUTPUT_NAME = "STBlockReaderTest.tmp"; private static final String MOCK_BLOCK_OUTPUT_NAME = "TestSTBlockReader.tmp";
private FieldInfos fieldInfos; private FieldInfos fieldInfos;
private List<MockSTBlockLine> blockLines; private List<MockSTBlockLine> blockLines;

View File

@ -28,6 +28,7 @@ import org.apache.lucene.codecs.lucene84.Lucene84PostingsReader;
import org.apache.lucene.codecs.lucene84.Lucene84PostingsWriter; import org.apache.lucene.codecs.lucene84.Lucene84PostingsWriter;
import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
@ -40,6 +41,7 @@ public class UniformSplitRot13PostingsFormat extends PostingsFormat {
public static volatile boolean encoderCalled; public static volatile boolean encoderCalled;
public static volatile boolean decoderCalled; public static volatile boolean decoderCalled;
public static volatile boolean blocksEncoded; public static volatile boolean blocksEncoded;
public static volatile boolean fieldsMetadataEncoded;
public static volatile boolean dictionaryEncoded; public static volatile boolean dictionaryEncoded;
protected final boolean dictionaryOnHeap; protected final boolean dictionaryOnHeap;
@ -56,6 +58,7 @@ public class UniformSplitRot13PostingsFormat extends PostingsFormat {
encoderCalled = false; encoderCalled = false;
decoderCalled = false; decoderCalled = false;
blocksEncoded = false; blocksEncoded = false;
fieldsMetadataEncoded = false;
dictionaryEncoded = false; dictionaryEncoded = false;
} }
@ -86,6 +89,11 @@ public class UniformSplitRot13PostingsFormat extends PostingsFormat {
super.writeDictionary(dictionaryBuilder); super.writeDictionary(dictionaryBuilder);
recordDictionaryEncodingCall(); recordDictionaryEncodingCall();
} }
// Test hook: lets the base writer encode the fields metadata, then records that the
// fields metadata encoding took place (the recorder only sets its flag if the encoder was called).
@Override
protected void writeEncodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException {
super.writeEncodedFieldsMetadata(fieldsOutput);
recordFieldsMetadataEncodingCall();
}
}; };
} }
@ -96,6 +104,13 @@ public class UniformSplitRot13PostingsFormat extends PostingsFormat {
} }
} }
/**
 * Marks the fields metadata as encoded if the encoder has been called since the flag was last
 * cleared, and clears the encoder-called flag so the next recording starts fresh.
 */
protected void recordFieldsMetadataEncodingCall() {
  if (!encoderCalled) {
    // Encoder was not invoked: nothing to record.
    return;
  }
  fieldsMetadataEncoded = true;
  encoderCalled = false;
}
protected void recordDictionaryEncodingCall() { protected void recordDictionaryEncodingCall() {
if (encoderCalled) { if (encoderCalled) {
dictionaryEncoded = true; dictionaryEncoded = true;

View File

@ -28,6 +28,7 @@ import org.apache.lucene.codecs.uniformsplit.UniformSplitRot13PostingsFormat;
import org.apache.lucene.codecs.uniformsplit.UniformSplitTermsWriter; import org.apache.lucene.codecs.uniformsplit.UniformSplitTermsWriter;
import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.ByteBuffersDataOutput;
/** /**
* {@link STUniformSplitPostingsFormat} with block encoding using ROT13 cypher. * {@link STUniformSplitPostingsFormat} with block encoding using ROT13 cypher.
@ -50,6 +51,12 @@ public class STUniformSplitRot13PostingsFormat extends UniformSplitRot13Postings
super.writeDictionary(dictionaryBuilder); super.writeDictionary(dictionaryBuilder);
recordDictionaryEncodingCall(); recordDictionaryEncodingCall();
} }
// Test hook for the shared-terms writer: records the block encoding state, lets the base writer
// encode the fields metadata, then records the fields metadata encoding.
@Override
protected void writeEncodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException {
// NOTE(review): presumably this captures a pending block-encoding call before the encoder is
// invoked again for the fields metadata - confirm against recordBlockEncodingCall().
recordBlockEncodingCall();
super.writeEncodedFieldsMetadata(fieldsOutput);
recordFieldsMetadataEncodingCall();
}
}; };
} }