LUCENE-9397: UniformSplit supports encodable fields metadata.

This commit is contained in:
Bruno Roustant 2020-06-10 16:09:32 +02:00
parent 36109ec362
commit 75d25ad677
No known key found for this signature in database
GPG Key ID: CD28DABB95360525
11 changed files with 97 additions and 36 deletions

View File

@ -206,6 +206,8 @@ Improvements
* LUCENE-9392: Make FacetsConfig.DELIM_CHAR publicly accessible (Ankur Goel)
* LUCENE-9397: UniformSplit supports encodable fields metadata. (Bruno Roustant)
Optimizations
---------------------

View File

@ -47,7 +47,9 @@ public class UniformSplitPostingsFormat extends PostingsFormat {
*/
public static final String TERMS_BLOCKS_EXTENSION = "ustb";
public static final int VERSION_CURRENT = 0;
public static final int VERSION_START = 0;
public static final int VERSION_ENCODABLE_FIELDS_METADATA = 1;
public static final int VERSION_CURRENT = VERSION_ENCODABLE_FIELDS_METADATA;
public static final String NAME = "UniformSplit";
@ -74,10 +76,10 @@ public class UniformSplitPostingsFormat extends PostingsFormat {
* Must be greater than or equal to 0 and strictly less than {@code targetNumBlockLines}.
* The block size will be {@code targetNumBlockLines}+-{@code deltaNumLines}.
* The block size must always be less than or equal to {@link UniformSplitTermsWriter#MAX_NUM_BLOCK_LINES}.
* @param blockEncoder Optional block encoder, may be null if none.
* It can be used for compression or encryption.
* @param blockDecoder Optional block decoder, may be null if none.
* It can be used for compression or encryption.
* @param blockEncoder Optional block encoder, may be null if none. If present, it is used to encode all terms
* blocks, as well as the FST dictionary and the fields metadata.
* @param blockDecoder Optional block decoder, may be null if none. If present, it is used to decode all terms
* blocks, as well as the FST dictionary and the fields metadata.
* @param dictionaryOnHeap Whether to force loading the terms dictionary on-heap. By default it is kept off-heap without
* impact on performance. If block encoding/decoding is used, then the dictionary is always
* loaded on-heap whatever this parameter value is.

View File

@ -34,14 +34,14 @@ import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.NAME;
import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.TERMS_BLOCKS_EXTENSION;
import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.TERMS_DICTIONARY_EXTENSION;
import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.VERSION_CURRENT;
import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.*;
/**
* A block-based terms index and dictionary based on the Uniform Split technique.
@ -51,12 +51,11 @@ import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.V
*/
public class UniformSplitTermsReader extends FieldsProducer {
protected static final int VERSION_START = 0;
private static final long BASE_RAM_USAGE = RamUsageEstimator.shallowSizeOfInstance(UniformSplitTermsReader.class)
+ RamUsageEstimator.shallowSizeOfInstance(IndexInput.class) * 2;
protected final PostingsReaderBase postingsReader;
protected final int version;
protected final IndexInput blockInput;
protected final IndexInput dictionaryInput;
@ -93,7 +92,7 @@ public class UniformSplitTermsReader extends FieldsProducer {
String termsName = IndexFileNames.segmentFileName(segmentName, state.segmentSuffix, termsBlocksExtension);
blockInput = state.directory.openInput(termsName, state.context);
int version = CodecUtil.checkIndexHeader(blockInput, codecName, versionStart,
version = CodecUtil.checkIndexHeader(blockInput, codecName, versionStart,
versionCurrent, state.segmentInfo.getId(), state.segmentSuffix);
String indexName = IndexFileNames.segmentFileName(segmentName, state.segmentSuffix, dictionaryExtension);
dictionaryInput = state.directory.openInput(indexName, state.context);
@ -105,7 +104,8 @@ public class UniformSplitTermsReader extends FieldsProducer {
CodecUtil.retrieveChecksum(blockInput);
seekFieldsMetadata(blockInput);
Collection<FieldMetadata> fieldMetadataCollection = parseFieldsMetadata(blockInput, state.fieldInfos, fieldMetadataReader, state.segmentInfo.maxDoc());
Collection<FieldMetadata> fieldMetadataCollection =
readFieldsMetadata(blockInput, blockDecoder, state.fieldInfos, fieldMetadataReader, state.segmentInfo.maxDoc());
fieldToTermsMap = new HashMap<>();
this.blockInput = blockInput;
@ -143,16 +143,36 @@ public class UniformSplitTermsReader extends FieldsProducer {
/**
* @param indexInput {@link IndexInput} must be positioned to the fields metadata
* details by calling {@link #seekFieldsMetadata(IndexInput)} before this call.
* @param blockDecoder Optional block decoder, may be null if none.
*/
protected static Collection<FieldMetadata> parseFieldsMetadata(IndexInput indexInput, FieldInfos fieldInfos,
FieldMetadata.Serializer fieldMetadataReader, int maxNumDocs) throws IOException {
protected Collection<FieldMetadata> readFieldsMetadata(IndexInput indexInput, BlockDecoder blockDecoder, FieldInfos fieldInfos,
FieldMetadata.Serializer fieldMetadataReader, int maxNumDocs) throws IOException {
int numFields = indexInput.readVInt();
if (numFields < 0) {
throw new CorruptIndexException("Illegal number of fields= " + numFields, indexInput);
}
return (blockDecoder != null && version >= VERSION_ENCODABLE_FIELDS_METADATA) ?
readEncodedFieldsMetadata(numFields, indexInput, blockDecoder, fieldInfos, fieldMetadataReader, maxNumDocs)
: readUnencodedFieldsMetadata(numFields, indexInput, fieldInfos, fieldMetadataReader, maxNumDocs);
}
/**
 * Reads fields metadata that was written in encoded form: a VLong byte length
 * followed by the encoded bytes, which are decoded with the given {@link BlockDecoder}
 * and then parsed as plain (unencoded) metadata.
 *
 * @throws CorruptIndexException if the stored encoded length is negative.
 */
protected Collection<FieldMetadata> readEncodedFieldsMetadata(int numFields, DataInput metadataInput, BlockDecoder blockDecoder,
                                                              FieldInfos fieldInfos, FieldMetadata.Serializer fieldMetadataReader,
                                                              int maxNumDocs) throws IOException {
  // The encoded payload is prefixed by its byte length.
  long encodedLength = metadataInput.readVLong();
  if (encodedLength < 0) {
    throw new CorruptIndexException("Illegal encoded length: " + encodedLength, metadataInput);
  }
  BytesRef decoded = blockDecoder.decode(metadataInput, encodedLength);
  DataInput plainInput = new ByteArrayDataInput(decoded.bytes, 0, decoded.length);
  return readUnencodedFieldsMetadata(numFields, plainInput, fieldInfos, fieldMetadataReader, maxNumDocs);
}
/**
 * Reads {@code numFields} plain (unencoded) {@link FieldMetadata} entries from the
 * given input, delegating the per-field parsing to the {@link FieldMetadata.Serializer}.
 */
protected Collection<FieldMetadata> readUnencodedFieldsMetadata(int numFields, DataInput metadataInput, FieldInfos fieldInfos,
                                                                FieldMetadata.Serializer fieldMetadataReader, int maxNumDocs) throws IOException {
  Collection<FieldMetadata> fieldMetadataCollection = new ArrayList<>(numFields);
  for (int i = 0; i < numFields; i++) {
    // Read from metadataInput (may be a decoded in-memory buffer, not the raw index input).
    fieldMetadataCollection.add(fieldMetadataReader.read(metadataInput, fieldInfos, maxNumDocs));
  }
  return fieldMetadataCollection;
}
@ -212,7 +232,7 @@ public class UniformSplitTermsReader extends FieldsProducer {
/**
* Positions the given {@link IndexInput} at the beginning of the fields metadata.
*/
protected static void seekFieldsMetadata(IndexInput indexInput) throws IOException {
protected void seekFieldsMetadata(IndexInput indexInput) throws IOException {
indexInput.seek(indexInput.length() - CodecUtil.footerLength() - 8);
indexInput.seek(indexInput.readLong());
}

View File

@ -249,11 +249,26 @@ public class UniformSplitTermsWriter extends FieldsConsumer {
/**
 * Writes the fields metadata section at the end of the terms blocks file:
 * the number of fields, the (possibly encoded) metadata bytes, the fixed-length
 * start pointer of this section, and the codec footer.
 *
 * @param fieldsNumber number of fields whose metadata is buffered in {@code fieldsOutput}.
 * @param fieldsOutput buffered metadata for all fields, written via the configured serializer.
 */
protected void writeFieldsMetadata(int fieldsNumber, ByteBuffersDataOutput fieldsOutput) throws IOException {
  long fieldsStartPosition = blockOutput.getFilePointer();
  blockOutput.writeVInt(fieldsNumber);
  if (blockEncoder == null) {
    writeUnencodedFieldsMetadata(fieldsOutput);
  } else {
    writeEncodedFieldsMetadata(fieldsOutput);
  }
  // Must be a fixed length. Read by UniformSplitTermsReader when seeking fields metadata.
  blockOutput.writeLong(fieldsStartPosition);
  CodecUtil.writeFooter(blockOutput);
}
/**
 * Writes the buffered fields metadata directly to the terms blocks output,
 * without any encoding.
 */
protected void writeUnencodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException {
  fieldsOutput.copyTo(blockOutput);
}
/**
 * Encodes the buffered fields metadata with the configured {@link BlockEncoder}
 * and writes the encoded byte length (VLong) followed by the encoded bytes.
 */
protected void writeEncodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException {
  // Encode the whole buffered metadata, then write its size prefix and payload.
  BlockEncoder.WritableBytes encoded = blockEncoder.encode(fieldsOutput.toDataInput(), fieldsOutput.size());
  blockOutput.writeVLong(encoded.size());
  encoded.writeTo(blockOutput);
}
/**
* @return 1 if the field was written; 0 otherwise.
*/

View File

@ -54,7 +54,7 @@ public class STUniformSplitPostingsFormat extends UniformSplitPostingsFormat {
*/
public static final String TERMS_BLOCKS_EXTENSION = "stustb";
public static final int VERSION_CURRENT = 0;
public static final int VERSION_CURRENT = UniformSplitPostingsFormat.VERSION_CURRENT;
public static final String NAME = "SharedTermsUniformSplit";

View File

@ -30,10 +30,7 @@ import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.store.IndexInput;
import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.NAME;
import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.TERMS_BLOCKS_EXTENSION;
import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.TERMS_DICTIONARY_EXTENSION;
import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.VERSION_CURRENT;
import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.*;
/**
* A block-based terms index and dictionary based on the Uniform Split technique,

View File

@ -33,15 +33,9 @@ public class UnionFieldMetadataBuilder {
private BytesRef maxLastTerm;
/** Creates a builder in its initial (empty) state. */
public UnionFieldMetadataBuilder() {
  reset();
}

/**
 * Resets this builder to its initial state so it can be reused.
 *
 * @return this builder, for chaining.
 */
public UnionFieldMetadataBuilder reset() {
  // Sentinel values: -1 / MAX / MIN so that any added field metadata replaces them.
  dictionaryStartFP = -1;
  minStartBlockFP = Long.MAX_VALUE;
  maxEndBlockFP = Long.MIN_VALUE;
  maxLastTerm = null;
  return this;
}
public UnionFieldMetadataBuilder addFieldMetadata(FieldMetadata fieldMetadata) {

View File

@ -51,17 +51,26 @@ public class TestUniformSplitPostingFormat extends BasePostingsFormatTestCase {
@Before
public void initialize() {
  initializeInner();
}

// Extension hook: subclasses override to add per-test initialization.
protected void initializeInner() {
  // Clear the static encoding-tracking flags before each test run.
  UniformSplitRot13PostingsFormat.resetEncodingFlags();
}
@After
public void checkEncodingCalled() {
  // Only verify when this test variant actually enables encoding.
  if (checkEncoding) {
    checkEncodingCalledInner();
  }
}

// Extension hook: verifies the ROT13 encoder/decoder was exercised for all index parts.
protected void checkEncodingCalledInner() {
  assertTrue(UniformSplitRot13PostingsFormat.blocksEncoded);
  assertTrue(UniformSplitRot13PostingsFormat.fieldsMetadataEncoded);
  assertTrue(UniformSplitRot13PostingsFormat.dictionaryEncoded);
  if (shouldCheckDecoderWasCalled) {
    assertTrue(UniformSplitRot13PostingsFormat.decoderCalled);
  }
}

View File

@ -51,9 +51,9 @@ import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
public class STBlockReaderTest extends LuceneTestCase {
public class TestSTBlockReader extends LuceneTestCase {
private static final String MOCK_BLOCK_OUTPUT_NAME = "STBlockReaderTest.tmp";
private static final String MOCK_BLOCK_OUTPUT_NAME = "TestSTBlockReader.tmp";
private FieldInfos fieldInfos;
private List<MockSTBlockLine> blockLines;

View File

@ -28,6 +28,7 @@ import org.apache.lucene.codecs.lucene84.Lucene84PostingsReader;
import org.apache.lucene.codecs.lucene84.Lucene84PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
@ -40,6 +41,7 @@ public class UniformSplitRot13PostingsFormat extends PostingsFormat {
public static volatile boolean encoderCalled;
public static volatile boolean decoderCalled;
public static volatile boolean blocksEncoded;
public static volatile boolean fieldsMetadataEncoded;
public static volatile boolean dictionaryEncoded;
protected final boolean dictionaryOnHeap;
@ -56,6 +58,7 @@ public class UniformSplitRot13PostingsFormat extends PostingsFormat {
encoderCalled = false;
decoderCalled = false;
blocksEncoded = false;
fieldsMetadataEncoded = false;
dictionaryEncoded = false;
}
@ -86,6 +89,11 @@ public class UniformSplitRot13PostingsFormat extends PostingsFormat {
super.writeDictionary(dictionaryBuilder);
recordDictionaryEncodingCall();
}
@Override
protected void writeEncodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException {
  super.writeEncodedFieldsMetadata(fieldsOutput);
  // Record that the encoder call (if any) was made for the fields metadata.
  recordFieldsMetadataEncodingCall();
}
};
}
@ -96,6 +104,13 @@ public class UniformSplitRot13PostingsFormat extends PostingsFormat {
}
}
/**
 * If the encoder was called since the last recording, attributes that call to
 * the fields metadata and clears the pending-call flag.
 */
protected void recordFieldsMetadataEncodingCall() {
  if (!encoderCalled) {
    return;
  }
  fieldsMetadataEncoded = true;
  encoderCalled = false;
}
protected void recordDictionaryEncodingCall() {
if (encoderCalled) {
dictionaryEncoded = true;

View File

@ -28,6 +28,7 @@ import org.apache.lucene.codecs.uniformsplit.UniformSplitRot13PostingsFormat;
import org.apache.lucene.codecs.uniformsplit.UniformSplitTermsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.ByteBuffersDataOutput;
/**
* {@link STUniformSplitPostingsFormat} with block encoding using ROT13 cypher.
@ -50,6 +51,12 @@ public class STUniformSplitRot13PostingsFormat extends UniformSplitRot13Postings
super.writeDictionary(dictionaryBuilder);
recordDictionaryEncodingCall();
}
@Override
protected void writeEncodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException {
  // NOTE(review): records a block-encoding call before delegating — presumably because the
  // shared-terms format funnels metadata through the block encoder; confirm against the
  // non-ST UniformSplitRot13PostingsFormat, which does not make this extra call.
  recordBlockEncodingCall();
  super.writeEncodedFieldsMetadata(fieldsOutput);
  recordFieldsMetadataEncodingCall();
}
};
}