mirror of https://github.com/apache/lucene.git

LUCENE-9397: UniformSplit supports encodable fields metadata.

parent 36109ec362
commit 75d25ad677
@@ -206,6 +206,8 @@ Improvements

 * LUCENE-9392: Make FacetsConfig.DELIM_CHAR publicly accessible (Ankur Goel)

+* LUCENE-9397: UniformSplit supports encodable fields metadata. (Bruno Roustant)
+
 Optimizations
 ---------------------

@@ -47,7 +47,9 @@ public class UniformSplitPostingsFormat extends PostingsFormat {
    */
   public static final String TERMS_BLOCKS_EXTENSION = "ustb";

-  public static final int VERSION_CURRENT = 0;
+  public static final int VERSION_START = 0;
+  public static final int VERSION_ENCODABLE_FIELDS_METADATA = 1;
+  public static final int VERSION_CURRENT = VERSION_ENCODABLE_FIELDS_METADATA;

   public static final String NAME = "UniformSplit";

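The constants above follow the standard Lucene versioning pattern: the writer always stamps VERSION_CURRENT into the index header, while the reader accepts the whole [VERSION_START, VERSION_CURRENT] range and gates new behavior on the version it actually read (the check appears in the UniformSplitTermsReader hunks below). A minimal sketch of that pattern; the class and the segment id/suffix arguments are hypothetical:

import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;

final class VersionGatingSketch {
  static final int VERSION_START = 0;
  static final int VERSION_ENCODABLE_FIELDS_METADATA = 1;
  static final int VERSION_CURRENT = VERSION_ENCODABLE_FIELDS_METADATA;

  static void writeHeader(DataOutput out, byte[] segmentId, String suffix) throws IOException {
    // Writers always stamp the newest version.
    CodecUtil.writeIndexHeader(out, "UniformSplit", VERSION_CURRENT, segmentId, suffix);
  }

  static boolean supportsEncodedMetadata(DataInput in, byte[] segmentId, String suffix) throws IOException {
    // Readers accept the full supported range, so pre-upgrade (version 0)
    // segments still open; new features are then gated on the version read.
    int version = CodecUtil.checkIndexHeader(in, "UniformSplit", VERSION_START, VERSION_CURRENT,
        segmentId, suffix);
    return version >= VERSION_ENCODABLE_FIELDS_METADATA;
  }
}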
@@ -74,10 +76,10 @@ public class UniformSplitPostingsFormat extends PostingsFormat {
    *                         Must be greater than or equal to 0 and strictly less than {@code targetNumBlockLines}.
    *                         The block size will be {@code targetNumBlockLines}+-{@code deltaNumLines}.
    *                         The block size must always be less than or equal to {@link UniformSplitTermsWriter#MAX_NUM_BLOCK_LINES}.
-   * @param blockEncoder     Optional block encoder, may be null if none.
-   *                         It can be used for compression or encryption.
-   * @param blockDecoder     Optional block decoder, may be null if none.
-   *                         It can be used for compression or encryption.
+   * @param blockEncoder     Optional block encoder, may be null if none. If present, it is used to encode all terms
+   *                         blocks, as well as the FST dictionary and the fields metadata.
+   * @param blockDecoder     Optional block decoder, may be null if none. If present, it is used to decode all terms
+   *                         blocks, as well as the FST dictionary and the fields metadata.
    * @param dictionaryOnHeap Whether to force loading the terms dictionary on-heap. By default it is kept off-heap without
    *                         impact on performance. If block encoding/decoding is used, then the dictionary is always
    *                         loaded on-heap whatever this parameter value is.

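The javadoc now promises that a supplied encoder/decoder pair covers the terms blocks, the FST dictionary, and (new in this commit) the fields metadata. For a feel of the contract, here is a minimal sketch of a matching pair. The XOR "cipher" and class name are illustrative only, but the interface shapes match the calls visible in this diff: encode() returns a BlockEncoder.WritableBytes, decode() returns a BytesRef.

import java.io.IOException;
import org.apache.lucene.codecs.uniformsplit.BlockDecoder;
import org.apache.lucene.codecs.uniformsplit.BlockEncoder;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BytesRef;

/** Toy symmetric "cipher": XOR every byte with a constant. Illustration only. */
final class XorBlockCodec implements BlockEncoder, BlockDecoder {
  private static final byte KEY = 0x5A;

  private static byte[] xor(DataInput blockBytes, long length) throws IOException {
    byte[] bytes = new byte[Math.toIntExact(length)];
    blockBytes.readBytes(bytes, 0, bytes.length);
    for (int i = 0; i < bytes.length; i++) {
      bytes[i] ^= KEY;
    }
    return bytes;
  }

  @Override
  public WritableBytes encode(DataInput blockBytes, long length) throws IOException {
    byte[] encoded = xor(blockBytes, length);
    return new WritableBytes() {
      @Override
      public long size() {
        return encoded.length;
      }

      @Override
      public void writeTo(DataOutput dataOutput) throws IOException {
        dataOutput.writeBytes(encoded, 0, encoded.length);
      }
    };
  }

  @Override
  public BytesRef decode(DataInput blockBytes, long length) throws IOException {
    return new BytesRef(xor(blockBytes, length)); // XOR is its own inverse
  }
}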
@@ -34,14 +34,14 @@ import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.Terms;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.RamUsageEstimator;

-import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.NAME;
-import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.TERMS_BLOCKS_EXTENSION;
-import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.TERMS_DICTIONARY_EXTENSION;
-import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.VERSION_CURRENT;
+import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.*;

 /**
  * A block-based terms index and dictionary based on the Uniform Split technique.

@@ -51,12 +51,11 @@ import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.VERSION_CURRENT;
  */
 public class UniformSplitTermsReader extends FieldsProducer {

-  protected static final int VERSION_START = 0;
-
   private static final long BASE_RAM_USAGE = RamUsageEstimator.shallowSizeOfInstance(UniformSplitTermsReader.class)
       + RamUsageEstimator.shallowSizeOfInstance(IndexInput.class) * 2;

   protected final PostingsReaderBase postingsReader;
+  protected final int version;
   protected final IndexInput blockInput;
   protected final IndexInput dictionaryInput;

@@ -93,7 +92,7 @@ public class UniformSplitTermsReader extends FieldsProducer {
       String termsName = IndexFileNames.segmentFileName(segmentName, state.segmentSuffix, termsBlocksExtension);
       blockInput = state.directory.openInput(termsName, state.context);

-      int version = CodecUtil.checkIndexHeader(blockInput, codecName, versionStart,
+      version = CodecUtil.checkIndexHeader(blockInput, codecName, versionStart,
           versionCurrent, state.segmentInfo.getId(), state.segmentSuffix);
       String indexName = IndexFileNames.segmentFileName(segmentName, state.segmentSuffix, dictionaryExtension);
       dictionaryInput = state.directory.openInput(indexName, state.context);

@@ -105,7 +104,8 @@
       CodecUtil.retrieveChecksum(blockInput);

       seekFieldsMetadata(blockInput);
-      Collection<FieldMetadata> fieldMetadataCollection = parseFieldsMetadata(blockInput, state.fieldInfos, fieldMetadataReader, state.segmentInfo.maxDoc());
+      Collection<FieldMetadata> fieldMetadataCollection =
+          readFieldsMetadata(blockInput, blockDecoder, state.fieldInfos, fieldMetadataReader, state.segmentInfo.maxDoc());

       fieldToTermsMap = new HashMap<>();
       this.blockInput = blockInput;

@@ -143,16 +143,36 @@
   /**
    * @param indexInput {@link IndexInput} must be positioned to the fields metadata
    *                   details by calling {@link #seekFieldsMetadata(IndexInput)} before this call.
+   * @param blockDecoder Optional block decoder, may be null if none.
    */
-  protected static Collection<FieldMetadata> parseFieldsMetadata(IndexInput indexInput, FieldInfos fieldInfos,
-                                                                 FieldMetadata.Serializer fieldMetadataReader, int maxNumDocs) throws IOException {
+  protected Collection<FieldMetadata> readFieldsMetadata(IndexInput indexInput, BlockDecoder blockDecoder, FieldInfos fieldInfos,
+                                                         FieldMetadata.Serializer fieldMetadataReader, int maxNumDocs) throws IOException {
     int numFields = indexInput.readVInt();
     if (numFields < 0) {
       throw new CorruptIndexException("Illegal number of fields= " + numFields, indexInput);
     }
+    return (blockDecoder != null && version >= VERSION_ENCODABLE_FIELDS_METADATA) ?
+        readEncodedFieldsMetadata(numFields, indexInput, blockDecoder, fieldInfos, fieldMetadataReader, maxNumDocs)
+        : readUnencodedFieldsMetadata(numFields, indexInput, fieldInfos, fieldMetadataReader, maxNumDocs);
+  }
+
+  protected Collection<FieldMetadata> readEncodedFieldsMetadata(int numFields, DataInput metadataInput, BlockDecoder blockDecoder,
+                                                                FieldInfos fieldInfos, FieldMetadata.Serializer fieldMetadataReader,
+                                                                int maxNumDocs) throws IOException {
+    long encodedLength = metadataInput.readVLong();
+    if (encodedLength < 0) {
+      throw new CorruptIndexException("Illegal encoded length: " + encodedLength, metadataInput);
+    }
+    BytesRef decodedBytes = blockDecoder.decode(metadataInput, encodedLength);
+    DataInput decodedMetadataInput = new ByteArrayDataInput(decodedBytes.bytes, 0, decodedBytes.length);
+    return readUnencodedFieldsMetadata(numFields, decodedMetadataInput, fieldInfos, fieldMetadataReader, maxNumDocs);
+  }
+
+  protected Collection<FieldMetadata> readUnencodedFieldsMetadata(int numFields, DataInput metadataInput, FieldInfos fieldInfos,
+                                                                  FieldMetadata.Serializer fieldMetadataReader, int maxNumDocs) throws IOException {
     Collection<FieldMetadata> fieldMetadataCollection = new ArrayList<>(numFields);
     for (int i = 0; i < numFields; i++) {
-      fieldMetadataCollection.add(fieldMetadataReader.read(indexInput, fieldInfos, maxNumDocs));
+      fieldMetadataCollection.add(fieldMetadataReader.read(metadataInput, fieldInfos, maxNumDocs));
     }
     return fieldMetadataCollection;
   }

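To summarize the layout these three methods decode: the field count is a plain VInt that is never encoded (the reader needs it before any decoding decision), and for version >= 1 segments written with an encoder it is followed by a VLong payload length plus the encoded payload; otherwise the raw per-field metadata follows directly. A round-trip sketch of that envelope outside the reader, reusing the illustrative XorBlockCodec from above (class and method names here are hypothetical):

import java.io.IOException;
import org.apache.lucene.codecs.uniformsplit.BlockEncoder;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.BytesRef;

final class MetadataEnvelopeSketch {
  // Write: [VInt numFields][VLong encodedLength][encoded payload]
  static ByteBuffersDataOutput write(int numFields, ByteBuffersDataOutput rawMetadata,
                                     XorBlockCodec codec) throws IOException {
    ByteBuffersDataOutput out = new ByteBuffersDataOutput();
    out.writeVInt(numFields);                 // stays unencoded on purpose
    BlockEncoder.WritableBytes encoded = codec.encode(rawMetadata.toDataInput(), rawMetadata.size());
    out.writeVLong(encoded.size());           // the decoder needs an explicit length
    encoded.writeTo(out);
    return out;
  }

  // Read: mirror of the above; returns a DataInput over the decoded raw bytes,
  // from which numFields metadata entries would then be read.
  static DataInput read(DataInput in, XorBlockCodec codec) throws IOException {
    int numFields = in.readVInt();
    long encodedLength = in.readVLong();
    BytesRef decoded = codec.decode(in, encodedLength);
    return new ByteArrayDataInput(decoded.bytes, 0, decoded.length);
  }
}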
@@ -212,7 +232,7 @@
   /**
    * Positions the given {@link IndexInput} at the beginning of the fields metadata.
    */
-  protected static void seekFieldsMetadata(IndexInput indexInput) throws IOException {
+  protected void seekFieldsMetadata(IndexInput indexInput) throws IOException {
     indexInput.seek(indexInput.length() - CodecUtil.footerLength() - 8);
    indexInput.seek(indexInput.readLong());
   }

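The two seeks are the reader half of a contract with writeFieldsMetadata (next hunk): the terms-blocks file ends with the fields start position written as a fixed-width long directly before the codec footer, so the pointer's own offset is computable from the file length alone. A sketch of the arithmetic; Long.BYTES is the literal 8 in the code above, and the class name is hypothetical:

import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.IndexInput;

final class SeekSketch {
  /**
   * Mirrors seekFieldsMetadata. Tail layout of the terms blocks file:
   *   ... blocks ... | fields metadata | long fieldsStartPosition | codec footer
   */
  static void seekToFieldsMetadata(IndexInput in) throws IOException {
    // The pointer is a fixed-width long (not a VLong) precisely so that its
    // position can be computed from the end of the file.
    in.seek(in.length() - CodecUtil.footerLength() - Long.BYTES);
    in.seek(in.readLong()); // jump to the VInt field count
  }
}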
@@ -249,11 +249,26 @@ public class UniformSplitTermsWriter extends FieldsConsumer {
   protected void writeFieldsMetadata(int fieldsNumber, ByteBuffersDataOutput fieldsOutput) throws IOException {
     long fieldsStartPosition = blockOutput.getFilePointer();
     blockOutput.writeVInt(fieldsNumber);
-    fieldsOutput.copyTo(blockOutput);
+    if (blockEncoder == null) {
+      writeUnencodedFieldsMetadata(fieldsOutput);
+    } else {
+      writeEncodedFieldsMetadata(fieldsOutput);
+    }
+    // Must be a fixed length. Read by UniformSplitTermsReader when seeking fields metadata.
     blockOutput.writeLong(fieldsStartPosition);
     CodecUtil.writeFooter(blockOutput);
   }
+
+  protected void writeUnencodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException {
+    fieldsOutput.copyTo(blockOutput);
+  }
+
+  protected void writeEncodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException {
+    BlockEncoder.WritableBytes encodedBytes = blockEncoder.encode(fieldsOutput.toDataInput(), fieldsOutput.size());
+    blockOutput.writeVLong(encodedBytes.size());
+    encodedBytes.writeTo(blockOutput);
+  }

   /**
    * @return 1 if the field was written; 0 otherwise.
    */

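With writing and reading now symmetric, plugging a codec into the format is a constructor call. A hedged usage sketch reusing the illustrative XorBlockCodec: the five-argument constructor shape follows the javadoc amended earlier in this diff, and the two defaults referenced are the writer's documented constants (verify against the class before relying on this).

import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat;
import org.apache.lucene.codecs.uniformsplit.UniformSplitTermsWriter;

final class CodecWiringSketch {
  static PostingsFormat xorUniformSplit() {
    XorBlockCodec codec = new XorBlockCodec(); // one instance serves as both encoder and decoder
    return new UniformSplitPostingsFormat(
        UniformSplitTermsWriter.DEFAULT_TARGET_NUM_BLOCK_LINES,
        UniformSplitTermsWriter.DEFAULT_DELTA_NUM_LINES,
        codec, codec,
        false); // dictionaryOnHeap: moot here, since a configured decoder forces on-heap loading anyway
  }
}

In practice such a format would be exposed through a per-field codec or SPI; the flags-based ROT13 test format later in this diff takes the same wiring approach.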
@@ -54,7 +54,7 @@ public class STUniformSplitPostingsFormat extends UniformSplitPostingsFormat {
    */
   public static final String TERMS_BLOCKS_EXTENSION = "stustb";

-  public static final int VERSION_CURRENT = 0;
+  public static final int VERSION_CURRENT = UniformSplitPostingsFormat.VERSION_CURRENT;

   public static final String NAME = "SharedTermsUniformSplit";

@@ -30,10 +30,7 @@ import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.store.IndexInput;

-import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.NAME;
-import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.TERMS_BLOCKS_EXTENSION;
-import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.TERMS_DICTIONARY_EXTENSION;
-import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.VERSION_CURRENT;
+import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.*;

 /**
  * A block-based terms index and dictionary based on the Uniform Split technique,

@@ -33,15 +33,9 @@ public class UnionFieldMetadataBuilder {
   private BytesRef maxLastTerm;

   public UnionFieldMetadataBuilder() {
-    reset();
-  }
-
-  public UnionFieldMetadataBuilder reset() {
     dictionaryStartFP = -1;
     minStartBlockFP = Long.MAX_VALUE;
     maxEndBlockFP = Long.MIN_VALUE;
-    maxLastTerm = null;
-    return this;
   }

   public UnionFieldMetadataBuilder addFieldMetadata(FieldMetadata fieldMetadata) {

@@ -51,17 +51,26 @@ public class TestUniformSplitPostingFormat extends BasePostingsFormatTestCase {
   @Before
   public void initialize() {
+    initializeInner();
+  }
+
+  protected void initializeInner() {
     UniformSplitRot13PostingsFormat.resetEncodingFlags();
   }

   @After
   public void checkEncodingCalled() {
     if (checkEncoding) {
-      assertTrue(UniformSplitRot13PostingsFormat.blocksEncoded);
-      assertTrue(UniformSplitRot13PostingsFormat.dictionaryEncoded);
-      if (shouldCheckDecoderWasCalled) {
-        assertTrue(UniformSplitRot13PostingsFormat.decoderCalled);
-      }
+      checkEncodingCalledInner();
     }
   }
+
+  protected void checkEncodingCalledInner() {
+    assertTrue(UniformSplitRot13PostingsFormat.blocksEncoded);
+    assertTrue(UniformSplitRot13PostingsFormat.fieldsMetadataEncoded);
+    assertTrue(UniformSplitRot13PostingsFormat.dictionaryEncoded);
+    if (shouldCheckDecoderWasCalled) {
+      assertTrue(UniformSplitRot13PostingsFormat.decoderCalled);
+    }
+  }

@@ -51,9 +51,9 @@ import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;

-public class STBlockReaderTest extends LuceneTestCase {
+public class TestSTBlockReader extends LuceneTestCase {

-  private static final String MOCK_BLOCK_OUTPUT_NAME = "STBlockReaderTest.tmp";
+  private static final String MOCK_BLOCK_OUTPUT_NAME = "TestSTBlockReader.tmp";

   private FieldInfos fieldInfos;
   private List<MockSTBlockLine> blockLines;

@@ -28,6 +28,7 @@ import org.apache.lucene.codecs.lucene84.Lucene84PostingsReader;
 import org.apache.lucene.codecs.lucene84.Lucene84PostingsWriter;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.store.ByteBuffersDataOutput;
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;

@@ -40,6 +41,7 @@ public class UniformSplitRot13PostingsFormat extends PostingsFormat {
   public static volatile boolean encoderCalled;
   public static volatile boolean decoderCalled;
   public static volatile boolean blocksEncoded;
+  public static volatile boolean fieldsMetadataEncoded;
   public static volatile boolean dictionaryEncoded;
   protected final boolean dictionaryOnHeap;

@@ -56,6 +58,7 @@ public class UniformSplitRot13PostingsFormat extends PostingsFormat {
     encoderCalled = false;
     decoderCalled = false;
     blocksEncoded = false;
+    fieldsMetadataEncoded = false;
     dictionaryEncoded = false;
   }

@@ -86,6 +89,11 @@ public class UniformSplitRot13PostingsFormat extends PostingsFormat {
         super.writeDictionary(dictionaryBuilder);
         recordDictionaryEncodingCall();
       }
+      @Override
+      protected void writeEncodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException {
+        super.writeEncodedFieldsMetadata(fieldsOutput);
+        recordFieldsMetadataEncodingCall();
+      }
     };
   }

@@ -96,6 +104,13 @@ public class UniformSplitRot13PostingsFormat extends PostingsFormat {
     }
   }

+  protected void recordFieldsMetadataEncodingCall() {
+    if (encoderCalled) {
+      fieldsMetadataEncoded = true;
+      encoderCalled = false;
+    }
+  }
+
   protected void recordDictionaryEncodingCall() {
     if (encoderCalled) {
       dictionaryEncoded = true;

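The encoderCalled flag is a handshake: the shared ROT13 encoder raises it on every encode() call, and whichever write path runs next claims it for its specific flag (blocksEncoded, dictionaryEncoded, or now fieldsMetadataEncoded) and clears it. The raising side is not part of this diff; a hypothetical sketch of what such a wrapper looks like:

import org.apache.lucene.codecs.uniformsplit.BlockEncoder;
import org.apache.lucene.codecs.uniformsplit.UniformSplitRot13PostingsFormat;

final class RecordingEncoderSketch {
  /**
   * Wraps any encoder so every call raises the shared flag that the
   * record*EncodingCall() methods then claim and clear.
   */
  static BlockEncoder recording(BlockEncoder delegate) {
    return (blockBytes, length) -> {
      UniformSplitRot13PostingsFormat.encoderCalled = true; // claimed by the next record* call
      return delegate.encode(blockBytes, length);
    };
  }
}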
@@ -28,6 +28,7 @@ import org.apache.lucene.codecs.uniformsplit.UniformSplitRot13PostingsFormat;
 import org.apache.lucene.codecs.uniformsplit.UniformSplitTermsWriter;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.store.ByteBuffersDataOutput;

 /**
  * {@link STUniformSplitPostingsFormat} with block encoding using ROT13 cypher.

@@ -50,6 +51,12 @@ public class STUniformSplitRot13PostingsFormat extends UniformSplitRot13PostingsFormat {
         super.writeDictionary(dictionaryBuilder);
         recordDictionaryEncodingCall();
       }
+      @Override
+      protected void writeEncodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException {
+        recordBlockEncodingCall();
+        super.writeEncodedFieldsMetadata(fieldsOutput);
+        recordFieldsMetadataEncodingCall();
+      }
     };
   }