From 8fa7412dec458e42f379cc856bd6ffebe8c6f8e9 Mon Sep 17 00:00:00 2001
From: Yannick Welsch
Date: Wed, 5 Jan 2022 16:28:00 +0100
Subject: [PATCH] LUCENE-10291: Only read/write postings when there is at
 least one indexed field (#539)

---
 .../apache/lucene/codecs/FieldsConsumer.java  |   8 +-
 .../codecs/perfield/PerFieldMergeState.java   |  14 +-
 .../perfield/PerFieldPostingsFormat.java      |   2 +
 .../org/apache/lucene/index/CheckIndex.java   |  19 +-
 .../org/apache/lucene/index/CodecReader.java  |  10 +-
 .../org/apache/lucene/index/FieldInfos.java   |   9 +
 .../lucene/index/FreqProxTermsWriter.java     |   5 +
 .../org/apache/lucene/index/MergeState.java   |   6 +-
 .../lucene/index/SegmentCommitInfo.java       |   4 +-
 .../lucene/index/SegmentCoreReaders.java      |  12 +-
 .../apache/lucene/index/SegmentMerger.java    |   6 +-
 .../services/org.apache.lucene.codecs.Codec   |  17 ++
 .../lucene/codecs/TestMinimalCodec.java       | 162 ++++++++++++++++++
 13 files changed, 257 insertions(+), 17 deletions(-)
 create mode 100644 lucene/core/src/test/META-INF/services/org.apache.lucene.codecs.Codec
 create mode 100644 lucene/core/src/test/org/apache/lucene/codecs/TestMinimalCodec.java

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java
index 0659827c80f..cf0f796e32a 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java
@@ -79,9 +79,11 @@ public abstract class FieldsConsumer implements Closeable {
       final FieldsProducer f = mergeState.fieldsProducers[readerIndex];
       final int maxDoc = mergeState.maxDocs[readerIndex];
 
-      f.checkIntegrity();
-      slices.add(new ReaderSlice(docBase, maxDoc, readerIndex));
-      fields.add(f);
+      if (f != null) {
+        f.checkIntegrity();
+        slices.add(new ReaderSlice(docBase, maxDoc, readerIndex));
+        fields.add(f);
+      }
       docBase += maxDoc;
     }
 
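Note: with this change, entries of mergeState.fieldsProducers may be null
for segments that have no indexed fields, so the default merge
implementation above now skips them. Custom FieldsConsumer subclasses that
override merge() should apply the same null check. A minimal sketch of such
an override (illustrative only; mergeCustom() is a hypothetical helper, not
a Lucene API):

    @Override
    public void merge(MergeState mergeState, NormsProducer norms) throws IOException {
      for (FieldsProducer producer : mergeState.fieldsProducers) {
        if (producer == null) {
          continue; // segment without postings
        }
        producer.checkIntegrity();
        mergeCustom(producer); // hypothetical per-producer merge step
      }
    }
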
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldMergeState.java b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldMergeState.java
index 47fe0dfc087..c340448e691 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldMergeState.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldMergeState.java
@@ -79,7 +79,10 @@ final class PerFieldMergeState {
       in.fieldInfos[i] = new FilterFieldInfos(orgFieldInfos[i], fields);
     }
     for (int i = 0; i < orgFieldsProducers.length; i++) {
-      in.fieldsProducers[i] = new FilterFieldsProducer(orgFieldsProducers[i], fields);
+      in.fieldsProducers[i] =
+          orgFieldsProducers[i] == null
+              ? null
+              : new FilterFieldsProducer(orgFieldsProducers[i], fields);
     }
     return in;
   }
@@ -103,6 +106,7 @@ final class PerFieldMergeState {
     // Copy of the private fields from FieldInfos
     // Renamed so as to be less confusing about which fields we're referring to
     private final boolean filteredHasVectors;
+    private final boolean filteredHasPostings;
     private final boolean filteredHasProx;
     private final boolean filteredHasPayloads;
     private final boolean filteredHasOffsets;
@@ -116,6 +120,7 @@ final class PerFieldMergeState {
       super(toArray(src));
 
       boolean hasVectors = false;
+      boolean hasPostings = false;
      boolean hasProx = false;
       boolean hasPayloads = false;
       boolean hasOffsets = false;
@@ -130,6 +135,7 @@ final class PerFieldMergeState {
         if (this.filteredNames.contains(fi.name)) {
           this.filtered.add(fi);
           hasVectors |= fi.hasVectors();
+          hasPostings |= fi.getIndexOptions() != IndexOptions.NONE;
           hasProx |= fi.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
           hasFreq |= fi.getIndexOptions() != IndexOptions.DOCS;
           hasOffsets |=
@@ -143,6 +149,7 @@ final class PerFieldMergeState {
       }
 
       this.filteredHasVectors = hasVectors;
+      this.filteredHasPostings = hasPostings;
       this.filteredHasProx = hasProx;
       this.filteredHasPayloads = hasPayloads;
       this.filteredHasOffsets = hasOffsets;
@@ -171,6 +178,11 @@ final class PerFieldMergeState {
       return filteredHasFreq;
     }
 
+    @Override
+    public boolean hasPostings() {
+      return filteredHasPostings;
+    }
+
     @Override
     public boolean hasProx() {
       return filteredHasProx;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java
index d183f62c56e..64688972f4d 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java
@@ -27,6 +27,7 @@ import java.util.IdentityHashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Objects;
 import java.util.ServiceLoader;
 import java.util.Set;
 import java.util.TreeMap;
@@ -185,6 +186,7 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
             new MergedIterator<>(
                 true,
                 Arrays.stream(mergeState.fieldsProducers)
+                    .filter(Objects::nonNull)
                     .map(FieldsProducer::iterator)
                     .toArray(Iterator[]::new));
     Map<PostingsFormat, FieldsGroup> formatToGroups = buildFieldsGroupMapping(indexedFieldNames);
diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
index de5141acb2c..f699937eb59 100644
--- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
+++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
@@ -44,6 +44,7 @@ import java.util.concurrent.TimeUnit;
 import java.util.function.Supplier;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.DocValuesProducer;
+import org.apache.lucene.codecs.FieldsProducer;
 import org.apache.lucene.codecs.NormsProducer;
 import org.apache.lucene.codecs.PointsReader;
 import org.apache.lucene.codecs.PostingsFormat;
@@ -2407,7 +2408,12 @@ public final class CheckIndex implements Closeable {
       infoStream.print("    test: terms, freq, prox...");
     }
 
-    final Fields fields = reader.getPostingsReader().getMergeInstance();
+    FieldsProducer fields = reader.getPostingsReader();
+    if (fields != null) {
+      fields = fields.getMergeInstance();
+    } else {
+      return new Status.TermIndexStatus();
+    }
     final FieldInfos fieldInfos = reader.getFieldInfos();
     NormsProducer normsProducer = reader.getNormsReader();
     if (normsProducer != null) {
@@ -3540,10 +3546,13 @@ public final class CheckIndex implements Closeable {
 
     final Bits liveDocs = reader.getLiveDocs();
 
-    final Fields postingsFields;
+    FieldsProducer postingsFields;
     // TODO: testTermsIndex
     if (doSlowChecks) {
-      postingsFields = reader.getPostingsReader().getMergeInstance();
+      postingsFields = reader.getPostingsReader();
+      if (postingsFields != null) {
+        postingsFields = postingsFields.getMergeInstance();
+      }
     } else {
       postingsFields = null;
     }
@@ -3597,6 +3606,10 @@ public final class CheckIndex implements Closeable {
           final boolean postingsHasPayload = fieldInfo.hasPayloads();
           final boolean vectorsHasPayload = terms.hasPayloads();
 
+          if (postingsFields == null) {
+            throw new CheckIndexException(
+                "vector field=" + field + " does not exist in postings; doc=" + j);
+          }
           Terms postingsTerms = postingsFields.terms(field);
           if (postingsTerms == null) {
             throw new CheckIndexException(
diff --git a/lucene/core/src/java/org/apache/lucene/index/CodecReader.java b/lucene/core/src/java/org/apache/lucene/index/CodecReader.java
index f78f126c70b..064cdbf2aaf 100644
--- a/lucene/core/src/java/org/apache/lucene/index/CodecReader.java
+++ b/lucene/core/src/java/org/apache/lucene/index/CodecReader.java
@@ -106,10 +106,14 @@ public abstract class CodecReader extends LeafReader {
   @Override
   public final Terms terms(String field) throws IOException {
     // ensureOpen(); no; getPostingsReader calls this
+    FieldsProducer fieldsProducer = getPostingsReader();
+    if (fieldsProducer == null) {
+      return null;
+    }
     // We could check the FieldInfo IndexOptions but there's no point since
     // PostingsReader will simply return null for fields that don't exist or that have no terms
     // index.
-    return getPostingsReader().terms(field);
+    return fieldsProducer.terms(field);
   }
 
   // returns the FieldInfo that corresponds to the given field and type, or
@@ -241,7 +245,9 @@ public abstract class CodecReader extends LeafReader {
     ensureOpen();
 
     // terms/postings
-    getPostingsReader().checkIntegrity();
+    if (getPostingsReader() != null) {
+      getPostingsReader().checkIntegrity();
+    }
 
     // norms
     if (getNormsReader() != null) {
diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java
index 44fb021395f..76cf9bb654a 100644
--- a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java
@@ -48,6 +48,7 @@ public class FieldInfos implements Iterable<FieldInfo> {
   public static final FieldInfos EMPTY = new FieldInfos(new FieldInfo[0]);
 
   private final boolean hasFreq;
+  private final boolean hasPostings;
   private final boolean hasProx;
   private final boolean hasPayloads;
   private final boolean hasOffsets;
@@ -67,6 +68,7 @@ public class FieldInfos implements Iterable<FieldInfo> {
   /** Constructs a new FieldInfos from an array of FieldInfo objects */
   public FieldInfos(FieldInfo[] infos) {
     boolean hasVectors = false;
+    boolean hasPostings = false;
     boolean hasProx = false;
     boolean hasPayloads = false;
     boolean hasOffsets = false;
@@ -112,6 +114,7 @@ public class FieldInfos implements Iterable<FieldInfo> {
       }
 
       hasVectors |= info.hasVectors();
+      hasPostings |= info.getIndexOptions() != IndexOptions.NONE;
       hasProx |= info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
       hasFreq |= info.getIndexOptions() != IndexOptions.DOCS;
       hasOffsets |=
@@ -132,6 +135,7 @@ public class FieldInfos implements Iterable<FieldInfo> {
     }
 
     this.hasVectors = hasVectors;
+    this.hasPostings = hasPostings;
    this.hasProx = hasProx;
     this.hasPayloads = hasPayloads;
     this.hasOffsets = hasOffsets;
@@ -200,6 +204,11 @@ public class FieldInfos implements Iterable<FieldInfo> {
     return hasFreq;
   }
 
+  /** Returns true if any fields have postings */
+  public boolean hasPostings() {
+    return hasPostings;
+  }
+
   /** Returns true if any fields have positions */
   public boolean hasProx() {
     return hasProx;
diff --git a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
index e417549a8dd..ca2775c348a 100644
--- a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
@@ -99,6 +99,11 @@ final class FreqProxTermsWriter extends TermsHash {
       }
     }
 
+    if (!state.fieldInfos.hasPostings()) {
+      assert allFields.isEmpty();
+      return;
+    }
+
     // Sort by field name
     CollectionUtil.introSort(allFields);
 
diff --git a/lucene/core/src/java/org/apache/lucene/index/MergeState.java b/lucene/core/src/java/org/apache/lucene/index/MergeState.java
index 6bc7e76440a..faea6cbf0e8 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MergeState.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MergeState.java
@@ -132,7 +132,11 @@ public class MergeState {
         termVectorsReaders[i] = termVectorsReaders[i].getMergeInstance();
       }
 
-      fieldsProducers[i] = reader.getPostingsReader().getMergeInstance();
+      fieldsProducers[i] = reader.getPostingsReader();
+      if (fieldsProducers[i] != null) {
+        fieldsProducers[i] = fieldsProducers[i].getMergeInstance();
+      }
+
       pointsReaders[i] = reader.getPointsReader();
       if (pointsReaders[i] != null) {
         pointsReaders[i] = pointsReaders[i].getMergeInstance();
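Note: the new FieldInfos#hasPostings() mirrors the existing hasFreq()/
hasProx() accessors and is the check the read and write paths below key
off. A minimal sketch of using it from application code, given an open
DirectoryReader `reader` (illustrative only; the "body" field name is an
assumption):

    for (LeafReaderContext ctx : reader.leaves()) {
      LeafReader leaf = ctx.reader();
      if (leaf.getFieldInfos().hasPostings() == false) {
        continue; // no field in this segment indexed any terms
      }
      Terms terms = leaf.terms("body"); // may still be null for this field
      // ... iterate terms ...
    }
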
diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentCommitInfo.java b/lucene/core/src/java/org/apache/lucene/index/SegmentCommitInfo.java
index b8bb1d73eee..9357cf10e77 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SegmentCommitInfo.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SegmentCommitInfo.java
@@ -244,7 +244,9 @@ public class SegmentCommitInfo {
     // updates) and then maybe even be able to remove LiveDocsFormat.files().
 
     // Must separately add any live docs files:
-    info.getCodec().liveDocsFormat().files(this, files);
+    if (hasDeletions()) {
+      info.getCodec().liveDocsFormat().files(this, files);
+    }
 
     // must separately add any field updates files
     for (Set<String> updatefiles : dvUpdatesFiles.values()) {
diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java b/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java
index 68de0f20e73..df733b11196 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java
@@ -112,10 +112,14 @@ final class SegmentCoreReaders {
       final SegmentReadState segmentReadState =
           new SegmentReadState(cfsDir, si.info, coreFieldInfos, context);
 
-      final PostingsFormat format = codec.postingsFormat();
-      // Ask codec for its Fields
-      fields = format.fieldsProducer(segmentReadState);
-      assert fields != null;
+      if (coreFieldInfos.hasPostings()) {
+        final PostingsFormat format = codec.postingsFormat();
+        // Ask codec for its Fields
+        fields = format.fieldsProducer(segmentReadState);
+        assert fields != null;
+      } else {
+        fields = null;
+      }
       // ask codec for its Norms:
       // TODO: since we don't write any norms file if there are no norms,
       // kinda jaky to assume the codec handles the case of no norms file at all gracefully?!
diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java b/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java
index 689b7725c75..805b5e5a23d 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java
@@ -203,8 +203,10 @@ final class SegmentMerger {
         // Use the merge instance in order to reuse the same IndexInput for all terms
         normsMergeInstance = norms.getMergeInstance();
       }
-      try (FieldsConsumer consumer = codec.postingsFormat().fieldsConsumer(segmentWriteState)) {
-        consumer.merge(mergeState, normsMergeInstance);
+      if (mergeState.mergeFieldInfos.hasPostings()) {
+        try (FieldsConsumer consumer = codec.postingsFormat().fieldsConsumer(segmentWriteState)) {
+          consumer.merge(mergeState, normsMergeInstance);
+        }
       }
     }
   }
diff --git a/lucene/core/src/test/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/core/src/test/META-INF/services/org.apache.lucene.codecs.Codec
new file mode 100644
index 00000000000..8c7c0df6396
--- /dev/null
+++ b/lucene/core/src/test/META-INF/services/org.apache.lucene.codecs.Codec
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+org.apache.lucene.codecs.TestMinimalCodec$MinimalCodec
+org.apache.lucene.codecs.TestMinimalCodec$MinimalCompoundCodec
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/TestMinimalCodec.java b/lucene/core/src/test/org/apache/lucene/codecs/TestMinimalCodec.java
new file mode 100644
index 00000000000..c3c4edb9bfc
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/codecs/TestMinimalCodec.java
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs;
+
+import static com.carrotsearch.randomizedtesting.RandomizedTest.randomBoolean;
+
+import java.io.IOException;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.tests.analysis.MockAnalyzer;
+import org.apache.lucene.tests.store.BaseDirectoryWrapper;
+import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.tests.util.TestUtil;
+
+/**
+ * Tests to ensure that {@link Codec}s won't need to implement all formats in case where only a
+ * small subset of Lucene's functionality is used.
+ */
+public class TestMinimalCodec extends LuceneTestCase {
+
+  public void testMinimalCodec() throws IOException {
+    runMinimalCodecTest(false);
+  }
+
+  public void testMinimalCompoundCodec() throws IOException {
+    runMinimalCodecTest(true);
+  }
+
+  private void runMinimalCodecTest(boolean useCompoundFile) throws IOException {
+    try (BaseDirectoryWrapper dir = newDirectory()) {
+      IndexWriterConfig writerConfig =
+          newIndexWriterConfig(new MockAnalyzer(random()))
+              .setCodec(useCompoundFile ? new MinimalCompoundCodec() : new MinimalCodec())
+              .setUseCompoundFile(useCompoundFile);
+      if (!useCompoundFile) {
+        writerConfig.getMergePolicy().setNoCFSRatio(0.0);
+        writerConfig.getMergePolicy().setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY);
+      }
+
+      try (IndexWriter writer = new IndexWriter(dir, writerConfig)) {
+        writer.addDocument(basicDocument());
+        writer.flush();
+        // create second segment
+        writer.addDocument(basicDocument());
+        writer.forceMerge(1); // test merges
+        if (randomBoolean()) {
+          writer.commit();
+        }
+
+        try (DirectoryReader reader = DirectoryReader.open(writer)) {
+          assertEquals(2, reader.numDocs());
+        }
+      }
+    }
+  }
+
+  /** returns a basic document with no indexed fields */
+  private static Document basicDocument() {
+    return new Document();
+  }
+
+  /** Minimal codec implementation for working with the most basic documents */
+  public static class MinimalCodec extends Codec {
+
+    protected final Codec wrappedCodec = TestUtil.getDefaultCodec();
+
+    public MinimalCodec() {
+      this("MinimalCodec");
+    }
+
+    protected MinimalCodec(String name) {
+      super(name);
+    }
+
+    @Override
+    public FieldInfosFormat fieldInfosFormat() {
+      return wrappedCodec.fieldInfosFormat();
+    }
+
+    @Override
+    public SegmentInfoFormat segmentInfoFormat() {
+      return wrappedCodec.segmentInfoFormat();
+    }
+
+    @Override
+    public CompoundFormat compoundFormat() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public LiveDocsFormat liveDocsFormat() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public StoredFieldsFormat storedFieldsFormat() {
+      // TODO: avoid calling this when no stored fields are written or read
+      return wrappedCodec.storedFieldsFormat();
+    }
+
+    @Override
+    public PostingsFormat postingsFormat() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public DocValuesFormat docValuesFormat() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public TermVectorsFormat termVectorsFormat() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public NormsFormat normsFormat() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public PointsFormat pointsFormat() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public KnnVectorsFormat knnVectorsFormat() {
+      throw new UnsupportedOperationException();
+    }
+  }
+
+  /**
+   * Minimal codec implementation for working with the most basic documents, supporting compound
+   * formats
+   */
+  public static class MinimalCompoundCodec extends MinimalCodec {
+    public MinimalCompoundCodec() {
+      super("MinimalCompoundCodec");
+    }
+
+    @Override
+    public CompoundFormat compoundFormat() {
+      return wrappedCodec.compoundFormat();
+    }
+  }
+}
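
Note: the test above exercises the write path; on the read side, the
CodecReader and SegmentCoreReaders changes mean term lookups on an index
with no indexed fields now return null instead of failing. A minimal sketch
of what a caller can expect (illustrative only; "anyField" is an arbitrary
name):

    try (DirectoryReader reader = DirectoryReader.open(dir)) {
      for (LeafReaderContext ctx : reader.leaves()) {
        LeafReader leaf = ctx.reader();
        assert leaf.getFieldInfos().hasPostings() == false;
        assert leaf.terms("anyField") == null; // null, not an exception
      }
    }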