mirror of https://github.com/apache/lucene.git
LUCENE-10291: Only read/write postings when there is at least one indexed field (#539)
parent 65296e5f84
commit 8fa7412dec
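
In short: postings are now written and opened only when a segment has at least one indexed field. FieldInfos gains a hasPostings() accessor, entries in MergeState.fieldsProducers and the result of CodecReader.getPostingsReader() may now be null, and callers guard accordingly. The sketch below shows that calling pattern; it is illustrative only and not part of this commit, and the class and method names (PostingsGuardExample, termsOrNull, shouldWritePostings) are invented for the example.

// Illustrative sketch (not from this commit): the null-tolerant pattern callers now follow.
import java.io.IOException;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Terms;

final class PostingsGuardExample {

  /** Returns the terms for {@code field}, or null when the segment has no postings at all. */
  static Terms termsOrNull(CodecReader reader, String field) throws IOException {
    FieldsProducer postings = reader.getPostingsReader(); // may be null after this change
    if (postings == null) {
      return null; // segment was written without any indexed field
    }
    return postings.terms(field);
  }

  /** Writers and mergers consult the new accessor before touching the postings format. */
  static boolean shouldWritePostings(FieldInfos fieldInfos) {
    return fieldInfos.hasPostings(); // accessor added by this change
  }

  private PostingsGuardExample() {}
}

CheckIndex, SegmentMerger, and SegmentCoreReaders in the hunks below apply exactly this guard before reading, merging, or writing postings.
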
@@ -79,9 +79,11 @@ public abstract class FieldsConsumer implements Closeable {
       final FieldsProducer f = mergeState.fieldsProducers[readerIndex];

       final int maxDoc = mergeState.maxDocs[readerIndex];
-      f.checkIntegrity();
-      slices.add(new ReaderSlice(docBase, maxDoc, readerIndex));
-      fields.add(f);
+      if (f != null) {
+        f.checkIntegrity();
+        slices.add(new ReaderSlice(docBase, maxDoc, readerIndex));
+        fields.add(f);
+      }
       docBase += maxDoc;
     }

@@ -79,7 +79,10 @@ final class PerFieldMergeState {
       in.fieldInfos[i] = new FilterFieldInfos(orgFieldInfos[i], fields);
     }
     for (int i = 0; i < orgFieldsProducers.length; i++) {
-      in.fieldsProducers[i] = new FilterFieldsProducer(orgFieldsProducers[i], fields);
+      in.fieldsProducers[i] =
+          orgFieldsProducers[i] == null
+              ? null
+              : new FilterFieldsProducer(orgFieldsProducers[i], fields);
     }
     return in;
   }
@@ -103,6 +106,7 @@ final class PerFieldMergeState {
     // Copy of the private fields from FieldInfos
     // Renamed so as to be less confusing about which fields we're referring to
     private final boolean filteredHasVectors;
+    private final boolean filteredHasPostings;
     private final boolean filteredHasProx;
     private final boolean filteredHasPayloads;
     private final boolean filteredHasOffsets;
@@ -116,6 +120,7 @@ final class PerFieldMergeState {
       super(toArray(src));

       boolean hasVectors = false;
+      boolean hasPostings = false;
       boolean hasProx = false;
       boolean hasPayloads = false;
       boolean hasOffsets = false;
@@ -130,6 +135,7 @@ final class PerFieldMergeState {
         if (this.filteredNames.contains(fi.name)) {
           this.filtered.add(fi);
           hasVectors |= fi.hasVectors();
+          hasPostings |= fi.getIndexOptions() != IndexOptions.NONE;
           hasProx |= fi.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
           hasFreq |= fi.getIndexOptions() != IndexOptions.DOCS;
           hasOffsets |=
@@ -143,6 +149,7 @@ final class PerFieldMergeState {
       }

       this.filteredHasVectors = hasVectors;
+      this.filteredHasPostings = hasPostings;
       this.filteredHasProx = hasProx;
       this.filteredHasPayloads = hasPayloads;
       this.filteredHasOffsets = hasOffsets;
@@ -171,6 +178,11 @@ final class PerFieldMergeState {
       return filteredHasFreq;
     }

+    @Override
+    public boolean hasPostings() {
+      return filteredHasPostings;
+    }
+
     @Override
     public boolean hasProx() {
       return filteredHasProx;
@@ -27,6 +27,7 @@ import java.util.IdentityHashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Objects;
 import java.util.ServiceLoader;
 import java.util.Set;
 import java.util.TreeMap;
@@ -185,6 +186,7 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
             new MergedIterator<>(
                 true,
                 Arrays.stream(mergeState.fieldsProducers)
+                    .filter(Objects::nonNull)
                     .map(FieldsProducer::iterator)
                     .toArray(Iterator[]::new));
     Map<PostingsFormat, FieldsGroup> formatToGroups = buildFieldsGroupMapping(indexedFieldNames);
@@ -44,6 +44,7 @@ import java.util.concurrent.TimeUnit;
 import java.util.function.Supplier;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.DocValuesProducer;
+import org.apache.lucene.codecs.FieldsProducer;
 import org.apache.lucene.codecs.NormsProducer;
 import org.apache.lucene.codecs.PointsReader;
 import org.apache.lucene.codecs.PostingsFormat;
@@ -2407,7 +2408,12 @@ public final class CheckIndex implements Closeable {
       infoStream.print("    test: terms, freq, prox...");
     }

-    final Fields fields = reader.getPostingsReader().getMergeInstance();
+    FieldsProducer fields = reader.getPostingsReader();
+    if (fields != null) {
+      fields = fields.getMergeInstance();
+    } else {
+      return new Status.TermIndexStatus();
+    }
     final FieldInfos fieldInfos = reader.getFieldInfos();
     NormsProducer normsProducer = reader.getNormsReader();
     if (normsProducer != null) {
@@ -3540,10 +3546,13 @@ public final class CheckIndex implements Closeable {

     final Bits liveDocs = reader.getLiveDocs();

-    final Fields postingsFields;
+    FieldsProducer postingsFields;
     // TODO: testTermsIndex
     if (doSlowChecks) {
-      postingsFields = reader.getPostingsReader().getMergeInstance();
+      postingsFields = reader.getPostingsReader();
+      if (postingsFields != null) {
+        postingsFields = postingsFields.getMergeInstance();
+      }
     } else {
       postingsFields = null;
     }
@@ -3597,6 +3606,10 @@ public final class CheckIndex implements Closeable {
             final boolean postingsHasPayload = fieldInfo.hasPayloads();
             final boolean vectorsHasPayload = terms.hasPayloads();

+            if (postingsFields == null) {
+              throw new CheckIndexException(
+                  "vector field=" + field + " does not exist in postings; doc=" + j);
+            }
             Terms postingsTerms = postingsFields.terms(field);
             if (postingsTerms == null) {
               throw new CheckIndexException(
@@ -106,10 +106,14 @@ public abstract class CodecReader extends LeafReader {
   @Override
   public final Terms terms(String field) throws IOException {
     // ensureOpen(); no; getPostingsReader calls this
+    FieldsProducer fieldsProducer = getPostingsReader();
+    if (fieldsProducer == null) {
+      return null;
+    }
     // We could check the FieldInfo IndexOptions but there's no point since
     // PostingsReader will simply return null for fields that don't exist or that have no terms
     // index.
-    return getPostingsReader().terms(field);
+    return fieldsProducer.terms(field);
   }

   // returns the FieldInfo that corresponds to the given field and type, or
@@ -241,7 +245,9 @@ public abstract class CodecReader extends LeafReader {
     ensureOpen();

     // terms/postings
-    getPostingsReader().checkIntegrity();
+    if (getPostingsReader() != null) {
+      getPostingsReader().checkIntegrity();
+    }

     // norms
     if (getNormsReader() != null) {
@@ -48,6 +48,7 @@ public class FieldInfos implements Iterable<FieldInfo> {
   public static final FieldInfos EMPTY = new FieldInfos(new FieldInfo[0]);

   private final boolean hasFreq;
+  private final boolean hasPostings;
   private final boolean hasProx;
   private final boolean hasPayloads;
   private final boolean hasOffsets;
@@ -67,6 +68,7 @@ public class FieldInfos implements Iterable<FieldInfo> {
   /** Constructs a new FieldInfos from an array of FieldInfo objects */
   public FieldInfos(FieldInfo[] infos) {
     boolean hasVectors = false;
+    boolean hasPostings = false;
     boolean hasProx = false;
     boolean hasPayloads = false;
     boolean hasOffsets = false;
@@ -112,6 +114,7 @@ public class FieldInfos implements Iterable<FieldInfo> {
       }

       hasVectors |= info.hasVectors();
+      hasPostings |= info.getIndexOptions() != IndexOptions.NONE;
       hasProx |= info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
       hasFreq |= info.getIndexOptions() != IndexOptions.DOCS;
       hasOffsets |=
@@ -132,6 +135,7 @@ public class FieldInfos implements Iterable<FieldInfo> {
     }

     this.hasVectors = hasVectors;
+    this.hasPostings = hasPostings;
     this.hasProx = hasProx;
     this.hasPayloads = hasPayloads;
     this.hasOffsets = hasOffsets;
@@ -200,6 +204,11 @@ public class FieldInfos implements Iterable<FieldInfo> {
     return hasFreq;
   }

+  /** Returns true if any fields have postings */
+  public boolean hasPostings() {
+    return hasPostings;
+  }
+
   /** Returns true if any fields have positions */
   public boolean hasProx() {
     return hasProx;
@@ -99,6 +99,11 @@ final class FreqProxTermsWriter extends TermsHash {
       }
     }

+    if (!state.fieldInfos.hasPostings()) {
+      assert allFields.isEmpty();
+      return;
+    }
+
     // Sort by field name
     CollectionUtil.introSort(allFields);

@@ -132,7 +132,11 @@ public class MergeState {
         termVectorsReaders[i] = termVectorsReaders[i].getMergeInstance();
       }

-      fieldsProducers[i] = reader.getPostingsReader().getMergeInstance();
+      fieldsProducers[i] = reader.getPostingsReader();
+      if (fieldsProducers[i] != null) {
+        fieldsProducers[i] = fieldsProducers[i].getMergeInstance();
+      }
+
       pointsReaders[i] = reader.getPointsReader();
       if (pointsReaders[i] != null) {
         pointsReaders[i] = pointsReaders[i].getMergeInstance();
@@ -244,7 +244,9 @@ public class SegmentCommitInfo {
     // updates) and then maybe even be able to remove LiveDocsFormat.files().

     // Must separately add any live docs files:
-    info.getCodec().liveDocsFormat().files(this, files);
+    if (hasDeletions()) {
+      info.getCodec().liveDocsFormat().files(this, files);
+    }

     // must separately add any field updates files
     for (Set<String> updatefiles : dvUpdatesFiles.values()) {
@@ -112,10 +112,14 @@ final class SegmentCoreReaders {

       final SegmentReadState segmentReadState =
           new SegmentReadState(cfsDir, si.info, coreFieldInfos, context);
-      final PostingsFormat format = codec.postingsFormat();
-      // Ask codec for its Fields
-      fields = format.fieldsProducer(segmentReadState);
-      assert fields != null;
+      if (coreFieldInfos.hasPostings()) {
+        final PostingsFormat format = codec.postingsFormat();
+        // Ask codec for its Fields
+        fields = format.fieldsProducer(segmentReadState);
+        assert fields != null;
+      } else {
+        fields = null;
+      }
       // ask codec for its Norms:
       // TODO: since we don't write any norms file if there are no norms,
       // kinda jaky to assume the codec handles the case of no norms file at all gracefully?!
@@ -203,8 +203,10 @@ final class SegmentMerger {
         // Use the merge instance in order to reuse the same IndexInput for all terms
         normsMergeInstance = norms.getMergeInstance();
       }
-      try (FieldsConsumer consumer = codec.postingsFormat().fieldsConsumer(segmentWriteState)) {
-        consumer.merge(mergeState, normsMergeInstance);
+      if (mergeState.mergeFieldInfos.hasPostings()) {
+        try (FieldsConsumer consumer = codec.postingsFormat().fieldsConsumer(segmentWriteState)) {
+          consumer.merge(mergeState, normsMergeInstance);
+        }
       }
     }
   }
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+org.apache.lucene.codecs.TestMinimalCodec$MinimalCodec
+org.apache.lucene.codecs.TestMinimalCodec$MinimalCompoundCodec
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs;
+
+import static com.carrotsearch.randomizedtesting.RandomizedTest.randomBoolean;
+
+import java.io.IOException;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.tests.analysis.MockAnalyzer;
+import org.apache.lucene.tests.store.BaseDirectoryWrapper;
+import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.tests.util.TestUtil;
+
+/**
+ * Tests to ensure that {@link Codec}s won't need to implement all formats in case where only a
+ * small subset of Lucene's functionality is used.
+ */
+public class TestMinimalCodec extends LuceneTestCase {
+
+  public void testMinimalCodec() throws IOException {
+    runMinimalCodecTest(false);
+  }
+
+  public void testMinimalCompoundCodec() throws IOException {
+    runMinimalCodecTest(true);
+  }
+
+  private void runMinimalCodecTest(boolean useCompoundFile) throws IOException {
+    try (BaseDirectoryWrapper dir = newDirectory()) {
+      IndexWriterConfig writerConfig =
+          newIndexWriterConfig(new MockAnalyzer(random()))
+              .setCodec(useCompoundFile ? new MinimalCompoundCodec() : new MinimalCodec())
+              .setUseCompoundFile(useCompoundFile);
+      if (!useCompoundFile) {
+        writerConfig.getMergePolicy().setNoCFSRatio(0.0);
+        writerConfig.getMergePolicy().setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY);
+      }
+
+      try (IndexWriter writer = new IndexWriter(dir, writerConfig)) {
+        writer.addDocument(basicDocument());
+        writer.flush();
+        // create second segment
+        writer.addDocument(basicDocument());
+        writer.forceMerge(1); // test merges
+        if (randomBoolean()) {
+          writer.commit();
+        }
+
+        try (DirectoryReader reader = DirectoryReader.open(writer)) {
+          assertEquals(2, reader.numDocs());
+        }
+      }
+    }
+  }
+
+  /** returns a basic document with no indexed fields */
+  private static Document basicDocument() {
+    return new Document();
+  }
+
+  /** Minimal codec implementation for working with the most basic documents */
+  public static class MinimalCodec extends Codec {
+
+    protected final Codec wrappedCodec = TestUtil.getDefaultCodec();
+
+    public MinimalCodec() {
+      this("MinimalCodec");
+    }
+
+    protected MinimalCodec(String name) {
+      super(name);
+    }
+
+    @Override
+    public FieldInfosFormat fieldInfosFormat() {
+      return wrappedCodec.fieldInfosFormat();
+    }
+
+    @Override
+    public SegmentInfoFormat segmentInfoFormat() {
+      return wrappedCodec.segmentInfoFormat();
+    }
+
+    @Override
+    public CompoundFormat compoundFormat() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public LiveDocsFormat liveDocsFormat() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public StoredFieldsFormat storedFieldsFormat() {
+      // TODO: avoid calling this when no stored fields are written or read
+      return wrappedCodec.storedFieldsFormat();
+    }
+
+    @Override
+    public PostingsFormat postingsFormat() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public DocValuesFormat docValuesFormat() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public TermVectorsFormat termVectorsFormat() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public NormsFormat normsFormat() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public PointsFormat pointsFormat() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public KnnVectorsFormat knnVectorsFormat() {
+      throw new UnsupportedOperationException();
+    }
+  }
+
+  /**
+   * Minimal codec implementation for working with the most basic documents, supporting compound
+   * formats
+   */
+  public static class MinimalCompoundCodec extends MinimalCodec {
+    public MinimalCompoundCodec() {
+      super("MinimalCompoundCodec");
+    }
+
+    @Override
+    public CompoundFormat compoundFormat() {
+      return wrappedCodec.compoundFormat();
+    }
+  }
+}