LUCENE-5578: Prevent stored fields from accumulating stale data on merge.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1585900 13f79535-47bb-0310-9956-ffa450edef68
Adrien Grand 2014-04-09 07:44:47 +00:00
parent 5f11ef3fb0
commit 47eb98d13f
14 changed files with 218 additions and 84 deletions
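Why this was needed (summary of the diffs below): CompressingStoredFieldsReader.copyCompressedData used the physical length of the .fdt file as the end of the last chunk. Since the file now ends with a codec footer, every bulk merge copied the previous footer bytes along with the chunk data and then wrote a fresh footer behind them, so stored fields grew a little on every merge even when the content was unchanged. The fix records the exact end of the chunk data (maxPointer) in the fields index and bounds the copy with it. Below is a minimal, self-contained sketch of the failure mode — not Lucene code; all names are illustrative:

public class StaleTailDemo {

  // Stands in for the codec footer that CodecUtil.writeFooter appends.
  static final byte[] FOOTER = {42, 42, 42, 42};

  // Bulk "merge": copy what the reader believes is the data region of the
  // input file, then append a fresh footer, like the stored fields writer.
  static byte[] merge(byte[] file, int endOfData) {
    byte[] out = new byte[endOfData + FOOTER.length];
    System.arraycopy(file, 0, out, 0, endOfData);
    System.arraycopy(FOOTER, 0, out, endOfData, FOOTER.length);
    return out;
  }

  public static void main(String[] args) {
    byte[] data = {1, 2, 3};
    byte[] file = merge(data, data.length); // initial segment: 3 data bytes + footer

    // Buggy bound: "end of data == file length". Each merge treats the old
    // footer as chunk data, so the file keeps growing for the same content.
    byte[] buggy = file;
    for (int i = 0; i < 3; i++) buggy = merge(buggy, buggy.length);
    System.out.println("file-length bound after 3 merges: " + buggy.length); // 19

    // Fixed bound: remember where the data really ends (the maxPointer idea)
    // and copy exactly that much, regardless of trailing footer bytes.
    byte[] fixed = file;
    for (int i = 0; i < 3; i++) fixed = merge(fixed, data.length);
    System.out.println("maxPointer bound after 3 merges:  " + fixed.length); // 7
  }
}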

View File

@@ -192,7 +192,7 @@ public final class CompressingStoredFieldsIndexWriter implements Closeable {
     maxStartPointer = startPointer;
   }
 
-  void finish(int numDocs) throws IOException {
+  void finish(int numDocs, long maxPointer) throws IOException {
     if (numDocs != totalDocs) {
       throw new IllegalStateException("Expected " + numDocs + " docs, but got " + totalDocs);
     }
@@ -200,6 +200,7 @@ public final class CompressingStoredFieldsIndexWriter implements Closeable {
       writeBlock();
     }
     fieldsIndexOut.writeVInt(0); // end marker
+    fieldsIndexOut.writeVLong(maxPointer);
     CodecUtil.writeFooter(fieldsIndexOut);
   }
 
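With this change, the tail of the stored fields index file is laid out as follows (a sketch inferred from the writes above, not an authoritative format spec):

  ... index blocks ...
  VInt(0)            end marker (unchanged)
  VLong(maxPointer)  new: file pointer where the chunk data in the fields data file ends
  codec footer       written by CodecUtil.writeFooter

The same VLong is what CompressingStoredFieldsReader reads back, and what CompressingTermVectorsReader skips over, in the diffs below.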

View File

@@ -84,6 +84,7 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
   private final int version;
   private final FieldInfos fieldInfos;
   private final CompressingStoredFieldsIndexReader indexReader;
+  private final long maxPointer;
   private final IndexInput fieldsStream;
   private final int chunkSize;
   private final int packedIntsVersion;
@@ -99,6 +100,7 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
     this.fieldInfos = reader.fieldInfos;
     this.fieldsStream = reader.fieldsStream.clone();
     this.indexReader = reader.indexReader.clone();
+    this.maxPointer = reader.maxPointer;
     this.chunkSize = reader.chunkSize;
     this.packedIntsVersion = reader.packedIntsVersion;
     this.compressionMode = reader.compressionMode;
@@ -118,24 +120,27 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
     numDocs = si.getDocCount();
     ChecksumIndexInput indexStream = null;
     try {
-      // Load the index into memory
       final String indexStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_INDEX_EXTENSION);
+      final String fieldsStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_EXTENSION);
+
+      // Load the index into memory
       indexStream = d.openChecksumInput(indexStreamFN, context);
       final String codecNameIdx = formatName + CODEC_SFX_IDX;
       version = CodecUtil.checkHeader(indexStream, codecNameIdx, VERSION_START, VERSION_CURRENT);
       assert CodecUtil.headerLength(codecNameIdx) == indexStream.getFilePointer();
       indexReader = new CompressingStoredFieldsIndexReader(indexStream, si);
 
       if (version >= VERSION_CHECKSUM) {
+        maxPointer = indexStream.readVLong();
+        assert maxPointer + CodecUtil.footerLength() == d.fileLength(fieldsStreamFN);
         CodecUtil.checkFooter(indexStream);
       } else {
+        maxPointer = d.fileLength(fieldsStreamFN);
         CodecUtil.checkEOF(indexStream);
       }
       indexStream.close();
       indexStream = null;
 
       // Open the data file and read metadata
-      final String fieldsStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_EXTENSION);
       fieldsStream = d.openInput(fieldsStreamFN, context);
       final String codecNameDat = formatName + CODEC_SFX_DAT;
       final int fieldsVersion = CodecUtil.checkHeader(fieldsStream, codecNameDat, VERSION_START, VERSION_CURRENT);
@@ -502,8 +507,9 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
    * Copy compressed data.
    */
   void copyCompressedData(DataOutput out) throws IOException {
+    assert getVersion() == VERSION_CURRENT;
     final long chunkEnd = docBase + chunkDocs == numDocs
-        ? fieldsStream.length()
+        ? maxPointer
         : indexReader.getStartPointer(docBase + chunkDocs);
     out.copyBytes(fieldsStream, chunkEnd - fieldsStream.getFilePointer());
   }

View File

@@ -316,7 +316,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
     if (docBase != numDocs) {
       throw new RuntimeException("Wrote " + docBase + " docs, finish called with numDocs=" + numDocs);
     }
-    indexWriter.finish(numDocs);
+    indexWriter.finish(numDocs, fieldsStream.getFilePointer());
     CodecUtil.writeFooter(fieldsStream);
     assert bufferedDocs.length == 0;
   }

View File

@@ -114,6 +114,7 @@ public final class CompressingTermVectorsReader extends TermVectorsReader implem
       indexReader = new CompressingStoredFieldsIndexReader(indexStream, si);
       if (version >= VERSION_CHECKSUM) {
+        indexStream.readVLong(); // the end of the data file
         CodecUtil.checkFooter(indexStream);
       } else {
         CodecUtil.checkEOF(indexStream);
       }

View File

@@ -661,7 +661,7 @@ public final class CompressingTermVectorsWriter extends TermVectorsWriter {
     if (numDocs != this.numDocs) {
       throw new RuntimeException("Wrote " + this.numDocs + " docs, finish called with numDocs=" + numDocs);
     }
-    indexWriter.finish(numDocs);
+    indexWriter.finish(numDocs, vectorsStream.getFilePointer());
     CodecUtil.writeFooter(vectorsStream);
   }

View File

@@ -199,6 +199,19 @@ public final class IndexFileNames {
     return filename;
   }
 
+  /**
+   * Return the extension (anything after the first '.'),
+   * or null if there is no '.' in the file name.
+   */
+  public static String getExtension(String filename) {
+    final int idx = filename.indexOf('.');
+    if (idx == -1) {
+      return null;
+    } else {
+      return filename.substring(idx + 1, filename.length());
+    }
+  }
+
   /**
    * All files created by codecs much match this pattern (checked in
    * SegmentInfo).
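The helper takes everything after the first '.', not the last, which matches how Lucene segment file names are built. A quick illustration — the file names here are examples, not part of this commit:

import org.apache.lucene.index.IndexFileNames;

public class GetExtensionDemo {
  public static void main(String[] args) {
    System.out.println(IndexFileNames.getExtension("_0.fdt"));     // prints "fdt" (stored fields data)
    System.out.println(IndexFileNames.getExtension("_0.fdx"));     // prints "fdx" (stored fields index)
    System.out.println(IndexFileNames.getExtension("segments_2")); // prints "null" (no '.')
  }
}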

View File

@@ -28,16 +28,15 @@ import org.apache.lucene.index.RandomCodec;
  * Basic tests of PerFieldPostingsFormat
  */
 public class TestPerFieldPostingsFormat extends BasePostingsFormatTestCase {
-  private Codec codec;
-
-  @Override
-  public void setUp() throws Exception {
-    super.setUp();
-    codec = new RandomCodec(new Random(random().nextLong()), Collections.<String>emptySet());
-  }
 
   @Override
   protected Codec getCodec() {
-    return codec;
+    return new RandomCodec(new Random(random().nextLong()), Collections.<String>emptySet());
+  }
+
+  @Override
+  public void testMergeStability() throws Exception {
+    assumeTrue("The MockRandom PF randomizes content on the fly, so we can't check it", false);
   }
 }

View File

@@ -28,4 +28,10 @@ public class TestPostingsFormat extends BasePostingsFormatTestCase {
   protected Codec getCodec() {
     return Codec.getDefault();
   }
+
+  @Override
+  public void testMergeStability() throws Exception {
+    assumeTrue("The MockRandom PF randomizes content on the fly, so we can't check it", false);
+  }
 }

View File

@@ -30,4 +30,10 @@ public class TestTermVectorsFormat extends BaseTermVectorsFormatTestCase {
   protected Codec getCodec() {
     return Codec.getDefault();
   }
+
+  @Override
+  public void testMergeStability() throws Exception {
+    assumeTrue("The MockRandom PF randomizes content on the fly, so we can't check it", false);
+  }
 }

View File

@@ -17,13 +17,15 @@ package org.apache.lucene.index;
  * limitations under the License.
  */
 
+import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
+
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
-import java.util.Map.Entry;
 import java.util.Map;
+import java.util.Map.Entry;
 import java.util.Set;
 import java.util.TreeSet;
 import java.util.concurrent.CountDownLatch;
@@ -55,11 +57,8 @@ import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefHash;
-import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.TestUtil;
 
-import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
-
 /**
  * Abstract class to do basic tests for a docvalues format.
  * NOTE: This test focuses on the docvalues impl, nothing else.
@@ -68,23 +67,21 @@ import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
  * test passes, then all Lucene/Solr tests should also pass.  Ie,
  * if there is some bug in a given DocValuesFormat that this
  * test fails to catch then this test needs to be improved! */
-public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase {
-
-  /** Returns the codec to run tests against */
-  protected abstract Codec getCodec();
-
-  private Codec savedCodec;
-
-  public void setUp() throws Exception {
-    super.setUp();
-    // set the default codec, so adding test cases to this isn't fragile
-    savedCodec = Codec.getDefault();
-    Codec.setDefault(getCodec());
-  }
-
-  public void tearDown() throws Exception {
-    Codec.setDefault(savedCodec); // restore
-    super.tearDown();
+public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTestCase {
+
+  @Override
+  protected void addRandomFields(Document doc) {
+    if (usually()) {
+      doc.add(new NumericDocValuesField("ndv", random().nextInt(1 << 12)));
+      doc.add(new BinaryDocValuesField("bdv", new BytesRef(TestUtil.randomSimpleString(random()))));
+      doc.add(new SortedDocValuesField("sdv", new BytesRef(TestUtil.randomSimpleString(random(), 2))));
+    }
+    if (defaultCodecSupportsSortedSet()) {
+      final int numValues = random().nextInt(5);
+      for (int i = 0; i < numValues; ++i) {
+        doc.add(new SortedSetDocValuesField("ssdv", new BytesRef(TestUtil.randomSimpleString(random(), 2))));
+      }
+    }
   }
 
   public void testOneNumber() throws IOException {

View File

@@ -0,0 +1,115 @@
+package org.apache.lucene.index;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * Common tests to all index formats.
+ */
+abstract class BaseIndexFileFormatTestCase extends LuceneTestCase {
+
+  /** Returns the codec to run tests against */
+  protected abstract Codec getCodec();
+
+  private Codec savedCodec;
+
+  public void setUp() throws Exception {
+    super.setUp();
+    // set the default codec, so adding test cases to this isn't fragile
+    savedCodec = Codec.getDefault();
+    Codec.setDefault(getCodec());
+  }
+
+  public void tearDown() throws Exception {
+    Codec.setDefault(savedCodec); // restore
+    super.tearDown();
+  }
+
+  /** Add random fields to the provided document. */
+  protected abstract void addRandomFields(Document doc);
+
+  private Map<String, Long> bytesUsedByExtension(Directory d) throws IOException {
+    Map<String, Long> bytesUsedByExtension = new HashMap<>();
+    for (String file : d.listAll()) {
+      final String ext = IndexFileNames.getExtension(file);
+      final long previousLength = bytesUsedByExtension.containsKey(ext) ? bytesUsedByExtension.get(ext) : 0;
+      bytesUsedByExtension.put(ext, previousLength + d.fileLength(file));
+    }
+    bytesUsedByExtension.keySet().removeAll(excludedExtensionsFromByteCounts());
+    return bytesUsedByExtension;
+  }
+
+  /**
+   * Return the list of extensions that should be excluded from byte counts when
+   * comparing indices that store the same content.
+   */
+  protected Collection<String> excludedExtensionsFromByteCounts() {
+    // segment infos store various pieces of information that don't solely depend
+    // on the content of the index in the diagnostics (such as a timestamp) so we
+    // exclude this file from the bytes counts
+    return Collections.singleton("si");
+  }
+
+  /** The purpose of this test is to make sure that bulk merge doesn't accumulate useless data over runs. */
+  public void testMergeStability() throws Exception {
+    Directory dir = newDirectory();
+
+    // do not use newMergePolicy that might return a MockMergePolicy that ignores the no-CFS ratio
+    MergePolicy mp = newTieredMergePolicy();
+    mp.setNoCFSRatio(0);
+    IndexWriterConfig cfg = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())).setUseCompoundFile(false).setMergePolicy(mp);
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir, cfg);
+    final int numDocs = atLeast(500);
+    for (int i = 0; i < numDocs; ++i) {
+      Document d = new Document();
+      addRandomFields(d);
+      w.addDocument(d);
+    }
+    w.forceMerge(1);
+    w.commit();
+    w.close();
+
+    IndexReader reader = DirectoryReader.open(dir);
+
+    Directory dir2 = newDirectory();
+    mp = newTieredMergePolicy();
+    mp.setNoCFSRatio(0);
+    cfg = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())).setUseCompoundFile(false).setMergePolicy(mp);
+    w = new RandomIndexWriter(random(), dir2, cfg);
+    w.addIndexes(reader);
+    w.commit();
+    w.close();
+
+    assertEquals(bytesUsedByExtension(dir), bytesUsedByExtension(dir2));
+
+    reader.close();
+    dir.close();
+    dir2.close();
+  }
+
+}
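Subclasses plug into this new base class by providing the codec under test and the random fields to index; testMergeStability then indexes those fields, force-merges, rebuilds the index through addIndexes(reader), and asserts that the per-extension byte counts match. A hypothetical minimal subclass, assuming SimpleTextCodec as the codec under test (any codec would do):

package org.apache.lucene.index; // the base class is package-private

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.util.TestUtil;

public class TestSimpleTextMergeStability extends BaseIndexFileFormatTestCase {

  @Override
  protected Codec getCodec() {
    return new SimpleTextCodec(); // example: the text-based codec from lucene-codecs
  }

  @Override
  protected void addRandomFields(Document doc) {
    // a single stored field is enough to exercise testMergeStability
    doc.add(new StoredField("f", TestUtil.randomSimpleString(random(), 10)));
  }
}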

View File

@@ -44,6 +44,8 @@ import org.apache.lucene.codecs.lucene46.Lucene46Codec;
 import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.FieldInfo.DocValuesType;
 import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.store.Directory;
@@ -84,12 +86,7 @@ import org.junit.BeforeClass;
    they weren't indexed
 */
 
-public abstract class BasePostingsFormatTestCase extends LuceneTestCase {
-
-  /**
-   * Returns the Codec to run tests against
-   */
-  protected abstract Codec getCodec();
+public abstract class BasePostingsFormatTestCase extends BaseIndexFileFormatTestCase {
 
   private enum Option {
     // Sometimes use .advance():
@@ -1574,4 +1571,18 @@ public abstract class BasePostingsFormatTestCase extends LuceneTestCase {
     r.close();
     dir.close();
   }
+
+  @Override
+  protected void addRandomFields(Document doc) {
+    for (IndexOptions opts : IndexOptions.values()) {
+      FieldType ft = new FieldType();
+      ft.setIndexOptions(opts);
+      ft.setIndexed(true);
+      ft.freeze();
+      final int numFields = random().nextInt(5);
+      for (int j = 0; j < numFields; ++j) {
+        doc.add(new Field("f_" + opts, TestUtil.randomSimpleString(random(), 2), ft));
+      }
+    }
+  }
 }

View File

@@ -57,8 +57,6 @@ import org.apache.lucene.store.MMapDirectory;
 import org.apache.lucene.store.MockDirectoryWrapper;
 import org.apache.lucene.store.MockDirectoryWrapper.Throttling;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.TestUtil;
 import org.apache.lucene.util.TestUtil;
 
 import com.carrotsearch.randomizedtesting.generators.RandomInts;
@@ -70,28 +68,16 @@ import com.carrotsearch.randomizedtesting.generators.RandomPicks;
  * uses it and extend this class and override {@link #getCodec()}.
  * @lucene.experimental
  */
-public abstract class BaseStoredFieldsFormatTestCase extends LuceneTestCase {
-  private Codec savedCodec;
-
-  /**
-   * Returns the Codec to run tests against
-   */
-  protected abstract Codec getCodec();
+public abstract class BaseStoredFieldsFormatTestCase extends BaseIndexFileFormatTestCase {
 
   @Override
-  public void setUp() throws Exception {
-    super.setUp();
-    // set the default codec, so adding test cases to this isn't fragile
-    savedCodec = Codec.getDefault();
-    Codec.setDefault(getCodec());
+  protected void addRandomFields(Document d) {
+    final int numValues = random().nextInt(3);
+    for (int i = 0; i < numValues; ++i) {
+      d.add(new StoredField("f", TestUtil.randomSimpleString(random(), 100)));
+    }
   }
 
-  @Override
-  public void tearDown() throws Exception {
-    Codec.setDefault(savedCodec); // restore
-    super.tearDown();
-  }
-
   public void testRandomStoredFields() throws IOException {
     Directory dir = newDirectory();
     Random rand = random();
@@ -661,4 +647,5 @@ public abstract class BaseStoredFieldsFormatTestCase extends LuceneTestCase {
     iw.shutdown();
     dir.close();
   }
+
 }

View File

@@ -40,6 +40,7 @@ import org.apache.lucene.document.Field.Store;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.index.TermsEnum.SeekStatus;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.TermQuery;
@@ -47,7 +48,6 @@ import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.AttributeImpl;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;
-import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.TestUtil;
 
 import com.carrotsearch.randomizedtesting.generators.RandomPicks;
@@ -58,26 +58,7 @@ import com.carrotsearch.randomizedtesting.generators.RandomPicks;
  * uses it and extend this class and override {@link #getCodec()}.
  * @lucene.experimental
 */
-public abstract class BaseTermVectorsFormatTestCase extends LuceneTestCase {
-
-  private Codec savedCodec;
-
-  /**
-   * Returns the Codec to run tests against
-   */
-  protected abstract Codec getCodec();
-
-  public void setUp() throws Exception {
-    super.setUp();
-    // set the default codec, so adding test cases to this isn't fragile
-    savedCodec = Codec.getDefault();
-    Codec.setDefault(getCodec());
-  }
-
-  public void tearDown() throws Exception {
-    Codec.setDefault(savedCodec); // restore
-    super.tearDown();
-  }
+public abstract class BaseTermVectorsFormatTestCase extends BaseIndexFileFormatTestCase {
 
   /**
    * A combination of term vectors options.
@@ -126,6 +107,17 @@ public abstract class BaseTermVectorsFormatTestCase extends LuceneTestCase {
     return payload;
   }
 
+  @Override
+  protected void addRandomFields(Document doc) {
+    for (Options opts : validOptions()) {
+      FieldType ft = fieldType(opts);
+      final int numFields = random().nextInt(5);
+      for (int j = 0; j < numFields; ++j) {
+        doc.add(new Field("f_" + opts, TestUtil.randomSimpleString(random(), 2), ft));
+      }
+    }
+  }
+
   // custom impl to test cases that are forbidden by the default OffsetAttribute impl
   private static class PermissiveOffsetAttributeImpl extends AttributeImpl implements OffsetAttribute {