LUCENE-5578: Prevent stored fields from accumulating stale data on merge.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1585900 13f79535-47bb-0310-9956-ffa450edef68
Adrien Grand 2014-04-09 07:44:47 +00:00
parent 5f11ef3fb0
commit 47eb98d13f
14 changed files with 218 additions and 84 deletions
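Why this was needed (summary of the diffs below): CompressingStoredFieldsReader.copyCompressedData used the physical length of the .fdt file as the end of the last chunk. Since the file now ends with a codec footer, every bulk merge copied the previous footer bytes along with the chunk data and then wrote a fresh footer behind them, so stored fields grew a little on every merge even when the content was unchanged. The fix records the exact end of the chunk data (maxPointer) in the fields index and bounds the copy with it. Below is a minimal, self-contained sketch of the failure mode — not Lucene code; all names are illustrative:

public class StaleTailDemo {

  // Stands in for the codec footer that CodecUtil.writeFooter appends.
  static final byte[] FOOTER = {42, 42, 42, 42};

  // Bulk "merge": copy what the reader believes is the data region of the
  // input file, then append a fresh footer, like the stored fields writer.
  static byte[] merge(byte[] file, int endOfData) {
    byte[] out = new byte[endOfData + FOOTER.length];
    System.arraycopy(file, 0, out, 0, endOfData);
    System.arraycopy(FOOTER, 0, out, endOfData, FOOTER.length);
    return out;
  }

  public static void main(String[] args) {
    byte[] data = {1, 2, 3};
    byte[] file = merge(data, data.length); // initial segment: 3 data bytes + footer

    // Buggy bound: "end of data == file length". Each merge treats the old
    // footer as chunk data, so the file keeps growing for the same content.
    byte[] buggy = file;
    for (int i = 0; i < 3; i++) buggy = merge(buggy, buggy.length);
    System.out.println("file-length bound after 3 merges: " + buggy.length); // 19

    // Fixed bound: remember where the data really ends (the maxPointer idea)
    // and copy exactly that much, regardless of trailing footer bytes.
    byte[] fixed = file;
    for (int i = 0; i < 3; i++) fixed = merge(fixed, data.length);
    System.out.println("maxPointer bound after 3 merges:  " + fixed.length); // 7
  }
}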

View File

@@ -192,7 +192,7 @@ public final class CompressingStoredFieldsIndexWriter implements Closeable {
     maxStartPointer = startPointer;
   }
 
-  void finish(int numDocs) throws IOException {
+  void finish(int numDocs, long maxPointer) throws IOException {
     if (numDocs != totalDocs) {
       throw new IllegalStateException("Expected " + numDocs + " docs, but got " + totalDocs);
     }
@@ -200,6 +200,7 @@ public final class CompressingStoredFieldsIndexWriter implements Closeable {
       writeBlock();
     }
     fieldsIndexOut.writeVInt(0); // end marker
+    fieldsIndexOut.writeVLong(maxPointer);
     CodecUtil.writeFooter(fieldsIndexOut);
   }
 
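With this change, the tail of the stored fields index file is laid out as follows (a sketch inferred from the writes above, not an authoritative format spec):

  ... index blocks ...
  VInt(0)            end marker (unchanged)
  VLong(maxPointer)  new: file pointer where the chunk data in the fields data file ends
  codec footer       written by CodecUtil.writeFooter

The same VLong is what CompressingStoredFieldsReader reads back, and what CompressingTermVectorsReader skips over, in the diffs below.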

View File

@@ -84,6 +84,7 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
   private final int version;
   private final FieldInfos fieldInfos;
   private final CompressingStoredFieldsIndexReader indexReader;
+  private final long maxPointer;
   private final IndexInput fieldsStream;
   private final int chunkSize;
   private final int packedIntsVersion;
@@ -99,6 +100,7 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
     this.fieldInfos = reader.fieldInfos;
     this.fieldsStream = reader.fieldsStream.clone();
     this.indexReader = reader.indexReader.clone();
+    this.maxPointer = reader.maxPointer;
     this.chunkSize = reader.chunkSize;
     this.packedIntsVersion = reader.packedIntsVersion;
     this.compressionMode = reader.compressionMode;
@@ -118,24 +120,27 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
     numDocs = si.getDocCount();
     ChecksumIndexInput indexStream = null;
     try {
-      // Load the index into memory
       final String indexStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_INDEX_EXTENSION);
+      final String fieldsStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_EXTENSION);
+
+      // Load the index into memory
       indexStream = d.openChecksumInput(indexStreamFN, context);
       final String codecNameIdx = formatName + CODEC_SFX_IDX;
       version = CodecUtil.checkHeader(indexStream, codecNameIdx, VERSION_START, VERSION_CURRENT);
       assert CodecUtil.headerLength(codecNameIdx) == indexStream.getFilePointer();
       indexReader = new CompressingStoredFieldsIndexReader(indexStream, si);
 
       if (version >= VERSION_CHECKSUM) {
+        maxPointer = indexStream.readVLong();
+        assert maxPointer + CodecUtil.footerLength() == d.fileLength(fieldsStreamFN);
         CodecUtil.checkFooter(indexStream);
       } else {
+        maxPointer = d.fileLength(fieldsStreamFN);
         CodecUtil.checkEOF(indexStream);
       }
       indexStream.close();
       indexStream = null;
 
       // Open the data file and read metadata
-      final String fieldsStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_EXTENSION);
       fieldsStream = d.openInput(fieldsStreamFN, context);
       final String codecNameDat = formatName + CODEC_SFX_DAT;
       final int fieldsVersion = CodecUtil.checkHeader(fieldsStream, codecNameDat, VERSION_START, VERSION_CURRENT);
@@ -502,8 +507,9 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
    * Copy compressed data.
    */
   void copyCompressedData(DataOutput out) throws IOException {
+    assert getVersion() == VERSION_CURRENT;
     final long chunkEnd = docBase + chunkDocs == numDocs
-        ? fieldsStream.length()
+        ? maxPointer
         : indexReader.getStartPointer(docBase + chunkDocs);
     out.copyBytes(fieldsStream, chunkEnd - fieldsStream.getFilePointer());
   }

View File

@@ -316,7 +316,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
     if (docBase != numDocs) {
       throw new RuntimeException("Wrote " + docBase + " docs, finish called with numDocs=" + numDocs);
     }
-    indexWriter.finish(numDocs);
+    indexWriter.finish(numDocs, fieldsStream.getFilePointer());
     CodecUtil.writeFooter(fieldsStream);
     assert bufferedDocs.length == 0;
   }

View File

@@ -114,6 +114,7 @@ public final class CompressingTermVectorsReader extends TermVectorsReader implem
       indexReader = new CompressingStoredFieldsIndexReader(indexStream, si);
       if (version >= VERSION_CHECKSUM) {
+        indexStream.readVLong(); // the end of the data file
         CodecUtil.checkFooter(indexStream);
       } else {
         CodecUtil.checkEOF(indexStream);
       }

View File

@@ -661,7 +661,7 @@ public final class CompressingTermVectorsWriter extends TermVectorsWriter {
     if (numDocs != this.numDocs) {
       throw new RuntimeException("Wrote " + this.numDocs + " docs, finish called with numDocs=" + numDocs);
     }
-    indexWriter.finish(numDocs);
+    indexWriter.finish(numDocs, vectorsStream.getFilePointer());
     CodecUtil.writeFooter(vectorsStream);
   }

View File

@@ -199,6 +199,19 @@ public final class IndexFileNames {
     return filename;
   }
 
+  /**
+   * Return the extension (anything after the first '.'),
+   * or null if there is no '.' in the file name.
+   */
+  public static String getExtension(String filename) {
+    final int idx = filename.indexOf('.');
+    if (idx == -1) {
+      return null;
+    } else {
+      return filename.substring(idx + 1, filename.length());
+    }
+  }
+
   /**
    * All files created by codecs much match this pattern (checked in
    * SegmentInfo).
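The helper takes everything after the first '.', not the last, which matches how Lucene segment file names are built. A quick illustration — the file names here are examples, not part of this commit:

import org.apache.lucene.index.IndexFileNames;

public class GetExtensionDemo {
  public static void main(String[] args) {
    System.out.println(IndexFileNames.getExtension("_0.fdt"));     // prints "fdt" (stored fields data)
    System.out.println(IndexFileNames.getExtension("_0.fdx"));     // prints "fdx" (stored fields index)
    System.out.println(IndexFileNames.getExtension("segments_2")); // prints "null" (no '.')
  }
}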

View File

@@ -28,16 +28,15 @@ import org.apache.lucene.index.RandomCodec;
  * Basic tests of PerFieldPostingsFormat
  */
 public class TestPerFieldPostingsFormat extends BasePostingsFormatTestCase {
-  private Codec codec;
-
-  @Override
-  public void setUp() throws Exception {
-    super.setUp();
-    codec = new RandomCodec(new Random(random().nextLong()), Collections.<String>emptySet());
-  }
 
   @Override
   protected Codec getCodec() {
-    return codec;
+    return new RandomCodec(new Random(random().nextLong()), Collections.<String>emptySet());
+  }
+
+  @Override
+  public void testMergeStability() throws Exception {
+    assumeTrue("The MockRandom PF randomizes content on the fly, so we can't check it", false);
   }
 }

View File

@@ -28,4 +28,10 @@ public class TestPostingsFormat extends BasePostingsFormatTestCase {
   protected Codec getCodec() {
     return Codec.getDefault();
   }
+
+  @Override
+  public void testMergeStability() throws Exception {
+    assumeTrue("The MockRandom PF randomizes content on the fly, so we can't check it", false);
+  }
 }

View File

@@ -30,4 +30,10 @@ public class TestTermVectorsFormat extends BaseTermVectorsFormatTestCase {
   protected Codec getCodec() {
     return Codec.getDefault();
   }
+
+  @Override
+  public void testMergeStability() throws Exception {
+    assumeTrue("The MockRandom PF randomizes content on the fly, so we can't check it", false);
+  }
 }

View File

@@ -17,13 +17,15 @@ package org.apache.lucene.index;
  * limitations under the License.
  */
 
+import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
+
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
-import java.util.Map.Entry;
 import java.util.Map;
+import java.util.Map.Entry;
 import java.util.Set;
 import java.util.TreeSet;
 import java.util.concurrent.CountDownLatch;
@@ -55,11 +57,8 @@ import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefHash;
-import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.TestUtil;
 
-import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
-
 /**
  * Abstract class to do basic tests for a docvalues format.
  * NOTE: This test focuses on the docvalues impl, nothing else.
@@ -68,23 +67,21 @@ import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
  * test passes, then all Lucene/Solr tests should also pass.  Ie,
  * if there is some bug in a given DocValuesFormat that this
  * test fails to catch then this test needs to be improved! */
-public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase {
-
-  /** Returns the codec to run tests against */
-  protected abstract Codec getCodec();
-
-  private Codec savedCodec;
-
-  public void setUp() throws Exception {
-    super.setUp();
-    // set the default codec, so adding test cases to this isn't fragile
-    savedCodec = Codec.getDefault();
-    Codec.setDefault(getCodec());
-  }
-
-  public void tearDown() throws Exception {
-    Codec.setDefault(savedCodec); // restore
-    super.tearDown();
+public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTestCase {
+
+  @Override
+  protected void addRandomFields(Document doc) {
+    if (usually()) {
+      doc.add(new NumericDocValuesField("ndv", random().nextInt(1 << 12)));
+      doc.add(new BinaryDocValuesField("bdv", new BytesRef(TestUtil.randomSimpleString(random()))));
+      doc.add(new SortedDocValuesField("sdv", new BytesRef(TestUtil.randomSimpleString(random(), 2))));
+    }
+    if (defaultCodecSupportsSortedSet()) {
+      final int numValues = random().nextInt(5);
+      for (int i = 0; i < numValues; ++i) {
+        doc.add(new SortedSetDocValuesField("ssdv", new BytesRef(TestUtil.randomSimpleString(random(), 2))));
+      }
+    }
   }
 
   public void testOneNumber() throws IOException {

View File

@@ -0,0 +1,115 @@
+package org.apache.lucene.index;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * Common tests to all index formats.
+ */
+abstract class BaseIndexFileFormatTestCase extends LuceneTestCase {
+
+  /** Returns the codec to run tests against */
+  protected abstract Codec getCodec();
+
+  private Codec savedCodec;
+
+  public void setUp() throws Exception {
+    super.setUp();
+    // set the default codec, so adding test cases to this isn't fragile
+    savedCodec = Codec.getDefault();
+    Codec.setDefault(getCodec());
+  }
+
+  public void tearDown() throws Exception {
+    Codec.setDefault(savedCodec); // restore
+    super.tearDown();
+  }
+
+  /** Add random fields to the provided document. */
+  protected abstract void addRandomFields(Document doc);
+
+  private Map<String, Long> bytesUsedByExtension(Directory d) throws IOException {
+    Map<String, Long> bytesUsedByExtension = new HashMap<>();
+    for (String file : d.listAll()) {
+      final String ext = IndexFileNames.getExtension(file);
+      final long previousLength = bytesUsedByExtension.containsKey(ext) ? bytesUsedByExtension.get(ext) : 0;
+      bytesUsedByExtension.put(ext, previousLength + d.fileLength(file));
+    }
+    bytesUsedByExtension.keySet().removeAll(excludedExtensionsFromByteCounts());
+    return bytesUsedByExtension;
+  }
+
+  /**
+   * Return the list of extensions that should be excluded from byte counts when
+   * comparing indices that store the same content.
+   */
+  protected Collection<String> excludedExtensionsFromByteCounts() {
+    // segment infos store various pieces of information that don't solely depend
+    // on the content of the index in the diagnostics (such as a timestamp) so we
+    // exclude this file from the bytes counts
+    return Collections.singleton("si");
+  }
+
+  /** The purpose of this test is to make sure that bulk merge doesn't accumulate useless data over runs. */
+  public void testMergeStability() throws Exception {
+    Directory dir = newDirectory();
+
+    // do not use newMergePolicy that might return a MockMergePolicy that ignores the no-CFS ratio
+    MergePolicy mp = newTieredMergePolicy();
+    mp.setNoCFSRatio(0);
+    IndexWriterConfig cfg = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())).setUseCompoundFile(false).setMergePolicy(mp);
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir, cfg);
+    final int numDocs = atLeast(500);
+    for (int i = 0; i < numDocs; ++i) {
+      Document d = new Document();
+      addRandomFields(d);
+      w.addDocument(d);
+    }
+    w.forceMerge(1);
+    w.commit();
+    w.close();
+
+    IndexReader reader = DirectoryReader.open(dir);
+
+    Directory dir2 = newDirectory();
+    mp = newTieredMergePolicy();
+    mp.setNoCFSRatio(0);
+    cfg = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())).setUseCompoundFile(false).setMergePolicy(mp);
+    w = new RandomIndexWriter(random(), dir2, cfg);
+    w.addIndexes(reader);
+    w.commit();
+    w.close();
+
+    assertEquals(bytesUsedByExtension(dir), bytesUsedByExtension(dir2));
+
+    reader.close();
+    dir.close();
+    dir2.close();
+  }
+
+}
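Subclasses plug into this new base class by providing the codec under test and the random fields to index; testMergeStability then indexes those fields, force-merges, rebuilds the index through addIndexes(reader), and asserts that the per-extension byte counts match. A hypothetical minimal subclass, assuming SimpleTextCodec as the codec under test (any codec would do):

package org.apache.lucene.index; // the base class is package-private

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.util.TestUtil;

public class TestSimpleTextMergeStability extends BaseIndexFileFormatTestCase {

  @Override
  protected Codec getCodec() {
    return new SimpleTextCodec(); // example: the text-based codec from lucene-codecs
  }

  @Override
  protected void addRandomFields(Document doc) {
    // a single stored field is enough to exercise testMergeStability
    doc.add(new StoredField("f", TestUtil.randomSimpleString(random(), 10)));
  }
}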

View File

@@ -44,6 +44,8 @@ import org.apache.lucene.codecs.lucene46.Lucene46Codec;
 import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.FieldInfo.DocValuesType;
 import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.store.Directory;
@@ -84,12 +86,7 @@ import org.junit.BeforeClass;
    they weren't indexed
 */
 
-public abstract class BasePostingsFormatTestCase extends LuceneTestCase {
-
-  /**
-   * Returns the Codec to run tests against
-   */
-  protected abstract Codec getCodec();
+public abstract class BasePostingsFormatTestCase extends BaseIndexFileFormatTestCase {
 
   private enum Option {
     // Sometimes use .advance():
@@ -1574,4 +1571,18 @@ public abstract class BasePostingsFormatTestCase extends LuceneTestCase {
     r.close();
     dir.close();
   }
+
+  @Override
+  protected void addRandomFields(Document doc) {
+    for (IndexOptions opts : IndexOptions.values()) {
+      FieldType ft = new FieldType();
+      ft.setIndexOptions(opts);
+      ft.setIndexed(true);
+      ft.freeze();
+      final int numFields = random().nextInt(5);
+      for (int j = 0; j < numFields; ++j) {
+        doc.add(new Field("f_" + opts, TestUtil.randomSimpleString(random(), 2), ft));
+      }
+    }
+  }
 }

View File

@@ -57,8 +57,6 @@ import org.apache.lucene.store.MMapDirectory;
 import org.apache.lucene.store.MockDirectoryWrapper;
 import org.apache.lucene.store.MockDirectoryWrapper.Throttling;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.TestUtil;
 import org.apache.lucene.util.TestUtil;
 
 import com.carrotsearch.randomizedtesting.generators.RandomInts;
@@ -70,28 +68,16 @@ import com.carrotsearch.randomizedtesting.generators.RandomPicks;
  * uses it and extend this class and override {@link #getCodec()}.
  * @lucene.experimental
  */
-public abstract class BaseStoredFieldsFormatTestCase extends LuceneTestCase {
-  private Codec savedCodec;
-
-  /**
-   * Returns the Codec to run tests against
-   */
-  protected abstract Codec getCodec();
+public abstract class BaseStoredFieldsFormatTestCase extends BaseIndexFileFormatTestCase {
 
   @Override
-  public void setUp() throws Exception {
-    super.setUp();
-    // set the default codec, so adding test cases to this isn't fragile
-    savedCodec = Codec.getDefault();
-    Codec.setDefault(getCodec());
+  protected void addRandomFields(Document d) {
+    final int numValues = random().nextInt(3);
+    for (int i = 0; i < numValues; ++i) {
+      d.add(new StoredField("f", TestUtil.randomSimpleString(random(), 100)));
+    }
   }
 
-  @Override
-  public void tearDown() throws Exception {
-    Codec.setDefault(savedCodec); // restore
-    super.tearDown();
-  }
-
   public void testRandomStoredFields() throws IOException {
     Directory dir = newDirectory();
     Random rand = random();
@@ -661,4 +647,5 @@ public abstract class BaseStoredFieldsFormatTestCase extends LuceneTestCase {
     iw.shutdown();
     dir.close();
   }
+
 }

View File

@@ -40,6 +40,7 @@ import org.apache.lucene.document.Field.Store;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.index.TermsEnum.SeekStatus;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.TermQuery;
@@ -47,7 +48,6 @@ import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.AttributeImpl;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;
-import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.TestUtil;
 
 import com.carrotsearch.randomizedtesting.generators.RandomPicks;
@@ -58,26 +58,7 @@ import com.carrotsearch.randomizedtesting.generators.RandomPicks;
  * uses it and extend this class and override {@link #getCodec()}.
  * @lucene.experimental
 */
-public abstract class BaseTermVectorsFormatTestCase extends LuceneTestCase {
-
-  private Codec savedCodec;
-
-  /**
-   * Returns the Codec to run tests against
-   */
-  protected abstract Codec getCodec();
-
-  public void setUp() throws Exception {
-    super.setUp();
-    // set the default codec, so adding test cases to this isn't fragile
-    savedCodec = Codec.getDefault();
-    Codec.setDefault(getCodec());
-  }
-
-  public void tearDown() throws Exception {
-    Codec.setDefault(savedCodec); // restore
-    super.tearDown();
-  }
+public abstract class BaseTermVectorsFormatTestCase extends BaseIndexFileFormatTestCase {
 
   /**
    * A combination of term vectors options.
@@ -126,6 +107,17 @@ public abstract class BaseTermVectorsFormatTestCase extends LuceneTestCase {
     return payload;
   }
 
+  @Override
+  protected void addRandomFields(Document doc) {
+    for (Options opts : validOptions()) {
+      FieldType ft = fieldType(opts);
+      final int numFields = random().nextInt(5);
+      for (int j = 0; j < numFields; ++j) {
+        doc.add(new Field("f_" + opts, TestUtil.randomSimpleString(random(), 2), ft));
+      }
+    }
+  }
+
   // custom impl to test cases that are forbidden by the default OffsetAttribute impl
   private static class PermissiveOffsetAttributeImpl extends AttributeImpl implements OffsetAttribute {