LUCENE-9662: CheckIndex should be concurrent - parallelizing index check across segments (#128)

This commit is contained in:
zacharymorn 2021-08-31 19:24:14 -07:00 committed by GitHub
parent 9c7f0d45ee
commit 424192e170
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 836 additions and 379 deletions

View File

@ -258,6 +258,9 @@ Improvements
* LUCENE-10019: Align file starts in CFS files to have proper alignment (8 bytes)
(Uwe Schindler)
* LUCENE-9662: Make CheckIndex concurrent by parallelizing index check across segments.
(Zach Chen, Mike McCandless, Dawid Weiss, Robert Muir)
Bug fixes
* LUCENE-9686: Fix read past EOF handling in DirectIODirectory. (Zach Chen,

View File

@ -70,7 +70,7 @@ public class TestManyPointsInOldIndex extends LuceneTestCase {
dir.setCheckIndexOnClose(false);
// ... because we check ourselves here:
TestUtil.checkIndex(dir, false, true, null);
TestUtil.checkIndex(dir, false, true, true, null);
dir.close();
}
}

File diff suppressed because it is too large Load Diff

View File

@ -121,7 +121,7 @@ public class TestAllFilesDetectTruncation extends LuceneTestCase {
// CheckIndex should also fail:
expectThrowsAnyOf(
Arrays.asList(CorruptIndexException.class, EOFException.class),
() -> TestUtil.checkIndex(dirCopy, true, true, null));
() -> TestUtil.checkIndex(dirCopy, true, true, true, null));
}
}
}

View File

@ -16,8 +16,19 @@
*/
package org.apache.lucene.index;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.document.*;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.VectorUtil;
import org.junit.Test;
public class TestCheckIndex extends BaseTestCheckIndex {
@ -54,4 +65,142 @@ public class TestCheckIndex extends BaseTestCheckIndex {
// Delegates to the testObtainsLock overload that takes an argument, passing `directory`.
// NOTE(review): neither that overload nor `directory` is declared in this view; presumably both
// come from BaseTestCheckIndex or the part of this class above the hunk — confirm there.
public void testObtainsLock() throws IOException {
testObtainsLock(directory);
}
/**
 * Indexes a small random number of documents that each carry a stored field, two KNN vector
 * fields, a numeric doc value, a binary point and a term-vector field, plus one extra
 * soft-delete tombstone document, then runs CheckIndex over the result and verifies the
 * per-category status counters, the absence of errors, and the presence of each
 * {@code "test: ..."} marker in the captured CheckIndex output.
 */
@Test
public void testCheckIndexAllValid() throws Exception {
  try (Directory dir = newDirectory()) {
    final int liveDocCount = 1 + random().nextInt(10);

    IndexWriterConfig config = newIndexWriterConfig();
    config.setIndexSort(new Sort(new SortField("sort_field", SortField.Type.INT, true)));
    config.setSoftDeletesField("soft_delete");

    try (IndexWriter writer = new IndexWriter(dir, config)) {
      for (int docIndex = 0; docIndex < liveDocCount; docIndex++) {
        Document doc = new Document();

        // stored fields
        String randomId = Integer.toString(random().nextInt());
        doc.add(new StringField("id", randomId, Field.Store.YES));
        doc.add(new StoredField("field", "value" + TestUtil.randomSimpleString(random())));

        // KNN vectors
        doc.add(new KnnVectorField("v1", randomVector(3)));
        doc.add(new KnnVectorField("v2", randomVector(3)));

        // numeric doc value
        doc.add(new NumericDocValuesField("dv", random().nextLong()));

        // point value
        byte[] encodedPoint = new byte[4];
        NumericUtils.intToSortableBytes(random().nextInt(), encodedPoint, 0);
        doc.add(new BinaryPoint("point", encodedPoint));

        // term vector with positions and payloads, fed from two canned tokens
        Token firstToken = new Token("bar", 0, 3);
        firstToken.setPayload(new BytesRef("pay1"));
        Token secondToken = new Token("bar", 4, 8);
        secondToken.setPayload(new BytesRef("pay2"));

        FieldType termVectorType = new FieldType(TextField.TYPE_NOT_STORED);
        termVectorType.setStoreTermVectors(true);
        termVectorType.setStoreTermVectorPositions(true);
        termVectorType.setStoreTermVectorPayloads(true);
        doc.add(
            new Field(
                "termvector", new CannedTokenStream(firstToken, secondToken), termVectorType));

        writer.addDocument(doc);
      }

      // one more document that only carries the soft-delete marker
      Document tombstone = new Document();
      tombstone.add(new NumericDocValuesField("soft_delete", 1));
      writer.softUpdateDocument(
          new Term("id", "1"), tombstone, new NumericDocValuesField("soft_delete", 1));
    }

    ByteArrayOutputStream output = new ByteArrayOutputStream();
    CheckIndex.Status status = TestUtil.checkIndex(dir, false, true, true, output);
    // Nothing writes to the stream after checkIndex returns, so decode it a single time.
    final String checkLog = output.toString(IOUtils.UTF_8);

    assertEquals(1, status.segmentInfos.size());
    CheckIndex.Status.SegmentInfoStatus segment = status.segmentInfos.get(0);

    // live docs
    assertEquals(0, segment.liveDocStatus.numDeleted);
    assertTrue(checkLog.contains("test: check live docs"));
    assertNull(segment.liveDocStatus.error);

    // field infos
    assertEquals(8, segment.fieldInfoStatus.totFields);
    assertTrue(checkLog.contains("test: field infos"));
    assertNull(segment.fieldInfoStatus.error);

    // field norms (from the term vector field)
    assertEquals(1, segment.fieldNormStatus.totFields);
    assertTrue(checkLog.contains("test: field norms"));
    assertNull(segment.fieldNormStatus.error);

    // term index
    assertTrue(segment.termIndexStatus.termCount > 0);
    assertTrue(segment.termIndexStatus.totFreq > 0);
    assertTrue(segment.termIndexStatus.totPos > 0);
    assertTrue(checkLog.contains("test: terms, freq, prox"));
    assertNull(segment.termIndexStatus.error);

    // stored fields; the tombstone document adds one to the document count
    assertEquals(liveDocCount + 1, segment.storedFieldStatus.docCount);
    assertEquals(2 * liveDocCount, segment.storedFieldStatus.totFields);
    assertTrue(checkLog.contains("test: stored fields"));
    assertNull(segment.storedFieldStatus.error);

    // term vectors
    assertEquals(liveDocCount, segment.termVectorStatus.docCount);
    assertEquals(liveDocCount, segment.termVectorStatus.totVectors);
    assertTrue(checkLog.contains("test: term vectors"));
    assertNull(segment.termVectorStatus.error);

    // doc values
    assertEquals(2, segment.docValuesStatus.totalNumericFields);
    assertTrue(checkLog.contains("test: docvalues"));
    assertNull(segment.docValuesStatus.error);

    // point values
    assertEquals(1, segment.pointsStatus.totalValueFields);
    assertEquals(liveDocCount, segment.pointsStatus.totalValuePoints);
    assertTrue(checkLog.contains("test: points"));
    assertNull(segment.pointsStatus.error);

    // vectors
    assertEquals(2 * liveDocCount, segment.vectorValuesStatus.totalVectorValues);
    assertEquals(2, segment.vectorValuesStatus.totalKnnVectorFields);
    assertTrue(checkLog.contains("test: vectors"));
    assertNull(segment.vectorValuesStatus.error);

    // index sort
    assertTrue(checkLog.contains("test: index sort"));
    assertNull(segment.indexSortStatus.error);

    // soft deletes
    assertTrue(checkLog.contains("test: check soft deletes"));
    assertNull(segment.softDeletesStatus.error);
  }
}
/**
 * Passing {@code -threadCount 0} to {@code CheckIndex.parseOptions} must be rejected with an
 * {@link IllegalArgumentException}.
 */
public void testInvalidThreadCountArgument() {
  expectThrows(
      IllegalArgumentException.class,
      () -> CheckIndex.parseOptions(new String[] {"-threadCount", "0"}));
}
/**
 * Fills a new {@code dim}-element array with values drawn from {@code random().nextFloat()},
 * hands it to {@code VectorUtil.l2normalize}, and returns that same array.
 *
 * @param dim number of elements in the vector
 * @return the array that was filled and passed to {@code VectorUtil.l2normalize}
 */
private float[] randomVector(int dim) {
  final float[] vector = new float[dim];
  int slot = 0;
  while (slot < vector.length) {
    vector[slot++] = random().nextFloat();
  }
  VectorUtil.l2normalize(vector);
  return vector;
}
}

View File

@ -726,7 +726,7 @@ public class TestPointValues extends LuceneTestCase {
w.close();
ByteArrayOutputStream output = new ByteArrayOutputStream();
CheckIndex.Status status = TestUtil.checkIndex(dir, false, true, output);
CheckIndex.Status status = TestUtil.checkIndex(dir, false, true, true, output);
assertEquals(1, status.segmentInfos.size());
CheckIndex.Status.SegmentInfoStatus segStatus = status.segmentInfos.get(0);
// total 3 point values were indexed:

View File

@ -116,8 +116,11 @@ public class TestSwappedIndexFiles extends LuceneTestCase {
// CheckIndex should also fail:
expectThrowsAnyOf(
Arrays.asList(
CorruptIndexException.class, EOFException.class, IndexFormatTooOldException.class),
() -> TestUtil.checkIndex(dirCopy, true, true, null));
CorruptIndexException.class,
EOFException.class,
IndexFormatTooOldException.class,
CheckIndex.CheckIndexException.class),
() -> TestUtil.checkIndex(dirCopy, true, true, true, null));
}
}
}

View File

@ -926,7 +926,7 @@ public abstract class BaseKnnVectorsFormatTestCase extends BaseIndexFileFormatTe
}
ByteArrayOutputStream output = new ByteArrayOutputStream();
CheckIndex.Status status = TestUtil.checkIndex(dir, false, true, output);
CheckIndex.Status status = TestUtil.checkIndex(dir, false, true, true, output);
assertEquals(1, status.segmentInfos.size());
CheckIndex.Status.SegmentInfoStatus segStatus = status.segmentInfos.get(0);
// total 3 vector values were indexed:

View File

@ -895,7 +895,11 @@ public class MockDirectoryWrapper extends BaseDirectoryWrapper {
System.out.println("\nNOTE: MockDirectoryWrapper: now run CheckIndex");
}
TestUtil.checkIndex(this, getCrossCheckTermVectorsOnClose(), true, null);
// Methods in MockDirectoryWrapper hold the lock on this, which would cause a deadlock if
// TestUtil#checkIndex checked segments concurrently on another thread that calls back
// into synchronized methods such as MockDirectoryWrapper#fileLength.
// Hence we pass concurrent = false here to turn off concurrent checks.
TestUtil.checkIndex(this, getCrossCheckTermVectorsOnClose(), true, false, null);
}
// TODO: factor this out / share w/ TestIW.assertNoUnreferencedFiles

View File

@ -16,6 +16,7 @@
*/
package org.apache.lucene.util;
import com.carrotsearch.randomizedtesting.RandomizedTest;
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import java.io.BufferedInputStream;
@ -303,7 +304,7 @@ public final class TestUtil {
public static CheckIndex.Status checkIndex(Directory dir, boolean doSlowChecks)
throws IOException {
return checkIndex(dir, doSlowChecks, false, null);
return checkIndex(dir, doSlowChecks, false, true, null);
}
/**
@ -311,7 +312,11 @@ public final class TestUtil {
* moving on to other fields/segments to look for any other corruption.
*/
public static CheckIndex.Status checkIndex(
Directory dir, boolean doSlowChecks, boolean failFast, ByteArrayOutputStream output)
Directory dir,
boolean doSlowChecks,
boolean failFast,
boolean concurrent,
ByteArrayOutputStream output)
throws IOException {
if (output == null) {
output = new ByteArrayOutputStream(1024);
@ -323,6 +328,11 @@ public final class TestUtil {
checker.setDoSlowChecks(doSlowChecks);
checker.setFailFast(failFast);
checker.setInfoStream(new PrintStream(output, false, IOUtils.UTF_8), false);
if (concurrent) {
checker.setThreadCount(RandomizedTest.randomIntBetween(2, 5));
} else {
checker.setThreadCount(1);
}
CheckIndex.Status indexStatus = checker.checkIndex(null);
if (indexStatus == null || indexStatus.clean == false) {