Enable Bulk-Merge if all source remains (#37269)

Today we still wrap recovery source readers on merge even if we
keep all documents recovery source. This basically disables bulk
merging for stored fields. This change skips wrapping if all docs
sources are kept anyway.
This commit is contained in:
Simon Willnauer 2019-01-09 23:46:31 +01:00 committed by GitHub
parent 95479f1766
commit 234059d2c0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 44 additions and 8 deletions

View File

@ -33,11 +33,8 @@ import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConjunctionDISI;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.DocValuesFieldExistsQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
@ -68,15 +65,18 @@ final class RecoverySourcePruneMergePolicy extends OneMergeWrappingMergePolicy {
if (recoverySource == null || recoverySource.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
return reader; // early terminate - nothing to do here since non of the docs has a recovery source anymore.
}
BooleanQuery.Builder builder = new BooleanQuery.Builder();
builder.add(new DocValuesFieldExistsQuery(recoverySourceField), BooleanClause.Occur.FILTER);
builder.add(retainSourceQuerySupplier.get(), BooleanClause.Occur.FILTER);
IndexSearcher s = new IndexSearcher(reader);
s.setQueryCache(null);
Weight weight = s.createWeight(s.rewrite(builder.build()), ScoreMode.COMPLETE_NO_SCORES, 1.0f);
Weight weight = s.createWeight(s.rewrite(retainSourceQuerySupplier.get()), ScoreMode.COMPLETE_NO_SCORES, 1.0f);
Scorer scorer = weight.scorer(reader.getContext());
if (scorer != null) {
return new SourcePruningFilterCodecReader(recoverySourceField, reader, BitSet.of(scorer.iterator(), reader.maxDoc()));
BitSet recoverySourceToKeep = BitSet.of(scorer.iterator(), reader.maxDoc());
// calculating the cardinality is significantly cheaper than skipping all bulk-merging we might do
// if retentions are high we keep most of it
if (recoverySourceToKeep.cardinality() == reader.maxDoc()) {
return reader; // keep all source
}
return new SourcePruningFilterCodecReader(recoverySourceField, reader, recoverySourceToKeep);
} else {
return new SourcePruningFilterCodecReader(recoverySourceField, reader, null);
}

View File

@ -37,6 +37,7 @@ import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.index.StandardDirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
@ -158,4 +159,39 @@ public class RecoverySourcePruneMergePolicyTests extends ESTestCase {
}
}
}
public void testPruneNone() throws IOException {
try (Directory dir = newDirectory()) {
IndexWriterConfig iwc = newIndexWriterConfig();
iwc.setMergePolicy(new RecoverySourcePruneMergePolicy("extra_source",
() -> new MatchAllDocsQuery(), iwc.getMergePolicy()));
try (IndexWriter writer = new IndexWriter(dir, iwc)) {
for (int i = 0; i < 20; i++) {
if (i > 0 && randomBoolean()) {
writer.flush();
}
Document doc = new Document();
doc.add(new StoredField("source", "hello world"));
doc.add(new StoredField("extra_source", "hello world"));
doc.add(new NumericDocValuesField("extra_source", 1));
writer.addDocument(doc);
}
writer.forceMerge(1);
writer.commit();
try (DirectoryReader reader = DirectoryReader.open(writer)) {
assertEquals(1, reader.leaves().size());
NumericDocValues extra_source = reader.leaves().get(0).reader().getNumericDocValues("extra_source");
assertNotNull(extra_source);
for (int i = 0; i < reader.maxDoc(); i++) {
Document document = reader.document(i);
Set<String> collect = document.getFields().stream().map(IndexableField::name).collect(Collectors.toSet());
assertTrue(collect.contains("source"));
assertTrue(collect.contains("extra_source"));
assertEquals(i, extra_source.nextDoc());
}
assertEquals(DocIdSetIterator.NO_MORE_DOCS, extra_source.nextDoc());
}
}
}
}
}