mirror of https://github.com/apache/lucene.git
Check `Terms#intersect` in CheckIndex. (#12925)
This commit adds coverage of `Terms#intersect` to `CheckIndex` and indexes `LineFileDocs` in `BasePostingsFormatTestCase` to get some coverage with real-world data. With this change, `TestLucene90PostingsFormat` now exhibits #12895.
This commit is contained in:
parent
05b14e23b1
commit
e0f4321b40
|
@ -83,6 +83,11 @@ import org.apache.lucene.util.NamedThreadFactory;
|
||||||
import org.apache.lucene.util.StringHelper;
|
import org.apache.lucene.util.StringHelper;
|
||||||
import org.apache.lucene.util.SuppressForbidden;
|
import org.apache.lucene.util.SuppressForbidden;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
import org.apache.lucene.util.automaton.Automata;
|
||||||
|
import org.apache.lucene.util.automaton.Automaton;
|
||||||
|
import org.apache.lucene.util.automaton.ByteRunAutomaton;
|
||||||
|
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||||
|
import org.apache.lucene.util.automaton.Operations;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Basic tool and API to check the health of an index and write a new segments file that removes
|
* Basic tool and API to check the health of an index and write a new segments file that removes
|
||||||
|
@ -2298,6 +2303,33 @@ public final class CheckIndex implements Closeable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Test Terms#intersect
|
||||||
|
TermsEnum allTerms = terms.iterator();
|
||||||
|
// An automaton that should match a good number of terms
|
||||||
|
Automaton a =
|
||||||
|
Operations.concatenate(
|
||||||
|
Arrays.asList(
|
||||||
|
Automata.makeAnyBinary(),
|
||||||
|
Automata.makeCharRange('a', 'e'),
|
||||||
|
Automata.makeAnyBinary()));
|
||||||
|
a = Operations.determinize(a, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
|
||||||
|
CompiledAutomaton ca = new CompiledAutomaton(a);
|
||||||
|
ByteRunAutomaton runAutomaton = new ByteRunAutomaton(a);
|
||||||
|
TermsEnum filteredTerms = terms.intersect(ca, null);
|
||||||
|
for (BytesRef term = allTerms.next(); term != null; term = allTerms.next()) {
|
||||||
|
if (runAutomaton.run(term.bytes, term.offset, term.length)) {
|
||||||
|
BytesRef filteredTerm = filteredTerms.next();
|
||||||
|
if (Objects.equals(term, filteredTerm) == false) {
|
||||||
|
throw new CheckIndexException(
|
||||||
|
"Expected next filtered term: " + term + ", but got " + filteredTerm);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
BytesRef filteredTerm = filteredTerms.next();
|
||||||
|
if (filteredTerm != null) {
|
||||||
|
throw new CheckIndexException("Expected exhausted TermsEnum, but got " + filteredTerm);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -38,6 +38,7 @@ import org.apache.lucene.codecs.NormsProducer;
|
||||||
import org.apache.lucene.codecs.PostingsFormat;
|
import org.apache.lucene.codecs.PostingsFormat;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.document.Field.Store;
|
||||||
import org.apache.lucene.document.FieldType;
|
import org.apache.lucene.document.FieldType;
|
||||||
import org.apache.lucene.document.StringField;
|
import org.apache.lucene.document.StringField;
|
||||||
import org.apache.lucene.document.TextField;
|
import org.apache.lucene.document.TextField;
|
||||||
|
@ -47,6 +48,7 @@ import org.apache.lucene.index.IndexOptions;
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.IndexWriter;
|
import org.apache.lucene.index.IndexWriter;
|
||||||
import org.apache.lucene.index.IndexWriterConfig;
|
import org.apache.lucene.index.IndexWriterConfig;
|
||||||
|
import org.apache.lucene.index.IndexableField;
|
||||||
import org.apache.lucene.index.LeafReader;
|
import org.apache.lucene.index.LeafReader;
|
||||||
import org.apache.lucene.index.MultiTerms;
|
import org.apache.lucene.index.MultiTerms;
|
||||||
import org.apache.lucene.index.PostingsEnum;
|
import org.apache.lucene.index.PostingsEnum;
|
||||||
|
@ -1610,4 +1612,29 @@ public abstract class BasePostingsFormatTestCase extends BaseIndexFileFormatTest
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Test realistic data, which is often better at uncovering real bugs. */
|
||||||
|
@Nightly // this test takes a few seconds
|
||||||
|
public void testLineFileDocs() throws IOException {
|
||||||
|
// Use a FS dir and a non-randomized IWC to not slow down indexing
|
||||||
|
try (Directory dir = newFSDirectory(createTempDir())) {
|
||||||
|
try (LineFileDocs docs = new LineFileDocs(random());
|
||||||
|
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig())) {
|
||||||
|
final int numDocs = atLeast(10_000);
|
||||||
|
for (int i = 0; i < numDocs; ++i) {
|
||||||
|
// Only keep the body field, and don't index term vectors on it, we only care about
|
||||||
|
// postings
|
||||||
|
Document doc = docs.nextDoc();
|
||||||
|
IndexableField body = doc.getField("body");
|
||||||
|
assertNotNull(body);
|
||||||
|
assertNotNull(body.stringValue());
|
||||||
|
assertNotEquals(IndexOptions.NONE, body.fieldType().indexOptions());
|
||||||
|
body = new TextField("body", body.stringValue(), Store.NO);
|
||||||
|
w.addDocument(Collections.singletonList(body));
|
||||||
|
}
|
||||||
|
w.forceMerge(1);
|
||||||
|
}
|
||||||
|
TestUtil.checkIndex(dir);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue