diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 12b757f151a..c14901b7c79 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -179,6 +179,8 @@ Bug Fixes
 
 * GITHUB#12202: Fix MultiFieldQueryParser to apply boosts to regexp, wildcard, prefix, range, fuzzy queries. (Jasir KT)
 
+* GITHUB#12178: Add explanations for TermAutomatonQuery (Marcus Eagan via Patrick Zhai, Mike McCandless, Robert Muir, Mikhail Khludnev)
+
 * GITHUB#12214: Fix ordered intervals query to avoid skipping some of the results over interleaved terms. (Hongyu Yan)
 
 * GITHUB#12212: Bug fix for a DrillSideways issue where matching hits could occasionally be missed. (Frederic Thevenet)
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/TermAutomatonQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/TermAutomatonQuery.java
index 0fbdaf4ac8a..7fae86711d8 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/TermAutomatonQuery.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/TermAutomatonQuery.java
@@ -442,8 +442,50 @@ public class TermAutomatonQuery extends Query implements Accountable {
 
     @Override
     public Explanation explain(LeafReaderContext context, int doc) throws IOException {
-      // TODO
-      return null;
+      Scorer scorer = scorer(context);
+      if (scorer == null) {
+        return Explanation.noMatch("No matching terms in the document");
+      }
+
+      int advancedDoc = scorer.iterator().advance(doc);
+      if (advancedDoc != doc) {
+        return Explanation.noMatch("No matching terms in the document");
+      }
+
+      // Cast once: the explain-only accessors are declared on TermAutomatonScorer.
+      TermAutomatonScorer termAutomatonScorer = (TermAutomatonScorer) scorer;
+      float score = scorer.score();
+      LeafSimScorer leafSimScorer = termAutomatonScorer.getLeafSimScorer();
+      EnumAndScorer[] originalSubsOnDoc = termAutomatonScorer.getOriginalSubsOnDoc();
+
+      List<Explanation> termExplanations = new ArrayList<>();
+      for (EnumAndScorer enumAndScorer : originalSubsOnDoc) {
+        if (enumAndScorer != null) {
+          PostingsEnum postingsEnum = enumAndScorer.posEnum;
+          if (postingsEnum.docID() == doc) {
+            int termFreq = postingsEnum.freq();
+            float termScore = leafSimScorer.score(doc, termFreq);
+            termExplanations.add(
+                Explanation.match(
+                    termFreq,
+                    "term frequency in the document",
+                    Explanation.match(
+                        termScore,
+                        "score for term: " + idToTerm.get(enumAndScorer.termID).utf8ToString())));
+          }
+        }
+      }
+
+      if (termExplanations.isEmpty()) {
+        return Explanation.noMatch("No matching terms in the document");
+      }
+
+      // NOTE(review): LeafSimScorer#explain interprets its Explanation argument as a
+      // frequency, but freqExplanation carries the document score. Confirm that the
+      // resulting explanation value agrees with scorer.score().
+      Explanation freqExplanation =
+          Explanation.match(score, "TermAutomatonQuery, sum of:", termExplanations);
+      return leafSimScorer.explain(doc, freqExplanation);
     }
   }
 
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/TermAutomatonScorer.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/TermAutomatonScorer.java
index 17c8e58d239..5a44190d1d7 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/TermAutomatonScorer.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/TermAutomatonScorer.java
@@ -53,6 +53,13 @@ class TermAutomatonScorer extends Scorer {
   private int docID = -1;
   private int freq;
 
+  /**
+   * The same EnumAndScorer array that was passed to the constructor (stored by reference, not
+   * copied). It is kept only so that explain can inspect the per-term enums and should not be
+   * used for scoring/matching.
+   */
+  private final EnumAndScorer[] originalSubsOnDoc;
+
   public TermAutomatonScorer(
       TermAutomatonWeight weight, EnumAndScorer[] subs, int anyTermID, LeafSimScorer docScorer)
       throws IOException {
@@ -65,6 +72,7 @@ class TermAutomatonScorer extends Scorer {
     this.anyTermID = anyTermID;
     this.subsOnDoc = new EnumAndScorer[subs.length];
     this.positions = new PosState[4];
+    this.originalSubsOnDoc = subs;
     for (int i = 0; i < this.positions.length; i++) {
       this.positions[i] = new PosState();
     }
@@ -345,6 +353,16 @@ class TermAutomatonScorer extends Scorer {
     }
   }
 
+  /** Returns the EnumAndScorer array this scorer was constructed with; for explain only. */
+  EnumAndScorer[] getOriginalSubsOnDoc() {
+    return originalSubsOnDoc;
+  }
+
+  /** Returns the LeafSimScorer this scorer holds in docScorer; for explain only. */
+  LeafSimScorer getLeafSimScorer() {
+    return docScorer;
+  }
+
   @Override
   public String toString() {
     return "TermAutomatonScorer(" + weight + ")";
diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestTermAutomatonQuery.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestTermAutomatonQuery.java
index 3c3c3aac159..7cd3316a1c1 100644
--- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestTermAutomatonQuery.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestTermAutomatonQuery.java
@@ -43,6 +43,7 @@ import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.ConstantScoreScorer;
 import org.apache.lucene.search.ConstantScoreWeight;
+import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchNoDocsQuery;
 import org.apache.lucene.search.MultiPhraseQuery;
@@ -842,6 +843,97 @@ public class TestTermAutomatonQuery extends LuceneTestCase {
     IOUtils.close(w, r, dir);
   }
 
+  /*
+   * A TermAutomatonQuery whose rewrite() returns itself, so that it is never
+   * replaced by a simpler primitive query. This lets the explain tests below
+   * exercise TermAutomatonQuery's own explain implementation.
+   */
+
+  private static class CustomTermAutomatonQuery extends TermAutomatonQuery {
+    public CustomTermAutomatonQuery(String field) {
+      super(field);
+    }
+
+    @Override
+    public Query rewrite(IndexSearcher searcher) throws IOException {
+      return this;
+    }
+  }
+
+  public void testExplainNoMatchingDocument() throws Exception {
+    CustomTermAutomatonQuery q = new CustomTermAutomatonQuery("field");
+    int initState = q.createState();
+    int s1 = q.createState();
+    q.addTransition(initState, s1, "xml");
+    q.setAccept(s1, true);
+    q.finish();
+
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+    Document doc = new Document();
+    doc.add(newTextField("field", "protobuf", Field.Store.NO));
+    w.addDocument(doc);
+
+    IndexReader r = w.getReader();
+    IndexSearcher searcher = newSearcher(r);
+    Query rewrittenQuery = q.rewrite(searcher);
+    assertTrue(rewrittenQuery instanceof TermAutomatonQuery);
+
+    TopDocs topDocs = searcher.search(rewrittenQuery, 10);
+    assertEquals(0, topDocs.totalHits.value);
+
+    Explanation explanation = searcher.explain(rewrittenQuery, 0);
+    assertFalse("Explanation should indicate no match", explanation.isMatch());
+
+    IOUtils.close(w, r, dir);
+  }
+
+  // TODO: improve experience of working with explain
+  public void testExplainMatchingDocuments() throws Exception {
+    CustomTermAutomatonQuery q = new CustomTermAutomatonQuery("field");
+
+    int initState = q.createState();
+    int s1 = q.createState();
+    int s2 = q.createState();
+    q.addTransition(initState, s1, "xml");
+    q.addTransition(s1, s2, "json");
+    q.addTransition(s1, s2, "protobuf");
+    q.setAccept(s2, true);
+    q.finish();
+
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+
+    Document doc1 = new Document();
+    doc1.add(newTextField("field", "xml json", Field.Store.NO));
+    w.addDocument(doc1);
+
+    Document doc2 = new Document();
+    doc2.add(newTextField("field", "xml protobuf", Field.Store.NO));
+    w.addDocument(doc2);
+
+    Document doc3 = new Document();
+    doc3.add(newTextField("field", "xml qux", Field.Store.NO));
+    w.addDocument(doc3);
+
+    IndexReader r = w.getReader();
+    IndexSearcher searcher = newSearcher(r);
+    Query rewrittenQuery = q.rewrite(searcher);
+    assertTrue(
+        "Rewritten query should be an instance of TermAutomatonQuery",
+        rewrittenQuery instanceof TermAutomatonQuery);
+    TopDocs topDocs = searcher.search(q, 10);
+    assertEquals(2, topDocs.totalHits.value);
+
+    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
+      Explanation explanation = searcher.explain(q, scoreDoc.doc);
+      assertNotNull("Explanation should not be null", explanation);
+      assertTrue("Explanation should indicate a match", explanation.isMatch());
+    }
+
+    IOUtils.close(w, r, dir);
+  }
+
   public void testRewritePhraseWithAny() throws Exception {
     TermAutomatonQuery q = new TermAutomatonQuery("field");
     int initState = q.createState();