mirror of https://github.com/apache/lucene.git
Explain term automaton queries (#12208)
This commit is contained in:
parent
c31017589b
commit
2d7908e3c9
|
@ -179,6 +179,8 @@ Bug Fixes
|
|||
|
||||
* GITHUB#12202: Fix MultiFieldQueryParser to apply boosts to regexp, wildcard, prefix, range, fuzzy queries. (Jasir KT)
|
||||
|
||||
* GITHUB#12178: Add explanations for TermAutomatonQuery (Marcus Eagan via Patrick Zhai, Mike McCandless, Robert Muir, Mikhail Khludnev)
|
||||
|
||||
* GITHUB#12214: Fix ordered intervals query to avoid skipping some of the results over interleaved terms. (Hongyu Yan)
|
||||
|
||||
* GITHUB#12212: Bug fix for a DrillSideways issue where matching hits could occasionally be missed. (Frederic Thevenet)
|
||||
|
|
|
@ -442,8 +442,44 @@ public class TermAutomatonQuery extends Query implements Accountable {
|
|||
|
||||
@Override
|
||||
public Explanation explain(LeafReaderContext context, int doc) throws IOException {
|
||||
// TODO
|
||||
return null;
|
||||
Scorer scorer = scorer(context);
|
||||
if (scorer == null) {
|
||||
return Explanation.noMatch("No matching terms in the document");
|
||||
}
|
||||
|
||||
int advancedDoc = scorer.iterator().advance(doc);
|
||||
if (advancedDoc != doc) {
|
||||
return Explanation.noMatch("No matching terms in the document");
|
||||
}
|
||||
|
||||
float score = scorer.score();
|
||||
LeafSimScorer leafSimScorer = ((TermAutomatonScorer) scorer).getLeafSimScorer();
|
||||
EnumAndScorer[] originalSubsOnDoc = ((TermAutomatonScorer) scorer).getOriginalSubsOnDoc();
|
||||
|
||||
List<Explanation> termExplanations = new ArrayList<>();
|
||||
for (EnumAndScorer enumAndScorer : originalSubsOnDoc) {
|
||||
if (enumAndScorer != null) {
|
||||
PostingsEnum postingsEnum = enumAndScorer.posEnum;
|
||||
if (postingsEnum.docID() == doc) {
|
||||
float termScore = leafSimScorer.score(doc, postingsEnum.freq());
|
||||
termExplanations.add(
|
||||
Explanation.match(
|
||||
postingsEnum.freq(),
|
||||
"term frequency in the document",
|
||||
Explanation.match(
|
||||
termScore,
|
||||
"score for term: " + idToTerm.get(enumAndScorer.termID).utf8ToString())));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (termExplanations.isEmpty()) {
|
||||
return Explanation.noMatch("No matching terms in the document");
|
||||
}
|
||||
|
||||
Explanation freqExplanation =
|
||||
Explanation.match(score, "TermAutomatonQuery, sum of:", termExplanations);
|
||||
return leafSimScorer.explain(doc, freqExplanation);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -53,6 +53,13 @@ class TermAutomatonScorer extends Scorer {
|
|||
private int docID = -1;
|
||||
private int freq;
|
||||
|
||||
/**
|
||||
* originalSubsOnDoc is an array of EnumAndScorer instances used to create this
|
||||
* TermAutomatonScorer. This field is only for explain purposes and should not be used for
|
||||
* scoring/matching.
|
||||
*/
|
||||
private final EnumAndScorer[] originalSubsOnDoc;
|
||||
|
||||
public TermAutomatonScorer(
|
||||
TermAutomatonWeight weight, EnumAndScorer[] subs, int anyTermID, LeafSimScorer docScorer)
|
||||
throws IOException {
|
||||
|
@ -65,6 +72,7 @@ class TermAutomatonScorer extends Scorer {
|
|||
this.anyTermID = anyTermID;
|
||||
this.subsOnDoc = new EnumAndScorer[subs.length];
|
||||
this.positions = new PosState[4];
|
||||
this.originalSubsOnDoc = subs;
|
||||
for (int i = 0; i < this.positions.length; i++) {
|
||||
this.positions[i] = new PosState();
|
||||
}
|
||||
|
@ -345,6 +353,14 @@ class TermAutomatonScorer extends Scorer {
|
|||
}
|
||||
}
|
||||
|
||||
// Returns the sub-scorers this scorer was constructed from, unmodified.
// Exposed (package-private) solely for TermAutomatonQuery's explain();
// per the field's javadoc it must not be used for matching or scoring.
EnumAndScorer[] getOriginalSubsOnDoc() {
return originalSubsOnDoc;
}
|
||||
|
||||
// Returns the per-leaf similarity scorer backing this scorer; exposed
// (package-private) so explain() can score individual terms and produce
// the final similarity explanation.
LeafSimScorer getLeafSimScorer() {
return docScorer;
}
|
||||
|
||||
// Debug representation; delegates to the weight's own toString().
@Override
public String toString() {
return "TermAutomatonScorer(" + weight + ")";
|
||||
|
|
|
@ -43,6 +43,7 @@ import org.apache.lucene.search.BooleanClause.Occur;
|
|||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.ConstantScoreScorer;
|
||||
import org.apache.lucene.search.ConstantScoreWeight;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.MatchNoDocsQuery;
|
||||
import org.apache.lucene.search.MultiPhraseQuery;
|
||||
|
@ -842,6 +843,97 @@ public class TestTermAutomatonQuery extends LuceneTestCase {
|
|||
IOUtils.close(w, r, dir);
|
||||
}
|
||||
|
||||
/* Implement a custom term automaton query to ensure that rewritten queries
|
||||
* do not get rewritten to primitive queries. The custom extension will allow
|
||||
* the following explain tests to evaluate Explain for the query we intend to
|
||||
* test, TermAutomatonQuery.
|
||||
* */
|
||||
|
||||
private static class CustomTermAutomatonQuery extends TermAutomatonQuery {
// Same construction as the base query; only rewrite behavior differs.
public CustomTermAutomatonQuery(String field) {
super(field);
}

// Identity rewrite: keeps the query a TermAutomatonQuery so the explain
// tests exercise TermAutomatonQuery.explain() rather than the explain()
// of a simplified (e.g. term/phrase) rewritten form.
@Override
public Query rewrite(IndexSearcher searcher) throws IOException {
return this;
}
}
|
||||
|
||||
public void testExplainNoMatchingDocument() throws Exception {
|
||||
CustomTermAutomatonQuery q = new CustomTermAutomatonQuery("field");
|
||||
int initState = q.createState();
|
||||
int s1 = q.createState();
|
||||
q.addTransition(initState, s1, "xml");
|
||||
q.setAccept(s1, true);
|
||||
q.finish();
|
||||
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||
Document doc = new Document();
|
||||
doc.add(newTextField("field", "protobuf", Field.Store.NO));
|
||||
w.addDocument(doc);
|
||||
|
||||
IndexReader r = w.getReader();
|
||||
IndexSearcher searcher = newSearcher(r);
|
||||
Query rewrittenQuery = q.rewrite(searcher);
|
||||
assertTrue(rewrittenQuery instanceof TermAutomatonQuery);
|
||||
|
||||
TopDocs topDocs = searcher.search(rewrittenQuery, 10);
|
||||
assertEquals(0, topDocs.totalHits.value);
|
||||
|
||||
Explanation explanation = searcher.explain(rewrittenQuery, 0);
|
||||
assertFalse("Explanation should indicate no match", explanation.isMatch());
|
||||
|
||||
IOUtils.close(w, r, dir);
|
||||
}
|
||||
|
||||
// TODO: improve experience of working with explain
|
||||
public void testExplainMatchingDocuments() throws Exception {
|
||||
CustomTermAutomatonQuery q = new CustomTermAutomatonQuery("field");
|
||||
|
||||
int initState = q.createState();
|
||||
int s1 = q.createState();
|
||||
int s2 = q.createState();
|
||||
q.addTransition(initState, s1, "xml");
|
||||
q.addTransition(s1, s2, "json");
|
||||
q.addTransition(s1, s2, "protobuf");
|
||||
q.setAccept(s2, true);
|
||||
q.finish();
|
||||
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||
|
||||
Document doc1 = new Document();
|
||||
doc1.add(newTextField("field", "xml json", Field.Store.NO));
|
||||
w.addDocument(doc1);
|
||||
|
||||
Document doc2 = new Document();
|
||||
doc2.add(newTextField("field", "xml protobuf", Field.Store.NO));
|
||||
w.addDocument(doc2);
|
||||
|
||||
Document doc3 = new Document();
|
||||
doc3.add(newTextField("field", "xml qux", Field.Store.NO));
|
||||
w.addDocument(doc3);
|
||||
|
||||
IndexReader r = w.getReader();
|
||||
IndexSearcher searcher = newSearcher(r);
|
||||
Query rewrittenQuery = q.rewrite(searcher);
|
||||
assertTrue(
|
||||
"Rewritten query should be an instance of TermAutomatonQuery",
|
||||
rewrittenQuery instanceof TermAutomatonQuery);
|
||||
TopDocs topDocs = searcher.search(q, 10);
|
||||
assertEquals(2, topDocs.totalHits.value);
|
||||
|
||||
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
|
||||
Explanation explanation = searcher.explain(q, scoreDoc.doc);
|
||||
assertNotNull("Explanation should not be null", explanation);
|
||||
assertTrue("Explanation should indicate a match", explanation.isMatch());
|
||||
}
|
||||
|
||||
IOUtils.close(w, r, dir);
|
||||
}
|
||||
|
||||
public void testRewritePhraseWithAny() throws Exception {
|
||||
TermAutomatonQuery q = new TermAutomatonQuery("field");
|
||||
int initState = q.createState();
|
||||
|
|
Loading…
Reference in New Issue