Explain term automaton queries (#12208)

2023-04-08 16:09:42 -07:00 · 2023-04-08 16:09:42 -07:00 · 2d7908e3c9
parent c31017589b
commit 2d7908e3c9
4 changed files with 148 additions and 2 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -179,6 +179,8 @@ Bug Fixes
 * GITHUB#12202: Fix MultiFieldQueryParser to apply boosts to regexp, wildcard, prefix, range, fuzzy queries.  (Jasir KT)
 * GITHUB#12178: Add explanations for TermAutomatonQuery (Marcus Eagan via Patrick Zhai, Mike McCandless, Robert Muir, Mikhail Khludnev)
 * GITHUB#12214: Fix ordered intervals query to avoid skipping some of the results over interleaved terms. (Hongyu Yan)
 * GITHUB#12212: Bug fix for a DrillSideways issue where matching hits could occasionally be missed. (Frederic Thevenet)
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/TermAutomatonQuery.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/TermAutomatonQuery.java
@ -442,8 +442,44 @@ public class TermAutomatonQuery extends Query implements Accountable {
    /**
     * Builds an {@link Explanation} for {@code doc} within the given leaf.
     *
     * <p>A no-match explanation is returned when no scorer can be created for the leaf, when the
     * scorer's iterator does not land exactly on {@code doc}, or when none of the scorer's original
     * per-term postings enums is positioned on {@code doc}. Otherwise one child explanation is
     * produced per term positioned on {@code doc}, and the children are wrapped in a parent that is
     * handed to the scorer's {@link LeafSimScorer}.
     *
     * @param context the leaf to build a scorer for
     * @param doc the document id to explain; it is passed unchanged to the leaf scorer's iterator
     *     (presumably leaf-relative -- confirm against callers)
     * @return a match or no-match explanation for {@code doc}
     * @throws IOException if scoring or reading postings fails
     */
    @Override
    public Explanation explain(LeafReaderContext context, int doc) throws IOException {
-      // TODO
+      Scorer scorer = scorer(context);
-      return null;
+      if (scorer == null) {
        return Explanation.noMatch("No matching terms in the document");
      }
      // advance() may land on a later document; anything other than doc itself means no match.
      int advancedDoc = scorer.iterator().advance(doc);
      if (advancedDoc != doc) {
        return Explanation.noMatch("No matching terms in the document");
      }
      float score = scorer.score();
      // The casts rely on scorer(context) producing a TermAutomatonScorer.
      LeafSimScorer leafSimScorer = ((TermAutomatonScorer) scorer).getLeafSimScorer();
      EnumAndScorer[] originalSubsOnDoc = ((TermAutomatonScorer) scorer).getOriginalSubsOnDoc();
      List<Explanation> termExplanations = new ArrayList<>();
      for (EnumAndScorer enumAndScorer : originalSubsOnDoc) {
        // Entries may be null; only enums currently positioned on doc contribute a detail.
        if (enumAndScorer != null) {
          PostingsEnum postingsEnum = enumAndScorer.posEnum;
          if (postingsEnum.docID() == doc) {
            float termScore = leafSimScorer.score(doc, postingsEnum.freq());
            // Outer explanation carries the term's frequency as its value; the nested detail
            // carries the score computed from that frequency and names the term.
            termExplanations.add(
                Explanation.match(
                    postingsEnum.freq(),
                    "term frequency in the document",
                    Explanation.match(
                        termScore,
                        "score for term: " + idToTerm.get(enumAndScorer.termID).utf8ToString())));
          }
        }
      }
      if (termExplanations.isEmpty()) {
        return Explanation.noMatch("No matching terms in the document");
      }
      // NOTE(review): the value stored here is the scorer's score, while the children carry
      // per-term frequencies, so "sum of:" may not describe the children -- confirm intent.
      Explanation freqExplanation =
          Explanation.match(score, "TermAutomatonQuery, sum of:", termExplanations);
      // NOTE(review): this explanation is named as a frequency but holds the score; verify that
      // LeafSimScorer.explain is meant to receive it in this form.
      return leafSimScorer.explain(doc, freqExplanation);
    }
  }
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/TermAutomatonScorer.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/TermAutomatonScorer.java
@ -53,6 +53,13 @@ class TermAutomatonScorer extends Scorer {
  private int docID = -1;
  private int freq;
  /**
   * originalSubsOnDoc is an array of EnumAndScorer instances used to create this
   * TermAutomatonScorer. This field is only for explain purposes and should not be used for
   * scoring/matching.
   */
  private final EnumAndScorer[] originalSubsOnDoc;
  public TermAutomatonScorer(
      TermAutomatonWeight weight, EnumAndScorer[] subs, int anyTermID, LeafSimScorer docScorer)
      throws IOException {
@ -65,6 +72,7 @@ class TermAutomatonScorer extends Scorer {
    this.anyTermID = anyTermID;
    this.subsOnDoc = new EnumAndScorer[subs.length];
    this.positions = new PosState[4];
    this.originalSubsOnDoc = subs;
    for (int i = 0; i < this.positions.length; i++) {
      this.positions[i] = new PosState();
    }
@ -345,6 +353,14 @@ class TermAutomatonScorer extends Scorer {
    }
  }
  /**
   * Exposes the {@link EnumAndScorer} array kept in {@code originalSubsOnDoc}. The stored array
   * itself is returned, not a copy.
   */
  EnumAndScorer[] getOriginalSubsOnDoc() {
    return this.originalSubsOnDoc;
  }
  /** Exposes the {@link LeafSimScorer} held in {@code docScorer}. */
  LeafSimScorer getLeafSimScorer() {
    return this.docScorer;
  }
  @Override
  public String toString() {
    return "TermAutomatonScorer(" + weight + ")";
--- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestTermAutomatonQuery.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestTermAutomatonQuery.java
@ -43,6 +43,7 @@ import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.ConstantScoreScorer;
 import org.apache.lucene.search.ConstantScoreWeight;
 import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchNoDocsQuery;
 import org.apache.lucene.search.MultiPhraseQuery;
@ -842,6 +843,97 @@ public class TestTermAutomatonQuery extends LuceneTestCase {
    IOUtils.close(w, r, dir);
  }
  /*
   * A TermAutomatonQuery whose rewrite() hands back the query itself, so it is never
   * replaced by a simpler primitive query. This lets the explain tests below exercise
   * the explain implementation of TermAutomatonQuery, which is the query under test.
   */
  private static class CustomTermAutomatonQuery extends TermAutomatonQuery {
    public CustomTermAutomatonQuery(String fieldName) {
      super(fieldName);
    }

    /** Skips all rewriting and returns this very instance. */
    @Override
    public Query rewrite(IndexSearcher searcher) throws IOException {
      return this;
    }
  }
  /** Explaining a document that does not contain the query's only term must yield a non-match. */
  public void testExplainNoMatchingDocument() throws Exception {
    // One transition on "xml" from the first state into an accepting state.
    CustomTermAutomatonQuery query = new CustomTermAutomatonQuery("field");
    int startState = query.createState();
    int acceptState = query.createState();
    query.addTransition(startState, acceptState, "xml");
    query.setAccept(acceptState, true);
    query.finish();

    // Index a single document that only contains "protobuf".
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
    Document onlyDoc = new Document();
    onlyDoc.add(newTextField("field", "protobuf", Field.Store.NO));
    writer.addDocument(onlyDoc);

    IndexReader reader = writer.getReader();
    IndexSearcher searcher = newSearcher(reader);

    Query rewritten = query.rewrite(searcher);
    assertTrue(rewritten instanceof TermAutomatonQuery);

    TopDocs hits = searcher.search(rewritten, 10);
    assertEquals(0, hits.totalHits.value);

    Explanation explanation = searcher.explain(rewritten, 0);
    assertFalse("Explanation should indicate no match", explanation.isMatch());

    IOUtils.close(writer, reader, directory);
  }
  // TODO: improve experience of working with explain
  /** Every hit returned for the query must come with a non-null, matching explanation. */
  public void testExplainMatchingDocuments() throws Exception {
    // "xml" leads to an intermediate state; "json" or "protobuf" then reaches the accept state.
    CustomTermAutomatonQuery query = new CustomTermAutomatonQuery("field");
    int startState = query.createState();
    int afterXml = query.createState();
    int acceptState = query.createState();
    query.addTransition(startState, afterXml, "xml");
    query.addTransition(afterXml, acceptState, "json");
    query.addTransition(afterXml, acceptState, "protobuf");
    query.setAccept(acceptState, true);
    query.finish();

    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
    // The first two texts are expected to match; "xml qux" is not.
    String[] fieldTexts = {"xml json", "xml protobuf", "xml qux"};
    for (String text : fieldTexts) {
      Document document = new Document();
      document.add(newTextField("field", text, Field.Store.NO));
      writer.addDocument(document);
    }

    IndexReader reader = writer.getReader();
    IndexSearcher searcher = newSearcher(reader);

    Query rewritten = query.rewrite(searcher);
    assertTrue(
        "Rewritten query should be an instance of TermAutomatonQuery",
        rewritten instanceof TermAutomatonQuery);

    TopDocs hits = searcher.search(query, 10);
    assertEquals(2, hits.totalHits.value);

    for (ScoreDoc hit : hits.scoreDocs) {
      Explanation explanation = searcher.explain(query, hit.doc);
      assertNotNull("Explanation should not be null", explanation);
      assertTrue("Explanation should indicate a match", explanation.isMatch());
    }

    IOUtils.close(writer, reader, directory);
  }
  public void testRewritePhraseWithAny() throws Exception {
    TermAutomatonQuery q = new TermAutomatonQuery("field");
    int initState = q.createState();