LUCENE-10229: Unify behaviour of match offsets for interval queries (#521)

2021-12-09 08:19:18 -08:00 · 2021-12-09 08:19:18 -08:00 · 53099e01de
parent 40c213d873
commit 53099e01de
9 changed files with 299 additions and 22 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -79,6 +79,9 @@ Improvements
 * LUCENE-9538: Detect polygon self-intersections in the Tessellator. (Ignacio Vera)

 * LUCENE-10275: Speed up MultiRangeQuery by using an interval tree. (Ignacio Vera)
+
+* LUCENE-10229: Unify behaviour of match offsets for interval queries on fields
+  with or without offsets enabled. (Patrick Zhai)
  
 Optimizations
 ---------------------
--- a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchHighlighter.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchHighlighter.java
@ -332,8 +332,6 @@ public class TestMatchHighlighter extends LuceneTestCase {
          }
        };

-    // TODO: the highlights are different when the field is indexed with offsets. Weird.
-    // String field = FLD_TEXT1;
    String field = FLD_TEXT2;
    new IndexBuilder(this::toField)
        // Just one document and multiple interval queries.
@ -541,6 +539,235 @@ public class TestMatchHighlighter extends LuceneTestCase {
            });
  }

+  /**
+   * Almost the same as the one above, make sure the fields indexed with offsets are also
+   * highlighted correctly
+   */
+  @Test
+  public void testIntervalFunctionsWithOffsetField() throws Exception {
+    Analyzer analyzer =
+        new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            Tokenizer tokenizer = new StandardTokenizer();
+            TokenStream ts = tokenizer;
+            ts = new LowerCaseFilter(ts);
+            return new TokenStreamComponents(tokenizer, ts);
+          }
+        };
+
+    String field = FLD_TEXT1;
+    new IndexBuilder(this::toField)
+        // Just one document and multiple interval queries.
+        .doc(field, "The quick brown fox jumps over the lazy dog")
+        .build(
+            analyzer,
+            reader -> {
+              IndexSearcher searcher = new IndexSearcher(reader);
+              Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered.
+
+              MatchHighlighter highlighter =
+                  new MatchHighlighter(searcher, analyzer)
+                      .appendFieldHighlighter(
+                          FieldValueHighlighters.highlighted(
+                              80 * 3, 1, new PassageFormatter("...", ">", "<"), fld -> true))
+                      .appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
+
+              StandardQueryParser qp = new StandardQueryParser(analyzer);
+
+              // Run all pairs of query-expected highlight.
+              List<String> errors = new ArrayList<>();
+              for (var queryHighlightPair :
+                  new String[][] {
+                    {
+                      "fn:ordered(brown dog)",
+                      "0. %s: The quick >brown fox jumps over the lazy dog<"
+                    },
+                    {
+                      "fn:within(fn:or(lazy quick) 1 fn:or(dog fox))",
+                      "0. %s: The quick brown fox jumps over the >lazy< dog"
+                    },
+                    {
+                      "fn:containedBy(fox fn:ordered(brown fox dog))",
+                      "0. %s: The quick brown >fox< jumps over the lazy dog"
+                    },
+                    {
+                      "fn:atLeast(2 quick fox \"furry dog\")",
+                      "0. %s: The >quick brown fox< jumps over the lazy dog"
+                    },
+                    {
+                      "fn:maxgaps(0 fn:ordered(fn:or(quick lazy) fn:or(fox dog)))",
+                      "0. %s: The quick brown fox jumps over the >lazy dog<"
+                    },
+                    {
+                      "fn:maxgaps(1 fn:ordered(fn:or(quick lazy) fn:or(fox dog)))",
+                      "0. %s: The >quick brown fox< jumps over the >lazy dog<"
+                    },
+                    {
+                      "fn:maxwidth(2 fn:ordered(fn:or(quick lazy) fn:or(fox dog)))",
+                      "0. %s: The quick brown fox jumps over the >lazy dog<"
+                    },
+                    {
+                      "fn:maxwidth(3 fn:ordered(fn:or(quick lazy) fn:or(fox dog)))",
+                      "0. %s: The >quick brown fox< jumps over the >lazy dog<"
+                    },
+                    {
+                      "fn:or(quick \"fox\")",
+                      "0. %s: The >quick< brown >fox< jumps over the lazy dog"
+                    },
+                    {"fn:or(\"quick fox\")"},
+                    {
+                      "fn:phrase(quick brown fox)",
+                      "0. %s: The >quick brown fox< jumps over the lazy dog"
+                    },
+                    {"fn:wildcard(jump*)", "0. %s: The quick brown fox >jumps< over the lazy dog"},
+                    {"fn:wildcard(br*n)", "0. %s: The quick >brown< fox jumps over the lazy dog"},
+                    {"fn:or(dog fox)", "0. %s: The quick brown >fox< jumps over the lazy >dog<"},
+                    {
+                      "fn:phrase(fn:ordered(quick fox) jumps)",
+                      "0. %s: The >quick brown fox jumps< over the lazy dog"
+                    },
+                    {
+                      "fn:ordered(quick jumps dog)",
+                      "0. %s: The >quick brown fox jumps over the lazy dog<"
+                    },
+                    {
+                      "fn:ordered(quick fn:or(fox dog))",
+                      "0. %s: The >quick brown fox< jumps over the lazy dog"
+                    },
+                    {
+                      "fn:ordered(quick jumps fn:or(fox dog))",
+                      "0. %s: The >quick brown fox jumps over the lazy dog<"
+                    },
+                    {
+                      "fn:unordered(dog jumps quick)",
+                      "0. %s: The >quick brown fox jumps over the lazy dog<"
+                    },
+                    {
+                      "fn:unordered(fn:or(fox dog) quick)",
+                      "0. %s: The >quick brown fox< jumps over the lazy dog"
+                    },
+                    {
+                      "fn:unordered(fn:phrase(brown fox) fn:phrase(fox jumps))",
+                      "0. %s: The quick >brown fox jumps< over the lazy dog"
+                    },
+                    {"fn:ordered(fn:phrase(brown fox) fn:phrase(fox jumps))"},
+                    {"fn:unorderedNoOverlaps(fn:phrase(brown fox) fn:phrase(fox jumps))"},
+                    {
+                      "fn:before(fn:or(brown lazy) fox)",
+                      "0. %s: The quick >brown< fox jumps over the lazy dog"
+                    },
+                    {
+                      "fn:before(fn:or(brown lazy) fn:or(dog fox))",
+                      "0. %s: The quick >brown< fox jumps over the >lazy< dog"
+                    },
+                    {
+                      "fn:after(fn:or(brown lazy) fox)",
+                      "0. %s: The quick brown fox jumps over the >lazy< dog"
+                    },
+                    {
+                      "fn:after(fn:or(brown lazy) fn:or(dog fox))",
+                      "0. %s: The quick brown fox jumps over the >lazy< dog"
+                    },
+                    {
+                      "fn:within(fn:or(fox dog) 1 fn:or(quick lazy))",
+                      "0. %s: The quick brown fox jumps over the lazy >dog<"
+                    },
+                    {
+                      "fn:within(fn:or(fox dog) 2 fn:or(quick lazy))",
+                      "0. %s: The quick brown >fox< jumps over the lazy >dog<"
+                    },
+                    {
+                      "fn:notWithin(fn:or(fox dog) 1 fn:or(quick lazy))",
+                      "0. %s: The quick brown >fox< jumps over the lazy dog"
+                    },
+                    {
+                      "fn:containedBy(fn:or(fox dog) fn:ordered(quick lazy))",
+                      "0. %s: The quick brown >fox< jumps over the lazy dog"
+                    },
+                    {
+                      "fn:notContainedBy(fn:or(fox dog) fn:ordered(quick lazy))",
+                      "0. %s: The quick brown fox jumps over the lazy >dog<"
+                    },
+                    {
+                      "fn:containing(fn:atLeast(2 quick fox dog) jumps)",
+                      "0. %s: The quick brown >fox jumps over the lazy dog<"
+                    },
+                    {
+                      "fn:notContaining(fn:ordered(fn:or(the The) fn:or(fox dog)) brown)",
+                      "0. %s: The quick brown fox jumps over >the lazy dog<"
+                    },
+                    {
+                      "fn:overlapping(fn:phrase(brown fox) fn:phrase(fox jumps))",
+                      "0. %s: The quick >brown fox< jumps over the lazy dog"
+                    },
+                    {
+                      "fn:overlapping(fn:or(fox dog) fn:extend(lazy 2 2))",
+                      "0. %s: The quick brown fox jumps over the lazy >dog<"
+                    },
+                    {
+                      "fn:nonOverlapping(fn:phrase(brown fox) fn:phrase(lazy dog))",
+                      "0. %s: The quick >brown fox< jumps over the lazy dog"
+                    },
+                    {
+                      "fn:nonOverlapping(fn:or(fox dog) fn:extend(lazy 2 2))",
+                      "0. %s: The quick brown >fox< jumps over the lazy dog"
+                    },
+                    {
+                      "fn:atLeast(2 fn:unordered(furry dog) fn:unordered(brown dog) lazy quick)",
+                      "0. %s: The >quick >brown fox jumps over the lazy<<> dog<"
+                    },
+                    /*
+                    The test cases below do not work for fields enabled with offset yet:
+                    mainly "extend"
+                    TODO: Fix them!
+
+                    {"fn:extend(fox 1 2)", "0. %s: The quick >brown fox jumps over< the lazy dog"},
+                    {
+                      "fn:extend(fn:or(dog fox) 2 0)",
+                      "0. %s: The >quick brown fox< jumps over >the lazy dog<"
+                    },
+                    {
+                      "fn:containedBy(fn:or(fox dog) fn:extend(lazy 3 3))",
+                      "0. %s: The quick brown fox jumps over the lazy >dog<"
+                    },
+                    {
+                      "fn:notContainedBy(fn:or(fox dog) fn:extend(lazy 3 3))",
+                      "0. %s: The quick brown >fox< jumps over the lazy dog"
+                    },
+                    {
+                      "fn:containing(fn:extend(fn:or(lazy brown) 1 1) fn:or(fox dog))",
+                      "0. %s: The >quick brown fox< jumps over >the lazy dog<"
+                    },
+                    {
+                      "fn:notContaining(fn:extend(fn:or(fox dog) 1 0) fn:or(brown yellow))",
+                      "0. %s: The quick brown fox jumps over the >lazy dog<"
+                    } */
+                  }) {
+                assert queryHighlightPair.length >= 1;
+                String queryString = queryHighlightPair[0];
+                var query = qp.parse(queryString, field);
+                var expected =
+                    Arrays.stream(queryHighlightPair)
+                        .skip(1)
+                        .map(v -> String.format(Locale.ROOT, v, field))
+                        .toArray(String[]::new);
+
+                try {
+                  assertHighlights(
+                      toDocList(
+                          highlighter.highlight(searcher.search(query, 10, sortOrder), query)),
+                      expected);
+                } catch (AssertionError e) {
+                  errors.add("MISMATCH: query: " + queryString + "\n" + e.getMessage());
+                }
+              }
+              if (errors.size() > 0) {
+                throw new AssertionError(String.join("\n\n", errors));
+              }
+            });
+  }
+
  @Test
  public void testCustomFieldHighlightHandling() throws Exception {
    // Match highlighter is a showcase of individual components in this package, suitable
--- a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchRegionRetriever.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchRegionRetriever.java
@ -379,7 +379,7 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
                          Intervals.containedBy(
                              Intervals.term("foo"),
                              Intervals.unordered(Intervals.term("foo"), Intervals.term("bar"))))),
-                  containsInAnyOrder(fmt("2: (field_text_offs: '>bar baz foo< xyz')", field)));
+                  containsInAnyOrder(fmt("2: (field_text_offs: 'bar baz >foo< xyz')", field)));

              MatcherAssert.assertThat(
                  highlights(
@ -387,9 +387,9 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
                      new IntervalQuery(
                          field,
                          Intervals.overlapping(
-                              Intervals.unordered(Intervals.term("foo"), Intervals.term("bar")),
-                              Intervals.term("foo")))),
-                  containsInAnyOrder(fmt("2: (field_text_offs: '>bar baz foo< xyz')", field)));
+                              Intervals.term("foo"),
+                              Intervals.unordered(Intervals.term("foo"), Intervals.term("bar"))))),
+                  containsInAnyOrder(fmt("2: (field_text_offs: 'bar baz >foo< xyz')", field)));
            });
  }

--- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/ConjunctionIntervalsSource.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/ConjunctionIntervalsSource.java
@ -59,6 +59,15 @@ abstract class ConjunctionIntervalsSource extends IntervalsSource {

  protected abstract IntervalIterator combine(List<IntervalIterator> iterators);

+  /**
+   * Create matches iterator from an advanced and validated interval iterator and a list of matches
+   * iterator of all the sub-sources
+   */
+  protected IntervalMatchesIterator createMatchesIterator(
+      IntervalIterator it, List<IntervalMatchesIterator> subs) {
+    return new ConjunctionMatchesIterator(it, subs);
+  }
+
  @Override
  public final IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc)
      throws IOException {
@ -81,6 +90,6 @@ abstract class ConjunctionIntervalsSource extends IntervalsSource {
    if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
      return null;
    }
-    return new ConjunctionMatchesIterator(it, subs);
+    return createMatchesIterator(it, subs);
  }
 }
--- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/ContainedByIntervalsSource.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/ContainedByIntervalsSource.java
@ -66,6 +66,14 @@ class ContainedByIntervalsSource extends ConjunctionIntervalsSource {
    };
  }

+  @Override
+  protected IntervalMatchesIterator createMatchesIterator(
+      IntervalIterator it, List<IntervalMatchesIterator> subs) {
+    assert subs.size() == 2;
+    // the only sub source we care is the "small" source
+    return new ConjunctionMatchesIterator(it, List.of(subs.get(0)));
+  }
+
  @Override
  public int minExtent() {
    return small.minExtent();
--- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java
@ -275,7 +275,10 @@ public final class Intervals {
  }

  /**
-   * Create an unordered {@link IntervalsSource}
+   * Create an unordered {@link IntervalsSource}. Note that if there are multiple intervals ends at
+   * the same position are eligible, only the narrowest one will be returned. For example if asking
+   * for <code>unordered(term("apple"), term("banana"))</code> on field of "apple wolf apple orange
+   * banana", only the "apple orange banana" will be returned.
   *
   * <p>Returns intervals in which all the subsources appear. The subsources may overlap
   *
--- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/MinimumShouldMatchIntervalsSource.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/MinimumShouldMatchIntervalsSource.java
@ -215,6 +215,7 @@ class MinimumShouldMatchIntervalsSource extends IntervalsSource {

    @Override
    public int nextInterval() throws IOException {
+      lead = null;
      // first, find a matching interval beyond the current start
      while (this.proximityQueue.size() == minShouldMatch
          && proximityQueue.top().start() == start) {
@ -258,7 +259,9 @@ class MinimumShouldMatchIntervalsSource extends IntervalsSource {

    Collection<IntervalIterator> getCurrentIterators() {
      currentIterators.clear();
-      currentIterators.add(lead);
+      if (lead != null) {
+        currentIterators.add(lead);
+      }
      for (IntervalIterator it : this.proximityQueue) {
        if (it.end() <= end) {
          currentIterators.add(it);
--- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/OverlappingIntervalsSource.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/OverlappingIntervalsSource.java
@ -62,6 +62,14 @@ class OverlappingIntervalsSource extends ConjunctionIntervalsSource {
    };
  }

+  @Override
+  protected IntervalMatchesIterator createMatchesIterator(
+      IntervalIterator it, List<IntervalMatchesIterator> subs) {
+    assert subs.size() == 2;
+    // the only sub source we care is the "real" source
+    return new ConjunctionMatchesIterator(it, List.of(subs.get(0)));
+  }
+
  @Override
  public int minExtent() {
    return source.minExtent();
--- a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java
+++ b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java
@ -621,17 +621,13 @@ public class TestIntervals extends LuceneTestCase {
    checkIntervals(
        source, "field1", 3, new int[][] {{}, {4, 4, 7, 7}, {1, 1, 7, 7}, {}, {4, 4}, {}});
    MatchesIterator mi = getMatches(source, 1, "field1");
-    assertMatch(mi, 4, 4, 20, 39);
+    assertMatch(mi, 4, 4, 26, 34);
    MatchesIterator subs = mi.getSubMatches();
-    assertMatch(subs, 3, 3, 20, 25);
    assertMatch(subs, 4, 4, 26, 34);
-    assertMatch(subs, 5, 5, 35, 39);
    assertFalse(subs.next());
-    assertMatch(mi, 7, 7, 41, 118);
+    assertMatch(mi, 7, 7, 47, 55);
    subs = mi.getSubMatches();
-    assertMatch(subs, 6, 6, 41, 46);
    assertMatch(subs, 7, 7, 47, 55);
-    assertMatch(subs, 21, 21, 114, 118);
    assertFalse(subs.next());
    assertFalse(mi.next());
    assertEquals(1, source.minExtent());
@ -788,6 +784,30 @@ public class TestIntervals extends LuceneTestCase {
    assertEquals(3, source.minExtent());
  }

+  public void testMinShouldMatch2() throws IOException {
+    IntervalsSource source =
+        Intervals.atLeast(
+            2,
+            Intervals.unordered(Intervals.term("alph"), Intervals.term("ran")),
+            Intervals.term("where"),
+            Intervals.term("river"));
+    MatchesIterator mi = getMatches(source, 1, "field2");
+    assertMatch(mi, 0, 4, 0, 27); // contains "where" and "river"
+    MatchesIterator subs = mi.getSubMatches();
+    assertNotNull(subs);
+    assertMatch(subs, 0, 0, 0, 5);
+    assertMatch(subs, 4, 4, 22, 27);
+    assertFalse(subs.next());
+    assertMatch(mi, 1, 5, 6, 31); // contains "river" and unordered("alph", "run")
+    subs = mi.getSubMatches();
+    assertNotNull(subs);
+    assertMatch(subs, 1, 1, 6, 10);
+    assertMatch(subs, 4, 4, 22, 27);
+    assertMatch(subs, 5, 5, 28, 31);
+    assertFalse(subs.next());
+    assertFalse(mi.next());
+  }
+
  public void testDegenerateMinShouldMatch() throws IOException {
    IntervalsSource source =
        Intervals.ordered(
@ -851,11 +871,9 @@ public class TestIntervals extends LuceneTestCase {
    checkIntervals(source, "field1", 3, new int[][] {{}, {7, 7}, {4, 4, 7, 7}, {}, {7, 7}, {}});

    MatchesIterator mi = getMatches(source, 1, "field1");
-    assertMatch(mi, 7, 7, 20, 55);
+    assertMatch(mi, 7, 7, 47, 55);
    MatchesIterator sub = mi.getSubMatches();
    assertNotNull(sub);
-    assertMatch(sub, 3, 3, 20, 25);
-    assertMatch(sub, 5, 5, 35, 39);
    assertMatch(sub, 7, 7, 47, 55);
    assertFalse(sub.next());

@ -890,15 +908,13 @@ public class TestIntervals extends LuceneTestCase {

    MatchesIterator mi = getMatches(source, 1, "field1");
    assertNotNull(mi);
-    assertMatch(mi, 2, 4, 15, 39);
+    assertMatch(mi, 2, 4, 15, 34);
    MatchesIterator sub = mi.getSubMatches();
    assertNotNull(sub);
    assertMatch(sub, 2, 2, 15, 18);
-    assertMatch(sub, 3, 3, 20, 25);
    assertMatch(sub, 4, 4, 26, 34);
-    assertMatch(sub, 5, 5, 35, 39);
    assertFalse(sub.next());
-    assertMatch(mi, 7, 17, 41, 118);
+    assertMatch(mi, 7, 17, 47, 99);

    assertEquals(2, source.minExtent());
  }