Add IntervalsSource for range and regexp queries (#13562)

We already have convenient functions for constructing IntervalsSource for wildcard and fuzzy queries. This adds functions for regexp and range as well.
2024-07-14 09:48:16 -04:00 · 2024-07-14 09:48:16 -04:00 · 5e52b8094a
parent c55d664b3e
commit 5e52b8094a
3 changed files with 158 additions and 0 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -261,6 +261,9 @@ Improvements

 * GITHUB#13548: Refactor and javadoc update for KNN vector writer classes. (Patrick Zhai)

+* GITHUB#13562: Add Intervals.regexp and Intervals.range methods to produce IntervalsSource
+  for regexp and range queries. (Mayya Sharipova)
+
 Optimizations
 ---------------------

--- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java
@ -27,11 +27,15 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.FuzzyQuery;
 import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.RegexpQuery;
+import org.apache.lucene.search.TermRangeQuery;
 import org.apache.lucene.search.WildcardQuery;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.CompiledAutomaton;
 import org.apache.lucene.util.automaton.LevenshteinAutomata;
 import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.util.automaton.RegExp;

 /**
 * Factory functions for creating {@link IntervalsSource interval sources}.
@ -206,6 +210,88 @@ public final class Intervals {
    return new MultiTermIntervalsSource(ca, maxExpansions, wildcard.utf8ToString());
  }

+  /**
+   * Return an {@link IntervalsSource} over the disjunction of all terms that match a regular
+   * expression, expanded with the default limit of {@link #DEFAULT_MAX_EXPANSIONS} terms
+   *
+   * @param regexp regular expression
+   * @throws IllegalStateException if the regex expands to more than {@link #DEFAULT_MAX_EXPANSIONS}
+   *     terms
+   * @see RegexpQuery for regexp format
+   */
+  public static IntervalsSource regexp(BytesRef regexp) {
+    return regexp(regexp, DEFAULT_MAX_EXPANSIONS);
+  }
+
+  /**
+   * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that match a
+   * regular expression
+   *
+   * <p>WARNING: Setting {@code maxExpansions} to higher than the default value of {@link
+   * #DEFAULT_MAX_EXPANSIONS} can be both slow and memory-intensive
+   *
+   * @param regexp regular expression
+   * @param maxExpansions the maximum number of terms to expand to
+   * @throws IllegalStateException if the regex expands to more than {@code maxExpansions}
+   *     terms
+   * @see RegexpQuery for regexp format
+   */
+  public static IntervalsSource regexp(BytesRef regexp, int maxExpansions) {
+    Automaton automaton = new RegExp(new Term("", regexp).text()).toAutomaton();
+    CompiledAutomaton ca = new CompiledAutomaton(automaton, false, true, false);
+    return new MultiTermIntervalsSource(ca, maxExpansions, regexp.utf8ToString());
+  }
+
+  /**
+   * Return an {@link IntervalsSource} over the disjunction of all terms that fall within the given
+   * range, expanded with the default limit of {@link #DEFAULT_MAX_EXPANSIONS} terms
+   *
+   * @param lowerTerm The term text at the lower end of the range
+   * @param upperTerm The term text at the upper end of the range
+   * @param includeLower If true, the {@code lowerTerm} is included in the range
+   * @param includeUpper If true, the {@code upperTerm} is included in the range
+   * @throws IllegalStateException if the range expands to more than {@link #DEFAULT_MAX_EXPANSIONS}
+   *     terms
+   */
+  public static IntervalsSource range(
+      BytesRef lowerTerm, BytesRef upperTerm, boolean includeLower, boolean includeUpper) {
+    return range(lowerTerm, upperTerm, includeLower, includeUpper, DEFAULT_MAX_EXPANSIONS);
+  }
+
+  /**
+   * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that fall within
+   * the given range
+   *
+   * <p>WARNING: Setting {@code maxExpansions} to higher than the default value of {@link
+   * #DEFAULT_MAX_EXPANSIONS} can be both slow and memory-intensive
+   *
+   * @param lowerTerm The term text at the lower end of the range, or {@code null} if unbounded
+   * @param upperTerm The term text at the upper end of the range, or {@code null} if unbounded
+   * @param includeLower If true, the {@code lowerTerm} is included in the range
+   * @param includeUpper If true, the {@code upperTerm} is included in the range
+   * @param maxExpansions the maximum number of terms to expand to
+   * @throws IllegalStateException if the range expands to more than {@code maxExpansions}
+   *     terms
+   */
+  public static IntervalsSource range(
+      BytesRef lowerTerm,
+      BytesRef upperTerm,
+      boolean includeLower,
+      boolean includeUpper,
+      int maxExpansions) {
+    Automaton automaton =
+        TermRangeQuery.toAutomaton(lowerTerm, upperTerm, includeLower, includeUpper);
+    CompiledAutomaton ca = new CompiledAutomaton(automaton, false, true, true);
+
+    StringBuilder buffer = new StringBuilder();
+    buffer.append("{");
+    buffer.append(lowerTerm == null ? "*" : lowerTerm.utf8ToString()); // null means an open end
+    buffer.append(",");
+    buffer.append(upperTerm == null ? "*" : upperTerm.utf8ToString());
+    buffer.append("}");
+    return new MultiTermIntervalsSource(ca, maxExpansions, buffer.toString());
+  }
+
  /**
   * A fuzzy term {@link IntervalsSource} matches the disjunction of intervals of terms that are
   * within the specified {@code maxEdits} from the provided term.
--- a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java
+++ b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java
@ -1028,6 +1028,40 @@ public class TestIntervals extends LuceneTestCase {
    checkVisits(Intervals.wildcard(new BytesRef("p??")), 1);
  }

+  public void testRegexp() throws IOException {
+    IntervalsSource source = Intervals.regexp(new BytesRef(".ot")); // any single char + "ot"
+    checkIntervals(
+        source,
+        "field1",
+        4,
+        new int[][] {
+          {},
+          {2, 2, 10, 10, 17, 17, 27, 27},
+          {5, 5, 10, 10, 21, 21},
+          {3, 3},
+          {2, 2, 10, 10, 17, 17},
+          {}
+        });
+    MatchesIterator mi = getMatches(source, 4, "field1"); // same positions as row 4 above
+    assertNotNull(mi);
+    assertMatch(mi, 2, 2, 15, 18);
+    assertMatch(mi, 10, 10, 63, 66);
+    assertMatch(mi, 17, 17, 97, 100);
+
+    IllegalStateException e =
+        expectThrows(
+            IllegalStateException.class,
+            () -> {
+              IntervalsSource s = Intervals.regexp(new BytesRef(".ot"), 1); // cap at 1 term
+              for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) {
+                s.intervals("field1", ctx);
+              }
+            });
+    assertEquals("Automaton [.ot] expanded to too many terms (limit 1)", e.getMessage());
+
+    checkVisits(Intervals.regexp(new BytesRef("p.*")), 1);
+  }
+
  public void testFuzzyTerm() throws IOException {
    IntervalsSource source = Intervals.fuzzyTerm("kot", 1); // matches 'pot'
    checkIntervals(
@ -1069,6 +1103,41 @@ public class TestIntervals extends LuceneTestCase {
    checkVisits(Intervals.fuzzyTerm("kot", FuzzyQuery.defaultMaxEdits), 1);
  }

+  public void testRange() throws IOException {
+    IntervalsSource source = Intervals.range(new BytesRef("cold"), new BytesRef("hot"), true, true);
+    checkIntervals(
+        source,
+        "field1",
+        6,
+        new int[][] {
+          {5, 5},
+          {2, 2, 5, 5, 12, 12, 17, 17, 21, 21, 29, 29},
+          {2, 2, 5, 5, 12, 12, 17, 17, 21, 21, 27, 27},
+          {1, 1, 3, 3, 4, 4},
+          {2, 2, 5, 5, 17, 17},
+          {2, 2}
+        });
+    MatchesIterator mi = getMatches(source, 3, "field1"); // same positions as row 3 above
+    assertNotNull(mi);
+    assertMatch(mi, 1, 1, 4, 8);
+    assertMatch(mi, 3, 3, 15, 18);
+    assertMatch(mi, 4, 4, 19, 24);
+
+    IllegalStateException e =
+        expectThrows(
+            IllegalStateException.class,
+            () -> {
+              IntervalsSource s = // same range, but capped at 1 term
+                  Intervals.range(new BytesRef("cold"), new BytesRef("hot"), true, true, 1);
+              for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) {
+                s.intervals("field1", ctx);
+              }
+            });
+    assertEquals("Automaton [{cold,hot}] expanded to too many terms (limit 1)", e.getMessage());
+
+    checkVisits(source, 1);
+  }
+
  public void testWrappedFilters() throws IOException {
    IntervalsSource source =
        Intervals.or(