From d49b36ec9c1c51f900c8d89cb51fb70eba026ba0 Mon Sep 17 00:00:00 2001
From: Mikhail Khludnev <mkhl@apache.org>
Date: Tue, 29 Oct 2019 17:04:43 -0700
Subject: [PATCH] LUCENE-9028: Introduce Intervals.multiterm()

---
 lucene/CHANGES.txt                            |  2 +
 .../lucene/queries/intervals/Intervals.java   | 30 +++++++++++
 .../queries/intervals/TestIntervals.java      | 52 +++++++++++--------
 3 files changed, 62 insertions(+), 22 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index d2d111b29d8..5ff3a52fe03 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -18,6 +18,8 @@ Improvements
 * LUCENE-9006: WordDelimiterGraphFilter's catenateAll token is now ordered before any token parts, like WDF did.
   (David Smiley)
 
+* LUCENE-9028: Add Intervals.multiterm() to build an IntervalsSource from an Automaton. (Mikhail Khludnev)
+
 Optimizations
 
 * LUCENE-8928: When building a kd-tree for dimensions n > 2, compute exact bounds for an inner node every N splits
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java
index 2f5570b8287..6fd3901f23c 100644
--- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java
@@ -25,6 +25,7 @@ import org.apache.lucene.index.Term;
 import org.apache.lucene.search.PrefixQuery;
 import org.apache.lucene.search.WildcardQuery;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.CompiledAutomaton;
 
 /**
@@ -195,6 +196,35 @@ public final class Intervals {
     return new MultiTermIntervalsSource(ca, maxExpansions, wildcard.utf8ToString());
   }
 
+  /**
+   * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that are accepted by the given automaton
+   *
+   * @param automaton the automaton that accepts the terms to expand to
+   * @param pattern string representation of the given automaton, mostly used in exception messages
+   *
+   * @throws IllegalStateException if the automaton expands to more than 128 terms; raised when intervals are requested from the returned source
+   */
+  public static IntervalsSource multiterm(Automaton automaton, String pattern) {
+    return multiterm(automaton, 128, pattern);
+  }
+
+  /**
+   * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that are accepted by the given automaton
+   *
+   * WARNING: Setting {@code maxExpansions} to higher than the default value of 128
+   * can be both slow and memory-intensive
+   *
+   * @param automaton the automaton that accepts the terms to expand to
+   * @param maxExpansions the maximum number of terms to expand to
+   * @param pattern string representation of the given automaton, mostly used in exception messages
+   *
+   * @throws IllegalStateException if the automaton expands to more than {@code maxExpansions} terms; raised when intervals are requested from the returned source
+   */
+  public static IntervalsSource multiterm(Automaton automaton, int maxExpansions, String pattern) {
+    CompiledAutomaton ca = new CompiledAutomaton(automaton);
+    return new MultiTermIntervalsSource(ca, maxExpansions, pattern);
+  }
+  
   /**
    * Create an {@link IntervalsSource} that filters a sub-source by the width of its intervals
    * @param width       the maximum width of intervals in the sub-source to filter
diff --git a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java
index e0c54a5a4c6..942b7d3da2c 100644
--- a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java
+++ b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java
@@ -18,6 +18,7 @@
 package org.apache.lucene.queries.intervals;
 
 import java.io.IOException;
+import java.util.List;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
@@ -37,6 +38,7 @@ import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.ReaderUtil;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchesIterator;
+import org.apache.lucene.search.PrefixQuery;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
@@ -738,30 +740,36 @@ public class TestIntervals extends LuceneTestCase {
   }
 
   public void testPrefix() throws IOException {
-    IntervalsSource source = Intervals.prefix(new BytesRef("p"));
-    checkIntervals(source, "field1", 5, new int[][]{
-        {},
-        { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10, 27, 27 },
-        { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10 },
-        { 7, 7 },
-        { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10 },
-        { 0, 0 }
-    });
-    MatchesIterator mi = getMatches(source, 1, "field1");
-    assertNotNull(mi);
-    assertMatch(mi, 0, 0, 0, 5);
-    assertMatch(mi, 1, 1, 6, 14);
+    for (IntervalsSource source : List.of(Intervals.prefix(new BytesRef("p")),
+        Intervals.multiterm(PrefixQuery.toAutomaton(new BytesRef("p")), "p*" ) )) {
+      checkIntervals(source, "field1", 5, new int[][]{
+          {},
+          { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10, 27, 27 },
+          { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10 },
+          { 7, 7 },
+          { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10 },
+          { 0, 0 }
+      });
+      MatchesIterator mi = getMatches(source, 1, "field1");
+      assertNotNull(mi);
+      assertMatch(mi, 0, 0, 0, 5);
+      assertMatch(mi, 1, 1, 6, 14);
+    }
 
-    IntervalsSource noSuch = Intervals.prefix(new BytesRef("qqq"));
-    checkIntervals(noSuch, "field1", 0, new int[][]{});
+    for (IntervalsSource noSuch : List.of(Intervals.prefix(new BytesRef("qqq")),
+        Intervals.multiterm(PrefixQuery.toAutomaton(new BytesRef("qqq")), "qqq*" ))) {
+      checkIntervals(noSuch, "field1", 0, new int[][]{});
+    }
 
-    IllegalStateException e = expectThrows(IllegalStateException.class, () -> {
-      IntervalsSource s = Intervals.prefix(new BytesRef("p"), 1);
-      for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) {
-        s.intervals("field1", ctx);
-      }
-    });
-    assertEquals("Automaton [p*] expanded to too many terms (limit 1)", e.getMessage());
+    for (IntervalsSource source : List.of(Intervals.prefix(new BytesRef("p"), 1), 
+        Intervals.multiterm(PrefixQuery.toAutomaton(new BytesRef("p")), 1, "p*")) ) {
+      IllegalStateException e = expectThrows(IllegalStateException.class, () -> {
+        for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) {
+          source.intervals("field1", ctx);
+        }
+      });
+      assertEquals("Automaton [p*] expanded to too many terms (limit 1)", e.getMessage());
+    }
   }
 
   public void testWildcard() throws IOException {