From d49b36ec9c1c51f900c8d89cb51fb70eba026ba0 Mon Sep 17 00:00:00 2001 From: Mikhail Khludnev Date: Tue, 29 Oct 2019 17:04:43 -0700 Subject: [PATCH] LUCENE-9028: Introduce Intervals.multiterm() --- lucene/CHANGES.txt | 2 + .../lucene/queries/intervals/Intervals.java | 30 +++++++++++ .../queries/intervals/TestIntervals.java | 52 +++++++++++-------- 3 files changed, 62 insertions(+), 22 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index d2d111b29d8..5ff3a52fe03 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -18,6 +18,8 @@ Improvements * LUCENE-9006: WordDelimiterGraphFilter's catenateAll token is now ordered before any token parts, like WDF did. (David Smiley) +* LUCENE-9028: introducing Intervals.multiterm() (Mikhail Khludnev) + Optimizations * LUCENE-8928: When building a kd-tree for dimensions n > 2, compute exact bounds for an inner node every N splits diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java index 2f5570b8287..6fd3901f23c 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java @@ -25,6 +25,7 @@ import org.apache.lucene.index.Term; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.CompiledAutomaton; /** @@ -195,6 +196,35 @@ public final class Intervals { return new MultiTermIntervalsSource(ca, maxExpansions, wildcard.utf8ToString()); } + /** + * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that are accepted by the given automaton + * + * @param automaton the automaton accepting the terms to expand to + * @param pattern string representation of the given automaton, mostly used in exception messages + * + * @throws 
IllegalStateException if the automaton accepts more than 128 terms + */ + public static IntervalsSource multiterm(Automaton automaton, String pattern) { + return multiterm(automaton, 128, pattern); + } + + /** + * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that are accepted by the given automaton + * + * WARNING: Setting {@code maxExpansions} to higher than the default value of 128 + * can be both slow and memory-intensive + * + * @param automaton the automaton accepting the terms to expand to + * @param maxExpansions the maximum number of terms to expand to + * @param pattern string representation of the given automaton, mostly used in exception messages + * + * @throws IllegalStateException if the automaton accepts more than {@code maxExpansions} terms + */ + public static IntervalsSource multiterm(Automaton automaton, int maxExpansions, String pattern) { + CompiledAutomaton ca = new CompiledAutomaton(automaton); + return new MultiTermIntervalsSource(ca, maxExpansions, pattern); + } + /** * Create an {@link IntervalsSource} that filters a sub-source by the width of its intervals * @param width the maximum width of intervals in the sub-source to filter diff --git a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java index e0c54a5a4c6..942b7d3da2c 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java @@ -18,6 +18,7 @@ package org.apache.lucene.queries.intervals; import java.io.IOException; +import java.util.List; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CharArraySet; @@ -37,6 +38,7 @@ import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.ReaderUtil; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchesIterator; +import 
org.apache.lucene.search.PrefixQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; @@ -738,30 +740,36 @@ public class TestIntervals extends LuceneTestCase { } public void testPrefix() throws IOException { - IntervalsSource source = Intervals.prefix(new BytesRef("p")); - checkIntervals(source, "field1", 5, new int[][]{ - {}, - { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10, 27, 27 }, - { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10 }, - { 7, 7 }, - { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10 }, - { 0, 0 } - }); - MatchesIterator mi = getMatches(source, 1, "field1"); - assertNotNull(mi); - assertMatch(mi, 0, 0, 0, 5); - assertMatch(mi, 1, 1, 6, 14); + for (IntervalsSource source : List.of(Intervals.prefix(new BytesRef("p")), + Intervals.multiterm(PrefixQuery.toAutomaton(new BytesRef("p")), "p*" ) )) { + checkIntervals(source, "field1", 5, new int[][]{ + {}, + { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10, 27, 27 }, + { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10 }, + { 7, 7 }, + { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10 }, + { 0, 0 } + }); + MatchesIterator mi = getMatches(source, 1, "field1"); + assertNotNull(mi); + assertMatch(mi, 0, 0, 0, 5); + assertMatch(mi, 1, 1, 6, 14); + } - IntervalsSource noSuch = Intervals.prefix(new BytesRef("qqq")); - checkIntervals(noSuch, "field1", 0, new int[][]{}); + for (IntervalsSource noSuch : List.of(Intervals.prefix(new BytesRef("qqq")), + Intervals.multiterm(PrefixQuery.toAutomaton(new BytesRef("qqq")), "qqq*" ))) { + checkIntervals(noSuch, "field1", 0, new int[][]{}); + } - IllegalStateException e = expectThrows(IllegalStateException.class, () -> { - IntervalsSource s = Intervals.prefix(new BytesRef("p"), 1); - for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) { - s.intervals("field1", ctx); - } - }); - assertEquals("Automaton [p*] expanded to too many terms (limit 1)", e.getMessage()); + for (IntervalsSource source : 
List.of(Intervals.prefix(new BytesRef("p"), 1), + Intervals.multiterm(PrefixQuery.toAutomaton(new BytesRef("p")), 1, "p*")) ) { + IllegalStateException e = expectThrows(IllegalStateException.class, () -> { + for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) { + source.intervals("field1", ctx); + } + }); + assertEquals("Automaton [p*] expanded to too many terms (limit 1)", e.getMessage()); + } } public void testWildcard() throws IOException {