From 90028a7b935ad3205a8a6837cbb7ce1e9dbb6dff Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Mon, 24 Feb 2020 11:08:48 +0000 Subject: [PATCH] LUCENE-9212: Intervals.multiterm() should take CompiledAutomaton --- lucene/CHANGES.txt | 3 + .../lucene/queries/intervals/Intervals.java | 34 ++++++++++ .../queries/intervals/TestIntervals.java | 68 ++++++++++++------- 3 files changed, 81 insertions(+), 24 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 15f53c2fc08..c13cf6abea5 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -24,6 +24,9 @@ API Changes * LUCENE-9218: XY geometries API works in float space. (Ignacio Vera) +* LUCENE-9212: Intervals.multiterm() takes CompiledAutomaton rather than plain Automaton + (Alan Woodward) + New Features --------------------- diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java index 6fd3901f23c..5d835bd0277 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java @@ -203,7 +203,10 @@ public final class Intervals { * @param pattern string representation of the given automaton, mostly used in exception messages * * @throws IllegalStateException if the automaton accepts more than 128 terms + * + * @deprecated use {@link #multiterm(CompiledAutomaton, String)} */ + @Deprecated public static IntervalsSource multiterm(Automaton automaton, String pattern) { return multiterm(automaton, 128, pattern); } @@ -219,11 +222,42 @@ public final class Intervals { * @param pattern string representation of the given automaton, mostly used in exception messages * * @throws IllegalStateException if the automaton accepts more than {@code maxExpansions} terms + * + * @deprecated use {@link #multiterm(CompiledAutomaton, int, String)} */ + @Deprecated public static IntervalsSource multiterm(Automaton 
automaton, int maxExpansions, String pattern) { CompiledAutomaton ca = new CompiledAutomaton(automaton); return new MultiTermIntervalsSource(ca, maxExpansions, pattern); } + + /** + * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that are accepted by the given automaton + * + * @param ca an automaton accepting matching terms + * @param pattern string representation of the given automaton, mostly used in exception messages + * + * @throws IllegalStateException if the automaton accepts more than 128 terms + */ + public static IntervalsSource multiterm(CompiledAutomaton ca, String pattern) { + return multiterm(ca, 128, pattern); + } + + /** + * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that are accepted by the given automaton + * + * WARNING: Setting {@code maxExpansions} to higher than the default value of 128 + * can be both slow and memory-intensive + * + * @param ca an automaton accepting matching terms + * @param maxExpansions the maximum number of terms to expand to + * @param pattern string representation of the given automaton, mostly used in exception messages + * + * @throws IllegalStateException if the automaton accepts more than {@code maxExpansions} terms + */ + public static IntervalsSource multiterm(CompiledAutomaton ca, int maxExpansions, String pattern) { + return new MultiTermIntervalsSource(ca, maxExpansions, pattern); + } /** * Create an {@link IntervalsSource} that filters a sub-source by the width of its intervals diff --git a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java index e98f1152f67..b230dd28d35 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java @@ -43,7 +43,6 @@ import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchesIterator; -import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.QueryVisitor; import org.apache.lucene.search.TermQuery; @@ -51,6 +50,8 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.automaton.RegExp; import org.junit.AfterClass; import org.junit.BeforeClass; @@ -885,36 +886,31 @@ public class TestIntervals extends LuceneTestCase { } public void testPrefix() throws IOException { - for (IntervalsSource source : Arrays.asList(Intervals.prefix(new BytesRef("p")), - Intervals.multiterm(PrefixQuery.toAutomaton(new BytesRef("p")), "p*" ) )) { - checkIntervals(source, "field1", 5, new int[][]{ - {}, - { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10, 27, 27 }, - { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10 }, - { 7, 7 }, - { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10 }, - { 0, 0 } - }); - MatchesIterator mi = getMatches(source, 1, "field1"); - assertNotNull(mi); - assertMatch(mi, 0, 0, 0, 5); - assertMatch(mi, 1, 1, 6, 14); - } - for (IntervalsSource noSuch : Arrays.asList(Intervals.prefix(new BytesRef("qqq")), - Intervals.multiterm(PrefixQuery.toAutomaton(new BytesRef("qqq")), "qqq*" ))) { - checkIntervals(noSuch, "field1", 0, new int[][]{}); - } + IntervalsSource source = Intervals.prefix(new BytesRef("p")); + checkIntervals(source, "field1", 5, new int[][]{ + {}, + {0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10, 27, 27}, + {0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10}, + {7, 7}, + {0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10}, + {0, 0} + }); + MatchesIterator mi = getMatches(source, 1, "field1"); + assertNotNull(mi); + assertMatch(mi, 0, 0, 0, 5); + assertMatch(mi, 1, 1, 6, 14); - for (IntervalsSource source : 
Arrays.asList(Intervals.prefix(new BytesRef("p"), 1), - Intervals.multiterm(PrefixQuery.toAutomaton(new BytesRef("p")), 1, "p*")) ) { + IntervalsSource noSuch = Intervals.prefix(new BytesRef("qqq")); + checkIntervals(noSuch, "field1", 0, new int[][]{}); + + IntervalsSource s = Intervals.prefix(new BytesRef("p"), 1); IllegalStateException e = expectThrows(IllegalStateException.class, () -> { for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) { - source.intervals("field1", ctx); + s.intervals("field1", ctx); } }); assertEquals("Automaton [p*] expanded to too many terms (limit 1)", e.getMessage()); - } checkVisits(Intervals.prefix(new BytesRef("p")), 1); } @@ -964,4 +960,28 @@ public class TestIntervals extends LuceneTestCase { } + public void testMultiTerm() throws IOException { + RegExp re = new RegExp("p.*e"); + IntervalsSource source = Intervals.multiterm(new CompiledAutomaton(re.toAutomaton()), re.toString()); + + checkIntervals(source, "field1", 5, new int[][]{ + {}, + { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7 }, + { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7 }, + { 7, 7 }, + { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7 }, + { 0, 0 } + }); + + IllegalStateException e = expectThrows(IllegalStateException.class, () -> { + IntervalsSource s = Intervals.multiterm(new CompiledAutomaton(re.toAutomaton()), 1, re.toString()); + for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) { + s.intervals("field1", ctx); + } + }); + assertEquals("Automaton [\\p(.)*\\e] expanded to too many terms (limit 1)", e.getMessage()); + + checkVisits(source, 1); + } + }