From e8950f4a528605f9be17c644eef4f47d0659317b Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Mon, 10 Jun 2019 15:49:15 +0100 Subject: [PATCH] LUCENE-8845: Allow configurable maxExpansions for prefix/wildcard intervals --- lucene/CHANGES.txt | 3 ++ .../lucene/search/intervals/Intervals.java | 36 +++++++++++++++++-- .../intervals/MultiTermIntervalsSource.java | 4 +-- .../search/intervals/TestIntervals.java | 16 +++++++++ 4 files changed, 55 insertions(+), 4 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index e2dd243539c..1c818bdb695 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -85,6 +85,9 @@ Improvements * LUCENE-8818: Fix smokeTestRelease.py encoding bug (janhoy) +* LUCENE-8845: Allow Intervals.prefix() and Intervals.wildcard() to specify + their maximum allowed expansions (Alan Woodward) + Test Framework * LUCENE-8825: CheckHits now display the shard index in case of mismatch diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/Intervals.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/Intervals.java index d579c6f4eb4..cfe132a221a 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/Intervals.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/Intervals.java @@ -147,8 +147,23 @@ public final class Intervals { * @throws IllegalStateException if the prefix expands to more than 128 terms */ public static IntervalsSource prefix(String prefix) { + return prefix(prefix, 128); + } + + /** + * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that begin with a prefix + * + * WARNING: Setting {@code maxExpansions} to higher than the default value of 128 + * can be both slow and memory-intensive + * + * @param prefix the prefix to expand + * @param maxExpansions the maximum number of terms to expand to + * + * @throws IllegalStateException if the prefix expands to more than {@code maxExpansions} terms + */ + public static IntervalsSource prefix(String prefix, int maxExpansions) { CompiledAutomaton ca = new CompiledAutomaton(PrefixQuery.toAutomaton(new BytesRef(prefix))); - return new MultiTermIntervalsSource(ca, 128, prefix); + return new MultiTermIntervalsSource(ca, maxExpansions, prefix + "*"); } /** @@ -159,8 +174,25 @@ public final class Intervals { * @see WildcardQuery for glob format */ public static IntervalsSource wildcard(String wildcard) { + return wildcard(wildcard, 128); + } + + /** + * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that match a wildcard glob + * + * WARNING: Setting {@code maxExpansions} to higher than the default value of 128 + * can be both slow and memory-intensive + * + * @param wildcard the glob to expand + * @param maxExpansions the maximum number of terms to expand to + * + * @throws IllegalStateException if the wildcard glob expands to more than {@code maxExpansions} terms + * + * @see WildcardQuery for glob format + */ + public static IntervalsSource wildcard(String wildcard, int maxExpansions) { CompiledAutomaton ca = new CompiledAutomaton(WildcardQuery.toAutomaton(new Term("", wildcard))); - return new MultiTermIntervalsSource(ca, 128, wildcard); + return new MultiTermIntervalsSource(ca, maxExpansions, wildcard); } /** diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/MultiTermIntervalsSource.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/MultiTermIntervalsSource.java index 4b1d23328f3..213ef730476 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/MultiTermIntervalsSource.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/MultiTermIntervalsSource.java @@ -57,8 +57,8 @@ class MultiTermIntervalsSource extends IntervalsSource { int count = 0; while ((term = te.next()) != null) { subSources.add(TermIntervalsSource.intervals(term, te)); - if (count++ > maxExpansions) { - throw new IllegalStateException("Automaton " + this.pattern + " expanded to too many terms (limit " + maxExpansions + ")"); + if (++count > maxExpansions) { + throw new IllegalStateException("Automaton [" + this.pattern + "] expanded to too many terms (limit " + maxExpansions + ")"); } } if (subSources.size() == 0) { diff --git a/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java b/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java index 8bf7f8a33bb..2130514941c 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java +++ b/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java @@ -753,6 +753,14 @@ public class TestIntervals extends LuceneTestCase { IntervalsSource noSuch = Intervals.prefix("qqq"); checkIntervals(noSuch, "field1", 0, new int[][]{}); + + IllegalStateException e = expectThrows(IllegalStateException.class, () -> { + IntervalsSource s = Intervals.prefix("p", 1); + for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) { + s.intervals("field1", ctx); + } + }); + assertEquals("Automaton [p*] expanded to too many terms (limit 1)", e.getMessage()); } public void testWildcard() throws IOException { @@ -770,6 +778,14 @@ public class TestIntervals extends LuceneTestCase { assertMatch(mi, 2, 2, 15, 18); assertMatch(mi, 10, 10, 63, 66); assertMatch(mi, 17, 17, 97, 100); + + IllegalStateException e = expectThrows(IllegalStateException.class, () -> { + IntervalsSource s = Intervals.wildcard("?ot", 1); + for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) { + s.intervals("field1", ctx); + } + }); + assertEquals("Automaton [?ot] expanded to too many terms (limit 1)", e.getMessage()); } public void testWrappedFilters() throws IOException {