LUCENE-9028: Introduce Intervals.multiterm()

This commit is contained in:
Mikhail Khludnev 2019-10-29 17:04:43 -07:00
parent b17d630e50
commit 3cf131de52
3 changed files with 62 additions and 22 deletions

View File

@ -79,6 +79,8 @@ Improvements
* LUCENE-9006: WordDelimiterGraphFilter's catenateAll token is now ordered before any token parts, like WDF did.
(David Smiley)
* LUCENE-9028: Introduce Intervals.multiterm(), which builds an IntervalsSource over the terms accepted by a given Automaton. (Mikhail Khludnev)
Optimizations
* LUCENE-8928: When building a kd-tree for dimensions n > 2, compute exact bounds for an inner node every N splits

View File

@ -25,6 +25,7 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
/**
@ -195,6 +196,35 @@ public final class Intervals {
return new MultiTermIntervalsSource(ca, maxExpansions, wildcard.utf8ToString());
}
/**
 * Expert: Build an {@link IntervalsSource} over the disjunction of all terms accepted by the
 * given automaton, expanding to at most 128 terms.
 *
 * @param automaton the automaton accepting the terms to expand to
 * @param pattern   string representation of the given automaton, mostly used in exception messages
 *
 * @throws IllegalStateException if the automaton accepts more than 128 terms
 */
public static IntervalsSource multiterm(Automaton automaton, String pattern) {
  // Default expansion limit; callers needing another limit use the three-argument overload.
  final int defaultMaxExpansions = 128;
  return multiterm(automaton, defaultMaxExpansions, pattern);
}
/**
 * Expert: Build an {@link IntervalsSource} over the disjunction of all terms accepted by the
 * given automaton.
 *
 * WARNING: Raising {@code maxExpansions} above the default value of 128
 * can be both slow and memory-intensive
 *
 * @param automaton     the automaton accepting the terms to expand to
 * @param maxExpansions the maximum number of terms to expand to
 * @param pattern       string representation of the given automaton, mostly used in exception messages
 *
 * @throws IllegalStateException if the automaton accepts more than {@code maxExpansions} terms
 */
public static IntervalsSource multiterm(Automaton automaton, int maxExpansions, String pattern) {
  // The automaton is compiled here and handed straight to the multi-term source.
  return new MultiTermIntervalsSource(new CompiledAutomaton(automaton), maxExpansions, pattern);
}
/**
* Create an {@link IntervalsSource} that filters a sub-source by the width of its intervals
* @param width the maximum width of intervals in the sub-source to filter

View File

@ -18,6 +18,7 @@
package org.apache.lucene.queries.intervals;
import java.io.IOException;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
@ -37,6 +38,7 @@ import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
@ -738,30 +740,36 @@ public class TestIntervals extends LuceneTestCase {
}
// Exercises prefix-style interval sources on "field1": the expectation arrays passed to
// checkIntervals, the first two matches from getMatches(source, 1, "field1"), a prefix
// ("qqq") checked against an empty expectation, and the failure raised with a limit of 1.
// NOTE(review): this view of the diff has lost its +/- markers, so the stand-alone statements
// below (those outside the for-loops) appear to be the pre-change version of the checks that
// the loops repeat for both Intervals.prefix() and Intervals.multiterm() — confirm against
// the real file before relying on this text as compilable code.
public void testPrefix() throws IOException {
IntervalsSource source = Intervals.prefix(new BytesRef("p"));
checkIntervals(source, "field1", 5, new int[][]{
{},
{ 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10, 27, 27 },
{ 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10 },
{ 7, 7 },
{ 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10 },
{ 0, 0 }
});
MatchesIterator mi = getMatches(source, 1, "field1");
assertNotNull(mi);
assertMatch(mi, 0, 0, 0, 5);
assertMatch(mi, 1, 1, 6, 14);
// Same assertions, run once for the prefix source and once for a multiterm source built
// from PrefixQuery.toAutomaton() over the same bytes.
for (IntervalsSource source : List.of(Intervals.prefix(new BytesRef("p")),
Intervals.multiterm(PrefixQuery.toAutomaton(new BytesRef("p")), "p*" ) )) {
checkIntervals(source, "field1", 5, new int[][]{
{},
{ 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10, 27, 27 },
{ 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10 },
{ 7, 7 },
{ 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10 },
{ 0, 0 }
});
MatchesIterator mi = getMatches(source, 1, "field1");
assertNotNull(mi);
assertMatch(mi, 0, 0, 0, 5);
assertMatch(mi, 1, 1, 6, 14);
}
// "qqq" case: checkIntervals is called with 0 and an empty expectation array.
IntervalsSource noSuch = Intervals.prefix(new BytesRef("qqq"));
checkIntervals(noSuch, "field1", 0, new int[][]{});
for (IntervalsSource noSuch : List.of(Intervals.prefix(new BytesRef("qqq")),
Intervals.multiterm(PrefixQuery.toAutomaton(new BytesRef("qqq")), "qqq*" ))) {
checkIntervals(noSuch, "field1", 0, new int[][]{});
}
// With a limit of 1, requesting intervals from each leaf is expected to throw
// IllegalStateException carrying the message asserted below.
IllegalStateException e = expectThrows(IllegalStateException.class, () -> {
IntervalsSource s = Intervals.prefix(new BytesRef("p"), 1);
for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) {
s.intervals("field1", ctx);
}
});
assertEquals("Automaton [p*] expanded to too many terms (limit 1)", e.getMessage());
// In the loop form the sources are built outside expectThrows; only the intervals() calls
// are wrapped, so the exception is expected from intervals() rather than from construction.
for (IntervalsSource source : List.of(Intervals.prefix(new BytesRef("p"), 1),
Intervals.multiterm(PrefixQuery.toAutomaton(new BytesRef("p")), 1, "p*")) ) {
IllegalStateException e = expectThrows(IllegalStateException.class, () -> {
for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) {
source.intervals("field1", ctx);
}
});
assertEquals("Automaton [p*] expanded to too many terms (limit 1)", e.getMessage());
}
}
public void testWildcard() throws IOException {