Add IntervalsSource for range and regexp queries (#13562)

We already have convenient functions for constructing an IntervalsSource
for wildcard and fuzzy queries. This adds functions for
regexp and range queries as well.
This commit is contained in:
Mayya Sharipova 2024-07-14 09:48:16 -04:00 committed by GitHub
parent c55d664b3e
commit 5e52b8094a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 158 additions and 0 deletions

View File

@ -261,6 +261,9 @@ Improvements
* GITHUB#13548: Refactor and javadoc update for KNN vector writer classes. (Patrick Zhai)
* GITHUB#13562: Add Intervals.regexp and Intervals.range methods to produce IntervalsSource
for regexp and range queries. (Mayya Sharipova)
Optimizations
---------------------

View File

@ -27,11 +27,15 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
/**
* Factory functions for creating {@link IntervalsSource interval sources}.
@ -206,6 +210,88 @@ public final class Intervals {
return new MultiTermIntervalsSource(ca, maxExpansions, wildcard.utf8ToString());
}
/**
 * Build an {@link IntervalsSource} over the disjunction of every term accepted by a regular
 * expression, using {@link #DEFAULT_MAX_EXPANSIONS} as the expansion limit.
 *
 * @param regexp regular expression
 * @throws IllegalStateException if the regex expands to more than {@link #DEFAULT_MAX_EXPANSIONS}
 *     terms
 * @see RegexpQuery for regexp format
 */
public static IntervalsSource regexp(BytesRef regexp) {
  // Delegate to the expert overload with the default expansion limit.
  final int expansionLimit = DEFAULT_MAX_EXPANSIONS;
  return regexp(regexp, expansionLimit);
}
/**
 * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that match a
 * regular expression
 *
 * <p>WARNING: Setting {@code maxExpansions} to higher than the default value of {@link
 * #DEFAULT_MAX_EXPANSIONS} can be both slow and memory-intensive
 *
 * @param regexp regular expression
 * @param maxExpansions the maximum number of terms to expand to
 * @throws IllegalStateException if the regex expands to more than {@code maxExpansions} terms
 * @see RegexpQuery for regexp format
 */
public static IntervalsSource regexp(BytesRef regexp, int maxExpansions) {
  // The pattern string handed to the RegExp parser is obtained through Term#text().
  Automaton automaton = new RegExp(new Term("", regexp).text()).toAutomaton();
  // NOTE(review): flags are (false, true, false) here while range() passes (false, true, true);
  // presumably the last flag marks a binary automaton - confirm against CompiledAutomaton.
  CompiledAutomaton ca = new CompiledAutomaton(automaton, false, true, false);
  // The pattern text doubles as the source's description, which is what shows up in the
  // "expanded to too many terms" error message.
  return new MultiTermIntervalsSource(ca, maxExpansions, regexp.utf8ToString());
}
/**
 * Build an {@link IntervalsSource} over the disjunction of every term lying within the given
 * range, using {@link #DEFAULT_MAX_EXPANSIONS} as the expansion limit.
 *
 * @param lowerTerm The term text at the lower end of the range
 * @param upperTerm The term text at the upper end of the range
 * @param includeLower If true, the <code>lowerTerm</code> is included in the range
 * @param includeUpper If true, the <code>upperTerm</code> is included in the range
 * @throws IllegalStateException if the range expands to more than {@link #DEFAULT_MAX_EXPANSIONS}
 *     terms
 */
public static IntervalsSource range(
    BytesRef lowerTerm, BytesRef upperTerm, boolean includeLower, boolean includeUpper) {
  // Delegate to the expert overload with the default expansion limit.
  final int expansionLimit = DEFAULT_MAX_EXPANSIONS;
  return range(lowerTerm, upperTerm, includeLower, includeUpper, expansionLimit);
}
/**
 * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that fall within
 * the given range
 *
 * <p>WARNING: Setting {@code maxExpansions} to higher than the default value of {@link
 * #DEFAULT_MAX_EXPANSIONS} can be both slow and memory-intensive
 *
 * <p>NOTE(review): a {@code null} bound is passed through to {@link
 * TermRangeQuery#toAutomaton(BytesRef, BytesRef, boolean, boolean)} unchanged and is rendered as
 * {@code *} in this source's description; this assumes {@code toAutomaton} treats a null bound
 * as open-ended - confirm against TermRangeQuery.
 *
 * @param lowerTerm The term text at the lower end of the range
 * @param upperTerm The term text at the upper end of the range
 * @param includeLower If true, the <code>lowerTerm</code> is included in the range
 * @param includeUpper If true, the <code>upperTerm</code> is included in the range
 * @param maxExpansions the maximum number of terms to expand to
 * @throws IllegalStateException if the range expands to more than {@code maxExpansions} terms
 */
public static IntervalsSource range(
    BytesRef lowerTerm,
    BytesRef upperTerm,
    boolean includeLower,
    boolean includeUpper,
    int maxExpansions) {
  Automaton automaton =
      TermRangeQuery.toAutomaton(lowerTerm, upperTerm, includeLower, includeUpper);
  CompiledAutomaton ca = new CompiledAutomaton(automaton, false, true, true);

  // Build the "{lower,upper}" description that appears in the "expanded to too many terms"
  // error message. A null bound is rendered as "*" rather than dereferenced, so building the
  // description can no longer throw a NullPointerException.
  String lowerText = lowerTerm == null ? "*" : lowerTerm.utf8ToString();
  String upperText = upperTerm == null ? "*" : upperTerm.utf8ToString();
  String description = "{" + lowerText + "," + upperText + "}";
  return new MultiTermIntervalsSource(ca, maxExpansions, description);
}
/**
* A fuzzy term {@link IntervalsSource} matches the disjunction of intervals of terms that are
* within the specified {@code maxEdits} from the provided term.

View File

@ -1028,6 +1028,40 @@ public class TestIntervals extends LuceneTestCase {
checkVisits(Intervals.wildcard(new BytesRef("p??")), 1);
}
// Exercises Intervals.regexp: expected intervals, match offsets, the expansion limit and visits.
public void testRegexp() throws IOException {
  IntervalsSource regexpSource = Intervals.regexp(new BytesRef(".ot"));

  // Expected intervals, one row per document.
  int[][] expectedIntervals = {
    {},
    {2, 2, 10, 10, 17, 17, 27, 27},
    {5, 5, 10, 10, 21, 21},
    {3, 3},
    {2, 2, 10, 10, 17, 17},
    {}
  };
  checkIntervals(regexpSource, "field1", 4, expectedIntervals);

  MatchesIterator matches = getMatches(regexpSource, 4, "field1");
  assertNotNull(matches);
  assertMatch(matches, 2, 2, 15, 18);
  assertMatch(matches, 10, 10, 63, 66);
  assertMatch(matches, 17, 17, 97, 100);

  // With a limit of one expansion the same pattern must be rejected.
  IllegalStateException tooManyTerms =
      expectThrows(
          IllegalStateException.class,
          () -> {
            IntervalsSource limited = Intervals.regexp(new BytesRef(".ot"), 1);
            for (LeafReaderContext leaf : searcher.getIndexReader().leaves()) {
              limited.intervals("field1", leaf);
            }
          });
  assertEquals(
      "Automaton [.ot] expanded to too many terms (limit 1)", tooManyTerms.getMessage());

  checkVisits(Intervals.regexp(new BytesRef("p.*")), 1);
}
public void testFuzzyTerm() throws IOException {
IntervalsSource source = Intervals.fuzzyTerm("kot", 1); // matches 'pot'
checkIntervals(
@ -1069,6 +1103,41 @@ public class TestIntervals extends LuceneTestCase {
checkVisits(Intervals.fuzzyTerm("kot", FuzzyQuery.defaultMaxEdits), 1);
}
// Exercises Intervals.range: expected intervals, match offsets, the expansion limit and visits.
public void testRange() throws IOException {
  IntervalsSource rangeSource =
      Intervals.range(new BytesRef("cold"), new BytesRef("hot"), true, true);

  // Expected intervals, one row per document.
  int[][] expectedIntervals = {
    {5, 5},
    {2, 2, 5, 5, 12, 12, 17, 17, 21, 21, 29, 29},
    {2, 2, 5, 5, 12, 12, 17, 17, 21, 21, 27, 27},
    {1, 1, 3, 3, 4, 4},
    {2, 2, 5, 5, 17, 17},
    {2, 2}
  };
  checkIntervals(rangeSource, "field1", 6, expectedIntervals);

  MatchesIterator matches = getMatches(rangeSource, 3, "field1");
  assertNotNull(matches);
  assertMatch(matches, 1, 1, 4, 8);
  assertMatch(matches, 3, 3, 15, 18);
  assertMatch(matches, 4, 4, 19, 24);

  // With a limit of one expansion the same range must be rejected.
  IllegalStateException tooManyTerms =
      expectThrows(
          IllegalStateException.class,
          () -> {
            IntervalsSource limited =
                Intervals.range(new BytesRef("cold"), new BytesRef("hot"), true, true, 1);
            for (LeafReaderContext leaf : searcher.getIndexReader().leaves()) {
              limited.intervals("field1", leaf);
            }
          });
  assertEquals(
      "Automaton [{cold,hot}] expanded to too many terms (limit 1)", tooManyTerms.getMessage());

  checkVisits(rangeSource, 1);
}
public void testWrappedFilters() throws IOException {
IntervalsSource source =
Intervals.or(