mirror of https://github.com/apache/lucene.git
Add IntervalsSource for range and regexp queries (#13562)
We already have convenient functions for constructing IntervalsSource for wildcard and fuzzy queries. This adds functions for regexp and range as well.
This commit is contained in:
parent
c55d664b3e
commit
5e52b8094a
|
@ -261,6 +261,9 @@ Improvements
|
|||
|
||||
* GITHUB#13548: Refactor and javadoc update for KNN vector writer classes. (Patrick Zhai)
|
||||
|
||||
* GITHUB#13562: Add Intervals.regexp and Intervals.range methods to produce IntervalsSource
|
||||
for regexp and range queries. (Mayya Sharipova)
|
||||
|
||||
Optimizations
|
||||
---------------------
|
||||
|
||||
|
|
|
@ -27,11 +27,15 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.FuzzyQuery;
|
||||
import org.apache.lucene.search.PrefixQuery;
|
||||
import org.apache.lucene.search.RegexpQuery;
|
||||
import org.apache.lucene.search.TermRangeQuery;
|
||||
import org.apache.lucene.search.WildcardQuery;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||
import org.apache.lucene.util.automaton.LevenshteinAutomata;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
import org.apache.lucene.util.automaton.RegExp;
|
||||
|
||||
/**
|
||||
* Factory functions for creating {@link IntervalsSource interval sources}.
|
||||
|
@ -206,6 +210,88 @@ public final class Intervals {
|
|||
return new MultiTermIntervalsSource(ca, maxExpansions, wildcard.utf8ToString());
|
||||
}
|
||||
|
||||
/**
|
||||
* Return an {@link IntervalsSource} over the disjunction of all terms that match a regular
|
||||
* expression
|
||||
*
|
||||
* @param regexp regular expression
|
||||
* @throws IllegalStateException if the regex expands to more than {@link #DEFAULT_MAX_EXPANSIONS}
|
||||
* terms
|
||||
* @see RegexpQuery for regexp format
|
||||
*/
|
||||
public static IntervalsSource regexp(BytesRef regexp) {
|
||||
return regexp(regexp, DEFAULT_MAX_EXPANSIONS);
|
||||
}
|
||||
|
||||
/**
|
||||
* Expert: Return an {@link IntervalsSource} over the disjunction of all terms that match a
|
||||
* regular expression
|
||||
*
|
||||
* <p>WARNING: Setting {@code maxExpansions} to higher than the default value of {@link
|
||||
* #DEFAULT_MAX_EXPANSIONS} can be both slow and memory-intensive
|
||||
*
|
||||
* @param regexp regular expression
|
||||
* @param maxExpansions the maximum number of terms to expand to
|
||||
* @throws IllegalStateException if the regex expands to more than {@code maxExpansions}
|
||||
* terms
|
||||
* @see RegexpQuery for regexp format
|
||||
*/
|
||||
public static IntervalsSource regexp(BytesRef regexp, int maxExpansions) {
|
||||
Automaton automaton = new RegExp(new Term("", regexp).text()).toAutomaton();
|
||||
CompiledAutomaton ca = new CompiledAutomaton(automaton, false, true, false);
|
||||
return new MultiTermIntervalsSource(ca, maxExpansions, regexp.utf8ToString());
|
||||
}
|
||||
|
||||
/**
|
||||
* Return an {@link IntervalsSource} over the disjunction of all terms that fall within the given
|
||||
* range
|
||||
*
|
||||
* @param lowerTerm The term text at the lower end of the range
|
||||
* @param upperTerm The term text at the upper end of the range
|
||||
* @param includeLower If true, the <code>lowerTerm</code> is included in the range
|
||||
* @param includeUpper If true, the <code>upperTerm</code> is included in the range
|
||||
* @throws IllegalStateException if the range expands to more than {@link #DEFAULT_MAX_EXPANSIONS}
|
||||
* terms
|
||||
*/
|
||||
public static IntervalsSource range(
|
||||
BytesRef lowerTerm, BytesRef upperTerm, boolean includeLower, boolean includeUpper) {
|
||||
return range(lowerTerm, upperTerm, includeLower, includeUpper, DEFAULT_MAX_EXPANSIONS);
|
||||
}
|
||||
|
||||
/**
|
||||
* Expert: Return an {@link IntervalsSource} over the disjunction of all terms that fall within
|
||||
* the given range
|
||||
*
|
||||
* <p>WARNING: Setting {@code maxExpansions} to higher than the default value of {@link
|
||||
* #DEFAULT_MAX_EXPANSIONS} can be both slow and memory-intensive
|
||||
*
|
||||
* @param lowerTerm The term text at the lower end of the range
|
||||
* @param upperTerm The term text at the upper end of the range
|
||||
* @param includeLower If true, the <code>lowerTerm</code> is included in the range
|
||||
* @param includeUpper If true, the <code>upperTerm</code> is included in the range
|
||||
* @param maxExpansions the maximum number of terms to expand to
|
||||
* @throws IllegalStateException if the range expands to more than {@code maxExpansions}
|
||||
* terms
|
||||
*/
|
||||
public static IntervalsSource range(
|
||||
BytesRef lowerTerm,
|
||||
BytesRef upperTerm,
|
||||
boolean includeLower,
|
||||
boolean includeUpper,
|
||||
int maxExpansions) {
|
||||
Automaton automaton =
|
||||
TermRangeQuery.toAutomaton(lowerTerm, upperTerm, includeLower, includeUpper);
|
||||
CompiledAutomaton ca = new CompiledAutomaton(automaton, false, true, true);
|
||||
|
||||
StringBuilder buffer = new StringBuilder();
|
||||
buffer.append("{");
|
||||
buffer.append(lowerTerm.utf8ToString());
|
||||
buffer.append(",");
|
||||
buffer.append(upperTerm.utf8ToString());
|
||||
buffer.append("}");
|
||||
return new MultiTermIntervalsSource(ca, maxExpansions, buffer.toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* A fuzzy term {@link IntervalsSource} matches the disjunction of intervals of terms that are
|
||||
* within the specified {@code maxEdits} from the provided term.
|
||||
|
|
|
@ -1028,6 +1028,40 @@ public class TestIntervals extends LuceneTestCase {
|
|||
checkVisits(Intervals.wildcard(new BytesRef("p??")), 1);
|
||||
}
|
||||
|
||||
public void testRegexp() throws IOException {
|
||||
IntervalsSource source = Intervals.regexp(new BytesRef(".ot"));
|
||||
checkIntervals(
|
||||
source,
|
||||
"field1",
|
||||
4,
|
||||
new int[][] {
|
||||
{},
|
||||
{2, 2, 10, 10, 17, 17, 27, 27},
|
||||
{5, 5, 10, 10, 21, 21},
|
||||
{3, 3},
|
||||
{2, 2, 10, 10, 17, 17},
|
||||
{}
|
||||
});
|
||||
MatchesIterator mi = getMatches(source, 4, "field1");
|
||||
assertNotNull(mi);
|
||||
assertMatch(mi, 2, 2, 15, 18);
|
||||
assertMatch(mi, 10, 10, 63, 66);
|
||||
assertMatch(mi, 17, 17, 97, 100);
|
||||
|
||||
IllegalStateException e =
|
||||
expectThrows(
|
||||
IllegalStateException.class,
|
||||
() -> {
|
||||
IntervalsSource s = Intervals.regexp(new BytesRef(".ot"), 1);
|
||||
for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) {
|
||||
s.intervals("field1", ctx);
|
||||
}
|
||||
});
|
||||
assertEquals("Automaton [.ot] expanded to too many terms (limit 1)", e.getMessage());
|
||||
|
||||
checkVisits(Intervals.regexp(new BytesRef("p.*")), 1);
|
||||
}
|
||||
|
||||
public void testFuzzyTerm() throws IOException {
|
||||
IntervalsSource source = Intervals.fuzzyTerm("kot", 1); // matches 'pot'
|
||||
checkIntervals(
|
||||
|
@ -1069,6 +1103,41 @@ public class TestIntervals extends LuceneTestCase {
|
|||
checkVisits(Intervals.fuzzyTerm("kot", FuzzyQuery.defaultMaxEdits), 1);
|
||||
}
|
||||
|
||||
public void testRange() throws IOException {
|
||||
IntervalsSource source = Intervals.range(new BytesRef("cold"), new BytesRef("hot"), true, true);
|
||||
checkIntervals(
|
||||
source,
|
||||
"field1",
|
||||
6,
|
||||
new int[][] {
|
||||
{5, 5},
|
||||
{2, 2, 5, 5, 12, 12, 17, 17, 21, 21, 29, 29},
|
||||
{2, 2, 5, 5, 12, 12, 17, 17, 21, 21, 27, 27},
|
||||
{1, 1, 3, 3, 4, 4},
|
||||
{2, 2, 5, 5, 17, 17},
|
||||
{2, 2}
|
||||
});
|
||||
MatchesIterator mi = getMatches(source, 3, "field1");
|
||||
assertNotNull(mi);
|
||||
assertMatch(mi, 1, 1, 4, 8);
|
||||
assertMatch(mi, 3, 3, 15, 18);
|
||||
assertMatch(mi, 4, 4, 19, 24);
|
||||
|
||||
IllegalStateException e =
|
||||
expectThrows(
|
||||
IllegalStateException.class,
|
||||
() -> {
|
||||
IntervalsSource s =
|
||||
Intervals.range(new BytesRef("cold"), new BytesRef("hot"), true, true, 1);
|
||||
for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) {
|
||||
s.intervals("field1", ctx);
|
||||
}
|
||||
});
|
||||
assertEquals("Automaton [{cold,hot}] expanded to too many terms (limit 1)", e.getMessage());
|
||||
|
||||
checkVisits(source, 1);
|
||||
}
|
||||
|
||||
public void testWrappedFilters() throws IOException {
|
||||
IntervalsSource source =
|
||||
Intervals.or(
|
||||
|
|
Loading…
Reference in New Issue