mirror of https://github.com/apache/lucene.git
LUCENE-8646: Multi-term intervals
This commit is contained in:
parent
87d68c8253
commit
7d7ab14776
|
@ -99,7 +99,7 @@ class DisjunctionIntervalsSource extends IntervalsSource {
|
|||
return minExtent;
|
||||
}
|
||||
|
||||
private static class DisjunctionIntervalIterator extends IntervalIterator {
|
||||
static class DisjunctionIntervalIterator extends IntervalIterator {
|
||||
|
||||
final DocIdSetIterator approximation;
|
||||
final PriorityQueue<IntervalIterator> intervalQueue;
|
||||
|
|
|
@ -19,7 +19,11 @@ package org.apache.lucene.search.intervals;
|
|||
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.PrefixQuery;
|
||||
import org.apache.lucene.search.WildcardQuery;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||
|
||||
/**
|
||||
* Constructor functions for {@link IntervalsSource} types
|
||||
|
@ -75,6 +79,28 @@ public final class Intervals {
|
|||
return new DisjunctionIntervalsSource(Arrays.asList(subSources));
|
||||
}
|
||||
|
||||
/**
|
||||
* Return an {@link IntervalsSource} over the disjunction of all terms that begin with a prefix
|
||||
*
|
||||
* @throws IllegalStateException if the prefix expands to more than 128 terms
|
||||
*/
|
||||
public static IntervalsSource prefix(String prefix) {
|
||||
CompiledAutomaton ca = new CompiledAutomaton(PrefixQuery.toAutomaton(new BytesRef(prefix)));
|
||||
return new MultiTermIntervalsSource(ca, 128, prefix);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return an {@link IntervalsSource} over the disjunction of all terms that match a wildcard glob
|
||||
*
|
||||
* @throws IllegalStateException if the wildcard glob expands to more than 128 terms
|
||||
*
|
||||
* @see WildcardQuery for glob format
|
||||
*/
|
||||
public static IntervalsSource wildcard(String wildcard) {
|
||||
CompiledAutomaton ca = new CompiledAutomaton(WildcardQuery.toAutomaton(new Term("", wildcard)));
|
||||
return new MultiTermIntervalsSource(ca, 128, wildcard);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an {@link IntervalsSource} that filters a sub-source by the width of its intervals
|
||||
* @param width the maximum width of intervals in the sub-source to filter
|
||||
|
|
|
@ -0,0 +1,120 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.search.intervals;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.MatchesIterator;
|
||||
import org.apache.lucene.search.MatchesUtils;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||
|
||||
class MultiTermIntervalsSource extends IntervalsSource {
|
||||
|
||||
private final CompiledAutomaton automaton;
|
||||
private final int maxExpansions;
|
||||
private final String pattern;
|
||||
|
||||
MultiTermIntervalsSource(CompiledAutomaton automaton, int maxExpansions, String pattern) {
|
||||
this.automaton = automaton;
|
||||
this.maxExpansions = maxExpansions;
|
||||
this.pattern = pattern;
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException {
|
||||
Terms terms = ctx.reader().terms(field);
|
||||
if (terms == null) {
|
||||
return null;
|
||||
}
|
||||
List<IntervalIterator> subSources = new ArrayList<>();
|
||||
TermsEnum te = automaton.getTermsEnum(terms);
|
||||
BytesRef term;
|
||||
int count = 0;
|
||||
while ((term = te.next()) != null) {
|
||||
subSources.add(TermIntervalsSource.intervals(term, te));
|
||||
if (count++ > maxExpansions) {
|
||||
throw new IllegalStateException("Automaton " + this.pattern + " expanded to too many terms (limit " + maxExpansions + ")");
|
||||
}
|
||||
}
|
||||
if (subSources.size() == 0) {
|
||||
return null;
|
||||
}
|
||||
return new DisjunctionIntervalsSource.DisjunctionIntervalIterator(subSources);
|
||||
}
|
||||
|
||||
@Override
|
||||
public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
|
||||
Terms terms = ctx.reader().terms(field);
|
||||
if (terms == null) {
|
||||
return null;
|
||||
}
|
||||
List<MatchesIterator> subMatches = new ArrayList<>();
|
||||
TermsEnum te = automaton.getTermsEnum(terms);
|
||||
BytesRef term;
|
||||
int count = 0;
|
||||
while ((term = te.next()) != null) {
|
||||
MatchesIterator mi = TermIntervalsSource.matches(te, doc);
|
||||
if (mi != null) {
|
||||
subMatches.add(mi);
|
||||
if (count++ > maxExpansions) {
|
||||
throw new IllegalStateException("Automaton " + term + " expanded to too many terms (limit " + maxExpansions + ")");
|
||||
}
|
||||
}
|
||||
}
|
||||
return MatchesUtils.disjunction(subMatches);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void extractTerms(String field, Set<Term> terms) {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public int minExtent() {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
MultiTermIntervalsSource that = (MultiTermIntervalsSource) o;
|
||||
return maxExpansions == that.maxExpansions &&
|
||||
Objects.equals(automaton, that.automaton) &&
|
||||
Objects.equals(pattern, that.pattern);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(automaton, maxExpansions, pattern);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "MultiTerm(" + pattern + ")";
|
||||
}
|
||||
}
|
|
@ -55,6 +55,10 @@ class TermIntervalsSource extends IntervalsSource {
|
|||
if (te.seekExact(term) == false) {
|
||||
return null;
|
||||
}
|
||||
return intervals(term, te);
|
||||
}
|
||||
|
||||
static IntervalIterator intervals(BytesRef term, TermsEnum te) throws IOException {
|
||||
PostingsEnum pe = te.postings(null, PostingsEnum.POSITIONS);
|
||||
float cost = termPositionsCost(te);
|
||||
return new IntervalIterator() {
|
||||
|
@ -143,6 +147,10 @@ class TermIntervalsSource extends IntervalsSource {
|
|||
if (te.seekExact(term) == false) {
|
||||
return null;
|
||||
}
|
||||
return matches(te, doc);
|
||||
}
|
||||
|
||||
static MatchesIterator matches(TermsEnum te, int doc) throws IOException {
|
||||
PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS);
|
||||
if (pe.advance(doc) != doc) {
|
||||
return null;
|
||||
|
|
|
@ -726,4 +726,40 @@ public class TestIntervals extends LuceneTestCase {
|
|||
|
||||
}
|
||||
|
||||
public void testPrefix() throws IOException {
|
||||
IntervalsSource source = Intervals.prefix("p");
|
||||
checkIntervals(source, "field1", 5, new int[][]{
|
||||
{},
|
||||
{ 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10, 27, 27 },
|
||||
{ 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10 },
|
||||
{ 7, 7 },
|
||||
{ 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10 },
|
||||
{ 0, 0 }
|
||||
});
|
||||
MatchesIterator mi = getMatches(source, 1, "field1");
|
||||
assertNotNull(mi);
|
||||
assertMatch(mi, 0, 0, 0, 5);
|
||||
assertMatch(mi, 1, 1, 6, 14);
|
||||
|
||||
IntervalsSource noSuch = Intervals.prefix("qqq");
|
||||
checkIntervals(noSuch, "field1", 0, new int[][]{});
|
||||
}
|
||||
|
||||
public void testWildcard() throws IOException {
|
||||
IntervalsSource source = Intervals.wildcard("?ot");
|
||||
checkIntervals(source, "field1", 4, new int[][]{
|
||||
{},
|
||||
{ 2, 2, 10, 10, 17, 17, 27, 27 },
|
||||
{ 5, 5, 10, 10, 21, 21 },
|
||||
{ 3, 3 },
|
||||
{ 2, 2, 10, 10, 17, 17 },
|
||||
{}
|
||||
});
|
||||
MatchesIterator mi = getMatches(source, 4, "field1");
|
||||
assertNotNull(mi);
|
||||
assertMatch(mi, 2, 2, 15, 18);
|
||||
assertMatch(mi, 10, 10, 63, 66);
|
||||
assertMatch(mi, 17, 17, 97, 100);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue