LUCENE-8597: Add IntervalIterator.gaps() and Intervals.maxgaps()

This commit is contained in:
Alan Woodward 2018-12-10 09:47:29 +00:00
parent 18356de837
commit 1305501dbd
15 changed files with 285 additions and 33 deletions

View File

@ -98,6 +98,10 @@ API Changes
methods. This decouples normalization from tokenization entirely.
(Mayya Sharipova, Alan Woodward)
* LUCENE-8597: IntervalIterator now exposes a gaps() method that reports the
number of gaps between its component sub-intervals. This can be used in a
new filter available via Intervals.maxgaps(). (Alan Woodward)
Changes in Runtime Behavior
* LUCENE-8333: Switch MoreLikeThis.setMaxDocFreqPct to use maxDoc instead of

View File

@ -125,6 +125,11 @@ abstract class DifferenceIntervalFunction {
return a.end();
}
/**
 * Reports the gaps of the current interval of {@code a}; {@code b} is not consulted.
 */
@Override
public int gaps() {
  final int gapsInA = a.gaps();
  return gapsInA;
}
@Override
public float matchCost() {
return a.matchCost() + b.matchCost();
@ -232,6 +237,11 @@ abstract class DifferenceIntervalFunction {
return newEnd;
}
// NOTE(review): this iterator does not report gaps; presumably gaps() is never
// consulted on it by its consumers - confirm before relying on that.
@Override
public int gaps() {
throw new UnsupportedOperationException();
}
@Override
public int nextInterval() throws IOException {
if (positioned == false) {

View File

@ -135,6 +135,11 @@ class DisjunctionIntervalsSource extends IntervalsSource {
return current.end();
}
/**
 * Reports the gaps of {@code current}, the sub-iterator whose interval is
 * currently being exposed.
 */
@Override
public int gaps() {
  final int gapsOfCurrent = current.gaps();
  return gapsOfCurrent;
}
private void reset() throws IOException {
intervalQueue.clear();
for (DisiWrapper dw = disiQueue.topList(); dw != null; dw = dw.next) {
@ -228,6 +233,11 @@ class DisjunctionIntervalsSource extends IntervalsSource {
return -1;
}
// This iterator never yields an interval (its nextInterval() always returns
// NO_MORE_INTERVALS), so there is no valid state in which gaps() may be called.
@Override
public int gaps() {
throw new UnsupportedOperationException();
}
@Override
public int nextInterval() {
return NO_MORE_INTERVALS;
@ -271,6 +281,11 @@ class DisjunctionIntervalsSource extends IntervalsSource {
return NO_MORE_INTERVALS;
}
// This iterator never yields an interval (its nextInterval() always returns
// NO_MORE_INTERVALS), so there is no valid state in which gaps() may be called.
@Override
public int gaps() {
throw new UnsupportedOperationException();
}
@Override
public int nextInterval() {
return NO_MORE_INTERVALS;

View File

@ -25,34 +25,28 @@ import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.MatchesIterator;
class LowpassIntervalsSource extends IntervalsSource {
/**
* An IntervalsSource that filters the intervals from another IntervalsSource
*/
public abstract class FilteredIntervalsSource extends IntervalsSource {
final IntervalsSource in;
private final int maxWidth;
private final String name;
private final IntervalsSource in;
LowpassIntervalsSource(IntervalsSource in, int maxWidth) {
/**
* Create a new FilteredIntervalsSource
* @param name the name of the filter
* @param in the source to filter
*/
public FilteredIntervalsSource(String name, IntervalsSource in) {
this.name = name;
this.in = in;
this.maxWidth = maxWidth;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
LowpassIntervalsSource that = (LowpassIntervalsSource) o;
return maxWidth == that.maxWidth &&
Objects.equals(in, that.in);
}
@Override
public String toString() {
return "MAXWIDTH/" + maxWidth + "(" + in + ")";
}
@Override
public void extractTerms(String field, Set<Term> terms) {
in.extractTerms(field, terms);
}
/**
* @return {@code false} if the current interval should be filtered out
*/
protected abstract boolean accept(IntervalIterator it);
@Override
public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException {
@ -63,7 +57,7 @@ class LowpassIntervalsSource extends IntervalsSource {
return new IntervalFilter(i) {
@Override
protected boolean accept() {
return (i.end() - i.start()) + 1 <= maxWidth;
return FilteredIntervalsSource.this.accept(in);
}
};
}
@ -77,14 +71,33 @@ class LowpassIntervalsSource extends IntervalsSource {
IntervalIterator filtered = new IntervalFilter(IntervalMatches.wrapMatches(mi, doc)) {
@Override
protected boolean accept() {
return (this.in.end() - this.in.start()) + 1 <= maxWidth;
return FilteredIntervalsSource.this.accept(in);
}
};
return IntervalMatches.asMatches(filtered, mi, doc);
}
@Override
public void extractTerms(String field, Set<Term> terms) {
in.extractTerms(field, terms);
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
FilteredIntervalsSource that = (FilteredIntervalsSource) o;
return Objects.equals(name, that.name) &&
Objects.equals(in, that.in);
}
@Override
public int hashCode() {
return Objects.hash(in, maxWidth);
return Objects.hash(name, in);
}
@Override
public String toString() {
return name + "(" + in + ")";
}
}

View File

@ -64,6 +64,11 @@ public abstract class IntervalFilter extends IntervalIterator {
return in.end();
}
/**
 * Passes through the gaps reported by the wrapped iterator {@code in}.
 */
@Override
public int gaps() {
  final int wrappedGaps = in.gaps();
  return wrappedGaps;
}
@Override
public float matchCost() {
return in.matchCost();

View File

@ -18,6 +18,7 @@
package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.util.PriorityQueue;
@ -66,6 +67,11 @@ abstract class IntervalFunction {
return end;
}
// NOTE(review): always reports zero gaps - presumably because the sub-intervals
// of this iterator are required to be adjacent; confirm against nextInterval().
@Override
public int gaps() {
return 0;
}
@Override
public int nextInterval() throws IOException {
if (subIterators.get(0).nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
@ -109,6 +115,7 @@ abstract class IntervalFunction {
private static class OrderedIntervalIterator extends ConjunctionIntervalIterator {
int start = -1, end = -1, i;
int firstEnd;
private OrderedIntervalIterator(List<IntervalIterator> subIntervals) {
super(subIntervals);
@ -143,6 +150,7 @@ abstract class IntervalFunction {
i++;
}
start = subIterators.get(0).start();
firstEnd = subIterators.get(0).end();
end = subIterators.get(subIterators.size() - 1).end();
b = subIterators.get(subIterators.size() - 1).start();
i = 1;
@ -151,11 +159,20 @@ abstract class IntervalFunction {
}
}
/**
 * Sums the gaps between consecutive sub-intervals of the current interval.
 *
 * The end of the first sub-interval is taken from {@code firstEnd}, which is
 * captured when the interval is recorded, rather than from the first
 * sub-iterator itself (presumably because that iterator may already have been
 * advanced - confirm against nextInterval()).
 *
 * With a single sub-iterator there are no neighbouring sub-intervals, so the
 * result is 0 instead of an IndexOutOfBoundsException.
 *
 * @return the total number of positions lying between adjacent sub-intervals
 */
@Override
public int gaps() {
  int gaps = 0;
  // End of the sub-interval immediately to the left of the one being visited.
  int previousEnd = firstEnd;
  for (int idx = 1; idx < subIterators.size(); idx++) {
    gaps += subIterators.get(idx).start() - previousEnd - 1;
    previousEnd = subIterators.get(idx).end();
  }
  return gaps;
}
@Override
protected void reset() throws IOException {
subIterators.get(0).nextInterval();
i = 1;
start = end = -1;
start = end = firstEnd = -1;
}
}
@ -183,9 +200,10 @@ abstract class IntervalFunction {
private final PriorityQueue<IntervalIterator> queue;
private final IntervalIterator[] subIterators;
private final int[] innerPositions;
private final boolean allowOverlaps;
int start = -1, end = -1, queueEnd;
int start = -1, end = -1, firstEnd, queueEnd;
UnorderedIntervalIterator(List<IntervalIterator> subIterators, boolean allowOverlaps) {
super(subIterators);
@ -196,6 +214,7 @@ abstract class IntervalFunction {
}
};
this.subIterators = new IntervalIterator[subIterators.size()];
this.innerPositions = new int[subIterators.size() * 2];
this.allowOverlaps = allowOverlaps;
for (int i = 0; i < subIterators.size(); i++) {
@ -241,6 +260,7 @@ abstract class IntervalFunction {
// then, minimize it
do {
start = queue.top().start();
firstEnd = queue.top().end();
end = queueEnd;
if (queue.top().end() == end)
return start;
@ -260,6 +280,26 @@ abstract class IntervalFunction {
return start;
}
/**
 * Computes the gaps between the sub-intervals of the current interval.
 *
 * The start/end pair of every sub-interval is written into {@code innerPositions},
 * the array is sorted, and the distance between each end and the following
 * start is summed.
 */
@Override
public int gaps() {
  final int count = subIterators.length;
  for (int slot = 0; slot < count; slot++) {
    // A sub-iterator ending beyond the current interval cannot supply its bounds;
    // use the start/firstEnd recorded from the head of the queue instead.
    final boolean beyondCurrent = subIterators[slot].end() > end;
    innerPositions[2 * slot] = beyondCurrent ? start : subIterators[slot].start();
    innerPositions[2 * slot + 1] = beyondCurrent ? firstEnd : subIterators[slot].end();
  }
  Arrays.sort(innerPositions);
  int total = 0;
  for (int slot = 1; slot < count; slot++) {
    // innerPositions[2 * slot - 1] is the previous end, innerPositions[2 * slot] the next start.
    total += innerPositions[2 * slot] - innerPositions[2 * slot - 1] - 1;
  }
  return total;
}
@Override
protected void reset() throws IOException {
queueEnd = start = end = -1;
@ -324,6 +364,11 @@ abstract class IntervalFunction {
return a.end();
}
/**
 * Reports the gaps of the current interval of {@code a}, the iterator whose
 * bounds this iterator exposes.
 */
@Override
public int gaps() {
  final int gapsInA = a.gaps();
  return gapsInA;
}
@Override
public int nextInterval() throws IOException {
if (bpos == false)
@ -371,6 +416,11 @@ abstract class IntervalFunction {
return a.end();
}
/**
 * Reports the gaps of the current interval of {@code a}, the iterator whose
 * bounds this iterator exposes.
 */
@Override
public int gaps() {
  final int gapsInA = a.gaps();
  return gapsInA;
}
@Override
public int nextInterval() throws IOException {
if (bpos == false)

View File

@ -59,6 +59,17 @@ public abstract class IntervalIterator extends DocIdSetIterator {
*/
public abstract int end();
/**
* The number of gaps within the current interval
*
* Note that this returns the number of gaps between the immediate sub-intervals
* of this interval, and does not include the gaps inside those sub-intervals.
*
* Should not be called before {@link #nextInterval()}, or after it has returned
* {@link #NO_MORE_INTERVALS}
*
* @return the number of gaps between the immediate sub-intervals of the current interval
*/
public abstract int gaps();
/**
* Advance the iterator to the next interval
*

View File

@ -98,6 +98,15 @@ final class IntervalMatches {
return mi.endPosition();
}
/**
 * Reports the gaps of the wrapped match when it can provide them.
 *
 * Only an {@link IntervalMatchesIterator} exposes gaps; any other
 * MatchesIterator is reported as having none.
 */
@Override
public int gaps() {
  assert state == State.ITERATING;
  return (mi instanceof IntervalMatchesIterator)
      ? ((IntervalMatchesIterator) mi).gaps()
      : 0;
}
@Override
public int nextInterval() throws IOException {
assert state == State.ITERATING;

View File

@ -0,0 +1,38 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.intervals;
import org.apache.lucene.search.MatchesIterator;
/**
* An extension of MatchesIterator that allows the gaps from a wrapped
* IntervalIterator to be reported.
*
* This is necessary because {@link MatchesIterator#getSubMatches()} returns
* the submatches of all nested matches as a flat iterator, but
* {@link IntervalIterator#gaps()} only returns the gaps between its immediate
* sub-matches, so we can't calculate the latter using the former.
*/
interface IntervalMatchesIterator extends MatchesIterator {
/**
* The number of top-level gaps inside the current match
*
* @return the gaps between the immediate sub-matches of the current match,
*         not counting gaps nested inside those sub-matches
*/
int gaps();
}

View File

@ -77,11 +77,30 @@ public final class Intervals {
/**
* Create an {@link IntervalsSource} that filters a sub-source by the width of its intervals
* @param width the maximum width of intervals in the sub-source ot return
* @param width the maximum width of intervals in the sub-source to filter
* @param subSource the sub-source to filter
*/
public static IntervalsSource maxwidth(int width, IntervalsSource subSource) {
return new LowpassIntervalsSource(subSource, width);
return new FilteredIntervalsSource("MAXWIDTH/" + width, subSource) {
@Override
protected boolean accept(IntervalIterator it) {
return (it.end() - it.start()) + 1 <= width;
}
};
}
/**
 * Create an {@link IntervalsSource} that only lets through those intervals of a
 * sub-source whose {@link IntervalIterator#gaps()} value does not exceed a limit
 * @param gaps the maximum number of gaps an interval may have to be retained
 * @param subSource the sub-source to filter
 * @return a filtered source named {@code MAXGAPS/<gaps>}
 */
public static IntervalsSource maxgaps(int gaps, IntervalsSource subSource) {
  final String filterName = "MAXGAPS/" + gaps;
  return new FilteredIntervalsSource(filterName, subSource) {
    @Override
    protected boolean accept(IntervalIterator candidate) {
      final int found = candidate.gaps();
      return found <= gaps;
    }
  };
}
/**

View File

@ -61,7 +61,7 @@ class MinimizingConjunctionIntervalsSource extends ConjunctionIntervalsSource {
return new ConjunctionMatchesIterator(it, subs);
}
private static class ConjunctionMatchesIterator implements MatchesIterator {
private static class ConjunctionMatchesIterator implements IntervalMatchesIterator {
final IntervalIterator iterator;
final List<CacheingMatchesIterator> subs;
@ -111,6 +111,11 @@ class MinimizingConjunctionIntervalsSource extends ConjunctionIntervalsSource {
return end;
}
/**
 * Exposes the gaps of the underlying {@code iterator} for the current match.
 */
@Override
public int gaps() {
  final int iteratorGaps = iterator.gaps();
  return iteratorGaps;
}
@Override
public MatchesIterator getSubMatches() throws IOException {
List<MatchesIterator> mis = new ArrayList<>();

View File

@ -95,6 +95,11 @@ class TermIntervalsSource extends IntervalsSource {
return pos;
}
// A term interval has no component sub-intervals, so there are no gaps to report.
@Override
public int gaps() {
return 0;
}
@Override
public int nextInterval() throws IOException {
if (upto <= 0)

View File

@ -44,6 +44,8 @@
* <ul>
* <li>{@link org.apache.lucene.search.intervals.Intervals#maxwidth(int, org.apache.lucene.search.intervals.IntervalsSource)}
* &mdash; Filters out intervals that are larger than a set width</li>
* <li>{@link org.apache.lucene.search.intervals.Intervals#maxgaps(int, org.apache.lucene.search.intervals.IntervalsSource)}
* &mdash; Filters out intervals that have more than a set number of gaps between their constituent sub-intervals</li>
* <li>{@link org.apache.lucene.search.intervals.Intervals#containedBy(org.apache.lucene.search.intervals.IntervalsSource, org.apache.lucene.search.intervals.IntervalsSource)}
* &mdash; Returns intervals that are contained by another interval</li>
* <li>{@link org.apache.lucene.search.intervals.Intervals#notContainedBy(org.apache.lucene.search.intervals.IntervalsSource, org.apache.lucene.search.intervals.IntervalsSource)}

View File

@ -91,6 +91,16 @@ public class TestIntervalQuery extends LuceneTestCase {
new int[]{0, 1, 2, 3, 5});
}
/** ordered(w1, w2) restricted to at most one gap should hit documents 0, 1, 2 and 5. */
public void testOrderedNearQueryGaps1() throws IOException {
  final int[] expectedDocs = {0, 1, 2, 5};
  checkHits(
      new IntervalQuery(field,
          Intervals.maxgaps(1, Intervals.ordered(Intervals.term("w1"), Intervals.term("w2")))),
      expectedDocs);
}
/** ordered(w1, w2) restricted to at most two gaps should hit documents 0, 1, 2, 3 and 5. */
public void testOrderedNearQueryGaps2() throws IOException {
  final int[] expectedDocs = {0, 1, 2, 3, 5};
  checkHits(
      new IntervalQuery(field,
          Intervals.maxgaps(2, Intervals.ordered(Intervals.term("w1"), Intervals.term("w2")))),
      expectedDocs);
}
public void testNestedOrderedNearQuery() throws IOException {
// onear/1(w1, onear/2(w2, w3))
Query q = new IntervalQuery(field,

View File

@ -64,7 +64,7 @@ public class TestIntervals extends LuceneTestCase {
"Down to a sunless sea",
"So thrice five miles of fertile ground",
"Pease hot porridge porridge",
"Pease porridge porridge hot"
"w1 w2 w3 w4 w1 w6 w3 w8 w4 w7 w1 w6"
};
private static Directory directory;
@ -84,7 +84,7 @@ public class TestIntervals extends LuceneTestCase {
for (int i = 0; i < field1_docs.length; i++) {
Document doc = new Document();
doc.add(new Field("field1", field1_docs[i], FIELD_TYPE));
doc.add(new TextField("field2", field2_docs[i], Field.Store.NO));
doc.add(new Field("field2", field2_docs[i], FIELD_TYPE));
doc.add(new StringField("id", Integer.toString(i), Field.Store.NO));
doc.add(new NumericDocValuesField("id", i));
writer.addDocument(doc);
@ -149,6 +149,19 @@ public class TestIntervals extends LuceneTestCase {
assertEquals(endOffset, mi.endOffset());
}
/**
 * Asserts the gaps reported for the leading intervals of a document.
 *
 * Advances the iterator produced by {@code source} to {@code doc} and checks
 * that each of the first {@code expectedGaps.length} intervals reports the
 * expected {@link IntervalIterator#gaps()} value. Intervals beyond those are
 * not examined.
 *
 * @param source the intervals source under test
 * @param doc the top-level (index-wide) document id
 * @param field the field to pull intervals from
 * @param expectedGaps the expected gaps() value of each successive interval
 */
private void assertGaps(IntervalsSource source, int doc, String field, int[] expectedGaps) throws IOException {
  int ord = ReaderUtil.subIndex(doc, searcher.getIndexReader().leaves());
  LeafReaderContext ctx = searcher.getIndexReader().leaves().get(ord);
  IntervalIterator it = source.intervals(field, ctx);
  // The iterator is per-leaf and works on segment-local ids, so rebase the
  // top-level id onto the leaf that was selected above.
  int leafDoc = doc - ctx.docBase;
  assertEquals(leafDoc, it.advance(leafDoc));
  for (int i = 0; i < expectedGaps.length; i++) {
    if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
      fail("Expected " + expectedGaps.length + " intervals but only found " + i + " in " + it);
    }
    assertEquals(expectedGaps[i], it.gaps());
  }
}
public void testIntervalsOnFieldWithNoPositions() throws IOException {
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> {
Intervals.term("wibble").intervals("id", searcher.getIndexReader().leaves().get(0));
@ -241,6 +254,10 @@ public class TestIntervals extends LuceneTestCase {
assertMatch(sub, 17, 17, 96, 99);
assertFalse(sub.next());
assertFalse(mi.next());
assertGaps(source, 1, "field1", new int[]{
1, 0, 10
});
}
public void testIntervalDisjunction() throws IOException {
@ -287,6 +304,7 @@ public class TestIntervals extends LuceneTestCase {
{ 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 17 },
{}
});
assertNull(getMatches(source, 0, "field1"));
MatchesIterator mi = getMatches(source, 1, "field1");
assertMatch(mi, 0, 2, 0, 18);
@ -302,6 +320,11 @@ public class TestIntervals extends LuceneTestCase {
assertMatch(mi, 5, 7, 35, 55);
assertMatch(mi, 6, 17, 41, 99);
assertFalse(mi.next());
assertGaps(source, 1, "field1", new int[]{
0, 0, 0, 0, 0, 0, 9
});
}
public void testNesting2() throws IOException {
@ -447,4 +470,37 @@ public class TestIntervals extends LuceneTestCase {
assertFalse(mi.next());
}
/**
 * unordered(w1, w3, w4) limited to one gap: only document 5 of field2 produces
 * intervals, and its first match spans positions 0-3.
 */
public void testMaxGaps() throws IOException {
  final int[][] expectedIntervals = {
      {}, {}, {}, {}, {},
      { 0, 3, 2, 4, 3, 6 }
  };
  IntervalsSource filtered = Intervals.maxgaps(1,
      Intervals.unordered(Intervals.term("w1"), Intervals.term("w3"), Intervals.term("w4")));

  checkIntervals(filtered, "field2", 1, expectedIntervals);

  MatchesIterator matches = getMatches(filtered, 5, "field2");
  assertMatch(matches, 0, 3, 0, 11);
}
/**
 * maxgaps(1) over unordered(ordered(w1, w3), w4): checks the surviving intervals
 * of document 5 in field2, the gaps each one reports, and the match offsets.
 */
public void testNestedMaxGaps() throws IOException {
  final int[][] expectedIntervals = {
      {}, {}, {}, {}, {},
      { 0, 3, 3, 6, 4, 8 }
  };
  final int[] expectedGaps = { 0, 0, 1 };
  IntervalsSource filtered = Intervals.maxgaps(1,
      Intervals.unordered(
          Intervals.ordered(Intervals.term("w1"), Intervals.term("w3")),
          Intervals.term("w4")
      ));

  checkIntervals(filtered, "field2", 1, expectedIntervals);
  assertGaps(filtered, 5, "field2", expectedGaps);

  MatchesIterator matches = getMatches(filtered, 5, "field2");
  assertMatch(matches, 0, 3, 0, 11);
  assertMatch(matches, 3, 6, 9, 20);
  assertMatch(matches, 4, 8, 12, 26);
}
}