LUCENE-8612: Add Intervals.extend()

This commit is contained in:
Alan Woodward 2019-01-01 17:50:20 +00:00
parent 752989fd74
commit 2532a5d31c
10 changed files with 290 additions and 112 deletions

View File

@ -105,6 +105,10 @@ API Changes
* LUCENE-8609: Remove IndexWriter#numDocs() and IndexWriter#maxDoc() in favor
of IndexWriter#getDocStats(). (Simon Willnauer)
* LUCENE-8612: Intervals.extend() treats an interval as if it covered a wider
span than it actually does, allowing users to force minimum gaps between
intervals in a phrase. (Alan Woodward)
Changes in Runtime Behavior
* LUCENE-8333: Switch MoreLikeThis.setMaxDocFreqPct to use maxDoc instead of

View File

@ -18,7 +18,6 @@
package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.Objects;
/**
* A function that takes two interval iterators and combines them to produce a third,
@ -160,106 +159,6 @@ abstract class DifferenceIntervalFunction {
}
}
/**
* Filters the minuend iterator so that only intervals that do not occur within a set number
* of positions of intervals from the subtrahend iterator are returned
*
* This is implemented by wrapping the subtrahend in an iterator that reports each of its
* intervals widened by {@code positions} on both sides, and then handing the minuend and
* that widened view to {@code NON_OVERLAPPING}.
*/
static class NotWithinFunction extends DifferenceIntervalFunction {
// Number of positions by which each subtrahend interval is widened on either side.
private final int positions;
NotWithinFunction(int positions) {
this.positions = positions;
}
// Two functions are equal when they are of the same class and use the same distance.
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
NotWithinFunction that = (NotWithinFunction) o;
return positions == that.positions;
}
@Override
public String toString() {
return "NOTWITHIN/" + positions;
}
@Override
public int hashCode() {
return Objects.hash(positions);
}
/**
* Builds the widened view of the subtrahend and delegates to NON_OVERLAPPING.
*
* @param minuend the iterator whose intervals are being filtered
* @param subtrahend the iterator whose (widened) intervals exclude minuend intervals
* @return the result of {@code NON_OVERLAPPING.apply(minuend, widenedSubtrahend)}
*/
@Override
public IntervalIterator apply(IntervalIterator minuend, IntervalIterator subtrahend) {
// Anonymous wrapper: document iteration and costs are passed straight through to the
// subtrahend; only start() and end() are adjusted.
IntervalIterator notWithin = new IntervalIterator() {
@Override
public int docID() {
return subtrahend.docID();
}
@Override
public int nextDoc() throws IOException {
// Moving to a new document invalidates the current interval.
positioned = false;
return subtrahend.nextDoc();
}
@Override
public int advance(int target) throws IOException {
// Moving to a new document invalidates the current interval.
positioned = false;
return subtrahend.advance(target);
}
@Override
public long cost() {
return subtrahend.cost();
}
// False until nextInterval() has been called on the current document.
boolean positioned = false;
@Override
public int start() {
if (positioned == false)
return -1;
int start = subtrahend.start();
// Widen backwards, but never report a start before position 0.
// NOTE(review): no special case for an exhausted subtrahend here; if its start() is a
// sentinel value it would be shifted by this subtraction - confirm against callers.
return Math.max(0, start - positions);
}
@Override
public int end() {
if (positioned == false)
return -1;
int end = subtrahend.end();
int newEnd = end + positions;
if (newEnd < 0) // check for overflow
return Integer.MAX_VALUE;
return newEnd;
}
@Override
public int gaps() {
// Not supported by this wrapper.
throw new UnsupportedOperationException();
}
@Override
public int nextInterval() throws IOException {
if (positioned == false) {
positioned = true;
}
// Returns the subtrahend's own return value, not the widened start().
return subtrahend.nextInterval();
}
@Override
public float matchCost() {
return subtrahend.matchCost();
}
};
return NON_OVERLAPPING.apply(minuend, notWithin);
}
}
private static class NotContainingIterator extends RelativeIterator {
private NotContainingIterator(IntervalIterator minuend, IntervalIterator subtrahend) {

View File

@ -0,0 +1,118 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.intervals;
import java.io.IOException;
/**
 * An {@link IntervalIterator} decorator that reports wider intervals than its delegate.
 *
 * Each interval produced by the wrapped iterator has its start moved back by
 * {@code before} positions and its end moved forward by {@code after} positions.
 * This is useful for specifying gaps in an ordered iterator; if you want to match
 * `a b [2 spaces] c`, you can search for phrase(a, extended(b, 0, 2), c)
 *
 * The extended start is clamped so that it is never lower than position 0, and an
 * extended end that overflows or lands on {@code NO_MORE_INTERVALS} is capped at
 * {@code NO_MORE_INTERVALS - 1}. Document iteration, {@link #gaps()} and the cost
 * estimates are passed straight through to the wrapped iterator.
 */
class ExtendedIntervalIterator extends IntervalIterator {

  /** The wrapped iterator that supplies the real intervals. */
  private final IntervalIterator in;

  /** Number of positions subtracted from the delegate's start. */
  private final int before;

  /** Number of positions added to the delegate's end. */
  private final int after;

  /** False until {@link #nextInterval()} has been called on the current document. */
  private boolean positioned;

  /**
   * Create a new ExtendedIntervalIterator
   * @param in      the iterator to wrap
   * @param before  the number of positions to extend before the delegated interval
   * @param after   the number of positions to extend beyond the delegated interval
   */
  ExtendedIntervalIterator(IntervalIterator in, int before, int after) {
    this.in = in;
    this.before = before;
    this.after = after;
  }

  // ---- document iteration: delegate, and forget the current interval ----

  @Override
  public int docID() {
    return in.docID();
  }

  @Override
  public int nextDoc() throws IOException {
    positioned = false;
    return in.nextDoc();
  }

  @Override
  public int advance(int target) throws IOException {
    positioned = false;
    return in.advance(target);
  }

  @Override
  public long cost() {
    return in.cost();
  }

  // ---- interval iteration ----

  /**
   * Advances the delegate and reports the extended start of its new interval.
   */
  @Override
  public int nextInterval() throws IOException {
    positioned = true;
    in.nextInterval();
    return start();
  }

  /**
   * @return -1 before the first call to {@link #nextInterval()} on the current document,
   *         {@code NO_MORE_INTERVALS} when the delegate reports it, otherwise the
   *         delegate's start moved back by {@code before} and clamped at 0
   */
  @Override
  public int start() {
    if (!positioned) {
      return -1;
    }
    final int delegateStart = in.start();
    return delegateStart == NO_MORE_INTERVALS
        ? NO_MORE_INTERVALS
        : Math.max(0, delegateStart - before);
  }

  /**
   * @return -1 before the first call to {@link #nextInterval()} on the current document,
   *         {@code NO_MORE_INTERVALS} when the delegate reports it, otherwise the
   *         delegate's end moved forward by {@code after}
   */
  @Override
  public int end() {
    if (!positioned) {
      return -1;
    }
    final int delegateEnd = in.end();
    if (delegateEnd == NO_MORE_INTERVALS) {
      return NO_MORE_INTERVALS;
    }
    final int extendedEnd = delegateEnd + after;
    // A negative sum means the addition overflowed; a sum equal to the sentinel would be
    // mistaken for exhaustion. Either way, report the largest legal end instead.
    final boolean overflowed = extendedEnd < 0 || extendedEnd == NO_MORE_INTERVALS;
    return overflowed ? NO_MORE_INTERVALS - 1 : extendedEnd;
  }

  /**
   * Delegates directly to the wrapped iterator; the extensions are not counted.
   */
  @Override
  public int gaps() {
    return in.gaps();
  }

  @Override
  public float matchCost() {
    return in.matchCost();
  }

}

View File

@ -0,0 +1,83 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.Objects;
import java.util.Set;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.MatchesIterator;
/**
 * An {@link IntervalsSource} that wraps another source and hands out
 * {@link ExtendedIntervalIterator}s over whatever that source produces, using a fixed
 * {@code before} / {@code after} extension.
 */
class ExtendedIntervalsSource extends IntervalsSource {

  /** The source being extended. */
  final IntervalsSource source;

  /** Positions to extend before each delegated interval. */
  private final int before;

  /** Positions to extend after each delegated interval. */
  private final int after;

  /**
   * @param source the source to wrap
   * @param before how many positions to extend before each interval
   * @param after  how many positions to extend after each interval
   */
  ExtendedIntervalsSource(IntervalsSource source, int before, int after) {
    this.source = source;
    this.before = before;
    this.after = after;
  }

  /**
   * @return an extended view of the wrapped source's iterator, or {@code null} when the
   *         wrapped source returns {@code null}
   */
  @Override
  public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException {
    final IntervalIterator delegate = source.intervals(field, ctx);
    return delegate == null ? null : new ExtendedIntervalIterator(delegate, before, after);
  }

  /**
   * @return matches whose positions come from an extended iterator built over the wrapped
   *         source's matches, or {@code null} when the wrapped source returns {@code null}
   */
  @Override
  public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
    final MatchesIterator delegate = source.matches(field, ctx, doc);
    if (delegate == null) {
      return null;
    }
    // View the matches as intervals, extend them, then convert back to a MatchesIterator.
    final IntervalIterator extended =
        new ExtendedIntervalIterator(IntervalMatches.wrapMatches(delegate, doc), before, after);
    return IntervalMatches.asMatches(extended, delegate, doc);
  }

  @Override
  public void extractTerms(String field, Set<Term> terms) {
    source.extractTerms(field, terms);
  }

  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (o == null || o.getClass() != getClass()) {
      return false;
    }
    final ExtendedIntervalsSource other = (ExtendedIntervalsSource) o;
    return Objects.equals(source, other.source)
        && before == other.before
        && after == other.after;
  }

  @Override
  public int hashCode() {
    return Objects.hash(source, before, after);
  }

  @Override
  public String toString() {
    return new StringBuilder("EXTEND(")
        .append(source)
        .append(",")
        .append(before)
        .append(",")
        .append(after)
        .append(")")
        .toString();
  }

}

View File

@ -75,19 +75,19 @@ abstract class IntervalFunction {
@Override
public int nextInterval() throws IOException {
if (subIterators.get(0).nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
return IntervalIterator.NO_MORE_INTERVALS;
return start = end = IntervalIterator.NO_MORE_INTERVALS;
int i = 1;
while (i < subIterators.size()) {
while (subIterators.get(i).start() <= subIterators.get(i - 1).end()) {
if (subIterators.get(i).nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
return IntervalIterator.NO_MORE_INTERVALS;
return start = end = IntervalIterator.NO_MORE_INTERVALS;
}
if (subIterators.get(i).start() == subIterators.get(i - 1).end() + 1) {
i = i + 1;
}
else {
if (subIterators.get(0).nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
return IntervalIterator.NO_MORE_INTERVALS;
return start = end = IntervalIterator.NO_MORE_INTERVALS;
i = 1;
}
}
@ -150,6 +150,9 @@ abstract class IntervalFunction {
i++;
}
start = subIterators.get(0).start();
if (start == NO_MORE_INTERVALS) {
return end = NO_MORE_INTERVALS;
}
firstEnd = subIterators.get(0).end();
end = subIterators.get(subIterators.size() - 1).end();
b = subIterators.get(subIterators.size() - 1).start();
@ -248,7 +251,7 @@ abstract class IntervalFunction {
if (allowOverlaps == false) {
while (hasOverlaps(it)) {
if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
return IntervalIterator.NO_MORE_INTERVALS;
return start = end = IntervalIterator.NO_MORE_INTERVALS;
}
}
queue.add(it);
@ -256,7 +259,7 @@ abstract class IntervalFunction {
}
}
if (this.queue.size() < subIterators.length)
return IntervalIterator.NO_MORE_INTERVALS;
return start = end = IntervalIterator.NO_MORE_INTERVALS;
// then, minimize it
do {
start = queue.top().start();
@ -408,11 +411,17 @@ abstract class IntervalFunction {
@Override
public int start() {
if (bpos == false) {
return NO_MORE_INTERVALS;
}
return a.start();
}
@Override
public int end() {
if (bpos == false) {
return NO_MORE_INTERVALS;
}
return a.end();
}
@ -427,12 +436,15 @@ abstract class IntervalFunction {
return IntervalIterator.NO_MORE_INTERVALS;
while (a.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) {
while (b.end() < a.end()) {
if (b.nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
if (b.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
bpos = false;
return IntervalIterator.NO_MORE_INTERVALS;
}
}
if (b.start() <= a.start())
return a.start();
}
bpos = false;
return IntervalIterator.NO_MORE_INTERVALS;
}

View File

@ -48,14 +48,16 @@ public abstract class IntervalIterator extends DocIdSetIterator {
/**
* The start of the current interval
*
* Returns -1 if {@link #nextInterval()} has not yet been called
* Returns -1 if {@link #nextInterval()} has not yet been called and {@link #NO_MORE_INTERVALS}
* once the iterator is exhausted.
*/
public abstract int start();
/**
* The end of the current interval
*
* Returns -1 if {@link #nextInterval()} has not yet been called
* Returns -1 if {@link #nextInterval()} has not yet been called and {@link #NO_MORE_INTERVALS}
* once the iterator is exhausted.
*/
public abstract int end();

View File

@ -49,12 +49,12 @@ final class IntervalMatches {
@Override
public int startPosition() {
return source.startPosition();
return iterator.start();
}
@Override
public int endPosition() {
return source.endPosition();
return iterator.end();
}
@Override

View File

@ -103,6 +103,27 @@ public final class Intervals {
};
}
/**
* Create an {@link IntervalsSource} that wraps another source, extending its
* intervals by a number of positions before and after.
*
* This can be useful for adding defined gaps in a block query; for example,
* to find 'a b [2 arbitrary terms] c', you can call:
* <pre>
* Intervals.phrase(Intervals.term("a"), Intervals.extend(Intervals.term("b"), 0, 2), Intervals.term("c"));
* </pre>
*
* The extended start of an interval is never reported as lower than position 0, and
* an extended end that would overflow is capped just below
* {@link IntervalIterator#NO_MORE_INTERVALS}.
*
* Note that calling {@link IntervalIterator#gaps()} on iterators returned by this source
* delegates directly to the wrapped iterator, and does not include the extensions.
*
* @param source the source to extend
* @param before how many positions to extend before the delegated interval
* @param after how many positions to extend after the delegated interval
* @return a new source whose intervals are those of {@code source}, widened by
*         {@code before} and {@code after} positions
*/
public static IntervalsSource extend(IntervalsSource source, int before, int after) {
return new ExtendedIntervalsSource(source, before, after);
}
/**
* Create an ordered {@link IntervalsSource}
*
@ -162,7 +183,8 @@ public final class Intervals {
* @param subtrahend the {@link IntervalsSource} to filter by
*/
public static IntervalsSource notWithin(IntervalsSource minuend, int positions, IntervalsSource subtrahend) {
return new DifferenceIntervalsSource(minuend, subtrahend, new DifferenceIntervalFunction.NotWithinFunction(positions));
return new DifferenceIntervalsSource(minuend, Intervals.extend(subtrahend, positions, positions),
DifferenceIntervalFunction.NON_OVERLAPPING);
}
/**

View File

@ -188,4 +188,10 @@ public class TestIntervalQuery extends LuceneTestCase {
);
checkHits(q, new int[]{3});
}
/**
* Checks that Intervals.extend() can be used inside a phrase to require a gap.
*
* The "w2" interval is extended by one position before and none after, and is then
* combined in a phrase with "w1"; the query is expected to hit documents 1, 2 and 5
* of the test index (index contents are defined elsewhere in this test class).
*/
public void testDefinedGaps() throws IOException {
Query q = new IntervalQuery(field,
Intervals.phrase(Intervals.term("w1"), Intervals.extend(Intervals.term("w2"), 1, 0)));
checkHits(q, new int[]{ 1, 2, 5 });
}
}

View File

@ -124,6 +124,8 @@ public class TestIntervals extends LuceneTestCase {
i += 2;
}
assertEquals("Wrong number of endpoints in doc " + id, expected[id].length, i);
assertEquals(IntervalIterator.NO_MORE_INTERVALS, intervals.start());
assertEquals(IntervalIterator.NO_MORE_INTERVALS, intervals.end());
if (i > 0)
matchedDocs++;
}
@ -504,4 +506,34 @@ public class TestIntervals extends LuceneTestCase {
assertMatch(mi, 4, 8, 12, 26);
}
/**
* Exercises Intervals.extend() both inside a phrase and on its own.
*
* Expected positions and offsets refer to the documents indexed by this test class's
* fixture, which is defined outside this method.
*/
public void testDefinedGaps() throws IOException {
// Phrase with "cold" widened by one position on each side, placed between "pease"
// and "porridge".
IntervalsSource source = Intervals.phrase(
Intervals.term("pease"),
Intervals.extend(Intervals.term("cold"), 1, 1),
Intervals.term("porridge")
);
// Per-document expected [start, end] pairs; an empty array means no intervals.
checkIntervals(source, "field1", 3, new int[][]{
{},
{ 3, 7 },
{ 0, 4 },
{},
{ 3, 7 },
{}
});
// The top-level match for doc 1 spans the whole phrase...
MatchesIterator mi = getMatches(source, 1, "field1");
assertMatch(mi, 3, 7, 20, 55);
// ...and the middle sub-match reports the extended positions 4-6.
MatchesIterator sub = mi.getSubMatches();
assertNotNull(sub);
assertMatch(sub, 3, 3, 20, 25);
assertMatch(sub, 4, 6, 35, 39);
assertMatch(sub, 7, 7, 47, 55);
// Boundary behaviour: the expected starts show clamping at 0 and the expected ends show
// the Integer.MAX_VALUE extension being capped at Integer.MAX_VALUE - 1.
source = Intervals.extend(Intervals.term("w1"), 5, Integer.MAX_VALUE);
checkIntervals(source, "field2", 1, new int[][]{
{}, {}, {}, {}, {},
{ 0, Integer.MAX_VALUE - 1, 0, Integer.MAX_VALUE - 1, 5, Integer.MAX_VALUE - 1 }
});
}
}