mirror of https://github.com/apache/lucene.git
LUCENE-8612: Add Intervals.extend()
This commit is contained in:
parent
752989fd74
commit
2532a5d31c
|
@ -105,6 +105,10 @@ API Changes
|
|||
* LUCENE-8609: Remove IndexWriter#numDocs() and IndexWriter#maxDoc() in favor
|
||||
of IndexWriter#getDocStats(). (Simon Willnauer)
|
||||
|
||||
* LUCENE-8612: Intervals.extend() treats an interval as if it covered a wider
|
||||
span than it actually does, allowing users to force minimum gaps between
|
||||
intervals in a phrase. (Alan Woodward)
|
||||
|
||||
Changes in Runtime Behavior
|
||||
|
||||
* LUCENE-8333: Switch MoreLikeThis.setMaxDocFreqPct to use maxDoc instead of
|
||||
|
|
|
@ -18,7 +18,6 @@
|
|||
package org.apache.lucene.search.intervals;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* A function that takes two interval iterators and combines them to produce a third,
|
||||
|
@ -160,106 +159,6 @@ abstract class DifferenceIntervalFunction {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Filters the minuend iterator so that only intervals that do not occur within a set number
|
||||
* of positions of intervals from the subtrahend iterator are returned.
*
* Implemented by wrapping the subtrahend in an iterator whose intervals are widened by
* {@code positions} on each side, and handing that wrapper to {@code NON_OVERLAPPING}.
|
||||
*/
|
||||
static class NotWithinFunction extends DifferenceIntervalFunction {
|
||||
|
||||
// number of positions by which every subtrahend interval is widened, on both sides
private final int positions;
|
||||
|
||||
NotWithinFunction(int positions) {
|
||||
this.positions = positions;
|
||||
}
|
||||
|
||||
// two functions are equal when they are the same class and widen by the same amount
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
NotWithinFunction that = (NotWithinFunction) o;
|
||||
return positions == that.positions;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "NOTWITHIN/" + positions;
|
||||
}
|
||||
|
||||
// consistent with equals(): derived from positions only
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(positions);
|
||||
}
|
||||
|
||||
/**
* Returns the result of applying {@code NON_OVERLAPPING} to the minuend and a
* widened view of the subtrahend.
*
* @param minuend the iterator to filter
* @param subtrahend the iterator whose intervals are widened by {@code positions}
*/
@Override
|
||||
public IntervalIterator apply(IntervalIterator minuend, IntervalIterator subtrahend) {
|
||||
// anonymous wrapper: document-level calls delegate to the subtrahend,
// start()/end() report the widened bounds
IntervalIterator notWithin = new IntervalIterator() {
|
||||
|
||||
@Override
|
||||
public int docID() {
|
||||
return subtrahend.docID();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
// moving to another document: start()/end() return -1 again until nextInterval() is called
positioned = false;
|
||||
return subtrahend.nextDoc();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(int target) throws IOException {
|
||||
// same reset as nextDoc()
positioned = false;
|
||||
return subtrahend.advance(target);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long cost() {
|
||||
return subtrahend.cost();
|
||||
}
|
||||
|
||||
// false until nextInterval() has been called on the current document
boolean positioned = false;
|
||||
|
||||
@Override
|
||||
public int start() {
|
||||
if (positioned == false)
|
||||
return -1;
|
||||
int start = subtrahend.start();
|
||||
// widen backwards, but never report a start below position 0
return Math.max(0, start - positions);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int end() {
|
||||
if (positioned == false)
|
||||
return -1;
|
||||
int end = subtrahend.end();
|
||||
int newEnd = end + positions;
|
||||
if (newEnd < 0) // check for overflow
|
||||
return Integer.MAX_VALUE;
|
||||
return newEnd;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int gaps() {
|
||||
// not supported by this wrapper
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextInterval() throws IOException {
|
||||
// the first call on a document marks the wrapper as positioned
if (positioned == false) {
|
||||
positioned = true;
|
||||
}
|
||||
// note: returns the subtrahend's own value, not the widened start
return subtrahend.nextInterval();
|
||||
}
|
||||
|
||||
@Override
|
||||
public float matchCost() {
|
||||
return subtrahend.matchCost();
|
||||
}
|
||||
|
||||
};
|
||||
return NON_OVERLAPPING.apply(minuend, notWithin);
|
||||
}
|
||||
}
|
||||
|
||||
private static class NotContainingIterator extends RelativeIterator {
|
||||
|
||||
private NotContainingIterator(IntervalIterator minuend, IntervalIterator subtrahend) {
|
||||
|
|
|
@ -0,0 +1,118 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.search.intervals;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Wraps an IntervalIterator and extends the bounds of its intervals
|
||||
*
|
||||
* Useful for specifying gaps in an ordered iterator; if you want to match
|
||||
* `a b [2 spaces] c`, you can search for phrase(a, extend(b, 0, 2), c)
|
||||
*
|
||||
* The extended start is clamped at 0: an interval that starts at a position
|
||||
* lower than {@code before} is still returned, with its start reported as 0.
* Likewise, an extended end that overflows is clamped to NO_MORE_INTERVALS - 1.
|
||||
*/
|
||||
class ExtendedIntervalIterator extends IntervalIterator {
|
||||
|
||||
// the wrapped iterator; all document-level calls delegate to it
private final IntervalIterator in;
|
||||
// number of positions subtracted from the wrapped start
private final int before;
|
||||
// number of positions added to the wrapped end
private final int after;
|
||||
|
||||
// false until nextInterval() has been called on the current document
private boolean positioned;
|
||||
|
||||
/**
|
||||
* Create a new ExtendedIntervalIterator
|
||||
* @param in the iterator to wrap
|
||||
* @param before the number of positions to extend before the delegated interval
|
||||
* @param after the number of positions to extend beyond the delegated interval
|
||||
*/
|
||||
ExtendedIntervalIterator(IntervalIterator in, int before, int after) {
|
||||
this.in = in;
|
||||
this.before = before;
|
||||
this.after = after;
|
||||
}
|
||||
|
||||
/**
* Returns -1 before the first call to nextInterval() on a document,
* NO_MORE_INTERVALS once the wrapped iterator reports it, and otherwise
* the wrapped start moved back by {@code before}, never below 0.
*/
@Override
|
||||
public int start() {
|
||||
if (positioned == false) {
|
||||
return -1;
|
||||
}
|
||||
int start = in.start();
|
||||
// pass the exhaustion sentinel through unchanged rather than shifting it
if (start == NO_MORE_INTERVALS) {
|
||||
return NO_MORE_INTERVALS;
|
||||
}
|
||||
return Math.max(0, start - before);
|
||||
}
|
||||
|
||||
/**
* Returns -1 before the first call to nextInterval() on a document,
* NO_MORE_INTERVALS once the wrapped iterator reports it, and otherwise
* the wrapped end moved forward by {@code after}, capped at NO_MORE_INTERVALS - 1.
*/
@Override
|
||||
public int end() {
|
||||
if (positioned == false) {
|
||||
return -1;
|
||||
}
|
||||
int end = in.end();
|
||||
if (end == NO_MORE_INTERVALS) {
|
||||
return NO_MORE_INTERVALS;
|
||||
}
|
||||
end += after;
|
||||
if (end < 0 || end == NO_MORE_INTERVALS) {
|
||||
// overflow, or a sum that collides with the NO_MORE_INTERVALS sentinel: clamp just below it
|
||||
end = NO_MORE_INTERVALS - 1;
|
||||
}
|
||||
return end;
|
||||
}
|
||||
|
||||
// gaps are those of the wrapped interval; the extensions are not counted
@Override
|
||||
public int gaps() {
|
||||
return in.gaps();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextInterval() throws IOException {
|
||||
positioned = true;
|
||||
in.nextInterval();
|
||||
// report the extended start (or NO_MORE_INTERVALS), not the wrapped iterator's raw value
return start();
|
||||
}
|
||||
|
||||
@Override
|
||||
public float matchCost() {
|
||||
return in.matchCost();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int docID() {
|
||||
return in.docID();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
// new document: start()/end() return -1 again until nextInterval() is called
positioned = false;
|
||||
return in.nextDoc();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(int target) throws IOException {
|
||||
// same reset as nextDoc()
positioned = false;
|
||||
return in.advance(target);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long cost() {
|
||||
return in.cost();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,83 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.search.intervals;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.MatchesIterator;
|
||||
|
||||
/**
* An IntervalsSource that wraps another source and returns its intervals through an
* {@link ExtendedIntervalIterator}, widening each interval by {@code before} positions
* at the start and {@code after} positions at the end.
*/
class ExtendedIntervalsSource extends IntervalsSource {
|
||||
|
||||
// the wrapped source (package-private, unlike the two extension fields)
final IntervalsSource source;
|
||||
// positions to extend before each wrapped interval
private final int before;
|
||||
// positions to extend after each wrapped interval
private final int after;
|
||||
|
||||
/**
* @param source the source to wrap
* @param before the number of positions to extend before each interval
* @param after the number of positions to extend after each interval
*/
ExtendedIntervalsSource(IntervalsSource source, int before, int after) {
|
||||
this.source = source;
|
||||
this.before = before;
|
||||
this.after = after;
|
||||
}
|
||||
|
||||
/**
* Returns the wrapped source's iterator wrapped in an ExtendedIntervalIterator,
* or null if the wrapped source returns null.
*/
@Override
|
||||
public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException {
|
||||
IntervalIterator in = source.intervals(field, ctx);
|
||||
if (in == null) {
|
||||
return null;
|
||||
}
|
||||
return new ExtendedIntervalIterator(in, before, after);
|
||||
}
|
||||
|
||||
/**
* Returns matches for the given document with extended bounds,
* or null if the wrapped source returns null.
*/
@Override
|
||||
public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
|
||||
MatchesIterator in = source.matches(field, ctx, doc);
|
||||
if (in == null) {
|
||||
return null;
|
||||
}
|
||||
// view the matches as intervals, extend them, then convert back to a MatchesIterator;
// presumably asMatches takes positions from `wrapped` and the rest from `in` - confirm in IntervalMatches
IntervalIterator wrapped = new ExtendedIntervalIterator(IntervalMatches.wrapMatches(in, doc), before, after);
|
||||
return IntervalMatches.asMatches(wrapped, in, doc);
|
||||
}
|
||||
|
||||
// extending bounds adds no terms; delegate to the wrapped source
@Override
|
||||
public void extractTerms(String field, Set<Term> terms) {
|
||||
source.extractTerms(field, terms);
|
||||
}
|
||||
|
||||
// equal when same class, same extensions and equal wrapped sources
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
ExtendedIntervalsSource that = (ExtendedIntervalsSource) o;
|
||||
return before == that.before &&
|
||||
after == that.after &&
|
||||
Objects.equals(source, that.source);
|
||||
}
|
||||
|
||||
// consistent with equals(): uses the same three fields
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(source, before, after);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "EXTEND(" + source + "," + before + "," + after + ")";
|
||||
}
|
||||
}
|
|
@ -75,19 +75,19 @@ abstract class IntervalFunction {
|
|||
@Override
|
||||
public int nextInterval() throws IOException {
|
||||
if (subIterators.get(0).nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
|
||||
return IntervalIterator.NO_MORE_INTERVALS;
|
||||
return start = end = IntervalIterator.NO_MORE_INTERVALS;
|
||||
int i = 1;
|
||||
while (i < subIterators.size()) {
|
||||
while (subIterators.get(i).start() <= subIterators.get(i - 1).end()) {
|
||||
if (subIterators.get(i).nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
|
||||
return IntervalIterator.NO_MORE_INTERVALS;
|
||||
return start = end = IntervalIterator.NO_MORE_INTERVALS;
|
||||
}
|
||||
if (subIterators.get(i).start() == subIterators.get(i - 1).end() + 1) {
|
||||
i = i + 1;
|
||||
}
|
||||
else {
|
||||
if (subIterators.get(0).nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
|
||||
return IntervalIterator.NO_MORE_INTERVALS;
|
||||
return start = end = IntervalIterator.NO_MORE_INTERVALS;
|
||||
i = 1;
|
||||
}
|
||||
}
|
||||
|
@ -150,6 +150,9 @@ abstract class IntervalFunction {
|
|||
i++;
|
||||
}
|
||||
start = subIterators.get(0).start();
|
||||
if (start == NO_MORE_INTERVALS) {
|
||||
return end = NO_MORE_INTERVALS;
|
||||
}
|
||||
firstEnd = subIterators.get(0).end();
|
||||
end = subIterators.get(subIterators.size() - 1).end();
|
||||
b = subIterators.get(subIterators.size() - 1).start();
|
||||
|
@ -248,7 +251,7 @@ abstract class IntervalFunction {
|
|||
if (allowOverlaps == false) {
|
||||
while (hasOverlaps(it)) {
|
||||
if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
|
||||
return IntervalIterator.NO_MORE_INTERVALS;
|
||||
return start = end = IntervalIterator.NO_MORE_INTERVALS;
|
||||
}
|
||||
}
|
||||
queue.add(it);
|
||||
|
@ -256,7 +259,7 @@ abstract class IntervalFunction {
|
|||
}
|
||||
}
|
||||
if (this.queue.size() < subIterators.length)
|
||||
return IntervalIterator.NO_MORE_INTERVALS;
|
||||
return start = end = IntervalIterator.NO_MORE_INTERVALS;
|
||||
// then, minimize it
|
||||
do {
|
||||
start = queue.top().start();
|
||||
|
@ -408,11 +411,17 @@ abstract class IntervalFunction {
|
|||
|
||||
@Override
|
||||
public int start() {
|
||||
if (bpos == false) {
|
||||
return NO_MORE_INTERVALS;
|
||||
}
|
||||
return a.start();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int end() {
|
||||
if (bpos == false) {
|
||||
return NO_MORE_INTERVALS;
|
||||
}
|
||||
return a.end();
|
||||
}
|
||||
|
||||
|
@ -427,12 +436,15 @@ abstract class IntervalFunction {
|
|||
return IntervalIterator.NO_MORE_INTERVALS;
|
||||
while (a.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) {
|
||||
while (b.end() < a.end()) {
|
||||
if (b.nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
|
||||
if (b.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
|
||||
bpos = false;
|
||||
return IntervalIterator.NO_MORE_INTERVALS;
|
||||
}
|
||||
}
|
||||
if (b.start() <= a.start())
|
||||
return a.start();
|
||||
}
|
||||
bpos = false;
|
||||
return IntervalIterator.NO_MORE_INTERVALS;
|
||||
}
|
||||
|
||||
|
|
|
@ -48,14 +48,16 @@ public abstract class IntervalIterator extends DocIdSetIterator {
|
|||
/**
|
||||
* The start of the current interval
|
||||
*
|
||||
* Returns -1 if {@link #nextInterval()} has not yet been called
|
||||
* Returns -1 if {@link #nextInterval()} has not yet been called and {@link #NO_MORE_INTERVALS}
|
||||
* once the iterator is exhausted.
|
||||
*/
|
||||
public abstract int start();
|
||||
|
||||
/**
|
||||
* The end of the current interval
|
||||
*
|
||||
* Returns -1 if {@link #nextInterval()} has not yet been called
|
||||
* Returns -1 if {@link #nextInterval()} has not yet been called and {@link #NO_MORE_INTERVALS}
|
||||
* once the iterator is exhausted.
|
||||
*/
|
||||
public abstract int end();
|
||||
|
||||
|
|
|
@ -49,12 +49,12 @@ final class IntervalMatches {
|
|||
|
||||
@Override
|
||||
public int startPosition() {
|
||||
return source.startPosition();
|
||||
return iterator.start();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endPosition() {
|
||||
return source.endPosition();
|
||||
return iterator.end();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -103,6 +103,27 @@ public final class Intervals {
|
|||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an {@link IntervalsSource} that wraps another source, extending its
|
||||
* intervals by a number of positions before and after.
|
||||
*
|
||||
* This can be useful for adding defined gaps in a block query; for example,
|
||||
* to find 'a b [2 arbitrary terms] c', you can call:
|
||||
* <pre>
|
||||
* Intervals.phrase(Intervals.term("a"), Intervals.extend(Intervals.term("b"), 0, 2), Intervals.term("c"));
|
||||
* </pre>
|
||||
*
|
||||
* Note that calling {@link IntervalIterator#gaps()} on iterators returned by this source
|
||||
* delegates directly to the wrapped iterator, and does not include the extensions.
|
||||
*
|
||||
* Extended starts are never reported below position 0, and extended ends that
* overflow are capped just below {@link IntervalIterator#NO_MORE_INTERVALS}.
*
* @param source the source to extend
|
||||
* @param before how many positions to extend before the delegated interval
|
||||
* @param after how many positions to extend after the delegated interval
|
||||
* @return a source wrapping {@code source} whose intervals carry the extended bounds
*/
|
||||
public static IntervalsSource extend(IntervalsSource source, int before, int after) {
|
||||
return new ExtendedIntervalsSource(source, before, after);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an ordered {@link IntervalsSource}
|
||||
*
|
||||
|
@ -162,7 +183,8 @@ public final class Intervals {
|
|||
* @param subtrahend the {@link IntervalsSource} to filter by
|
||||
*/
|
||||
public static IntervalsSource notWithin(IntervalsSource minuend, int positions, IntervalsSource subtrahend) {
|
||||
return new DifferenceIntervalsSource(minuend, subtrahend, new DifferenceIntervalFunction.NotWithinFunction(positions));
|
||||
return new DifferenceIntervalsSource(minuend, Intervals.extend(subtrahend, positions, positions),
|
||||
DifferenceIntervalFunction.NON_OVERLAPPING);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -188,4 +188,10 @@ public class TestIntervalQuery extends LuceneTestCase {
|
|||
);
|
||||
checkHits(q, new int[]{3});
|
||||
}
|
||||
|
||||
// Checks Intervals.extend() inside a phrase: "w2" is extended by one position
// before and none after, then required to follow "w1".
// Presumably this matches "w1", one arbitrary position, then "w2" - confirm against Intervals.phrase.
public void testDefinedGaps() throws IOException {
|
||||
Query q = new IntervalQuery(field,
|
||||
Intervals.phrase(Intervals.term("w1"), Intervals.extend(Intervals.term("w2"), 1, 0)));
|
||||
// expected hits for the query (presumably document ids - see checkHits)
checkHits(q, new int[]{ 1, 2, 5 });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -124,6 +124,8 @@ public class TestIntervals extends LuceneTestCase {
|
|||
i += 2;
|
||||
}
|
||||
assertEquals("Wrong number of endpoints in doc " + id, expected[id].length, i);
|
||||
assertEquals(IntervalIterator.NO_MORE_INTERVALS, intervals.start());
|
||||
assertEquals(IntervalIterator.NO_MORE_INTERVALS, intervals.end());
|
||||
if (i > 0)
|
||||
matchedDocs++;
|
||||
}
|
||||
|
@ -504,4 +506,34 @@ public class TestIntervals extends LuceneTestCase {
|
|||
assertMatch(mi, 4, 8, 12, 26);
|
||||
}
|
||||
|
||||
// Exercises Intervals.extend() in two ways: as the middle element of a phrase,
// and on its own with extensions large enough to hit both clamping paths.
public void testDefinedGaps() throws IOException {
|
||||
// "cold" extended by one position on each side, between "pease" and "porridge"
IntervalsSource source = Intervals.phrase(
|
||||
Intervals.term("pease"),
|
||||
Intervals.extend(Intervals.term("cold"), 1, 1),
|
||||
Intervals.term("porridge")
|
||||
);
|
||||
// one row of expected interval endpoints per document; {} means no intervals
checkIntervals(source, "field1", 3, new int[][]{
|
||||
{},
|
||||
{ 3, 7 },
|
||||
{ 0, 4 },
|
||||
{},
|
||||
{ 3, 7 },
|
||||
{}
|
||||
});
|
||||
|
||||
MatchesIterator mi = getMatches(source, 1, "field1");
|
||||
assertMatch(mi, 3, 7, 20, 55);
|
||||
MatchesIterator sub = mi.getSubMatches();
|
||||
assertNotNull(sub);
|
||||
assertMatch(sub, 3, 3, 20, 25);
|
||||
// the middle sub-match is expected as (4, 6): presumably the extended positions of "cold" - confirm assertMatch argument order
assertMatch(sub, 4, 6, 35, 39);
|
||||
assertMatch(sub, 7, 7, 47, 55);
|
||||
|
||||
// extend far enough that the start is clamped to 0 and the end overflows
source = Intervals.extend(Intervals.term("w1"), 5, Integer.MAX_VALUE);
|
||||
checkIntervals(source, "field2", 1, new int[][]{
|
||||
{}, {}, {}, {}, {},
|
||||
// starts are 0, 0 and 5; every end is capped at Integer.MAX_VALUE - 1
{ 0, Integer.MAX_VALUE - 1, 0, Integer.MAX_VALUE - 1, 5, Integer.MAX_VALUE - 1 }
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue