mirror of https://github.com/apache/lucene.git
LUCENE-8629: New interval functions: before, after, within, overlapping
This commit is contained in:
parent
906f94a8cc
commit
7d34bfdaad
|
@ -108,10 +108,6 @@ API Changes
|
|||
* LUCENE-8609: Remove IndexWriter#numDocs() and IndexWriter#maxDoc() in favor
|
||||
of IndexWriter#getDocStats(). (Simon Willnauer)
|
||||
|
||||
* LUCENE-8612: Intervals.extend() treats an interval as if it covered a wider
|
||||
span than it actually does, allowing users to force minimum gaps between
|
||||
intervals in a phrase. (Alan Woodward)
|
||||
|
||||
Changes in Runtime Behavior
|
||||
|
||||
* LUCENE-8333: Switch MoreLikeThis.setMaxDocFreqPct to use maxDoc instead of
|
||||
|
@ -177,6 +173,13 @@ New Features
|
|||
to read-ahead in the token stream and take into account graph structures. This
|
||||
also changes FixedShingleFilter to extend GraphTokenFilter (Alan Woodward)
|
||||
|
||||
* LUCENE-8612: Intervals.extend() treats an interval as if it covered a wider
|
||||
span than it actually does, allowing users to force minimum gaps between
|
||||
intervals in a phrase. (Alan Woodward)
|
||||
|
||||
* LUCENE-8629: New interval functions: Intervals.before(), Intervals.after(),
|
||||
Intervals.within() and Intervals.overlapping(). (Alan Woodward)
|
||||
|
||||
Improvements
|
||||
|
||||
* LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.
|
||||
|
|
|
@ -353,25 +353,7 @@ abstract class IntervalFunction {
|
|||
throw new IllegalStateException("CONTAINING function requires two iterators");
|
||||
IntervalIterator a = iterators.get(0);
|
||||
IntervalIterator b = iterators.get(1);
|
||||
return new ConjunctionIntervalIterator(iterators) {
|
||||
|
||||
boolean bpos;
|
||||
|
||||
@Override
|
||||
public int start() {
|
||||
return a.start();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int end() {
|
||||
return a.end();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int gaps() {
|
||||
return a.gaps();
|
||||
}
|
||||
|
||||
return new FilteringIntervalIterator(a, b) {
|
||||
@Override
|
||||
public int nextInterval() throws IOException {
|
||||
if (bpos == false)
|
||||
|
@ -386,11 +368,6 @@ abstract class IntervalFunction {
|
|||
}
|
||||
return IntervalIterator.NO_MORE_INTERVALS;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void reset() throws IOException {
|
||||
bpos = b.nextInterval() != NO_MORE_INTERVALS;
|
||||
}
|
||||
};
|
||||
}
|
||||
};
|
||||
|
@ -405,31 +382,7 @@ abstract class IntervalFunction {
|
|||
throw new IllegalStateException("CONTAINED_BY function requires two iterators");
|
||||
IntervalIterator a = iterators.get(0);
|
||||
IntervalIterator b = iterators.get(1);
|
||||
return new ConjunctionIntervalIterator(iterators) {
|
||||
|
||||
boolean bpos;
|
||||
|
||||
@Override
|
||||
public int start() {
|
||||
if (bpos == false) {
|
||||
return NO_MORE_INTERVALS;
|
||||
}
|
||||
return a.start();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int end() {
|
||||
if (bpos == false) {
|
||||
return NO_MORE_INTERVALS;
|
||||
}
|
||||
return a.end();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int gaps() {
|
||||
return a.gaps();
|
||||
}
|
||||
|
||||
return new FilteringIntervalIterator(a, b) {
|
||||
@Override
|
||||
public int nextInterval() throws IOException {
|
||||
if (bpos == false)
|
||||
|
@ -447,15 +400,79 @@ abstract class IntervalFunction {
|
|||
bpos = false;
|
||||
return IntervalIterator.NO_MORE_INTERVALS;
|
||||
}
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
static final IntervalFunction OVERLAPPING = new SingletonFunction("OVERLAPPING") {
|
||||
@Override
|
||||
public IntervalIterator apply(List<IntervalIterator> iterators) {
|
||||
if (iterators.size() != 2)
|
||||
throw new IllegalStateException("OVERLAPPING function requires two iterators");
|
||||
IntervalIterator a = iterators.get(0);
|
||||
IntervalIterator b = iterators.get(1);
|
||||
return new FilteringIntervalIterator(a, b) {
|
||||
@Override
|
||||
protected void reset() throws IOException {
|
||||
bpos = b.nextInterval() != NO_MORE_INTERVALS;
|
||||
public int nextInterval() throws IOException {
|
||||
if (bpos == false)
|
||||
return IntervalIterator.NO_MORE_INTERVALS;
|
||||
while (a.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) {
|
||||
while (b.end() < a.start()) {
|
||||
if (b.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
|
||||
bpos = false;
|
||||
return IntervalIterator.NO_MORE_INTERVALS;
|
||||
}
|
||||
}
|
||||
if (b.start() <= a.end())
|
||||
return a.start();
|
||||
}
|
||||
bpos = false;
|
||||
return IntervalIterator.NO_MORE_INTERVALS;
|
||||
}
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
private static abstract class FilteringIntervalIterator extends ConjunctionIntervalIterator {
|
||||
|
||||
final IntervalIterator a;
|
||||
final IntervalIterator b;
|
||||
|
||||
boolean bpos;
|
||||
|
||||
protected FilteringIntervalIterator(IntervalIterator a, IntervalIterator b) {
|
||||
super(Arrays.asList(a, b));
|
||||
this.a = a;
|
||||
this.b = b;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int start() {
|
||||
if (bpos == false) {
|
||||
return NO_MORE_INTERVALS;
|
||||
}
|
||||
return a.start();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int end() {
|
||||
if (bpos == false) {
|
||||
return NO_MORE_INTERVALS;
|
||||
}
|
||||
return a.end();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int gaps() {
|
||||
return a.gaps();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void reset() throws IOException {
|
||||
bpos = b.nextInterval() != NO_MORE_INTERVALS;
|
||||
}
|
||||
}
|
||||
|
||||
private static abstract class SingletonFunction extends IntervalFunction {
|
||||
|
||||
private final String name;
|
||||
|
|
|
@ -171,6 +171,15 @@ public final class Intervals {
|
|||
return new DifferenceIntervalsSource(minuend, subtrahend, DifferenceIntervalFunction.NON_OVERLAPPING);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns intervals from a source that overlap with intervals from another source
|
||||
* @param source the source to filter
|
||||
* @param reference the source to filter by
|
||||
*/
|
||||
public static IntervalsSource overlapping(IntervalsSource source, IntervalsSource reference) {
|
||||
return new ConjunctionIntervalsSource(Arrays.asList(source, reference), IntervalFunction.OVERLAPPING);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a not-within {@link IntervalsSource}
|
||||
*
|
||||
|
@ -178,7 +187,7 @@ public final class Intervals {
|
|||
* intervals from the subtrahend query
|
||||
*
|
||||
* @param minuend the {@link IntervalsSource} to filter
|
||||
* @param positions the maximum distance that intervals from the minuend may occur from intervals
|
||||
* @param positions the minimum distance that intervals from the minuend may occur from intervals
|
||||
* of the subtrahend
|
||||
* @param subtrahend the {@link IntervalsSource} to filter by
|
||||
*/
|
||||
|
@ -187,6 +196,18 @@ public final class Intervals {
|
|||
DifferenceIntervalFunction.NON_OVERLAPPING);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns intervals of the source that appear within a set number of positions of intervals from
|
||||
* the reference
|
||||
*
|
||||
* @param source the {@link IntervalsSource} to filter
|
||||
* @param positions the maximum distance that intervals of the source may occur from intervals of the reference
|
||||
* @param reference the {@link IntervalsSource} to filter by
|
||||
*/
|
||||
public static IntervalsSource within(IntervalsSource source, int positions, IntervalsSource reference) {
|
||||
return containedBy(source, Intervals.extend(reference, positions, positions));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a not-containing {@link IntervalsSource}
|
||||
*
|
||||
|
@ -237,6 +258,22 @@ public final class Intervals {
|
|||
return new ConjunctionIntervalsSource(Arrays.asList(small, big), IntervalFunction.CONTAINED_BY);
|
||||
}
|
||||
|
||||
// TODO: beforeQuery, afterQuery, arbitrary IntervalFunctions
|
||||
/**
|
||||
* Returns intervals from the source that appear before intervals from the reference
|
||||
*/
|
||||
public static IntervalsSource before(IntervalsSource source, IntervalsSource reference) {
|
||||
return new ConjunctionIntervalsSource(Arrays.asList(source,
|
||||
Intervals.extend(new OffsetIntervalsSource(reference, true), Integer.MAX_VALUE, 0)),
|
||||
IntervalFunction.CONTAINED_BY);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns intervals from the source that appear after intervals from the reference
|
||||
*/
|
||||
public static IntervalsSource after(IntervalsSource source, IntervalsSource reference) {
|
||||
return new ConjunctionIntervalsSource(Arrays.asList(source,
|
||||
Intervals.extend(new OffsetIntervalsSource(reference, false), 0, Integer.MAX_VALUE)),
|
||||
IntervalFunction.CONTAINED_BY);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,172 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.search.intervals;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.MatchesIterator;
|
||||
|
||||
/**
|
||||
* Tracks a reference intervals source, and produces a pseudo-interval that appears
|
||||
* either one position before or one position after each interval from the reference
|
||||
*/
|
||||
class OffsetIntervalsSource extends IntervalsSource {
|
||||
|
||||
private final IntervalsSource in;
|
||||
private final boolean before;
|
||||
|
||||
OffsetIntervalsSource(IntervalsSource in, boolean before) {
|
||||
this.in = in;
|
||||
this.before = before;
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException {
|
||||
IntervalIterator it = in.intervals(field, ctx);
|
||||
if (it == null) {
|
||||
return null;
|
||||
}
|
||||
return offset(it);
|
||||
}
|
||||
|
||||
private IntervalIterator offset(IntervalIterator it) {
|
||||
if (before) {
|
||||
return new OffsetIntervalIterator(it) {
|
||||
@Override
|
||||
public int start() {
|
||||
int pos = in.start();
|
||||
if (pos == -1) {
|
||||
return -1;
|
||||
}
|
||||
if (pos == NO_MORE_INTERVALS) {
|
||||
return NO_MORE_INTERVALS;
|
||||
}
|
||||
return Math.max(0, pos - 1);
|
||||
}
|
||||
};
|
||||
}
|
||||
else {
|
||||
return new OffsetIntervalIterator(it) {
|
||||
@Override
|
||||
public int start() {
|
||||
int pos = in.end() + 1;
|
||||
if (pos == 0) {
|
||||
return -1;
|
||||
}
|
||||
if (pos < 0) { // overflow
|
||||
return Integer.MAX_VALUE;
|
||||
}
|
||||
if (pos == Integer.MAX_VALUE) {
|
||||
return Integer.MAX_VALUE - 1;
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
private static abstract class OffsetIntervalIterator extends IntervalIterator {
|
||||
|
||||
final IntervalIterator in;
|
||||
|
||||
OffsetIntervalIterator(IntervalIterator in) {
|
||||
this.in = in;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int end() {
|
||||
return start();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int gaps() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextInterval() throws IOException {
|
||||
in.nextInterval();
|
||||
return start();
|
||||
}
|
||||
|
||||
@Override
|
||||
public float matchCost() {
|
||||
return in.matchCost();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int docID() {
|
||||
return in.docID();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
return in.nextDoc();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(int target) throws IOException {
|
||||
return in.advance(target);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long cost() {
|
||||
return in.cost();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
|
||||
MatchesIterator mi = in.matches(field, ctx, doc);
|
||||
if (mi == null) {
|
||||
return null;
|
||||
}
|
||||
return IntervalMatches.asMatches(offset(IntervalMatches.wrapMatches(mi, doc)), mi, doc);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void extractTerms(String field, Set<Term> terms) {
|
||||
in.extractTerms(field, terms);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
OffsetIntervalsSource that = (OffsetIntervalsSource) o;
|
||||
return before == that.before &&
|
||||
Objects.equals(in, that.in);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(in, before);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
if (before) {
|
||||
return ("PRECEDING(" + in + ")");
|
||||
}
|
||||
return ("FOLLOWING(" + in + ")");
|
||||
}
|
||||
}
|
|
@ -330,6 +330,33 @@ public class TestIntervals extends LuceneTestCase {
|
|||
|
||||
}
|
||||
|
||||
public void testOffsetIntervals() throws IOException {
|
||||
IntervalsSource source = Intervals.unordered(
|
||||
Intervals.term("pease"),
|
||||
Intervals.term("porridge"),
|
||||
Intervals.or(Intervals.term("hot"), Intervals.term("cold")));
|
||||
|
||||
IntervalsSource before = new OffsetIntervalsSource(source, true);
|
||||
checkIntervals(before, "field1", 3, new int[][]{
|
||||
{},
|
||||
{ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5 },
|
||||
{ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5 },
|
||||
{},
|
||||
{ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5 },
|
||||
{}
|
||||
});
|
||||
|
||||
IntervalsSource after = new OffsetIntervalsSource(source, false);
|
||||
checkIntervals(after, "field1", 3, new int[][]{
|
||||
{},
|
||||
{ 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 18, 18 },
|
||||
{ 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 18, 18 },
|
||||
{},
|
||||
{ 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 18, 18 },
|
||||
{}
|
||||
});
|
||||
}
|
||||
|
||||
public void testNesting2() throws IOException {
|
||||
IntervalsSource source = Intervals.unordered(
|
||||
Intervals.ordered(
|
||||
|
@ -536,4 +563,80 @@ public class TestIntervals extends LuceneTestCase {
|
|||
});
|
||||
}
|
||||
|
||||
public void testAfter() throws IOException {
|
||||
IntervalsSource source = Intervals.after(Intervals.term("porridge"),
|
||||
Intervals.ordered(Intervals.term("pease"), Intervals.term("cold")));
|
||||
checkIntervals(source, "field1", 3, new int[][]{
|
||||
{},
|
||||
{ 7, 7 },
|
||||
{ 4, 4, 7, 7 },
|
||||
{},
|
||||
{ 7, 7 },
|
||||
{}
|
||||
});
|
||||
|
||||
MatchesIterator mi = getMatches(source, 1, "field1");
|
||||
assertMatch(mi, 7, 7, 20, 55);
|
||||
MatchesIterator sub = mi.getSubMatches();
|
||||
assertNotNull(sub);
|
||||
assertMatch(sub, 3, 3, 20, 25);
|
||||
assertMatch(sub, 5, 5, 35, 39);
|
||||
assertMatch(sub, 7, 7, 47, 55);
|
||||
assertFalse(sub.next());
|
||||
}
|
||||
|
||||
public void testBefore() throws IOException {
|
||||
IntervalsSource source = Intervals.before(Intervals.term("cold"), Intervals.term("twelve"));
|
||||
checkIntervals(source, "field1", 2, new int[][]{
|
||||
{},
|
||||
{},
|
||||
{ 2, 2 },
|
||||
{},
|
||||
{ 5, 5 },
|
||||
{}
|
||||
});
|
||||
}
|
||||
|
||||
public void testWithin() throws IOException {
|
||||
IntervalsSource source = Intervals.within(Intervals.term("hot"), 6,
|
||||
Intervals.or(Intervals.term("porridge"), Intervals.term("fraggle")));
|
||||
checkIntervals(source, "field1", 3, new int[][]{
|
||||
{},
|
||||
{ 2, 2 },
|
||||
{ 5, 5, 21, 21 },
|
||||
{},
|
||||
{ 2, 2 },
|
||||
{}
|
||||
});
|
||||
}
|
||||
|
||||
public void testOverlapping() throws IOException {
|
||||
IntervalsSource source = Intervals.overlapping(
|
||||
Intervals.unordered(Intervals.term("hot"), Intervals.term("porridge")),
|
||||
Intervals.unordered(Intervals.term("cold"), Intervals.term("pease"))
|
||||
);
|
||||
checkIntervals(source, "field1", 3, new int[][]{
|
||||
{},
|
||||
{ 2, 4, 7, 17 },
|
||||
{ 5, 7, 7, 21 },
|
||||
{},
|
||||
{ 2, 4 },
|
||||
{}
|
||||
});
|
||||
|
||||
assertGaps(source, 2, "field1", new int[]{ 1, 13 });
|
||||
|
||||
MatchesIterator mi = getMatches(source, 1, "field1");
|
||||
assertNotNull(mi);
|
||||
assertMatch(mi, 2, 4, 15, 39);
|
||||
MatchesIterator sub = mi.getSubMatches();
|
||||
assertNotNull(sub);
|
||||
assertMatch(sub, 2, 2, 15, 18);
|
||||
assertMatch(sub, 3, 3, 20, 25);
|
||||
assertMatch(sub, 4, 4, 26, 34);
|
||||
assertMatch(sub, 5, 5, 35, 39);
|
||||
assertFalse(sub.next());
|
||||
assertMatch(mi, 7, 17, 41, 118);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue