mirror of https://github.com/apache/lucene.git
LUCENE-8300: Allow unordered intervals to exclude overlaps
This commit is contained in:
parent
9aa16b64c7
commit
e3d4c7e9b7
|
@ -145,9 +145,9 @@ New Features
|
|||
|
||||
* LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens. (Robert Muir)
|
||||
|
||||
* LUCENE-8196: A new IntervalQuery in the sandbox allows efficient proximity
|
||||
* LUCENE-8196, LUCENE-8300: A new IntervalQuery in the sandbox allows efficient proximity
|
||||
searches based on minimum-interval semantics. (Alan Woodward, Adrien Grand,
|
||||
Jim Ferenczi, Simon Willnauer)
|
||||
Jim Ferenczi, Simon Willnauer, Matt Weber)
|
||||
|
||||
* LUCENE-8233: Add support for soft deletes to IndexWriter delete accounting.
|
||||
Soft deletes are accounted for inside the index writer and therefore also
|
||||
|
|
|
@ -165,7 +165,17 @@ abstract class IntervalFunction {
|
|||
static final IntervalFunction UNORDERED = new SingletonFunction("UNORDERED") {
|
||||
@Override
|
||||
public IntervalIterator apply(List<IntervalIterator> intervalIterators) {
|
||||
return new UnorderedIntervalIterator(intervalIterators);
|
||||
return new UnorderedIntervalIterator(intervalIterators, true);
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Return an iterator over intervals where the subiterators appear in any order, and do not overlap
|
||||
*/
|
||||
static final IntervalFunction UNORDERED_NO_OVERLAP = new SingletonFunction("UNORDERED_NO_OVERLAP") {
|
||||
@Override
|
||||
public IntervalIterator apply(List<IntervalIterator> iterators) {
|
||||
return new UnorderedIntervalIterator(iterators, false);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -173,10 +183,11 @@ abstract class IntervalFunction {
|
|||
|
||||
private final PriorityQueue<IntervalIterator> queue;
|
||||
private final IntervalIterator[] subIterators;
|
||||
private final boolean allowOverlaps;
|
||||
|
||||
int start = -1, end = -1, queueEnd;
|
||||
|
||||
UnorderedIntervalIterator(List<IntervalIterator> subIterators) {
|
||||
UnorderedIntervalIterator(List<IntervalIterator> subIterators, boolean allowOverlaps) {
|
||||
super(subIterators);
|
||||
this.queue = new PriorityQueue<IntervalIterator>(subIterators.size()) {
|
||||
@Override
|
||||
|
@ -185,6 +196,7 @@ abstract class IntervalFunction {
|
|||
}
|
||||
};
|
||||
this.subIterators = new IntervalIterator[subIterators.size()];
|
||||
this.allowOverlaps = allowOverlaps;
|
||||
|
||||
for (int i = 0; i < subIterators.size(); i++) {
|
||||
this.subIterators[i] = subIterators.get(i);
|
||||
|
@ -210,15 +222,23 @@ abstract class IntervalFunction {
|
|||
|
||||
@Override
|
||||
public int nextInterval() throws IOException {
|
||||
// first, find a matching interval
|
||||
while (this.queue.size() == subIterators.length && queue.top().start() == start) {
|
||||
IntervalIterator it = queue.pop();
|
||||
if (it != null && it.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) {
|
||||
if (allowOverlaps == false) {
|
||||
while (hasOverlaps(it)) {
|
||||
if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
|
||||
return IntervalIterator.NO_MORE_INTERVALS;
|
||||
}
|
||||
}
|
||||
queue.add(it);
|
||||
updateRightExtreme(it);
|
||||
}
|
||||
}
|
||||
if (this.queue.size() < subIterators.length)
|
||||
return IntervalIterator.NO_MORE_INTERVALS;
|
||||
// then, minimize it
|
||||
do {
|
||||
start = queue.top().start();
|
||||
end = queueEnd;
|
||||
|
@ -226,6 +246,13 @@ abstract class IntervalFunction {
|
|||
return start;
|
||||
IntervalIterator it = queue.pop();
|
||||
if (it != null && it.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) {
|
||||
if (allowOverlaps == false) {
|
||||
while (hasOverlaps(it)) {
|
||||
if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
|
||||
return start;
|
||||
}
|
||||
}
|
||||
}
|
||||
queue.add(it);
|
||||
updateRightExtreme(it);
|
||||
}
|
||||
|
@ -237,15 +264,40 @@ abstract class IntervalFunction {
|
|||
protected void reset() throws IOException {
|
||||
queueEnd = start = end = -1;
|
||||
this.queue.clear();
|
||||
for (IntervalIterator it : subIterators) {
|
||||
loop: for (IntervalIterator it : subIterators) {
|
||||
if (it.nextInterval() == NO_MORE_INTERVALS) {
|
||||
break;
|
||||
}
|
||||
if (allowOverlaps == false) {
|
||||
while (hasOverlaps(it)) {
|
||||
if (it.nextInterval() == NO_MORE_INTERVALS) {
|
||||
break loop;
|
||||
}
|
||||
}
|
||||
}
|
||||
queue.add(it);
|
||||
updateRightExtreme(it);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean hasOverlaps(IntervalIterator candidate) {
|
||||
for (IntervalIterator it : queue) {
|
||||
if (it.start() < candidate.start()) {
|
||||
if (it.end() >= candidate.start()) {
|
||||
return true;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (it.start() == candidate.start()) {
|
||||
return true;
|
||||
}
|
||||
if (it.start() <= candidate.end()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -85,7 +85,7 @@ public final class Intervals {
|
|||
}
|
||||
|
||||
/**
|
||||
* Create an ordered {@link IntervalsSource} with an unbounded width range
|
||||
* Create an ordered {@link IntervalsSource}
|
||||
*
|
||||
* Returns intervals in which the subsources all appear in the given order
|
||||
*
|
||||
|
@ -96,14 +96,27 @@ public final class Intervals {
|
|||
}
|
||||
|
||||
/**
|
||||
* Create an unordered {@link IntervalsSource} with an unbounded width range
|
||||
* Create an unordered {@link IntervalsSource}
|
||||
*
|
||||
* Returns intervals in which all the subsources appear. The subsources may overlap
|
||||
*
|
||||
* @param subSources an unordered set of {@link IntervalsSource}s
|
||||
*/
|
||||
public static IntervalsSource unordered(IntervalsSource... subSources) {
|
||||
return unordered(true, subSources);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an unordered {@link IntervalsSource}
|
||||
*
|
||||
* Returns intervals in which all the subsources appear.
|
||||
*
|
||||
* @param subSources an unordered set of queries
|
||||
* @param subSources an unordered set of {@link IntervalsSource}s
|
||||
* @param allowOverlaps whether or not the sources should be allowed to overlap in a hit
|
||||
*/
|
||||
public static IntervalsSource unordered(IntervalsSource... subSources) {
|
||||
return new ConjunctionIntervalsSource(Arrays.asList(subSources), IntervalFunction.UNORDERED);
|
||||
public static IntervalsSource unordered(boolean allowOverlaps, IntervalsSource... subSources) {
|
||||
return new ConjunctionIntervalsSource(Arrays.asList(subSources),
|
||||
allowOverlaps ? IntervalFunction.UNORDERED : IntervalFunction.UNORDERED_NO_OVERLAP);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -55,8 +55,8 @@ public class TestIntervals extends LuceneTestCase {
|
|||
"Where Alph the sacred river ran through caverns measureless to man",
|
||||
"Down to a sunless sea",
|
||||
"So thrice five miles of fertile ground",
|
||||
"With walls and towers were girdled round",
|
||||
"Which was nice"
|
||||
"Pease hot porridge porridge",
|
||||
"Pease porridge porridge hot"
|
||||
};
|
||||
|
||||
private static Directory directory;
|
||||
|
@ -102,12 +102,12 @@ public class TestIntervals extends LuceneTestCase {
|
|||
assertEquals(-1, intervals.end());
|
||||
while ((pos = intervals.nextInterval()) != IntervalIterator.NO_MORE_INTERVALS) {
|
||||
//System.out.println(doc + ": " + intervals);
|
||||
assertEquals(expected[id][i], pos);
|
||||
assertEquals(expected[id][i], intervals.start());
|
||||
assertEquals(expected[id][i + 1], intervals.end());
|
||||
assertEquals("Wrong start value", expected[id][i], pos);
|
||||
assertEquals("start() != pos returned from nextInterval()", expected[id][i], intervals.start());
|
||||
assertEquals("Wrong end value", expected[id][i + 1], intervals.end());
|
||||
i += 2;
|
||||
}
|
||||
assertEquals(expected[id].length, i);
|
||||
assertEquals("Wrong number of endpoints", expected[id].length, i);
|
||||
if (i > 0)
|
||||
matchedDocs++;
|
||||
}
|
||||
|
@ -215,4 +215,38 @@ public class TestIntervals extends LuceneTestCase {
|
|||
});
|
||||
}
|
||||
|
||||
public void testUnorderedDistinct() throws IOException {
|
||||
checkIntervals(Intervals.unordered(false, Intervals.term("pease"), Intervals.term("pease")),
|
||||
"field1", 3, new int[][]{
|
||||
{},
|
||||
{ 0, 3, 3, 6 },
|
||||
{ 0, 3, 3, 6 },
|
||||
{},
|
||||
{ 0, 3, 3, 6 },
|
||||
{}
|
||||
});
|
||||
checkIntervals(Intervals.unordered(false,
|
||||
Intervals.unordered(Intervals.term("pease"), Intervals.term("porridge"), Intervals.term("hot")),
|
||||
Intervals.term("porridge")),
|
||||
"field1", 3, new int[][]{
|
||||
{},
|
||||
{ 1, 4, 4, 17 },
|
||||
{ 1, 5, 4, 7 },
|
||||
{},
|
||||
{ 1, 4, 4, 17 },
|
||||
{}
|
||||
});
|
||||
checkIntervals(Intervals.unordered(false,
|
||||
Intervals.unordered(Intervals.term("pease"), Intervals.term("porridge"), Intervals.term("hot")),
|
||||
Intervals.term("porridge")),
|
||||
"field2", 1, new int[][]{
|
||||
{},
|
||||
{},
|
||||
{},
|
||||
{},
|
||||
{ 0, 3 },
|
||||
{}
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue