mirror of https://github.com/apache/lucene.git
LUCENE-8300: Allow unordered intervals to exclude overlaps
This commit is contained in:
parent
9aa16b64c7
commit
e3d4c7e9b7
|
@ -145,9 +145,9 @@ New Features
|
||||||
|
|
||||||
* LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens. (Robert Muir)
|
* LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens. (Robert Muir)
|
||||||
|
|
||||||
* LUCENE-8196: A new IntervalQuery in the sandbox allows efficient proximity
|
* LUCENE-8196, LUCENE-8300: A new IntervalQuery in the sandbox allows efficient proximity
|
||||||
searches based on minimum-interval semantics. (Alan Woodward, Adrien Grand,
|
searches based on minimum-interval semantics. (Alan Woodward, Adrien Grand,
|
||||||
Jim Ferenczi, Simon Willnauer)
|
Jim Ferenczi, Simon Willnauer, Matt Weber)
|
||||||
|
|
||||||
* LUCENE-8233: Add support for soft deletes to IndexWriter delete accounting.
|
* LUCENE-8233: Add support for soft deletes to IndexWriter delete accounting.
|
||||||
Soft deletes are accounted for inside the index writer and therefore also
|
Soft deletes are accounted for inside the index writer and therefore also
|
||||||
|
|
|
@ -165,7 +165,17 @@ abstract class IntervalFunction {
|
||||||
static final IntervalFunction UNORDERED = new SingletonFunction("UNORDERED") {
|
static final IntervalFunction UNORDERED = new SingletonFunction("UNORDERED") {
|
||||||
@Override
|
@Override
|
||||||
public IntervalIterator apply(List<IntervalIterator> intervalIterators) {
|
public IntervalIterator apply(List<IntervalIterator> intervalIterators) {
|
||||||
return new UnorderedIntervalIterator(intervalIterators);
|
return new UnorderedIntervalIterator(intervalIterators, true);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return an iterator over intervals where the subiterators appear in any order, and do not overlap.
|
||||||
|
*/
|
||||||
|
static final IntervalFunction UNORDERED_NO_OVERLAP = new SingletonFunction("UNORDERED_NO_OVERLAP") {
|
||||||
|
@Override
|
||||||
|
public IntervalIterator apply(List<IntervalIterator> iterators) {
|
||||||
|
return new UnorderedIntervalIterator(iterators, false);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -173,10 +183,11 @@ abstract class IntervalFunction {
|
||||||
|
|
||||||
private final PriorityQueue<IntervalIterator> queue;
|
private final PriorityQueue<IntervalIterator> queue;
|
||||||
private final IntervalIterator[] subIterators;
|
private final IntervalIterator[] subIterators;
|
||||||
|
private final boolean allowOverlaps;
|
||||||
|
|
||||||
int start = -1, end = -1, queueEnd;
|
int start = -1, end = -1, queueEnd;
|
||||||
|
|
||||||
UnorderedIntervalIterator(List<IntervalIterator> subIterators) {
|
UnorderedIntervalIterator(List<IntervalIterator> subIterators, boolean allowOverlaps) {
|
||||||
super(subIterators);
|
super(subIterators);
|
||||||
this.queue = new PriorityQueue<IntervalIterator>(subIterators.size()) {
|
this.queue = new PriorityQueue<IntervalIterator>(subIterators.size()) {
|
||||||
@Override
|
@Override
|
||||||
|
@ -185,6 +196,7 @@ abstract class IntervalFunction {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
this.subIterators = new IntervalIterator[subIterators.size()];
|
this.subIterators = new IntervalIterator[subIterators.size()];
|
||||||
|
this.allowOverlaps = allowOverlaps;
|
||||||
|
|
||||||
for (int i = 0; i < subIterators.size(); i++) {
|
for (int i = 0; i < subIterators.size(); i++) {
|
||||||
this.subIterators[i] = subIterators.get(i);
|
this.subIterators[i] = subIterators.get(i);
|
||||||
|
@ -210,15 +222,23 @@ abstract class IntervalFunction {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int nextInterval() throws IOException {
|
public int nextInterval() throws IOException {
|
||||||
|
// first, find a matching interval
|
||||||
while (this.queue.size() == subIterators.length && queue.top().start() == start) {
|
while (this.queue.size() == subIterators.length && queue.top().start() == start) {
|
||||||
IntervalIterator it = queue.pop();
|
IntervalIterator it = queue.pop();
|
||||||
if (it != null && it.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) {
|
if (it != null && it.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) {
|
||||||
|
if (allowOverlaps == false) {
|
||||||
|
while (hasOverlaps(it)) {
|
||||||
|
if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
|
||||||
|
return IntervalIterator.NO_MORE_INTERVALS;
|
||||||
|
}
|
||||||
|
}
|
||||||
queue.add(it);
|
queue.add(it);
|
||||||
updateRightExtreme(it);
|
updateRightExtreme(it);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (this.queue.size() < subIterators.length)
|
if (this.queue.size() < subIterators.length)
|
||||||
return IntervalIterator.NO_MORE_INTERVALS;
|
return IntervalIterator.NO_MORE_INTERVALS;
|
||||||
|
// then, minimize it
|
||||||
do {
|
do {
|
||||||
start = queue.top().start();
|
start = queue.top().start();
|
||||||
end = queueEnd;
|
end = queueEnd;
|
||||||
|
@ -226,6 +246,13 @@ abstract class IntervalFunction {
|
||||||
return start;
|
return start;
|
||||||
IntervalIterator it = queue.pop();
|
IntervalIterator it = queue.pop();
|
||||||
if (it != null && it.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) {
|
if (it != null && it.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) {
|
||||||
|
if (allowOverlaps == false) {
|
||||||
|
while (hasOverlaps(it)) {
|
||||||
|
if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
|
||||||
|
return start;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
queue.add(it);
|
queue.add(it);
|
||||||
updateRightExtreme(it);
|
updateRightExtreme(it);
|
||||||
}
|
}
|
||||||
|
@ -237,15 +264,40 @@ abstract class IntervalFunction {
|
||||||
protected void reset() throws IOException {
|
protected void reset() throws IOException {
|
||||||
queueEnd = start = end = -1;
|
queueEnd = start = end = -1;
|
||||||
this.queue.clear();
|
this.queue.clear();
|
||||||
for (IntervalIterator it : subIterators) {
|
loop: for (IntervalIterator it : subIterators) {
|
||||||
if (it.nextInterval() == NO_MORE_INTERVALS) {
|
if (it.nextInterval() == NO_MORE_INTERVALS) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
if (allowOverlaps == false) {
|
||||||
|
while (hasOverlaps(it)) {
|
||||||
|
if (it.nextInterval() == NO_MORE_INTERVALS) {
|
||||||
|
break loop;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
queue.add(it);
|
queue.add(it);
|
||||||
updateRightExtreme(it);
|
updateRightExtreme(it);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean hasOverlaps(IntervalIterator candidate) {
|
||||||
|
for (IntervalIterator it : queue) {
|
||||||
|
if (it.start() < candidate.start()) {
|
||||||
|
if (it.end() >= candidate.start()) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (it.start() == candidate.start()) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (it.start() <= candidate.end()) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -85,7 +85,7 @@ public final class Intervals {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create an ordered {@link IntervalsSource} with an unbounded width range
|
* Create an ordered {@link IntervalsSource}
|
||||||
*
|
*
|
||||||
* Returns intervals in which the subsources all appear in the given order
|
* Returns intervals in which the subsources all appear in the given order
|
||||||
*
|
*
|
||||||
|
@ -96,14 +96,27 @@ public final class Intervals {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create an unordered {@link IntervalsSource} with an unbounded width range
|
* Create an unordered {@link IntervalsSource}
|
||||||
|
*
|
||||||
|
* Returns intervals in which all the subsources appear. The subsources may overlap.
|
||||||
|
*
|
||||||
|
* @param subSources an unordered set of {@link IntervalsSource}s
|
||||||
|
*/
|
||||||
|
public static IntervalsSource unordered(IntervalsSource... subSources) {
|
||||||
|
return unordered(true, subSources);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create an unordered {@link IntervalsSource}
|
||||||
*
|
*
|
||||||
* Returns intervals in which all the subsources appear.
|
* Returns intervals in which all the subsources appear.
|
||||||
*
|
*
|
||||||
* @param subSources an unordered set of queries
|
* @param subSources an unordered set of {@link IntervalsSource}s
|
||||||
|
* @param allowOverlaps whether or not the sources should be allowed to overlap in a hit
|
||||||
*/
|
*/
|
||||||
public static IntervalsSource unordered(IntervalsSource... subSources) {
|
public static IntervalsSource unordered(boolean allowOverlaps, IntervalsSource... subSources) {
|
||||||
return new ConjunctionIntervalsSource(Arrays.asList(subSources), IntervalFunction.UNORDERED);
|
return new ConjunctionIntervalsSource(Arrays.asList(subSources),
|
||||||
|
allowOverlaps ? IntervalFunction.UNORDERED : IntervalFunction.UNORDERED_NO_OVERLAP);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -55,8 +55,8 @@ public class TestIntervals extends LuceneTestCase {
|
||||||
"Where Alph the sacred river ran through caverns measureless to man",
|
"Where Alph the sacred river ran through caverns measureless to man",
|
||||||
"Down to a sunless sea",
|
"Down to a sunless sea",
|
||||||
"So thrice five miles of fertile ground",
|
"So thrice five miles of fertile ground",
|
||||||
"With walls and towers were girdled round",
|
"Pease hot porridge porridge",
|
||||||
"Which was nice"
|
"Pease porridge porridge hot"
|
||||||
};
|
};
|
||||||
|
|
||||||
private static Directory directory;
|
private static Directory directory;
|
||||||
|
@ -102,12 +102,12 @@ public class TestIntervals extends LuceneTestCase {
|
||||||
assertEquals(-1, intervals.end());
|
assertEquals(-1, intervals.end());
|
||||||
while ((pos = intervals.nextInterval()) != IntervalIterator.NO_MORE_INTERVALS) {
|
while ((pos = intervals.nextInterval()) != IntervalIterator.NO_MORE_INTERVALS) {
|
||||||
//System.out.println(doc + ": " + intervals);
|
//System.out.println(doc + ": " + intervals);
|
||||||
assertEquals(expected[id][i], pos);
|
assertEquals("Wrong start value", expected[id][i], pos);
|
||||||
assertEquals(expected[id][i], intervals.start());
|
assertEquals("start() != pos returned from nextInterval()", expected[id][i], intervals.start());
|
||||||
assertEquals(expected[id][i + 1], intervals.end());
|
assertEquals("Wrong end value", expected[id][i + 1], intervals.end());
|
||||||
i += 2;
|
i += 2;
|
||||||
}
|
}
|
||||||
assertEquals(expected[id].length, i);
|
assertEquals("Wrong number of endpoints", expected[id].length, i);
|
||||||
if (i > 0)
|
if (i > 0)
|
||||||
matchedDocs++;
|
matchedDocs++;
|
||||||
}
|
}
|
||||||
|
@ -215,4 +215,38 @@ public class TestIntervals extends LuceneTestCase {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testUnorderedDistinct() throws IOException {
|
||||||
|
checkIntervals(Intervals.unordered(false, Intervals.term("pease"), Intervals.term("pease")),
|
||||||
|
"field1", 3, new int[][]{
|
||||||
|
{},
|
||||||
|
{ 0, 3, 3, 6 },
|
||||||
|
{ 0, 3, 3, 6 },
|
||||||
|
{},
|
||||||
|
{ 0, 3, 3, 6 },
|
||||||
|
{}
|
||||||
|
});
|
||||||
|
checkIntervals(Intervals.unordered(false,
|
||||||
|
Intervals.unordered(Intervals.term("pease"), Intervals.term("porridge"), Intervals.term("hot")),
|
||||||
|
Intervals.term("porridge")),
|
||||||
|
"field1", 3, new int[][]{
|
||||||
|
{},
|
||||||
|
{ 1, 4, 4, 17 },
|
||||||
|
{ 1, 5, 4, 7 },
|
||||||
|
{},
|
||||||
|
{ 1, 4, 4, 17 },
|
||||||
|
{}
|
||||||
|
});
|
||||||
|
checkIntervals(Intervals.unordered(false,
|
||||||
|
Intervals.unordered(Intervals.term("pease"), Intervals.term("porridge"), Intervals.term("hot")),
|
||||||
|
Intervals.term("porridge")),
|
||||||
|
"field2", 1, new int[][]{
|
||||||
|
{},
|
||||||
|
{},
|
||||||
|
{},
|
||||||
|
{},
|
||||||
|
{ 0, 3 },
|
||||||
|
{}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue