LUCENE-8300: Allow unordered intervals to exclude overlaps

This commit is contained in:
Alan Woodward 2018-05-30 15:42:41 +01:00
parent 9aa16b64c7
commit e3d4c7e9b7
4 changed files with 115 additions and 16 deletions

View File

@ -145,9 +145,9 @@ New Features
* LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens. (Robert Muir)
* LUCENE-8196: A new IntervalQuery in the sandbox allows efficient proximity
* LUCENE-8196, LUCENE-8300: A new IntervalQuery in the sandbox allows efficient proximity
searches based on minimum-interval semantics. (Alan Woodward, Adrien Grand,
Jim Ferenczi, Simon Willnauer)
Jim Ferenczi, Simon Willnauer, Matt Weber)
* LUCENE-8233: Add support for soft deletes to IndexWriter delete accounting.
Soft deletes are accounted for inside the index writer and therefore also

View File

@ -165,7 +165,17 @@ abstract class IntervalFunction {
static final IntervalFunction UNORDERED = new SingletonFunction("UNORDERED") {
@Override
public IntervalIterator apply(List<IntervalIterator> intervalIterators) {
return new UnorderedIntervalIterator(intervalIterators);
return new UnorderedIntervalIterator(intervalIterators, true);
}
};
/**
 * Builds an iterator over intervals in which all of the subiterators appear,
 * in any order, with overlapping between them disallowed.
 */
static final IntervalFunction UNORDERED_NO_OVERLAP = new SingletonFunction("UNORDERED_NO_OVERLAP") {
  @Override
  public IntervalIterator apply(List<IntervalIterator> iterators) {
    // Same iterator implementation as the overlapping variant; only the flag differs.
    final boolean allowOverlaps = false;
    return new UnorderedIntervalIterator(iterators, allowOverlaps);
  }
};
@ -173,10 +183,11 @@ abstract class IntervalFunction {
private final PriorityQueue<IntervalIterator> queue;
private final IntervalIterator[] subIterators;
private final boolean allowOverlaps;
int start = -1, end = -1, queueEnd;
UnorderedIntervalIterator(List<IntervalIterator> subIterators) {
UnorderedIntervalIterator(List<IntervalIterator> subIterators, boolean allowOverlaps) {
super(subIterators);
this.queue = new PriorityQueue<IntervalIterator>(subIterators.size()) {
@Override
@ -185,6 +196,7 @@ abstract class IntervalFunction {
}
};
this.subIterators = new IntervalIterator[subIterators.size()];
this.allowOverlaps = allowOverlaps;
for (int i = 0; i < subIterators.size(); i++) {
this.subIterators[i] = subIterators.get(i);
@ -210,15 +222,23 @@ abstract class IntervalFunction {
@Override
public int nextInterval() throws IOException {
// first, find a matching interval
while (this.queue.size() == subIterators.length && queue.top().start() == start) {
IntervalIterator it = queue.pop();
if (it != null && it.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) {
if (allowOverlaps == false) {
while (hasOverlaps(it)) {
if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
return IntervalIterator.NO_MORE_INTERVALS;
}
}
queue.add(it);
updateRightExtreme(it);
}
}
if (this.queue.size() < subIterators.length)
return IntervalIterator.NO_MORE_INTERVALS;
// then, minimize it
do {
start = queue.top().start();
end = queueEnd;
@ -226,6 +246,13 @@ abstract class IntervalFunction {
return start;
IntervalIterator it = queue.pop();
if (it != null && it.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) {
if (allowOverlaps == false) {
while (hasOverlaps(it)) {
if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
return start;
}
}
}
queue.add(it);
updateRightExtreme(it);
}
@ -237,15 +264,40 @@ abstract class IntervalFunction {
protected void reset() throws IOException {
queueEnd = start = end = -1;
this.queue.clear();
for (IntervalIterator it : subIterators) {
loop: for (IntervalIterator it : subIterators) {
if (it.nextInterval() == NO_MORE_INTERVALS) {
break;
}
if (allowOverlaps == false) {
while (hasOverlaps(it)) {
if (it.nextInterval() == NO_MORE_INTERVALS) {
break loop;
}
}
}
queue.add(it);
updateRightExtreme(it);
}
}
/**
 * Reports whether the candidate's current interval overlaps the current
 * interval of any iterator held in the queue.
 *
 * Touching endpoints count as an overlap: a queued interval that ends exactly
 * where the candidate starts, or starts exactly where the candidate ends, is
 * treated as overlapping.
 *
 * @param candidate the iterator whose current interval is being checked
 * @return {@code true} if at least one queued iterator overlaps the candidate
 */
private boolean hasOverlaps(IntervalIterator candidate) {
  for (IntervalIterator queued : queue) {
    final int queuedStart = queued.start();
    final int candidateStart = candidate.start();
    final boolean overlapping;
    if (queuedStart < candidateStart) {
      // Queued interval begins first: it overlaps if it reaches the candidate's start.
      overlapping = queued.end() >= candidateStart;
    } else {
      // Queued interval begins at or after the candidate's start: it overlaps if it
      // shares that start or begins no later than the candidate's end.
      overlapping = queuedStart == candidateStart || queuedStart <= candidate.end();
    }
    if (overlapping) {
      return true;
    }
  }
  return false;
}
}
/**

View File

@ -85,7 +85,7 @@ public final class Intervals {
}
/**
* Create an ordered {@link IntervalsSource} with an unbounded width range
* Create an ordered {@link IntervalsSource}
*
* Returns intervals in which the subsources all appear in the given order
*
@ -96,14 +96,27 @@ public final class Intervals {
}
/**
* Create an unordered {@link IntervalsSource} with an unbounded width range
* Create an unordered {@link IntervalsSource}
*
* Returns intervals in which all the subsources appear. The subsources may overlap
*
* @param subSources an unordered set of {@link IntervalsSource}s
*/
public static IntervalsSource unordered(IntervalsSource... subSources) {
return unordered(true, subSources);
}
/**
* Create an unordered {@link IntervalsSource}
*
* Returns intervals in which all the subsources appear.
*
* @param subSources an unordered set of queries
* @param subSources an unordered set of {@link IntervalsSource}s
* @param allowOverlaps whether or not the sources should be allowed to overlap in a hit
*/
public static IntervalsSource unordered(IntervalsSource... subSources) {
return new ConjunctionIntervalsSource(Arrays.asList(subSources), IntervalFunction.UNORDERED);
public static IntervalsSource unordered(boolean allowOverlaps, IntervalsSource... subSources) {
return new ConjunctionIntervalsSource(Arrays.asList(subSources),
allowOverlaps ? IntervalFunction.UNORDERED : IntervalFunction.UNORDERED_NO_OVERLAP);
}
/**

View File

@ -55,8 +55,8 @@ public class TestIntervals extends LuceneTestCase {
"Where Alph the sacred river ran through caverns measureless to man",
"Down to a sunless sea",
"So thrice five miles of fertile ground",
"With walls and towers were girdled round",
"Which was nice"
"Pease hot porridge porridge",
"Pease porridge porridge hot"
};
private static Directory directory;
@ -102,12 +102,12 @@ public class TestIntervals extends LuceneTestCase {
assertEquals(-1, intervals.end());
while ((pos = intervals.nextInterval()) != IntervalIterator.NO_MORE_INTERVALS) {
//System.out.println(doc + ": " + intervals);
assertEquals(expected[id][i], pos);
assertEquals(expected[id][i], intervals.start());
assertEquals(expected[id][i + 1], intervals.end());
assertEquals("Wrong start value", expected[id][i], pos);
assertEquals("start() != pos returned from nextInterval()", expected[id][i], intervals.start());
assertEquals("Wrong end value", expected[id][i + 1], intervals.end());
i += 2;
}
assertEquals(expected[id].length, i);
assertEquals("Wrong number of endpoints", expected[id].length, i);
if (i > 0)
matchedDocs++;
}
@ -215,4 +215,38 @@ public class TestIntervals extends LuceneTestCase {
});
}
/**
 * Checks unordered interval sources built with overlaps disallowed
 * ({@code Intervals.unordered(false, ...)}).
 *
 * Each expected table holds one row per document; a row lists the matching
 * intervals as consecutive (start, end) pairs, and an empty row means no match.
 */
public void testUnorderedDistinct() throws IOException {
  // The same term supplied twice as two non-overlapping sources.
  final int[][] peaseTwice = {
      {},
      { 0, 3, 3, 6 },
      { 0, 3, 3, 6 },
      {},
      { 0, 3, 3, 6 },
      {}
  };
  checkIntervals(
      Intervals.unordered(false, Intervals.term("pease"), Intervals.term("pease")),
      "field1", 3, peaseTwice);

  // An overlap-permitting unordered source combined with an extra "porridge"
  // term, with overlaps disallowed at the outer level; checked on field1.
  final int[][] nestedOnField1 = {
      {},
      { 1, 4, 4, 17 },
      { 1, 5, 4, 7 },
      {},
      { 1, 4, 4, 17 },
      {}
  };
  checkIntervals(
      Intervals.unordered(false,
          Intervals.unordered(Intervals.term("pease"), Intervals.term("porridge"), Intervals.term("hot")),
          Intervals.term("porridge")),
      "field1", 3, nestedOnField1);

  // The same nested source, checked on field2.
  final int[][] nestedOnField2 = {
      {},
      {},
      {},
      {},
      { 0, 3 },
      {}
  };
  checkIntervals(
      Intervals.unordered(false,
          Intervals.unordered(Intervals.term("pease"), Intervals.term("porridge"), Intervals.term("hot")),
          Intervals.term("porridge")),
      "field2", 1, nestedOnField2);
}
}