LUCENE-8300: Allow unordered intervals to exclude overlaps

This commit is contained in:
Alan Woodward 2018-05-30 15:42:41 +01:00
parent 9aa16b64c7
commit e3d4c7e9b7
4 changed files with 115 additions and 16 deletions

View File

@ -145,9 +145,9 @@ New Features
* LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens. (Robert Muir) * LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens. (Robert Muir)
* LUCENE-8196: A new IntervalQuery in the sandbox allows efficient proximity * LUCENE-8196, LUCENE-8300: A new IntervalQuery in the sandbox allows efficient proximity
searches based on minimum-interval semantics. (Alan Woodward, Adrien Grand, searches based on minimum-interval semantics. (Alan Woodward, Adrien Grand,
Jim Ferenczi, Simon Willnauer) Jim Ferenczi, Simon Willnauer, Matt Weber)
* LUCENE-8233: Add support for soft deletes to IndexWriter delete accounting. * LUCENE-8233: Add support for soft deletes to IndexWriter delete accounting.
Soft deletes are accounted for inside the index writer and therefore also Soft deletes are accounted for inside the index writer and therefore also

View File

@ -165,7 +165,17 @@ abstract class IntervalFunction {
static final IntervalFunction UNORDERED = new SingletonFunction("UNORDERED") { static final IntervalFunction UNORDERED = new SingletonFunction("UNORDERED") {
@Override @Override
public IntervalIterator apply(List<IntervalIterator> intervalIterators) { public IntervalIterator apply(List<IntervalIterator> intervalIterators) {
return new UnorderedIntervalIterator(intervalIterators); return new UnorderedIntervalIterator(intervalIterators, true);
}
};
/**
* Return an iterator over intervals where the subiterators appear in any order, and do not overlap
*/
static final IntervalFunction UNORDERED_NO_OVERLAP = new SingletonFunction("UNORDERED_NO_OVERLAP") {
@Override
public IntervalIterator apply(List<IntervalIterator> iterators) {
return new UnorderedIntervalIterator(iterators, false);
} }
}; };
@ -173,10 +183,11 @@ abstract class IntervalFunction {
private final PriorityQueue<IntervalIterator> queue; private final PriorityQueue<IntervalIterator> queue;
private final IntervalIterator[] subIterators; private final IntervalIterator[] subIterators;
private final boolean allowOverlaps;
int start = -1, end = -1, queueEnd; int start = -1, end = -1, queueEnd;
UnorderedIntervalIterator(List<IntervalIterator> subIterators) { UnorderedIntervalIterator(List<IntervalIterator> subIterators, boolean allowOverlaps) {
super(subIterators); super(subIterators);
this.queue = new PriorityQueue<IntervalIterator>(subIterators.size()) { this.queue = new PriorityQueue<IntervalIterator>(subIterators.size()) {
@Override @Override
@ -185,6 +196,7 @@ abstract class IntervalFunction {
} }
}; };
this.subIterators = new IntervalIterator[subIterators.size()]; this.subIterators = new IntervalIterator[subIterators.size()];
this.allowOverlaps = allowOverlaps;
for (int i = 0; i < subIterators.size(); i++) { for (int i = 0; i < subIterators.size(); i++) {
this.subIterators[i] = subIterators.get(i); this.subIterators[i] = subIterators.get(i);
@ -210,15 +222,23 @@ abstract class IntervalFunction {
@Override @Override
public int nextInterval() throws IOException { public int nextInterval() throws IOException {
// first, find a matching interval
while (this.queue.size() == subIterators.length && queue.top().start() == start) { while (this.queue.size() == subIterators.length && queue.top().start() == start) {
IntervalIterator it = queue.pop(); IntervalIterator it = queue.pop();
if (it != null && it.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) { if (it != null && it.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) {
if (allowOverlaps == false) {
while (hasOverlaps(it)) {
if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
return IntervalIterator.NO_MORE_INTERVALS;
}
}
queue.add(it); queue.add(it);
updateRightExtreme(it); updateRightExtreme(it);
} }
} }
if (this.queue.size() < subIterators.length) if (this.queue.size() < subIterators.length)
return IntervalIterator.NO_MORE_INTERVALS; return IntervalIterator.NO_MORE_INTERVALS;
// then, minimize it
do { do {
start = queue.top().start(); start = queue.top().start();
end = queueEnd; end = queueEnd;
@ -226,6 +246,13 @@ abstract class IntervalFunction {
return start; return start;
IntervalIterator it = queue.pop(); IntervalIterator it = queue.pop();
if (it != null && it.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) { if (it != null && it.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) {
if (allowOverlaps == false) {
while (hasOverlaps(it)) {
if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
return start;
}
}
}
queue.add(it); queue.add(it);
updateRightExtreme(it); updateRightExtreme(it);
} }
@ -237,15 +264,40 @@ abstract class IntervalFunction {
protected void reset() throws IOException { protected void reset() throws IOException {
queueEnd = start = end = -1; queueEnd = start = end = -1;
this.queue.clear(); this.queue.clear();
for (IntervalIterator it : subIterators) { loop: for (IntervalIterator it : subIterators) {
if (it.nextInterval() == NO_MORE_INTERVALS) { if (it.nextInterval() == NO_MORE_INTERVALS) {
break; break;
} }
if (allowOverlaps == false) {
while (hasOverlaps(it)) {
if (it.nextInterval() == NO_MORE_INTERVALS) {
break loop;
}
}
}
queue.add(it); queue.add(it);
updateRightExtreme(it); updateRightExtreme(it);
} }
} }
/**
 * Tests whether the current interval of {@code candidate} collides with the current
 * interval of any iterator held in {@code queue}.
 *
 * Two intervals are treated as colliding when they share at least one position,
 * including the case where they merely touch at an endpoint.
 *
 * @param candidate the iterator whose current interval is checked against the queue
 * @return {@code true} if at least one queued iterator overlaps {@code candidate}
 */
private boolean hasOverlaps(IntervalIterator candidate) {
  for (IntervalIterator queued : queue) {
    final boolean collides;
    if (queued.start() < candidate.start()) {
      // queued interval begins first: it collides only if it reaches candidate's start
      collides = queued.end() >= candidate.start();
    } else {
      // queued interval begins at or after candidate's start: it collides if the two
      // start together, or if it begins before candidate has ended
      collides = queued.start() == candidate.start() || queued.start() <= candidate.end();
    }
    if (collides) {
      return true;
    }
  }
  return false;
}
} }
/** /**

View File

@ -85,7 +85,7 @@ public final class Intervals {
} }
/** /**
* Create an ordered {@link IntervalsSource} with an unbounded width range * Create an ordered {@link IntervalsSource}
* *
* Returns intervals in which the subsources all appear in the given order * Returns intervals in which the subsources all appear in the given order
* *
@ -96,14 +96,27 @@ public final class Intervals {
} }
/** /**
* Create an unordered {@link IntervalsSource} with an unbounded width range * Create an unordered {@link IntervalsSource}
*
* Returns intervals in which all the subsources appear. The subsources may overlap
*
* @param subSources an unordered set of {@link IntervalsSource}s
*/
public static IntervalsSource unordered(IntervalsSource... subSources) {
return unordered(true, subSources);
}
/**
* Create an unordered {@link IntervalsSource}
* *
* Returns intervals in which all the subsources appear. * Returns intervals in which all the subsources appear.
* *
* @param subSources an unordered set of queries * @param subSources an unordered set of {@link IntervalsSource}s
* @param allowOverlaps whether or not the sources should be allowed to overlap in a hit
*/ */
public static IntervalsSource unordered(IntervalsSource... subSources) { public static IntervalsSource unordered(boolean allowOverlaps, IntervalsSource... subSources) {
return new ConjunctionIntervalsSource(Arrays.asList(subSources), IntervalFunction.UNORDERED); return new ConjunctionIntervalsSource(Arrays.asList(subSources),
allowOverlaps ? IntervalFunction.UNORDERED : IntervalFunction.UNORDERED_NO_OVERLAP);
} }
/** /**

View File

@ -55,8 +55,8 @@ public class TestIntervals extends LuceneTestCase {
"Where Alph the sacred river ran through caverns measureless to man", "Where Alph the sacred river ran through caverns measureless to man",
"Down to a sunless sea", "Down to a sunless sea",
"So thrice five miles of fertile ground", "So thrice five miles of fertile ground",
"With walls and towers were girdled round", "Pease hot porridge porridge",
"Which was nice" "Pease porridge porridge hot"
}; };
private static Directory directory; private static Directory directory;
@ -102,12 +102,12 @@ public class TestIntervals extends LuceneTestCase {
assertEquals(-1, intervals.end()); assertEquals(-1, intervals.end());
while ((pos = intervals.nextInterval()) != IntervalIterator.NO_MORE_INTERVALS) { while ((pos = intervals.nextInterval()) != IntervalIterator.NO_MORE_INTERVALS) {
//System.out.println(doc + ": " + intervals); //System.out.println(doc + ": " + intervals);
assertEquals(expected[id][i], pos); assertEquals("Wrong start value", expected[id][i], pos);
assertEquals(expected[id][i], intervals.start()); assertEquals("start() != pos returned from nextInterval()", expected[id][i], intervals.start());
assertEquals(expected[id][i + 1], intervals.end()); assertEquals("Wrong end value", expected[id][i + 1], intervals.end());
i += 2; i += 2;
} }
assertEquals(expected[id].length, i); assertEquals("Wrong number of endpoints", expected[id].length, i);
if (i > 0) if (i > 0)
matchedDocs++; matchedDocs++;
} }
@ -215,4 +215,38 @@ public class TestIntervals extends LuceneTestCase {
}); });
} }
/**
 * Exercises {@code Intervals.unordered(false, ...)}, i.e. unordered matching with
 * overlaps between the sub-sources disallowed.
 *
 * Each inner array passed to {@code checkIntervals} lists the expected intervals for one
 * document as flattened (start, end) pairs; an empty array means no intervals are
 * expected for that document.
 * NOTE(review): the integer argument is presumably the expected number of matching
 * documents -- confirm against checkIntervals.
 */
public void testUnorderedDistinct() throws IOException {
// The same term supplied twice, with overlaps disallowed.
checkIntervals(Intervals.unordered(false, Intervals.term("pease"), Intervals.term("pease")),
"field1", 3, new int[][]{
{},
{ 0, 3, 3, 6 },
{ 0, 3, 3, 6 },
{},
{ 0, 3, 3, 6 },
{}
});
// An overlap-permitting unordered source nested inside a non-overlapping one, together
// with a second "porridge" term.
checkIntervals(Intervals.unordered(false,
Intervals.unordered(Intervals.term("pease"), Intervals.term("porridge"), Intervals.term("hot")),
Intervals.term("porridge")),
"field1", 3, new int[][]{
{},
{ 1, 4, 4, 17 },
{ 1, 5, 4, 7 },
{},
{ 1, 4, 4, 17 },
{}
});
// Same source as above, run against "field2"; only one document is expected to
// produce an interval.
checkIntervals(Intervals.unordered(false,
Intervals.unordered(Intervals.term("pease"), Intervals.term("porridge"), Intervals.term("hot")),
Intervals.term("porridge")),
"field2", 1, new int[][]{
{},
{},
{},
{},
{ 0, 3 },
{}
});
}
} }