diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index d3f89ba6782..cd11e7dbeb5 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -145,9 +145,9 @@ New Features * LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens. (Robert Muir) -* LUCENE-8196: A new IntervalQuery in the sandbox allows efficient proximity +* LUCENE-8196, LUCENE-8300: A new IntervalQuery in the sandbox allows efficient proximity searches based on minimum-interval semantics. (Alan Woodward, Adrien Grand, - Jim Ferenczi, Simon Willnauer) + Jim Ferenczi, Simon Willnauer, Matt Weber) * LUCENE-8233: Add support for soft deletes to IndexWriter delete accounting. Soft deletes are accounted for inside the index writer and therefor also diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/IntervalFunction.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/IntervalFunction.java index ddd891f61ec..a6ce918f517 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/IntervalFunction.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/IntervalFunction.java @@ -165,7 +165,17 @@ abstract class IntervalFunction { static final IntervalFunction UNORDERED = new SingletonFunction("UNORDERED") { @Override public IntervalIterator apply(List intervalIterators) { - return new UnorderedIntervalIterator(intervalIterators); + return new UnorderedIntervalIterator(intervalIterators, true); + } + }; + + /** + * Return an iterator over intervals where the subiterators appear in any order, and do not overlap + */ + static final IntervalFunction UNORDERED_NO_OVERLAP = new SingletonFunction("UNORDERED_NO_OVERLAP") { + @Override + public IntervalIterator apply(List iterators) { + return new UnorderedIntervalIterator(iterators, false); } }; @@ -173,10 +183,11 @@ abstract class IntervalFunction { private final PriorityQueue queue; private final IntervalIterator[] subIterators; + private final boolean allowOverlaps; int start = -1, end = -1, queueEnd; - UnorderedIntervalIterator(List subIterators) { + UnorderedIntervalIterator(List subIterators, boolean allowOverlaps) { super(subIterators); this.queue = new PriorityQueue(subIterators.size()) { @Override @@ -185,6 +196,7 @@ abstract class IntervalFunction { } }; this.subIterators = new IntervalIterator[subIterators.size()]; + this.allowOverlaps = allowOverlaps; for (int i = 0; i < subIterators.size(); i++) { this.subIterators[i] = subIterators.get(i); @@ -210,15 +222,23 @@ abstract class IntervalFunction { @Override public int nextInterval() throws IOException { + // first, find a matching interval while (this.queue.size() == subIterators.length && queue.top().start() == start) { IntervalIterator it = queue.pop(); if (it != null && it.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) { + if (allowOverlaps == false) { + while (hasOverlaps(it)) { + if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) + return IntervalIterator.NO_MORE_INTERVALS; + } + } queue.add(it); updateRightExtreme(it); } } if (this.queue.size() < subIterators.length) return IntervalIterator.NO_MORE_INTERVALS; + // then, minimize it do { start = queue.top().start(); end = queueEnd; @@ -226,6 +246,13 @@ abstract class IntervalFunction { return start; IntervalIterator it = queue.pop(); if (it != null && it.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) { + if (allowOverlaps == false) { + while (hasOverlaps(it)) { + if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) { + return start; + } + } + } queue.add(it); updateRightExtreme(it); } @@ -237,15 +264,40 @@ abstract class IntervalFunction { protected void reset() throws IOException { queueEnd = start = end = -1; this.queue.clear(); - for (IntervalIterator it : subIterators) { + loop: for (IntervalIterator it : subIterators) { if (it.nextInterval() == NO_MORE_INTERVALS) { break; } + if (allowOverlaps == false) { + while (hasOverlaps(it)) { + if (it.nextInterval() == NO_MORE_INTERVALS) { + break loop; + } + } + } queue.add(it); updateRightExtreme(it); } } + private boolean hasOverlaps(IntervalIterator candidate) { + for (IntervalIterator it : queue) { + if (it.start() < candidate.start()) { + if (it.end() >= candidate.start()) { + return true; + } + continue; + } + if (it.start() == candidate.start()) { + return true; + } + if (it.start() <= candidate.end()) { + return true; + } + } + return false; + } + } /** diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/Intervals.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/Intervals.java index b3609192963..32ea6da6a8e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/Intervals.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/Intervals.java @@ -85,7 +85,7 @@ public final class Intervals { } /** - * Create an ordered {@link IntervalsSource} with an unbounded width range + * Create an ordered {@link IntervalsSource} * * Returns intervals in which the subsources all appear in the given order * @@ -96,14 +96,27 @@ public final class Intervals { } /** - * Create an unordered {@link IntervalsSource} with an unbounded width range + * Create an unordered {@link IntervalsSource} + * + * Returns intervals in which all the subsources appear. The subsources may overlap + * + * @param subSources an unordered set of {@link IntervalsSource}s + */ + public static IntervalsSource unordered(IntervalsSource... subSources) { + return unordered(true, subSources); + } + + /** + * Create an unordered {@link IntervalsSource} * * Returns intervals in which all the subsources appear. * - * @param subSources an unordered set of queries + * @param subSources an unordered set of {@link IntervalsSource}s + * @param allowOverlaps whether or not the sources should be allowed to overlap in a hit */ - public static IntervalsSource unordered(IntervalsSource... subSources) { - return new ConjunctionIntervalsSource(Arrays.asList(subSources), IntervalFunction.UNORDERED); + public static IntervalsSource unordered(boolean allowOverlaps, IntervalsSource... subSources) { + return new ConjunctionIntervalsSource(Arrays.asList(subSources), + allowOverlaps ? IntervalFunction.UNORDERED : IntervalFunction.UNORDERED_NO_OVERLAP); } /** diff --git a/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java b/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java index 9405f79e5ff..33d3cd53e5e 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java +++ b/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java @@ -55,8 +55,8 @@ public class TestIntervals extends LuceneTestCase { "Where Alph the sacred river ran through caverns measureless to man", "Down to a sunless sea", "So thrice five miles of fertile ground", - "With walls and towers were girdled round", - "Which was nice" + "Pease hot porridge porridge", + "Pease porridge porridge hot" }; private static Directory directory; @@ -102,12 +102,12 @@ public class TestIntervals extends LuceneTestCase { assertEquals(-1, intervals.end()); while ((pos = intervals.nextInterval()) != IntervalIterator.NO_MORE_INTERVALS) { //System.out.println(doc + ": " + intervals); - assertEquals(expected[id][i], pos); - assertEquals(expected[id][i], intervals.start()); - assertEquals(expected[id][i + 1], intervals.end()); + assertEquals("Wrong start value", expected[id][i], pos); + assertEquals("start() != pos returned from nextInterval()", expected[id][i], intervals.start()); + assertEquals("Wrong end value", expected[id][i + 1], intervals.end()); i += 2; } - assertEquals(expected[id].length, i); + assertEquals("Wrong number of endpoints", expected[id].length, i); if (i > 0) matchedDocs++; } @@ -215,4 +215,38 @@ public class TestIntervals extends LuceneTestCase { }); } + public void testUnorderedDistinct() throws IOException { + checkIntervals(Intervals.unordered(false, Intervals.term("pease"), Intervals.term("pease")), + "field1", 3, new int[][]{ + {}, + { 0, 3, 3, 6 }, + { 0, 3, 3, 6 }, + {}, + { 0, 3, 3, 6 }, + {} + }); + checkIntervals(Intervals.unordered(false, + Intervals.unordered(Intervals.term("pease"), Intervals.term("porridge"), Intervals.term("hot")), + Intervals.term("porridge")), + "field1", 3, new int[][]{ + {}, + { 1, 4, 4, 17 }, + { 1, 5, 4, 7 }, + {}, + { 1, 4, 4, 17 }, + {} + }); + checkIntervals(Intervals.unordered(false, + Intervals.unordered(Intervals.term("pease"), Intervals.term("porridge"), Intervals.term("hot")), + Intervals.term("porridge")), + "field2", 1, new int[][]{ + {}, + {}, + {}, + {}, + { 0, 3 }, + {} + }); + } + }