From 3ff331072a7435e971e35c2e28c38a90ca70802b Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Tue, 30 Jun 2020 09:13:55 +0100 Subject: [PATCH] LUCENE-9418: Fix ordered intervals over interleaved terms (#1618) Given the input text 'A B A C', an ordered interval 'A B C' will currently return an incorrect internal [2, 3] in addition to the correct [0, 3] interval. This is due to a bug in the ORDERED algorithm, where we assume that after the first interval is returned, the sub-intervals are always in-order. This assumption only holds during minimization, as minimizing an interval may move the earlier terms beyond the trailing terms. For example, after the initial [0, 3] interval is found above, the algorithm will attempt to minimize it by advancing A to [2,2]. Because this is still before C at [3,3], but after B at [1,1], we then try advancing B, leaving it at [Inf,Inf]. Minimization has failed, so we return the original interval of [0,3]. However, when we come to retrieve the next interval, our subintervals look like this: A[2,2], B[Inf,Inf], C[3,3] - the assumption that they are in order is broken. The algorithm sees that A is before B, assumes that therefore all subsequent subintervals are in order, and returns the new interval. This commit fixes things by changing the assumption of ordering to only hold during minimization. When first finding a candidate interval, the algorithm now checks that all sub-intervals appear in order. --- .../queries/intervals/OrderedIntervalsSource.java | 4 +++- .../queries/intervals/TestIntervalQuery.java | 10 +++++++++- .../lucene/queries/intervals/TestIntervals.java | 15 ++++++++++++++- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java index 6331688c0c8..2b7d1070f96 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java @@ -133,12 +133,13 @@ class OrderedIntervalsSource extends ConjunctionIntervalsSource { public int nextInterval() throws IOException { start = end = slop = IntervalIterator.NO_MORE_INTERVALS; int lastStart = Integer.MAX_VALUE; + boolean minimizing = false; i = 1; while (true) { while (true) { if (subIterators.get(i - 1).end() >= lastStart) return start; - if (i == subIterators.size() || subIterators.get(i).start() > subIterators.get(i - 1).end()) + if (i == subIterators.size() || (minimizing && subIterators.get(i).start() > subIterators.get(i - 1).end())) break; do { if (subIterators.get(i).end() >= lastStart || subIterators.get(i).nextInterval() == IntervalIterator.NO_MORE_INTERVALS) @@ -160,6 +161,7 @@ class OrderedIntervalsSource extends ConjunctionIntervalsSource { i = 1; if (subIterators.get(0).nextInterval() == IntervalIterator.NO_MORE_INTERVALS) return start; + minimizing = true; } } diff --git a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalQuery.java b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalQuery.java index 097b4abf4a7..2ca79b38f38 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalQuery.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalQuery.java @@ -73,7 +73,8 @@ public class TestIntervalQuery extends LuceneTestCase { "coordinate genome mapping research", "coordinate genome research", "greater new york", - "x x x x x intend x x x message x x x message x x x addressed x x" + "x x x x x intend x x x message x x x message x x x addressed x x", + "issue with intervals queries from search engine. So it's a big issue for us as we need to do ordered searches. Thank you to help us concerning that issue" }; private void checkHits(Query query, int[] results) throws IOException { @@ -262,6 +263,13 @@ public class TestIntervalQuery extends LuceneTestCase { checkHits(q, new int[]{ 6, 7 }); } + public void testOrderedWithGaps() throws IOException { + Query q = new IntervalQuery(field, Intervals.maxgaps(1, Intervals.ordered( + Intervals.term("issue"), Intervals.term("search"), Intervals.term("ordered") + ))); + checkHits(q, new int[]{}); + } + public void testNestedOrInContainedBy() throws IOException { Query q = new IntervalQuery(field, Intervals.containedBy( Intervals.term("genome"), diff --git a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java index 4addd2feba8..eda3934b0bd 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java @@ -73,7 +73,7 @@ public class TestIntervals extends LuceneTestCase { private static String field2_docs[] = { "In Xanadu did Kubla Khan a stately pleasure dome decree", "Where Alph the sacred river ran through caverns measureless to man", - "Down to a sunless sea", + "a b a c b a b c", "So thrice five miles of fertile ground", "Pease hot porridge porridge", "w1 w2 w3 w4 w1 w6 w3 w8 w4 w7 w1 w6" @@ -525,6 +525,14 @@ public class TestIntervals extends LuceneTestCase { assertEquals(4, source.minExtent()); } + public void testInterleavedOrdered() throws IOException { + IntervalsSource source = Intervals.ordered(Intervals.term("a"), Intervals.term("b"), Intervals.term("c")); + checkIntervals(source, "field2", 1, new int[][]{ + {}, {}, { 0, 3, 5, 7 }, {}, {}, {} + }); + assertGaps(source, 2, "field2", new int[]{ 1, 0 }); + } + public void testUnorderedDistinct() throws IOException { checkIntervals(Intervals.unorderedNoOverlaps(Intervals.term("pease"), Intervals.term("pease")), "field1", 3, new int[][]{ @@ -670,6 +678,11 @@ public class TestIntervals extends LuceneTestCase { assertMatch(mi, 0, 3, 0, 11); assertEquals(3, source.minExtent()); + assertEquals(source, source); + assertEquals(source, Intervals.maxgaps(1, + Intervals.unordered(Intervals.term("w1"), Intervals.term("w3"), Intervals.term("w4")))); + assertNotEquals(source, Intervals.maxgaps(2, + Intervals.unordered(Intervals.term("w1"), Intervals.term("w3"), Intervals.term("w4")))); }