diff --git a/CHANGES.txt b/CHANGES.txt index f2d4e58dc07..f5e776b1c9a 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -161,6 +161,10 @@ Bug fixes 18. LUCENE-696: Fix bug when scorer for DisjunctionMaxQuery has skipTo() called on it before next(). (Yonik Seeley) +19. LUCENE-569: Fixed SpanNearQuery bug, for 'inOrder' queries it would fail + to recognize ordered spans if they overlaped with unordered spans. + (Paul Elschot via Chris Hostetter) + Optimizations 1. LUCENE-586: TermDocs.skipTo() is now more efficient for diff --git a/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java b/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java new file mode 100644 index 00000000000..e0469a1312b --- /dev/null +++ b/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java @@ -0,0 +1,265 @@ +package org.apache.lucene.search.spans; + +/** + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import java.util.Arrays; +import java.util.Comparator; + +import org.apache.lucene.index.IndexReader; + +/** A Spans that is formed from the ordered subspans of a SpanNearQuery + * where the subspans do not overlap and have a maximum slop between them. + *

+ * The formed spans only contains minimum slop matches.
+ * The matching slop is computed from the distance(s) between + * the non overlapping matching Spans.
+ * Successive matches are always formed from the successive Spans + * of the SpanNearQuery. + *

+ * The formed spans may contain overlaps when the slop is at least 1. + * For example, when querying using + *

t1 t2 t3
+ * with slop at least 1, the fragment: + *
t1 t2 t1 t3 t2 t3
+ * matches twice: + *
t1 t2 .. t3      
+ *
      t1 .. t2 t3
+ */ +class NearSpansOrdered implements Spans { + private final int allowedSlop; + private boolean firstTime = true; + private boolean more = false; + + /** The spans in the same order as the SpanNearQuery */ + private final Spans[] subSpans; + + /** Indicates that all subSpans have same doc() */ + private boolean inSameDoc = false; + + private int matchDoc = -1; + private int matchStart = -1; + private int matchEnd = -1; + + private final Spans[] subSpansByDoc; + private final Comparator spanDocComparator = new Comparator() { + public int compare(Object o1, Object o2) { + return ((Spans)o1).doc() - ((Spans)o2).doc(); + } + }; + + private SpanNearQuery query; + + public NearSpansOrdered(SpanNearQuery spanNearQuery, IndexReader reader) + throws IOException { + if (spanNearQuery.getClauses().length < 2) { + throw new IllegalArgumentException("Less than 2 clauses: " + + spanNearQuery); + } + allowedSlop = spanNearQuery.getSlop(); + SpanQuery[] clauses = spanNearQuery.getClauses(); + subSpans = new Spans[clauses.length]; + subSpansByDoc = new Spans[clauses.length]; + for (int i = 0; i < clauses.length; i++) { + subSpans[i] = clauses[i].getSpans(reader); + subSpansByDoc[i] = subSpans[i]; // used in toSameDoc() + } + query = spanNearQuery; // kept for toString() only. + } + + // inherit javadocs + public int doc() { return matchDoc; } + + // inherit javadocs + public int start() { return matchStart; } + + // inherit javadocs + public int end() { return matchEnd; } + + // inherit javadocs + public boolean next() throws IOException { + if (firstTime) { + firstTime = false; + for (int i = 0; i < subSpans.length; i++) { + if (! subSpans[i].next()) { + more = false; + return false; + } + } + more = true; + } + return advanceAfterOrdered(); + } + + // inherit javadocs + public boolean skipTo(int target) throws IOException { + if (firstTime) { + firstTime = false; + for (int i = 0; i < subSpans.length; i++) { + if (! subSpans[i].skipTo(target)) { + more = false; + return false; + } + } + more = true; + } else if (more && (subSpans[0].doc() < target)) { + if (subSpans[0].skipTo(target)) { + inSameDoc = false; + } else { + more = false; + return false; + } + } + return advanceAfterOrdered(); + } + + /** Advances the subSpans to just after an ordered match with a minimum slop + * that is smaller than the slop allowed by the SpanNearQuery. + * @return true iff there is such a match. + */ + private boolean advanceAfterOrdered() throws IOException { + while (more && (inSameDoc || toSameDoc())) { + if (stretchToOrder() && shrinkToAfterShortestMatch()) { + return true; + } + } + return false; // no more matches + } + + + /** Advance the subSpans to the same document */ + private boolean toSameDoc() throws IOException { + Arrays.sort(subSpansByDoc, spanDocComparator); + int firstIndex = 0; + int maxDoc = subSpansByDoc[subSpansByDoc.length - 1].doc(); + while (subSpansByDoc[firstIndex].doc() != maxDoc) { + if (! subSpansByDoc[firstIndex].skipTo(maxDoc)) { + more = false; + inSameDoc = false; + return false; + } + maxDoc = subSpansByDoc[firstIndex].doc(); + if (++firstIndex == subSpansByDoc.length) { + firstIndex = 0; + } + } + for (int i = 0; i < subSpansByDoc.length; i++) { + assert (subSpansByDoc[i].doc() == maxDoc) + : " NearSpansOrdered.toSameDoc() spans " + subSpansByDoc[0] + + "\n at doc " + subSpansByDoc[i].doc() + + ", but should be at " + maxDoc; + } + inSameDoc = true; + return true; + } + + /** Check whether two Spans in the same document are ordered. + * @param spans1 + * @param spans2 + * @return true iff spans1 starts before spans2 + * or the spans start at the same position, + * and spans1 ends before spans2. + */ + static final boolean docSpansOrdered(Spans spans1, Spans spans2) { + assert spans1.doc() == spans2.doc() : "doc1 " + spans1.doc() + " != doc2 " + spans2.doc(); + int start1 = spans1.start(); + int start2 = spans2.start(); + /* Do not call docSpansOrdered(int,int,int,int) to avoid invoking .end() : */ + return (start1 == start2) ? (spans1.end() < spans2.end()) : (start1 < start2); + } + + /** Like {@link #docSpansOrdered(Spans,Spans)}, but use the spans + * starts and ends as parameters. + */ + private static final boolean docSpansOrdered(int start1, int end1, int start2, int end2) { + return (start1 == start2) ? (end1 < end2) : (start1 < start2); + } + + /** Order the subSpans within the same document by advancing all later spans + * after the previous one. + */ + private boolean stretchToOrder() throws IOException { + matchDoc = subSpans[0].doc(); + for (int i = 1; inSameDoc && (i < subSpans.length); i++) { + while (! docSpansOrdered(subSpans[i-1], subSpans[i])) { + if (! subSpans[i].next()) { + inSameDoc = false; + more = false; + break; + } else if (matchDoc != subSpans[i].doc()) { + inSameDoc = false; + break; + } + } + } + return inSameDoc; + } + + /** The subSpans are ordered in the same doc, so there is a possible match. + * Compute the slop while making the match as short as possible by advancing + * all subSpans except the last one in reverse order. + */ + private boolean shrinkToAfterShortestMatch() throws IOException { + matchStart = subSpans[subSpans.length - 1].start(); + matchEnd = subSpans[subSpans.length - 1].end(); + int matchSlop = 0; + int lastStart = matchStart; + int lastEnd = matchEnd; + for (int i = subSpans.length - 2; i >= 0; i--) { + Spans prevSpans = subSpans[i]; + int prevStart = prevSpans.start(); + int prevEnd = prevSpans.end(); + while (true) { // Advance prevSpans until after (lastStart, lastEnd) + if (! prevSpans.next()) { + inSameDoc = false; + more = false; + break; // Check remaining subSpans for final match. + } else if (matchDoc != prevSpans.doc()) { + inSameDoc = false; // The last subSpans is not advanced here. + break; // Check remaining subSpans for last match in this document. + } else { + int ppStart = prevSpans.start(); + int ppEnd = prevSpans.end(); // Cannot avoid invoking .end() + if (! docSpansOrdered(ppStart, ppEnd, lastStart, lastEnd)) { + break; // Check remaining subSpans. + } else { // prevSpans still before (lastStart, lastEnd) + prevStart = ppStart; + prevEnd = ppEnd; + } + } + } + assert prevStart <= matchStart; + if (matchStart > prevEnd) { // Only non overlapping spans add to slop. + matchSlop += (matchStart - prevEnd); + } + /* Do not break on (matchSlop > allowedSlop) here to make sure + * that subSpans[0] is advanced after the match, if any. + */ + matchStart = prevStart; + lastStart = prevStart; + lastEnd = prevEnd; + } + return matchSlop <= allowedSlop; // ordered and allowed slop + } + + public String toString() { + return getClass().getName() + "("+query.toString()+")@"+ + (firstTime?"START":(more?(doc()+":"+start()+"-"+end()):"END")); + } +} + diff --git a/src/java/org/apache/lucene/search/spans/NearSpans.java b/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java similarity index 58% rename from src/java/org/apache/lucene/search/spans/NearSpans.java rename to src/java/org/apache/lucene/search/spans/NearSpansUnordered.java index aef8b386c2b..6fcf9a9c8c9 100644 --- a/src/java/org/apache/lucene/search/spans/NearSpans.java +++ b/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java @@ -1,7 +1,7 @@ package org.apache.lucene.search.spans; /** - * Copyright 2004 The Apache Software Foundation + * Copyright 2006 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,12 +24,11 @@ import java.util.ArrayList; import org.apache.lucene.index.IndexReader; import org.apache.lucene.util.PriorityQueue; -class NearSpans implements Spans { +class NearSpansUnordered implements Spans { private SpanNearQuery query; private List ordered = new ArrayList(); // spans in query order private int slop; // from query - private boolean inOrder; // from query private SpansCell first; // linked list of spans private SpansCell last; // sorted by doc only @@ -51,15 +50,7 @@ class NearSpans implements Spans { SpansCell spans1 = (SpansCell)o1; SpansCell spans2 = (SpansCell)o2; if (spans1.doc() == spans2.doc()) { - if (spans1.start() == spans2.start()) { - if (spans1.end() == spans2.end()) { - return spans1.index > spans2.index; - } else { - return spans1.end() < spans2.end(); - } - } else { - return spans1.start() < spans2.start(); - } + return NearSpansOrdered.docSpansOrdered(spans1, spans2); } else { return spans1.doc() < spans2.doc(); } @@ -80,39 +71,28 @@ class NearSpans implements Spans { } public boolean next() throws IOException { - if (length != -1) // subtract old length - totalLength -= length; - - boolean more = spans.next(); // move to next - - if (more) { - length = end() - start(); // compute new length - totalLength += length; // add new length to total - - if (max == null || doc() > max.doc() || // maintain max - (doc() == max.doc() && end() > max.end())) - max = this; - } - - return more; + return adjust(spans.next()); } public boolean skipTo(int target) throws IOException { - if (length != -1) // subtract old length - totalLength -= length; - - boolean more = spans.skipTo(target); // skip - - if (more) { - length = end() - start(); // compute new length - totalLength += length; // add new length to total - - if (max == null || doc() > max.doc() || // maintain max - (doc() == max.doc() && end() > max.end())) - max = this; + return adjust(spans.skipTo(target)); + } + + private boolean adjust(boolean condition) { + if (length != -1) { + totalLength -= length; // subtract old length } + if (condition) { + length = end() - start(); + totalLength += length; // add new length - return more; + if (max == null || doc() > max.doc() + || (doc() == max.doc()) && (end() > max.end())) { + max = this; + } + } + more = condition; + return condition; } public int doc() { return spans.doc(); } @@ -122,30 +102,32 @@ class NearSpans implements Spans { public String toString() { return spans.toString() + "#" + index; } } - public NearSpans(SpanNearQuery query, IndexReader reader) + + public NearSpansUnordered(SpanNearQuery query, IndexReader reader) throws IOException { this.query = query; this.slop = query.getSlop(); - this.inOrder = query.isInOrder(); - SpanQuery[] clauses = query.getClauses(); // initialize spans & list + SpanQuery[] clauses = query.getClauses(); queue = new CellQueue(clauses.length); for (int i = 0; i < clauses.length; i++) { - SpansCell cell = // construct clause spans + SpansCell cell = new SpansCell(clauses[i].getSpans(reader), i); - ordered.add(cell); // add to ordered + ordered.add(cell); } } public boolean next() throws IOException { if (firstTime) { initList(true); - listToQueue(); // initialize queue + listToQueue(); // initialize queue firstTime = false; } else if (more) { - more = min().next(); // trigger further scanning - if (more) - queue.adjustTop(); // maintain queue + if (min().next()) { // trigger further scanning + queue.adjustTop(); // maintain queue + } else { + more = false; + } } while (more) { @@ -174,21 +156,13 @@ class NearSpans implements Spans { queueStale = false; } - if (atMatch()) + if (atMatch()) { return true; + } - // trigger further scanning - if (inOrder && checkSlop()) { - /* There is a non ordered match within slop and an ordered match is needed. */ - more = firstNonOrderedNextToPartialList(); - if (more) { - partialListToQueue(); - } - } else { - more = min().next(); - if (more) { - queue.adjustTop(); // maintain queue - } + more = min().next(); + if (more) { + queue.adjustTop(); // maintain queue } } return false; // no more matches @@ -206,20 +180,14 @@ class NearSpans implements Spans { firstTime = false; } else { // normal case while (more && min().doc() < target) { // skip as needed - more = min().skipTo(target); - if (more) + if (min().skipTo(target)) { queue.adjustTop(); + } else { + more = false; + } } } - if (more) { - - if (atMatch()) // at a match? - return true; - - return next(); // no, scan - } - - return false; + return more && (atMatch() || next()); } private SpansCell min() { return (SpansCell)queue.top(); } @@ -230,7 +198,7 @@ class NearSpans implements Spans { public String toString() { - return "spans("+query.toString()+")@"+ + return getClass().getName() + "("+query.toString()+")@"+ (firstTime?"START":(more?(doc()+":"+start()+"-"+end()):"END")); } @@ -268,35 +236,8 @@ class NearSpans implements Spans { } } - private boolean firstNonOrderedNextToPartialList() throws IOException { - /* Creates a partial list consisting of first non ordered and earlier. - * Returns first non ordered .next(). - */ - last = first = null; - int orderedIndex = 0; - while (queue.top() != null) { - SpansCell cell = (SpansCell)queue.pop(); - addToList(cell); - if (cell.index == orderedIndex) { - orderedIndex++; - } else { - return cell.next(); - // FIXME: continue here, rename to eg. checkOrderedMatch(): - // when checkSlop() and not ordered, repeat cell.next(). - // when checkSlop() and ordered, add to list and repeat queue.pop() - // without checkSlop(): no match, rebuild the queue from the partial list. - // When queue is empty and checkSlop() and ordered there is a match. - } - } - throw new RuntimeException("Unexpected: ordered"); - } - private void listToQueue() { queue.clear(); // rebuild queue - partialListToQueue(); - } - - private void partialListToQueue() { for (SpansCell cell = first; cell != null; cell = cell.next) { queue.put(cell); // add to queue from list } @@ -304,23 +245,6 @@ class NearSpans implements Spans { private boolean atMatch() { return (min().doc() == max.doc()) - && checkSlop() - && (!inOrder || matchIsOrdered()); - } - - private boolean checkSlop() { - int matchLength = max.end() - min().start(); - return (matchLength - totalLength) <= slop; - } - - private boolean matchIsOrdered() { - int lastStart = -1; - for (int i = 0; i < ordered.size(); i++) { - int start = ((SpansCell)ordered.get(i)).start(); - if (!(start > lastStart)) - return false; - lastStart = start; - } - return true; + && ((max.end() - min().start() - totalLength) <= slop); } } diff --git a/src/java/org/apache/lucene/search/spans/SpanNearQuery.java b/src/java/org/apache/lucene/search/spans/SpanNearQuery.java index 4da21d108ba..708463e90b0 100644 --- a/src/java/org/apache/lucene/search/spans/SpanNearQuery.java +++ b/src/java/org/apache/lucene/search/spans/SpanNearQuery.java @@ -124,7 +124,9 @@ public class SpanNearQuery extends SpanQuery { if (clauses.size() == 1) // optimize 1-clause case return ((SpanQuery)clauses.get(0)).getSpans(reader); - return new NearSpans(this, reader); + return inOrder + ? (Spans) new NearSpansOrdered(this, reader) + : (Spans) new NearSpansUnordered(this, reader); } public Query rewrite(IndexReader reader) throws IOException { diff --git a/src/test/org/apache/lucene/search/spans/TestNearSpansOrdered.java b/src/test/org/apache/lucene/search/spans/TestNearSpansOrdered.java new file mode 100644 index 00000000000..52723d00359 --- /dev/null +++ b/src/test/org/apache/lucene/search/spans/TestNearSpansOrdered.java @@ -0,0 +1,182 @@ +package org.apache.lucene.search.spans; + +/** + * Copyright 2006 Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.Weight; +import org.apache.lucene.search.CheckHits; +import org.apache.lucene.store.RAMDirectory; + +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; + +import org.apache.lucene.analysis.WhitespaceAnalyzer; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; + +import org.apache.lucene.queryParser.QueryParser; + +import junit.framework.TestCase; + +public class TestNearSpansOrdered extends TestCase { + protected IndexSearcher searcher; + + public static final String FIELD = "field"; + public static final QueryParser qp = + new QueryParser(FIELD, new WhitespaceAnalyzer()); + + public void tearDown() throws Exception { + searcher.close(); + } + + public void setUp() throws Exception { + RAMDirectory directory = new RAMDirectory(); + IndexWriter writer= new IndexWriter(directory, new WhitespaceAnalyzer(), true); + for (int i = 0; i < docFields.length; i++) { + Document doc = new Document(); + doc.add(new Field(FIELD, docFields[i], Field.Store.NO, Field.Index.TOKENIZED)); + writer.addDocument(doc); + } + writer.close(); + searcher = new IndexSearcher(directory); + } + + protected String[] docFields = { + "w1 w2 w3 w4 w5", + "w1 w3 w2 w3 zz", + "w1 xx w2 yy w3", + "w1 w3 xx w2 yy w3 zz" + }; + + protected SpanNearQuery makeQuery(String s1, String s2, String s3, + int slop, boolean inOrder) { + return new SpanNearQuery + (new SpanQuery[] { + new SpanTermQuery(new Term(FIELD, s1)), + new SpanTermQuery(new Term(FIELD, s2)), + new SpanTermQuery(new Term(FIELD, s3)) }, + slop, + inOrder); + } + protected SpanNearQuery makeQuery() { + return makeQuery("w1","w2","w3",1,true); + } + + public void testSpanNearQuery() throws Exception { + SpanNearQuery q = makeQuery(); + CheckHits.checkHits(q, FIELD, searcher, new int[] {0,1}); + } + + public String s(Spans span) { + return s(span.doc(), span.start(), span.end()); + } + public String s(int doc, int start, int end) { + return "s(" + doc + "," + start + "," + end +")"; + } + + public void testNearSpansNext() throws Exception { + SpanNearQuery q = makeQuery(); + Spans span = q.getSpans(searcher.getIndexReader()); + assertEquals(true, span.next()); + assertEquals(s(0,0,3), s(span)); + assertEquals(true, span.next()); + assertEquals(s(1,0,4), s(span)); + assertEquals(false, span.next()); + } + + /** + * test does not imply that skipTo(doc+1) should work exactly the + * same as next -- it's only applicable in this case since we know doc + * does not contain more than one span + */ + public void testNearSpansSkipToLikeNext() throws Exception { + SpanNearQuery q = makeQuery(); + Spans span = q.getSpans(searcher.getIndexReader()); + assertEquals(true, span.skipTo(0)); + assertEquals(s(0,0,3), s(span)); + assertEquals(true, span.skipTo(1)); + assertEquals(s(1,0,4), s(span)); + assertEquals(false, span.skipTo(2)); + } + + public void testNearSpansNextThenSkipTo() throws Exception { + SpanNearQuery q = makeQuery(); + Spans span = q.getSpans(searcher.getIndexReader()); + assertEquals(true, span.next()); + assertEquals(s(0,0,3), s(span)); + assertEquals(true, span.skipTo(1)); + assertEquals(s(1,0,4), s(span)); + assertEquals(false, span.next()); + } + + public void testNearSpansNextThenSkipPast() throws Exception { + SpanNearQuery q = makeQuery(); + Spans span = q.getSpans(searcher.getIndexReader()); + assertEquals(true, span.next()); + assertEquals(s(0,0,3), s(span)); + assertEquals(false, span.skipTo(2)); + } + + public void testNearSpansSkipPast() throws Exception { + SpanNearQuery q = makeQuery(); + Spans span = q.getSpans(searcher.getIndexReader()); + assertEquals(false, span.skipTo(2)); + } + + public void testNearSpansSkipTo0() throws Exception { + SpanNearQuery q = makeQuery(); + Spans span = q.getSpans(searcher.getIndexReader()); + assertEquals(true, span.skipTo(0)); + assertEquals(s(0,0,3), s(span)); + } + + public void testNearSpansSkipTo1() throws Exception { + SpanNearQuery q = makeQuery(); + Spans span = q.getSpans(searcher.getIndexReader()); + assertEquals(true, span.skipTo(1)); + assertEquals(s(1,0,4), s(span)); + } + + /** + * not a direct test of NearSpans, but a demonstration of how/when + * this causes problems + */ + public void testSpanNearScorerSkipTo1() throws Exception { + SpanNearQuery q = makeQuery(); + Weight w = q.createWeight(searcher); + Scorer s = w.scorer(searcher.getIndexReader()); + assertEquals(true, s.skipTo(1)); + assertEquals(1, s.doc()); + } + /** + * not a direct test of NearSpans, but a demonstration of how/when + * this causes problems + */ + public void testSpanNearScorerExplain() throws Exception { + SpanNearQuery q = makeQuery(); + Weight w = q.createWeight(searcher); + Scorer s = w.scorer(searcher.getIndexReader()); + Explanation e = s.explain(1); + assertTrue("Scorer explanation value for doc#1 isn't positive: " + + e.toString(), + 0.0f < e.getValue()); + } + +} diff --git a/src/test/org/apache/lucene/search/spans/TestSpanExplanations.java b/src/test/org/apache/lucene/search/spans/TestSpanExplanations.java index c1ef6614404..57a66a434c6 100644 --- a/src/test/org/apache/lucene/search/spans/TestSpanExplanations.java +++ b/src/test/org/apache/lucene/search/spans/TestSpanExplanations.java @@ -151,6 +151,10 @@ public class TestSpanExplanations extends TestExplanations { SpanQuery q = snear("xx","w1","w2",0,false); qtest(q, new int[] {2}); } + public void testSNear11() throws Exception { + SpanQuery q = snear("w1","w2","w3",1,true); + qtest(q, new int[] {0,1}); + } /* some SpanNotQueries */