diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 7818dbc779c..8d5a3489243 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -110,6 +110,9 @@ Bug Fixes containing non-BMP Unicode characters. (Dawid Weiss, Robert Muir, Mike McCandless) +* LUCENE-4224: Add in-order scorer to query time joining and the + out-of-order scorer throws an UOE. (Martijn van Groningen, Robert Muir) + Optimizations * LUCENE-4317: Improve reuse of internal TokenStreams and StringReader diff --git a/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java b/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java index da5b72f225b..368c018b7e2 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java @@ -99,9 +99,9 @@ class TermsIncludingScoreQuery extends Query { private TermsEnum segmentTermsEnum; public Explanation explain(AtomicReaderContext context, int doc) throws IOException { - SVInnerScorer scorer = (SVInnerScorer) scorer(context, true, false, context.reader().getLiveDocs()); + SVInnerScorer scorer = (SVInnerScorer) scorer(context, false, false, context.reader().getLiveDocs()); if (scorer != null) { - if (scorer.advance(doc) == doc) { + if (scorer.advanceForExplainOnly(doc) == doc) { return scorer.explain(); } } @@ -127,7 +127,13 @@ class TermsIncludingScoreQuery extends Query { } segmentTermsEnum = terms.iterator(segmentTermsEnum); - if (multipleValuesPerDocument) { + if (scoreDocsInOrder) { + if (multipleValuesPerDocument) { + return new MVInOrderScorer(this, acceptDocs, segmentTermsEnum, context.reader().maxDoc()); + } else { + return new SVInOrderScorer(this, acceptDocs, segmentTermsEnum, context.reader().maxDoc()); + } + } else if (multipleValuesPerDocument) { return new MVInnerScorer(this, acceptDocs, segmentTermsEnum, context.reader().maxDoc()); } else { return new SVInnerScorer(this, acceptDocs, segmentTermsEnum); @@ -182,8 +188,7 @@ class TermsIncludingScoreQuery extends Query { } scoreUpto = upto; - TermsEnum.SeekStatus status = termsEnum.seekCeil(terms.get(ords[upto++], spare), true); - if (status == TermsEnum.SeekStatus.FOUND) { + if (termsEnum.seekExact(terms.get(ords[upto++], spare), true)) { docsEnum = reuse = termsEnum.docs(acceptDocs, reuse, 0); } } while (docsEnum == null); @@ -192,6 +197,10 @@ class TermsIncludingScoreQuery extends Query { } public int advance(int target) throws IOException { + throw new UnsupportedOperationException("advance() isn't supported because doc ids are emitted out of order"); + } + + private int advanceForExplainOnly(int target) throws IOException { int docId; do { docId = nextDoc(); @@ -251,8 +260,7 @@ class TermsIncludingScoreQuery extends Query { } scoreUpto = upto; - TermsEnum.SeekStatus status = termsEnum.seekCeil(terms.get(ords[upto++], spare), true); - if (status == TermsEnum.SeekStatus.FOUND) { + if (termsEnum.seekExact(terms.get(ords[upto++], spare), true)) { docsEnum = reuse = termsEnum.docs(acceptDocs, reuse, 0); } } while (docsEnum == null); @@ -274,4 +282,89 @@ class TermsIncludingScoreQuery extends Query { } } + class SVInOrderScorer extends Scorer { + + final DocIdSetIterator matchingDocsIterator; + final float[] scores; + + int currentDoc = -1; + + SVInOrderScorer(Weight weight, Bits acceptDocs, TermsEnum termsEnum, int maxDoc) throws IOException { + super(weight); + FixedBitSet matchingDocs = new FixedBitSet(maxDoc); + this.scores = new float[maxDoc]; + fillDocsAndScores(matchingDocs, acceptDocs, termsEnum); + this.matchingDocsIterator = matchingDocs.iterator(); + } + + protected void fillDocsAndScores(FixedBitSet matchingDocs, Bits acceptDocs, TermsEnum termsEnum) throws IOException { + BytesRef spare = new BytesRef(); + DocsEnum docsEnum = null; + for (int i = 0; i < terms.size(); i++) { + if (termsEnum.seekExact(terms.get(ords[i], spare), true)) { + docsEnum = termsEnum.docs(acceptDocs, docsEnum, 0); + float score = TermsIncludingScoreQuery.this.scores[ords[i]]; + for (int doc = docsEnum.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = docsEnum.nextDoc()) { + matchingDocs.set(doc); + // In the case the same doc is also related to a another doc, a score might be overwritten. I think this + // can only happen in a many-to-many relation + scores[doc] = score; + } + } + } + } + + public float score() throws IOException { + return scores[currentDoc]; + } + + public float freq() throws IOException { + return 1; + } + + public int docID() { + return currentDoc; + } + + public int nextDoc() throws IOException { + return currentDoc = matchingDocsIterator.nextDoc(); + } + + public int advance(int target) throws IOException { + return currentDoc = matchingDocsIterator.advance(target); + } + } + + // This scorer deals with the fact that a document can have more than one score from multiple related documents. + class MVInOrderScorer extends SVInOrderScorer { + + MVInOrderScorer(Weight weight, Bits acceptDocs, TermsEnum termsEnum, int maxDoc) throws IOException { + super(weight, acceptDocs, termsEnum, maxDoc); + } + + @Override + protected void fillDocsAndScores(FixedBitSet matchingDocs, Bits acceptDocs, TermsEnum termsEnum) throws IOException { + BytesRef spare = new BytesRef(); + DocsEnum docsEnum = null; + for (int i = 0; i < terms.size(); i++) { + if (termsEnum.seekExact(terms.get(ords[i], spare), true)) { + docsEnum = termsEnum.docs(acceptDocs, docsEnum, 0); + float score = TermsIncludingScoreQuery.this.scores[ords[i]]; + for (int doc = docsEnum.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = docsEnum.nextDoc()) { + // I prefer this: + /*if (scores[doc] < score) { + scores[doc] = score; + matchingDocs.set(doc); + }*/ + // But this behaves the same as MVInnerScorer and only then the tests will pass: + if (!matchingDocs.get(doc)) { + scores[doc] = score; + matchingDocs.set(doc); + } + } + } + } + } + } + } diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestJoinUtil.java b/lucene/join/src/test/org/apache/lucene/search/join/TestJoinUtil.java index fe33f91b26f..2d428761358 100644 --- a/lucene/join/src/test/org/apache/lucene/search/join/TestJoinUtil.java +++ b/lucene/join/src/test/org/apache/lucene/search/join/TestJoinUtil.java @@ -22,13 +22,17 @@ import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; +import org.apache.lucene.index.AtomicReader; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.DocTermOrds; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.ReaderUtil; +import org.apache.lucene.index.SlowCompositeReaderWrapper; import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.Collector; import org.apache.lucene.search.DocIdSetIterator; @@ -53,7 +57,6 @@ import org.junit.Test; import java.io.IOException; import java.util.*; -@Slow public class TestJoinUtil extends LuceneTestCase { public void testSimple() throws Exception { @@ -229,6 +232,7 @@ public class TestJoinUtil extends LuceneTestCase { } @Test + @Slow public void testSingleValueRandomJoin() throws Exception { int maxIndexIter = _TestUtil.nextInt(random(), 6, 12); int maxSearchIter = _TestUtil.nextInt(random(), 13, 26); @@ -236,6 +240,7 @@ public class TestJoinUtil extends LuceneTestCase { } @Test + @Slow // This test really takes more time, that is why the number of iterations are smaller. public void testMultiValueRandomJoin() throws Exception { int maxIndexIter = _TestUtil.nextInt(random(), 3, 6); @@ -254,7 +259,8 @@ public class TestJoinUtil extends LuceneTestCase { dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.KEYWORD, false)).setMergePolicy(newLogMergePolicy()) ); - IndexIterationContext context = createContext(numberOfDocumentsToIndex, w, multipleValuesPerDocument); + final boolean scoreDocsInOrder = TestJoinUtil.random().nextBoolean(); + IndexIterationContext context = createContext(numberOfDocumentsToIndex, w, multipleValuesPerDocument, scoreDocsInOrder); IndexReader topLevelReader = w.getReader(); w.close(); @@ -310,7 +316,7 @@ public class TestJoinUtil extends LuceneTestCase { } public boolean acceptsDocsOutOfOrder() { - return topScoreDocCollector.acceptsDocsOutOfOrder(); + return scoreDocsInOrder; } }); // Asserting bit set... @@ -354,11 +360,11 @@ public class TestJoinUtil extends LuceneTestCase { } } - private IndexIterationContext createContext(int nDocs, RandomIndexWriter writer, boolean multipleValuesPerDocument) throws IOException { - return createContext(nDocs, writer, writer, multipleValuesPerDocument); + private IndexIterationContext createContext(int nDocs, RandomIndexWriter writer, boolean multipleValuesPerDocument, boolean scoreDocsInOrder) throws IOException { + return createContext(nDocs, writer, writer, multipleValuesPerDocument, scoreDocsInOrder); } - private IndexIterationContext createContext(int nDocs, RandomIndexWriter fromWriter, RandomIndexWriter toWriter, boolean multipleValuesPerDocument) throws IOException { + private IndexIterationContext createContext(int nDocs, RandomIndexWriter fromWriter, RandomIndexWriter toWriter, boolean multipleValuesPerDocument, boolean scoreDocsInOrder) throws IOException { IndexIterationContext context = new IndexIterationContext(); int numRandomValues = nDocs / 2; context.randomUniqueValues = new String[numRandomValues]; @@ -541,55 +547,81 @@ public class TestJoinUtil extends LuceneTestCase { final Map docToJoinScore = new HashMap(); if (multipleValuesPerDocument) { - toSearcher.search(new MatchAllDocsQuery(), new Collector() { + if (scoreDocsInOrder) { + AtomicReader slowCompositeReader = SlowCompositeReaderWrapper.wrap(toSearcher.getIndexReader()); + Terms terms = slowCompositeReader.terms(toField); + if (terms != null) { + DocsEnum docsEnum = null; + TermsEnum termsEnum = null; + SortedSet joinValues = new TreeSet(BytesRef.getUTF8SortedAsUnicodeComparator()); + joinValues.addAll(joinValueToJoinScores.keySet()); + for (BytesRef joinValue : joinValues) { + termsEnum = terms.iterator(termsEnum); + if (termsEnum.seekExact(joinValue, true)) { + docsEnum = termsEnum.docs(slowCompositeReader.getLiveDocs(), docsEnum, 0); + JoinScore joinScore = joinValueToJoinScores.get(joinValue); - private DocTermOrds docTermOrds; - private TermsEnum docTermsEnum; - private DocTermOrds.TermOrdsIterator reuse; - private int docBase; - - public void collect(int doc) throws IOException { - if (docTermOrds.isEmpty()) { - return; + for (int doc = docsEnum.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = docsEnum.nextDoc()) { + // First encountered join value determines the score. + // Something to keep in mind for many-to-many relations. + if (!docToJoinScore.containsKey(doc)) { + docToJoinScore.put(doc, joinScore); + } + } + } } + } + } else { + toSearcher.search(new MatchAllDocsQuery(), new Collector() { - reuse = docTermOrds.lookup(doc, reuse); - int[] buffer = new int[5]; + private DocTermOrds docTermOrds; + private TermsEnum docTermsEnum; + private DocTermOrds.TermOrdsIterator reuse; + private int docBase; - int chunk; - do { - chunk = reuse.read(buffer); - if (chunk == 0) { + public void collect(int doc) throws IOException { + if (docTermOrds.isEmpty()) { return; } - for (int idx = 0; idx < chunk; idx++) { - int key = buffer[idx]; - docTermsEnum.seekExact((long) key); - JoinScore joinScore = joinValueToJoinScores.get(docTermsEnum.term()); - if (joinScore == null) { - continue; - } - Integer basedDoc = docBase + doc; - // First encountered join value determines the score. - // Something to keep in mind for many-to-many relations. - if (!docToJoinScore.containsKey(basedDoc)) { - docToJoinScore.put(basedDoc, joinScore); - } - } - } while (chunk >= buffer.length); - } + reuse = docTermOrds.lookup(doc, reuse); + int[] buffer = new int[5]; - public void setNextReader(AtomicReaderContext context) throws IOException { - docBase = context.docBase; - docTermOrds = FieldCache.DEFAULT.getDocTermOrds(context.reader(), toField); - docTermsEnum = docTermOrds.getOrdTermsEnum(context.reader()); - reuse = null; - } + int chunk; + do { + chunk = reuse.read(buffer); + if (chunk == 0) { + return; + } - public boolean acceptsDocsOutOfOrder() {return false;} - public void setScorer(Scorer scorer) {} - }); + for (int idx = 0; idx < chunk; idx++) { + int key = buffer[idx]; + docTermsEnum.seekExact((long) key); + JoinScore joinScore = joinValueToJoinScores.get(docTermsEnum.term()); + if (joinScore == null) { + continue; + } + Integer basedDoc = docBase + doc; + // First encountered join value determines the score. + // Something to keep in mind for many-to-many relations. + if (!docToJoinScore.containsKey(basedDoc)) { + docToJoinScore.put(basedDoc, joinScore); + } + } + } while (chunk >= buffer.length); + } + + public void setNextReader(AtomicReaderContext context) throws IOException { + docBase = context.docBase; + docTermOrds = FieldCache.DEFAULT.getDocTermOrds(context.reader(), toField); + docTermsEnum = docTermOrds.getOrdTermsEnum(context.reader()); + reuse = null; + } + + public boolean acceptsDocsOutOfOrder() {return false;} + public void setScorer(Scorer scorer) {} + }); + } } else { toSearcher.search(new MatchAllDocsQuery(), new Collector() {