From 4a3b51073921f590dafae2bc2d4b4fdd9db14051 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Tue, 28 Jun 2011 21:20:18 +0000 Subject: [PATCH] LUCENE-3171: add modules/join to enable joining parent + child documents when indexed as a doc block git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1140851 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/contrib/CHANGES.txt | 6 + .../apache/lucene/search/FieldComparator.java | 9 +- .../lucene/search/FieldValueHitQueue.java | 28 +- .../lucene/search/TopFieldCollector.java | 35 +- .../org/apache/lucene/util/ArrayUtil.java | 51 ++ .../org/apache/lucene/util/_TestUtil.java | 22 + .../apache/lucene/index/TestNRTThreads.java | 27 +- modules/build.xml | 7 + modules/join/build.xml | 39 ++ .../search/join/BlockJoinCollector.java | 472 +++++++++++++++++ .../lucene/search/join/BlockJoinQuery.java | 410 +++++++++++++++ .../apache/lucene/search/join/package.html | 32 ++ modules/join/src/java/overview.html | 5 + .../apache/lucene/search/TestBlockJoin.java | 476 ++++++++++++++++++ 14 files changed, 1562 insertions(+), 57 deletions(-) create mode 100644 modules/join/build.xml create mode 100644 modules/join/src/java/org/apache/lucene/search/join/BlockJoinCollector.java create mode 100644 modules/join/src/java/org/apache/lucene/search/join/BlockJoinQuery.java create mode 100644 modules/join/src/java/org/apache/lucene/search/join/package.html create mode 100644 modules/join/src/java/overview.html create mode 100644 modules/join/src/test/org/apache/lucene/search/TestBlockJoin.java diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index 1ce2f0b2ec9..c53a28e9deb 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -66,6 +66,12 @@ New Features highlighting speed up. Use FastVectorHighlighter.setPhraseLimit() to set limit (e.g. 5000). 
(Mike Sokolov via Koji Sekiguchi) + * LUCENE-3171: Added BlockJoinQuery and BlockJoinCollector, under the + new contrib/join module, to enable searches that require joining + between parent and child documents. Joined (children + parent) + documents must be indexed as a document block, using + IndexWriter.add/UpdateDocuments (Mark Harwood, Mike McCandless) + API Changes Bug Fixes diff --git a/lucene/src/java/org/apache/lucene/search/FieldComparator.java b/lucene/src/java/org/apache/lucene/search/FieldComparator.java index 4bb9406411b..98e58f9f9bc 100644 --- a/lucene/src/java/org/apache/lucene/search/FieldComparator.java +++ b/lucene/src/java/org/apache/lucene/search/FieldComparator.java @@ -793,8 +793,13 @@ public abstract class FieldComparator { @Override public void setScorer(Scorer scorer) { // wrap with a ScoreCachingWrappingScorer so that successive calls to - // score() will not incur score computation over and over again. - this.scorer = new ScoreCachingWrappingScorer(scorer); + // score() will not incur score computation over and + // over again. 
+ if (!(scorer instanceof ScoreCachingWrappingScorer)) { + this.scorer = new ScoreCachingWrappingScorer(scorer); + } else { + this.scorer = scorer; + } } @Override diff --git a/lucene/src/java/org/apache/lucene/search/FieldValueHitQueue.java b/lucene/src/java/org/apache/lucene/search/FieldValueHitQueue.java index d32f0a231a4..ab6a30c62f3 100644 --- a/lucene/src/java/org/apache/lucene/search/FieldValueHitQueue.java +++ b/lucene/src/java/org/apache/lucene/search/FieldValueHitQueue.java @@ -31,12 +31,12 @@ import org.apache.lucene.util.PriorityQueue; * @see IndexSearcher#search(Query,Filter,int,Sort) * @see FieldCache */ -public abstract class FieldValueHitQueue extends PriorityQueue { +public abstract class FieldValueHitQueue extends PriorityQueue { - final static class Entry extends ScoreDoc { - int slot; + public static class Entry extends ScoreDoc { + public int slot; - Entry(int slot, int doc, float score) { + public Entry(int slot, int doc, float score) { super(doc, score); this.slot = slot; } @@ -51,7 +51,7 @@ public abstract class FieldValueHitQueue extends PriorityQueue extends FieldValueHitQueue { private final int oneReverseMul; public OneComparatorFieldValueHitQueue(SortField[] fields, int size) @@ -92,7 +92,7 @@ public abstract class FieldValueHitQueue extends PriorityQueue extends FieldValueHitQueue { public MultiComparatorsFieldValueHitQueue(SortField[] fields, int size) throws IOException { @@ -156,24 +156,28 @@ public abstract class FieldValueHitQueue extends PriorityQueue FieldValueHitQueue create(SortField[] fields, int size) throws IOException { if (fields.length == 0) { throw new IllegalArgumentException("Sort must contain at least one field"); } if (fields.length == 1) { - return new OneComparatorFieldValueHitQueue(fields, size); + return new OneComparatorFieldValueHitQueue(fields, size); } else { - return new MultiComparatorsFieldValueHitQueue(fields, size); + return new MultiComparatorsFieldValueHitQueue(fields, size); } } - FieldComparator[] 
getComparators() { return comparators; } + public FieldComparator[] getComparators() { + return comparators; + } - int[] getReverseMul() { return reverseMul; } + public int[] getReverseMul() { + return reverseMul; + } - protected void setComparator(int pos, FieldComparator comparator) { + public void setComparator(int pos, FieldComparator comparator) { if (pos==0) firstComparator = comparator; comparators[pos] = comparator; } diff --git a/lucene/src/java/org/apache/lucene/search/TopFieldCollector.java b/lucene/src/java/org/apache/lucene/search/TopFieldCollector.java index bc10124e90c..1b466fca35f 100644 --- a/lucene/src/java/org/apache/lucene/search/TopFieldCollector.java +++ b/lucene/src/java/org/apache/lucene/search/TopFieldCollector.java @@ -48,9 +48,9 @@ public abstract class TopFieldCollector extends TopDocsCollector { FieldComparator comparator; final int reverseMul; - final FieldValueHitQueue queue; + final FieldValueHitQueue queue; - public OneComparatorNonScoringCollector(FieldValueHitQueue queue, + public OneComparatorNonScoringCollector(FieldValueHitQueue queue, int numHits, boolean fillFields) throws IOException { super(queue, numHits, fillFields); this.queue = queue; @@ -113,7 +113,7 @@ public abstract class TopFieldCollector extends TopDocsCollector { private static class OutOfOrderOneComparatorNonScoringCollector extends OneComparatorNonScoringCollector { - public OutOfOrderOneComparatorNonScoringCollector(FieldValueHitQueue queue, + public OutOfOrderOneComparatorNonScoringCollector(FieldValueHitQueue queue, int numHits, boolean fillFields) throws IOException { super(queue, numHits, fillFields); } @@ -160,7 +160,7 @@ public abstract class TopFieldCollector extends TopDocsCollector { Scorer scorer; - public OneComparatorScoringNoMaxScoreCollector(FieldValueHitQueue queue, + public OneComparatorScoringNoMaxScoreCollector(FieldValueHitQueue queue, int numHits, boolean fillFields) throws IOException { super(queue, numHits, fillFields); } @@ -221,7 +221,7 
@@ public abstract class TopFieldCollector extends TopDocsCollector { OneComparatorScoringNoMaxScoreCollector { public OutOfOrderOneComparatorScoringNoMaxScoreCollector( - FieldValueHitQueue queue, int numHits, boolean fillFields) + FieldValueHitQueue queue, int numHits, boolean fillFields) throws IOException { super(queue, numHits, fillFields); } @@ -274,7 +274,7 @@ public abstract class TopFieldCollector extends TopDocsCollector { Scorer scorer; - public OneComparatorScoringMaxScoreCollector(FieldValueHitQueue queue, + public OneComparatorScoringMaxScoreCollector(FieldValueHitQueue queue, int numHits, boolean fillFields) throws IOException { super(queue, numHits, fillFields); // Must set maxScore to NEG_INF, or otherwise Math.max always returns NaN. @@ -334,7 +334,7 @@ public abstract class TopFieldCollector extends TopDocsCollector { private static class OutOfOrderOneComparatorScoringMaxScoreCollector extends OneComparatorScoringMaxScoreCollector { - public OutOfOrderOneComparatorScoringMaxScoreCollector(FieldValueHitQueue queue, + public OutOfOrderOneComparatorScoringMaxScoreCollector(FieldValueHitQueue queue, int numHits, boolean fillFields) throws IOException { super(queue, numHits, fillFields); } @@ -384,8 +384,8 @@ public abstract class TopFieldCollector extends TopDocsCollector { final FieldComparator[] comparators; final int[] reverseMul; - final FieldValueHitQueue queue; - public MultiComparatorNonScoringCollector(FieldValueHitQueue queue, + final FieldValueHitQueue queue; + public MultiComparatorNonScoringCollector(FieldValueHitQueue queue, int numHits, boolean fillFields) throws IOException { super(queue, numHits, fillFields); this.queue = queue; @@ -471,7 +471,7 @@ public abstract class TopFieldCollector extends TopDocsCollector { private static class OutOfOrderMultiComparatorNonScoringCollector extends MultiComparatorNonScoringCollector { - public OutOfOrderMultiComparatorNonScoringCollector(FieldValueHitQueue queue, + public 
OutOfOrderMultiComparatorNonScoringCollector(FieldValueHitQueue queue, int numHits, boolean fillFields) throws IOException { super(queue, numHits, fillFields); } @@ -540,7 +540,7 @@ public abstract class TopFieldCollector extends TopDocsCollector { Scorer scorer; - public MultiComparatorScoringMaxScoreCollector(FieldValueHitQueue queue, + public MultiComparatorScoringMaxScoreCollector(FieldValueHitQueue queue, int numHits, boolean fillFields) throws IOException { super(queue, numHits, fillFields); // Must set maxScore to NEG_INF, or otherwise Math.max always returns NaN. @@ -619,7 +619,7 @@ public abstract class TopFieldCollector extends TopDocsCollector { private final static class OutOfOrderMultiComparatorScoringMaxScoreCollector extends MultiComparatorScoringMaxScoreCollector { - public OutOfOrderMultiComparatorScoringMaxScoreCollector(FieldValueHitQueue queue, + public OutOfOrderMultiComparatorScoringMaxScoreCollector(FieldValueHitQueue queue, int numHits, boolean fillFields) throws IOException { super(queue, numHits, fillFields); } @@ -692,7 +692,7 @@ public abstract class TopFieldCollector extends TopDocsCollector { Scorer scorer; - public MultiComparatorScoringNoMaxScoreCollector(FieldValueHitQueue queue, + public MultiComparatorScoringNoMaxScoreCollector(FieldValueHitQueue queue, int numHits, boolean fillFields) throws IOException { super(queue, numHits, fillFields); } @@ -771,7 +771,7 @@ public abstract class TopFieldCollector extends TopDocsCollector { extends MultiComparatorScoringNoMaxScoreCollector { public OutOfOrderMultiComparatorScoringNoMaxScoreCollector( - FieldValueHitQueue queue, int numHits, boolean fillFields) + FieldValueHitQueue queue, int numHits, boolean fillFields) throws IOException { super(queue, numHits, fillFields); } @@ -917,7 +917,7 @@ public abstract class TopFieldCollector extends TopDocsCollector { throw new IllegalArgumentException("numHits must be > 0; please use TotalHitCountCollector if you just need the total hit count"); } 
- FieldValueHitQueue queue = FieldValueHitQueue.create(sort.fields, numHits); + FieldValueHitQueue queue = FieldValueHitQueue.create(sort.fields, numHits); if (queue.getComparators().length == 1) { if (docsScoredInOrder) { if (trackMaxScore) { @@ -972,7 +972,7 @@ public abstract class TopFieldCollector extends TopDocsCollector { protected void populateResults(ScoreDoc[] results, int howMany) { if (fillFields) { // avoid casting if unnecessary. - FieldValueHitQueue queue = (FieldValueHitQueue) pq; + FieldValueHitQueue queue = (FieldValueHitQueue) pq; for (int i = howMany - 1; i >= 0; i--) { results[i] = queue.fillFields(queue.pop()); } @@ -993,12 +993,11 @@ public abstract class TopFieldCollector extends TopDocsCollector { } // If this is a maxScoring tracking collector and there were no results, - return new TopFieldDocs(totalHits, results, ((FieldValueHitQueue) pq).getFields(), maxScore); + return new TopFieldDocs(totalHits, results, ((FieldValueHitQueue) pq).getFields(), maxScore); } @Override public boolean acceptsDocsOutOfOrder() { return false; } - } diff --git a/lucene/src/java/org/apache/lucene/util/ArrayUtil.java b/lucene/src/java/org/apache/lucene/util/ArrayUtil.java index 236b9d8c403..f537c77827c 100644 --- a/lucene/src/java/org/apache/lucene/util/ArrayUtil.java +++ b/lucene/src/java/org/apache/lucene/util/ArrayUtil.java @@ -393,6 +393,56 @@ public final class ArrayUtil { return array; } + public static int[][] grow(int[][] array, int minSize) { + if (array.length < minSize) { + int[][] newArray = new int[oversize(minSize, RamUsageEstimator.NUM_BYTES_OBJECT_REF)][]; + System.arraycopy(array, 0, newArray, 0, array.length); + return newArray; + } else { + return array; + } + } + + public static int[][] grow(int[][] array) { + return grow(array, 1 + array.length); + } + + public static int[][] shrink(int[][] array, int targetSize) { + final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_OBJECT_REF); + if (newSize != 
array.length) { + int[][] newArray = new int[newSize][]; + System.arraycopy(array, 0, newArray, 0, newSize); + return newArray; + } else { + return array; + } + } + + public static float[][] grow(float[][] array, int minSize) { + if (array.length < minSize) { + float[][] newArray = new float[oversize(minSize, RamUsageEstimator.NUM_BYTES_OBJECT_REF)][]; + System.arraycopy(array, 0, newArray, 0, array.length); + return newArray; + } else { + return array; + } + } + + public static float[][] grow(float[][] array) { + return grow(array, 1 + array.length); + } + + public static float[][] shrink(float[][] array, int targetSize) { + final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_OBJECT_REF); + if (newSize != array.length) { + float[][] newArray = new float[newSize][]; + System.arraycopy(array, 0, newArray, 0, newSize); + return newArray; + } else { + return array; + } + } + /** * Returns hash of chars in range start (inclusive) to * end (inclusive) @@ -617,6 +667,7 @@ public final class ArrayUtil { */ public static void mergeSort(T[] a, int fromIndex, int toIndex, Comparator comp) { if (toIndex-fromIndex <= 1) return; + //System.out.println("SORT: " + (toIndex-fromIndex)); getSorter(a, comp).mergeSort(fromIndex, toIndex-1); } diff --git a/lucene/src/test-framework/org/apache/lucene/util/_TestUtil.java b/lucene/src/test-framework/org/apache/lucene/util/_TestUtil.java index a42941e51cd..a25283bbd8e 100644 --- a/lucene/src/test-framework/org/apache/lucene/util/_TestUtil.java +++ b/lucene/src/test-framework/org/apache/lucene/util/_TestUtil.java @@ -35,6 +35,7 @@ import java.util.zip.ZipEntry; import java.util.zip.ZipFile; import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; import org.apache.lucene.document.Fieldable; import org.apache.lucene.index.CheckIndex; import org.apache.lucene.index.ConcurrentMergeScheduler; @@ -491,4 +492,25 @@ public class _TestUtil { } } } + + // NOTE: this is likely buggy, 
and cannot clone fields + // with tokenStreamValues, etc. Use at your own risk!! + + // TODO: is there a pre-existing way to do this!!! + public static Document cloneDocument(Document doc1) { + final Document doc2 = new Document(); + for(Fieldable f : doc1.getFields()) { + Field field1 = (Field) f; + + Field field2 = new Field(field1.name(), + field1.stringValue(), + field1.isStored() ? Field.Store.YES : Field.Store.NO, + field1.isIndexed() ? (field1.isTokenized() ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED) : Field.Index.NO); + field2.setOmitNorms(field1.getOmitNorms()); + field2.setOmitTermFreqAndPositions(field1.getOmitTermFreqAndPositions()); + doc2.add(field2); + } + + return doc2; + } } diff --git a/lucene/src/test/org/apache/lucene/index/TestNRTThreads.java b/lucene/src/test/org/apache/lucene/index/TestNRTThreads.java index fd9ed5f59e1..44d94710272 100644 --- a/lucene/src/test/org/apache/lucene/index/TestNRTThreads.java +++ b/lucene/src/test/org/apache/lucene/index/TestNRTThreads.java @@ -33,7 +33,6 @@ import java.util.concurrent.atomic.AtomicInteger; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.document.Fieldable; import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.PhraseQuery; @@ -69,28 +68,6 @@ public class TestNRTThreads extends LuceneTestCase { } } - // TODO: is there a pre-existing way to do this!!! - private Document cloneDoc(Document doc1) { - final Document doc2 = new Document(); - for(Fieldable f : doc1.getFields()) { - Field field1 = (Field) f; - - Field field2 = new Field(field1.name(), - field1.stringValue(), - field1.isStored() ? Field.Store.YES : Field.Store.NO, - field1.isIndexed() ? (field1.isTokenized() ? 
Field.Index.ANALYZED : Field.Index.NOT_ANALYZED) : Field.Index.NO); - if (field1.getOmitNorms()) { - field2.setOmitNorms(true); - } - if (field1.getOmitTermFreqAndPositions()) { - field2.setOmitTermFreqAndPositions(true); - } - doc2.add(field2); - } - - return doc2; - } - @Test public void testNRTThreads() throws Exception { @@ -218,7 +195,7 @@ public class TestNRTThreads extends LuceneTestCase { allSubDocs.add(subDocs); doc.add(packIDField); - docsList.add(cloneDoc(doc)); + docsList.add(_TestUtil.cloneDocument(doc)); docIDs.add(doc.get("docid")); final int maxDocCount = _TestUtil.nextInt(random, 1, 10); @@ -227,7 +204,7 @@ public class TestNRTThreads extends LuceneTestCase { if (doc == null) { break; } - docsList.add(cloneDoc(doc)); + docsList.add(_TestUtil.cloneDocument(doc)); docIDs.add(doc.get("docid")); } addCount.addAndGet(docsList.size()); diff --git a/modules/build.xml b/modules/build.xml index 8c250333d9b..7916b4d94cc 100644 --- a/modules/build.xml +++ b/modules/build.xml @@ -26,6 +26,7 @@ + @@ -39,6 +40,7 @@ + @@ -52,6 +54,7 @@ + @@ -65,6 +68,7 @@ + @@ -79,6 +83,7 @@ + @@ -91,6 +96,7 @@ + @@ -105,6 +111,7 @@ + diff --git a/modules/join/build.xml b/modules/join/build.xml new file mode 100644 index 00000000000..6da6e28daba --- /dev/null +++ b/modules/join/build.xml @@ -0,0 +1,39 @@ + + + + Queries and collectors for performing joins + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/modules/join/src/java/org/apache/lucene/search/join/BlockJoinCollector.java b/modules/join/src/java/org/apache/lucene/search/join/BlockJoinCollector.java new file mode 100644 index 00000000000..9efd5959418 --- /dev/null +++ b/modules/join/src/java/org/apache/lucene/search/join/BlockJoinCollector.java @@ -0,0 +1,472 @@ +package org.apache.lucene.search.join; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; // javadocs +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.FieldComparator; +import org.apache.lucene.search.FieldValueHitQueue; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreCachingWrappingScorer; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TopDocsCollector; +import org.apache.lucene.search.TopFieldCollector; +import org.apache.lucene.search.TopScoreDocCollector; +import org.apache.lucene.search.Weight; +import org.apache.lucene.search.grouping.GroupDocs; +import org.apache.lucene.search.grouping.TopGroups; +import org.apache.lucene.util.ArrayUtil; + + +/** Collects parent document hits for a Query containing one or more + * BlockJoinQuery clauses, sorted by the + * specified parent Sort. 
Note that this cannot perform + * arbitrary joins; rather, it requires that all joined + * documents are indexed as a doc block (using {@link + * IndexWriter#addDocuments} or {@link + * IndexWriter#updateDocuments}). Ie, the join is computed + * at index time. + * + *

The parent Sort must only use + * fields from the parent documents; sorting by field in + * the child documents is not supported.

+ * + *

You should only use this + * collector if one or more of the clauses in the query is + * a {@link BlockJoinQuery}. This collector will find those query + * clauses and record the matching child documents for the + * top scoring parent documents.

+ * + *

Multiple joins (star join) and nested joins and a mix + * of the two are allowed, as long as in all cases the + * documents corresponding to a single row of each joined + * parent table were indexed as a doc block.

+ * + *

For the simple star join you can retrieve the + * {@link TopGroups} instance containing each {@link BlockJoinQuery}'s + * matching child documents for the top parent groups, + * using {@link #getTopGroups}. I.e., + * a single query, which will contain two or more + * {@link BlockJoinQuery} clauses representing the star join, + * can then retrieve two or more {@link TopGroups} instances.

+ * + *

For nested joins, the query will run correctly (ie, + * match the right parent and child documents), however, + * because TopGroups is currently unable to support nesting + * (each group is not able to hold another TopGroups), you + * are only able to retrieve the TopGroups of the first + * join. The TopGroups of the nested joins will not be + * correct. + * + * See {@link org.apache.lucene.search.join} for a code + * sample. + * + * @lucene.experimental + */ +public class BlockJoinCollector extends Collector { + + private final Sort sort; + + // Maps each BlockJoinQuery instance to its "slot" in + // joinScorers and in OneGroup's cached doc/scores/count: + private final Map joinQueryID = new HashMap(); + private final int numParentHits; + private final FieldValueHitQueue queue; + private final FieldComparator[] comparators; + private final int[] reverseMul; + private final int compEnd; + private final boolean trackMaxScore; + private final boolean trackScores; + + private int docBase; + private BlockJoinQuery.BlockJoinScorer[] joinScorers = new BlockJoinQuery.BlockJoinScorer[0]; + private IndexReader.AtomicReaderContext currentReaderContext; + private Scorer scorer; + private boolean queueFull; + + private OneGroup bottom; + private int totalHitCount; + private float maxScore = Float.NaN; + + /* Creates a BlockJoinCollector. The provided sort must + * not be null. 
*/ + public BlockJoinCollector(Sort sort, int numParentHits, boolean trackScores, boolean trackMaxScore) throws IOException { + // TODO: allow null sort to be specialized to relevance + // only collector + this.sort = sort; + this.trackMaxScore = trackMaxScore; + this.trackScores = trackScores; + this.numParentHits = numParentHits; + queue = FieldValueHitQueue.create(sort.getSort(), numParentHits); + comparators = queue.getComparators(); + reverseMul = queue.getReverseMul(); + compEnd = comparators.length - 1; + } + + private static final class OneGroup extends FieldValueHitQueue.Entry { + public OneGroup(int comparatorSlot, int parentDoc, float parentScore, int numJoins, boolean doScores) { + super(comparatorSlot, parentDoc, parentScore); + docs = new int[numJoins][]; + for(int joinID=0;joinID 0) { + // Definitely competitive. + break; + } else if (i == compEnd) { + // Here c=0. If we're at the last comparator, this doc is not + // competitive, since docs are visited in doc Id order, which means + // this doc cannot compete with any other document in the queue. + //System.out.println(" skip"); + return; + } + } + + //System.out.println(" competes! 
doc=" + (docBase + parentDoc)); + + // This hit is competitive - replace bottom element in queue & adjustTop + for (int i = 0; i < comparators.length; i++) { + comparators[i].copy(bottom.slot, parentDoc); + } + if (!trackMaxScore && trackScores) { + score = scorer.score(); + } + bottom.doc = docBase + parentDoc; + bottom.readerContext = currentReaderContext; + bottom.score = score; + copyGroups(bottom); + bottom = queue.updateTop(); + + for (int i = 0; i < comparators.length; i++) { + comparators[i].setBottom(bottom.slot); + } + } else { + // Startup transient: queue is not yet full: + final int comparatorSlot = totalHitCount - 1; + + // Copy hit into queue + for (int i = 0; i < comparators.length; i++) { + comparators[i].copy(comparatorSlot, parentDoc); + } + //System.out.println(" startup: new OG doc=" + (docBase+parentDoc)); + final OneGroup og = new OneGroup(comparatorSlot, docBase+parentDoc, score, joinScorers.length, trackScores); + og.readerContext = currentReaderContext; + copyGroups(og); + bottom = queue.add(og); + queueFull = totalHitCount == numParentHits; + if (queueFull) { + // End of startup transient: queue just filled up: + for (int i = 0; i < comparators.length; i++) { + comparators[i].setBottom(bottom.slot); + } + } + } + } + + // Pulls out child doc and scores for all join queries: + private void copyGroups(OneGroup og) { + // While rare, it's possible top arrays could be too + // short if join query had null scorer on first + // segment(s) but then became non-null on later segments + final int numSubScorers = joinScorers.length; + if (og.docs.length < numSubScorers) { + // While rare, this could happen if join query had + // null scorer on first segment(s) but then became + // non-null on later segments + og.docs = ArrayUtil.grow(og.docs); + } + if (og.counts.length < numSubScorers) { + og.counts = ArrayUtil.grow(og.counts); + } + if (trackScores && og.scores.length < numSubScorers) { + og.scores = ArrayUtil.grow(og.scores); + } + + 
//System.out.println("copyGroups parentDoc=" + og.doc); + for(int scorerIDX = 0;scorerIDX < numSubScorers;scorerIDX++) { + final BlockJoinQuery.BlockJoinScorer joinScorer = joinScorers[scorerIDX]; + //System.out.println(" scorer=" + joinScorer); + if (joinScorer != null) { + og.counts[scorerIDX] = joinScorer.getChildCount(); + //System.out.println(" count=" + og.counts[scorerIDX]); + og.docs[scorerIDX] = joinScorer.swapChildDocs(og.docs[scorerIDX]); + /* + for(int idx=0;idx() { + private void enroll(BlockJoinQuery query, BlockJoinQuery.BlockJoinScorer scorer) { + final Integer slot = joinQueryID.get(query); + if (slot == null) { + joinQueryID.put(query, joinScorers.length); + //System.out.println("found JQ: " + query + " slot=" + joinScorers.length); + final BlockJoinQuery.BlockJoinScorer[] newArray = new BlockJoinQuery.BlockJoinScorer[1+joinScorers.length]; + System.arraycopy(joinScorers, 0, newArray, 0, joinScorers.length); + joinScorers = newArray; + joinScorers[joinScorers.length-1] = scorer; + } else { + joinScorers[slot] = scorer; + } + } + + @Override + public void visitOptional(Query parent, Query child, Scorer scorer) { + //System.out.println("visitOpt"); + if (child instanceof BlockJoinQuery) { + enroll((BlockJoinQuery) child, + (BlockJoinQuery.BlockJoinScorer) scorer); + } + } + + @Override + public void visitRequired(Query parent, Query child, Scorer scorer) { + //System.out.println("visitReq parent=" + parent + " child=" + child + " scorer=" + scorer); + if (child instanceof BlockJoinQuery) { + enroll((BlockJoinQuery) child, + (BlockJoinQuery.BlockJoinScorer) scorer); + } + } + + @Override + public void visitProhibited(Query parent, Query child, Scorer scorer) { + //System.out.println("visitProh"); + if (child instanceof BlockJoinQuery) { + enroll((BlockJoinQuery) child, + (BlockJoinQuery.BlockJoinScorer) scorer); + } + } + }); + } + + private final static class FakeScorer extends Scorer { + + float score; + int doc; + + public FakeScorer() { + 
super((Weight) null); + } + + @Override + public float score() { + return score; + } + + @Override + public int docID() { + return doc; + } + + @Override + public int advance(int target) { + throw new UnsupportedOperationException(); + } + + @Override + public int nextDoc() { + throw new UnsupportedOperationException(); + } + } + + private OneGroup[] sortedGroups; + + private void sortQueue() { + sortedGroups = new OneGroup[queue.size()]; + for(int downTo=queue.size()-1;downTo>=0;downTo--) { + sortedGroups[downTo] = queue.pop(); + } + } + + /** Return the TopGroups for the specified + * BlockJoinQuery. The groupValue of each GroupDocs will + * be the parent docID for that group. Note that the + * {@link GroupDocs#totalHits}, which would be the + * total number of child documents matching that parent, + * is not computed (will always be 0). Returns null if + * no groups matched. */ + @SuppressWarnings("unchecked") + public TopGroups getTopGroups(BlockJoinQuery query, Sort withinGroupSort, int offset, int maxDocsPerGroup, int withinGroupOffset, boolean fillSortFields) + + throws IOException { + + final Integer _slot = joinQueryID.get(query); + if (_slot == null) { + if (totalHitCount == 0) { + return null; + } else { + throw new IllegalArgumentException("the Query did not contain the provided BlockJoinQuery"); + } + } + + // unbox once + final int slot = _slot; + + if (sortedGroups == null) { + if (offset >= queue.size()) { // sortQueue() drains the queue, so only consult it before the first sort + return null; + } + sortQueue(); + } else if (offset >= sortedGroups.length) { // repeat call: the hits now live in sortedGroups, queue is empty + return null; + } + int totalGroupedHitCount = 0; + final FakeScorer fakeScorer = new FakeScorer(); + + final GroupDocs[] groups = new GroupDocs[sortedGroups.length - offset]; + + for(int groupIDX=offset;groupIDX(topDocs.getMaxScore(), + og.counts[slot], + topDocs.scoreDocs, + og.doc, + groupSortValues); + } + + return new TopGroups(new TopGroups(sort.getSort(), + withinGroupSort == null ? 
null : withinGroupSort.getSort(), + 0, totalGroupedHitCount, groups), + totalHitCount); + } +} diff --git a/modules/join/src/java/org/apache/lucene/search/join/BlockJoinQuery.java b/modules/join/src/java/org/apache/lucene/search/join/BlockJoinQuery.java new file mode 100644 index 00000000000..8ec62eee5c5 --- /dev/null +++ b/modules/join/src/java/org/apache/lucene/search/join/BlockJoinQuery.java @@ -0,0 +1,410 @@ +package org.apache.lucene.search.join; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Set; + +import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; // javadocs +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.Filter; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.Weight; +import org.apache.lucene.search.grouping.TopGroups; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.OpenBitSet; + +/** + * This query requires that you index + * children and parent docs as a single block, using the + * {@link IndexWriter#addDocuments} or {@link + * IndexWriter#updateDocuments} API. In each block, the + * child documents must appear first, ending with the parent + * document. At search time you provide a Filter + * identifying the parents, however this Filter must provide + * an {@link OpenBitSet} per sub-reader. + * + *

Once the block index is built, use this query to wrap + * any sub-query matching only child docs and join matches in that + * child document space up to the parent document space. + * You can then use this Query as a clause with + * other queries in the parent document space.

+ * + *

The child documents must be orthogonal to the parent + * documents: the wrapped child query must never + * return a parent document.

+ * + * If you'd like to retrieve {@link TopGroups} for the + * resulting query, use the {@link BlockJoinCollector}. + * Note that this is not necessary, i.e., if you simply want + * to collect the parent documents and don't need to see + * which child documents matched under that parent, then + * you can use any collector. + * + *

NOTE: If the overall query contains parent-only + * matches, for example you OR a parent-only query with a + * joined child-only query, then the resulting collected documents + * will be correct, however the {@link TopGroups} you get + * from {@link BlockJoinCollector} will not contain every + * child for parents that had matched. + * + *

See {@link org.apache.lucene.search.join} for an + * overview.

+ * + * @lucene.experimental + */ + +public class BlockJoinQuery extends Query { + + public static enum ScoreMode {None, Avg, Max, Total}; + + private final Filter parentsFilter; + private final Query childQuery; + + // If we are rewritten, this is the original childQuery we + // were passed; we use this for .equals() and + // .hashCode(). This makes rewritten query equal the + // original, so that user does not have to .rewrite() their + // query before searching: + private final Query origChildQuery; + private final ScoreMode scoreMode; + + public BlockJoinQuery(Query childQuery, Filter parentsFilter, ScoreMode scoreMode) { + super(); + this.origChildQuery = childQuery; + this.childQuery = childQuery; + this.parentsFilter = parentsFilter; + this.scoreMode = scoreMode; + } + + private BlockJoinQuery(Query origChildQuery, Query childQuery, Filter parentsFilter, ScoreMode scoreMode) { + super(); + this.origChildQuery = origChildQuery; + this.childQuery = childQuery; + this.parentsFilter = parentsFilter; + this.scoreMode = scoreMode; + } + + public Weight createWeight(IndexSearcher searcher) throws IOException { + return new BlockJoinWeight(this, childQuery.createWeight(searcher), parentsFilter, scoreMode); + } + + private static class BlockJoinWeight extends Weight { + private final Query joinQuery; + private final Weight childWeight; + private final Filter parentsFilter; + private final ScoreMode scoreMode; + + public BlockJoinWeight(Query joinQuery, Weight childWeight, Filter parentsFilter, ScoreMode scoreMode) { + super(); + this.joinQuery = joinQuery; + this.childWeight = childWeight; + this.parentsFilter = parentsFilter; + this.scoreMode = scoreMode; + } + + @Override + public Query getQuery() { + return joinQuery; + } + + @Override + public float getValue() { + return childWeight.getValue(); + } + + @Override + public float sumOfSquaredWeights() throws IOException { + return childWeight.sumOfSquaredWeights(); + } + + @Override + public void normalize(float 
norm) { + childWeight.normalize(norm); + } + + @Override + public Scorer scorer(AtomicReaderContext readerContext, ScorerContext context) throws IOException { + // Pass scoreDocsInOrder true, topScorer false to our sub: + final Scorer childScorer = childWeight.scorer(readerContext, ScorerContext.def().scoreDocsInOrder(true).topScorer(false)); + + if (childScorer == null) { + // No matches + return null; + } + + final int firstChildDoc = childScorer.nextDoc(); + if (firstChildDoc == DocIdSetIterator.NO_MORE_DOCS) { + // No matches + return null; + } + + final DocIdSet parents = parentsFilter.getDocIdSet(readerContext); + // TODO: once we do random-access filters we can + // generalize this: + if (parents == null) { + // No matches + return null; + } + if (!(parents instanceof OpenBitSet)) { + throw new IllegalStateException("parentFilter must return OpenBitSet; got " + parents); + } + + return new BlockJoinScorer(this, childScorer, (OpenBitSet) parents, firstChildDoc, scoreMode); + } + + @Override + public Explanation explain(AtomicReaderContext reader, int doc) throws IOException { + // TODO + throw new UnsupportedOperationException(getClass().getName() + + " cannot explain match on parent document"); + } + + @Override + public boolean scoresDocsOutOfOrder() { + return false; + } + } + + static class BlockJoinScorer extends Scorer { + private final Scorer childScorer; + private final OpenBitSet parentBits; + private final ScoreMode scoreMode; + private int parentDoc; + private float parentScore; + private int nextChildDoc; + + private int[] pendingChildDocs = new int[5]; + private float[] pendingChildScores; + private int childDocUpto; + + public BlockJoinScorer(Weight weight, Scorer childScorer, OpenBitSet parentBits, int firstChildDoc, ScoreMode scoreMode) { + super(weight); + //System.out.println("Q.init firstChildDoc=" + firstChildDoc); + this.parentBits = parentBits; + this.childScorer = childScorer; + this.scoreMode = scoreMode; + if (scoreMode != 
ScoreMode.None) { + pendingChildScores = new float[5]; + } + nextChildDoc = firstChildDoc; + } + + @Override + public void visitSubScorers(Query parent, BooleanClause.Occur relationship, + ScorerVisitor visitor) { + super.visitSubScorers(parent, relationship, visitor); + //childScorer.visitSubScorers(weight.getQuery(), BooleanClause.Occur.MUST, visitor); + childScorer.visitScorers(visitor); + } + + int getChildCount() { + return childDocUpto; + } + + int[] swapChildDocs(int[] other) { + final int[] ret = pendingChildDocs; + if (other == null) { + pendingChildDocs = new int[5]; + } else { + pendingChildDocs = other; + } + return ret; + } + + float[] swapChildScores(float[] other) { + if (scoreMode == ScoreMode.None) { + throw new IllegalStateException("ScoreMode is None"); + } + final float[] ret = pendingChildScores; + if (other == null) { + pendingChildScores = new float[5]; + } else { + pendingChildScores = other; + } + return ret; + } + + @Override + public int nextDoc() throws IOException { + //System.out.println("Q.nextDoc() nextChildDoc=" + nextChildDoc); + + if (nextChildDoc == NO_MORE_DOCS) { + //System.out.println(" end"); + return parentDoc = NO_MORE_DOCS; + } + + // Gather all children sharing the same parent as nextChildDoc + parentDoc = parentBits.nextSetBit(nextChildDoc); + //System.out.println(" parentDoc=" + parentDoc); + assert parentDoc != -1; + + float totalScore = 0; + float maxScore = Float.NEGATIVE_INFINITY; + + childDocUpto = 0; + do { + //System.out.println(" c=" + nextChildDoc); + if (pendingChildDocs.length == childDocUpto) { + pendingChildDocs = ArrayUtil.grow(pendingChildDocs); + if (scoreMode != ScoreMode.None) { + pendingChildScores = ArrayUtil.grow(pendingChildScores); + } + } + pendingChildDocs[childDocUpto] = nextChildDoc; + if (scoreMode != ScoreMode.None) { + // TODO: specialize this into dedicated classes per-scoreMode + final float childScore = childScorer.score(); + pendingChildScores[childDocUpto] = childScore; + maxScore = 
Math.max(childScore, maxScore); + totalScore += childScore; + } + childDocUpto++; + nextChildDoc = childScorer.nextDoc(); + } while (nextChildDoc < parentDoc); + //System.out.println(" nextChildDoc=" + nextChildDoc); + + // Parent & child docs are supposed to be orthogonal: + assert nextChildDoc != parentDoc; + + switch(scoreMode) { + case Avg: + parentScore = totalScore / childDocUpto; + break; + case Max: + parentScore = maxScore; + break; + case Total: + parentScore = totalScore; + break; + case None: + break; + } + + //System.out.println(" return parentDoc=" + parentDoc); + return parentDoc; + } + + @Override + public int docID() { + return parentDoc; + } + + @Override + public float score() throws IOException { + return parentScore; + } + + @Override + public int advance(int parentTarget) throws IOException { + + //System.out.println("Q.advance parentTarget=" + parentTarget); + if (parentTarget == NO_MORE_DOCS) { + return parentDoc = NO_MORE_DOCS; + } + + final int prevParentDoc = parentBits.prevSetBit(parentTarget-1); + + //System.out.println(" rolled back to prevParentDoc=" + prevParentDoc + " vs parentDoc=" + parentDoc); + assert prevParentDoc >= parentDoc; + if (prevParentDoc > nextChildDoc) { + nextChildDoc = childScorer.advance(prevParentDoc); + // System.out.println(" childScorer advanced to child docID=" + nextChildDoc); + //} else { + //System.out.println(" skip childScorer advance"); + } + + // Parent & child docs are supposed to be orthogonal: + assert nextChildDoc != prevParentDoc; + + final int nd = nextDoc(); + //System.out.println(" return nextParentDoc=" + nd); + return nd; + } + } + + @Override + public void extractTerms(Set terms) { + childQuery.extractTerms(terms); + } + + @Override + public Query rewrite(IndexReader reader) throws IOException { + final Query childRewrite = childQuery.rewrite(reader); + if (childRewrite != childQuery) { + return new BlockJoinQuery(origChildQuery, // carry the user's original query forward so equals()/hashCode() survive repeated rewrites + childRewrite, + parentsFilter, + scoreMode); + } else { + return 
this; + } + } + + @Override + public String toString(String field) { + return "BlockJoinQuery ("+childQuery.toString()+")"; + } + + @Override + public void setBoost(float boost) { + throw new UnsupportedOperationException("this query cannot support boosting; please use childQuery.setBoost instead"); + } + + @Override + public float getBoost() { + throw new UnsupportedOperationException("this query cannot support boosting; please use childQuery.getBoost instead"); + } + + @Override + public boolean equals(Object _other) { + if (_other instanceof BlockJoinQuery) { + final BlockJoinQuery other = (BlockJoinQuery) _other; + return origChildQuery.equals(other.origChildQuery) && + parentsFilter.equals(other.parentsFilter) && + scoreMode == other.scoreMode; + } else { + return false; + } + } + + @Override + public int hashCode() { + final int prime = 31; + int hash = 1; + hash = prime * hash + origChildQuery.hashCode(); + hash = prime * hash + scoreMode.hashCode(); + hash = prime * hash + parentsFilter.hashCode(); + return hash; + } + + @Override + public Object clone() { + return new BlockJoinQuery((Query) origChildQuery.clone(), + parentsFilter, + scoreMode); + } +} diff --git a/modules/join/src/java/org/apache/lucene/search/join/package.html b/modules/join/src/java/org/apache/lucene/search/join/package.html new file mode 100644 index 00000000000..82204f0fc12 --- /dev/null +++ b/modules/join/src/java/org/apache/lucene/search/join/package.html @@ -0,0 +1,32 @@ + + + +

This module supports index-time joins while searching, where joined + documents are indexed as a single document block using + {@link org.apache.lucene.index.IndexWriter#addDocuments}. This is useful for any normalized content (XML documents or database tables). In database terms, all rows for all + joined tables matching a single row of the primary table must be + indexed as a single document block, with the parent document + being last in the group.

+ +

When you index in this way, the documents in your index are divided + into parent documents (the last document of each block) and child + documents (all others). You provide a {@link org.apache.lucene.search.Filter} that identifies the + parent documents, as Lucene does not currently record any information + about doc blocks.

+ +

At search time, use {@link org.apache.lucene.search.join.BlockJoinQuery} to remap + matches from any child {@link org.apache.lucene.search.Query} (i.e., a query that matches only + child documents) up to the parent document space. The resulting + {@link org.apache.lucene.search.join.BlockJoinQuery} can then be used as a clause in any query that + matches parent documents.

+ +

If you only care about the parent documents matching the query, you + can use any collector to collect the parent hits, but if you'd also + like to see which child documents match for each parent document, + use the {@link org.apache.lucene.search.join.BlockJoinCollector} to collect the hits. Once the + search is done, you retrieve a {@link + org.apache.lucene.search.grouping.TopGroups} instance from the + {@link org.apache.lucene.search.join.BlockJoinCollector#getTopGroups} method.

+ + + diff --git a/modules/join/src/java/overview.html b/modules/join/src/java/overview.html new file mode 100644 index 00000000000..a5fcef0913f --- /dev/null +++ b/modules/join/src/java/overview.html @@ -0,0 +1,5 @@ + + +Lucene's join module + + diff --git a/modules/join/src/test/org/apache/lucene/search/TestBlockJoin.java b/modules/join/src/test/org/apache/lucene/search/TestBlockJoin.java new file mode 100644 index 00000000000..827a0a5b365 --- /dev/null +++ b/modules/join/src/test/org/apache/lucene/search/TestBlockJoin.java @@ -0,0 +1,476 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.NumericField; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.grouping.GroupDocs; +import org.apache.lucene.search.grouping.TopGroups; +import org.apache.lucene.search.join.BlockJoinCollector; +import org.apache.lucene.search.join.BlockJoinQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; + +public class TestBlockJoin extends LuceneTestCase { + + // One resume... + private Document makeResume(String name, String country) { + Document resume = new Document(); + resume.add(newField("docType", "resume", Field.Index.NOT_ANALYZED)); + resume.add(newField("name", name, Field.Store.YES, Field.Index.NOT_ANALYZED)); + resume.add(newField("country", country, Field.Index.NOT_ANALYZED)); + return resume; + } + + // ... 
has multiple jobs + private Document makeJob(String skill, int year) { + Document job = new Document(); + job.add(newField("skill", skill, Field.Store.YES, Field.Index.NOT_ANALYZED)); + job.add(new NumericField("year").setIntValue(year)); + return job; + } + + public void testSimple() throws Exception { + + final Directory dir = newDirectory(); + final RandomIndexWriter w = new RandomIndexWriter(random, dir); + + final List docs = new ArrayList(); + + docs.add(makeJob("java", 2007)); + docs.add(makeJob("python", 2010)); + docs.add(makeResume("Lisa", "United Kingdom")); + w.addDocuments(docs); + + docs.clear(); + docs.add(makeJob("ruby", 2005)); + docs.add(makeJob("java", 2006)); + docs.add(makeResume("Frank", "United States")); + w.addDocuments(docs); + + IndexReader r = w.getReader(); + w.close(); + IndexSearcher s = new IndexSearcher(r); + + // Create a filter that defines "parent" documents in the index - in this case resumes + Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("docType", "resume")))); + + // Define child document criteria (finds an example of relevant work experience) + BooleanQuery childQuery = new BooleanQuery(); + childQuery.add(new BooleanClause(new TermQuery(new Term("skill", "java")), Occur.MUST)); + childQuery.add(new BooleanClause(NumericRangeQuery.newIntRange("year", 2006, 2011, true, true), Occur.MUST)); + + // Define parent document criteria (find a resident in the UK) + Query parentQuery = new TermQuery(new Term("country", "United Kingdom")); + + // Wrap the child document query to 'join' any matches + // up to corresponding parent: + BlockJoinQuery childJoinQuery = new BlockJoinQuery(childQuery, parentsFilter, BlockJoinQuery.ScoreMode.Avg); + + // Combine the parent and nested child queries into a single query for a candidate + BooleanQuery fullQuery = new BooleanQuery(); + fullQuery.add(new BooleanClause(parentQuery, Occur.MUST)); + fullQuery.add(new BooleanClause(childJoinQuery, 
Occur.MUST)); + + BlockJoinCollector c = new BlockJoinCollector(Sort.RELEVANCE, 1, true, false); + + s.search(fullQuery, c); + + TopGroups results = c.getTopGroups(childJoinQuery, null, 0, 10, 0, true); + + //assertEquals(1, results.totalHitCount); + assertEquals(1, results.totalGroupedHitCount); + assertEquals(1, results.groups.length); + + final GroupDocs group = results.groups[0]; + assertEquals(1, group.totalHits); + + Document childDoc = s.doc(group.scoreDocs[0].doc); + //System.out.println(" doc=" + group.scoreDocs[0].doc); + assertEquals("java", childDoc.get("skill")); + assertNotNull(group.groupValue); + Document parentDoc = s.doc(group.groupValue); + assertEquals("Lisa", parentDoc.get("name")); + + r.close(); + dir.close(); + } + + private String[][] getRandomFields(int maxUniqueValues) { + + final String[][] fields = new String[_TestUtil.nextInt(random, 2, 4)][]; + for(int fieldID=0;fieldID sortFields = new ArrayList(); + // TODO: sometimes sort by score; problem is scores are + // not comparable across the two indices + // sortFields.add(SortField.FIELD_SCORE); + if (random.nextBoolean()) { + sortFields.add(new SortField(prefix + random.nextInt(numFields), SortField.Type.STRING, random.nextBoolean())); + } else if (random.nextBoolean()) { + sortFields.add(new SortField(prefix + random.nextInt(numFields), SortField.Type.STRING, random.nextBoolean())); + sortFields.add(new SortField(prefix + random.nextInt(numFields), SortField.Type.STRING, random.nextBoolean())); + } + // Break ties: + sortFields.add(new SortField(prefix + "ID", SortField.Type.INT)); + return new Sort(sortFields.toArray(new SortField[sortFields.size()])); + } + + public void testRandom() throws Exception { + // We build two indices at once: one normalized (which + // BlockJoinQuery/Collector can query) and the other w/ + // same docs just fully denormalized: + final Directory dir = newDirectory(); + final Directory joinDir = newDirectory(); + + final int numParentDocs = 
_TestUtil.nextInt(random, 100*RANDOM_MULTIPLIER, 300*RANDOM_MULTIPLIER); + //final int numParentDocs = 30; + + // Values for parent fields: + final String[][] parentFields = getRandomFields(numParentDocs/2); + // Values for child fields: + final String[][] childFields = getRandomFields(numParentDocs); + + // TODO: test star join, nested join cases too! + final RandomIndexWriter w = new RandomIndexWriter(random, dir); + final RandomIndexWriter joinW = new RandomIndexWriter(random, joinDir); + for(int parentDocID=0;parentDocID joinDocs = new ArrayList(); + + if (VERBOSE) { + System.out.println(" " + parentDoc); + } + + final int numChildDocs = _TestUtil.nextInt(random, 1, 20); + for(int childDocID=0;childDocID joinResults = c.getTopGroups(childJoinQuery, childSort, 0, hitsPerGroup, 0, true); + + if (VERBOSE) { + System.out.println("\nTEST: block join index gets " + (joinResults == null ? 0 : joinResults.groups.length) + " groups; hitsPerGroup=" + hitsPerGroup); + if (joinResults != null) { + final GroupDocs[] groups = joinResults.groups; + for(int groupIDX=0;groupIDX group = groups[groupIDX]; + if (group.groupSortValues != null) { + System.out.print(" "); + for(Object o : group.groupSortValues) { + if (o instanceof BytesRef) { + System.out.print(((BytesRef) o).utf8ToString() + " "); + } else { + System.out.print(o + " "); + } + } + System.out.println(); + } + + assertNotNull(group.groupValue); + final Document parentDoc = joinS.doc(group.groupValue); + System.out.println(" group parentID=" + parentDoc.get("parentID") + " (docID=" + group.groupValue + ")"); + for(int hitIDX=0;hitIDX joinResults) throws Exception { + // results is 'complete'; joinResults is a subset + int resultUpto = 0; + int joinGroupUpto = 0; + + final ScoreDoc[] hits = results.scoreDocs; + final GroupDocs[] groupDocs = joinResults.groups; + + while(joinGroupUpto < groupDocs.length) { + final GroupDocs group = groupDocs[joinGroupUpto++]; + final ScoreDoc[] groupHits = group.scoreDocs; + 
assertNotNull(group.groupValue); + final Document parentDoc = joinR.document(group.groupValue); + final String parentID = parentDoc.get("parentID"); + //System.out.println("GROUP groupDoc=" + group.groupDoc + " parent=" + parentDoc); + assertNotNull(parentID); + assertTrue(groupHits.length > 0); + for(int hitIDX=0;hitIDX