LUCENE-3171: add modules/join to enable joining parent + child documents when indexed as a doc block

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1140851 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2011-06-28 21:20:18 +00:00
parent 2745612a4c
commit 4a3b510739
14 changed files with 1562 additions and 57 deletions


@ -66,6 +66,12 @@ New Features
highlighting speed up. Use FastVectorHighlighter.setPhraseLimit() to set limit
(e.g. 5000). (Mike Sokolov via Koji Sekiguchi)
* LUCENE-3171: Added BlockJoinQuery and BlockJoinCollector, under the
new modules/join module, to enable searches that require joining
between parent and child documents. Joined (children + parent)
documents must be indexed as a document block, using
IndexWriter.addDocuments/updateDocuments. (Mark Harwood, Mike McCandless)
API Changes
Bug Fixes
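
As a quick illustration of the doc-block requirement in the entry above, here
is a minimal indexing sketch; the field names and the surrounding class are
invented for illustration, and the children-first, parent-last ordering
follows the BlockJoinQuery javadoc later in this commit.

import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

class BlockIndexingSketch {
  // Index one parent ("resume") plus its children ("job" docs) as one block.
  static void addResumeBlock(IndexWriter writer) throws Exception {
    List<Document> block = new ArrayList<Document>();

    Document job = new Document();
    job.add(new Field("skill", "java", Field.Store.YES, Field.Index.NOT_ANALYZED));
    block.add(job); // child documents come first...

    Document resume = new Document();
    resume.add(new Field("docType", "resume", Field.Store.NO, Field.Index.NOT_ANALYZED));
    block.add(resume); // ...and the parent document is last in the block

    writer.addDocuments(block); // the whole block is indexed contiguously
  }
}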


@ -793,8 +793,13 @@ public abstract class FieldComparator<T> {
@Override
public void setScorer(Scorer scorer) {
// wrap with a ScoreCachingWrappingScorer so that successive calls to
// score() will not incur score computation over and over again.
this.scorer = new ScoreCachingWrappingScorer(scorer);
// score() will not incur score computation over and
// over again.
if (!(scorer instanceof ScoreCachingWrappingScorer)) {
this.scorer = new ScoreCachingWrappingScorer(scorer);
} else {
this.scorer = scorer;
}
}
@Override
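
The instanceof check added above matters because a caller may already pass in
a cached scorer; BlockJoinCollector.setScorer later in this commit does
exactly that, and without the check the scorer would be wrapped twice. A
small sketch of the call pattern being protected (the helper method is
invented):

import org.apache.lucene.search.FieldComparator;
import org.apache.lucene.search.ScoreCachingWrappingScorer;
import org.apache.lucene.search.Scorer;

class ScorerForwardingSketch {
  static void forward(Scorer scorer, FieldComparator[] comparators) {
    // Cache score() once up front, as BlockJoinCollector does:
    Scorer cached = new ScoreCachingWrappingScorer(scorer);
    for (FieldComparator comp : comparators) {
      comp.setScorer(cached); // now reused as-is instead of re-wrapped
    }
  }
}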


@ -31,12 +31,12 @@ import org.apache.lucene.util.PriorityQueue;
* @see IndexSearcher#search(Query,Filter,int,Sort)
* @see FieldCache
*/
public abstract class FieldValueHitQueue extends PriorityQueue<FieldValueHitQueue.Entry> {
public abstract class FieldValueHitQueue<T extends FieldValueHitQueue.Entry> extends PriorityQueue<T> {
final static class Entry extends ScoreDoc {
int slot;
public static class Entry extends ScoreDoc {
public int slot;
Entry(int slot, int doc, float score) {
public Entry(int slot, int doc, float score) {
super(doc, score);
this.slot = slot;
}
@ -51,7 +51,7 @@ public abstract class FieldValueHitQueue extends PriorityQueue<FieldValueHitQueu
* An implementation of {@link FieldValueHitQueue} which is optimized in case
* there is just one comparator.
*/
private static final class OneComparatorFieldValueHitQueue extends FieldValueHitQueue {
private static final class OneComparatorFieldValueHitQueue<T extends FieldValueHitQueue.Entry> extends FieldValueHitQueue<T> {
private final int oneReverseMul;
public OneComparatorFieldValueHitQueue(SortField[] fields, int size)
@ -92,7 +92,7 @@ public abstract class FieldValueHitQueue extends PriorityQueue<FieldValueHitQueu
* An implementation of {@link FieldValueHitQueue} which is optimized in case
* there is more than one comparator.
*/
private static final class MultiComparatorsFieldValueHitQueue extends FieldValueHitQueue {
private static final class MultiComparatorsFieldValueHitQueue<T extends FieldValueHitQueue.Entry> extends FieldValueHitQueue<T> {
public MultiComparatorsFieldValueHitQueue(SortField[] fields, int size)
throws IOException {
@ -156,24 +156,28 @@ public abstract class FieldValueHitQueue extends PriorityQueue<FieldValueHitQueu
* The number of hits to retain. Must be greater than zero.
* @throws IOException
*/
public static FieldValueHitQueue create(SortField[] fields, int size) throws IOException {
public static <T extends FieldValueHitQueue.Entry> FieldValueHitQueue<T> create(SortField[] fields, int size) throws IOException {
if (fields.length == 0) {
throw new IllegalArgumentException("Sort must contain at least one field");
}
if (fields.length == 1) {
return new OneComparatorFieldValueHitQueue(fields, size);
return new OneComparatorFieldValueHitQueue<T>(fields, size);
} else {
return new MultiComparatorsFieldValueHitQueue(fields, size);
return new MultiComparatorsFieldValueHitQueue<T>(fields, size);
}
}
FieldComparator[] getComparators() { return comparators; }
public FieldComparator[] getComparators() {
return comparators;
}
int[] getReverseMul() { return reverseMul; }
public int[] getReverseMul() {
return reverseMul;
}
protected void setComparator(int pos, FieldComparator comparator) {
public void setComparator(int pos, FieldComparator comparator) {
if (pos==0) firstComparator = comparator;
comparators[pos] = comparator;
}
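
The new type parameter (together with Entry and its constructor becoming
public) lets callers queue their own Entry subclasses without casting;
BlockJoinCollector's OneGroup, later in this commit, is the first such
subclass. A hypothetical minimal example:

import java.io.IOException;

import org.apache.lucene.search.FieldValueHitQueue;
import org.apache.lucene.search.SortField;

class CustomEntrySketch {
  // Invented subclass carrying extra per-hit state alongside slot/doc/score:
  static class MyEntry extends FieldValueHitQueue.Entry {
    final long payload;
    MyEntry(int slot, int doc, float score, long payload) {
      super(slot, doc, score);
      this.payload = payload;
    }
  }

  static void demo() throws IOException {
    FieldValueHitQueue<MyEntry> queue =
        FieldValueHitQueue.create(new SortField[] { new SortField("f", SortField.Type.STRING) }, 10);
    MyEntry bottom = queue.add(new MyEntry(0, 42, 1.0f, 123L)); // no cast needed
  }
}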


@ -48,9 +48,9 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
FieldComparator comparator;
final int reverseMul;
final FieldValueHitQueue queue;
final FieldValueHitQueue<Entry> queue;
public OneComparatorNonScoringCollector(FieldValueHitQueue queue,
public OneComparatorNonScoringCollector(FieldValueHitQueue<Entry> queue,
int numHits, boolean fillFields) throws IOException {
super(queue, numHits, fillFields);
this.queue = queue;
@ -113,7 +113,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
private static class OutOfOrderOneComparatorNonScoringCollector extends
OneComparatorNonScoringCollector {
public OutOfOrderOneComparatorNonScoringCollector(FieldValueHitQueue queue,
public OutOfOrderOneComparatorNonScoringCollector(FieldValueHitQueue<Entry> queue,
int numHits, boolean fillFields) throws IOException {
super(queue, numHits, fillFields);
}
@ -160,7 +160,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
Scorer scorer;
public OneComparatorScoringNoMaxScoreCollector(FieldValueHitQueue queue,
public OneComparatorScoringNoMaxScoreCollector(FieldValueHitQueue<Entry> queue,
int numHits, boolean fillFields) throws IOException {
super(queue, numHits, fillFields);
}
@ -221,7 +221,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
OneComparatorScoringNoMaxScoreCollector {
public OutOfOrderOneComparatorScoringNoMaxScoreCollector(
FieldValueHitQueue queue, int numHits, boolean fillFields)
FieldValueHitQueue<Entry> queue, int numHits, boolean fillFields)
throws IOException {
super(queue, numHits, fillFields);
}
@ -274,7 +274,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
Scorer scorer;
public OneComparatorScoringMaxScoreCollector(FieldValueHitQueue queue,
public OneComparatorScoringMaxScoreCollector(FieldValueHitQueue<Entry> queue,
int numHits, boolean fillFields) throws IOException {
super(queue, numHits, fillFields);
// Must set maxScore to NEG_INF, or otherwise Math.max always returns NaN.
@ -334,7 +334,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
private static class OutOfOrderOneComparatorScoringMaxScoreCollector extends
OneComparatorScoringMaxScoreCollector {
public OutOfOrderOneComparatorScoringMaxScoreCollector(FieldValueHitQueue queue,
public OutOfOrderOneComparatorScoringMaxScoreCollector(FieldValueHitQueue<Entry> queue,
int numHits, boolean fillFields) throws IOException {
super(queue, numHits, fillFields);
}
@ -384,8 +384,8 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
final FieldComparator[] comparators;
final int[] reverseMul;
final FieldValueHitQueue queue;
public MultiComparatorNonScoringCollector(FieldValueHitQueue queue,
final FieldValueHitQueue<Entry> queue;
public MultiComparatorNonScoringCollector(FieldValueHitQueue<Entry> queue,
int numHits, boolean fillFields) throws IOException {
super(queue, numHits, fillFields);
this.queue = queue;
@ -471,7 +471,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
private static class OutOfOrderMultiComparatorNonScoringCollector extends
MultiComparatorNonScoringCollector {
public OutOfOrderMultiComparatorNonScoringCollector(FieldValueHitQueue queue,
public OutOfOrderMultiComparatorNonScoringCollector(FieldValueHitQueue<Entry> queue,
int numHits, boolean fillFields) throws IOException {
super(queue, numHits, fillFields);
}
@ -540,7 +540,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
Scorer scorer;
public MultiComparatorScoringMaxScoreCollector(FieldValueHitQueue queue,
public MultiComparatorScoringMaxScoreCollector(FieldValueHitQueue<Entry> queue,
int numHits, boolean fillFields) throws IOException {
super(queue, numHits, fillFields);
// Must set maxScore to NEG_INF, or otherwise Math.max always returns NaN.
@ -619,7 +619,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
private final static class OutOfOrderMultiComparatorScoringMaxScoreCollector
extends MultiComparatorScoringMaxScoreCollector {
public OutOfOrderMultiComparatorScoringMaxScoreCollector(FieldValueHitQueue queue,
public OutOfOrderMultiComparatorScoringMaxScoreCollector(FieldValueHitQueue<Entry> queue,
int numHits, boolean fillFields) throws IOException {
super(queue, numHits, fillFields);
}
@ -692,7 +692,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
Scorer scorer;
public MultiComparatorScoringNoMaxScoreCollector(FieldValueHitQueue queue,
public MultiComparatorScoringNoMaxScoreCollector(FieldValueHitQueue<Entry> queue,
int numHits, boolean fillFields) throws IOException {
super(queue, numHits, fillFields);
}
@ -771,7 +771,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
extends MultiComparatorScoringNoMaxScoreCollector {
public OutOfOrderMultiComparatorScoringNoMaxScoreCollector(
FieldValueHitQueue queue, int numHits, boolean fillFields)
FieldValueHitQueue<Entry> queue, int numHits, boolean fillFields)
throws IOException {
super(queue, numHits, fillFields);
}
@ -917,7 +917,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
throw new IllegalArgumentException("numHits must be > 0; please use TotalHitCountCollector if you just need the total hit count");
}
FieldValueHitQueue queue = FieldValueHitQueue.create(sort.fields, numHits);
FieldValueHitQueue<Entry> queue = FieldValueHitQueue.create(sort.fields, numHits);
if (queue.getComparators().length == 1) {
if (docsScoredInOrder) {
if (trackMaxScore) {
@ -972,7 +972,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
protected void populateResults(ScoreDoc[] results, int howMany) {
if (fillFields) {
// avoid casting if unnecessary.
FieldValueHitQueue queue = (FieldValueHitQueue) pq;
FieldValueHitQueue<Entry> queue = (FieldValueHitQueue<Entry>) pq;
for (int i = howMany - 1; i >= 0; i--) {
results[i] = queue.fillFields(queue.pop());
}
@ -993,12 +993,11 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
}
// If this is a maxScoring tracking collector and there were no results,
return new TopFieldDocs(totalHits, results, ((FieldValueHitQueue) pq).getFields(), maxScore);
return new TopFieldDocs(totalHits, results, ((FieldValueHitQueue<Entry>) pq).getFields(), maxScore);
}
@Override
public boolean acceptsDocsOutOfOrder() {
return false;
}
}


@ -393,6 +393,56 @@ public final class ArrayUtil {
return array;
}
public static int[][] grow(int[][] array, int minSize) {
if (array.length < minSize) {
int[][] newArray = new int[oversize(minSize, RamUsageEstimator.NUM_BYTES_OBJECT_REF)][];
System.arraycopy(array, 0, newArray, 0, array.length);
return newArray;
} else {
return array;
}
}
public static int[][] grow(int[][] array) {
return grow(array, 1 + array.length);
}
public static int[][] shrink(int[][] array, int targetSize) {
final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
if (newSize != array.length) {
int[][] newArray = new int[newSize][];
System.arraycopy(array, 0, newArray, 0, newSize);
return newArray;
} else {
return array;
}
}
public static float[][] grow(float[][] array, int minSize) {
if (array.length < minSize) {
float[][] newArray = new float[oversize(minSize, RamUsageEstimator.NUM_BYTES_OBJECT_REF)][];
System.arraycopy(array, 0, newArray, 0, array.length);
return newArray;
} else {
return array;
}
}
public static float[][] grow(float[][] array) {
return grow(array, 1 + array.length);
}
public static float[][] shrink(float[][] array, int targetSize) {
final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
if (newSize != array.length) {
float[][] newArray = new float[newSize][];
System.arraycopy(array, 0, newArray, 0, newSize);
return newArray;
} else {
return array;
}
}
/**
* Returns hash of chars in range start (inclusive) to
* end (inclusive)
@ -617,6 +667,7 @@ public final class ArrayUtil {
*/
public static <T> void mergeSort(T[] a, int fromIndex, int toIndex, Comparator<? super T> comp) {
if (toIndex-fromIndex <= 1) return;
//System.out.println("SORT: " + (toIndex-fromIndex));
getSorter(a, comp).mergeSort(fromIndex, toIndex-1);
}
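
One subtlety of the new two-dimensional grow/shrink helpers: only the outer
array is resized, so existing rows are copied by reference and newly added
slots are null until the caller allocates them. BlockJoinCollector.copyGroups
(later in this commit) relies on exactly this. A small sketch:

import org.apache.lucene.util.ArrayUtil;

class Grow2DSketch {
  static void demo() {
    int[][] docs = new int[2][];
    docs[0] = new int[] {1, 2};
    docs[1] = new int[] {3};

    docs = ArrayUtil.grow(docs, 5); // oversizes the outer array
    assert docs.length >= 5;
    assert docs[0][0] == 1; // existing rows survive (copied by reference)
    assert docs[2] == null; // new slots start null; callers fill them in
  }
}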


@ -35,6 +35,7 @@ import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.index.ConcurrentMergeScheduler;
@ -491,4 +492,25 @@ public class _TestUtil {
}
}
}
// NOTE: this is likely buggy, and cannot clone fields
// with tokenStreamValues, etc. Use at your own risk!!
// TODO: is there a pre-existing way to do this!!!
public static Document cloneDocument(Document doc1) {
final Document doc2 = new Document();
for(Fieldable f : doc1.getFields()) {
Field field1 = (Field) f;
Field field2 = new Field(field1.name(),
field1.stringValue(),
field1.isStored() ? Field.Store.YES : Field.Store.NO,
field1.isIndexed() ? (field1.isTokenized() ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED) : Field.Index.NO);
field2.setOmitNorms(field1.getOmitNorms());
field2.setOmitTermFreqAndPositions(field1.getOmitTermFreqAndPositions());
doc2.add(field2);
}
return doc2;
}
}


@ -33,7 +33,6 @@ import java.util.concurrent.atomic.AtomicInteger;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
@ -69,28 +68,6 @@ public class TestNRTThreads extends LuceneTestCase {
}
}
// TODO: is there a pre-existing way to do this!!!
private Document cloneDoc(Document doc1) {
final Document doc2 = new Document();
for(Fieldable f : doc1.getFields()) {
Field field1 = (Field) f;
Field field2 = new Field(field1.name(),
field1.stringValue(),
field1.isStored() ? Field.Store.YES : Field.Store.NO,
field1.isIndexed() ? (field1.isTokenized() ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED) : Field.Index.NO);
if (field1.getOmitNorms()) {
field2.setOmitNorms(true);
}
if (field1.getOmitTermFreqAndPositions()) {
field2.setOmitTermFreqAndPositions(true);
}
doc2.add(field2);
}
return doc2;
}
@Test
public void testNRTThreads() throws Exception {
@ -218,7 +195,7 @@ public class TestNRTThreads extends LuceneTestCase {
allSubDocs.add(subDocs);
doc.add(packIDField);
docsList.add(cloneDoc(doc));
docsList.add(_TestUtil.cloneDocument(doc));
docIDs.add(doc.get("docid"));
final int maxDocCount = _TestUtil.nextInt(random, 1, 10);
@ -227,7 +204,7 @@ public class TestNRTThreads extends LuceneTestCase {
if (doc == null) {
break;
}
docsList.add(cloneDoc(doc));
docsList.add(_TestUtil.cloneDocument(doc));
docIDs.add(doc.get("docid"));
}
addCount.addAndGet(docsList.size());


@ -26,6 +26,7 @@
<fileset dir="common" includes="build.xml" />
<fileset dir="grouping" includes="build.xml" />
<fileset dir="queries" includes="build.xml" />
<fileset dir="join" includes="build.xml" />
<fileset dir="suggest" includes="build.xml" />
</subant>
</sequential>
@ -39,6 +40,7 @@
<fileset dir="common" includes="build.xml" />
<fileset dir="grouping" includes="build.xml" />
<fileset dir="queries" includes="build.xml" />
<fileset dir="join" includes="build.xml" />
<fileset dir="suggest" includes="build.xml" />
</subant>
</sequential>
@ -52,6 +54,7 @@
<fileset dir="common" includes="build.xml" />
<fileset dir="grouping" includes="build.xml" />
<fileset dir="queries" includes="build.xml" />
<fileset dir="join" includes="build.xml" />
<fileset dir="suggest" includes="build.xml" />
</subant>
</sequential>
@ -65,6 +68,7 @@
<fileset dir="common" includes="build.xml" />
<fileset dir="grouping" includes="build.xml" />
<fileset dir="queries" includes="build.xml" />
<fileset dir="join" includes="build.xml" />
<fileset dir="suggest" includes="build.xml" />
</subant>
</sequential>
@ -79,6 +83,7 @@
<fileset dir="common" includes="build.xml" />
<fileset dir="grouping" includes="build.xml" />
<fileset dir="queries" includes="build.xml" />
<fileset dir="join" includes="build.xml" />
<fileset dir="suggest" includes="build.xml" />
</subant>
</sequential>
@ -91,6 +96,7 @@
<fileset dir="common" includes="build.xml" />
<fileset dir="grouping" includes="build.xml" />
<fileset dir="queries" includes="build.xml" />
<fileset dir="join" includes="build.xml" />
<fileset dir="suggest" includes="build.xml" />
</subant>
</sequential>
@ -105,6 +111,7 @@
<fileset dir="common" includes="build.xml" />
<fileset dir="grouping" includes="build.xml" />
<fileset dir="queries" includes="build.xml" />
<fileset dir="join" includes="build.xml" />
<fileset dir="suggest" includes="build.xml" />
</subant>
</sequential>

modules/join/build.xml Normal file

@ -0,0 +1,39 @@
<?xml version="1.0"?>
<project name="join" default="default">
<description>
Queries and collectors for performing joins
</description>
<property name="build.dir" location="build/" />
<import file="../../lucene/contrib/contrib-build.xml"/>
<property name="build.dir" location="build/" />
<property name="dist.dir" location="dist/" />
<module-uptodate name="grouping" jarfile="${common.dir}/../modules/grouping/build/lucene-grouping-${version}.jar"
property="grouping.uptodate" classpath.property="grouping.jar"/>
<path id="classpath">
<pathelement path="${grouping.jar}"/>
<path refid="base.classpath"/>
</path>
<path id="run.classpath">
<path refid="classpath"/>
<pathelement location="${build.dir}/classes/java"/>
</path>
<property name="build.dir" location="build/" />
<property name="dist.dir" location="dist/" />
<property name="maven.dist.dir" location="../dist/maven" />
<target name="compile-grouping" unless="grouping.uptodate">
<subant target="default">
<fileset dir="${common.dir}/../modules/grouping" includes="build.xml"/>
</subant>
</target>
<target name="init" depends="contrib-build.init,compile-grouping"/>
<target name="dist-maven" depends="jar-core,javadocs,contrib-build.dist-maven" />
</project>


@ -0,0 +1,472 @@
package org.apache.lucene.search.join;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter; // javadocs
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.FieldComparator;
import org.apache.lucene.search.FieldValueHitQueue;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreCachingWrappingScorer;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopDocsCollector;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.grouping.GroupDocs;
import org.apache.lucene.search.grouping.TopGroups;
import org.apache.lucene.util.ArrayUtil;
/** Collects parent document hits for a Query containing one or more
* BlockJoinQuery clauses, sorted by the
* specified parent Sort. Note that this cannot perform
* arbitrary joins; rather, it requires that all joined
* documents are indexed as a doc block (using {@link
* IndexWriter#addDocuments} or {@link
* IndexWriter#updateDocuments}). Ie, the join is computed
* at index time.
*
* <p>The parent Sort must only use
* fields from the parent documents; sorting by field in
* the child documents is not supported.</p>
*
* <p>You should only use this
* collector if one or more of the clauses in the query is
* a {@link BlockJoinQuery}. This collector will find those query
* clauses and record the matching child documents for the
* top scoring parent documents.</p>
*
* <p>Multiple joins (star join) and nested joins and a mix
* of the two are allowed, as long as in all cases the
* documents corresponding to a single row of each joined
* parent table were indexed as a doc block.</p>
*
* <p>For the simple star join you can retrieve the
* {@link TopGroups} instance containing each {@link BlockJoinQuery}'s
* matching child documents for the top parent groups,
* using {@link #getTopGroups}. Ie,
* a single query, which will contain two or more
* {@link BlockJoinQuery}'s as clauses representing the star join,
* can then retrieve two or more {@link TopGroups} instances.</p>
*
* <p>For nested joins, the query will run correctly (ie,
* match the right parent and child documents); however,
* because TopGroups is currently unable to support nesting
* (each group is not able to hold another TopGroups), you
* are only able to retrieve the TopGroups of the first
* join. The TopGroups of the nested joins will not be
* correct.
*
* See {@link org.apache.lucene.search.join} for a code
* sample.
*
* @lucene.experimental
*/
public class BlockJoinCollector extends Collector {
private final Sort sort;
// Maps each BlockJoinQuery instance to its "slot" in
// joinScorers and in OneGroup's cached doc/scores/count:
private final Map<Query,Integer> joinQueryID = new HashMap<Query,Integer>();
private final int numParentHits;
private final FieldValueHitQueue<OneGroup> queue;
private final FieldComparator[] comparators;
private final int[] reverseMul;
private final int compEnd;
private final boolean trackMaxScore;
private final boolean trackScores;
private int docBase;
private BlockJoinQuery.BlockJoinScorer[] joinScorers = new BlockJoinQuery.BlockJoinScorer[0];
private IndexReader.AtomicReaderContext currentReaderContext;
private Scorer scorer;
private boolean queueFull;
private OneGroup bottom;
private int totalHitCount;
private float maxScore = Float.NaN;
/** Creates a BlockJoinCollector. The provided sort must
* not be null. */
public BlockJoinCollector(Sort sort, int numParentHits, boolean trackScores, boolean trackMaxScore) throws IOException {
// TODO: allow null sort to be specialized to relevance
// only collector
this.sort = sort;
this.trackMaxScore = trackMaxScore;
this.trackScores = trackScores;
this.numParentHits = numParentHits;
queue = FieldValueHitQueue.create(sort.getSort(), numParentHits);
comparators = queue.getComparators();
reverseMul = queue.getReverseMul();
compEnd = comparators.length - 1;
}
private static final class OneGroup extends FieldValueHitQueue.Entry {
public OneGroup(int comparatorSlot, int parentDoc, float parentScore, int numJoins, boolean doScores) {
super(comparatorSlot, parentDoc, parentScore);
docs = new int[numJoins][];
for(int joinID=0;joinID<numJoins;joinID++) {
docs[joinID] = new int[5];
}
if (doScores) {
scores = new float[numJoins][];
for(int joinID=0;joinID<numJoins;joinID++) {
scores[joinID] = new float[5];
}
}
counts = new int[numJoins];
}
AtomicReaderContext readerContext;
int[][] docs;
float[][] scores;
int[] counts;
};
@Override
public void collect(int parentDoc) throws IOException {
//System.out.println("C parentDoc=" + parentDoc);
totalHitCount++;
float score = Float.NaN;
if (trackMaxScore) {
score = scorer.score();
if (score > maxScore) {
maxScore = score;
}
}
// TODO: we could sweep all joinScorers here and
// aggregate total child hit count, so we can fill this
// in getTopGroups (we wire it to 0 now)
if (queueFull) {
//System.out.println(" queueFull");
// Fastmatch: return if this hit is not competitive
for (int i = 0;; i++) {
final int c = reverseMul[i] * comparators[i].compareBottom(parentDoc);
if (c < 0) {
// Definitely not competitive.
//System.out.println(" skip");
return;
} else if (c > 0) {
// Definitely competitive.
break;
} else if (i == compEnd) {
// Here c=0. If we're at the last comparator, this doc is not
// competitive, since docs are visited in doc Id order, which means
// this doc cannot compete with any other document in the queue.
//System.out.println(" skip");
return;
}
}
//System.out.println(" competes! doc=" + (docBase + parentDoc));
// This hit is competitive - replace bottom element in queue & adjustTop
for (int i = 0; i < comparators.length; i++) {
comparators[i].copy(bottom.slot, parentDoc);
}
if (!trackMaxScore && trackScores) {
score = scorer.score();
}
bottom.doc = docBase + parentDoc;
bottom.readerContext = currentReaderContext;
bottom.score = score;
copyGroups(bottom);
bottom = queue.updateTop();
for (int i = 0; i < comparators.length; i++) {
comparators[i].setBottom(bottom.slot);
}
} else {
// Startup transient: queue is not yet full:
final int comparatorSlot = totalHitCount - 1;
// Copy hit into queue
for (int i = 0; i < comparators.length; i++) {
comparators[i].copy(comparatorSlot, parentDoc);
}
//System.out.println(" startup: new OG doc=" + (docBase+parentDoc));
final OneGroup og = new OneGroup(comparatorSlot, docBase+parentDoc, score, joinScorers.length, trackScores);
og.readerContext = currentReaderContext;
copyGroups(og);
bottom = queue.add(og);
queueFull = totalHitCount == numParentHits;
if (queueFull) {
// End of startup transient: queue just filled up:
for (int i = 0; i < comparators.length; i++) {
comparators[i].setBottom(bottom.slot);
}
}
}
}
// Pulls out child docs and scores for all join queries:
private void copyGroups(OneGroup og) {
// While rare, it's possible top arrays could be too
// short if join query had null scorer on first
// segment(s) but then became non-null on later segments
final int numSubScorers = joinScorers.length;
if (og.docs.length < numSubScorers) {
// While rare, this could happen if join query had
// null scorer on first segment(s) but then became
// non-null on later segments
og.docs = ArrayUtil.grow(og.docs);
}
if (og.counts.length < numSubScorers) {
og.counts = ArrayUtil.grow(og.counts);
}
if (trackScores && og.scores.length < numSubScorers) {
og.scores = ArrayUtil.grow(og.scores);
}
//System.out.println("copyGroups parentDoc=" + og.doc);
for(int scorerIDX = 0;scorerIDX < numSubScorers;scorerIDX++) {
final BlockJoinQuery.BlockJoinScorer joinScorer = joinScorers[scorerIDX];
//System.out.println(" scorer=" + joinScorer);
if (joinScorer != null) {
og.counts[scorerIDX] = joinScorer.getChildCount();
//System.out.println(" count=" + og.counts[scorerIDX]);
og.docs[scorerIDX] = joinScorer.swapChildDocs(og.docs[scorerIDX]);
/*
for(int idx=0;idx<og.counts[scorerIDX];idx++) {
System.out.println(" docs[" + idx + "]=" + og.docs[scorerIDX][idx]);
}
*/
if (trackScores) {
og.scores[scorerIDX] = joinScorer.swapChildScores(og.scores[scorerIDX]);
}
}
}
}
@Override
public void setNextReader(AtomicReaderContext context) throws IOException {
currentReaderContext = context;
docBase = context.docBase;
for (int compIDX = 0; compIDX < comparators.length; compIDX++) {
queue.setComparator(compIDX, comparators[compIDX].setNextReader(context));
}
}
@Override
public boolean acceptsDocsOutOfOrder() {
return false;
}
@Override
public void setScorer(Scorer scorer) {
//System.out.println("C.setScorer scorer=" + scorer);
// Since we invoke .score(), and the comparators likely
// do as well, cache it so it's only "really" computed
// once:
this.scorer = new ScoreCachingWrappingScorer(scorer);
for (int compIDX = 0; compIDX < comparators.length; compIDX++) {
comparators[compIDX].setScorer(this.scorer);
}
Arrays.fill(joinScorers, null);
// Find any BlockJoinScorers out there:
scorer.visitScorers(new Scorer.ScorerVisitor<Query,Query,Scorer>() {
private void enroll(BlockJoinQuery query, BlockJoinQuery.BlockJoinScorer scorer) {
final Integer slot = joinQueryID.get(query);
if (slot == null) {
joinQueryID.put(query, joinScorers.length);
//System.out.println("found JQ: " + query + " slot=" + joinScorers.length);
final BlockJoinQuery.BlockJoinScorer[] newArray = new BlockJoinQuery.BlockJoinScorer[1+joinScorers.length];
System.arraycopy(joinScorers, 0, newArray, 0, joinScorers.length);
joinScorers = newArray;
joinScorers[joinScorers.length-1] = scorer;
} else {
joinScorers[slot] = scorer;
}
}
@Override
public void visitOptional(Query parent, Query child, Scorer scorer) {
//System.out.println("visitOpt");
if (child instanceof BlockJoinQuery) {
enroll((BlockJoinQuery) child,
(BlockJoinQuery.BlockJoinScorer) scorer);
}
}
@Override
public void visitRequired(Query parent, Query child, Scorer scorer) {
//System.out.println("visitReq parent=" + parent + " child=" + child + " scorer=" + scorer);
if (child instanceof BlockJoinQuery) {
enroll((BlockJoinQuery) child,
(BlockJoinQuery.BlockJoinScorer) scorer);
}
}
@Override
public void visitProhibited(Query parent, Query child, Scorer scorer) {
//System.out.println("visitProh");
if (child instanceof BlockJoinQuery) {
enroll((BlockJoinQuery) child,
(BlockJoinQuery.BlockJoinScorer) scorer);
}
}
});
}
private final static class FakeScorer extends Scorer {
float score;
int doc;
public FakeScorer() {
super((Weight) null);
}
@Override
public float score() {
return score;
}
@Override
public int docID() {
return doc;
}
@Override
public int advance(int target) {
throw new UnsupportedOperationException();
}
@Override
public int nextDoc() {
throw new UnsupportedOperationException();
}
}
private OneGroup[] sortedGroups;
private void sortQueue() {
sortedGroups = new OneGroup[queue.size()];
for(int downTo=queue.size()-1;downTo>=0;downTo--) {
sortedGroups[downTo] = queue.pop();
}
}
/** Return the TopGroups for the specified
* BlockJoinQuery. The groupValue of each GroupDocs will
* be the parent docID for that group. Note that the
* {@link GroupDocs#totalHits}, which would be the
* total number of child documents matching that parent,
* is not computed (will always be 0). Returns null if
* no groups matched. */
@SuppressWarnings("unchecked")
public TopGroups<Integer> getTopGroups(BlockJoinQuery query, Sort withinGroupSort, int offset, int maxDocsPerGroup, int withinGroupOffset, boolean fillSortFields)
throws IOException {
final Integer _slot = joinQueryID.get(query);
if (_slot == null) {
if (totalHitCount == 0) {
return null;
} else {
throw new IllegalArgumentException("the Query did not contain the provided BlockJoinQuery");
}
}
// unbox once
final int slot = _slot;
if (offset >= queue.size()) {
return null;
}
int totalGroupedHitCount = 0;
if (sortedGroups == null) {
sortQueue();
}
final FakeScorer fakeScorer = new FakeScorer();
final GroupDocs<Integer>[] groups = new GroupDocs[sortedGroups.length - offset];
for(int groupIDX=offset;groupIDX<sortedGroups.length;groupIDX++) {
final OneGroup og = sortedGroups[groupIDX];
// At this point we hold all docs w/ in each group,
// unsorted; we now sort them:
final TopDocsCollector collector;
if (withinGroupSort == null) {
// Sort by score
if (!trackScores) {
throw new IllegalArgumentException("cannot sort by relevance within group: trackScores=false");
}
collector = TopScoreDocCollector.create(maxDocsPerGroup, true);
} else {
// Sort by fields
collector = TopFieldCollector.create(withinGroupSort, maxDocsPerGroup, fillSortFields, trackScores, trackMaxScore, true);
}
collector.setScorer(fakeScorer);
collector.setNextReader(og.readerContext);
final int numChildDocs = og.counts[slot];
for(int docIDX=0;docIDX<numChildDocs;docIDX++) {
final int doc = og.docs[slot][docIDX];
fakeScorer.doc = doc;
if (trackScores) {
fakeScorer.score = og.scores[slot][docIDX];
}
collector.collect(doc);
}
totalGroupedHitCount += numChildDocs;
final Object[] groupSortValues;
if (fillSortFields) {
groupSortValues = new Object[comparators.length];
for(int sortFieldIDX=0;sortFieldIDX<comparators.length;sortFieldIDX++) {
groupSortValues[sortFieldIDX] = comparators[sortFieldIDX].value(og.slot);
}
} else {
groupSortValues = null;
}
final TopDocs topDocs = collector.topDocs(withinGroupOffset, maxDocsPerGroup);
groups[groupIDX-offset] = new GroupDocs<Integer>(topDocs.getMaxScore(),
og.counts[slot],
topDocs.scoreDocs,
og.doc,
groupSortValues);
}
return new TopGroups<Integer>(new TopGroups<Integer>(sort.getSort(),
withinGroupSort == null ? null : withinGroupSort.getSort(),
0, totalGroupedHitCount, groups),
totalHitCount);
}
}
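
The javadoc above describes the simple star join, but no code in this commit
exercises it directly (TestBlockJoin leaves it as a TODO), so here is a
hedged sketch. The field names are invented, searcher and parentsFilter are
assumed to exist, and each parent block is assumed to have been indexed with
all of its children, from both dimensions, preceding the parent:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.grouping.TopGroups;
import org.apache.lucene.search.join.BlockJoinCollector;
import org.apache.lucene.search.join.BlockJoinQuery;

class StarJoinSketch {
  static void demo(IndexSearcher searcher, Filter parentsFilter) throws Exception {
    // Two child dimensions joined up to the same parent space:
    BlockJoinQuery jobsJoin = new BlockJoinQuery(
        new TermQuery(new Term("skill", "java")), parentsFilter, BlockJoinQuery.ScoreMode.Avg);
    BlockJoinQuery schoolsJoin = new BlockJoinQuery(
        new TermQuery(new Term("degree", "cs")), parentsFilter, BlockJoinQuery.ScoreMode.Avg);

    BooleanQuery starQuery = new BooleanQuery();
    starQuery.add(jobsJoin, BooleanClause.Occur.MUST);
    starQuery.add(schoolsJoin, BooleanClause.Occur.MUST);

    BlockJoinCollector c = new BlockJoinCollector(Sort.RELEVANCE, 10, true, false);
    searcher.search(starQuery, c);

    // One TopGroups per BlockJoinQuery clause, keyed by the query instance
    // (either may be null if nothing matched):
    TopGroups<Integer> jobHits = c.getTopGroups(jobsJoin, null, 0, 10, 0, true);
    TopGroups<Integer> schoolHits = c.getTopGroups(schoolsJoin, null, 0, 10, 0, true);
  }
}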


@ -0,0 +1,410 @@
package org.apache.lucene.search.join;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Set;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter; // javadocs
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.grouping.TopGroups;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.OpenBitSet;
/**
* This query requires that you index
* children and parent docs as a single block, using the
* {@link IndexWriter#addDocuments} or {@link
* IndexWriter#updateDocuments} API. In each block, the
* child documents must appear first, ending with the parent
* document. At search time you provide a Filter
* identifying the parents; however, this Filter must provide
* an {@link OpenBitSet} per sub-reader.
*
* <p>Once the block index is built, use this query to wrap
* any sub-query matching only child docs and join matches in that
* child document space up to the parent document space.
* You can then use this Query as a clause with
* other queries in the parent document space.</p>
*
* <p>The child documents must be orthogonal to the parent
* documents: the wrapped child query must never
* return a parent document.</p>
*
* If you'd like to retrieve {@link TopGroups} for the
* resulting query, use the {@link BlockJoinCollector}.
* Note that this is not necessary, ie, if you simply want
* to collect the parent documents and don't need to see
* which child documents matched under that parent, then
* you can use any collector.
*
* <p><b>NOTE</b>: If the overall query contains parent-only
* matches, for example you OR a parent-only query with a
* joined child-only query, then the resulting collected documents
* will be correct; however, the {@link TopGroups} you get
* from {@link BlockJoinCollector} will not contain every
* child for parents that matched.
*
* <p>See {@link org.apache.lucene.search.join} for an
* overview. </p>
*
* @lucene.experimental
*/
public class BlockJoinQuery extends Query {
public static enum ScoreMode {None, Avg, Max, Total};
private final Filter parentsFilter;
private final Query childQuery;
// If we are rewritten, this is the original childQuery we
// were passed; we use this for .equals() and
// .hashCode(). This makes rewritten query equal the
// original, so that user does not have to .rewrite() their
// query before searching:
private final Query origChildQuery;
private final ScoreMode scoreMode;
public BlockJoinQuery(Query childQuery, Filter parentsFilter, ScoreMode scoreMode) {
super();
this.origChildQuery = childQuery;
this.childQuery = childQuery;
this.parentsFilter = parentsFilter;
this.scoreMode = scoreMode;
}
private BlockJoinQuery(Query origChildQuery, Query childQuery, Filter parentsFilter, ScoreMode scoreMode) {
super();
this.origChildQuery = origChildQuery;
this.childQuery = childQuery;
this.parentsFilter = parentsFilter;
this.scoreMode = scoreMode;
}
public Weight createWeight(IndexSearcher searcher) throws IOException {
return new BlockJoinWeight(this, childQuery.createWeight(searcher), parentsFilter, scoreMode);
}
private static class BlockJoinWeight extends Weight {
private final Query joinQuery;
private final Weight childWeight;
private final Filter parentsFilter;
private final ScoreMode scoreMode;
public BlockJoinWeight(Query joinQuery, Weight childWeight, Filter parentsFilter, ScoreMode scoreMode) {
super();
this.joinQuery = joinQuery;
this.childWeight = childWeight;
this.parentsFilter = parentsFilter;
this.scoreMode = scoreMode;
}
@Override
public Query getQuery() {
return joinQuery;
}
@Override
public float getValue() {
return childWeight.getValue();
}
@Override
public float sumOfSquaredWeights() throws IOException {
return childWeight.sumOfSquaredWeights();
}
@Override
public void normalize(float norm) {
childWeight.normalize(norm);
}
@Override
public Scorer scorer(AtomicReaderContext readerContext, ScorerContext context) throws IOException {
// Pass scoreDocsInOrder true, topScorer false to our sub:
final Scorer childScorer = childWeight.scorer(readerContext, ScorerContext.def().scoreDocsInOrder(true).topScorer(false));
if (childScorer == null) {
// No matches
return null;
}
final int firstChildDoc = childScorer.nextDoc();
if (firstChildDoc == DocIdSetIterator.NO_MORE_DOCS) {
// No matches
return null;
}
final DocIdSet parents = parentsFilter.getDocIdSet(readerContext);
// TODO: once we do random-access filters we can
// generalize this:
if (parents == null) {
// No matches
return null;
}
if (!(parents instanceof OpenBitSet)) {
throw new IllegalStateException("parentFilter must return OpenBitSet; got " + parents);
}
return new BlockJoinScorer(this, childScorer, (OpenBitSet) parents, firstChildDoc, scoreMode);
}
@Override
public Explanation explain(AtomicReaderContext reader, int doc) throws IOException {
// TODO
throw new UnsupportedOperationException(getClass().getName() +
" cannot explain match on parent document");
}
@Override
public boolean scoresDocsOutOfOrder() {
return false;
}
}
static class BlockJoinScorer extends Scorer {
private final Scorer childScorer;
private final OpenBitSet parentBits;
private final ScoreMode scoreMode;
private int parentDoc;
private float parentScore;
private int nextChildDoc;
private int[] pendingChildDocs = new int[5];
private float[] pendingChildScores;
private int childDocUpto;
public BlockJoinScorer(Weight weight, Scorer childScorer, OpenBitSet parentBits, int firstChildDoc, ScoreMode scoreMode) {
super(weight);
//System.out.println("Q.init firstChildDoc=" + firstChildDoc);
this.parentBits = parentBits;
this.childScorer = childScorer;
this.scoreMode = scoreMode;
if (scoreMode != ScoreMode.None) {
pendingChildScores = new float[5];
}
nextChildDoc = firstChildDoc;
}
@Override
public void visitSubScorers(Query parent, BooleanClause.Occur relationship,
ScorerVisitor<Query, Query, Scorer> visitor) {
super.visitSubScorers(parent, relationship, visitor);
//childScorer.visitSubScorers(weight.getQuery(), BooleanClause.Occur.MUST, visitor);
childScorer.visitScorers(visitor);
}
int getChildCount() {
return childDocUpto;
}
int[] swapChildDocs(int[] other) {
final int[] ret = pendingChildDocs;
if (other == null) {
pendingChildDocs = new int[5];
} else {
pendingChildDocs = other;
}
return ret;
}
float[] swapChildScores(float[] other) {
if (scoreMode == ScoreMode.None) {
throw new IllegalStateException("ScoreMode is None");
}
final float[] ret = pendingChildScores;
if (other == null) {
pendingChildScores = new float[5];
} else {
pendingChildScores = other;
}
return ret;
}
@Override
public int nextDoc() throws IOException {
//System.out.println("Q.nextDoc() nextChildDoc=" + nextChildDoc);
if (nextChildDoc == NO_MORE_DOCS) {
//System.out.println(" end");
return parentDoc = NO_MORE_DOCS;
}
// Gather all children sharing the same parent as nextChildDoc
parentDoc = parentBits.nextSetBit(nextChildDoc);
//System.out.println(" parentDoc=" + parentDoc);
assert parentDoc != -1;
float totalScore = 0;
float maxScore = Float.NEGATIVE_INFINITY;
childDocUpto = 0;
do {
//System.out.println(" c=" + nextChildDoc);
if (pendingChildDocs.length == childDocUpto) {
pendingChildDocs = ArrayUtil.grow(pendingChildDocs);
if (scoreMode != ScoreMode.None) {
pendingChildScores = ArrayUtil.grow(pendingChildScores);
}
}
pendingChildDocs[childDocUpto] = nextChildDoc;
if (scoreMode != ScoreMode.None) {
// TODO: specialize this into dedicated classes per-scoreMode
final float childScore = childScorer.score();
pendingChildScores[childDocUpto] = childScore;
maxScore = Math.max(childScore, maxScore);
totalScore += childScore;
}
childDocUpto++;
nextChildDoc = childScorer.nextDoc();
} while (nextChildDoc < parentDoc);
//System.out.println(" nextChildDoc=" + nextChildDoc);
// Parent & child docs are supposed to be orthogonal:
assert nextChildDoc != parentDoc;
switch(scoreMode) {
case Avg:
parentScore = totalScore / childDocUpto;
break;
case Max:
parentScore = maxScore;
break;
case Total:
parentScore = totalScore;
break;
case None:
break;
}
//System.out.println(" return parentDoc=" + parentDoc);
return parentDoc;
}
@Override
public int docID() {
return parentDoc;
}
@Override
public float score() throws IOException {
return parentScore;
}
@Override
public int advance(int parentTarget) throws IOException {
//System.out.println("Q.advance parentTarget=" + parentTarget);
if (parentTarget == NO_MORE_DOCS) {
return parentDoc = NO_MORE_DOCS;
}
final int prevParentDoc = parentBits.prevSetBit(parentTarget-1);
//System.out.println(" rolled back to prevParentDoc=" + prevParentDoc + " vs parentDoc=" + parentDoc);
assert prevParentDoc >= parentDoc;
if (prevParentDoc > nextChildDoc) {
nextChildDoc = childScorer.advance(prevParentDoc);
// System.out.println(" childScorer advanced to child docID=" + nextChildDoc);
//} else {
//System.out.println(" skip childScorer advance");
}
// Parent & child docs are supposed to be orthogonal:
assert nextChildDoc != prevParentDoc;
final int nd = nextDoc();
//System.out.println(" return nextParentDoc=" + nd);
return nd;
}
}
@Override
public void extractTerms(Set<Term> terms) {
childQuery.extractTerms(terms);
}
@Override
public Query rewrite(IndexReader reader) throws IOException {
final Query childRewrite = childQuery.rewrite(reader);
if (childRewrite != childQuery) {
return new BlockJoinQuery(childQuery,
childRewrite,
parentsFilter,
scoreMode);
} else {
return this;
}
}
@Override
public String toString(String field) {
return "BlockJoinQuery ("+childQuery.toString()+")";
}
@Override
public void setBoost(float boost) {
throw new UnsupportedOperationException("this query cannot support boosting; please use childQuery.setBoost instead");
}
@Override
public float getBoost() {
throw new UnsupportedOperationException("this query cannot support boosting; please use childQuery.getBoost instead");
}
@Override
public boolean equals(Object _other) {
if (_other instanceof BlockJoinQuery) {
final BlockJoinQuery other = (BlockJoinQuery) _other;
return origChildQuery.equals(other.origChildQuery) &&
parentsFilter.equals(other.parentsFilter) &&
scoreMode == other.scoreMode;
} else {
return false;
}
}
@Override
public int hashCode() {
final int prime = 31;
int hash = 1;
hash = prime * hash + origChildQuery.hashCode();
hash = prime * hash + scoreMode.hashCode();
hash = prime * hash + parentsFilter.hashCode();
return hash;
}
@Override
public Object clone() {
return new BlockJoinQuery((Query) origChildQuery.clone(),
parentsFilter,
scoreMode);
}
}
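
BlockJoinWeight.scorer above casts the filter's DocIdSet straight to
OpenBitSet, so the parents Filter must produce one per segment. As of this
commit the practical recipe, used verbatim by TestBlockJoin below, is to wrap
the parent query in QueryWrapperFilter and cache it, since the caching
wrapper materializes each segment's matches into an OpenBitSet-derived set:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.CachingWrapperFilter;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.search.TermQuery;

class ParentsFilterSketch {
  // "docType"/"resume" mirror the test below; any parent-identifying query works:
  static Filter newParentsFilter() {
    return new CachingWrapperFilter(
        new QueryWrapperFilter(new TermQuery(new Term("docType", "resume"))));
  }
}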


@ -0,0 +1,32 @@
<html>
<body>
<p>This module supports index-time joins while searching, where joined
documents are indexed as a single document block using
{@link org.apache.lucene.index.IndexWriter#addDocuments}. This is useful
for normalized content such as XML documents or database tables. In
database terms, all rows for all joined tables matching a single row of
the primary table must be indexed as a single document block, with the
parent document last in the group.</p>
<p>When you index in this way, the documents in your index are divided
into parent documents (the last document of each block) and child
documents (all others). You provide a {@link org.apache.lucene.search.Filter} that identifies the
parent documents, as Lucene does not currently record any information
about doc blocks.</p>
<p>At search time, use {@link org.apache.lucene.search.join.BlockJoinQuery} to remap
matches from any child {@link org.apache.lucene.search.Query} (ie, a query that matches only
child documents) up to the parent document space. The resulting
{@link org.apache.lucene.search.join.BlockJoinQuery} can then be used as a clause in any query that
matches parent documents.</p>
<p>If you only care about the parent documents matching the query, you
can use any collector to collect the parent hits, but if you'd also
like to see which child documents match for each parent document,
use the {@link org.apache.lucene.search.join.BlockJoinCollector} to collect the hits. Once the
search is done, you retrieve a {@link
org.apache.lucene.search.grouping.TopGroups} instance from the
{@link org.apache.lucene.search.join.BlockJoinCollector#getTopGroups} method.</p>
</body>
</html>
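
The BlockJoinCollector javadoc earlier in this commit points here "for a code
sample", so a condensed sketch of the flow the package doc describes,
distilled from the TestBlockJoin.testSimple case that follows (searcher is
assumed to be an open IndexSearcher over a block-indexed index):

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.CachingWrapperFilter;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.grouping.TopGroups;
import org.apache.lucene.search.join.BlockJoinCollector;
import org.apache.lucene.search.join.BlockJoinQuery;

class PackageDocSketch {
  static TopGroups<Integer> search(IndexSearcher searcher) throws Exception {
    // 1. A filter identifying the parent documents:
    Filter parents = new CachingWrapperFilter(
        new QueryWrapperFilter(new TermQuery(new Term("docType", "resume"))));

    // 2. A query matching only child documents, remapped to the parent space:
    Query childQuery = new TermQuery(new Term("skill", "java"));
    BlockJoinQuery childJoin =
        new BlockJoinQuery(childQuery, parents, BlockJoinQuery.ScoreMode.Avg);

    // 3. Combine with parent-space clauses and collect:
    BooleanQuery full = new BooleanQuery();
    full.add(new TermQuery(new Term("country", "United Kingdom")), BooleanClause.Occur.MUST);
    full.add(childJoin, BooleanClause.Occur.MUST);

    BlockJoinCollector c = new BlockJoinCollector(Sort.RELEVANCE, 10, true, false);
    searcher.search(full, c);

    // 4. Retrieve each top parent's matching children (null if no matches):
    return c.getTopGroups(childJoin, null, 0, 10, 0, true);
  }
}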


@ -0,0 +1,5 @@
<html>
<body>
Lucene's join module
</body>
</html>


@ -0,0 +1,476 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.grouping.GroupDocs;
import org.apache.lucene.search.grouping.TopGroups;
import org.apache.lucene.search.join.BlockJoinCollector;
import org.apache.lucene.search.join.BlockJoinQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
public class TestBlockJoin extends LuceneTestCase {
// One resume...
private Document makeResume(String name, String country) {
Document resume = new Document();
resume.add(newField("docType", "resume", Field.Index.NOT_ANALYZED));
resume.add(newField("name", name, Field.Store.YES, Field.Index.NOT_ANALYZED));
resume.add(newField("country", country, Field.Index.NOT_ANALYZED));
return resume;
}
// ... has multiple jobs
private Document makeJob(String skill, int year) {
Document job = new Document();
job.add(newField("skill", skill, Field.Store.YES, Field.Index.NOT_ANALYZED));
job.add(new NumericField("year").setIntValue(year));
return job;
}
public void testSimple() throws Exception {
final Directory dir = newDirectory();
final RandomIndexWriter w = new RandomIndexWriter(random, dir);
final List<Document> docs = new ArrayList<Document>();
docs.add(makeJob("java", 2007));
docs.add(makeJob("python", 2010));
docs.add(makeResume("Lisa", "United Kingdom"));
w.addDocuments(docs);
docs.clear();
docs.add(makeJob("ruby", 2005));
docs.add(makeJob("java", 2006));
docs.add(makeResume("Frank", "United States"));
w.addDocuments(docs);
IndexReader r = w.getReader();
w.close();
IndexSearcher s = new IndexSearcher(r);
// Create a filter that defines "parent" documents in the index - in this case resumes
Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("docType", "resume"))));
// Define child document criteria (finds an example of relevant work experience)
BooleanQuery childQuery = new BooleanQuery();
childQuery.add(new BooleanClause(new TermQuery(new Term("skill", "java")), Occur.MUST));
childQuery.add(new BooleanClause(NumericRangeQuery.newIntRange("year", 2006, 2011, true, true), Occur.MUST));
// Define parent document criteria (find a resident in the UK)
Query parentQuery = new TermQuery(new Term("country", "United Kingdom"));
// Wrap the child document query to 'join' any matches
// up to corresponding parent:
BlockJoinQuery childJoinQuery = new BlockJoinQuery(childQuery, parentsFilter, BlockJoinQuery.ScoreMode.Avg);
// Combine the parent and nested child queries into a single query for a candidate
BooleanQuery fullQuery = new BooleanQuery();
fullQuery.add(new BooleanClause(parentQuery, Occur.MUST));
fullQuery.add(new BooleanClause(childJoinQuery, Occur.MUST));
BlockJoinCollector c = new BlockJoinCollector(Sort.RELEVANCE, 1, true, false);
s.search(fullQuery, c);
TopGroups<Integer> results = c.getTopGroups(childJoinQuery, null, 0, 10, 0, true);
//assertEquals(1, results.totalHitCount);
assertEquals(1, results.totalGroupedHitCount);
assertEquals(1, results.groups.length);
final GroupDocs<Integer> group = results.groups[0];
assertEquals(1, group.totalHits);
Document childDoc = s.doc(group.scoreDocs[0].doc);
//System.out.println(" doc=" + group.scoreDocs[0].doc);
assertEquals("java", childDoc.get("skill"));
assertNotNull(group.groupValue);
Document parentDoc = s.doc(group.groupValue);
assertEquals("Lisa", parentDoc.get("name"));
r.close();
dir.close();
}
private String[][] getRandomFields(int maxUniqueValues) {
final String[][] fields = new String[_TestUtil.nextInt(random, 2, 4)][];
for(int fieldID=0;fieldID<fields.length;fieldID++) {
final int valueCount;
if (fieldID == 0) {
valueCount = 2;
} else {
valueCount = _TestUtil.nextInt(random, 1, maxUniqueValues);
}
final String[] values = fields[fieldID] = new String[valueCount];
for(int i=0;i<valueCount;i++) {
values[i] = _TestUtil.randomRealisticUnicodeString(random);
//values[i] = _TestUtil.randomSimpleString(random);
}
}
return fields;
}
private Term randomParentTerm(String[] values) {
return new Term("parent0", values[random.nextInt(values.length)]);
}
private Term randomChildTerm(String[] values) {
return new Term("child0", values[random.nextInt(values.length)]);
}
private Sort getRandomSort(String prefix, int numFields) {
final List<SortField> sortFields = new ArrayList<SortField>();
// TODO: sometimes sort by score; problem is scores are
// not comparable across the two indices
// sortFields.add(SortField.FIELD_SCORE);
if (random.nextBoolean()) {
sortFields.add(new SortField(prefix + random.nextInt(numFields), SortField.Type.STRING, random.nextBoolean()));
} else if (random.nextBoolean()) {
sortFields.add(new SortField(prefix + random.nextInt(numFields), SortField.Type.STRING, random.nextBoolean()));
sortFields.add(new SortField(prefix + random.nextInt(numFields), SortField.Type.STRING, random.nextBoolean()));
}
// Break ties:
sortFields.add(new SortField(prefix + "ID", SortField.Type.INT));
return new Sort(sortFields.toArray(new SortField[sortFields.size()]));
}
public void testRandom() throws Exception {
// We build two indices at once: one normalized (which
// BlockJoinQuery/Collector can query) and the other w/
// same docs just fully denormalized:
final Directory dir = newDirectory();
final Directory joinDir = newDirectory();
final int numParentDocs = _TestUtil.nextInt(random, 100*RANDOM_MULTIPLIER, 300*RANDOM_MULTIPLIER);
//final int numParentDocs = 30;
// Values for parent fields:
final String[][] parentFields = getRandomFields(numParentDocs/2);
// Values for child fields:
final String[][] childFields = getRandomFields(numParentDocs);
// TODO: test star join, nested join cases too!
final RandomIndexWriter w = new RandomIndexWriter(random, dir);
final RandomIndexWriter joinW = new RandomIndexWriter(random, joinDir);
for(int parentDocID=0;parentDocID<numParentDocs;parentDocID++) {
Document parentDoc = new Document();
Document parentJoinDoc = new Document();
Field id = newField("parentID", ""+parentDocID, Field.Store.YES, Field.Index.NOT_ANALYZED);
parentDoc.add(id);
parentJoinDoc.add(id);
parentJoinDoc.add(newField("isParent", "x", Field.Index.NOT_ANALYZED));
for(int field=0;field<parentFields.length;field++) {
if (random.nextDouble() < 0.9) {
Field f = newField("parent" + field,
parentFields[field][random.nextInt(parentFields[field].length)],
Field.Index.NOT_ANALYZED);
parentDoc.add(f);
parentJoinDoc.add(f);
}
}
final List<Document> joinDocs = new ArrayList<Document>();
if (VERBOSE) {
System.out.println(" " + parentDoc);
}
final int numChildDocs = _TestUtil.nextInt(random, 1, 20);
for(int childDocID=0;childDocID<numChildDocs;childDocID++) {
// Denormalize: copy all parent fields into child doc:
Document childDoc = _TestUtil.cloneDocument(parentDoc);
Document joinChildDoc = new Document();
joinDocs.add(joinChildDoc);
Field childID = newField("childID", ""+childDocID, Field.Store.YES, Field.Index.NOT_ANALYZED);
childDoc.add(childID);
joinChildDoc.add(childID);
for(int childFieldID=0;childFieldID<childFields.length;childFieldID++) {
if (random.nextDouble() < 0.9) {
Field f = newField("child" + childFieldID,
childFields[childFieldID][random.nextInt(childFields[childFieldID].length)],
Field.Index.NOT_ANALYZED);
childDoc.add(f);
joinChildDoc.add(f);
}
}
if (VERBOSE) {
System.out.println(" " + joinChildDoc);
}
w.addDocument(childDoc);
}
// Parent last:
joinDocs.add(parentJoinDoc);
joinW.addDocuments(joinDocs);
}
final IndexReader r = w.getReader();
w.close();
final IndexReader joinR = joinW.getReader();
joinW.close();
if (VERBOSE) {
System.out.println("TEST: reader=" + r);
System.out.println("TEST: joinReader=" + joinR);
for(int docIDX=0;docIDX<joinR.maxDoc();docIDX++) {
System.out.println(" docID=" + docIDX + " doc=" + joinR.document(docIDX));
}
}
final IndexSearcher s = new IndexSearcher(r);
s.setDefaultFieldSortScoring(true, true);
final IndexSearcher joinS = new IndexSearcher(joinR);
final Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("isParent", "x"))));
final int iters = 200*RANDOM_MULTIPLIER;
for(int iter=0;iter<iters;iter++) {
if (VERBOSE) {
System.out.println("TEST: iter=" + (1+iter) + " of " + iters);
}
final Query childQuery;
if (random.nextInt(3) == 2) {
final int childFieldID = random.nextInt(childFields.length);
childQuery = new TermQuery(new Term("child" + childFieldID,
childFields[childFieldID][random.nextInt(childFields[childFieldID].length)]));
} else if (random.nextInt(3) == 2) {
BooleanQuery bq = new BooleanQuery();
childQuery = bq;
final int numClauses = _TestUtil.nextInt(random, 2, 4);
boolean didMust = false;
for(int clauseIDX=0;clauseIDX<numClauses;clauseIDX++) {
Query clause;
BooleanClause.Occur occur;
if (!didMust && random.nextBoolean()) {
occur = random.nextBoolean() ? BooleanClause.Occur.MUST : BooleanClause.Occur.MUST_NOT;
clause = new TermQuery(randomChildTerm(childFields[0]));
didMust = true;
} else {
occur = BooleanClause.Occur.SHOULD;
final int childFieldID = _TestUtil.nextInt(random, 1, childFields.length-1);
clause = new TermQuery(new Term("child" + childFieldID,
childFields[childFieldID][random.nextInt(childFields[childFieldID].length)]));
}
bq.add(clause, occur);
}
} else {
BooleanQuery bq = new BooleanQuery();
childQuery = bq;
bq.add(new TermQuery(randomChildTerm(childFields[0])),
BooleanClause.Occur.MUST);
final int childFieldID = _TestUtil.nextInt(random, 1, childFields.length-1);
bq.add(new TermQuery(new Term("child" + childFieldID, childFields[childFieldID][random.nextInt(childFields[childFieldID].length)])),
random.nextBoolean() ? BooleanClause.Occur.MUST : BooleanClause.Occur.MUST_NOT);
}
final BlockJoinQuery childJoinQuery = new BlockJoinQuery(childQuery, parentsFilter, BlockJoinQuery.ScoreMode.Avg);
// To run against the block-join index:
final Query parentJoinQuery;
// Same query as parentJoinQuery, but to run against
// the fully denormalized index (so we can compare)
// results:
final Query parentQuery;
if (random.nextBoolean()) {
parentQuery = childQuery;
parentJoinQuery = childJoinQuery;
} else {
// AND parent field w/ child field
final BooleanQuery bq = new BooleanQuery();
parentJoinQuery = bq;
final Term parentTerm = randomParentTerm(parentFields[0]);
if (random.nextBoolean()) {
bq.add(childJoinQuery, BooleanClause.Occur.MUST);
bq.add(new TermQuery(parentTerm),
BooleanClause.Occur.MUST);
} else {
bq.add(new TermQuery(parentTerm),
BooleanClause.Occur.MUST);
bq.add(childJoinQuery, BooleanClause.Occur.MUST);
}
final BooleanQuery bq2 = new BooleanQuery();
parentQuery = bq2;
if (random.nextBoolean()) {
bq2.add(childQuery, BooleanClause.Occur.MUST);
bq2.add(new TermQuery(parentTerm),
BooleanClause.Occur.MUST);
} else {
bq2.add(new TermQuery(parentTerm),
BooleanClause.Occur.MUST);
bq2.add(childQuery, BooleanClause.Occur.MUST);
}
}
final Sort parentSort = getRandomSort("parent", parentFields.length);
final Sort childSort = getRandomSort("child", childFields.length);
if (VERBOSE) {
System.out.println("\nTEST: query=" + parentQuery + " joinQuery=" + parentJoinQuery + " parentSort=" + parentSort + " childSort=" + childSort);
}
// Merge both sorts:
final List<SortField> sortFields = new ArrayList<SortField>(Arrays.asList(parentSort.getSort()));
sortFields.addAll(Arrays.asList(childSort.getSort()));
final Sort parentAndChildSort = new Sort(sortFields.toArray(new SortField[sortFields.size()]));
final TopDocs results = s.search(parentQuery, null, r.numDocs(),
parentAndChildSort);
if (VERBOSE) {
System.out.println("\nTEST: normal index gets " + results.totalHits + " hits");
final ScoreDoc[] hits = results.scoreDocs;
for(int hitIDX=0;hitIDX<hits.length;hitIDX++) {
final Document doc = s.doc(hits[hitIDX].doc);
//System.out.println(" score=" + hits[hitIDX].score + " parentID=" + doc.get("parentID") + " childID=" + doc.get("childID") + " (docID=" + hits[hitIDX].doc + ")");
System.out.println(" parentID=" + doc.get("parentID") + " childID=" + doc.get("childID") + " (docID=" + hits[hitIDX].doc + ")");
FieldDoc fd = (FieldDoc) hits[hitIDX];
if (fd.fields != null) {
System.out.print(" ");
for(Object o : fd.fields) {
if (o instanceof BytesRef) {
System.out.print(((BytesRef) o).utf8ToString() + " ");
} else {
System.out.print(o + " ");
}
}
System.out.println();
}
}
}
final BlockJoinCollector c = new BlockJoinCollector(parentSort, 10, true, true);
joinS.search(parentJoinQuery, c);
final int hitsPerGroup = _TestUtil.nextInt(random, 1, 20);
//final int hitsPerGroup = 100;
final TopGroups<Integer> joinResults = c.getTopGroups(childJoinQuery, childSort, 0, hitsPerGroup, 0, true);
if (VERBOSE) {
System.out.println("\nTEST: block join index gets " + (joinResults == null ? 0 : joinResults.groups.length) + " groups; hitsPerGroup=" + hitsPerGroup);
if (joinResults != null) {
final GroupDocs<Integer>[] groups = joinResults.groups;
for(int groupIDX=0;groupIDX<groups.length;groupIDX++) {
final GroupDocs<Integer> group = groups[groupIDX];
if (group.groupSortValues != null) {
System.out.print(" ");
for(Object o : group.groupSortValues) {
if (o instanceof BytesRef) {
System.out.print(((BytesRef) o).utf8ToString() + " ");
} else {
System.out.print(o + " ");
}
}
System.out.println();
}
assertNotNull(group.groupValue);
final Document parentDoc = joinS.doc(group.groupValue);
System.out.println(" group parentID=" + parentDoc.get("parentID") + " (docID=" + group.groupValue + ")");
for(int hitIDX=0;hitIDX<group.scoreDocs.length;hitIDX++) {
final Document doc = joinS.doc(group.scoreDocs[hitIDX].doc);
//System.out.println(" score=" + group.scoreDocs[hitIDX].score + " childID=" + doc.get("childID") + " (docID=" + group.scoreDocs[hitIDX].doc + ")");
System.out.println(" childID=" + doc.get("childID") + " child0=" + doc.get("child0") + " (docID=" + group.scoreDocs[hitIDX].doc + ")");
}
}
}
}
if (results.totalHits == 0) {
assertNull(joinResults);
} else {
compareHits(r, joinR, results, joinResults);
}
}
r.close();
joinR.close();
dir.close();
joinDir.close();
}
private void compareHits(IndexReader r, IndexReader joinR, TopDocs results, TopGroups<Integer> joinResults) throws Exception {
// results is 'complete'; joinResults is a subset
int resultUpto = 0;
int joinGroupUpto = 0;
final ScoreDoc[] hits = results.scoreDocs;
final GroupDocs<Integer>[] groupDocs = joinResults.groups;
while(joinGroupUpto < groupDocs.length) {
final GroupDocs<Integer> group = groupDocs[joinGroupUpto++];
final ScoreDoc[] groupHits = group.scoreDocs;
assertNotNull(group.groupValue);
final Document parentDoc = joinR.document(group.groupValue);
final String parentID = parentDoc.get("parentID");
//System.out.println("GROUP groupDoc=" + group.groupDoc + " parent=" + parentDoc);
assertNotNull(parentID);
assertTrue(groupHits.length > 0);
for(int hitIDX=0;hitIDX<groupHits.length;hitIDX++) {
final Document nonJoinHit = r.document(hits[resultUpto++].doc);
final Document joinHit = joinR.document(groupHits[hitIDX].doc);
assertEquals(parentID,
nonJoinHit.get("parentID"));
assertEquals(joinHit.get("childID"),
nonJoinHit.get("childID"));
}
if (joinGroupUpto < groupDocs.length) {
// Advance non-join hit to the next parentID:
//System.out.println(" next joingroupUpto=" + joinGroupUpto + " gd.length=" + groupDocs.length + " parentID=" + parentID);
while(true) {
assertTrue(resultUpto < hits.length);
if (!parentID.equals(r.document(hits[resultUpto].doc).get("parentID"))) {
break;
}
resultUpto++;
}
}
}
}
}