mirror of https://github.com/apache/lucene.git
LUCENE-3171: add modules/join to enable joining parent + child documents when indexed as a doc block
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1140851 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2745612a4c
commit
4a3b510739
|
@ -66,6 +66,12 @@ New Features
|
|||
highlighting speed up. Use FastVectorHighlighter.setPhraseLimit() to set limit
|
||||
(e.g. 5000). (Mike Sokolov via Koji Sekiguchi)
|
||||
|
||||
* LUCENE-3171: Added BlockJoinQuery and BlockJoinCollector, under the
|
||||
new contrib/join module, to enable searches that require joining
|
||||
between parent and child documents. Joined (children + parent)
|
||||
documents must be indexed as a document block, using
|
||||
IndexWriter.add/UpdateDocuments ((Mark Harwood, Mike McCandless)
|
||||
|
||||
API Changes
|
||||
|
||||
Bug Fixes
|
||||
|
|
|
@ -793,8 +793,13 @@ public abstract class FieldComparator<T> {
|
|||
@Override
|
||||
public void setScorer(Scorer scorer) {
|
||||
// wrap with a ScoreCachingWrappingScorer so that successive calls to
|
||||
// score() will not incur score computation over and over again.
|
||||
this.scorer = new ScoreCachingWrappingScorer(scorer);
|
||||
// score() will not incur score computation over and
|
||||
// over again.
|
||||
if (!(scorer instanceof ScoreCachingWrappingScorer)) {
|
||||
this.scorer = new ScoreCachingWrappingScorer(scorer);
|
||||
} else {
|
||||
this.scorer = scorer;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -31,12 +31,12 @@ import org.apache.lucene.util.PriorityQueue;
|
|||
* @see IndexSearcher#search(Query,Filter,int,Sort)
|
||||
* @see FieldCache
|
||||
*/
|
||||
public abstract class FieldValueHitQueue extends PriorityQueue<FieldValueHitQueue.Entry> {
|
||||
public abstract class FieldValueHitQueue<T extends FieldValueHitQueue.Entry> extends PriorityQueue<T> {
|
||||
|
||||
final static class Entry extends ScoreDoc {
|
||||
int slot;
|
||||
public static class Entry extends ScoreDoc {
|
||||
public int slot;
|
||||
|
||||
Entry(int slot, int doc, float score) {
|
||||
public Entry(int slot, int doc, float score) {
|
||||
super(doc, score);
|
||||
this.slot = slot;
|
||||
}
|
||||
|
@ -51,7 +51,7 @@ public abstract class FieldValueHitQueue extends PriorityQueue<FieldValueHitQueu
|
|||
* An implementation of {@link FieldValueHitQueue} which is optimized in case
|
||||
* there is just one comparator.
|
||||
*/
|
||||
private static final class OneComparatorFieldValueHitQueue extends FieldValueHitQueue {
|
||||
private static final class OneComparatorFieldValueHitQueue<T extends FieldValueHitQueue.Entry> extends FieldValueHitQueue<T> {
|
||||
private final int oneReverseMul;
|
||||
|
||||
public OneComparatorFieldValueHitQueue(SortField[] fields, int size)
|
||||
|
@ -92,7 +92,7 @@ public abstract class FieldValueHitQueue extends PriorityQueue<FieldValueHitQueu
|
|||
* An implementation of {@link FieldValueHitQueue} which is optimized in case
|
||||
* there is more than one comparator.
|
||||
*/
|
||||
private static final class MultiComparatorsFieldValueHitQueue extends FieldValueHitQueue {
|
||||
private static final class MultiComparatorsFieldValueHitQueue<T extends FieldValueHitQueue.Entry> extends FieldValueHitQueue<T> {
|
||||
|
||||
public MultiComparatorsFieldValueHitQueue(SortField[] fields, int size)
|
||||
throws IOException {
|
||||
|
@ -156,24 +156,28 @@ public abstract class FieldValueHitQueue extends PriorityQueue<FieldValueHitQueu
|
|||
* The number of hits to retain. Must be greater than zero.
|
||||
* @throws IOException
|
||||
*/
|
||||
public static FieldValueHitQueue create(SortField[] fields, int size) throws IOException {
|
||||
public static <T extends FieldValueHitQueue.Entry> FieldValueHitQueue<T> create(SortField[] fields, int size) throws IOException {
|
||||
|
||||
if (fields.length == 0) {
|
||||
throw new IllegalArgumentException("Sort must contain at least one field");
|
||||
}
|
||||
|
||||
if (fields.length == 1) {
|
||||
return new OneComparatorFieldValueHitQueue(fields, size);
|
||||
return new OneComparatorFieldValueHitQueue<T>(fields, size);
|
||||
} else {
|
||||
return new MultiComparatorsFieldValueHitQueue(fields, size);
|
||||
return new MultiComparatorsFieldValueHitQueue<T>(fields, size);
|
||||
}
|
||||
}
|
||||
|
||||
FieldComparator[] getComparators() { return comparators; }
|
||||
public FieldComparator[] getComparators() {
|
||||
return comparators;
|
||||
}
|
||||
|
||||
int[] getReverseMul() { return reverseMul; }
|
||||
public int[] getReverseMul() {
|
||||
return reverseMul;
|
||||
}
|
||||
|
||||
protected void setComparator(int pos, FieldComparator comparator) {
|
||||
public void setComparator(int pos, FieldComparator comparator) {
|
||||
if (pos==0) firstComparator = comparator;
|
||||
comparators[pos] = comparator;
|
||||
}
|
||||
|
|
|
@ -48,9 +48,9 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
|
|||
|
||||
FieldComparator comparator;
|
||||
final int reverseMul;
|
||||
final FieldValueHitQueue queue;
|
||||
final FieldValueHitQueue<Entry> queue;
|
||||
|
||||
public OneComparatorNonScoringCollector(FieldValueHitQueue queue,
|
||||
public OneComparatorNonScoringCollector(FieldValueHitQueue<Entry> queue,
|
||||
int numHits, boolean fillFields) throws IOException {
|
||||
super(queue, numHits, fillFields);
|
||||
this.queue = queue;
|
||||
|
@ -113,7 +113,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
|
|||
private static class OutOfOrderOneComparatorNonScoringCollector extends
|
||||
OneComparatorNonScoringCollector {
|
||||
|
||||
public OutOfOrderOneComparatorNonScoringCollector(FieldValueHitQueue queue,
|
||||
public OutOfOrderOneComparatorNonScoringCollector(FieldValueHitQueue<Entry> queue,
|
||||
int numHits, boolean fillFields) throws IOException {
|
||||
super(queue, numHits, fillFields);
|
||||
}
|
||||
|
@ -160,7 +160,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
|
|||
|
||||
Scorer scorer;
|
||||
|
||||
public OneComparatorScoringNoMaxScoreCollector(FieldValueHitQueue queue,
|
||||
public OneComparatorScoringNoMaxScoreCollector(FieldValueHitQueue<Entry> queue,
|
||||
int numHits, boolean fillFields) throws IOException {
|
||||
super(queue, numHits, fillFields);
|
||||
}
|
||||
|
@ -221,7 +221,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
|
|||
OneComparatorScoringNoMaxScoreCollector {
|
||||
|
||||
public OutOfOrderOneComparatorScoringNoMaxScoreCollector(
|
||||
FieldValueHitQueue queue, int numHits, boolean fillFields)
|
||||
FieldValueHitQueue<Entry> queue, int numHits, boolean fillFields)
|
||||
throws IOException {
|
||||
super(queue, numHits, fillFields);
|
||||
}
|
||||
|
@ -274,7 +274,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
|
|||
|
||||
Scorer scorer;
|
||||
|
||||
public OneComparatorScoringMaxScoreCollector(FieldValueHitQueue queue,
|
||||
public OneComparatorScoringMaxScoreCollector(FieldValueHitQueue<Entry> queue,
|
||||
int numHits, boolean fillFields) throws IOException {
|
||||
super(queue, numHits, fillFields);
|
||||
// Must set maxScore to NEG_INF, or otherwise Math.max always returns NaN.
|
||||
|
@ -334,7 +334,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
|
|||
private static class OutOfOrderOneComparatorScoringMaxScoreCollector extends
|
||||
OneComparatorScoringMaxScoreCollector {
|
||||
|
||||
public OutOfOrderOneComparatorScoringMaxScoreCollector(FieldValueHitQueue queue,
|
||||
public OutOfOrderOneComparatorScoringMaxScoreCollector(FieldValueHitQueue<Entry> queue,
|
||||
int numHits, boolean fillFields) throws IOException {
|
||||
super(queue, numHits, fillFields);
|
||||
}
|
||||
|
@ -384,8 +384,8 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
|
|||
|
||||
final FieldComparator[] comparators;
|
||||
final int[] reverseMul;
|
||||
final FieldValueHitQueue queue;
|
||||
public MultiComparatorNonScoringCollector(FieldValueHitQueue queue,
|
||||
final FieldValueHitQueue<Entry> queue;
|
||||
public MultiComparatorNonScoringCollector(FieldValueHitQueue<Entry> queue,
|
||||
int numHits, boolean fillFields) throws IOException {
|
||||
super(queue, numHits, fillFields);
|
||||
this.queue = queue;
|
||||
|
@ -471,7 +471,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
|
|||
private static class OutOfOrderMultiComparatorNonScoringCollector extends
|
||||
MultiComparatorNonScoringCollector {
|
||||
|
||||
public OutOfOrderMultiComparatorNonScoringCollector(FieldValueHitQueue queue,
|
||||
public OutOfOrderMultiComparatorNonScoringCollector(FieldValueHitQueue<Entry> queue,
|
||||
int numHits, boolean fillFields) throws IOException {
|
||||
super(queue, numHits, fillFields);
|
||||
}
|
||||
|
@ -540,7 +540,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
|
|||
|
||||
Scorer scorer;
|
||||
|
||||
public MultiComparatorScoringMaxScoreCollector(FieldValueHitQueue queue,
|
||||
public MultiComparatorScoringMaxScoreCollector(FieldValueHitQueue<Entry> queue,
|
||||
int numHits, boolean fillFields) throws IOException {
|
||||
super(queue, numHits, fillFields);
|
||||
// Must set maxScore to NEG_INF, or otherwise Math.max always returns NaN.
|
||||
|
@ -619,7 +619,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
|
|||
private final static class OutOfOrderMultiComparatorScoringMaxScoreCollector
|
||||
extends MultiComparatorScoringMaxScoreCollector {
|
||||
|
||||
public OutOfOrderMultiComparatorScoringMaxScoreCollector(FieldValueHitQueue queue,
|
||||
public OutOfOrderMultiComparatorScoringMaxScoreCollector(FieldValueHitQueue<Entry> queue,
|
||||
int numHits, boolean fillFields) throws IOException {
|
||||
super(queue, numHits, fillFields);
|
||||
}
|
||||
|
@ -692,7 +692,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
|
|||
|
||||
Scorer scorer;
|
||||
|
||||
public MultiComparatorScoringNoMaxScoreCollector(FieldValueHitQueue queue,
|
||||
public MultiComparatorScoringNoMaxScoreCollector(FieldValueHitQueue<Entry> queue,
|
||||
int numHits, boolean fillFields) throws IOException {
|
||||
super(queue, numHits, fillFields);
|
||||
}
|
||||
|
@ -771,7 +771,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
|
|||
extends MultiComparatorScoringNoMaxScoreCollector {
|
||||
|
||||
public OutOfOrderMultiComparatorScoringNoMaxScoreCollector(
|
||||
FieldValueHitQueue queue, int numHits, boolean fillFields)
|
||||
FieldValueHitQueue<Entry> queue, int numHits, boolean fillFields)
|
||||
throws IOException {
|
||||
super(queue, numHits, fillFields);
|
||||
}
|
||||
|
@ -917,7 +917,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
|
|||
throw new IllegalArgumentException("numHits must be > 0; please use TotalHitCountCollector if you just need the total hit count");
|
||||
}
|
||||
|
||||
FieldValueHitQueue queue = FieldValueHitQueue.create(sort.fields, numHits);
|
||||
FieldValueHitQueue<Entry> queue = FieldValueHitQueue.create(sort.fields, numHits);
|
||||
if (queue.getComparators().length == 1) {
|
||||
if (docsScoredInOrder) {
|
||||
if (trackMaxScore) {
|
||||
|
@ -972,7 +972,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
|
|||
protected void populateResults(ScoreDoc[] results, int howMany) {
|
||||
if (fillFields) {
|
||||
// avoid casting if unnecessary.
|
||||
FieldValueHitQueue queue = (FieldValueHitQueue) pq;
|
||||
FieldValueHitQueue<Entry> queue = (FieldValueHitQueue<Entry>) pq;
|
||||
for (int i = howMany - 1; i >= 0; i--) {
|
||||
results[i] = queue.fillFields(queue.pop());
|
||||
}
|
||||
|
@ -993,12 +993,11 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
|
|||
}
|
||||
|
||||
// If this is a maxScoring tracking collector and there were no results,
|
||||
return new TopFieldDocs(totalHits, results, ((FieldValueHitQueue) pq).getFields(), maxScore);
|
||||
return new TopFieldDocs(totalHits, results, ((FieldValueHitQueue<Entry>) pq).getFields(), maxScore);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean acceptsDocsOutOfOrder() {
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -393,6 +393,56 @@ public final class ArrayUtil {
|
|||
return array;
|
||||
}
|
||||
|
||||
public static int[][] grow(int[][] array, int minSize) {
|
||||
if (array.length < minSize) {
|
||||
int[][] newArray = new int[oversize(minSize, RamUsageEstimator.NUM_BYTES_OBJECT_REF)][];
|
||||
System.arraycopy(array, 0, newArray, 0, array.length);
|
||||
return newArray;
|
||||
} else {
|
||||
return array;
|
||||
}
|
||||
}
|
||||
|
||||
public static int[][] grow(int[][] array) {
|
||||
return grow(array, 1 + array.length);
|
||||
}
|
||||
|
||||
public static int[][] shrink(int[][] array, int targetSize) {
|
||||
final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
|
||||
if (newSize != array.length) {
|
||||
int[][] newArray = new int[newSize][];
|
||||
System.arraycopy(array, 0, newArray, 0, newSize);
|
||||
return newArray;
|
||||
} else {
|
||||
return array;
|
||||
}
|
||||
}
|
||||
|
||||
public static float[][] grow(float[][] array, int minSize) {
|
||||
if (array.length < minSize) {
|
||||
float[][] newArray = new float[oversize(minSize, RamUsageEstimator.NUM_BYTES_OBJECT_REF)][];
|
||||
System.arraycopy(array, 0, newArray, 0, array.length);
|
||||
return newArray;
|
||||
} else {
|
||||
return array;
|
||||
}
|
||||
}
|
||||
|
||||
public static float[][] grow(float[][] array) {
|
||||
return grow(array, 1 + array.length);
|
||||
}
|
||||
|
||||
public static float[][] shrink(float[][] array, int targetSize) {
|
||||
final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
|
||||
if (newSize != array.length) {
|
||||
float[][] newArray = new float[newSize][];
|
||||
System.arraycopy(array, 0, newArray, 0, newSize);
|
||||
return newArray;
|
||||
} else {
|
||||
return array;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns hash of chars in range start (inclusive) to
|
||||
* end (inclusive)
|
||||
|
@ -617,6 +667,7 @@ public final class ArrayUtil {
|
|||
*/
|
||||
public static <T> void mergeSort(T[] a, int fromIndex, int toIndex, Comparator<? super T> comp) {
|
||||
if (toIndex-fromIndex <= 1) return;
|
||||
//System.out.println("SORT: " + (toIndex-fromIndex));
|
||||
getSorter(a, comp).mergeSort(fromIndex, toIndex-1);
|
||||
}
|
||||
|
||||
|
|
|
@ -35,6 +35,7 @@ import java.util.zip.ZipEntry;
|
|||
import java.util.zip.ZipFile;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.Fieldable;
|
||||
import org.apache.lucene.index.CheckIndex;
|
||||
import org.apache.lucene.index.ConcurrentMergeScheduler;
|
||||
|
@ -491,4 +492,25 @@ public class _TestUtil {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE: this is likely buggy, and cannot clone fields
|
||||
// with tokenStreamValues, etc. Use at your own risk!!
|
||||
|
||||
// TODO: is there a pre-existing way to do this!!!
|
||||
public static Document cloneDocument(Document doc1) {
|
||||
final Document doc2 = new Document();
|
||||
for(Fieldable f : doc1.getFields()) {
|
||||
Field field1 = (Field) f;
|
||||
|
||||
Field field2 = new Field(field1.name(),
|
||||
field1.stringValue(),
|
||||
field1.isStored() ? Field.Store.YES : Field.Store.NO,
|
||||
field1.isIndexed() ? (field1.isTokenized() ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED) : Field.Index.NO);
|
||||
field2.setOmitNorms(field1.getOmitNorms());
|
||||
field2.setOmitTermFreqAndPositions(field1.getOmitTermFreqAndPositions());
|
||||
doc2.add(field2);
|
||||
}
|
||||
|
||||
return doc2;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -33,7 +33,6 @@ import java.util.concurrent.atomic.AtomicInteger;
|
|||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.Fieldable;
|
||||
import org.apache.lucene.index.codecs.CodecProvider;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.PhraseQuery;
|
||||
|
@ -69,28 +68,6 @@ public class TestNRTThreads extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
// TODO: is there a pre-existing way to do this!!!
|
||||
private Document cloneDoc(Document doc1) {
|
||||
final Document doc2 = new Document();
|
||||
for(Fieldable f : doc1.getFields()) {
|
||||
Field field1 = (Field) f;
|
||||
|
||||
Field field2 = new Field(field1.name(),
|
||||
field1.stringValue(),
|
||||
field1.isStored() ? Field.Store.YES : Field.Store.NO,
|
||||
field1.isIndexed() ? (field1.isTokenized() ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED) : Field.Index.NO);
|
||||
if (field1.getOmitNorms()) {
|
||||
field2.setOmitNorms(true);
|
||||
}
|
||||
if (field1.getOmitTermFreqAndPositions()) {
|
||||
field2.setOmitTermFreqAndPositions(true);
|
||||
}
|
||||
doc2.add(field2);
|
||||
}
|
||||
|
||||
return doc2;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNRTThreads() throws Exception {
|
||||
|
||||
|
@ -218,7 +195,7 @@ public class TestNRTThreads extends LuceneTestCase {
|
|||
|
||||
allSubDocs.add(subDocs);
|
||||
doc.add(packIDField);
|
||||
docsList.add(cloneDoc(doc));
|
||||
docsList.add(_TestUtil.cloneDocument(doc));
|
||||
docIDs.add(doc.get("docid"));
|
||||
|
||||
final int maxDocCount = _TestUtil.nextInt(random, 1, 10);
|
||||
|
@ -227,7 +204,7 @@ public class TestNRTThreads extends LuceneTestCase {
|
|||
if (doc == null) {
|
||||
break;
|
||||
}
|
||||
docsList.add(cloneDoc(doc));
|
||||
docsList.add(_TestUtil.cloneDocument(doc));
|
||||
docIDs.add(doc.get("docid"));
|
||||
}
|
||||
addCount.addAndGet(docsList.size());
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
<fileset dir="common" includes="build.xml" />
|
||||
<fileset dir="grouping" includes="build.xml" />
|
||||
<fileset dir="queries" includes="build.xml" />
|
||||
<fileset dir="join" includes="build.xml" />
|
||||
<fileset dir="suggest" includes="build.xml" />
|
||||
</subant>
|
||||
</sequential>
|
||||
|
@ -39,6 +40,7 @@
|
|||
<fileset dir="common" includes="build.xml" />
|
||||
<fileset dir="grouping" includes="build.xml" />
|
||||
<fileset dir="queries" includes="build.xml" />
|
||||
<fileset dir="join" includes="build.xml" />
|
||||
<fileset dir="suggest" includes="build.xml" />
|
||||
</subant>
|
||||
</sequential>
|
||||
|
@ -52,6 +54,7 @@
|
|||
<fileset dir="common" includes="build.xml" />
|
||||
<fileset dir="grouping" includes="build.xml" />
|
||||
<fileset dir="queries" includes="build.xml" />
|
||||
<fileset dir="join" includes="build.xml" />
|
||||
<fileset dir="suggest" includes="build.xml" />
|
||||
</subant>
|
||||
</sequential>
|
||||
|
@ -65,6 +68,7 @@
|
|||
<fileset dir="common" includes="build.xml" />
|
||||
<fileset dir="grouping" includes="build.xml" />
|
||||
<fileset dir="queries" includes="build.xml" />
|
||||
<fileset dir="join" includes="build.xml" />
|
||||
<fileset dir="suggest" includes="build.xml" />
|
||||
</subant>
|
||||
</sequential>
|
||||
|
@ -79,6 +83,7 @@
|
|||
<fileset dir="common" includes="build.xml" />
|
||||
<fileset dir="grouping" includes="build.xml" />
|
||||
<fileset dir="queries" includes="build.xml" />
|
||||
<fileset dir="join" includes="build.xml" />
|
||||
<fileset dir="suggest" includes="build.xml" />
|
||||
</subant>
|
||||
</sequential>
|
||||
|
@ -91,6 +96,7 @@
|
|||
<fileset dir="common" includes="build.xml" />
|
||||
<fileset dir="grouping" includes="build.xml" />
|
||||
<fileset dir="queries" includes="build.xml" />
|
||||
<fileset dir="join" includes="build.xml" />
|
||||
<fileset dir="suggest" includes="build.xml" />
|
||||
</subant>
|
||||
</sequential>
|
||||
|
@ -105,6 +111,7 @@
|
|||
<fileset dir="common" includes="build.xml" />
|
||||
<fileset dir="grouping" includes="build.xml" />
|
||||
<fileset dir="queries" includes="build.xml" />
|
||||
<fileset dir="join" includes="build.xml" />
|
||||
<fileset dir="suggest" includes="build.xml" />
|
||||
</subant>
|
||||
</sequential>
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
<?xml version="1.0"?>
|
||||
<project name="join" default="default">
|
||||
<description>
|
||||
Queries and collectors for performing joins
|
||||
</description>
|
||||
|
||||
<property name="build.dir" location="build/" />
|
||||
<import file="../../lucene/contrib/contrib-build.xml"/>
|
||||
|
||||
<property name="build.dir" location="build/" />
|
||||
<property name="dist.dir" location="dist/" />
|
||||
|
||||
<module-uptodate name="grouping" jarfile="${common.dir}/../modules/grouping/build/lucene-grouping-${version}.jar"
|
||||
property="grouping.uptodate" classpath.property="grouping.jar"/>
|
||||
|
||||
<path id="classpath">
|
||||
<pathelement path="${grouping.jar}"/>
|
||||
<path refid="base.classpath"/>
|
||||
</path>
|
||||
|
||||
<path id="run.classpath">
|
||||
<path refid="classpath"/>
|
||||
<pathelement location="${build.dir}/classes/java"/>
|
||||
</path>
|
||||
|
||||
<property name="build.dir" location="build/" />
|
||||
<property name="dist.dir" location="dist/" />
|
||||
<property name="maven.dist.dir" location="../dist/maven" />
|
||||
|
||||
<target name="compile-grouping" unless="grouping.uptodate">
|
||||
<subant target="default">
|
||||
<fileset dir="${common.dir}/../modules/grouping" includes="build.xml"/>
|
||||
</subant>
|
||||
</target>
|
||||
|
||||
<target name="init" depends="contrib-build.init,compile-grouping"/>
|
||||
|
||||
<target name="dist-maven" depends="jar-core,javadocs,contrib-build.dist-maven" />
|
||||
</project>
|
|
@ -0,0 +1,472 @@
|
|||
package org.apache.lucene.search.join;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter; // javadocs
|
||||
import org.apache.lucene.search.Collector;
|
||||
import org.apache.lucene.search.FieldComparator;
|
||||
import org.apache.lucene.search.FieldValueHitQueue;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.ScoreCachingWrappingScorer;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.search.TopDocsCollector;
|
||||
import org.apache.lucene.search.TopFieldCollector;
|
||||
import org.apache.lucene.search.TopScoreDocCollector;
|
||||
import org.apache.lucene.search.Weight;
|
||||
import org.apache.lucene.search.grouping.GroupDocs;
|
||||
import org.apache.lucene.search.grouping.TopGroups;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
|
||||
|
||||
/** Collects parent document hits for a Query containing one more more
|
||||
* BlockJoinQuery clauses, sorted by the
|
||||
* specified parent Sort. Note that this cannot perform
|
||||
* arbitrary joins; rather, it requires that all joined
|
||||
* documents are indexed as a doc block (using {@link
|
||||
* IndexWriter#addDocuments} or {@link
|
||||
* IndexWriter#updateDocuments}). Ie, the join is computed
|
||||
* at index time.
|
||||
*
|
||||
* <p>The parent Sort must only use
|
||||
* fields from the parent documents; sorting by field in
|
||||
* the child documents is not supported.</p>
|
||||
*
|
||||
* <p>You should only use this
|
||||
* collector if one or more of the clauses in the query is
|
||||
* a {@link BlockJoinQuery}. This collector will find those query
|
||||
* clauses and record the matching child documents for the
|
||||
* top scoring parent documents.</p>
|
||||
*
|
||||
* <p>Multiple joins (star join) and nested joins and a mix
|
||||
* of the two are allowed, as long as in all cases the
|
||||
* documents corresponding to a single row of each joined
|
||||
* parent table were indexed as a doc block.</p>
|
||||
*
|
||||
* <p>For the simple star join you can retrieve the
|
||||
* {@link TopGroups} instance containing each {@link BlockJoinQuery}'s
|
||||
* matching child documents for the top parent groups,
|
||||
* using {@link #getTopGroups}. Ie,
|
||||
* a single query, which will contain two or more
|
||||
* {@link BlockJoinQuery}'s as clauses representing the star join,
|
||||
* can then retrieve two or more {@link TopGroups} instances.</p>
|
||||
*
|
||||
* <p>For nested joins, the query will run correctly (ie,
|
||||
* match the right parent and child documents), however,
|
||||
* because TopGroups is currently unable to support nesting
|
||||
* (each group is not able to hold another TopGroups), you
|
||||
* are only able to retrieve the TopGroups of the first
|
||||
* join. The TopGroups of the nested joins will not be
|
||||
* correct.
|
||||
*
|
||||
* See {@link org.apache.lucene.search.join} for a code
|
||||
* sample.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class BlockJoinCollector extends Collector {
|
||||
|
||||
private final Sort sort;
|
||||
|
||||
// Maps each BlockJoinQuery instance to its "slot" in
|
||||
// joinScorers and in OneGroup's cached doc/scores/count:
|
||||
private final Map<Query,Integer> joinQueryID = new HashMap<Query,Integer>();
|
||||
private final int numParentHits;
|
||||
private final FieldValueHitQueue<OneGroup> queue;
|
||||
private final FieldComparator[] comparators;
|
||||
private final int[] reverseMul;
|
||||
private final int compEnd;
|
||||
private final boolean trackMaxScore;
|
||||
private final boolean trackScores;
|
||||
|
||||
private int docBase;
|
||||
private BlockJoinQuery.BlockJoinScorer[] joinScorers = new BlockJoinQuery.BlockJoinScorer[0];
|
||||
private IndexReader.AtomicReaderContext currentReaderContext;
|
||||
private Scorer scorer;
|
||||
private boolean queueFull;
|
||||
|
||||
private OneGroup bottom;
|
||||
private int totalHitCount;
|
||||
private float maxScore = Float.NaN;
|
||||
|
||||
/* Creates a BlockJoinCollector. The provided sort must
|
||||
* not be null. */
|
||||
public BlockJoinCollector(Sort sort, int numParentHits, boolean trackScores, boolean trackMaxScore) throws IOException {
|
||||
// TODO: allow null sort to be specialized to relevance
|
||||
// only collector
|
||||
this.sort = sort;
|
||||
this.trackMaxScore = trackMaxScore;
|
||||
this.trackScores = trackScores;
|
||||
this.numParentHits = numParentHits;
|
||||
queue = FieldValueHitQueue.create(sort.getSort(), numParentHits);
|
||||
comparators = queue.getComparators();
|
||||
reverseMul = queue.getReverseMul();
|
||||
compEnd = comparators.length - 1;
|
||||
}
|
||||
|
||||
private static final class OneGroup extends FieldValueHitQueue.Entry {
|
||||
public OneGroup(int comparatorSlot, int parentDoc, float parentScore, int numJoins, boolean doScores) {
|
||||
super(comparatorSlot, parentDoc, parentScore);
|
||||
docs = new int[numJoins][];
|
||||
for(int joinID=0;joinID<numJoins;joinID++) {
|
||||
docs[joinID] = new int[5];
|
||||
}
|
||||
if (doScores) {
|
||||
scores = new float[numJoins][];
|
||||
for(int joinID=0;joinID<numJoins;joinID++) {
|
||||
scores[joinID] = new float[5];
|
||||
}
|
||||
}
|
||||
counts = new int[numJoins];
|
||||
}
|
||||
AtomicReaderContext readerContext;
|
||||
int[][] docs;
|
||||
float[][] scores;
|
||||
int[] counts;
|
||||
};
|
||||
|
||||
@Override
|
||||
public void collect(int parentDoc) throws IOException {
|
||||
//System.out.println("C parentDoc=" + parentDoc);
|
||||
totalHitCount++;
|
||||
|
||||
float score = Float.NaN;
|
||||
|
||||
if (trackMaxScore) {
|
||||
score = scorer.score();
|
||||
if (score > maxScore) {
|
||||
maxScore = score;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: we could sweep all joinScorers here and
|
||||
// aggregate total child hit count, so we can fill this
|
||||
// in getTopGroups (we wire it to 0 now)
|
||||
|
||||
if (queueFull) {
|
||||
//System.out.println(" queueFull");
|
||||
// Fastmatch: return if this hit is not competitive
|
||||
for (int i = 0;; i++) {
|
||||
final int c = reverseMul[i] * comparators[i].compareBottom(parentDoc);
|
||||
if (c < 0) {
|
||||
// Definitely not competitive.
|
||||
//System.out.println(" skip");
|
||||
return;
|
||||
} else if (c > 0) {
|
||||
// Definitely competitive.
|
||||
break;
|
||||
} else if (i == compEnd) {
|
||||
// Here c=0. If we're at the last comparator, this doc is not
|
||||
// competitive, since docs are visited in doc Id order, which means
|
||||
// this doc cannot compete with any other document in the queue.
|
||||
//System.out.println(" skip");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
//System.out.println(" competes! doc=" + (docBase + parentDoc));
|
||||
|
||||
// This hit is competitive - replace bottom element in queue & adjustTop
|
||||
for (int i = 0; i < comparators.length; i++) {
|
||||
comparators[i].copy(bottom.slot, parentDoc);
|
||||
}
|
||||
if (!trackMaxScore && trackScores) {
|
||||
score = scorer.score();
|
||||
}
|
||||
bottom.doc = docBase + parentDoc;
|
||||
bottom.readerContext = currentReaderContext;
|
||||
bottom.score = score;
|
||||
copyGroups(bottom);
|
||||
bottom = queue.updateTop();
|
||||
|
||||
for (int i = 0; i < comparators.length; i++) {
|
||||
comparators[i].setBottom(bottom.slot);
|
||||
}
|
||||
} else {
|
||||
// Startup transient: queue is not yet full:
|
||||
final int comparatorSlot = totalHitCount - 1;
|
||||
|
||||
// Copy hit into queue
|
||||
for (int i = 0; i < comparators.length; i++) {
|
||||
comparators[i].copy(comparatorSlot, parentDoc);
|
||||
}
|
||||
//System.out.println(" startup: new OG doc=" + (docBase+parentDoc));
|
||||
final OneGroup og = new OneGroup(comparatorSlot, docBase+parentDoc, score, joinScorers.length, trackScores);
|
||||
og.readerContext = currentReaderContext;
|
||||
copyGroups(og);
|
||||
bottom = queue.add(og);
|
||||
queueFull = totalHitCount == numParentHits;
|
||||
if (queueFull) {
|
||||
// End of startup transient: queue just filled up:
|
||||
for (int i = 0; i < comparators.length; i++) {
|
||||
comparators[i].setBottom(bottom.slot);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Pulls out child doc and scores for all join queries:
|
||||
private void copyGroups(OneGroup og) {
|
||||
// While rare, it's possible top arrays could be too
|
||||
// short if join query had null scorer on first
|
||||
// segment(s) but then became non-null on later segments
|
||||
final int numSubScorers = joinScorers.length;
|
||||
if (og.docs.length < numSubScorers) {
|
||||
// While rare, this could happen if join query had
|
||||
// null scorer on first segment(s) but then became
|
||||
// non-null on later segments
|
||||
og.docs = ArrayUtil.grow(og.docs);
|
||||
}
|
||||
if (og.counts.length < numSubScorers) {
|
||||
og.counts = ArrayUtil.grow(og.counts);
|
||||
}
|
||||
if (trackScores && og.scores.length < numSubScorers) {
|
||||
og.scores = ArrayUtil.grow(og.scores);
|
||||
}
|
||||
|
||||
//System.out.println("copyGroups parentDoc=" + og.doc);
|
||||
for(int scorerIDX = 0;scorerIDX < numSubScorers;scorerIDX++) {
|
||||
final BlockJoinQuery.BlockJoinScorer joinScorer = joinScorers[scorerIDX];
|
||||
//System.out.println(" scorer=" + joinScorer);
|
||||
if (joinScorer != null) {
|
||||
og.counts[scorerIDX] = joinScorer.getChildCount();
|
||||
//System.out.println(" count=" + og.counts[scorerIDX]);
|
||||
og.docs[scorerIDX] = joinScorer.swapChildDocs(og.docs[scorerIDX]);
|
||||
/*
|
||||
for(int idx=0;idx<og.counts[scorerIDX];idx++) {
|
||||
System.out.println(" docs[" + idx + "]=" + og.docs[scorerIDX][idx]);
|
||||
}
|
||||
*/
|
||||
if (trackScores) {
|
||||
og.scores[scorerIDX] = joinScorer.swapChildScores(og.scores[scorerIDX]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setNextReader(AtomicReaderContext context) throws IOException {
|
||||
currentReaderContext = context;
|
||||
docBase = context.docBase;
|
||||
for (int compIDX = 0; compIDX < comparators.length; compIDX++) {
|
||||
queue.setComparator(compIDX, comparators[compIDX].setNextReader(context));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean acceptsDocsOutOfOrder() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setScorer(Scorer scorer) {
|
||||
//System.out.println("C.setScorer scorer=" + scorer);
|
||||
// Since we invoke .score(), and the comparators likely
|
||||
// do as well, cache it so it's only "really" computed
|
||||
// once:
|
||||
this.scorer = new ScoreCachingWrappingScorer(scorer);
|
||||
for (int compIDX = 0; compIDX < comparators.length; compIDX++) {
|
||||
comparators[compIDX].setScorer(this.scorer);
|
||||
}
|
||||
Arrays.fill(joinScorers, null);
|
||||
|
||||
// Find any BlockJoinScorers out there:
|
||||
scorer.visitScorers(new Scorer.ScorerVisitor<Query,Query,Scorer>() {
|
||||
private void enroll(BlockJoinQuery query, BlockJoinQuery.BlockJoinScorer scorer) {
|
||||
final Integer slot = joinQueryID.get(query);
|
||||
if (slot == null) {
|
||||
joinQueryID.put(query, joinScorers.length);
|
||||
//System.out.println("found JQ: " + query + " slot=" + joinScorers.length);
|
||||
final BlockJoinQuery.BlockJoinScorer[] newArray = new BlockJoinQuery.BlockJoinScorer[1+joinScorers.length];
|
||||
System.arraycopy(joinScorers, 0, newArray, 0, joinScorers.length);
|
||||
joinScorers = newArray;
|
||||
joinScorers[joinScorers.length-1] = scorer;
|
||||
} else {
|
||||
joinScorers[slot] = scorer;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void visitOptional(Query parent, Query child, Scorer scorer) {
|
||||
//System.out.println("visitOpt");
|
||||
if (child instanceof BlockJoinQuery) {
|
||||
enroll((BlockJoinQuery) child,
|
||||
(BlockJoinQuery.BlockJoinScorer) scorer);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void visitRequired(Query parent, Query child, Scorer scorer) {
|
||||
//System.out.println("visitReq parent=" + parent + " child=" + child + " scorer=" + scorer);
|
||||
if (child instanceof BlockJoinQuery) {
|
||||
enroll((BlockJoinQuery) child,
|
||||
(BlockJoinQuery.BlockJoinScorer) scorer);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void visitProhibited(Query parent, Query child, Scorer scorer) {
|
||||
//System.out.println("visitProh");
|
||||
if (child instanceof BlockJoinQuery) {
|
||||
enroll((BlockJoinQuery) child,
|
||||
(BlockJoinQuery.BlockJoinScorer) scorer);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
private final static class FakeScorer extends Scorer {
|
||||
|
||||
float score;
|
||||
int doc;
|
||||
|
||||
public FakeScorer() {
|
||||
super((Weight) null);
|
||||
}
|
||||
|
||||
@Override
|
||||
public float score() {
|
||||
return score;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int docID() {
|
||||
return doc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(int target) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
||||
|
||||
private OneGroup[] sortedGroups;
|
||||
|
||||
private void sortQueue() {
|
||||
sortedGroups = new OneGroup[queue.size()];
|
||||
for(int downTo=queue.size()-1;downTo>=0;downTo--) {
|
||||
sortedGroups[downTo] = queue.pop();
|
||||
}
|
||||
}
|
||||
|
||||
/** Return the TopGroups for the specified
|
||||
* BlockJoinQuery. The groupValue of each GroupDocs will
|
||||
* be the parent docID for that group. Note that the
|
||||
* {@link GroupDocs#totalHits}, which would be the
|
||||
* total number of child documents matching that parent,
|
||||
* is not computed (will always be 0). Returns null if
|
||||
* no groups matched. */
|
||||
@SuppressWarnings("unchecked")
|
||||
public TopGroups<Integer> getTopGroups(BlockJoinQuery query, Sort withinGroupSort, int offset, int maxDocsPerGroup, int withinGroupOffset, boolean fillSortFields)
|
||||
|
||||
throws IOException {
|
||||
|
||||
final Integer _slot = joinQueryID.get(query);
|
||||
if (_slot == null) {
|
||||
if (totalHitCount == 0) {
|
||||
return null;
|
||||
} else {
|
||||
throw new IllegalArgumentException("the Query did not contain the provided BlockJoinQuery");
|
||||
}
|
||||
}
|
||||
|
||||
// unbox once
|
||||
final int slot = _slot;
|
||||
|
||||
if (offset >= queue.size()) {
|
||||
return null;
|
||||
}
|
||||
int totalGroupedHitCount = 0;
|
||||
|
||||
if (sortedGroups == null) {
|
||||
sortQueue();
|
||||
}
|
||||
|
||||
final FakeScorer fakeScorer = new FakeScorer();
|
||||
|
||||
final GroupDocs<Integer>[] groups = new GroupDocs[sortedGroups.length - offset];
|
||||
|
||||
for(int groupIDX=offset;groupIDX<sortedGroups.length;groupIDX++) {
|
||||
final OneGroup og = sortedGroups[groupIDX];
|
||||
|
||||
// At this point we hold all docs w/ in each group,
|
||||
// unsorted; we now sort them:
|
||||
final TopDocsCollector collector;
|
||||
if (withinGroupSort == null) {
|
||||
// Sort by score
|
||||
if (!trackScores) {
|
||||
throw new IllegalArgumentException("cannot sort by relevance within group: trackScores=false");
|
||||
}
|
||||
collector = TopScoreDocCollector.create(maxDocsPerGroup, true);
|
||||
} else {
|
||||
// Sort by fields
|
||||
collector = TopFieldCollector.create(withinGroupSort, maxDocsPerGroup, fillSortFields, trackScores, trackMaxScore, true);
|
||||
}
|
||||
|
||||
collector.setScorer(fakeScorer);
|
||||
collector.setNextReader(og.readerContext);
|
||||
final int numChildDocs = og.counts[slot];
|
||||
for(int docIDX=0;docIDX<numChildDocs;docIDX++) {
|
||||
final int doc = og.docs[slot][docIDX];
|
||||
fakeScorer.doc = doc;
|
||||
if (trackScores) {
|
||||
fakeScorer.score = og.scores[slot][docIDX];
|
||||
}
|
||||
collector.collect(doc);
|
||||
}
|
||||
totalGroupedHitCount += numChildDocs;
|
||||
|
||||
final Object[] groupSortValues;
|
||||
|
||||
if (fillSortFields) {
|
||||
groupSortValues = new Object[comparators.length];
|
||||
for(int sortFieldIDX=0;sortFieldIDX<comparators.length;sortFieldIDX++) {
|
||||
groupSortValues[sortFieldIDX] = comparators[sortFieldIDX].value(og.slot);
|
||||
}
|
||||
} else {
|
||||
groupSortValues = null;
|
||||
}
|
||||
|
||||
final TopDocs topDocs = collector.topDocs(withinGroupOffset, maxDocsPerGroup);
|
||||
|
||||
groups[groupIDX-offset] = new GroupDocs<Integer>(topDocs.getMaxScore(),
|
||||
og.counts[slot],
|
||||
topDocs.scoreDocs,
|
||||
og.doc,
|
||||
groupSortValues);
|
||||
}
|
||||
|
||||
return new TopGroups<Integer>(new TopGroups<Integer>(sort.getSort(),
|
||||
withinGroupSort == null ? null : withinGroupSort.getSort(),
|
||||
0, totalGroupedHitCount, groups),
|
||||
totalHitCount);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,410 @@
|
|||
package org.apache.lucene.search.join;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter; // javadocs
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.DocIdSet;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.Filter;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.search.Weight;
|
||||
import org.apache.lucene.search.grouping.TopGroups;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.OpenBitSet;
|
||||
|
||||
/**
|
||||
* This query requires that you index
|
||||
* children and parent docs as a single block, using the
|
||||
* {@link IndexWriter#addDocuments} or {@link
|
||||
* IndexWriter#updateDocuments} API. In each block, the
|
||||
* child documents must appear first, ending with the parent
|
||||
* document. At search time you provide a Filter
|
||||
* identifying the parents, however this Filter must provide
|
||||
* an {@link OpenBitSet} per sub-reader.
|
||||
*
|
||||
* <p>Once the block index is built, use this query to wrap
|
||||
* any sub-query matching only child docs and join matches in that
|
||||
* child document space up to the parent document space.
|
||||
* You can then use this Query as a clause with
|
||||
* other queries in the parent document space.</p>
|
||||
*
|
||||
* <p>The child documents must be orthogonal to the parent
|
||||
* documents: the wrapped child query must never
|
||||
* return a parent document.</p>
|
||||
*
|
||||
* If you'd like to retrieve {@link TopGroups} for the
|
||||
* resulting query, use the {@link BlockJoinCollector}.
|
||||
* Note that this is not necessary, ie, if you simply want
|
||||
* to collect the parent documents and don't need to see
|
||||
* which child documents matched under that parent, then
|
||||
* you can use any collector.
|
||||
*
|
||||
* <p><b>NOTE</b>: If the overall query contains parent-only
|
||||
* matches, for example you OR a parent-only query with a
|
||||
* joined child-only query, then the resulting collected documents
|
||||
* will be correct, however the {@link TopGroups} you get
|
||||
* from {@link BlockJoinCollector} will not contain every
|
||||
* child for parents that had matched.
|
||||
*
|
||||
* <p>See {@link org.apache.lucene.search.join} for an
|
||||
* overview. </p>
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
|
||||
public class BlockJoinQuery extends Query {
|
||||
|
||||
public static enum ScoreMode {None, Avg, Max, Total};
|
||||
|
||||
private final Filter parentsFilter;
|
||||
private final Query childQuery;
|
||||
|
||||
// If we are rewritten, this is the original childQuery we
|
||||
// were passed; we use this for .equals() and
|
||||
// .hashCode(). This makes rewritten query equal the
|
||||
// original, so that user does not have to .rewrite() their
|
||||
// query before searching:
|
||||
private final Query origChildQuery;
|
||||
private final ScoreMode scoreMode;
|
||||
|
||||
public BlockJoinQuery(Query childQuery, Filter parentsFilter, ScoreMode scoreMode) {
|
||||
super();
|
||||
this.origChildQuery = childQuery;
|
||||
this.childQuery = childQuery;
|
||||
this.parentsFilter = parentsFilter;
|
||||
this.scoreMode = scoreMode;
|
||||
}
|
||||
|
||||
private BlockJoinQuery(Query origChildQuery, Query childQuery, Filter parentsFilter, ScoreMode scoreMode) {
|
||||
super();
|
||||
this.origChildQuery = origChildQuery;
|
||||
this.childQuery = childQuery;
|
||||
this.parentsFilter = parentsFilter;
|
||||
this.scoreMode = scoreMode;
|
||||
}
|
||||
|
||||
public Weight createWeight(IndexSearcher searcher) throws IOException {
|
||||
return new BlockJoinWeight(this, childQuery.createWeight(searcher), parentsFilter, scoreMode);
|
||||
}
|
||||
|
||||
private static class BlockJoinWeight extends Weight {
|
||||
private final Query joinQuery;
|
||||
private final Weight childWeight;
|
||||
private final Filter parentsFilter;
|
||||
private final ScoreMode scoreMode;
|
||||
|
||||
public BlockJoinWeight(Query joinQuery, Weight childWeight, Filter parentsFilter, ScoreMode scoreMode) {
|
||||
super();
|
||||
this.joinQuery = joinQuery;
|
||||
this.childWeight = childWeight;
|
||||
this.parentsFilter = parentsFilter;
|
||||
this.scoreMode = scoreMode;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Query getQuery() {
|
||||
return joinQuery;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float getValue() {
|
||||
return childWeight.getValue();
|
||||
}
|
||||
|
||||
@Override
|
||||
public float sumOfSquaredWeights() throws IOException {
|
||||
return childWeight.sumOfSquaredWeights();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void normalize(float norm) {
|
||||
childWeight.normalize(norm);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Scorer scorer(AtomicReaderContext readerContext, ScorerContext context) throws IOException {
|
||||
// Pass scoreDocsInOrder true, topScorer false to our sub:
|
||||
final Scorer childScorer = childWeight.scorer(readerContext, ScorerContext.def().scoreDocsInOrder(true).topScorer(false));
|
||||
|
||||
if (childScorer == null) {
|
||||
// No matches
|
||||
return null;
|
||||
}
|
||||
|
||||
final int firstChildDoc = childScorer.nextDoc();
|
||||
if (firstChildDoc == DocIdSetIterator.NO_MORE_DOCS) {
|
||||
// No matches
|
||||
return null;
|
||||
}
|
||||
|
||||
final DocIdSet parents = parentsFilter.getDocIdSet(readerContext);
|
||||
// TODO: once we do random-access filters we can
|
||||
// generalize this:
|
||||
if (parents == null) {
|
||||
// No matches
|
||||
return null;
|
||||
}
|
||||
if (!(parents instanceof OpenBitSet)) {
|
||||
throw new IllegalStateException("parentFilter must return OpenBitSet; got " + parents);
|
||||
}
|
||||
|
||||
return new BlockJoinScorer(this, childScorer, (OpenBitSet) parents, firstChildDoc, scoreMode);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(AtomicReaderContext reader, int doc) throws IOException {
|
||||
// TODO
|
||||
throw new UnsupportedOperationException(getClass().getName() +
|
||||
" cannot explain match on parent document");
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean scoresDocsOutOfOrder() {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static class BlockJoinScorer extends Scorer {
|
||||
private final Scorer childScorer;
|
||||
private final OpenBitSet parentBits;
|
||||
private final ScoreMode scoreMode;
|
||||
private int parentDoc;
|
||||
private float parentScore;
|
||||
private int nextChildDoc;
|
||||
|
||||
private int[] pendingChildDocs = new int[5];
|
||||
private float[] pendingChildScores;
|
||||
private int childDocUpto;
|
||||
|
||||
public BlockJoinScorer(Weight weight, Scorer childScorer, OpenBitSet parentBits, int firstChildDoc, ScoreMode scoreMode) {
|
||||
super(weight);
|
||||
//System.out.println("Q.init firstChildDoc=" + firstChildDoc);
|
||||
this.parentBits = parentBits;
|
||||
this.childScorer = childScorer;
|
||||
this.scoreMode = scoreMode;
|
||||
if (scoreMode != ScoreMode.None) {
|
||||
pendingChildScores = new float[5];
|
||||
}
|
||||
nextChildDoc = firstChildDoc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void visitSubScorers(Query parent, BooleanClause.Occur relationship,
|
||||
ScorerVisitor<Query, Query, Scorer> visitor) {
|
||||
super.visitSubScorers(parent, relationship, visitor);
|
||||
//childScorer.visitSubScorers(weight.getQuery(), BooleanClause.Occur.MUST, visitor);
|
||||
childScorer.visitScorers(visitor);
|
||||
}
|
||||
|
||||
int getChildCount() {
|
||||
return childDocUpto;
|
||||
}
|
||||
|
||||
int[] swapChildDocs(int[] other) {
|
||||
final int[] ret = pendingChildDocs;
|
||||
if (other == null) {
|
||||
pendingChildDocs = new int[5];
|
||||
} else {
|
||||
pendingChildDocs = other;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
float[] swapChildScores(float[] other) {
|
||||
if (scoreMode == ScoreMode.None) {
|
||||
throw new IllegalStateException("ScoreMode is None");
|
||||
}
|
||||
final float[] ret = pendingChildScores;
|
||||
if (other == null) {
|
||||
pendingChildScores = new float[5];
|
||||
} else {
|
||||
pendingChildScores = other;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
//System.out.println("Q.nextDoc() nextChildDoc=" + nextChildDoc);
|
||||
|
||||
if (nextChildDoc == NO_MORE_DOCS) {
|
||||
//System.out.println(" end");
|
||||
return parentDoc = NO_MORE_DOCS;
|
||||
}
|
||||
|
||||
// Gather all children sharing the same parent as nextChildDoc
|
||||
parentDoc = parentBits.nextSetBit(nextChildDoc);
|
||||
//System.out.println(" parentDoc=" + parentDoc);
|
||||
assert parentDoc != -1;
|
||||
|
||||
float totalScore = 0;
|
||||
float maxScore = Float.NEGATIVE_INFINITY;
|
||||
|
||||
childDocUpto = 0;
|
||||
do {
|
||||
//System.out.println(" c=" + nextChildDoc);
|
||||
if (pendingChildDocs.length == childDocUpto) {
|
||||
pendingChildDocs = ArrayUtil.grow(pendingChildDocs);
|
||||
if (scoreMode != ScoreMode.None) {
|
||||
pendingChildScores = ArrayUtil.grow(pendingChildScores);
|
||||
}
|
||||
}
|
||||
pendingChildDocs[childDocUpto] = nextChildDoc;
|
||||
if (scoreMode != ScoreMode.None) {
|
||||
// TODO: specialize this into dedicated classes per-scoreMode
|
||||
final float childScore = childScorer.score();
|
||||
pendingChildScores[childDocUpto] = childScore;
|
||||
maxScore = Math.max(childScore, maxScore);
|
||||
totalScore += childScore;
|
||||
}
|
||||
childDocUpto++;
|
||||
nextChildDoc = childScorer.nextDoc();
|
||||
} while (nextChildDoc < parentDoc);
|
||||
//System.out.println(" nextChildDoc=" + nextChildDoc);
|
||||
|
||||
// Parent & child docs are supposed to be orthogonal:
|
||||
assert nextChildDoc != parentDoc;
|
||||
|
||||
switch(scoreMode) {
|
||||
case Avg:
|
||||
parentScore = totalScore / childDocUpto;
|
||||
break;
|
||||
case Max:
|
||||
parentScore = maxScore;
|
||||
break;
|
||||
case Total:
|
||||
parentScore = totalScore;
|
||||
break;
|
||||
case None:
|
||||
break;
|
||||
}
|
||||
|
||||
//System.out.println(" return parentDoc=" + parentDoc);
|
||||
return parentDoc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int docID() {
|
||||
return parentDoc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float score() throws IOException {
|
||||
return parentScore;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(int parentTarget) throws IOException {
|
||||
|
||||
//System.out.println("Q.advance parentTarget=" + parentTarget);
|
||||
if (parentTarget == NO_MORE_DOCS) {
|
||||
return parentDoc = NO_MORE_DOCS;
|
||||
}
|
||||
|
||||
final int prevParentDoc = parentBits.prevSetBit(parentTarget-1);
|
||||
|
||||
//System.out.println(" rolled back to prevParentDoc=" + prevParentDoc + " vs parentDoc=" + parentDoc);
|
||||
assert prevParentDoc >= parentDoc;
|
||||
if (prevParentDoc > nextChildDoc) {
|
||||
nextChildDoc = childScorer.advance(prevParentDoc);
|
||||
// System.out.println(" childScorer advanced to child docID=" + nextChildDoc);
|
||||
//} else {
|
||||
//System.out.println(" skip childScorer advance");
|
||||
}
|
||||
|
||||
// Parent & child docs are supposed to be orthogonal:
|
||||
assert nextChildDoc != prevParentDoc;
|
||||
|
||||
final int nd = nextDoc();
|
||||
//System.out.println(" return nextParentDoc=" + nd);
|
||||
return nd;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void extractTerms(Set<Term> terms) {
|
||||
childQuery.extractTerms(terms);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Query rewrite(IndexReader reader) throws IOException {
|
||||
final Query childRewrite = childQuery.rewrite(reader);
|
||||
if (childRewrite != childQuery) {
|
||||
return new BlockJoinQuery(childQuery,
|
||||
childRewrite,
|
||||
parentsFilter,
|
||||
scoreMode);
|
||||
} else {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString(String field) {
|
||||
return "BlockJoinQuery ("+childQuery.toString()+")";
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setBoost(float boost) {
|
||||
throw new UnsupportedOperationException("this query cannot support boosting; please use childQuery.setBoost instead");
|
||||
}
|
||||
|
||||
@Override
|
||||
public float getBoost() {
|
||||
throw new UnsupportedOperationException("this query cannot support boosting; please use childQuery.getBoost instead");
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object _other) {
|
||||
if (_other instanceof BlockJoinQuery) {
|
||||
final BlockJoinQuery other = (BlockJoinQuery) _other;
|
||||
return origChildQuery.equals(other.origChildQuery) &&
|
||||
parentsFilter.equals(other.parentsFilter) &&
|
||||
scoreMode == other.scoreMode;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 31;
|
||||
int hash = 1;
|
||||
hash = prime * hash + origChildQuery.hashCode();
|
||||
hash = prime * hash + scoreMode.hashCode();
|
||||
hash = prime * hash + parentsFilter.hashCode();
|
||||
return hash;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object clone() {
|
||||
return new BlockJoinQuery((Query) origChildQuery.clone(),
|
||||
parentsFilter,
|
||||
scoreMode);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,32 @@
|
|||
<html>
|
||||
<body>
|
||||
|
||||
<p>This module supports index-time joins while searching, where joined
|
||||
documents are indexed as a single document block using
|
||||
{@link org.apache.lucene.index.IndexWriter#addDocuments}. This is useful for any normalized content (XML documents or database tables). In database terms, all rows for all
|
||||
joined tables matching a single row of the primary table must be
|
||||
indexed as a single document block, with the parent document
|
||||
being last in the group.</p>
|
||||
|
||||
<p>When you index in this way, the documents in your index are divided
|
||||
into parent documents (the last document of each block) and child
|
||||
documents (all others). You provide a {@link org.apache.lucene.search.Filter} that identifies the
|
||||
parent documents, as Lucene does not currently record any information
|
||||
about doc blocks.</p>
|
||||
|
||||
<p>At search time, use {@link org.apache.lucene.search.join.BlockJoinQuery} to remap
|
||||
matches from any child {@link org.apache.lucene.search.Query} (ie, a query that matches only
|
||||
child documents) up to the parent document space. The resulting
|
||||
{@link org.apache.lucene.search.join.BlockJoinQuery} can then be used as a clause in any query that
|
||||
matches parent documents.</p>
|
||||
|
||||
<p>If you only care about the parent documents matching the query, you
|
||||
can use any collector to collect the parent hits, but if you'd also
|
||||
like to see which child documents match for each parent document,
|
||||
use the {@link org.apache.lucene.search.join.BlockJoinCollector} to collect the hits. Once the
|
||||
search is done, you retrieve a {@link
|
||||
org.apache.lucene.search.grouping.TopGroups} instance from the
|
||||
{@link org.apache.lucene.search.join.BlockJoinCollector#getTopGroups} method.</p>
|
||||
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,5 @@
|
|||
<html>
|
||||
<body>
|
||||
Lucene's join module
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,476 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.NumericField;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.search.grouping.GroupDocs;
|
||||
import org.apache.lucene.search.grouping.TopGroups;
|
||||
import org.apache.lucene.search.join.BlockJoinCollector;
|
||||
import org.apache.lucene.search.join.BlockJoinQuery;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
||||
public class TestBlockJoin extends LuceneTestCase {
|
||||
|
||||
// One resume...
|
||||
private Document makeResume(String name, String country) {
|
||||
Document resume = new Document();
|
||||
resume.add(newField("docType", "resume", Field.Index.NOT_ANALYZED));
|
||||
resume.add(newField("name", name, Field.Store.YES, Field.Index.NOT_ANALYZED));
|
||||
resume.add(newField("country", country, Field.Index.NOT_ANALYZED));
|
||||
return resume;
|
||||
}
|
||||
|
||||
// ... has multiple jobs
|
||||
private Document makeJob(String skill, int year) {
|
||||
Document job = new Document();
|
||||
job.add(newField("skill", skill, Field.Store.YES, Field.Index.NOT_ANALYZED));
|
||||
job.add(new NumericField("year").setIntValue(year));
|
||||
return job;
|
||||
}
|
||||
|
||||
public void testSimple() throws Exception {
|
||||
|
||||
final Directory dir = newDirectory();
|
||||
final RandomIndexWriter w = new RandomIndexWriter(random, dir);
|
||||
|
||||
final List<Document> docs = new ArrayList<Document>();
|
||||
|
||||
docs.add(makeJob("java", 2007));
|
||||
docs.add(makeJob("python", 2010));
|
||||
docs.add(makeResume("Lisa", "United Kingdom"));
|
||||
w.addDocuments(docs);
|
||||
|
||||
docs.clear();
|
||||
docs.add(makeJob("ruby", 2005));
|
||||
docs.add(makeJob("java", 2006));
|
||||
docs.add(makeResume("Frank", "United States"));
|
||||
w.addDocuments(docs);
|
||||
|
||||
IndexReader r = w.getReader();
|
||||
w.close();
|
||||
IndexSearcher s = new IndexSearcher(r);
|
||||
|
||||
// Create a filter that defines "parent" documents in the index - in this case resumes
|
||||
Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("docType", "resume"))));
|
||||
|
||||
// Define child document criteria (finds an example of relevant work experience)
|
||||
BooleanQuery childQuery = new BooleanQuery();
|
||||
childQuery.add(new BooleanClause(new TermQuery(new Term("skill", "java")), Occur.MUST));
|
||||
childQuery.add(new BooleanClause(NumericRangeQuery.newIntRange("year", 2006, 2011, true, true), Occur.MUST));
|
||||
|
||||
// Define parent document criteria (find a resident in the UK)
|
||||
Query parentQuery = new TermQuery(new Term("country", "United Kingdom"));
|
||||
|
||||
// Wrap the child document query to 'join' any matches
|
||||
// up to corresponding parent:
|
||||
BlockJoinQuery childJoinQuery = new BlockJoinQuery(childQuery, parentsFilter, BlockJoinQuery.ScoreMode.Avg);
|
||||
|
||||
// Combine the parent and nested child queries into a single query for a candidate
|
||||
BooleanQuery fullQuery = new BooleanQuery();
|
||||
fullQuery.add(new BooleanClause(parentQuery, Occur.MUST));
|
||||
fullQuery.add(new BooleanClause(childJoinQuery, Occur.MUST));
|
||||
|
||||
BlockJoinCollector c = new BlockJoinCollector(Sort.RELEVANCE, 1, true, false);
|
||||
|
||||
s.search(fullQuery, c);
|
||||
|
||||
TopGroups<Integer> results = c.getTopGroups(childJoinQuery, null, 0, 10, 0, true);
|
||||
|
||||
//assertEquals(1, results.totalHitCount);
|
||||
assertEquals(1, results.totalGroupedHitCount);
|
||||
assertEquals(1, results.groups.length);
|
||||
|
||||
final GroupDocs<Integer> group = results.groups[0];
|
||||
assertEquals(1, group.totalHits);
|
||||
|
||||
Document childDoc = s.doc(group.scoreDocs[0].doc);
|
||||
//System.out.println(" doc=" + group.scoreDocs[0].doc);
|
||||
assertEquals("java", childDoc.get("skill"));
|
||||
assertNotNull(group.groupValue);
|
||||
Document parentDoc = s.doc(group.groupValue);
|
||||
assertEquals("Lisa", parentDoc.get("name"));
|
||||
|
||||
r.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
private String[][] getRandomFields(int maxUniqueValues) {
|
||||
|
||||
final String[][] fields = new String[_TestUtil.nextInt(random, 2, 4)][];
|
||||
for(int fieldID=0;fieldID<fields.length;fieldID++) {
|
||||
final int valueCount;
|
||||
if (fieldID == 0) {
|
||||
valueCount = 2;
|
||||
} else {
|
||||
valueCount = _TestUtil.nextInt(random, 1, maxUniqueValues);
|
||||
}
|
||||
|
||||
final String[] values = fields[fieldID] = new String[valueCount];
|
||||
for(int i=0;i<valueCount;i++) {
|
||||
values[i] = _TestUtil.randomRealisticUnicodeString(random);
|
||||
//values[i] = _TestUtil.randomSimpleString(random);
|
||||
}
|
||||
}
|
||||
|
||||
return fields;
|
||||
}
|
||||
|
||||
private Term randomParentTerm(String[] values) {
|
||||
return new Term("parent0", values[random.nextInt(values.length)]);
|
||||
}
|
||||
|
||||
private Term randomChildTerm(String[] values) {
|
||||
return new Term("child0", values[random.nextInt(values.length)]);
|
||||
}
|
||||
|
||||
private Sort getRandomSort(String prefix, int numFields) {
|
||||
final List<SortField> sortFields = new ArrayList<SortField>();
|
||||
// TODO: sometimes sort by score; problem is scores are
|
||||
// not comparable across the two indices
|
||||
// sortFields.add(SortField.FIELD_SCORE);
|
||||
if (random.nextBoolean()) {
|
||||
sortFields.add(new SortField(prefix + random.nextInt(numFields), SortField.Type.STRING, random.nextBoolean()));
|
||||
} else if (random.nextBoolean()) {
|
||||
sortFields.add(new SortField(prefix + random.nextInt(numFields), SortField.Type.STRING, random.nextBoolean()));
|
||||
sortFields.add(new SortField(prefix + random.nextInt(numFields), SortField.Type.STRING, random.nextBoolean()));
|
||||
}
|
||||
// Break ties:
|
||||
sortFields.add(new SortField(prefix + "ID", SortField.Type.INT));
|
||||
return new Sort(sortFields.toArray(new SortField[sortFields.size()]));
|
||||
}
|
||||
|
||||
public void testRandom() throws Exception {
|
||||
// We build two indices at once: one normalized (which
|
||||
// BlockJoinQuery/Collector can query) and the other w/
|
||||
// same docs just fully denormalized:
|
||||
final Directory dir = newDirectory();
|
||||
final Directory joinDir = newDirectory();
|
||||
|
||||
final int numParentDocs = _TestUtil.nextInt(random, 100*RANDOM_MULTIPLIER, 300*RANDOM_MULTIPLIER);
|
||||
//final int numParentDocs = 30;
|
||||
|
||||
// Values for parent fields:
|
||||
final String[][] parentFields = getRandomFields(numParentDocs/2);
|
||||
// Values for child fields:
|
||||
final String[][] childFields = getRandomFields(numParentDocs);
|
||||
|
||||
// TODO: test star join, nested join cases too!
|
||||
final RandomIndexWriter w = new RandomIndexWriter(random, dir);
|
||||
final RandomIndexWriter joinW = new RandomIndexWriter(random, joinDir);
|
||||
for(int parentDocID=0;parentDocID<numParentDocs;parentDocID++) {
|
||||
Document parentDoc = new Document();
|
||||
Document parentJoinDoc = new Document();
|
||||
Field id = newField("parentID", ""+parentDocID, Field.Store.YES, Field.Index.NOT_ANALYZED);
|
||||
parentDoc.add(id);
|
||||
parentJoinDoc.add(id);
|
||||
parentJoinDoc.add(newField("isParent", "x", Field.Index.NOT_ANALYZED));
|
||||
for(int field=0;field<parentFields.length;field++) {
|
||||
if (random.nextDouble() < 0.9) {
|
||||
Field f = newField("parent" + field,
|
||||
parentFields[field][random.nextInt(parentFields[field].length)],
|
||||
Field.Index.NOT_ANALYZED);
|
||||
parentDoc.add(f);
|
||||
parentJoinDoc.add(f);
|
||||
}
|
||||
}
|
||||
|
||||
final List<Document> joinDocs = new ArrayList<Document>();
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println(" " + parentDoc);
|
||||
}
|
||||
|
||||
final int numChildDocs = _TestUtil.nextInt(random, 1, 20);
|
||||
for(int childDocID=0;childDocID<numChildDocs;childDocID++) {
|
||||
// Denormalize: copy all parent fields into child doc:
|
||||
Document childDoc = _TestUtil.cloneDocument(parentDoc);
|
||||
Document joinChildDoc = new Document();
|
||||
joinDocs.add(joinChildDoc);
|
||||
|
||||
Field childID = newField("childID", ""+childDocID, Field.Store.YES, Field.Index.NOT_ANALYZED);
|
||||
childDoc.add(childID);
|
||||
joinChildDoc.add(childID);
|
||||
|
||||
for(int childFieldID=0;childFieldID<childFields.length;childFieldID++) {
|
||||
if (random.nextDouble() < 0.9) {
|
||||
Field f = newField("child" + childFieldID,
|
||||
childFields[childFieldID][random.nextInt(childFields[childFieldID].length)],
|
||||
Field.Index.NOT_ANALYZED);
|
||||
childDoc.add(f);
|
||||
joinChildDoc.add(f);
|
||||
}
|
||||
}
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println(" " + joinChildDoc);
|
||||
}
|
||||
|
||||
w.addDocument(childDoc);
|
||||
}
|
||||
|
||||
// Parent last:
|
||||
joinDocs.add(parentJoinDoc);
|
||||
joinW.addDocuments(joinDocs);
|
||||
}
|
||||
|
||||
final IndexReader r = w.getReader();
|
||||
w.close();
|
||||
final IndexReader joinR = joinW.getReader();
|
||||
joinW.close();
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: reader=" + r);
|
||||
System.out.println("TEST: joinReader=" + joinR);
|
||||
|
||||
for(int docIDX=0;docIDX<joinR.maxDoc();docIDX++) {
|
||||
System.out.println(" docID=" + docIDX + " doc=" + joinR.document(docIDX));
|
||||
}
|
||||
}
|
||||
|
||||
final IndexSearcher s = new IndexSearcher(r);
|
||||
s.setDefaultFieldSortScoring(true, true);
|
||||
|
||||
final IndexSearcher joinS = new IndexSearcher(joinR);
|
||||
|
||||
final Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("isParent", "x"))));
|
||||
|
||||
final int iters = 200*RANDOM_MULTIPLIER;
|
||||
|
||||
for(int iter=0;iter<iters;iter++) {
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: iter=" + (1+iter) + " of " + iters);
|
||||
}
|
||||
|
||||
final Query childQuery;
|
||||
if (random.nextInt(3) == 2) {
|
||||
final int childFieldID = random.nextInt(childFields.length);
|
||||
childQuery = new TermQuery(new Term("child" + childFieldID,
|
||||
childFields[childFieldID][random.nextInt(childFields[childFieldID].length)]));
|
||||
} else if (random.nextInt(3) == 2) {
|
||||
BooleanQuery bq = new BooleanQuery();
|
||||
childQuery = bq;
|
||||
final int numClauses = _TestUtil.nextInt(random, 2, 4);
|
||||
boolean didMust = false;
|
||||
for(int clauseIDX=0;clauseIDX<numClauses;clauseIDX++) {
|
||||
Query clause;
|
||||
BooleanClause.Occur occur;
|
||||
if (!didMust && random.nextBoolean()) {
|
||||
occur = random.nextBoolean() ? BooleanClause.Occur.MUST : BooleanClause.Occur.MUST_NOT;
|
||||
clause = new TermQuery(randomChildTerm(childFields[0]));
|
||||
didMust = true;
|
||||
} else {
|
||||
occur = BooleanClause.Occur.SHOULD;
|
||||
final int childFieldID = _TestUtil.nextInt(random, 1, childFields.length-1);
|
||||
clause = new TermQuery(new Term("child" + childFieldID,
|
||||
childFields[childFieldID][random.nextInt(childFields[childFieldID].length)]));
|
||||
}
|
||||
bq.add(clause, occur);
|
||||
}
|
||||
} else {
|
||||
BooleanQuery bq = new BooleanQuery();
|
||||
childQuery = bq;
|
||||
|
||||
bq.add(new TermQuery(randomChildTerm(childFields[0])),
|
||||
BooleanClause.Occur.MUST);
|
||||
final int childFieldID = _TestUtil.nextInt(random, 1, childFields.length-1);
|
||||
bq.add(new TermQuery(new Term("child" + childFieldID, childFields[childFieldID][random.nextInt(childFields[childFieldID].length)])),
|
||||
random.nextBoolean() ? BooleanClause.Occur.MUST : BooleanClause.Occur.MUST_NOT);
|
||||
}
|
||||
|
||||
final BlockJoinQuery childJoinQuery = new BlockJoinQuery(childQuery, parentsFilter, BlockJoinQuery.ScoreMode.Avg);
|
||||
|
||||
// To run against the block-join index:
|
||||
final Query parentJoinQuery;
|
||||
|
||||
// Same query as parentJoinQuery, but to run against
|
||||
// the fully denormalized index (so we can compare)
|
||||
// results:
|
||||
final Query parentQuery;
|
||||
|
||||
if (random.nextBoolean()) {
|
||||
parentQuery = childQuery;
|
||||
parentJoinQuery = childJoinQuery;
|
||||
} else {
|
||||
// AND parent field w/ child field
|
||||
final BooleanQuery bq = new BooleanQuery();
|
||||
parentJoinQuery = bq;
|
||||
final Term parentTerm = randomParentTerm(parentFields[0]);
|
||||
if (random.nextBoolean()) {
|
||||
bq.add(childJoinQuery, BooleanClause.Occur.MUST);
|
||||
bq.add(new TermQuery(parentTerm),
|
||||
BooleanClause.Occur.MUST);
|
||||
} else {
|
||||
bq.add(new TermQuery(parentTerm),
|
||||
BooleanClause.Occur.MUST);
|
||||
bq.add(childJoinQuery, BooleanClause.Occur.MUST);
|
||||
}
|
||||
|
||||
final BooleanQuery bq2 = new BooleanQuery();
|
||||
parentQuery = bq2;
|
||||
if (random.nextBoolean()) {
|
||||
bq2.add(childQuery, BooleanClause.Occur.MUST);
|
||||
bq2.add(new TermQuery(parentTerm),
|
||||
BooleanClause.Occur.MUST);
|
||||
} else {
|
||||
bq2.add(new TermQuery(parentTerm),
|
||||
BooleanClause.Occur.MUST);
|
||||
bq2.add(childQuery, BooleanClause.Occur.MUST);
|
||||
}
|
||||
}
|
||||
|
||||
final Sort parentSort = getRandomSort("parent", parentFields.length);
|
||||
final Sort childSort = getRandomSort("child", childFields.length);
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println("\nTEST: query=" + parentQuery + " joinQuery=" + parentJoinQuery + " parentSort=" + parentSort + " childSort=" + childSort);
|
||||
}
|
||||
|
||||
// Merge both sorst:
|
||||
final List<SortField> sortFields = new ArrayList<SortField>(Arrays.asList(parentSort.getSort()));
|
||||
sortFields.addAll(Arrays.asList(childSort.getSort()));
|
||||
final Sort parentAndChildSort = new Sort(sortFields.toArray(new SortField[sortFields.size()]));
|
||||
|
||||
final TopDocs results = s.search(parentQuery, null, r.numDocs(),
|
||||
parentAndChildSort);
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println("\nTEST: normal index gets " + results.totalHits + " hits");
|
||||
final ScoreDoc[] hits = results.scoreDocs;
|
||||
for(int hitIDX=0;hitIDX<hits.length;hitIDX++) {
|
||||
final Document doc = s.doc(hits[hitIDX].doc);
|
||||
//System.out.println(" score=" + hits[hitIDX].score + " parentID=" + doc.get("parentID") + " childID=" + doc.get("childID") + " (docID=" + hits[hitIDX].doc + ")");
|
||||
System.out.println(" parentID=" + doc.get("parentID") + " childID=" + doc.get("childID") + " (docID=" + hits[hitIDX].doc + ")");
|
||||
FieldDoc fd = (FieldDoc) hits[hitIDX];
|
||||
if (fd.fields != null) {
|
||||
System.out.print(" ");
|
||||
for(Object o : fd.fields) {
|
||||
if (o instanceof BytesRef) {
|
||||
System.out.print(((BytesRef) o).utf8ToString() + " ");
|
||||
} else {
|
||||
System.out.print(o + " ");
|
||||
}
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final BlockJoinCollector c = new BlockJoinCollector(parentSort, 10, true, true);
|
||||
|
||||
joinS.search(parentJoinQuery, c);
|
||||
|
||||
final int hitsPerGroup = _TestUtil.nextInt(random, 1, 20);
|
||||
//final int hitsPerGroup = 100;
|
||||
final TopGroups<Integer> joinResults = c.getTopGroups(childJoinQuery, childSort, 0, hitsPerGroup, 0, true);
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println("\nTEST: block join index gets " + (joinResults == null ? 0 : joinResults.groups.length) + " groups; hitsPerGroup=" + hitsPerGroup);
|
||||
if (joinResults != null) {
|
||||
final GroupDocs<Integer>[] groups = joinResults.groups;
|
||||
for(int groupIDX=0;groupIDX<groups.length;groupIDX++) {
|
||||
final GroupDocs<Integer> group = groups[groupIDX];
|
||||
if (group.groupSortValues != null) {
|
||||
System.out.print(" ");
|
||||
for(Object o : group.groupSortValues) {
|
||||
if (o instanceof BytesRef) {
|
||||
System.out.print(((BytesRef) o).utf8ToString() + " ");
|
||||
} else {
|
||||
System.out.print(o + " ");
|
||||
}
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
assertNotNull(group.groupValue);
|
||||
final Document parentDoc = joinS.doc(group.groupValue);
|
||||
System.out.println(" group parentID=" + parentDoc.get("parentID") + " (docID=" + group.groupValue + ")");
|
||||
for(int hitIDX=0;hitIDX<group.scoreDocs.length;hitIDX++) {
|
||||
final Document doc = joinS.doc(group.scoreDocs[hitIDX].doc);
|
||||
//System.out.println(" score=" + group.scoreDocs[hitIDX].score + " childID=" + doc.get("childID") + " (docID=" + group.scoreDocs[hitIDX].doc + ")");
|
||||
System.out.println(" childID=" + doc.get("childID") + " child0=" + doc.get("child0") + " (docID=" + group.scoreDocs[hitIDX].doc + ")");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (results.totalHits == 0) {
|
||||
assertNull(joinResults);
|
||||
} else {
|
||||
compareHits(r, joinR, results, joinResults);
|
||||
}
|
||||
}
|
||||
|
||||
r.close();
|
||||
joinR.close();
|
||||
dir.close();
|
||||
joinDir.close();
|
||||
}
|
||||
|
||||
private void compareHits(IndexReader r, IndexReader joinR, TopDocs results, TopGroups<Integer> joinResults) throws Exception {
|
||||
// results is 'complete'; joinResults is a subset
|
||||
int resultUpto = 0;
|
||||
int joinGroupUpto = 0;
|
||||
|
||||
final ScoreDoc[] hits = results.scoreDocs;
|
||||
final GroupDocs<Integer>[] groupDocs = joinResults.groups;
|
||||
|
||||
while(joinGroupUpto < groupDocs.length) {
|
||||
final GroupDocs<Integer> group = groupDocs[joinGroupUpto++];
|
||||
final ScoreDoc[] groupHits = group.scoreDocs;
|
||||
assertNotNull(group.groupValue);
|
||||
final Document parentDoc = joinR.document(group.groupValue);
|
||||
final String parentID = parentDoc.get("parentID");
|
||||
//System.out.println("GROUP groupDoc=" + group.groupDoc + " parent=" + parentDoc);
|
||||
assertNotNull(parentID);
|
||||
assertTrue(groupHits.length > 0);
|
||||
for(int hitIDX=0;hitIDX<groupHits.length;hitIDX++) {
|
||||
final Document nonJoinHit = r.document(hits[resultUpto++].doc);
|
||||
final Document joinHit = joinR.document(groupHits[hitIDX].doc);
|
||||
assertEquals(parentID,
|
||||
nonJoinHit.get("parentID"));
|
||||
assertEquals(joinHit.get("childID"),
|
||||
nonJoinHit.get("childID"));
|
||||
}
|
||||
|
||||
if (joinGroupUpto < groupDocs.length) {
|
||||
// Advance non-join hit to the next parentID:
|
||||
//System.out.println(" next joingroupUpto=" + joinGroupUpto + " gd.length=" + groupDocs.length + " parentID=" + parentID);
|
||||
while(true) {
|
||||
assertTrue(resultUpto < hits.length);
|
||||
if (!parentID.equals(r.document(hits[resultUpto].doc).get("parentID"))) {
|
||||
break;
|
||||
}
|
||||
resultUpto++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue