LUCENE-3685: add ToChildBlockJoinQuery, to join from parent to child

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1231512 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2012-01-14 15:17:04 +00:00
parent b2de49f606
commit d584f6361d
7 changed files with 618 additions and 61 deletions

View File

@ -111,6 +111,11 @@ New Features
* LUCENE-3634: IndexReader's static main method was moved to a new
tool, CompoundFileExtractor, in contrib/misc. (Mike McCandless)
* LUCENE-3685: Add ToChildBlockJoinQuery and rename the previous
BlockJoinQuery to ToParentBlockJoinQuery, so that you can now do
joins in both the parent-to-child and child-to-parent directions.
(Mike McCandless)
API Changes
* LUCENE-3596: DirectoryTaxonomyWriter.openIndexWriter() now takes an

View File

@ -33,6 +33,8 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.junit.Ignore;
// nocommit
@Ignore
public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
//this is some text here is a link and another link . This is an entity: & plus a <. Here is an &

View File

@ -0,0 +1,316 @@
package org.apache.lucene.search.join;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Set;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter; // javadocs
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Scorer.ChildScorer;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
/**
* Just like {@link ToParentBlockJoinQuery}, except this
* query joins in reverse: you provide a Query matching
* parent documents and it joins down to child
* documents.
*
* @lucene.experimental
*/
public class ToChildBlockJoinQuery extends Query {
private final Filter parentsFilter;
private final Query parentQuery;
// If we are rewritten, this is the original parentQuery we
// were passed; we use it for .equals() and
// .hashCode(). This makes the rewritten query equal to the
// original, so the user does not have to .rewrite() their
// query before searching:
private final Query origParentQuery;
private final boolean doScores;
public ToChildBlockJoinQuery(Query parentQuery, Filter parentsFilter, boolean doScores) {
super();
this.origParentQuery = parentQuery;
this.parentQuery = parentQuery;
this.parentsFilter = parentsFilter;
this.doScores = doScores;
}
private ToChildBlockJoinQuery(Query origParentQuery, Query parentQuery, Filter parentsFilter, boolean doScores) {
super();
this.origParentQuery = origParentQuery;
this.parentQuery = parentQuery;
this.parentsFilter = parentsFilter;
this.doScores = doScores;
}
@Override
public Weight createWeight(IndexSearcher searcher) throws IOException {
return new ToChildBlockJoinWeight(this, parentQuery.createWeight(searcher), parentsFilter, doScores);
}
private static class ToChildBlockJoinWeight extends Weight {
private final Query joinQuery;
private final Weight parentWeight;
private final Filter parentsFilter;
private final boolean doScores;
public ToChildBlockJoinWeight(Query joinQuery, Weight parentWeight, Filter parentsFilter, boolean doScores) {
super();
this.joinQuery = joinQuery;
this.parentWeight = parentWeight;
this.parentsFilter = parentsFilter;
this.doScores = doScores;
}
@Override
public Query getQuery() {
return joinQuery;
}
@Override
public float getValueForNormalization() throws IOException {
return parentWeight.getValueForNormalization() * joinQuery.getBoost() * joinQuery.getBoost();
}
@Override
public void normalize(float norm, float topLevelBoost) {
parentWeight.normalize(norm, topLevelBoost * joinQuery.getBoost());
}
@Override
public Scorer scorer(AtomicReaderContext readerContext, boolean scoreDocsInOrder,
boolean topScorer, Bits acceptDocs) throws IOException {
// Pass scoreDocsInOrder true, topScorer false to our sub:
final Scorer parentScorer = parentWeight.scorer(readerContext, true, false, acceptDocs);
if (parentScorer == null) {
// No matches
return null;
}
final DocIdSet parents = parentsFilter.getDocIdSet(readerContext, readerContext.reader.getLiveDocs());
// TODO: once we do random-access filters we can
// generalize this:
if (parents == null) {
// No matches
return null;
}
if (!(parents instanceof FixedBitSet)) {
throw new IllegalStateException("parentFilter must return FixedBitSet; got " + parents);
}
return new ToChildBlockJoinScorer(this, parentScorer, (FixedBitSet) parents, doScores);
}
@Override
public Explanation explain(AtomicReaderContext reader, int doc) throws IOException {
// TODO
throw new UnsupportedOperationException(getClass().getName() +
" cannot explain match on parent document");
}
@Override
public boolean scoresDocsOutOfOrder() {
return false;
}
}
static class ToChildBlockJoinScorer extends Scorer {
private final Scorer parentScorer;
private final FixedBitSet parentBits;
private final boolean doScores;
private float parentScore;
private int childDoc = -1;
private int parentDoc;
public ToChildBlockJoinScorer(Weight weight, Scorer parentScorer, FixedBitSet parentBits, boolean doScores) {
super(weight);
this.doScores = doScores;
this.parentBits = parentBits;
this.parentScorer = parentScorer;
}
@Override
public Collection<ChildScorer> getChildren() {
return Collections.singletonList(new ChildScorer(parentScorer, "BLOCK_JOIN"));
}
@Override
public int nextDoc() throws IOException {
//System.out.println("Q.nextDoc() parentDoc=" + parentDoc + " childDoc=" + childDoc);
if (childDoc+1 == parentDoc) {
// OK, we are done iterating through all children
// matching this one parent doc, so we now nextDoc()
// the parent. Use a while loop because we may have
// to skip over some number of parents w/ no
// children:
while (true) {
parentDoc = parentScorer.nextDoc();
if (parentDoc == 0) {
// Degenerate but allowed: parent has no children
// TODO: would be nice to pull initial parent
// into ctor so we can skip this if... but it's
// tricky because scorer must return -1 for
// .doc() on init...
parentDoc = parentScorer.nextDoc();
}
if (parentDoc == NO_MORE_DOCS) {
childDoc = NO_MORE_DOCS;
//System.out.println(" END");
return childDoc;
}
childDoc = 1 + parentBits.prevSetBit(parentDoc-1);
if (childDoc < parentDoc) {
if (doScores) {
parentScore = parentScorer.score();
}
//System.out.println(" " + childDoc);
return childDoc;
} else {
// Degenerate but allowed: parent has no children
}
}
} else {
assert childDoc < parentDoc: "childDoc=" + childDoc + " parentDoc=" + parentDoc;
childDoc++;
//System.out.println(" " + childDoc);
return childDoc;
}
}
@Override
public int docID() {
return childDoc;
}
@Override
public float score() throws IOException {
return parentScore;
}
@Override
public int advance(int childTarget) throws IOException {
//System.out.println("Q.advance childTarget=" + childTarget);
if (childTarget == NO_MORE_DOCS) {
//System.out.println(" END");
return childDoc = parentDoc = NO_MORE_DOCS;
}
assert childTarget != parentDoc;
if (childTarget > parentDoc) {
// Advance to new parent:
parentDoc = parentScorer.advance(childTarget);
//System.out.println(" advance to parentDoc=" + parentDoc);
assert parentDoc > childTarget;
if (parentDoc == NO_MORE_DOCS) {
//System.out.println(" END");
return childDoc = NO_MORE_DOCS;
}
if (doScores) {
parentScore = parentScorer.score();
}
final int firstChild = parentBits.prevSetBit(parentDoc-1);
//System.out.println(" firstChild=" + firstChild);
childTarget = Math.max(childTarget, firstChild);
}
assert childTarget < parentDoc;
// Advance within children of current parent:
childDoc = childTarget;
//System.out.println(" " + childDoc);
return childDoc;
}
}
@Override
public void extractTerms(Set<Term> terms) {
parentQuery.extractTerms(terms);
}
@Override
public Query rewrite(IndexReader reader) throws IOException {
final Query parentRewrite = parentQuery.rewrite(reader);
if (parentRewrite != parentQuery) {
Query rewritten = new ToChildBlockJoinQuery(parentQuery,
parentRewrite,
parentsFilter,
doScores);
rewritten.setBoost(getBoost());
return rewritten;
} else {
return this;
}
}
@Override
public String toString(String field) {
return "ToChildBlockJoinQuery ("+parentQuery.toString()+")";
}
@Override
public boolean equals(Object _other) {
if (_other instanceof ToChildBlockJoinQuery) {
final ToChildBlockJoinQuery other = (ToChildBlockJoinQuery) _other;
return origParentQuery.equals(other.origParentQuery) &&
parentsFilter.equals(other.parentsFilter) &&
doScores == other.doScores;
} else {
return false;
}
}
@Override
public int hashCode() {
final int prime = 31;
int hash = 1;
hash = prime * hash + origParentQuery.hashCode();
hash = prime * hash + new Boolean(doScores).hashCode();
hash = prime * hash + parentsFilter.hashCode();
return hash;
}
@Override
public Object clone() {
return new ToChildBlockJoinQuery((Query) origParentQuery.clone(),
parentsFilter,
doScores);
}
}
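For orientation, a minimal sketch of how the new query above might be used, modeled on the test changes later in this commit; the field names ("country", "skill"), the IndexSearcher, and the parents Filter are illustrative assumptions, not part of this patch:
// Sketch only: "searcher" is an IndexSearcher over an index in which each
// parent document was added in the same block as its child documents, and
// "parentsFilter" identifies the parent docs (it must produce a FixedBitSet
// per segment).
Query parentQuery = new TermQuery(new Term("country", "Canada"));
// Join the matching parents down to their child documents; the boolean
// argument controls whether each child hit carries its parent's score:
ToChildBlockJoinQuery childJoinQuery =
    new ToChildBlockJoinQuery(parentQuery, parentsFilter, true);
// Optionally AND with a child-space query; the returned hits are child docs:
BooleanQuery query = new BooleanQuery();
query.add(childJoinQuery, BooleanClause.Occur.MUST);
query.add(new TermQuery(new Term("skill", "java")), BooleanClause.Occur.MUST);
TopDocs childHits = searcher.search(query, 10);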

View File

@ -60,7 +60,7 @@ import org.apache.lucene.util.ArrayUtil;
*
* <p>You should only use this
* collector if one or more of the clauses in the query is
* a {@link BlockJoinQuery}. This collector will find those query
* a {@link ToParentBlockJoinQuery}. This collector will find those query
* clauses and record the matching child documents for the
* top scoring parent documents.</p>
*
@ -70,11 +70,11 @@ import org.apache.lucene.util.ArrayUtil;
* parent table were indexed as a doc block.</p>
*
* <p>For the simple star join you can retrieve the
* {@link TopGroups} instance containing each {@link BlockJoinQuery}'s
* {@link TopGroups} instance containing each {@link ToParentBlockJoinQuery}'s
* matching child documents for the top parent groups,
* using {@link #getTopGroups}. Ie,
* a single query, which will contain two or more
* {@link BlockJoinQuery}'s as clauses representing the star join,
* {@link ToParentBlockJoinQuery}'s as clauses representing the star join,
* can then retrieve two or more {@link TopGroups} instances.</p>
*
* <p>For nested joins, the query will run correctly (ie,
@ -90,7 +90,7 @@ import org.apache.lucene.util.ArrayUtil;
*
* @lucene.experimental
*/
public class BlockJoinCollector extends Collector {
public class ToParentBlockJoinCollector extends Collector {
private final Sort sort;
@ -106,7 +106,7 @@ public class BlockJoinCollector extends Collector {
private final boolean trackScores;
private int docBase;
private BlockJoinQuery.BlockJoinScorer[] joinScorers = new BlockJoinQuery.BlockJoinScorer[0];
private ToParentBlockJoinQuery.BlockJoinScorer[] joinScorers = new ToParentBlockJoinQuery.BlockJoinScorer[0];
private IndexReader.AtomicReaderContext currentReaderContext;
private Scorer scorer;
private boolean queueFull;
@ -115,9 +115,9 @@ public class BlockJoinCollector extends Collector {
private int totalHitCount;
private float maxScore = Float.NaN;
/* Creates a BlockJoinCollector. The provided sort must
/* Creates a ToParentBlockJoinCollector. The provided sort must
* not be null. */
public BlockJoinCollector(Sort sort, int numParentHits, boolean trackScores, boolean trackMaxScore) throws IOException {
public ToParentBlockJoinCollector(Sort sort, int numParentHits, boolean trackScores, boolean trackMaxScore) throws IOException {
// TODO: allow null sort to be specialized to relevance
// only collector
this.sort = sort;
@ -252,7 +252,7 @@ public class BlockJoinCollector extends Collector {
//System.out.println("copyGroups parentDoc=" + og.doc);
for(int scorerIDX = 0;scorerIDX < numSubScorers;scorerIDX++) {
final BlockJoinQuery.BlockJoinScorer joinScorer = joinScorers[scorerIDX];
final ToParentBlockJoinQuery.BlockJoinScorer joinScorer = joinScorers[scorerIDX];
//System.out.println(" scorer=" + joinScorer);
if (joinScorer != null) {
og.counts[scorerIDX] = joinScorer.getChildCount();
@ -284,12 +284,12 @@ public class BlockJoinCollector extends Collector {
return false;
}
private void enroll(BlockJoinQuery query, BlockJoinQuery.BlockJoinScorer scorer) {
private void enroll(ToParentBlockJoinQuery query, ToParentBlockJoinQuery.BlockJoinScorer scorer) {
final Integer slot = joinQueryID.get(query);
if (slot == null) {
joinQueryID.put(query, joinScorers.length);
//System.out.println("found JQ: " + query + " slot=" + joinScorers.length);
final BlockJoinQuery.BlockJoinScorer[] newArray = new BlockJoinQuery.BlockJoinScorer[1+joinScorers.length];
final ToParentBlockJoinQuery.BlockJoinScorer[] newArray = new ToParentBlockJoinQuery.BlockJoinScorer[1+joinScorers.length];
System.arraycopy(joinScorers, 0, newArray, 0, joinScorers.length);
joinScorers = newArray;
joinScorers[joinScorers.length-1] = scorer;
@ -313,8 +313,8 @@ public class BlockJoinCollector extends Collector {
Queue<Scorer> queue = new LinkedList<Scorer>();
queue.add(scorer);
while ((scorer = queue.poll()) != null) {
if (scorer instanceof BlockJoinQuery.BlockJoinScorer) {
enroll((BlockJoinQuery) scorer.getWeight().getQuery(), (BlockJoinQuery.BlockJoinScorer)scorer);
if (scorer instanceof ToParentBlockJoinQuery.BlockJoinScorer) {
enroll((ToParentBlockJoinQuery) scorer.getWeight().getQuery(), (ToParentBlockJoinQuery.BlockJoinScorer) scorer);
}
for (ChildScorer sub : scorer.getChildren()) {
@ -370,7 +370,7 @@ public class BlockJoinCollector extends Collector {
* is not computed (will always be 0). Returns null if
* no groups matched. */
@SuppressWarnings("unchecked")
public TopGroups<Integer> getTopGroups(BlockJoinQuery query, Sort withinGroupSort, int offset, int maxDocsPerGroup, int withinGroupOffset, boolean fillSortFields)
public TopGroups<Integer> getTopGroups(ToParentBlockJoinQuery query, Sort withinGroupSort, int offset, int maxDocsPerGroup, int withinGroupOffset, boolean fillSortFields)
throws IOException {
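As a rough sketch of the renamed collector in use, mirroring the test changes later in this commit (the IndexSearcher "s", the parents Filter, and the field name are assumptions for illustration):
// Sketch only: wrap a child-space query so it joins up to parent hits.
Query childQuery = new TermQuery(new Term("skill", "java"));
ToParentBlockJoinQuery childJoinQuery =
    new ToParentBlockJoinQuery(childQuery, parentsFilter,
                               ToParentBlockJoinQuery.ScoreMode.Avg);
// Collect the top parent hits, tracking which child docs matched under each:
ToParentBlockJoinCollector c =
    new ToParentBlockJoinCollector(Sort.RELEVANCE, 10, true, false);
s.search(childJoinQuery, c);
// Retrieve the matching child documents grouped under each parent hit:
TopGroups<Integer> groups = c.getTopGroups(childJoinQuery, null, 0, 10, 0, true);
for (GroupDocs<Integer> group : groups.groups) {
  Document parentDoc = s.doc(group.groupValue);  // the parent hit
  for (ScoreDoc childHit : group.scoreDocs) {    // its matching children
    Document childDoc = s.doc(childHit.doc);
  }
}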

View File

@ -56,12 +56,15 @@ import org.apache.lucene.util.FixedBitSet;
* You can then use this Query as a clause with
* other queries in the parent document space.</p>
*
* <p>See {@link ToChildBlockJoinQuery} if you need to join
in the reverse order.</p>
*
* <p>The child documents must be orthogonal to the parent
* documents: the wrapped child query must never
* return a parent document.</p>
*
* If you'd like to retrieve {@link TopGroups} for the
* resulting query, use the {@link BlockJoinCollector}.
* resulting query, use the {@link ToParentBlockJoinCollector}.
* Note that this is not necessary, ie, if you simply want
* to collect the parent documents and don't need to see
* which child documents matched under that parent, then
@ -71,7 +74,7 @@ import org.apache.lucene.util.FixedBitSet;
* matches, for example you OR a parent-only query with a
* joined child-only query, then the resulting collected documents
* will be correct, however the {@link TopGroups} you get
* from {@link BlockJoinCollector} will not contain every
* from {@link ToParentBlockJoinCollector} will not contain every
* child for parents that had matched.
*
* <p>See {@link org.apache.lucene.search.join} for an
@ -80,9 +83,22 @@ import org.apache.lucene.util.FixedBitSet;
* @lucene.experimental
*/
public class BlockJoinQuery extends Query {
public class ToParentBlockJoinQuery extends Query {
public static enum ScoreMode {None, Avg, Max, Total};
/** How to aggregate multiple child hit scores into a
* single parent score. */
public static enum ScoreMode {
/** Do no scoring. */
None,
/** Parent hit's score is the average of all child
scores. */
Avg,
/** Parent hit's score is the max of all child
scores. */
Max,
/** Parent hit's score is the sum of all child
scores. */
Total};
private final Filter parentsFilter;
private final Query childQuery;
@ -95,7 +111,15 @@ public class BlockJoinQuery extends Query {
private final Query origChildQuery;
private final ScoreMode scoreMode;
public BlockJoinQuery(Query childQuery, Filter parentsFilter, ScoreMode scoreMode) {
/** Create a ToParentBlockJoinQuery.
*
* @param childQuery Query matching child documents.
* @param parentsFilter Filter (must produce FixedBitSet
* per-segment) identifying the parent documents.
* @param scoreMode How to aggregate multiple child scores
* into a single parent score.
**/
public ToParentBlockJoinQuery(Query childQuery, Filter parentsFilter, ScoreMode scoreMode) {
super();
this.origChildQuery = childQuery;
this.childQuery = childQuery;
@ -103,7 +127,7 @@ public class BlockJoinQuery extends Query {
this.scoreMode = scoreMode;
}
private BlockJoinQuery(Query origChildQuery, Query childQuery, Filter parentsFilter, ScoreMode scoreMode) {
private ToParentBlockJoinQuery(Query origChildQuery, Query childQuery, Filter parentsFilter, ScoreMode scoreMode) {
super();
this.origChildQuery = origChildQuery;
this.childQuery = childQuery;
@ -267,9 +291,9 @@ public class BlockJoinQuery extends Query {
//System.out.println(" c=" + nextChildDoc);
if (pendingChildDocs.length == childDocUpto) {
pendingChildDocs = ArrayUtil.grow(pendingChildDocs);
if (scoreMode != ScoreMode.None) {
pendingChildScores = ArrayUtil.grow(pendingChildScores);
}
if (scoreMode != ScoreMode.None && pendingChildScores.length == childDocUpto) {
pendingChildScores = ArrayUtil.grow(pendingChildScores);
}
pendingChildDocs[childDocUpto] = nextChildDoc;
if (scoreMode != ScoreMode.None) {
@ -362,7 +386,7 @@ public class BlockJoinQuery extends Query {
public Query rewrite(IndexReader reader) throws IOException {
final Query childRewrite = childQuery.rewrite(reader);
if (childRewrite != childQuery) {
Query rewritten = new BlockJoinQuery(childQuery,
Query rewritten = new ToParentBlockJoinQuery(childQuery,
childRewrite,
parentsFilter,
scoreMode);
@ -375,13 +399,13 @@ public class BlockJoinQuery extends Query {
@Override
public String toString(String field) {
return "BlockJoinQuery ("+childQuery.toString()+")";
return "ToParentBlockJoinQuery ("+childQuery.toString()+")";
}
@Override
public boolean equals(Object _other) {
if (_other instanceof BlockJoinQuery) {
final BlockJoinQuery other = (BlockJoinQuery) _other;
if (_other instanceof ToParentBlockJoinQuery) {
final ToParentBlockJoinQuery other = (ToParentBlockJoinQuery) _other;
return origChildQuery.equals(other.origChildQuery) &&
parentsFilter.equals(other.parentsFilter) &&
scoreMode == other.scoreMode;
@ -402,7 +426,7 @@ public class BlockJoinQuery extends Query {
@Override
public Object clone() {
return new BlockJoinQuery((Query) origChildQuery.clone(),
return new ToParentBlockJoinQuery((Query) origChildQuery.clone(),
parentsFilter,
scoreMode);
}
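To illustrate the javadoc's point about using the wrapped query as a clause alongside ordinary parent-space clauses, a small sketch (again, the searcher, parents filter, and field names are assumed, and the ScoreMode choice is arbitrary):
// Sketch only: a parent must match the parent-space clause AND have at
// least one child matching the child-space query.
Query parentQuery = new TermQuery(new Term("docType", "resume"));
Query childQuery = new TermQuery(new Term("skill", "java"));
ToParentBlockJoinQuery childJoinQuery =
    new ToParentBlockJoinQuery(childQuery, parentsFilter,
                               ToParentBlockJoinQuery.ScoreMode.Max);
BooleanQuery fullQuery = new BooleanQuery();
fullQuery.add(parentQuery, BooleanClause.Occur.MUST);    // parent-space clause
fullQuery.add(childJoinQuery, BooleanClause.Occur.MUST); // joined child clause
TopDocs parentHits = s.search(fullQuery, 10);            // hits are parent docs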

View File

@ -14,19 +14,25 @@
parent documents, as Lucene does not currently record any information
about doc blocks.</p>
<p>At search time, use {@link org.apache.lucene.search.join.BlockJoinQuery} to remap
matches from any child {@link org.apache.lucene.search.Query} (ie, a query that matches only
child documents) up to the parent document space. The resulting
{@link org.apache.lucene.search.join.BlockJoinQuery} can then be used as a clause in any query that
matches parent documents.</p>
<p>At search time, use {@link
org.apache.lucene.search.join.ToParentBlockJoinQuery} to remap/join
matches from any child {@link org.apache.lucene.search.Query} (ie, a
query that matches only child documents) up to the parent document
space. The
resulting query can then be used as a clause in any query that
matches parent documents.</p>
<p>If you only care about the parent documents matching the query, you
can use any collector to collect the parent hits, but if you'd also
like to see which child documents match for each parent document,
use the {@link org.apache.lucene.search.join.BlockJoinCollector} to collect the hits. Once the
use the {@link org.apache.lucene.search.join.ToParentBlockJoinCollector} to collect the hits. Once the
search is done, you retrieve a {@link
org.apache.lucene.search.grouping.TopGroups} instance from the
{@link org.apache.lucene.search.join.BlockJoinCollector#getTopGroups} method.</p>
{@link org.apache.lucene.search.join.ToParentBlockJoinCollector#getTopGroups} method.</p>
<p>To map/join in the opposite direction, use {@link
org.apache.lucene.search.join.ToChildBlockJoinQuery}. This wraps
any query matching parent documents, creating the joined query
matching only child documents.</p>
</body>
</html>
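The indexing side described above is not part of this diff; as a hedged sketch of what a doc block looks like (the pre-built child/parent Documents and the IndexWriter are assumed):
// Sketch only: child documents first, parent document last, added in one
// call so the block keeps contiguous doc IDs within the segment.
List<Document> block = new ArrayList<Document>();
block.add(jobDoc1);    // child
block.add(jobDoc2);    // child
block.add(resumeDoc);  // the parent document comes last in the block
writer.addDocuments(block);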

View File

@ -17,6 +17,7 @@ package org.apache.lucene.search.join;
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@ -26,6 +27,7 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.index.RandomIndexWriter;
@ -34,10 +36,9 @@ import org.apache.lucene.search.*;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.grouping.GroupDocs;
import org.apache.lucene.search.grouping.TopGroups;
import org.apache.lucene.search.join.BlockJoinCollector;
import org.apache.lucene.search.join.BlockJoinQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.ReaderUtil;
import org.apache.lucene.util._TestUtil;
@ -57,7 +58,7 @@ public class TestBlockJoin extends LuceneTestCase {
private Document makeJob(String skill, int year) {
Document job = new Document();
job.add(newField("skill", skill, StringField.TYPE_STORED));
job.add(new NumericField("year").setIntValue(year));
job.add(new NumericField("year", NumericField.TYPE_STORED).setIntValue(year));
return job;
}
@ -104,14 +105,14 @@ public class TestBlockJoin extends LuceneTestCase {
// Wrap the child document query to 'join' any matches
// up to corresponding parent:
BlockJoinQuery childJoinQuery = new BlockJoinQuery(childQuery, parentsFilter, BlockJoinQuery.ScoreMode.Avg);
ToParentBlockJoinQuery childJoinQuery = new ToParentBlockJoinQuery(childQuery, parentsFilter, ToParentBlockJoinQuery.ScoreMode.Avg);
// Combine the parent and nested child queries into a single query for a candidate
BooleanQuery fullQuery = new BooleanQuery();
fullQuery.add(new BooleanClause(parentQuery, Occur.MUST));
fullQuery.add(new BooleanClause(childJoinQuery, Occur.MUST));
BlockJoinCollector c = new BlockJoinCollector(Sort.RELEVANCE, 1, true, false);
ToParentBlockJoinCollector c = new ToParentBlockJoinCollector(Sort.RELEVANCE, 1, true, false);
s.search(fullQuery, c);
@ -131,10 +132,35 @@ public class TestBlockJoin extends LuceneTestCase {
Document parentDoc = s.doc(group.groupValue);
assertEquals("Lisa", parentDoc.get("name"));
//System.out.println("TEST: now test up");
// Now join "up" (map parent hits to child docs) instead...:
ToChildBlockJoinQuery parentJoinQuery = new ToChildBlockJoinQuery(parentQuery, parentsFilter, random.nextBoolean());
BooleanQuery fullChildQuery = new BooleanQuery();
fullChildQuery.add(new BooleanClause(parentJoinQuery, Occur.MUST));
fullChildQuery.add(new BooleanClause(childQuery, Occur.MUST));
//System.out.println("FULL: " + fullChildQuery);
TopDocs hits = s.search(fullChildQuery, 10);
assertEquals(1, hits.totalHits);
childDoc = s.doc(hits.scoreDocs[0].doc);
//System.out.println("CHILD = " + childDoc + " docID=" + hits.scoreDocs[0].doc);
assertEquals("java", childDoc.get("skill"));
assertEquals(2007, ((NumericField) childDoc.getField("year")).numericValue());
assertEquals("Lisa", getParentDoc(r, parentsFilter, hits.scoreDocs[0].doc).get("name"));
r.close();
dir.close();
}
private Document getParentDoc(IndexReader reader, Filter parents, int childDocID) throws IOException {
final AtomicReaderContext[] leaves = ReaderUtil.leaves(reader.getTopReaderContext());
final int subIndex = ReaderUtil.subIndex(childDocID, leaves);
final AtomicReaderContext leaf = leaves[subIndex];
final FixedBitSet bits = (FixedBitSet) parents.getDocIdSet(leaf, null);
return leaf.reader.document(bits.nextSetBit(childDocID - leaf.docBase));
}
public void testBoostBug() throws Exception {
final Directory dir = newDirectory();
final RandomIndexWriter w = new RandomIndexWriter(random, dir);
@ -142,7 +168,7 @@ public class TestBlockJoin extends LuceneTestCase {
w.close();
IndexSearcher s = newSearcher(r);
BlockJoinQuery q = new BlockJoinQuery(new MatchAllDocsQuery(), new QueryWrapperFilter(new MatchAllDocsQuery()), BlockJoinQuery.ScoreMode.Avg);
ToParentBlockJoinQuery q = new ToParentBlockJoinQuery(new MatchAllDocsQuery(), new QueryWrapperFilter(new MatchAllDocsQuery()), ToParentBlockJoinQuery.ScoreMode.Avg);
s.search(q, 10);
BooleanQuery bq = new BooleanQuery();
bq.setBoost(2f); // we boost the BQ
@ -199,8 +225,9 @@ public class TestBlockJoin extends LuceneTestCase {
public void testRandom() throws Exception {
// We build two indices at once: one normalized (which
// BlockJoinQuery/Collector can query) and the other w/
// same docs just fully denormalized:
// ToParentBlockJoinQuery/Collector,
// ToChildBlockJoinQuery can query) and the other w/
// the same docs, just fully denormalized:
final Directory dir = newDirectory();
final Directory joinDir = newDirectory();
@ -212,7 +239,7 @@ public class TestBlockJoin extends LuceneTestCase {
// Values for child fields:
final String[][] childFields = getRandomFields(numParentDocs);
// TODO: test star join, nested join cases too!
// TODO: parallel star join, nested join cases too!
final RandomIndexWriter w = new RandomIndexWriter(random, dir);
final RandomIndexWriter joinW = new RandomIndexWriter(random, joinDir);
for(int parentDocID=0;parentDocID<numParentDocs;parentDocID++) {
@ -235,7 +262,15 @@ public class TestBlockJoin extends LuceneTestCase {
final List<Document> joinDocs = new ArrayList<Document>();
if (VERBOSE) {
System.out.println(" " + parentDoc);
StringBuilder sb = new StringBuilder();
sb.append("parentID=" + parentDoc.get("parentID"));
for(int fieldID=0;fieldID<parentFields.length;fieldID++) {
String s = parentDoc.get("parent" + fieldID);
if (s != null) {
sb.append(" parent" + fieldID + "=" + s);
}
}
System.out.println(" " + sb.toString());
}
final int numChildDocs = _TestUtil.nextInt(random, 1, 20);
@ -260,7 +295,15 @@ public class TestBlockJoin extends LuceneTestCase {
}
if (VERBOSE) {
System.out.println(" " + joinChildDoc);
StringBuilder sb = new StringBuilder();
sb.append("childID=" + joinChildDoc.get("childID"));
for(int fieldID=0;fieldID<childFields.length;fieldID++) {
String s = joinChildDoc.get("child" + fieldID);
if (s != null) {
sb.append(" child" + fieldID + "=" + s);
}
}
System.out.println(" " + sb.toString());
}
w.addDocument(childDoc);
@ -335,14 +378,26 @@ public class TestBlockJoin extends LuceneTestCase {
random.nextBoolean() ? BooleanClause.Occur.MUST : BooleanClause.Occur.MUST_NOT);
}
final BlockJoinQuery childJoinQuery = new BlockJoinQuery(childQuery, parentsFilter, BlockJoinQuery.ScoreMode.Avg);
final int x = random.nextInt(4);
final ToParentBlockJoinQuery.ScoreMode agg;
if (x == 0) {
agg = ToParentBlockJoinQuery.ScoreMode.None;
} else if (x == 1) {
agg = ToParentBlockJoinQuery.ScoreMode.Max;
} else if (x == 2) {
agg = ToParentBlockJoinQuery.ScoreMode.Total;
} else {
agg = ToParentBlockJoinQuery.ScoreMode.Avg;
}
final ToParentBlockJoinQuery childJoinQuery = new ToParentBlockJoinQuery(childQuery, parentsFilter, agg);
// To run against the block-join index:
final Query parentJoinQuery;
// Same query as parentJoinQuery, but to run against
// the fully denormalized index (so we can compare)
// results:
// the fully denormalized index (so we can compare
// results):
final Query parentQuery;
if (random.nextBoolean()) {
@ -383,7 +438,7 @@ public class TestBlockJoin extends LuceneTestCase {
System.out.println("\nTEST: query=" + parentQuery + " joinQuery=" + parentJoinQuery + " parentSort=" + parentSort + " childSort=" + childSort);
}
// Merge both sorst:
// Merge both sorts:
final List<SortField> sortFields = new ArrayList<SortField>(Arrays.asList(parentSort.getSort()));
sortFields.addAll(Arrays.asList(childSort.getSort()));
final Sort parentAndChildSort = new Sort(sortFields.toArray(new SortField[sortFields.size()]));
@ -413,7 +468,16 @@ public class TestBlockJoin extends LuceneTestCase {
}
}
final BlockJoinCollector c = new BlockJoinCollector(parentSort, 10, true, true);
final boolean trackScores;
final boolean trackMaxScore;
if (agg == ToParentBlockJoinQuery.ScoreMode.None) {
trackScores = false;
trackMaxScore = false;
} else {
trackScores = random.nextBoolean();
trackMaxScore = random.nextBoolean();
}
final ToParentBlockJoinCollector c = new ToParentBlockJoinCollector(parentSort, 10, trackScores, trackMaxScore);
joinS.search(parentJoinQuery, c);
@ -456,6 +520,124 @@ public class TestBlockJoin extends LuceneTestCase {
} else {
compareHits(r, joinR, results, joinResults);
}
// Test joining in the opposite direction (parent to
// child):
// Get random query against parent documents:
final Query parentQuery2;
if (random.nextInt(3) == 2) {
final int fieldID = random.nextInt(parentFields.length);
parentQuery2 = new TermQuery(new Term("parent" + fieldID,
parentFields[fieldID][random.nextInt(parentFields[fieldID].length)]));
} else if (random.nextInt(3) == 2) {
BooleanQuery bq = new BooleanQuery();
parentQuery2 = bq;
final int numClauses = _TestUtil.nextInt(random, 2, 4);
boolean didMust = false;
for(int clauseIDX=0;clauseIDX<numClauses;clauseIDX++) {
Query clause;
BooleanClause.Occur occur;
if (!didMust && random.nextBoolean()) {
occur = random.nextBoolean() ? BooleanClause.Occur.MUST : BooleanClause.Occur.MUST_NOT;
clause = new TermQuery(randomParentTerm(parentFields[0]));
didMust = true;
} else {
occur = BooleanClause.Occur.SHOULD;
final int fieldID = _TestUtil.nextInt(random, 1, parentFields.length-1);
clause = new TermQuery(new Term("parent" + fieldID,
parentFields[fieldID][random.nextInt(parentFields[fieldID].length)]));
}
bq.add(clause, occur);
}
} else {
BooleanQuery bq = new BooleanQuery();
parentQuery2 = bq;
bq.add(new TermQuery(randomParentTerm(parentFields[0])),
BooleanClause.Occur.MUST);
final int fieldID = _TestUtil.nextInt(random, 1, parentFields.length-1);
bq.add(new TermQuery(new Term("parent" + fieldID, parentFields[fieldID][random.nextInt(parentFields[fieldID].length)])),
random.nextBoolean() ? BooleanClause.Occur.MUST : BooleanClause.Occur.MUST_NOT);
}
if (VERBOSE) {
System.out.println("\nTEST: top down: parentQuery2=" + parentQuery2);
}
// Maps parent query to child docs:
final ToChildBlockJoinQuery parentJoinQuery2 = new ToChildBlockJoinQuery(parentQuery2, parentsFilter, random.nextBoolean());
// To run against the block-join index:
final Query childJoinQuery2;
// Same query as parentJoinQuery, but to run against
// the fully denormalized index (so we can compare
// results):
final Query childQuery2;
if (random.nextBoolean()) {
childQuery2 = parentQuery2;
childJoinQuery2 = parentJoinQuery2;
} else {
// AND child field w/ parent query:
final BooleanQuery bq = new BooleanQuery();
childJoinQuery2 = bq;
final Term childTerm = randomChildTerm(childFields[0]);
if (random.nextBoolean()) {
bq.add(parentJoinQuery2, BooleanClause.Occur.MUST);
bq.add(new TermQuery(childTerm),
BooleanClause.Occur.MUST);
} else {
bq.add(new TermQuery(childTerm),
BooleanClause.Occur.MUST);
bq.add(parentJoinQuery2, BooleanClause.Occur.MUST);
}
final BooleanQuery bq2 = new BooleanQuery();
childQuery2 = bq2;
if (random.nextBoolean()) {
bq2.add(parentQuery2, BooleanClause.Occur.MUST);
bq2.add(new TermQuery(childTerm),
BooleanClause.Occur.MUST);
} else {
bq2.add(new TermQuery(childTerm),
BooleanClause.Occur.MUST);
bq2.add(parentQuery2, BooleanClause.Occur.MUST);
}
}
final Sort childSort2 = getRandomSort("child", childFields.length);
// Search denormalized index:
if (VERBOSE) {
System.out.println("TEST: run top down query=" + childQuery2 + " sort=" + childSort2);
}
final TopDocs results2 = s.search(childQuery2, null, r.numDocs(),
childSort2);
if (VERBOSE) {
System.out.println(" " + results2.totalHits + " totalHits:");
for(ScoreDoc sd : results2.scoreDocs) {
final Document doc = s.doc(sd.doc);
System.out.println(" childID=" + doc.get("childID") + " parentID=" + doc.get("parentID") + " docID=" + sd.doc);
}
}
// Search join index:
if (VERBOSE) {
System.out.println("TEST: run top down join query=" + childJoinQuery2 + " sort=" + childSort2);
}
TopDocs joinResults2 = joinS.search(childJoinQuery2, null, joinR.numDocs(), childSort2);
if (VERBOSE) {
System.out.println(" " + joinResults2.totalHits + " totalHits:");
for(ScoreDoc sd : joinResults2.scoreDocs) {
final Document doc = joinS.doc(sd.doc);
final Document parentDoc = getParentDoc(joinR, parentsFilter, sd.doc);
System.out.println(" childID=" + doc.get("childID") + " parentID=" + parentDoc.get("parentID") + " docID=" + sd.doc);
}
}
compareChildHits(r, joinR, results2, joinResults2);
}
r.close();
@ -464,6 +646,28 @@ public class TestBlockJoin extends LuceneTestCase {
joinDir.close();
}
private void compareChildHits(IndexReader r, IndexReader joinR, TopDocs results, TopDocs joinResults) throws Exception {
assertEquals(results.totalHits, joinResults.totalHits);
assertEquals(results.scoreDocs.length, joinResults.scoreDocs.length);
for(int hitCount=0;hitCount<results.scoreDocs.length;hitCount++) {
ScoreDoc hit = results.scoreDocs[hitCount];
ScoreDoc joinHit = joinResults.scoreDocs[hitCount];
Document doc1 = r.document(hit.doc);
Document doc2 = joinR.document(joinHit.doc);
assertEquals("hit " + hitCount + " differs",
doc1.get("childID"), doc2.get("childID"));
// don't compare scores -- they are expected to differ
assertTrue(hit instanceof FieldDoc);
assertTrue(joinHit instanceof FieldDoc);
FieldDoc hit0 = (FieldDoc) hit;
FieldDoc joinHit0 = (FieldDoc) joinHit;
assertEquals(hit0.fields, joinHit0.fields);
}
}
private void compareHits(IndexReader r, IndexReader joinR, TopDocs results, TopGroups<Integer> joinResults) throws Exception {
// results is 'complete'; joinResults is a subset
int resultUpto = 0;
@ -539,8 +743,8 @@ public class TestBlockJoin extends LuceneTestCase {
// Wrap the child document query to 'join' any matches
// up to corresponding parent:
BlockJoinQuery childJobJoinQuery = new BlockJoinQuery(childJobQuery, parentsFilter, BlockJoinQuery.ScoreMode.Avg);
BlockJoinQuery childQualificationJoinQuery = new BlockJoinQuery(childQualificationQuery, parentsFilter, BlockJoinQuery.ScoreMode.Avg);
ToParentBlockJoinQuery childJobJoinQuery = new ToParentBlockJoinQuery(childJobQuery, parentsFilter, ToParentBlockJoinQuery.ScoreMode.Avg);
ToParentBlockJoinQuery childQualificationJoinQuery = new ToParentBlockJoinQuery(childQualificationQuery, parentsFilter, ToParentBlockJoinQuery.ScoreMode.Avg);
// Combine the parent and nested child queries into a single query for a candidate
BooleanQuery fullQuery = new BooleanQuery();
@ -548,12 +752,13 @@ public class TestBlockJoin extends LuceneTestCase {
fullQuery.add(new BooleanClause(childJobJoinQuery, Occur.MUST));
fullQuery.add(new BooleanClause(childQualificationJoinQuery, Occur.MUST));
//????? How do I control volume of jobs vs qualifications per parent?
BlockJoinCollector c = new BlockJoinCollector(Sort.RELEVANCE, 10, true, false);
// Collects all job and qualification child docs for
// each resume hit in the top N (sorted by score):
ToParentBlockJoinCollector c = new ToParentBlockJoinCollector(Sort.RELEVANCE, 10, true, false);
s.search(fullQuery, c);
//Examine "Job" children
// Examine "Job" children
boolean showNullPointerIssue=true;
if (showNullPointerIssue) {
TopGroups<Integer> jobResults = c.getTopGroups(childJobJoinQuery, null, 0, 10, 0, true);
@ -573,10 +778,9 @@ public class TestBlockJoin extends LuceneTestCase {
assertEquals("Lisa", parentDoc.get("name"));
}
//Now Examine qualification children
// Now Examine qualification children
TopGroups<Integer> qualificationResults = c.getTopGroups(childQualificationJoinQuery, null, 0, 10, 0, true);
//!!!!! This next line can null pointer - but only if prior "jobs" section called first
assertEquals(1, qualificationResults.totalGroupedHitCount);
assertEquals(1, qualificationResults.groups.length);
@ -610,7 +814,7 @@ public class TestBlockJoin extends LuceneTestCase {
new QueryWrapperFilter(
new TermQuery(new Term("parent", "1"))));
BlockJoinQuery q = new BlockJoinQuery(tq, parentFilter, BlockJoinQuery.ScoreMode.Avg);
ToParentBlockJoinQuery q = new ToParentBlockJoinQuery(tq, parentFilter, ToParentBlockJoinQuery.ScoreMode.Avg);
Weight weight = s.createNormalizedWeight(q);
DocIdSetIterator disi = weight.scorer(ReaderUtil.leaves(s.getIndexReader().getTopReaderContext())[0], true, true, null);
assertEquals(1, disi.advance(1));
@ -644,7 +848,7 @@ public class TestBlockJoin extends LuceneTestCase {
new QueryWrapperFilter(
new TermQuery(new Term("isparent", "yes"))));
BlockJoinQuery q = new BlockJoinQuery(tq, parentFilter, BlockJoinQuery.ScoreMode.Avg);
ToParentBlockJoinQuery q = new ToParentBlockJoinQuery(tq, parentFilter, ToParentBlockJoinQuery.ScoreMode.Avg);
Weight weight = s.createNormalizedWeight(q);
DocIdSetIterator disi = weight.scorer(ReaderUtil.leaves(s.getIndexReader().getTopReaderContext())[0], true, true, null);
assertEquals(2, disi.advance(0));