LUCENE-4968: fix several block join issues

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1477570 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2013-04-30 11:35:36 +00:00
parent 20f2f39170
commit e5b6c85074
4 changed files with 251 additions and 25 deletions

View File

@ -68,6 +68,11 @@ Bug Fixes
* LUCENE-4959: Fix incorrect return value in
SimpleNaiveBayesClassifier.assignClass. (Alexey Kutin via Adrien Grand)
* LUCENE-4968: Fixed ToParentBlockJoinQuery/Collector: correctly handle parent
hits that had no child matches, don't throw IllegalArgumentEx when
the child query has no hits, more aggressively catch cases where childQuery
incorrectly matches parent documents (Mike McCandless)
Optimizations
* LUCENE-4938: Don't use an unnecessarily large priority queue in IndexSearcher

View File

@ -111,6 +111,7 @@ public class ToParentBlockJoinCollector extends Collector {
if (trackMaxScore) {
maxScore = Float.MIN_VALUE;
}
//System.out.println("numParentHits=" + numParentHits);
this.trackScores = trackScores;
this.numParentHits = numParentHits;
queue = FieldValueHitQueue.create(sort.getSort(), numParentHits);
@ -122,6 +123,7 @@ public class ToParentBlockJoinCollector extends Collector {
private static final class OneGroup extends FieldValueHitQueue.Entry {
public OneGroup(int comparatorSlot, int parentDoc, float parentScore, int numJoins, boolean doScores) {
super(comparatorSlot, parentDoc, parentScore);
//System.out.println("make OneGroup parentDoc=" + parentDoc);
docs = new int[numJoins][];
for(int joinID=0;joinID<numJoins;joinID++) {
docs[joinID] = new int[5];
@ -142,7 +144,7 @@ public class ToParentBlockJoinCollector extends Collector {
@Override
public void collect(int parentDoc) throws IOException {
//System.out.println("C parentDoc=" + parentDoc);
//System.out.println("\nC parentDoc=" + parentDoc);
totalHitCount++;
float score = Float.NaN;
@ -203,8 +205,7 @@ public class ToParentBlockJoinCollector extends Collector {
for (int i = 0; i < comparators.length; i++) {
comparators[i].copy(comparatorSlot, parentDoc);
}
//System.out.println(" startup: new OG doc=" +
//(docBase+parentDoc));
//System.out.println(" startup: new OG doc=" + (docBase+parentDoc));
if (!trackMaxScore && trackScores) {
score = scorer.score();
}
@ -241,22 +242,28 @@ public class ToParentBlockJoinCollector extends Collector {
og.scores = ArrayUtil.grow(og.scores);
}
//System.out.println("copyGroups parentDoc=" + og.doc);
//System.out.println("\ncopyGroups parentDoc=" + og.doc);
for(int scorerIDX = 0;scorerIDX < numSubScorers;scorerIDX++) {
final ToParentBlockJoinQuery.BlockJoinScorer joinScorer = joinScorers[scorerIDX];
//System.out.println(" scorer=" + joinScorer);
if (joinScorer != null) {
if (joinScorer != null && docBase + joinScorer.getParentDoc() == og.doc) {
og.counts[scorerIDX] = joinScorer.getChildCount();
//System.out.println(" count=" + og.counts[scorerIDX]);
og.docs[scorerIDX] = joinScorer.swapChildDocs(og.docs[scorerIDX]);
assert og.docs[scorerIDX].length >= og.counts[scorerIDX]: "length=" + og.docs[scorerIDX].length + " vs count=" + og.counts[scorerIDX];
//System.out.println(" len=" + og.docs[scorerIDX].length);
/*
for(int idx=0;idx<og.counts[scorerIDX];idx++) {
for(int idx=0;idx<og.counts[scorerIDX];idx++) {
System.out.println(" docs[" + idx + "]=" + og.docs[scorerIDX][idx]);
}
}
*/
if (trackScores) {
//System.out.println(" copy scores");
og.scores[scorerIDX] = joinScorer.swapChildScores(og.scores[scorerIDX]);
assert og.scores[scorerIDX].length >= og.counts[scorerIDX]: "length=" + og.scores[scorerIDX].length + " vs count=" + og.counts[scorerIDX];
}
} else {
og.counts[scorerIDX] = 0;
}
}
}
@ -302,13 +309,16 @@ public class ToParentBlockJoinCollector extends Collector {
Arrays.fill(joinScorers, null);
Queue<Scorer> queue = new LinkedList<Scorer>();
//System.out.println("\nqueue: add top scorer=" + scorer);
queue.add(scorer);
while ((scorer = queue.poll()) != null) {
//System.out.println(" poll: " + scorer + "; " + scorer.getWeight().getQuery());
if (scorer instanceof ToParentBlockJoinQuery.BlockJoinScorer) {
enroll((ToParentBlockJoinQuery) scorer.getWeight().getQuery(), (ToParentBlockJoinQuery.BlockJoinScorer) scorer);
}
for (ChildScorer sub : scorer.getChildren()) {
//System.out.println(" add sub: " + sub.child + "; " + sub.child.getWeight().getQuery());
queue.add(sub.child);
}
}
@ -384,12 +394,8 @@ public class ToParentBlockJoinCollector extends Collector {
throws IOException {
final Integer _slot = joinQueryID.get(query);
if (_slot == null) {
if (totalHitCount == 0) {
return null;
} else {
throw new IllegalArgumentException("the Query did not contain the provided BlockJoinQuery");
}
if (_slot == null && totalHitCount == 0) {
return null;
}
if (sortedGroups == null) {
@ -401,7 +407,7 @@ public class ToParentBlockJoinCollector extends Collector {
return null;
}
return accumulateGroups(_slot, offset, maxDocsPerGroup, withinGroupOffset, withinGroupSort, fillSortFields);
return accumulateGroups(_slot == null ? -1 : _slot.intValue(), offset, maxDocsPerGroup, withinGroupOffset, withinGroupSort, fillSortFields);
}
/**
@ -423,18 +429,26 @@ public class ToParentBlockJoinCollector extends Collector {
final FakeScorer fakeScorer = new FakeScorer();
int totalGroupedHitCount = 0;
//System.out.println("slot=" + slot);
for(int groupIDX=offset;groupIDX<sortedGroups.length;groupIDX++) {
final OneGroup og = sortedGroups[groupIDX];
final int numChildDocs = og.counts[slot];
final int numChildDocs;
if (slot == -1 || slot >= og.counts.length) {
numChildDocs = 0;
} else {
numChildDocs = og.counts[slot];
}
// Number of documents in group should be bounded to prevent redundant memory allocation
final int numDocsInGroup = Math.min(numChildDocs, maxDocsPerGroup);
final int numDocsInGroup = Math.max(1, Math.min(numChildDocs, maxDocsPerGroup));
//System.out.println("parent doc=" + og.doc + " numChildDocs=" + numChildDocs + " maxDocsPG=" + maxDocsPerGroup);
// At this point we hold all docs w/ in each group,
// unsorted; we now sort them:
final TopDocsCollector<?> collector;
if (withinGroupSort == null) {
//System.out.println("sort by score");
// Sort by score
if (!trackScores) {
throw new IllegalArgumentException("cannot sort by relevance within group: trackScores=false");
@ -448,6 +462,7 @@ public class ToParentBlockJoinCollector extends Collector {
collector.setScorer(fakeScorer);
collector.setNextReader(og.readerContext);
for(int docIDX=0;docIDX<numChildDocs;docIDX++) {
//System.out.println("docIDX=" + docIDX + " vs " + og.docs[slot].length);
final int doc = og.docs[slot][docIDX];
fakeScorer.doc = doc;
if (trackScores) {

View File

@ -195,10 +195,8 @@ public class ToParentBlockJoinQuery extends Query {
@Override
public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
BlockJoinScorer scorer = (BlockJoinScorer) scorer(context, true, false, context.reader().getLiveDocs());
if (scorer != null) {
if (scorer.advance(doc) == doc) {
return scorer.explain(context.docBase);
}
if (scorer != null && scorer.advance(doc) == doc) {
return scorer.explain(context.docBase);
}
return new ComplexExplanation(false, 0.0f, "Not a match");
}
@ -246,6 +244,10 @@ public class ToParentBlockJoinQuery extends Query {
return childDocUpto;
}
int getParentDoc() {
return parentDoc;
}
int[] swapChildDocs(int[] other) {
final int[] ret = pendingChildDocs;
if (other == null) {
@ -272,7 +274,6 @@ public class ToParentBlockJoinQuery extends Query {
@Override
public int nextDoc() throws IOException {
//System.out.println("Q.nextDoc() nextChildDoc=" + nextChildDoc);
// Loop until we hit a parentDoc that's accepted
while (true) {
if (nextChildDoc == NO_MORE_DOCS) {
@ -285,6 +286,12 @@ public class ToParentBlockJoinQuery extends Query {
parentDoc = parentBits.nextSetBit(nextChildDoc);
// Parent & child docs are supposed to be
// orthogonal:
if (nextChildDoc == parentDoc) {
throw new IllegalStateException("child query must only match non-parent docs, but parent docID=" + nextChildDoc + " matched childScorer=" + childScorer.getClass());
}
//System.out.println(" parentDoc=" + parentDoc);
assert parentDoc != -1;
@ -295,6 +302,13 @@ public class ToParentBlockJoinQuery extends Query {
do {
nextChildDoc = childScorer.nextDoc();
} while (nextChildDoc < parentDoc);
// Parent & child docs are supposed to be
// orthogonal:
if (nextChildDoc == parentDoc) {
throw new IllegalStateException("child query must only match non-parent docs, but parent docID=" + nextChildDoc + " matched childScorer=" + childScorer.getClass());
}
continue;
}
@ -326,8 +340,11 @@ public class ToParentBlockJoinQuery extends Query {
nextChildDoc = childScorer.nextDoc();
} while (nextChildDoc < parentDoc);
// Parent & child docs are supposed to be orthogonal:
assert nextChildDoc != parentDoc;
// Parent & child docs are supposed to be
// orthogonal:
if (nextChildDoc == parentDoc) {
throw new IllegalStateException("child query must only match non-parent docs, but parent docID=" + nextChildDoc + " matched childScorer=" + childScorer.getClass());
}
switch(scoreMode) {
case Avg:
@ -343,7 +360,7 @@ public class ToParentBlockJoinQuery extends Query {
break;
}
//System.out.println(" return parentDoc=" + parentDoc);
//System.out.println(" return parentDoc=" + parentDoc + " childDocUpto=" + childDocUpto);
return parentDoc;
}
}
@ -393,7 +410,9 @@ public class ToParentBlockJoinQuery extends Query {
}
// Parent & child docs are supposed to be orthogonal:
assert nextChildDoc != prevParentDoc;
if (nextChildDoc == prevParentDoc) {
throw new IllegalStateException("child query must only match non-parent docs, but parent docID=" + nextChildDoc + " matched childScorer=" + childScorer.getClass());
}
final int nd = nextDoc();
//System.out.println(" return nextParentDoc=" + nd);

View File

@ -1206,4 +1206,191 @@ public class TestBlockJoin extends LuceneTestCase {
r.close();
dir.close();
}
// LUCENE-4968
public void testSometimesParentOnlyMatches() throws Exception {
Directory d = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), d);
Document parent = new Document();
parent.add(new StoredField("parentID", "0"));
parent.add(newTextField("parentText", "text", Field.Store.NO));
parent.add(newStringField("isParent", "yes", Field.Store.NO));
List<Document> docs = new ArrayList<Document>();
Document child = new Document();
docs.add(child);
child.add(new StoredField("childID", "0"));
child.add(newTextField("childText", "text", Field.Store.NO));
// parent last:
docs.add(parent);
w.addDocuments(docs);
docs.clear();
parent = new Document();
parent.add(newTextField("parentText", "text", Field.Store.NO));
parent.add(newStringField("isParent", "yes", Field.Store.NO));
parent.add(new StoredField("parentID", "1"));
// parent last:
docs.add(parent);
w.addDocuments(docs);
IndexReader r = w.getReader();
w.close();
Query childQuery = new TermQuery(new Term("childText", "text"));
Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("isParent", "yes"))));
ToParentBlockJoinQuery childJoinQuery = new ToParentBlockJoinQuery(childQuery, parentsFilter, ScoreMode.Avg);
BooleanQuery parentQuery = new BooleanQuery();
parentQuery.add(childJoinQuery, Occur.SHOULD);
parentQuery.add(new TermQuery(new Term("parentText", "text")), Occur.SHOULD);
ToParentBlockJoinCollector c = new ToParentBlockJoinCollector(new Sort(new SortField("parentID", SortField.Type.STRING)),
10, true, true);
newSearcher(r).search(parentQuery, c);
TopGroups<Integer> groups = c.getTopGroups(childJoinQuery, null, 0, 10, 0, false);
// Two parents:
assertEquals(2, groups.totalGroupCount.intValue());
// One child docs:
assertEquals(1, groups.totalGroupedHitCount);
GroupDocs<Integer> group = groups.groups[0];
StoredDocument doc = r.document(group.groupValue.intValue());
assertEquals("0", doc.get("parentID"));
System.out.println("group: " + group);
group = groups.groups[1];
doc = r.document(group.groupValue.intValue());
assertEquals("1", doc.get("parentID"));
r.close();
d.close();
}
// LUCENE-4968
public void testChildQueryNeverMatches() throws Exception {
Directory d = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), d);
Document parent = new Document();
parent.add(new StoredField("parentID", "0"));
parent.add(newTextField("parentText", "text", Field.Store.NO));
parent.add(newStringField("isParent", "yes", Field.Store.NO));
List<Document> docs = new ArrayList<Document>();
Document child = new Document();
docs.add(child);
child.add(new StoredField("childID", "0"));
child.add(newTextField("childText", "text", Field.Store.NO));
// parent last:
docs.add(parent);
w.addDocuments(docs);
docs.clear();
parent = new Document();
parent.add(newTextField("parentText", "text", Field.Store.NO));
parent.add(newStringField("isParent", "yes", Field.Store.NO));
parent.add(new StoredField("parentID", "1"));
// parent last:
docs.add(parent);
w.addDocuments(docs);
IndexReader r = w.getReader();
w.close();
// never matches:
Query childQuery = new TermQuery(new Term("childText", "bogus"));
Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("isParent", "yes"))));
ToParentBlockJoinQuery childJoinQuery = new ToParentBlockJoinQuery(childQuery, parentsFilter, ScoreMode.Avg);
BooleanQuery parentQuery = new BooleanQuery();
parentQuery.add(childJoinQuery, Occur.SHOULD);
parentQuery.add(new TermQuery(new Term("parentText", "text")), Occur.SHOULD);
ToParentBlockJoinCollector c = new ToParentBlockJoinCollector(new Sort(new SortField("parentID", SortField.Type.STRING)),
10, true, true);
newSearcher(r).search(parentQuery, c);
TopGroups<Integer> groups = c.getTopGroups(childJoinQuery, null, 0, 10, 0, false);
// Two parents:
assertEquals(2, groups.totalGroupCount.intValue());
// One child docs:
assertEquals(0, groups.totalGroupedHitCount);
GroupDocs<Integer> group = groups.groups[0];
StoredDocument doc = r.document(group.groupValue.intValue());
assertEquals("0", doc.get("parentID"));
System.out.println("group: " + group);
group = groups.groups[1];
doc = r.document(group.groupValue.intValue());
assertEquals("1", doc.get("parentID"));
r.close();
d.close();
}
// LUCENE-4968
public void testChildQueryMatchesParent() throws Exception {
Directory d = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), d);
Document parent = new Document();
parent.add(new StoredField("parentID", "0"));
parent.add(newTextField("parentText", "text", Field.Store.NO));
parent.add(newStringField("isParent", "yes", Field.Store.NO));
List<Document> docs = new ArrayList<Document>();
Document child = new Document();
docs.add(child);
child.add(new StoredField("childID", "0"));
child.add(newTextField("childText", "text", Field.Store.NO));
// parent last:
docs.add(parent);
w.addDocuments(docs);
docs.clear();
parent = new Document();
parent.add(newTextField("parentText", "text", Field.Store.NO));
parent.add(newStringField("isParent", "yes", Field.Store.NO));
parent.add(new StoredField("parentID", "1"));
// parent last:
docs.add(parent);
w.addDocuments(docs);
IndexReader r = w.getReader();
w.close();
// illegally matches parent:
Query childQuery = new TermQuery(new Term("parentText", "text"));
Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("isParent", "yes"))));
ToParentBlockJoinQuery childJoinQuery = new ToParentBlockJoinQuery(childQuery, parentsFilter, ScoreMode.Avg);
BooleanQuery parentQuery = new BooleanQuery();
parentQuery.add(childJoinQuery, Occur.SHOULD);
parentQuery.add(new TermQuery(new Term("parentText", "text")), Occur.SHOULD);
ToParentBlockJoinCollector c = new ToParentBlockJoinCollector(new Sort(new SortField("parentID", SortField.Type.STRING)),
10, true, true);
try {
newSearcher(r).search(parentQuery, c);
fail("should have hit exception");
} catch (IllegalStateException ise) {
// expected
}
r.close();
d.close();
}
}