LUCENE-4832: add ToParentBlockJoinCollector.getTopGroupsWithAllChildDocs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1457880 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2013-03-18 17:25:18 +00:00
parent 05509413d4
commit d75b728bef
3 changed files with 173 additions and 37 deletions

View File

@ -99,6 +99,9 @@ New Features
* LUCENE-4843: Add LimitTokenPositionFilter: don't emit tokens with * LUCENE-4843: Add LimitTokenPositionFilter: don't emit tokens with
positions that exceed the configured limit. (Steve Rowe) positions that exceed the configured limit. (Steve Rowe)
* LUCENE-4832: Add ToParentBlockJoinCollector.getTopGroupsWithAllChildDocs, to retrieve
all children in each group. (Aleksey Aleev via Mike McCandless)
API Changes API Changes
* LUCENE-4844: removed TaxonomyReader.getParent(), you should use * LUCENE-4844: removed TaxonomyReader.getParent(), you should use

View File

@ -363,16 +363,24 @@ public class ToParentBlockJoinCollector extends Collector {
} }
} }
/** Return the TopGroups for the specified /** Returns the TopGroups for the specified
* BlockJoinQuery. The groupValue of each GroupDocs will * BlockJoinQuery. The groupValue of each GroupDocs will
* be the parent docID for that group. Note that the * be the parent docID for that group.
* {@link GroupDocs#totalHits}, which would be the * The number of documents within each group is calculated as minimum of <code>maxDocsPerGroup</code>
* total number of child documents matching that parent, * and number of matched child documents for that group.
* is not computed (will always be 0). Returns null if * Returns null if no groups matched.
* no groups matched. */ *
@SuppressWarnings("unchecked") * @param query Search query
public TopGroups<Integer> getTopGroups(ToParentBlockJoinQuery query, Sort withinGroupSort, int offset, int maxDocsPerGroup, int withinGroupOffset, boolean fillSortFields) * @param withinGroupSort Sort criteria within groups
* @param offset Parent docs offset
* @param maxDocsPerGroup Upper bound on the number of child documents returned per group
* @param withinGroupOffset Offset within each group of child docs
* @param fillSortFields Specifies whether to add sort fields or not
* @return TopGroups for specified query
* @throws IOException if there is a low-level I/O error
*/
public TopGroups<Integer> getTopGroups(ToParentBlockJoinQuery query, Sort withinGroupSort, int offset,
int maxDocsPerGroup, int withinGroupOffset, boolean fillSortFields)
throws IOException { throws IOException {
final Integer _slot = joinQueryID.get(query); final Integer _slot = joinQueryID.get(query);
@ -384,9 +392,6 @@ public class ToParentBlockJoinCollector extends Collector {
} }
} }
// unbox once
final int slot = _slot;
if (sortedGroups == null) { if (sortedGroups == null) {
if (offset >= queue.size()) { if (offset >= queue.size()) {
return null; return null;
@ -396,15 +401,35 @@ public class ToParentBlockJoinCollector extends Collector {
return null; return null;
} }
int totalGroupedHitCount = 0; return accumulateGroups(_slot, offset, maxDocsPerGroup, withinGroupOffset, withinGroupSort, fillSortFields);
}
/**
* Accumulates groups for the BlockJoinQuery specified by its slot.
*
* @param slot Search query's slot
* @param offset Parent docs offset
* @param maxDocsPerGroup Upper bound on the number of child documents returned per group
* @param withinGroupOffset Offset within each group of child docs
* @param withinGroupSort Sort criteria within groups
* @param fillSortFields Specifies whether to add sort fields or not
* @return TopGroups for the query specified by slot
* @throws IOException if there is a low-level I/O error
*/
@SuppressWarnings({"unchecked","rawtypes"})
private TopGroups<Integer> accumulateGroups(int slot, int offset, int maxDocsPerGroup,
int withinGroupOffset, Sort withinGroupSort, boolean fillSortFields) throws IOException {
final GroupDocs<Integer>[] groups = new GroupDocs[sortedGroups.length - offset];
final FakeScorer fakeScorer = new FakeScorer(); final FakeScorer fakeScorer = new FakeScorer();
@SuppressWarnings({"unchecked","rawtypes"}) int totalGroupedHitCount = 0;
final GroupDocs<Integer>[] groups = new GroupDocs[sortedGroups.length - offset];
for(int groupIDX=offset;groupIDX<sortedGroups.length;groupIDX++) { for(int groupIDX=offset;groupIDX<sortedGroups.length;groupIDX++) {
final OneGroup og = sortedGroups[groupIDX]; final OneGroup og = sortedGroups[groupIDX];
final int numChildDocs = og.counts[slot];
// Number of documents in group should be bounded to prevent redundant memory allocation
final int numDocsInGroup = Math.min(numChildDocs, maxDocsPerGroup);
// At this point we hold all docs w/ in each group, // At this point we hold all docs w/ in each group,
// unsorted; we now sort them: // unsorted; we now sort them:
@ -414,15 +439,14 @@ public class ToParentBlockJoinCollector extends Collector {
if (!trackScores) { if (!trackScores) {
throw new IllegalArgumentException("cannot sort by relevance within group: trackScores=false"); throw new IllegalArgumentException("cannot sort by relevance within group: trackScores=false");
} }
collector = TopScoreDocCollector.create(maxDocsPerGroup, true); collector = TopScoreDocCollector.create(numDocsInGroup, true);
} else { } else {
// Sort by fields // Sort by fields
collector = TopFieldCollector.create(withinGroupSort, maxDocsPerGroup, fillSortFields, trackScores, trackMaxScore, true); collector = TopFieldCollector.create(withinGroupSort, numDocsInGroup, fillSortFields, trackScores, trackMaxScore, true);
} }
collector.setScorer(fakeScorer); collector.setScorer(fakeScorer);
collector.setNextReader(og.readerContext); collector.setNextReader(og.readerContext);
final int numChildDocs = og.counts[slot];
for(int docIDX=0;docIDX<numChildDocs;docIDX++) { for(int docIDX=0;docIDX<numChildDocs;docIDX++) {
final int doc = og.docs[slot][docIDX]; final int doc = og.docs[slot][docIDX];
fakeScorer.doc = doc; fakeScorer.doc = doc;
@ -444,11 +468,11 @@ public class ToParentBlockJoinCollector extends Collector {
groupSortValues = null; groupSortValues = null;
} }
final TopDocs topDocs = collector.topDocs(withinGroupOffset, maxDocsPerGroup); final TopDocs topDocs = collector.topDocs(withinGroupOffset, numDocsInGroup);
groups[groupIDX-offset] = new GroupDocs<Integer>(og.score, groups[groupIDX-offset] = new GroupDocs<Integer>(og.score,
topDocs.getMaxScore(), topDocs.getMaxScore(),
og.counts[slot], numChildDocs,
topDocs.scoreDocs, topDocs.scoreDocs,
og.doc, og.doc,
groupSortValues); groupSortValues);
@ -460,6 +484,27 @@ public class ToParentBlockJoinCollector extends Collector {
totalHitCount); totalHitCount);
} }
/**
 * Returns the TopGroups for the specified BlockJoinQuery without
 * limiting how many child documents are kept per group.
 * The groupValue of each GroupDocs will be the parent docID for that group.
 * The number of documents within each group equals the total number of
 * matched child documents for that group.
 * Returns null if no groups matched.
 *
 * <p>This is a convenience form of
 * {@link #getTopGroups(ToParentBlockJoinQuery, Sort, int, int, int, boolean)}
 * that passes {@link Integer#MAX_VALUE} as <code>maxDocsPerGroup</code>.
 *
 * @param query Search query
 * @param withinGroupSort Sort criteria within groups
 * @param offset Parent docs offset
 * @param withinGroupOffset Offset within each group of child docs
 * @param fillSortFields Specifies whether to add sort fields or not
 * @return TopGroups for specified query
 * @throws IOException if there is a low-level I/O error
 */
public TopGroups<Integer> getTopGroupsWithAllChildDocs(ToParentBlockJoinQuery query, Sort withinGroupSort, int offset,
    int withinGroupOffset, boolean fillSortFields)
    throws IOException {
  // No cap of our own: the per-group size is then decided solely by the
  // number of matched child documents.
  final int unboundedDocsPerGroup = Integer.MAX_VALUE;
  return getTopGroups(query, withinGroupSort, offset, unboundedDocsPerGroup, withinGroupOffset, fillSortFields);
}
/** /**
* Returns the highest score across all collected parent hits, as long as * Returns the highest score across all collected parent hits, as long as
* <code>trackMaxScores=true</code> was passed * <code>trackMaxScores=true</code> was passed

View File

@ -961,8 +961,6 @@ public class TestBlockJoin extends LuceneTestCase {
s.search(fullQuery, c); s.search(fullQuery, c);
// Examine "Job" children // Examine "Job" children
boolean showNullPointerIssue=true;
if (showNullPointerIssue) {
TopGroups<Integer> jobResults = c.getTopGroups(childJobJoinQuery, null, 0, 10, 0, true); TopGroups<Integer> jobResults = c.getTopGroups(childJobJoinQuery, null, 0, 10, 0, true);
//assertEquals(1, results.totalHitCount); //assertEquals(1, results.totalHitCount);
@ -978,7 +976,6 @@ public class TestBlockJoin extends LuceneTestCase {
assertNotNull(group.groupValue); assertNotNull(group.groupValue);
StoredDocument parentDoc = s.doc(group.groupValue); StoredDocument parentDoc = s.doc(group.groupValue);
assertEquals("Lisa", parentDoc.get("name")); assertEquals("Lisa", parentDoc.get("name"));
}
// Now Examine qualification children // Now Examine qualification children
TopGroups<Integer> qualificationResults = c.getTopGroups(childQualificationJoinQuery, null, 0, 10, 0, true); TopGroups<Integer> qualificationResults = c.getTopGroups(childQualificationJoinQuery, null, 0, 10, 0, true);
@ -992,7 +989,7 @@ public class TestBlockJoin extends LuceneTestCase {
StoredDocument childQualificationDoc = s.doc(qGroup.scoreDocs[0].doc); StoredDocument childQualificationDoc = s.doc(qGroup.scoreDocs[0].doc);
assertEquals("maths", childQualificationDoc.get("qualification")); assertEquals("maths", childQualificationDoc.get("qualification"));
assertNotNull(qGroup.groupValue); assertNotNull(qGroup.groupValue);
StoredDocument parentDoc = s.doc(qGroup.groupValue); parentDoc = s.doc(qGroup.groupValue);
assertEquals("Lisa", parentDoc.get("name")); assertEquals("Lisa", parentDoc.get("name"));
@ -1057,4 +1054,95 @@ public class TestBlockJoin extends LuceneTestCase {
r.close(); r.close();
dir.close(); dir.close();
} }
/**
 * Indexes one resume ("Frank") with four job children, two of which match
 * the child query (skill "java", year in [2006, 2011]), and checks that
 * getTopGroups and getTopGroupsWithAllChildDocs report the same group while
 * returning the expected number of child documents for each per-group limit.
 */
public void testGetTopGroups() throws Exception {
  final Directory dir = newDirectory();
  final RandomIndexWriter w = new RandomIndexWriter(random(), dir);

  // Jobs are shuffled among themselves; the resume is appended after them.
  final List<Document> block = new ArrayList<Document>();
  block.add(makeJob("ruby", 2005));
  block.add(makeJob("java", 2006));
  block.add(makeJob("java", 2010));
  block.add(makeJob("java", 2012));
  Collections.shuffle(block, random());
  block.add(makeResume("Frank", "United States"));

  addSkillless(w);
  w.addDocuments(block);
  addSkillless(w);

  final IndexReader r = w.getReader();
  w.close();
  final IndexSearcher s = newSearcher(r);

  // Filter that identifies the "parent" documents in the index - in this case resumes
  final TermQuery resumeTypeQuery = new TermQuery(new Term("docType", "resume"));
  final Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(resumeTypeQuery));

  // Child criteria: a java job whose year lies in [2006, 2011]
  final BooleanClause skillClause = new BooleanClause(new TermQuery(new Term("skill", "java")), Occur.MUST);
  final BooleanClause yearClause = new BooleanClause(NumericRangeQuery.newIntRange("year", 2006, 2011, true, true), Occur.MUST);
  final BooleanQuery childQuery = new BooleanQuery();
  childQuery.add(skillClause);
  childQuery.add(yearClause);

  // Join matching children up to their parent
  final ToParentBlockJoinQuery childJoinQuery = new ToParentBlockJoinQuery(childQuery, parentsFilter, ScoreMode.Avg);

  final ToParentBlockJoinCollector c = new ToParentBlockJoinCollector(Sort.RELEVANCE, 2, true, true);
  s.search(childJoinQuery, c);

  // A limit of 10 and the "all child docs" variant must both yield both matching children
  final TopGroups<Integer> limitedToTen = c.getTopGroups(childJoinQuery, null, 0, 10, 0, true);
  final TopGroups<Integer> unlimited = c.getTopGroupsWithAllChildDocs(childJoinQuery, null, 0, 0, true);
  assertFrankGroup(s, limitedToTen, 2);
  assertFrankGroup(s, unlimited, 2);

  // A limit of 1 returns only part of the child documents
  final TopGroups<Integer> limitedToOne = c.getTopGroups(childJoinQuery, null, 0, 1, 0, true);
  assertFrankGroup(s, limitedToOne, 1);

  r.close();
  dir.close();
}

/**
 * Asserts that results holds exactly one group for parent "Frank" with two
 * total hits, and that the group carries expectedChildDocs child documents,
 * each being a java job with a year in [2006, 2011].
 */
private void assertFrankGroup(IndexSearcher s, TopGroups<Integer> results, int expectedChildDocs) throws Exception {
  assertFalse(Float.isNaN(results.maxScore));
  assertEquals(2, results.totalGroupedHitCount);
  assertEquals(1, results.groups.length);

  final GroupDocs<Integer> onlyGroup = results.groups[0];
  assertEquals(2, onlyGroup.totalHits);
  assertFalse(Float.isNaN(onlyGroup.score));
  assertNotNull(onlyGroup.groupValue);

  final StoredDocument parent = s.doc(onlyGroup.groupValue);
  assertEquals("Frank", parent.get("name"));

  assertEquals(expectedChildDocs, onlyGroup.scoreDocs.length);
  for (ScoreDoc hit : onlyGroup.scoreDocs) {
    final StoredDocument child = s.doc(hit.doc);
    assertEquals("java", child.get("skill"));
    final int year = Integer.parseInt(child.get("year"));
    final boolean yearInRange = year >= 2006 && year <= 2011;
    assertTrue(yearInRange);
  }
}
} }