mirror of https://github.com/apache/lucene.git
LUCENE4832: add ToParentBlockJoinCollector.getTopGroupsWithAllChildDocs
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1457880 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
05509413d4
commit
d75b728bef
|
@ -99,6 +99,9 @@ New Features
|
|||
* LUCENE-4843: Add LimitTokenPositionFilter: don't emit tokens with
|
||||
positions that exceed the configured limit. (Steve Rowe)
|
||||
|
||||
* LUCENE-4832: Add ToParentBlockJoinCollector.getTopGroupsWithAllChildDocs, to retrieve
|
||||
all children in each group. (Aleksey Aleev via Mike McCandless)
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-4844: removed TaxonomyReader.getParent(), you should use
|
||||
|
|
|
@ -363,16 +363,24 @@ public class ToParentBlockJoinCollector extends Collector {
|
|||
}
|
||||
}
|
||||
|
||||
/** Return the TopGroups for the specified
|
||||
* BlockJoinQuery. The groupValue of each GroupDocs will
|
||||
* be the parent docID for that group. Note that the
|
||||
* {@link GroupDocs#totalHits}, which would be the
|
||||
* total number of child documents matching that parent,
|
||||
* is not computed (will always be 0). Returns null if
|
||||
* no groups matched. */
|
||||
@SuppressWarnings("unchecked")
|
||||
public TopGroups<Integer> getTopGroups(ToParentBlockJoinQuery query, Sort withinGroupSort, int offset, int maxDocsPerGroup, int withinGroupOffset, boolean fillSortFields)
|
||||
|
||||
/** Returns the TopGroups for the specified
|
||||
* BlockJoinQuery. The groupValue of each GroupDocs will
|
||||
* be the parent docID for that group.
|
||||
* The number of documents within each group is calculated as minimum of <code>maxDocsPerGroup</code>
|
||||
* and number of matched child documents for that group.
|
||||
* Returns null if no groups matched.
|
||||
*
|
||||
* @param query Search query
|
||||
* @param withinGroupSort Sort criteria within groups
|
||||
* @param offset Parent docs offset
|
||||
* @param maxDocsPerGroup Upper bound of documents per group number
|
||||
* @param withinGroupOffset Offset within each group of child docs
|
||||
* @param fillSortFields Specifies whether to add sort fields or not
|
||||
* @return TopGroups for specified query
|
||||
* @throws IOException if there is a low-level I/O error
|
||||
*/
|
||||
public TopGroups<Integer> getTopGroups(ToParentBlockJoinQuery query, Sort withinGroupSort, int offset,
|
||||
int maxDocsPerGroup, int withinGroupOffset, boolean fillSortFields)
|
||||
throws IOException {
|
||||
|
||||
final Integer _slot = joinQueryID.get(query);
|
||||
|
@ -384,9 +392,6 @@ public class ToParentBlockJoinCollector extends Collector {
|
|||
}
|
||||
}
|
||||
|
||||
// unbox once
|
||||
final int slot = _slot;
|
||||
|
||||
if (sortedGroups == null) {
|
||||
if (offset >= queue.size()) {
|
||||
return null;
|
||||
|
@ -396,15 +401,35 @@ public class ToParentBlockJoinCollector extends Collector {
|
|||
return null;
|
||||
}
|
||||
|
||||
int totalGroupedHitCount = 0;
|
||||
return accumulateGroups(_slot, offset, maxDocsPerGroup, withinGroupOffset, withinGroupSort, fillSortFields);
|
||||
}
|
||||
|
||||
/**
|
||||
* Accumulates groups for the BlockJoinQuery specified by its slot.
|
||||
*
|
||||
* @param slot Search query's slot
|
||||
* @param offset Parent docs offset
|
||||
* @param maxDocsPerGroup Upper bound of documents per group number
|
||||
* @param withinGroupOffset Offset within each group of child docs
|
||||
* @param withinGroupSort Sort criteria within groups
|
||||
* @param fillSortFields Specifies whether to add sort fields or not
|
||||
* @return TopGroups for the query specified by slot
|
||||
* @throws IOException if there is a low-level I/O error
|
||||
*/
|
||||
@SuppressWarnings({"unchecked","rawtypes"})
|
||||
private TopGroups<Integer> accumulateGroups(int slot, int offset, int maxDocsPerGroup,
|
||||
int withinGroupOffset, Sort withinGroupSort, boolean fillSortFields) throws IOException {
|
||||
final GroupDocs<Integer>[] groups = new GroupDocs[sortedGroups.length - offset];
|
||||
final FakeScorer fakeScorer = new FakeScorer();
|
||||
|
||||
@SuppressWarnings({"unchecked","rawtypes"})
|
||||
final GroupDocs<Integer>[] groups = new GroupDocs[sortedGroups.length - offset];
|
||||
int totalGroupedHitCount = 0;
|
||||
|
||||
for(int groupIDX=offset;groupIDX<sortedGroups.length;groupIDX++) {
|
||||
final OneGroup og = sortedGroups[groupIDX];
|
||||
final int numChildDocs = og.counts[slot];
|
||||
|
||||
// Number of documents in group should be bounded to prevent redundant memory allocation
|
||||
final int numDocsInGroup = Math.min(numChildDocs, maxDocsPerGroup);
|
||||
|
||||
// At this point we hold all docs w/ in each group,
|
||||
// unsorted; we now sort them:
|
||||
|
@ -414,15 +439,14 @@ public class ToParentBlockJoinCollector extends Collector {
|
|||
if (!trackScores) {
|
||||
throw new IllegalArgumentException("cannot sort by relevance within group: trackScores=false");
|
||||
}
|
||||
collector = TopScoreDocCollector.create(maxDocsPerGroup, true);
|
||||
collector = TopScoreDocCollector.create(numDocsInGroup, true);
|
||||
} else {
|
||||
// Sort by fields
|
||||
collector = TopFieldCollector.create(withinGroupSort, maxDocsPerGroup, fillSortFields, trackScores, trackMaxScore, true);
|
||||
collector = TopFieldCollector.create(withinGroupSort, numDocsInGroup, fillSortFields, trackScores, trackMaxScore, true);
|
||||
}
|
||||
|
||||
collector.setScorer(fakeScorer);
|
||||
collector.setNextReader(og.readerContext);
|
||||
final int numChildDocs = og.counts[slot];
|
||||
for(int docIDX=0;docIDX<numChildDocs;docIDX++) {
|
||||
final int doc = og.docs[slot][docIDX];
|
||||
fakeScorer.doc = doc;
|
||||
|
@ -444,11 +468,11 @@ public class ToParentBlockJoinCollector extends Collector {
|
|||
groupSortValues = null;
|
||||
}
|
||||
|
||||
final TopDocs topDocs = collector.topDocs(withinGroupOffset, maxDocsPerGroup);
|
||||
final TopDocs topDocs = collector.topDocs(withinGroupOffset, numDocsInGroup);
|
||||
|
||||
groups[groupIDX-offset] = new GroupDocs<Integer>(og.score,
|
||||
topDocs.getMaxScore(),
|
||||
og.counts[slot],
|
||||
numChildDocs,
|
||||
topDocs.scoreDocs,
|
||||
og.doc,
|
||||
groupSortValues);
|
||||
|
@ -459,6 +483,27 @@ public class ToParentBlockJoinCollector extends Collector {
|
|||
0, totalGroupedHitCount, groups, maxScore),
|
||||
totalHitCount);
|
||||
}
|
||||
|
||||
/** Returns the TopGroups for the specified BlockJoinQuery.
|
||||
* The groupValue of each GroupDocs will be the parent docID for that group.
|
||||
* The number of documents within each group
|
||||
* equals to the total number of matched child documents for that group.
|
||||
* Returns null if no groups matched.
|
||||
*
|
||||
* @param query Search query
|
||||
* @param withinGroupSort Sort criteria within groups
|
||||
* @param offset Parent docs offset
|
||||
* @param withinGroupOffset Offset within each group of child docs
|
||||
* @param fillSortFields Specifies whether to add sort fields or not
|
||||
* @return TopGroups for specified query
|
||||
* @throws IOException if there is a low-level I/O error
|
||||
*/
|
||||
public TopGroups<Integer> getTopGroupsWithAllChildDocs(ToParentBlockJoinQuery query, Sort withinGroupSort, int offset,
|
||||
int withinGroupOffset, boolean fillSortFields)
|
||||
throws IOException {
|
||||
|
||||
return getTopGroups(query, withinGroupSort, offset, Integer.MAX_VALUE, withinGroupOffset, fillSortFields);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the highest score across all collected parent hits, as long as
|
||||
|
|
|
@ -961,24 +961,21 @@ public class TestBlockJoin extends LuceneTestCase {
|
|||
s.search(fullQuery, c);
|
||||
|
||||
// Examine "Job" children
|
||||
boolean showNullPointerIssue=true;
|
||||
if (showNullPointerIssue) {
|
||||
TopGroups<Integer> jobResults = c.getTopGroups(childJobJoinQuery, null, 0, 10, 0, true);
|
||||
TopGroups<Integer> jobResults = c.getTopGroups(childJobJoinQuery, null, 0, 10, 0, true);
|
||||
|
||||
//assertEquals(1, results.totalHitCount);
|
||||
assertEquals(1, jobResults.totalGroupedHitCount);
|
||||
assertEquals(1, jobResults.groups.length);
|
||||
//assertEquals(1, results.totalHitCount);
|
||||
assertEquals(1, jobResults.totalGroupedHitCount);
|
||||
assertEquals(1, jobResults.groups.length);
|
||||
|
||||
final GroupDocs<Integer> group = jobResults.groups[0];
|
||||
assertEquals(1, group.totalHits);
|
||||
final GroupDocs<Integer> group = jobResults.groups[0];
|
||||
assertEquals(1, group.totalHits);
|
||||
|
||||
StoredDocument childJobDoc = s.doc(group.scoreDocs[0].doc);
|
||||
//System.out.println(" doc=" + group.scoreDocs[0].doc);
|
||||
assertEquals("java", childJobDoc.get("skill"));
|
||||
assertNotNull(group.groupValue);
|
||||
StoredDocument parentDoc = s.doc(group.groupValue);
|
||||
assertEquals("Lisa", parentDoc.get("name"));
|
||||
}
|
||||
StoredDocument childJobDoc = s.doc(group.scoreDocs[0].doc);
|
||||
//System.out.println(" doc=" + group.scoreDocs[0].doc);
|
||||
assertEquals("java", childJobDoc.get("skill"));
|
||||
assertNotNull(group.groupValue);
|
||||
StoredDocument parentDoc = s.doc(group.groupValue);
|
||||
assertEquals("Lisa", parentDoc.get("name"));
|
||||
|
||||
// Now Examine qualification children
|
||||
TopGroups<Integer> qualificationResults = c.getTopGroups(childQualificationJoinQuery, null, 0, 10, 0, true);
|
||||
|
@ -992,7 +989,7 @@ public class TestBlockJoin extends LuceneTestCase {
|
|||
StoredDocument childQualificationDoc = s.doc(qGroup.scoreDocs[0].doc);
|
||||
assertEquals("maths", childQualificationDoc.get("qualification"));
|
||||
assertNotNull(qGroup.groupValue);
|
||||
StoredDocument parentDoc = s.doc(qGroup.groupValue);
|
||||
parentDoc = s.doc(qGroup.groupValue);
|
||||
assertEquals("Lisa", parentDoc.get("name"));
|
||||
|
||||
|
||||
|
@ -1057,4 +1054,95 @@ public class TestBlockJoin extends LuceneTestCase {
|
|||
r.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testGetTopGroups() throws Exception {
|
||||
|
||||
final Directory dir = newDirectory();
|
||||
final RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||
|
||||
final List<Document> docs = new ArrayList<Document>();
|
||||
docs.add(makeJob("ruby", 2005));
|
||||
docs.add(makeJob("java", 2006));
|
||||
docs.add(makeJob("java", 2010));
|
||||
docs.add(makeJob("java", 2012));
|
||||
Collections.shuffle(docs, random());
|
||||
docs.add(makeResume("Frank", "United States"));
|
||||
|
||||
addSkillless(w);
|
||||
w.addDocuments(docs);
|
||||
addSkillless(w);
|
||||
|
||||
IndexReader r = w.getReader();
|
||||
w.close();
|
||||
IndexSearcher s = newSearcher(r);
|
||||
|
||||
// Create a filter that defines "parent" documents in the index - in this case resumes
|
||||
Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("docType", "resume"))));
|
||||
|
||||
// Define child document criteria (finds an example of relevant work experience)
|
||||
BooleanQuery childQuery = new BooleanQuery();
|
||||
childQuery.add(new BooleanClause(new TermQuery(new Term("skill", "java")), Occur.MUST));
|
||||
childQuery.add(new BooleanClause(NumericRangeQuery.newIntRange("year", 2006, 2011, true, true), Occur.MUST));
|
||||
|
||||
// Wrap the child document query to 'join' any matches
|
||||
// up to corresponding parent:
|
||||
ToParentBlockJoinQuery childJoinQuery = new ToParentBlockJoinQuery(childQuery, parentsFilter, ScoreMode.Avg);
|
||||
|
||||
ToParentBlockJoinCollector c = new ToParentBlockJoinCollector(Sort.RELEVANCE, 2, true, true);
|
||||
|
||||
s.search(childJoinQuery, c);
|
||||
|
||||
//Get all child documents within groups
|
||||
@SuppressWarnings({"unchecked","rawtypes"})
|
||||
TopGroups<Integer>[] getTopGroupsResults = new TopGroups[2];
|
||||
getTopGroupsResults[0] = c.getTopGroups(childJoinQuery, null, 0, 10, 0, true);
|
||||
getTopGroupsResults[1] = c.getTopGroupsWithAllChildDocs(childJoinQuery, null, 0, 0, true);
|
||||
|
||||
for (TopGroups<Integer> results : getTopGroupsResults) {
|
||||
assertFalse(Float.isNaN(results.maxScore));
|
||||
assertEquals(2, results.totalGroupedHitCount);
|
||||
assertEquals(1, results.groups.length);
|
||||
|
||||
final GroupDocs<Integer> group = results.groups[0];
|
||||
assertEquals(2, group.totalHits);
|
||||
assertFalse(Float.isNaN(group.score));
|
||||
assertNotNull(group.groupValue);
|
||||
StoredDocument parentDoc = s.doc(group.groupValue);
|
||||
assertEquals("Frank", parentDoc.get("name"));
|
||||
|
||||
assertEquals(2, group.scoreDocs.length); //all matched child documents collected
|
||||
|
||||
for (ScoreDoc scoreDoc : group.scoreDocs) {
|
||||
StoredDocument childDoc = s.doc(scoreDoc.doc);
|
||||
assertEquals("java", childDoc.get("skill"));
|
||||
int year = Integer.parseInt(childDoc.get("year"));
|
||||
assertTrue(year >= 2006 && year <= 2011);
|
||||
}
|
||||
}
|
||||
|
||||
//Get part of child documents
|
||||
TopGroups<Integer> boundedResults = c.getTopGroups(childJoinQuery, null, 0, 1, 0, true);
|
||||
assertFalse(Float.isNaN(boundedResults.maxScore));
|
||||
assertEquals(2, boundedResults.totalGroupedHitCount);
|
||||
assertEquals(1, boundedResults.groups.length);
|
||||
|
||||
final GroupDocs<Integer> group = boundedResults.groups[0];
|
||||
assertEquals(2, group.totalHits);
|
||||
assertFalse(Float.isNaN(group.score));
|
||||
assertNotNull(group.groupValue);
|
||||
StoredDocument parentDoc = s.doc(group.groupValue);
|
||||
assertEquals("Frank", parentDoc.get("name"));
|
||||
|
||||
assertEquals(1, group.scoreDocs.length); //not all matched child documents collected
|
||||
|
||||
for (ScoreDoc scoreDoc : group.scoreDocs) {
|
||||
StoredDocument childDoc = s.doc(scoreDoc.doc);
|
||||
assertEquals("java", childDoc.get("skill"));
|
||||
int year = Integer.parseInt(childDoc.get("year"));
|
||||
assertTrue(year >= 2006 && year <= 2011);
|
||||
}
|
||||
|
||||
r.close();
|
||||
dir.close();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue