mirror of https://github.com/apache/lucene.git
LUCENE-4832: add ToParentBlockJoinCollector.getTopGroupsWithAllChildDocs
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1457880 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
05509413d4
commit
d75b728bef
|
@ -99,6 +99,9 @@ New Features
|
||||||
* LUCENE-4843: Add LimitTokenPositionFilter: don't emit tokens with
|
* LUCENE-4843: Add LimitTokenPositionFilter: don't emit tokens with
|
||||||
positions that exceed the configured limit. (Steve Rowe)
|
positions that exceed the configured limit. (Steve Rowe)
|
||||||
|
|
||||||
|
* LUCENE-4832: Add ToParentBlockJoinCollector.getTopGroupsWithAllChildDocs, to retrieve
|
||||||
|
all children in each group. (Aleksey Aleev via Mike McCandless)
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
|
||||||
* LUCENE-4844: removed TaxonomyReader.getParent(), you should use
|
* LUCENE-4844: removed TaxonomyReader.getParent(), you should use
|
||||||
|
|
|
@ -363,16 +363,24 @@ public class ToParentBlockJoinCollector extends Collector {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Return the TopGroups for the specified
|
/** Returns the TopGroups for the specified
|
||||||
* BlockJoinQuery. The groupValue of each GroupDocs will
|
* BlockJoinQuery. The groupValue of each GroupDocs will
|
||||||
* be the parent docID for that group. Note that the
|
* be the parent docID for that group.
|
||||||
* {@link GroupDocs#totalHits}, which would be the
|
* The number of documents within each group is calculated as the minimum of <code>maxDocsPerGroup</code>
|
||||||
* total number of child documents matching that parent,
|
* and the number of matched child documents for that group.
|
||||||
* is not computed (will always be 0). Returns null if
|
* Returns null if no groups matched.
|
||||||
* no groups matched. */
|
*
|
||||||
@SuppressWarnings("unchecked")
|
* @param query Search query
|
||||||
public TopGroups<Integer> getTopGroups(ToParentBlockJoinQuery query, Sort withinGroupSort, int offset, int maxDocsPerGroup, int withinGroupOffset, boolean fillSortFields)
|
* @param withinGroupSort Sort criteria within groups
|
||||||
|
* @param offset Parent docs offset
|
||||||
|
* @param maxDocsPerGroup Upper bound on the number of documents per group
|
||||||
|
* @param withinGroupOffset Offset within each group of child docs
|
||||||
|
* @param fillSortFields Specifies whether to add sort fields or not
|
||||||
|
* @return TopGroups for specified query
|
||||||
|
* @throws IOException if there is a low-level I/O error
|
||||||
|
*/
|
||||||
|
public TopGroups<Integer> getTopGroups(ToParentBlockJoinQuery query, Sort withinGroupSort, int offset,
|
||||||
|
int maxDocsPerGroup, int withinGroupOffset, boolean fillSortFields)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
||||||
final Integer _slot = joinQueryID.get(query);
|
final Integer _slot = joinQueryID.get(query);
|
||||||
|
@ -384,9 +392,6 @@ public class ToParentBlockJoinCollector extends Collector {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// unbox once
|
|
||||||
final int slot = _slot;
|
|
||||||
|
|
||||||
if (sortedGroups == null) {
|
if (sortedGroups == null) {
|
||||||
if (offset >= queue.size()) {
|
if (offset >= queue.size()) {
|
||||||
return null;
|
return null;
|
||||||
|
@ -396,15 +401,35 @@ public class ToParentBlockJoinCollector extends Collector {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
int totalGroupedHitCount = 0;
|
return accumulateGroups(_slot, offset, maxDocsPerGroup, withinGroupOffset, withinGroupSort, fillSortFields);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Accumulates groups for the BlockJoinQuery specified by its slot.
|
||||||
|
*
|
||||||
|
* @param slot Search query's slot
|
||||||
|
* @param offset Parent docs offset
|
||||||
|
* @param maxDocsPerGroup Upper bound on the number of documents per group
|
||||||
|
* @param withinGroupOffset Offset within each group of child docs
|
||||||
|
* @param withinGroupSort Sort criteria within groups
|
||||||
|
* @param fillSortFields Specifies whether to add sort fields or not
|
||||||
|
* @return TopGroups for the query specified by slot
|
||||||
|
* @throws IOException if there is a low-level I/O error
|
||||||
|
*/
|
||||||
|
@SuppressWarnings({"unchecked","rawtypes"})
|
||||||
|
private TopGroups<Integer> accumulateGroups(int slot, int offset, int maxDocsPerGroup,
|
||||||
|
int withinGroupOffset, Sort withinGroupSort, boolean fillSortFields) throws IOException {
|
||||||
|
final GroupDocs<Integer>[] groups = new GroupDocs[sortedGroups.length - offset];
|
||||||
final FakeScorer fakeScorer = new FakeScorer();
|
final FakeScorer fakeScorer = new FakeScorer();
|
||||||
|
|
||||||
@SuppressWarnings({"unchecked","rawtypes"})
|
int totalGroupedHitCount = 0;
|
||||||
final GroupDocs<Integer>[] groups = new GroupDocs[sortedGroups.length - offset];
|
|
||||||
|
|
||||||
for(int groupIDX=offset;groupIDX<sortedGroups.length;groupIDX++) {
|
for(int groupIDX=offset;groupIDX<sortedGroups.length;groupIDX++) {
|
||||||
final OneGroup og = sortedGroups[groupIDX];
|
final OneGroup og = sortedGroups[groupIDX];
|
||||||
|
final int numChildDocs = og.counts[slot];
|
||||||
|
|
||||||
|
// Number of documents in group should be bounded to prevent redundant memory allocation
|
||||||
|
final int numDocsInGroup = Math.min(numChildDocs, maxDocsPerGroup);
|
||||||
|
|
||||||
// At this point we hold all docs w/ in each group,
|
// At this point we hold all docs w/ in each group,
|
||||||
// unsorted; we now sort them:
|
// unsorted; we now sort them:
|
||||||
|
@ -414,15 +439,14 @@ public class ToParentBlockJoinCollector extends Collector {
|
||||||
if (!trackScores) {
|
if (!trackScores) {
|
||||||
throw new IllegalArgumentException("cannot sort by relevance within group: trackScores=false");
|
throw new IllegalArgumentException("cannot sort by relevance within group: trackScores=false");
|
||||||
}
|
}
|
||||||
collector = TopScoreDocCollector.create(maxDocsPerGroup, true);
|
collector = TopScoreDocCollector.create(numDocsInGroup, true);
|
||||||
} else {
|
} else {
|
||||||
// Sort by fields
|
// Sort by fields
|
||||||
collector = TopFieldCollector.create(withinGroupSort, maxDocsPerGroup, fillSortFields, trackScores, trackMaxScore, true);
|
collector = TopFieldCollector.create(withinGroupSort, numDocsInGroup, fillSortFields, trackScores, trackMaxScore, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
collector.setScorer(fakeScorer);
|
collector.setScorer(fakeScorer);
|
||||||
collector.setNextReader(og.readerContext);
|
collector.setNextReader(og.readerContext);
|
||||||
final int numChildDocs = og.counts[slot];
|
|
||||||
for(int docIDX=0;docIDX<numChildDocs;docIDX++) {
|
for(int docIDX=0;docIDX<numChildDocs;docIDX++) {
|
||||||
final int doc = og.docs[slot][docIDX];
|
final int doc = og.docs[slot][docIDX];
|
||||||
fakeScorer.doc = doc;
|
fakeScorer.doc = doc;
|
||||||
|
@ -444,11 +468,11 @@ public class ToParentBlockJoinCollector extends Collector {
|
||||||
groupSortValues = null;
|
groupSortValues = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
final TopDocs topDocs = collector.topDocs(withinGroupOffset, maxDocsPerGroup);
|
final TopDocs topDocs = collector.topDocs(withinGroupOffset, numDocsInGroup);
|
||||||
|
|
||||||
groups[groupIDX-offset] = new GroupDocs<Integer>(og.score,
|
groups[groupIDX-offset] = new GroupDocs<Integer>(og.score,
|
||||||
topDocs.getMaxScore(),
|
topDocs.getMaxScore(),
|
||||||
og.counts[slot],
|
numChildDocs,
|
||||||
topDocs.scoreDocs,
|
topDocs.scoreDocs,
|
||||||
og.doc,
|
og.doc,
|
||||||
groupSortValues);
|
groupSortValues);
|
||||||
|
@ -460,6 +484,27 @@ public class ToParentBlockJoinCollector extends Collector {
|
||||||
totalHitCount);
|
totalHitCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Returns the TopGroups for the specified BlockJoinQuery.
|
||||||
|
* The groupValue of each GroupDocs will be the parent docID for that group.
|
||||||
|
* The number of documents within each group
|
||||||
|
* equals the total number of matched child documents for that group.
|
||||||
|
* Returns null if no groups matched.
|
||||||
|
*
|
||||||
|
* @param query Search query
|
||||||
|
* @param withinGroupSort Sort criteria within groups
|
||||||
|
* @param offset Parent docs offset
|
||||||
|
* @param withinGroupOffset Offset within each group of child docs
|
||||||
|
* @param fillSortFields Specifies whether to add sort fields or not
|
||||||
|
* @return TopGroups for specified query
|
||||||
|
* @throws IOException if there is a low-level I/O error
|
||||||
|
*/
|
||||||
|
public TopGroups<Integer> getTopGroupsWithAllChildDocs(ToParentBlockJoinQuery query, Sort withinGroupSort, int offset,
|
||||||
|
int withinGroupOffset, boolean fillSortFields)
|
||||||
|
throws IOException {
|
||||||
|
|
||||||
|
return getTopGroups(query, withinGroupSort, offset, Integer.MAX_VALUE, withinGroupOffset, fillSortFields);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the highest score across all collected parent hits, as long as
|
* Returns the highest score across all collected parent hits, as long as
|
||||||
* <code>trackMaxScores=true</code> was passed
|
* <code>trackMaxScores=true</code> was passed
|
||||||
|
|
|
@ -961,8 +961,6 @@ public class TestBlockJoin extends LuceneTestCase {
|
||||||
s.search(fullQuery, c);
|
s.search(fullQuery, c);
|
||||||
|
|
||||||
// Examine "Job" children
|
// Examine "Job" children
|
||||||
boolean showNullPointerIssue=true;
|
|
||||||
if (showNullPointerIssue) {
|
|
||||||
TopGroups<Integer> jobResults = c.getTopGroups(childJobJoinQuery, null, 0, 10, 0, true);
|
TopGroups<Integer> jobResults = c.getTopGroups(childJobJoinQuery, null, 0, 10, 0, true);
|
||||||
|
|
||||||
//assertEquals(1, results.totalHitCount);
|
//assertEquals(1, results.totalHitCount);
|
||||||
|
@ -978,7 +976,6 @@ public class TestBlockJoin extends LuceneTestCase {
|
||||||
assertNotNull(group.groupValue);
|
assertNotNull(group.groupValue);
|
||||||
StoredDocument parentDoc = s.doc(group.groupValue);
|
StoredDocument parentDoc = s.doc(group.groupValue);
|
||||||
assertEquals("Lisa", parentDoc.get("name"));
|
assertEquals("Lisa", parentDoc.get("name"));
|
||||||
}
|
|
||||||
|
|
||||||
// Now Examine qualification children
|
// Now Examine qualification children
|
||||||
TopGroups<Integer> qualificationResults = c.getTopGroups(childQualificationJoinQuery, null, 0, 10, 0, true);
|
TopGroups<Integer> qualificationResults = c.getTopGroups(childQualificationJoinQuery, null, 0, 10, 0, true);
|
||||||
|
@ -992,7 +989,7 @@ public class TestBlockJoin extends LuceneTestCase {
|
||||||
StoredDocument childQualificationDoc = s.doc(qGroup.scoreDocs[0].doc);
|
StoredDocument childQualificationDoc = s.doc(qGroup.scoreDocs[0].doc);
|
||||||
assertEquals("maths", childQualificationDoc.get("qualification"));
|
assertEquals("maths", childQualificationDoc.get("qualification"));
|
||||||
assertNotNull(qGroup.groupValue);
|
assertNotNull(qGroup.groupValue);
|
||||||
StoredDocument parentDoc = s.doc(qGroup.groupValue);
|
parentDoc = s.doc(qGroup.groupValue);
|
||||||
assertEquals("Lisa", parentDoc.get("name"));
|
assertEquals("Lisa", parentDoc.get("name"));
|
||||||
|
|
||||||
|
|
||||||
|
@ -1057,4 +1054,95 @@ public class TestBlockJoin extends LuceneTestCase {
|
||||||
r.close();
|
r.close();
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testGetTopGroups() throws Exception {
|
||||||
|
|
||||||
|
final Directory dir = newDirectory();
|
||||||
|
final RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||||
|
|
||||||
|
final List<Document> docs = new ArrayList<Document>();
|
||||||
|
docs.add(makeJob("ruby", 2005));
|
||||||
|
docs.add(makeJob("java", 2006));
|
||||||
|
docs.add(makeJob("java", 2010));
|
||||||
|
docs.add(makeJob("java", 2012));
|
||||||
|
Collections.shuffle(docs, random());
|
||||||
|
docs.add(makeResume("Frank", "United States"));
|
||||||
|
|
||||||
|
addSkillless(w);
|
||||||
|
w.addDocuments(docs);
|
||||||
|
addSkillless(w);
|
||||||
|
|
||||||
|
IndexReader r = w.getReader();
|
||||||
|
w.close();
|
||||||
|
IndexSearcher s = newSearcher(r);
|
||||||
|
|
||||||
|
// Create a filter that defines "parent" documents in the index - in this case resumes
|
||||||
|
Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("docType", "resume"))));
|
||||||
|
|
||||||
|
// Define child document criteria (finds an example of relevant work experience)
|
||||||
|
BooleanQuery childQuery = new BooleanQuery();
|
||||||
|
childQuery.add(new BooleanClause(new TermQuery(new Term("skill", "java")), Occur.MUST));
|
||||||
|
childQuery.add(new BooleanClause(NumericRangeQuery.newIntRange("year", 2006, 2011, true, true), Occur.MUST));
|
||||||
|
|
||||||
|
// Wrap the child document query to 'join' any matches
|
||||||
|
// up to corresponding parent:
|
||||||
|
ToParentBlockJoinQuery childJoinQuery = new ToParentBlockJoinQuery(childQuery, parentsFilter, ScoreMode.Avg);
|
||||||
|
|
||||||
|
ToParentBlockJoinCollector c = new ToParentBlockJoinCollector(Sort.RELEVANCE, 2, true, true);
|
||||||
|
|
||||||
|
s.search(childJoinQuery, c);
|
||||||
|
|
||||||
|
//Get all child documents within groups
|
||||||
|
@SuppressWarnings({"unchecked","rawtypes"})
|
||||||
|
TopGroups<Integer>[] getTopGroupsResults = new TopGroups[2];
|
||||||
|
getTopGroupsResults[0] = c.getTopGroups(childJoinQuery, null, 0, 10, 0, true);
|
||||||
|
getTopGroupsResults[1] = c.getTopGroupsWithAllChildDocs(childJoinQuery, null, 0, 0, true);
|
||||||
|
|
||||||
|
for (TopGroups<Integer> results : getTopGroupsResults) {
|
||||||
|
assertFalse(Float.isNaN(results.maxScore));
|
||||||
|
assertEquals(2, results.totalGroupedHitCount);
|
||||||
|
assertEquals(1, results.groups.length);
|
||||||
|
|
||||||
|
final GroupDocs<Integer> group = results.groups[0];
|
||||||
|
assertEquals(2, group.totalHits);
|
||||||
|
assertFalse(Float.isNaN(group.score));
|
||||||
|
assertNotNull(group.groupValue);
|
||||||
|
StoredDocument parentDoc = s.doc(group.groupValue);
|
||||||
|
assertEquals("Frank", parentDoc.get("name"));
|
||||||
|
|
||||||
|
assertEquals(2, group.scoreDocs.length); //all matched child documents collected
|
||||||
|
|
||||||
|
for (ScoreDoc scoreDoc : group.scoreDocs) {
|
||||||
|
StoredDocument childDoc = s.doc(scoreDoc.doc);
|
||||||
|
assertEquals("java", childDoc.get("skill"));
|
||||||
|
int year = Integer.parseInt(childDoc.get("year"));
|
||||||
|
assertTrue(year >= 2006 && year <= 2011);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//Get part of child documents
|
||||||
|
TopGroups<Integer> boundedResults = c.getTopGroups(childJoinQuery, null, 0, 1, 0, true);
|
||||||
|
assertFalse(Float.isNaN(boundedResults.maxScore));
|
||||||
|
assertEquals(2, boundedResults.totalGroupedHitCount);
|
||||||
|
assertEquals(1, boundedResults.groups.length);
|
||||||
|
|
||||||
|
final GroupDocs<Integer> group = boundedResults.groups[0];
|
||||||
|
assertEquals(2, group.totalHits);
|
||||||
|
assertFalse(Float.isNaN(group.score));
|
||||||
|
assertNotNull(group.groupValue);
|
||||||
|
StoredDocument parentDoc = s.doc(group.groupValue);
|
||||||
|
assertEquals("Frank", parentDoc.get("name"));
|
||||||
|
|
||||||
|
assertEquals(1, group.scoreDocs.length); //not all matched child documents collected
|
||||||
|
|
||||||
|
for (ScoreDoc scoreDoc : group.scoreDocs) {
|
||||||
|
StoredDocument childDoc = s.doc(scoreDoc.doc);
|
||||||
|
assertEquals("java", childDoc.get("skill"));
|
||||||
|
int year = Integer.parseInt(childDoc.get("year"));
|
||||||
|
assertTrue(year >= 2006 && year <= 2011);
|
||||||
|
}
|
||||||
|
|
||||||
|
r.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue