mirror of https://github.com/apache/lucene.git

LUCENE-3129: add single pass grouping collector, BlockGroupingCollector

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1130648 13f79535-47bb-0310-9956-ffa450edef68

parent 5c68532ecc
commit 375c1abdbf
@@ -130,6 +130,12 @@ New Features
  case where the indexing rate is lowish but the reopen rate is
  highish, to take load off the IO system. (Mike McCandless)

* LUCENE-3129: Added BlockGroupingCollector, a single pass
  grouping collector which is faster than the two-pass approach, and
  also computes the total group count, but requires that every
  document sharing the same group was indexed as a doc block
  (IndexWriter.add/updateDocuments). (Mike McCandless)

Optimizations

* LUCENE-3040: Switch all analysis consumers (highlighter, morelikethis, memory, ...)
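The entry above hinges on one requirement: all documents of a group must go through a single atomic addDocuments/updateDocuments call so they stay contiguous in the index. A minimal sketch of such doc-block indexing against the 3.x API (the field names and the small wrapper class are illustrative assumptions, not part of this commit):

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.IndexWriter;

    class DocBlockIndexer {
      // Index all documents of one group as a contiguous doc block.
      static void indexGroup(IndexWriter writer, String author, String[] titles) throws IOException {
        List<Document> block = new ArrayList<Document>();
        for (String title : titles) {
          Document doc = new Document();
          doc.add(new Field("author", author, Field.Store.YES, Field.Index.NOT_ANALYZED));
          doc.add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
          block.add(doc);
        }
        // Atomic multi-document add; BlockGroupingCollector relies on
        // the resulting docIDs being adjacent:
        writer.addDocuments(block);
      }
    }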
@@ -0,0 +1,516 @@
package org.apache.lucene.search.grouping;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexWriter;   // javadocs
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldComparator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopDocsCollector;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.PriorityQueue;
/** BlockGroupingCollector performs grouping with a
 *  single pass collector, as long as you are grouping by a
 *  doc block field, ie all documents sharing a given group
 *  value were indexed as a doc block using the atomic
 *  {@link IndexWriter#addDocuments} or {@link
 *  IndexWriter#updateDocuments} API.
 *
 *  <p>This results in faster performance (~25% faster QPS)
 *  than the two-pass grouping collectors, with the tradeoff
 *  being that the documents in each group must always be
 *  indexed as a block.  This collector also fills in
 *  TopGroups.totalGroupCount without requiring the separate
 *  {@link AllGroupsCollector}.  However, this collector does
 *  not fill in the groupValue of each group; this field
 *  will always be null.
 *
 *  <p><b>NOTE</b>: this collector makes no effort to verify
 *  the docs were in fact indexed as a block, so it's up to
 *  you to ensure this was the case.
 *
 *  <p>See {@link org.apache.lucene.search.grouping} for more
 *  details including a full code example.</p>
 *
 *  @lucene.experimental
 */
public class BlockGroupingCollector extends Collector {
  private int[] pendingSubDocs;
  private float[] pendingSubScores;
  private int subDocUpto;

  private final Sort groupSort;
  private final int topNGroups;
  private final Filter lastDocPerGroup;

  // TODO: specialize into 2 classes, static "create" method:
  private final boolean needsScores;

  private final FieldComparator[] comparators;
  private final int[] reversed;
  private final int compIDXEnd;
  private int bottomSlot;
  private boolean queueFull;
  private AtomicReaderContext currentReaderContext;

  private int topGroupDoc;
  private int totalHitCount;
  private int totalGroupCount;
  private int docBase;
  private int groupEndDocID;
  private DocIdSetIterator lastDocPerGroupBits;
  private Scorer scorer;
  private final GroupQueue groupQueue;
  private boolean groupCompetes;

  private final static class FakeScorer extends Scorer {

    float score;
    int doc;

    public FakeScorer() {
      super((Weight) null);
    }

    @Override
    public float score() {
      return score;
    }

    @Override
    public int docID() {
      return doc;
    }

    @Override
    public int advance(int target) {
      throw new UnsupportedOperationException();
    }

    @Override
    public int nextDoc() {
      throw new UnsupportedOperationException();
    }
  }

  private static final class OneGroup {
    AtomicReaderContext readerContext;
    int topGroupDoc;
    int[] docs;
    float[] scores;
    int count;
    int comparatorSlot;
  }

  // Sorts by groupSort.  Not static -- uses comparators, reversed
  private final class GroupQueue extends PriorityQueue<OneGroup> {

    public GroupQueue(int size) {
      super(size);
    }

    @Override
    protected boolean lessThan(final OneGroup group1, final OneGroup group2) {

      assert group1 != group2;
      assert group1.comparatorSlot != group2.comparatorSlot;

      final int numComparators = comparators.length;
      for (int compIDX = 0; compIDX < numComparators; compIDX++) {
        final int c = reversed[compIDX] * comparators[compIDX].compare(group1.comparatorSlot, group2.comparatorSlot);
        if (c != 0) {
          // Short circuit
          return c > 0;
        }
      }

      // Break ties by docID; lower docID is always sorted first
      return group1.topGroupDoc > group2.topGroupDoc;
    }
  }
  // Called when we transition to another group; if the
  // group is competitive we insert into the group queue
  private void processGroup() {
    totalGroupCount++;
    if (groupCompetes) {
      if (!queueFull) {
        // Startup transient: always add a new OneGroup
        final OneGroup og = new OneGroup();
        og.count = subDocUpto;
        og.topGroupDoc = docBase + topGroupDoc;
        og.docs = pendingSubDocs;
        pendingSubDocs = new int[10];
        if (needsScores) {
          og.scores = pendingSubScores;
          pendingSubScores = new float[10];
        }
        og.readerContext = currentReaderContext;
        og.comparatorSlot = bottomSlot;
        final OneGroup bottomGroup = groupQueue.add(og);
        queueFull = groupQueue.size() == topNGroups;
        if (queueFull) {
          // Queue just became full; now set the real bottom
          // in the comparators:
          bottomSlot = bottomGroup.comparatorSlot;
          for (int i = 0; i < comparators.length; i++) {
            comparators[i].setBottom(bottomSlot);
          }
        } else {
          // Queue not full yet -- just advance bottomSlot:
          bottomSlot = groupQueue.size();
        }
      } else {
        // Replace bottom element in PQ and then updateTop
        final OneGroup og = groupQueue.top();
        assert og != null;
        og.count = subDocUpto;
        og.topGroupDoc = docBase + topGroupDoc;
        // Swap pending docs
        final int[] savDocs = og.docs;
        og.docs = pendingSubDocs;
        pendingSubDocs = savDocs;
        if (needsScores) {
          // Swap pending scores
          final float[] savScores = og.scores;
          og.scores = pendingSubScores;
          pendingSubScores = savScores;
        }
        og.readerContext = currentReaderContext;
        bottomSlot = groupQueue.updateTop().comparatorSlot;

        for (int i = 0; i < comparators.length; i++) {
          comparators[i].setBottom(bottomSlot);
        }
      }
    }
    subDocUpto = 0;
  }
  /**
   * Create the single pass collector.
   *
   *  @param groupSort The {@link Sort} used to sort the
   *    groups.  The top sorted document within each group,
   *    according to groupSort, determines how that group
   *    sorts against other groups.  This must be non-null,
   *    ie, if you want to groupSort by relevance use
   *    Sort.RELEVANCE.
   *  @param topNGroups How many top groups to keep.
   *  @param needsScores true if the collected documents
   *    require scores, either because relevance is included
   *    in the withinGroupSort or because you plan to pass true
   *    for either getScores or getMaxScores to {@link
   *    #getTopGroups}
   *  @param lastDocPerGroup a {@link Filter} that marks the
   *    last document in each group.
   */
  public BlockGroupingCollector(Sort groupSort, int topNGroups, boolean needsScores, Filter lastDocPerGroup) throws IOException {

    if (topNGroups < 1) {
      throw new IllegalArgumentException("topNGroups must be >= 1 (got " + topNGroups + ")");
    }

    groupQueue = new GroupQueue(topNGroups);
    pendingSubDocs = new int[10];
    if (needsScores) {
      pendingSubScores = new float[10];
    }

    this.needsScores = needsScores;
    this.lastDocPerGroup = lastDocPerGroup;
    // TODO: allow null groupSort to mean "by relevance",
    // and specialize it?
    this.groupSort = groupSort;

    this.topNGroups = topNGroups;

    final SortField[] sortFields = groupSort.getSort();
    comparators = new FieldComparator[sortFields.length];
    compIDXEnd = comparators.length - 1;
    reversed = new int[sortFields.length];
    for (int i = 0; i < sortFields.length; i++) {
      final SortField sortField = sortFields[i];
      comparators[i] = sortField.getComparator(topNGroups, i);
      reversed[i] = sortField.getReverse() ? -1 : 1;
    }
  }
  // TODO: maybe allow no sort on retrieving groups?  App
  // may want to simply process docs in the group itself?
  // Typically they will be presented as a "single" result
  // in the UI?

  /** Returns the grouped results.  Returns null if the
   *  number of groups collected is <= groupOffset.
   *
   *  <p><b>NOTE</b>: This collector is unable to compute
   *  the groupValue per group so it will always be null.
   *  This is normally not a problem, as you can obtain the
   *  value just like you obtain other values for each
   *  matching document (eg, via stored fields, via
   *  FieldCache, etc.)
   *
   *  @param withinGroupSort The {@link Sort} used to sort
   *    documents within each group.  Passing null is
   *    allowed, to sort by relevance.
   *  @param groupOffset Which group to start from
   *  @param withinGroupOffset Which document to start from
   *    within each group
   *  @param maxDocsPerGroup How many top documents to keep
   *    within each group.
   *  @param fillSortFields If true then the Comparable
   *    values for the sort fields will be set
   */
  public TopGroups getTopGroups(Sort withinGroupSort, int groupOffset, int withinGroupOffset, int maxDocsPerGroup, boolean fillSortFields) throws IOException {

    if (subDocUpto != 0) {
      processGroup();
    }
    if (groupOffset >= groupQueue.size()) {
      return null;
    }
    int totalGroupedHitCount = 0;

    final FakeScorer fakeScorer = new FakeScorer();

    final GroupDocs[] groups = new GroupDocs[groupQueue.size() - groupOffset];
    for (int downTo = groupQueue.size()-groupOffset-1; downTo >= 0; downTo--) {
      final OneGroup og = groupQueue.pop();

      // At this point we hold all docs w/in each group,
      // unsorted; we now sort them:
      final TopDocsCollector collector;
      if (withinGroupSort == null) {
        // Sort by score
        if (!needsScores) {
          throw new IllegalArgumentException("cannot sort by relevance within group: needsScores=false");
        }
        collector = TopScoreDocCollector.create(maxDocsPerGroup, true);
      } else {
        // Sort by fields
        collector = TopFieldCollector.create(withinGroupSort, maxDocsPerGroup, fillSortFields, needsScores, needsScores, true);
      }

      collector.setScorer(fakeScorer);
      collector.setNextReader(og.readerContext);
      for (int docIDX = 0; docIDX < og.count; docIDX++) {
        final int doc = og.docs[docIDX];
        fakeScorer.doc = doc;
        if (needsScores) {
          fakeScorer.score = og.scores[docIDX];
        }
        collector.collect(doc);
      }
      totalGroupedHitCount += og.count;

      final Comparable[] groupSortValues;

      if (fillSortFields) {
        groupSortValues = new Comparable[comparators.length];
        for (int sortFieldIDX = 0; sortFieldIDX < comparators.length; sortFieldIDX++) {
          groupSortValues[sortFieldIDX] = comparators[sortFieldIDX].value(og.comparatorSlot);
        }
      } else {
        groupSortValues = null;
      }

      final TopDocs topDocs = collector.topDocs(withinGroupOffset, maxDocsPerGroup);

      groups[downTo] = new GroupDocs(topDocs.getMaxScore(),
                                     og.count,
                                     topDocs.scoreDocs,
                                     null,
                                     groupSortValues);
    }

    return new TopGroups(new TopGroups(groupSort.getSort(),
                                       withinGroupSort == null ? null : withinGroupSort.getSort(),
                                       totalHitCount, totalGroupedHitCount, groups),
                         totalGroupCount);
  }
  @Override
  public void setScorer(Scorer scorer) throws IOException {
    this.scorer = scorer;
    for (FieldComparator comparator : comparators) {
      comparator.setScorer(scorer);
    }
  }

  @Override
  public void collect(int doc) throws IOException {

    if (doc > groupEndDocID) {
      // Group changed
      if (subDocUpto != 0) {
        processGroup();
      }
      groupEndDocID = lastDocPerGroupBits.advance(doc);
      subDocUpto = 0;
      groupCompetes = !queueFull;
    }

    totalHitCount++;

    // Always cache doc/score within this group:
    if (subDocUpto == pendingSubDocs.length) {
      pendingSubDocs = ArrayUtil.grow(pendingSubDocs);
    }
    pendingSubDocs[subDocUpto] = doc;
    if (needsScores) {
      if (subDocUpto == pendingSubScores.length) {
        pendingSubScores = ArrayUtil.grow(pendingSubScores);
      }
      pendingSubScores[subDocUpto] = scorer.score();
    }
    subDocUpto++;

    if (groupCompetes) {
      if (subDocUpto == 1) {
        assert !queueFull;

        // First doc in this group: copy it into bottomSlot
        // and make it the bottom for all comparators:
        for (FieldComparator fc : comparators) {
          fc.copy(bottomSlot, doc);
          fc.setBottom(bottomSlot);
        }
        topGroupDoc = doc;
      } else {
        // Compare to bottomSlot
        for (int compIDX = 0;; compIDX++) {
          final int c = reversed[compIDX] * comparators[compIDX].compareBottom(doc);
          if (c < 0) {
            // Definitely not competitive -- done
            return;
          } else if (c > 0) {
            // Definitely competitive.
            break;
          } else if (compIDX == compIDXEnd) {
            // Ties with bottom, except we know this docID is
            // > docID in the queue (docs are visited in
            // order), so not competitive:
            return;
          }
        }

        // Best doc seen so far within this group:
        for (FieldComparator fc : comparators) {
          fc.copy(bottomSlot, doc);
          // Necessary because some comparators cache
          // details of bottom slot; this forces them to
          // re-cache:
          fc.setBottom(bottomSlot);
        }
        topGroupDoc = doc;
      }
    } else {
      // We're not sure this group will make it into the
      // queue yet
      for (int compIDX = 0;; compIDX++) {
        final int c = reversed[compIDX] * comparators[compIDX].compareBottom(doc);
        if (c < 0) {
          // Definitely not competitive -- done
          return;
        } else if (c > 0) {
          // Definitely competitive.
          break;
        } else if (compIDX == compIDXEnd) {
          // Ties with bottom, except we know this docID is
          // > docID in the queue (docs are visited in
          // order), so not competitive:
          return;
        }
      }
      groupCompetes = true;
      for (FieldComparator fc : comparators) {
        fc.copy(bottomSlot, doc);
        // Necessary because some comparators cache
        // details of bottom slot; this forces them to
        // re-cache:
        fc.setBottom(bottomSlot);
      }
      topGroupDoc = doc;
    }
  }

  @Override
  public boolean acceptsDocsOutOfOrder() {
    return false;
  }

  @Override
  public void setNextReader(AtomicReaderContext readerContext) throws IOException {
    if (subDocUpto != 0) {
      processGroup();
    }
    subDocUpto = 0;
    docBase = readerContext.docBase;
    lastDocPerGroupBits = lastDocPerGroup.getDocIdSet(readerContext).iterator();
    groupEndDocID = -1;

    currentReaderContext = readerContext;
    for (int i = 0; i < comparators.length; i++) {
      comparators[i] = comparators[i].setNextReader(readerContext);
    }
  }
}
@@ -3,62 +3,92 @@

<p>This module enables search result grouping with Lucene, where hits
with the same value in the specified single-valued group field are
grouped together.  For example, if you group by the <code>author</code>
field, then all documents with the same value in the <code>author</code>
field fall into a single group.</p>

<p>Grouping requires a number of inputs (a sketch mapping them onto
the collector APIs follows this list):</p>

<ul>
  <li> <code>groupField</code>: this is the field used for grouping.
    For example, if you use the <code>author</code> field then each
    group has all books by the same author.  Documents that don't
    have this field are grouped under a single group with
    a <code>null</code> group value.

  <li> <code>groupSort</code>: how the groups are sorted.  For sorting
    purposes, each group is "represented" by the highest-sorted
    document according to the <code>groupSort</code> within it.  For
    example, if you specify "price" (ascending) then the first group
    is the one with the lowest price book within it.  Or if you
    specify relevance group sort, then the first group is the one
    containing the highest scoring book.

  <li> <code>topNGroups</code>: how many top groups to keep.  For
    example, 10 means the top 10 groups are computed.

  <li> <code>groupOffset</code>: which "slice" of top groups you want to
    retrieve.  For example, 3 means you'll get 7 groups back
    (assuming <code>topNGroups</code> is 10).  This is useful for
    paging, where you might show 5 groups per page.

  <li> <code>withinGroupSort</code>: how the documents within each group
    are sorted.  This can be different from the group sort.

  <li> <code>maxDocsPerGroup</code>: how many top documents within each
    group to keep.

  <li> <code>withinGroupOffset</code>: which "slice" of top
    documents you want to retrieve from each group.

</ul>
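<p>As a rough sketch (not part of this commit; names such
as <code>searcher</code>, <code>query</code> and the
<code>"author"</code> group field are assumptions for illustration),
these inputs map onto the two-pass collector API like this:</p>

<pre class="prettyprint">
  // groupField, groupSort, topNGroups and groupOffset feed the first pass:
  FirstPassGroupingCollector c1 = new FirstPassGroupingCollector("author", groupSort, groupOffset + topNGroups);
  searcher.search(query, c1);

  // withinGroupSort, maxDocsPerGroup and the offsets feed the second pass:
  SecondPassGroupingCollector c2 = new SecondPassGroupingCollector(
      "author", c1.getTopGroups(groupOffset, fillFields),
      groupSort, withinGroupSort, maxDocsPerGroup, getScores, getMaxScores, fillFields);
  searcher.search(query, c2);
  TopGroups groupsResult = c2.getTopGroups(withinGroupOffset);
</pre>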
<p>
There are two grouping implementations here:
<ul>
  <li>
    Arbitrary grouping that can group by any single-valued indexed
    field, implemented as a two-pass collector: the first pass ({@link
    org.apache.lucene.search.grouping.FirstPassGroupingCollector})
    gathers the top groups, and the second pass ({@link
    org.apache.lucene.search.grouping.SecondPassGroupingCollector})
    gathers documents within those groups.  If the search is costly to
    run you may want to use the {@link
    org.apache.lucene.search.CachingCollector} class, which caches
    hits and can (quickly) replay them for the second pass.  This way
    you only run the query once, but you pay a RAM cost to (briefly)
    hold all hits.  Results are returned as a {@link
    org.apache.lucene.search.grouping.TopGroups} instance.</p>
  </li>
  <li>
    Indexed groups, using a single pass collector (<code>BlockGroupingCollector</code>) that
    is able to group according to the doc blocks created during
    indexing using <code>IndexWriter</code>'s <code>add/updateDocuments</code> API.
    This is faster (~25% faster QPS) than the generic two-pass
    collector, but it only works for doc blocks so you must statically
    commit (during indexing) to which grouping you'll need at search
    time.

    <p>This implementation does not rely on a single valued grouping
    field; rather, the blocks in the index define the groups, so your
    application is free to determine what the grouping criteria are.
    At search time, you must provide a <code>Filter</code> that marks
    the last document in each group.  This is a substantial memory
    savings because this collector does not load
    a <code>DocTermsIndex</code> from the
    <code>FieldCache</code>.
  </li>
</ul>

<p>The benefit of the arbitrary grouping implementation is you don't have
to commit at indexing time to a static grouping of your documents.
But the downside is it's somewhat slower to run, and requires more RAM
(a <code>FieldCache.DocTermsIndex</code> entry is created).

<p>Known limitations:</p>
<ul>
  <li> For the two-pass grouping collector, the group field must be a
    single-valued indexed field.
  <li> Unlike Solr's implementation, this module cannot group by
    function query values nor by arbitrary queries.
  <li> Sharding is not directly supported, though is not too

@@ -66,7 +96,8 @@ field fall into a single group.</p>

    group yourself.
</ul>

<p>Typical usage for the generic two-pass collector looks like this
(using the {@link org.apache.lucene.search.CachingCollector}):</p>

<pre class="prettyprint">
  FirstPassGroupingCollector c1 = new FirstPassGroupingCollector("author", groupSort, groupOffset+topNGroups);

@@ -111,5 +142,50 @@ field fall into a single group.</p>

  // Render groupsResult...
</pre>
<p>To use the single-pass <code>BlockGroupingCollector</code>,
first, at indexing time, you must ensure all docs in each group
are added as a block, and you have some way to find the last
document of each group.  One simple way to do this is to add a
marker binary field:</p>

<pre class="prettyprint">
  // Create Documents from your source:
  List&lt;Document&gt; oneGroup = ...;

  Field groupEndField = new Field("groupEnd", "x", Field.Store.NO, Field.Index.NOT_ANALYZED);
  groupEndField.setOmitTermFreqAndPositions(true);
  groupEndField.setOmitNorms(true);
  oneGroup.get(oneGroup.size()-1).add(groupEndField);

  // You can also use writer.updateDocuments(); just be sure you
  // replace an entire previous doc block with this new one.  For
  // example, each group could have a "groupID" field, with the same
  // value for all docs in this group:
  writer.addDocuments(oneGroup);
</pre>

Then, at search time, do this up front:

<pre class="prettyprint">
  // Set this once in your app & save away for reusing across all queries:
  Filter groupEndDocs = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("groupEnd", "x"))));
</pre>

Finally, do this per search:

<pre class="prettyprint">
  // Per search:
  BlockGroupingCollector c = new BlockGroupingCollector(groupSort, groupOffset+topNGroups, needsScores, groupEndDocs);
  s.search(new TermQuery(new Term("content", searchTerm)), c);
  TopGroups groupsResult = c.getTopGroups(withinGroupSort, groupOffset, docOffset, docOffset+docsPerGroup, fillFields);

  // Render groupsResult...
</pre>

Note that the <code>groupValue</code> of each <code>GroupDocs</code>
will be <code>null</code>, so if you need to present this value you'll
have to separately retrieve it (for example using stored
fields, <code>FieldCache</code>, etc.).
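<p>One way to fill that in, sketched under the assumption that the
grouping field (<code>author</code> here) was also stored at indexing
time, is to read it from the top document of each group:</p>

<pre class="prettyprint">
  for (GroupDocs gd : groupsResult.groups) {
    // All docs in a block share the group value, so any of them works;
    // use the top document of the group:
    String author = s.doc(gd.scoreDocs[0].doc).get("author");
    // Render author along with gd.scoreDocs...
  }
</pre>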
</body>
</html>
@@ -18,6 +18,7 @@
package org.apache.lucene.search.grouping;

import java.util.*;
import java.io.IOException;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
@@ -183,6 +184,7 @@ public class TestGrouping extends LuceneTestCase {
  private Comparator<GroupDoc> getComparator(Sort sort) {
    final SortField[] sortFields = sort.getSort();
    return new Comparator<GroupDoc>() {
      // @Override -- Not until Java 1.6
      public int compare(GroupDoc d1, GroupDoc d2) {
        for(SortField sf : sortFields) {
          final int cmp;
@@ -224,6 +226,16 @@ public class TestGrouping extends LuceneTestCase {
    return fields;
  }

  /*
  private String groupToString(BytesRef b) {
    if (b == null) {
      return "null";
    } else {
      return b.utf8ToString();
    }
  }
  */

  private TopGroups slowGrouping(GroupDoc[] groupDocs,
                                 String searchTerm,
                                 boolean fillFields,
@@ -247,21 +259,25 @@ public class TestGrouping extends LuceneTestCase {
    int totalHitCount = 0;
    Set<BytesRef> knownGroups = new HashSet<BytesRef>();

    //System.out.println("TEST: slowGrouping");
    for(GroupDoc d : groupDocs) {
      // TODO: would be better to filter by searchTerm before sorting!
      if (!d.content.equals(searchTerm)) {
        continue;
      }
      totalHitCount++;
      //System.out.println("  match id=" + d.id);

      if (doAllGroups) {
        if (!knownGroups.contains(d.group)) {
          knownGroups.add(d.group);
          //System.out.println("    add group=" + groupToString(d.group));
        }
      }

      List<GroupDoc> l = groups.get(d.group);
      if (l == null) {
        //System.out.println("  add sortedGroup=" + groupToString(d.group));
        sortedGroups.add(d.group);
        if (fillFields) {
          sortedGroupFields.add(fillFields(d, groupSort));
@@ -322,6 +338,67 @@ public class TestGrouping extends LuceneTestCase {
    }
  }

  private IndexReader getDocBlockReader(Directory dir, GroupDoc[] groupDocs) throws IOException {
    // Coalesce by group, but in random order:
    Collections.shuffle(Arrays.asList(groupDocs), random);
    final Map<BytesRef,List<GroupDoc>> groupMap = new HashMap<BytesRef,List<GroupDoc>>();
    final List<BytesRef> groupValues = new ArrayList<BytesRef>();

    for(GroupDoc groupDoc : groupDocs) {
      if (!groupMap.containsKey(groupDoc.group)) {
        groupValues.add(groupDoc.group);
        groupMap.put(groupDoc.group, new ArrayList<GroupDoc>());
      }
      groupMap.get(groupDoc.group).add(groupDoc);
    }

    RandomIndexWriter w = new RandomIndexWriter(
                            random,
                            dir,
                            newIndexWriterConfig(TEST_VERSION_CURRENT,
                                                 new MockAnalyzer(random)));

    final List<List<Document>> updateDocs = new ArrayList<List<Document>>();
    //System.out.println("TEST: index groups");
    for(BytesRef group : groupValues) {
      final List<Document> docs = new ArrayList<Document>();
      //System.out.println("TEST: group=" + (group == null ? "null" : group.utf8ToString()));
      for(GroupDoc groupValue : groupMap.get(group)) {
        Document doc = new Document();
        docs.add(doc);
        if (groupValue.group != null) {
          doc.add(newField("group", groupValue.group.utf8ToString(), Field.Index.NOT_ANALYZED));
        }
        doc.add(newField("sort1", groupValue.sort1.utf8ToString(), Field.Index.NOT_ANALYZED));
        doc.add(newField("sort2", groupValue.sort2.utf8ToString(), Field.Index.NOT_ANALYZED));
        doc.add(new NumericField("id").setIntValue(groupValue.id));
        doc.add(newField("content", groupValue.content, Field.Index.NOT_ANALYZED));
        //System.out.println("TEST: doc content=" + groupValue.content + " group=" + (groupValue.group == null ? "null" : groupValue.group.utf8ToString()) + " sort1=" + groupValue.sort1.utf8ToString() + " id=" + groupValue.id);
      }
      // So we can pull filter marking last doc in block:
      final Field groupEnd = newField("groupend", "x", Field.Index.NOT_ANALYZED);
      groupEnd.setOmitTermFreqAndPositions(true);
      groupEnd.setOmitNorms(true);
      docs.get(docs.size()-1).add(groupEnd);
      // Add as a doc block:
      w.addDocuments(docs);
      if (group != null && random.nextInt(7) == 4) {
        updateDocs.add(docs);
      }
    }

    for(List<Document> docs : updateDocs) {
      // Just replaces docs w/ same docs:
      w.updateDocuments(new Term("group", docs.get(0).get("group")),
                        docs);
    }

    final IndexReader r = w.getReader();
    w.close();

    return r;
  }

  public void testRandom() throws Exception {
    for(int iter=0;iter<3;iter++) {
@@ -350,7 +427,7 @@ public class TestGrouping extends LuceneTestCase {
                             random,
                             dir,
                             newIndexWriterConfig(TEST_VERSION_CURRENT,
                                                  new MockAnalyzer(random)));

      Document doc = new Document();
      Document docNoGroup = new Document();
@@ -405,8 +482,19 @@ public class TestGrouping extends LuceneTestCase {
      final IndexReader r = w.getReader();
      w.close();

      // Build 2nd index, where docs are added in blocks by
      // group, so we can use single pass collector
      final Directory dir2 = newDirectory();
      final IndexReader r2 = getDocBlockReader(dir2, groupDocs);
      final Filter lastDocInBlock = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("groupend", "x"))));

      final IndexSearcher s = new IndexSearcher(r);
      final IndexSearcher s2 = new IndexSearcher(r2);

      final int[] docIDToID = FieldCache.DEFAULT.getInts(r, "id");
      final int[] docIDToID2 = FieldCache.DEFAULT.getInts(r2, "id");

      try {
        for(int searchIter=0;searchIter<100;searchIter++) {

          if (VERBOSE) {
@@ -418,10 +506,12 @@ public class TestGrouping extends LuceneTestCase {
          final boolean getScores = random.nextBoolean();
          final boolean getMaxScores = random.nextBoolean();
          final Sort groupSort = getRandomSort();
          //final Sort groupSort = new Sort(new SortField[] {new SortField("sort1", SortField.STRING), new SortField("id", SortField.INT)});
          // TODO: also test null (= sort by relevance)
          final Sort docSort = getRandomSort();

          final int topNGroups = _TestUtil.nextInt(random, 1, 30);
          //final int topNGroups = 4;
          final int docsPerGroup = _TestUtil.nextInt(random, 1, 50);
          final int groupOffset = _TestUtil.nextInt(random, 0, (topNGroups-1)/2);
          //final int groupOffset = 0;
@@ -537,20 +627,58 @@ public class TestGrouping extends LuceneTestCase {

          final TopGroups expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, doAllGroups, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset);

          if (VERBOSE) {
            if (expectedGroups == null) {
              System.out.println("TEST: no expected groups");
            } else {
              System.out.println("TEST: expected groups");
              for(GroupDocs gd : expectedGroups.groups) {
                System.out.println("  group=" + (gd.groupValue == null ? "null" : gd.groupValue.utf8ToString()));
                for(ScoreDoc sd : gd.scoreDocs) {
                  System.out.println("    id=" + sd.doc);
                }
              }
            }
          }
          // NOTE: intentional but temporary field cache insanity!
          assertEquals(docIDToID, expectedGroups, groupsResult, true);

          final boolean needsScores = getScores || getMaxScores || docSort == null;
          final BlockGroupingCollector c3 = new BlockGroupingCollector(groupSort, groupOffset+topNGroups, needsScores, lastDocInBlock);
          final AllGroupsCollector allGroupsCollector2;
          final Collector c4;
          if (doAllGroups) {
            allGroupsCollector2 = new AllGroupsCollector("group");
            c4 = MultiCollector.wrap(c3, allGroupsCollector2);
          } else {
            allGroupsCollector2 = null;
            c4 = c3;
          }
          s2.search(new TermQuery(new Term("content", searchTerm)), c4);
          final TopGroups tempTopGroups2 = c3.getTopGroups(docSort, groupOffset, docOffset, docOffset+docsPerGroup, fillFields);
          final TopGroups groupsResult2;
          if (doAllGroups && tempTopGroups2 != null) {
            assertEquals((int) tempTopGroups2.totalGroupCount, allGroupsCollector2.getGroupCount());
            groupsResult2 = new TopGroups(tempTopGroups2, allGroupsCollector2.getGroupCount());
          } else {
            groupsResult2 = tempTopGroups2;
          }
          assertEquals(docIDToID2, expectedGroups, groupsResult2, false);
        }
      } finally {
        FieldCache.DEFAULT.purge(r);
        FieldCache.DEFAULT.purge(r2);
      }

      r.close();
      dir.close();

      r2.close();
      dir2.close();
    }
  }

  private void assertEquals(int[] docIDtoID, TopGroups expected, TopGroups actual, boolean verifyGroupValues) {
    if (expected == null) {
      assertNull(actual);
      return;
@@ -570,7 +698,9 @@ public class TestGrouping extends LuceneTestCase {
      }
      final GroupDocs expectedGroup = expected.groups[groupIDX];
      final GroupDocs actualGroup = actual.groups[groupIDX];
      if (verifyGroupValues) {
        assertEquals(expectedGroup.groupValue, actualGroup.groupValue);
      }
      assertArrayEquals(expectedGroup.groupSortValues, actualGroup.groupSortValues);

      // TODO