mirror of https://github.com/apache/lucene.git
LUCENE-3129: add single pass grouping collector, BlockGroupingCollector
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1130648 13f79535-47bb-0310-9956-ffa450edef68
parent 5c68532ecc
commit 375c1abdbf
@@ -130,6 +130,12 @@ New Features
   case where the indexing rate is lowish but the reopen rate is
   highish, to take load off the IO system.  (Mike McCandless)
 
+* LUCENE-3129: Added BlockGroupingCollector, a single pass
+  grouping collector which is faster than the two-pass approach, and
+  also computes the total group count, but requires that every
+  document sharing the same group was indexed as a doc block
+  (IndexWriter.add/updateDocuments).  (Mike McCandless)
+
 Optimizations
 
 * LUCENE-3040: Switch all analysis consumers (highlighter, morelikethis, memory, ...)
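For context on the doc-block requirement named in the entry above, here is a minimal sketch, not part of this commit: the `writer` instance and the "author"/"title" fields are made-up assumptions, but IndexWriter.addDocuments/updateDocuments are the APIs the entry refers to. All documents of one group are added atomically so their docIDs stay contiguous:

  // Sketch only: all docs of one group go in as a single block.
  // Assumes an open IndexWriter "writer" on the trunk API this commit targets.
  List<Document> oneGroup = new ArrayList<Document>();
  for(String title : new String[] {"book1", "book2"}) {
    Document doc = new Document();
    doc.add(new Field("author", "jane", Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
    oneGroup.add(doc);
  }
  writer.addDocuments(oneGroup);  // atomic; the block's docIDs remain contiguous

  // To change a group later, replace the entire block in one call, keyed
  // by a term every doc in the block shares:
  writer.updateDocuments(new Term("author", "jane"), oneGroup);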
@@ -0,0 +1,516 @@
package org.apache.lucene.search.grouping;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexWriter;       // javadocs
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldComparator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopDocsCollector;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.PriorityQueue;

/** BlockGroupingCollector performs grouping with a
 *  single pass collector, as long as you are grouping by a
 *  doc block field, ie all documents sharing a given group
 *  value were indexed as a doc block using the atomic
 *  {@link IndexWriter#addDocuments} or {@link
 *  IndexWriter#updateDocuments} API.
 *
 *  <p>This results in faster performance (~25% faster QPS)
 *  than the two-pass grouping collectors, with the tradeoff
 *  being that the documents in each group must always be
 *  indexed as a block.  This collector also fills in
 *  TopGroups.totalGroupCount without requiring the separate
 *  {@link AllGroupsCollector}.  However, this collector does
 *  not fill in the groupValue of each group; this field
 *  will always be null.
 *
 *  <p><b>NOTE</b>: this collector makes no effort to verify
 *  the docs were in fact indexed as a block, so it's up to
 *  you to ensure this was the case.
 *
 *  <p>See {@link org.apache.lucene.search.grouping} for more
 *  details including a full code example.</p>
 *
 * @lucene.experimental
 */
public class BlockGroupingCollector extends Collector {

  private int[] pendingSubDocs;
  private float[] pendingSubScores;
  private int subDocUpto;

  private final Sort groupSort;
  private final int topNGroups;
  private final Filter lastDocPerGroup;

  // TODO: specialize into 2 classes, static "create" method:
  private final boolean needsScores;

  private final FieldComparator[] comparators;
  private final int[] reversed;
  private final int compIDXEnd;
  private int bottomSlot;
  private boolean queueFull;
  private AtomicReaderContext currentReaderContext;

  private int topGroupDoc;
  private int totalHitCount;
  private int totalGroupCount;
  private int docBase;
  private int groupEndDocID;
  //private OpenBitSet lastDocPerGroupBits;
  private DocIdSetIterator lastDocPerGroupBits;
  private Scorer scorer;
  private final GroupQueue groupQueue;
  private boolean groupCompetes;

  private final static class FakeScorer extends Scorer {

    float score;
    int doc;

    public FakeScorer() {
      super((Weight) null);
    }

    @Override
    public float score() {
      return score;
    }

    @Override
    public int docID() {
      return doc;
    }

    @Override
    public int advance(int target) {
      throw new UnsupportedOperationException();
    }

    @Override
    public int nextDoc() {
      throw new UnsupportedOperationException();
    }
  }

  private static final class OneGroup {
    AtomicReaderContext readerContext;
    //int groupOrd;
    int topGroupDoc;
    int[] docs;
    float[] scores;
    int count;
    int comparatorSlot;
  }

  // Sorts by groupSort.  Not static -- uses comparators, reversed
  private final class GroupQueue extends PriorityQueue<OneGroup> {

    public GroupQueue(int size) {
      super(size);
    }

    @Override
    protected boolean lessThan(final OneGroup group1, final OneGroup group2) {

      //System.out.println("    ltcheck");
      assert group1 != group2;
      assert group1.comparatorSlot != group2.comparatorSlot;

      final int numComparators = comparators.length;
      for (int compIDX = 0; compIDX < numComparators; compIDX++) {
        final int c = reversed[compIDX] * comparators[compIDX].compare(group1.comparatorSlot, group2.comparatorSlot);
        if (c != 0) {
          // Short circuit
          return c > 0;
        }
      }

      // Break ties by docID; lower docID is always sorted first
      return group1.topGroupDoc > group2.topGroupDoc;
    }
  }

  // Called when we transition to another group; if the
  // group is competitive we insert into the group queue
  private void processGroup() {
    totalGroupCount++;
    //System.out.println("    processGroup ord=" + lastGroupOrd + " competes=" + groupCompetes + " count=" + subDocUpto + " groupDoc=" + topGroupDoc);
    if (groupCompetes) {
      if (!queueFull) {
        // Startup transient: always add a new OneGroup
        final OneGroup og = new OneGroup();
        og.count = subDocUpto;
        og.topGroupDoc = docBase + topGroupDoc;
        og.docs = pendingSubDocs;
        pendingSubDocs = new int[10];
        if (needsScores) {
          og.scores = pendingSubScores;
          pendingSubScores = new float[10];
        }
        og.readerContext = currentReaderContext;
        //og.groupOrd = lastGroupOrd;
        og.comparatorSlot = bottomSlot;
        final OneGroup bottomGroup = groupQueue.add(og);
        //System.out.println("      ADD group=" + getGroupString(lastGroupOrd) + " newBottom=" + getGroupString(bottomGroup.groupOrd));
        queueFull = groupQueue.size() == topNGroups;
        if (queueFull) {
          // Queue just became full; now set the real bottom
          // in the comparators:
          bottomSlot = bottomGroup.comparatorSlot;
          //System.out.println("    set bottom=" + bottomSlot);
          for (int i = 0; i < comparators.length; i++) {
            comparators[i].setBottom(bottomSlot);
          }
          //System.out.println("     QUEUE FULL");
        } else {
          // Queue not full yet -- just advance bottomSlot:
          bottomSlot = groupQueue.size();
        }
      } else {
        // Replace bottom element in PQ and then updateTop
        final OneGroup og = groupQueue.top();
        assert og != null;
        og.count = subDocUpto;
        og.topGroupDoc = docBase + topGroupDoc;
        // Swap pending docs
        final int[] savDocs = og.docs;
        og.docs = pendingSubDocs;
        pendingSubDocs = savDocs;
        if (needsScores) {
          // Swap pending scores
          final float[] savScores = og.scores;
          og.scores = pendingSubScores;
          pendingSubScores = savScores;
        }
        og.readerContext = currentReaderContext;
        //og.groupOrd = lastGroupOrd;
        bottomSlot = groupQueue.updateTop().comparatorSlot;

        //System.out.println("    set bottom=" + bottomSlot);
        for (int i = 0; i < comparators.length; i++) {
          comparators[i].setBottom(bottomSlot);
        }
      }
    }
    subDocUpto = 0;
  }

  /**
   * Create the single pass collector.
   *
   *  @param groupSort The {@link Sort} used to sort the
   *    groups.  The top sorted document within each group
   *    according to groupSort determines how that group
   *    sorts against other groups.  This must be non-null,
   *    ie, if you want to groupSort by relevance use
   *    Sort.RELEVANCE.
   *  @param topNGroups How many top groups to keep.
   *  @param needsScores true if the collected documents
   *    require scores, either because relevance is included
   *    in the withinGroupSort or because you plan to pass true
   *    for either getScores or getMaxScores to {@link
   *    #getTopGroups}
   *  @param lastDocPerGroup a {@link Filter} that marks the
   *    last document in each group.
   */
  public BlockGroupingCollector(Sort groupSort, int topNGroups, boolean needsScores, Filter lastDocPerGroup) throws IOException {

    if (topNGroups < 1) {
      throw new IllegalArgumentException("topNGroups must be >= 1 (got " + topNGroups + ")");
    }

    groupQueue = new GroupQueue(topNGroups);
    pendingSubDocs = new int[10];
    if (needsScores) {
      pendingSubScores = new float[10];
    }

    this.needsScores = needsScores;
    this.lastDocPerGroup = lastDocPerGroup;
    // TODO: allow null groupSort to mean "by relevance",
    // and specialize it?
    this.groupSort = groupSort;

    this.topNGroups = topNGroups;

    final SortField[] sortFields = groupSort.getSort();
    comparators = new FieldComparator[sortFields.length];
    compIDXEnd = comparators.length - 1;
    reversed = new int[sortFields.length];
    for (int i = 0; i < sortFields.length; i++) {
      final SortField sortField = sortFields[i];
      comparators[i] = sortField.getComparator(topNGroups, i);
      reversed[i] = sortField.getReverse() ? -1 : 1;
    }
  }

  // TODO: maybe allow no sort on retrieving groups?  app
  // may want to simply process docs in the group itself?
  // typically they will be presented as a "single" result
  // in the UI?

  /** Returns the grouped results.  Returns null if the
   *  number of groups collected is <= groupOffset.
   *
   *  <p><b>NOTE</b>: This collector is unable to compute
   *  the groupValue per group so it will always be null.
   *  This is normally not a problem, as you can obtain the
   *  value just like you obtain other values for each
   *  matching document (eg, via stored fields, via
   *  FieldCache, etc.)
   *
   *  @param withinGroupSort The {@link Sort} used to sort
   *    documents within each group.  Passing null is
   *    allowed, to sort by relevance.
   *  @param groupOffset Which group to start from
   *  @param withinGroupOffset Which document to start from
   *    within each group
   *  @param maxDocsPerGroup How many top documents to keep
   *    within each group.
   *  @param fillSortFields If true then the Comparable
   *    values for the sort fields will be set
   */
  public TopGroups getTopGroups(Sort withinGroupSort, int groupOffset, int withinGroupOffset, int maxDocsPerGroup, boolean fillSortFields) throws IOException {

    //if (queueFull) {
    //System.out.println("getTopGroups groupOffset=" + groupOffset + " topNGroups=" + topNGroups);
    //}
    if (subDocUpto != 0) {
      processGroup();
    }
    if (groupOffset >= groupQueue.size()) {
      return null;
    }
    int totalGroupedHitCount = 0;

    final FakeScorer fakeScorer = new FakeScorer();

    final GroupDocs[] groups = new GroupDocs[groupQueue.size() - groupOffset];
    for(int downTo=groupQueue.size()-groupOffset-1;downTo>=0;downTo--) {
      final OneGroup og = groupQueue.pop();

      // At this point we hold all docs w/in each group,
      // unsorted; we now sort them:
      final TopDocsCollector collector;
      if (withinGroupSort == null) {
        // Sort by score
        if (!needsScores) {
          throw new IllegalArgumentException("cannot sort by relevance within group: needsScores=false");
        }
        collector = TopScoreDocCollector.create(maxDocsPerGroup, true);
      } else {
        // Sort by fields
        collector = TopFieldCollector.create(withinGroupSort, maxDocsPerGroup, fillSortFields, needsScores, needsScores, true);
      }

      collector.setScorer(fakeScorer);
      collector.setNextReader(og.readerContext);
      for(int docIDX=0;docIDX<og.count;docIDX++) {
        final int doc = og.docs[docIDX];
        fakeScorer.doc = doc;
        if (needsScores) {
          fakeScorer.score = og.scores[docIDX];
        }
        collector.collect(doc);
      }
      totalGroupedHitCount += og.count;

      final Comparable[] groupSortValues;

      if (fillSortFields) {
        groupSortValues = new Comparable[comparators.length];
        for(int sortFieldIDX=0;sortFieldIDX<comparators.length;sortFieldIDX++) {
          groupSortValues[sortFieldIDX] = comparators[sortFieldIDX].value(og.comparatorSlot);
        }
      } else {
        groupSortValues = null;
      }

      final TopDocs topDocs = collector.topDocs(withinGroupOffset, maxDocsPerGroup);

      groups[downTo] = new GroupDocs(topDocs.getMaxScore(),
                                     og.count,
                                     topDocs.scoreDocs,
                                     null,
                                     groupSortValues);
    }

    /*
    while (groupQueue.size() != 0) {
      final OneGroup og = groupQueue.pop();
      //System.out.println("  leftover: og ord=" + og.groupOrd + " count=" + og.count);
      totalGroupedHitCount += og.count;
    }
    */

    return new TopGroups(new TopGroups(groupSort.getSort(),
                                       withinGroupSort == null ? null : withinGroupSort.getSort(),
                                       totalHitCount, totalGroupedHitCount, groups),
                         totalGroupCount);
  }

  @Override
  public void setScorer(Scorer scorer) throws IOException {
    this.scorer = scorer;
    for (FieldComparator comparator : comparators) {
      comparator.setScorer(scorer);
    }
  }

  @Override
  public void collect(int doc) throws IOException {

    // System.out.println("C " + doc);

    if (doc > groupEndDocID) {
      // Group changed
      if (subDocUpto != 0) {
        processGroup();
      }
      groupEndDocID = lastDocPerGroupBits.advance(doc);
      //System.out.println("  adv " + groupEndDocID + " " + lastDocPerGroupBits);
      subDocUpto = 0;
      groupCompetes = !queueFull;
    }

    totalHitCount++;

    // Always cache doc/score within this group:
    if (subDocUpto == pendingSubDocs.length) {
      pendingSubDocs = ArrayUtil.grow(pendingSubDocs);
    }
    pendingSubDocs[subDocUpto] = doc;
    if (needsScores) {
      if (subDocUpto == pendingSubScores.length) {
        pendingSubScores = ArrayUtil.grow(pendingSubScores);
      }
      pendingSubScores[subDocUpto] = scorer.score();
    }
    subDocUpto++;

    if (groupCompetes) {
      if (subDocUpto == 1) {
        assert !queueFull;

        //System.out.println("    init copy to bottomSlot=" + bottomSlot);
        for (FieldComparator fc : comparators) {
          fc.copy(bottomSlot, doc);
          fc.setBottom(bottomSlot);
        }
        topGroupDoc = doc;
      } else {
        // Compare to bottomSlot
        for (int compIDX = 0;; compIDX++) {
          final int c = reversed[compIDX] * comparators[compIDX].compareBottom(doc);
          if (c < 0) {
            // Definitely not competitive -- done
            return;
          } else if (c > 0) {
            // Definitely competitive.
            break;
          } else if (compIDX == compIDXEnd) {
            // Ties with bottom, except we know this docID is
            // > docID in the queue (docs are visited in
            // order), so not competitive:
            return;
          }
        }

        //System.out.println("       best w/in group!");

        for (FieldComparator fc : comparators) {
          fc.copy(bottomSlot, doc);
          // Necessary because some comparators cache
          // details of bottom slot; this forces them to
          // re-cache:
          fc.setBottom(bottomSlot);
        }
        topGroupDoc = doc;
      }
    } else {
      // We're not sure this group will make it into the
      // queue yet
      for (int compIDX = 0;; compIDX++) {
        final int c = reversed[compIDX] * comparators[compIDX].compareBottom(doc);
        if (c < 0) {
          // Definitely not competitive -- done
          //System.out.println("    doc doesn't compete w/ top groups");
          return;
        } else if (c > 0) {
          // Definitely competitive.
          break;
        } else if (compIDX == compIDXEnd) {
          // Ties with bottom, except we know this docID is
          // > docID in the queue (docs are visited in
          // order), so not competitive:
          //System.out.println("    doc doesn't compete w/ top groups");
          return;
        }
      }
      groupCompetes = true;
      for (FieldComparator fc : comparators) {
        fc.copy(bottomSlot, doc);
        // Necessary because some comparators cache
        // details of bottom slot; this forces them to
        // re-cache:
        fc.setBottom(bottomSlot);
      }
      topGroupDoc = doc;
      //System.out.println("    doc competes w/ top groups");
    }
  }

  @Override
  public boolean acceptsDocsOutOfOrder() {
    return false;
  }

  @Override
  public void setNextReader(AtomicReaderContext readerContext) throws IOException {
    if (subDocUpto != 0) {
      processGroup();
    }
    subDocUpto = 0;
    docBase = readerContext.docBase;
    //System.out.println("setNextReader base=" + docBase + " r=" + readerContext.reader);
    lastDocPerGroupBits = lastDocPerGroup.getDocIdSet(readerContext).iterator();
    groupEndDocID = -1;

    currentReaderContext = readerContext;
    for (int i=0; i<comparators.length; i++) {
      comparators[i] = comparators[i].setNextReader(readerContext);
    }
  }
}
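The heart of the single pass above is how collect() finds group boundaries: advancing the last-doc-per-group filter's iterator from the current doc returns the docID that ends the current group, so any later doc beyond it signals a new group. A toy standalone sketch of that pattern (plain arrays standing in for the Lucene iterator; the values are hypothetical):

  // Toy illustration of the group-boundary trick used in collect():
  int[] groupEnds = {2, 5, 9};          // hypothetical: last docID of each block
  int groupEndDocID = -1;
  for (int doc = 0; doc <= 9; doc++) {  // docs arrive in increasing order
    if (doc > groupEndDocID) {
      // group changed; process the buffered group, then "advance(doc)":
      int i = 0;
      while (groupEnds[i] < doc) i++;   // first group end >= doc
      groupEndDocID = groupEnds[i];
    }
    // buffer doc (and score) into the current group...
  }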
@@ -3,62 +3,92 @@
 <p>This module enables search result grouping with Lucene, where hits
 with the same value in the specified single-valued group field are
-grouped together.  For example, if you group by the <tt>author</tt>
-field, then all documents with the same value in the <tt>author</tt>
+grouped together.  For example, if you group by the <code>author</code>
+field, then all documents with the same value in the <code>author</code>
 field fall into a single group.</p>
 
 <p>Grouping requires a number of inputs:</p>
 
 <ul>
- <li> <tt>groupField</tt>: this is the field used for grouping.
-   For example, if you use the <tt>author</tt> field then each
+ <li> <code>groupField</code>: this is the field used for grouping.
+   For example, if you use the <code>author</code> field then each
    group has all books by the same author.  Documents that don't
    have this field are grouped under a single group with
-   a <tt>null</tt> group value.
+   a <code>null</code> group value.
 
- <li> <tt>groupSort</tt>: how the groups are sorted.  For sorting
+ <li> <code>groupSort</code>: how the groups are sorted.  For sorting
    purposes, each group is "represented" by the highest-sorted
-   document according to the <tt>groupSort</tt> within it.  For
+   document according to the <code>groupSort</code> within it.  For
    example, if you specify "price" (ascending) then the first group
    is the one with the lowest price book within it.  Or if you
    specify relevance group sort, then the first group is the one
    containing the highest scoring book.
 
- <li> <tt>topNGroups</tt>: how many top groups to keep.  For
+ <li> <code>topNGroups</code>: how many top groups to keep.  For
    example, 10 means the top 10 groups are computed.
 
- <li> <tt>groupOffset</tt>: which "slice" of top groups you want to
+ <li> <code>groupOffset</code>: which "slice" of top groups you want to
    retrieve.  For example, 3 means you'll get 7 groups back
-   (assuming <tt>topNGroups</tt> is 10).  This is useful for
+   (assuming <code>topNGroups</code> is 10).  This is useful for
    paging, where you might show 5 groups per page.
 
- <li> <tt>withinGroupSort</tt>: how the documents within each group
+ <li> <code>withinGroupSort</code>: how the documents within each group
    are sorted.  This can be different from the group sort.
 
- <li> <tt>maxDocsPerGroup</tt>: how many top documents within each
+ <li> <code>maxDocsPerGroup</code>: how many top documents within each
    group to keep.
 
- <li> <tt>withinGroupOffset</tt>: which "slice" of top
+ <li> <code>withinGroupOffset</code>: which "slice" of top
    documents you want to retrieve from each group.
 
 </ul>
 
-<p>The implementation is two-pass: the first pass ({@link
-  org.apache.lucene.search.grouping.FirstPassGroupingCollector})
-  gathers the top groups, and the second pass ({@link
-  org.apache.lucene.search.grouping.SecondPassGroupingCollector})
-  gathers documents within those groups.  If the search is costly to
-  run you may want to use the {@link
-  org.apache.lucene.search.CachingCollector} class, which
-  caches hits and can (quickly) replay them for the second pass.  This
-  way you only run the query once, but you pay a RAM cost to (briefly)
-  hold all hits.  Results are returned as a {@link
-  org.apache.lucene.search.grouping.TopGroups} instance.</p>
+<p>
+There are two grouping implementations here:
+<ul>
+  <li>
+    Arbitrary grouping that can group by any single-valued indexed
+    field, implemented as a two-pass collector: the first pass ({@link
+    org.apache.lucene.search.grouping.FirstPassGroupingCollector})
+    gathers the top groups, and the second pass ({@link
+    org.apache.lucene.search.grouping.SecondPassGroupingCollector})
+    gathers documents within those groups.  If the search is costly to
+    run you may want to use the {@link
+    org.apache.lucene.search.CachingCollector} class, which caches
+    hits and can (quickly) replay them for the second pass.  This way
+    you only run the query once, but you pay a RAM cost to (briefly)
+    hold all hits.  Results are returned as a {@link
+    org.apache.lucene.search.grouping.TopGroups} instance.</p>
+  </li>
+  <li>
+    Indexed groups, using a single pass collector (<code>BlockGroupingCollector</code>) that
+    is able to group according to the doc blocks created during
+    indexing using <code>IndexWriter</code>'s <code>add/updateDocuments</code> API.
+    This is faster (~25% faster QPS) than the generic two-pass
+    collector, but it only works for doc blocks so you must statically
+    commit (during indexing) to which grouping you'll need at search
+    time.
+
+    <p>This implementation does not rely on a single valued grouping
+    field; rather, the blocks in the index define the groups, so your
+    application is free to determine what the grouping criteria is.
+    At search time, you must provide a <code>Filter</code> that marks
+    the last document in each group.  This is a substantial memory
+    savings because this collector does not load
+    a <code>DocTermsIndex</code> from the
+    <code>FieldCache</code>.
+  </li>
+</ul>
+
+<p>The benefit of the arbitrary grouping implementation is you don't have
+to commit at indexing time to a static grouping of your documents.
+But the downside is it's somewhat slower to run, and requires more RAM
+(a <code>FieldCache.DocTermsIndex</code> entry is created).
 
 <p>Known limitations:</p>
 <ul>
- <li> The group field must be a single-valued indexed field.
-   {@link org.apache.lucene.search.FieldCache} is used to load the {@link org.apache.lucene.search.FieldCache.DocTermsIndex} for this field.
+ <li> For the two-pass grouping collector, the group field must be a
+   single-valued indexed field.
 <li> Unlike Solr's implementation, this module cannot group by
   function query values nor by arbitrary queries.
 <li> Sharding is not directly supported, though is not too
@@ -66,7 +96,8 @@ field fall into a single group.</p>
   group yourself.
 </ul>
 
-<p>Typical usage looks like this (using the {@link org.apache.lucene.search.CachingCollector}):</p>
+<p>Typical usage for the generic two-pass collector looks like this
+  (using the {@link org.apache.lucene.search.CachingCollector}):</p>
 
 <pre class="prettyprint">
   FirstPassGroupingCollector c1 = new FirstPassGroupingCollector("author", groupSort, groupOffset+topNGroups);
@@ -111,5 +142,50 @@ field fall into a single group.</p>
   // Render groupsResult...
 </pre>
 
+<p>To use the single-pass <code>BlockGroupingCollector</code>,
+   first, at indexing time, you must ensure all docs in each group
+   are added as a block, and you have some way to find the last
+   document of each group.  One simple way to do this is to add a
+   marker binary field:</p>
+
+<pre class="prettyprint">
+  // Create Documents from your source:
+  List<Document> oneGroup = ...;
+
+  Field groupEndField = new Field("groupEnd", "x", Field.Store.NO, Field.Index.NOT_ANALYZED);
+  groupEndField.setOmitTermFreqAndPositions(true);
+  groupEndField.setOmitNorms(true);
+  oneGroup.get(oneGroup.size()-1).add(groupEndField);
+
+  // You can also use writer.updateDocuments(); just be sure you
+  // replace an entire previous doc block with this new one.  For
+  // example, each group could have a "groupID" field, with the same
+  // value for all docs in this group:
+  writer.addDocuments(oneGroup);
+</pre>
+
+Then, at search time, do this up front:
+
+<pre class="prettyprint">
+  // Set this once in your app & save away for reusing across all queries:
+  Filter groupEndDocs = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("groupEnd", "x"))));
+</pre>
+
+Finally, do this per search:
+
+<pre class="prettyprint">
+  // Per search:
+  BlockGroupingCollector c = new BlockGroupingCollector(groupSort, groupOffset+topNGroups, needsScores, groupEndDocs);
+  s.search(new TermQuery(new Term("content", searchTerm)), c);
+  TopGroups groupsResult = c.getTopGroups(withinGroupSort, groupOffset, docOffset, docOffset+docsPerGroup, fillFields);
+
+  // Render groupsResult...
+</pre>
+
+Note that the <code>groupValue</code> of each <code>GroupDocs</code>
+will be <code>null</code>, so if you need to present this value you'll
+have to separately retrieve it (for example using stored
+fields, <code>FieldCache</code>, etc.).
+
 </body>
 </html>
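The closing note above says each group's value must be retrieved separately. A minimal sketch of one way to do that, under the same assumptions as the examples above (the grouping field, "author" here, was stored at indexing time; "s" and "groupsResult" come from the per-search snippet):

  // Sketch: recover each group's value from a stored field of its
  // best-sorted document, since BlockGroupingCollector leaves
  // GroupDocs.groupValue null.
  for(GroupDocs gd : groupsResult.groups) {
    if (gd.scoreDocs.length > 0) {
      String author = s.doc(gd.scoreDocs[0].doc).get("author");  // assumes "author" was stored
      // render author plus gd.scoreDocs...
    }
  }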
@@ -18,6 +18,7 @@
 package org.apache.lucene.search.grouping;
 
 import java.util.*;
+import java.io.IOException;
 
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.document.Document;
@@ -183,6 +184,7 @@ public class TestGrouping extends LuceneTestCase {
   private Comparator<GroupDoc> getComparator(Sort sort) {
     final SortField[] sortFields = sort.getSort();
     return new Comparator<GroupDoc>() {
+      // @Override -- Not until Java 1.6
       public int compare(GroupDoc d1, GroupDoc d2) {
         for(SortField sf : sortFields) {
           final int cmp;
@@ -224,6 +226,16 @@ public class TestGrouping extends LuceneTestCase {
     return fields;
   }
 
+  /*
+  private String groupToString(BytesRef b) {
+    if (b == null) {
+      return "null";
+    } else {
+      return b.utf8ToString();
+    }
+  }
+  */
+
   private TopGroups slowGrouping(GroupDoc[] groupDocs,
                                  String searchTerm,
                                  boolean fillFields,
@@ -247,21 +259,25 @@ public class TestGrouping extends LuceneTestCase {
     int totalHitCount = 0;
     Set<BytesRef> knownGroups = new HashSet<BytesRef>();
 
+    //System.out.println("TEST: slowGrouping");
     for(GroupDoc d : groupDocs) {
       // TODO: would be better to filter by searchTerm before sorting!
       if (!d.content.equals(searchTerm)) {
         continue;
       }
       totalHitCount++;
+      //System.out.println("  match id=" + d.id);
 
       if (doAllGroups) {
         if (!knownGroups.contains(d.group)) {
           knownGroups.add(d.group);
+          //System.out.println("    add group=" + groupToString(d.group));
         }
       }
 
       List<GroupDoc> l = groups.get(d.group);
       if (l == null) {
+        //System.out.println("  add sortedGroup=" + groupToString(d.group));
         sortedGroups.add(d.group);
         if (fillFields) {
           sortedGroupFields.add(fillFields(d, groupSort));
@@ -322,6 +338,67 @@ public class TestGrouping extends LuceneTestCase {
     }
   }
 
+  private IndexReader getDocBlockReader(Directory dir, GroupDoc[] groupDocs) throws IOException {
+    // Coalesce by group, but in random order:
+    Collections.shuffle(Arrays.asList(groupDocs), random);
+    final Map<BytesRef,List<GroupDoc>> groupMap = new HashMap<BytesRef,List<GroupDoc>>();
+    final List<BytesRef> groupValues = new ArrayList<BytesRef>();
+
+    for(GroupDoc groupDoc : groupDocs) {
+      if (!groupMap.containsKey(groupDoc.group)) {
+        groupValues.add(groupDoc.group);
+        groupMap.put(groupDoc.group, new ArrayList<GroupDoc>());
+      }
+      groupMap.get(groupDoc.group).add(groupDoc);
+    }
+
+    RandomIndexWriter w = new RandomIndexWriter(
+                                                random,
+                                                dir,
+                                                newIndexWriterConfig(TEST_VERSION_CURRENT,
+                                                                     new MockAnalyzer(random)));
+
+    final List<List<Document>> updateDocs = new ArrayList<List<Document>>();
+    //System.out.println("TEST: index groups");
+    for(BytesRef group : groupValues) {
+      final List<Document> docs = new ArrayList<Document>();
+      //System.out.println("TEST:   group=" + (group == null ? "null" : group.utf8ToString()));
+      for(GroupDoc groupValue : groupMap.get(group)) {
+        Document doc = new Document();
+        docs.add(doc);
+        if (groupValue.group != null) {
+          doc.add(newField("group", groupValue.group.utf8ToString(), Field.Index.NOT_ANALYZED));
+        }
+        doc.add(newField("sort1", groupValue.sort1.utf8ToString(), Field.Index.NOT_ANALYZED));
+        doc.add(newField("sort2", groupValue.sort2.utf8ToString(), Field.Index.NOT_ANALYZED));
+        doc.add(new NumericField("id").setIntValue(groupValue.id));
+        doc.add(newField("content", groupValue.content, Field.Index.NOT_ANALYZED));
+        //System.out.println("TEST:     doc content=" + groupValue.content + " group=" + (groupValue.group == null ? "null" : groupValue.group.utf8ToString()) + " sort1=" + groupValue.sort1.utf8ToString() + " id=" + groupValue.id);
+      }
+      // So we can pull filter marking last doc in block:
+      final Field groupEnd = newField("groupend", "x", Field.Index.NOT_ANALYZED);
+      groupEnd.setOmitTermFreqAndPositions(true);
+      groupEnd.setOmitNorms(true);
+      docs.get(docs.size()-1).add(groupEnd);
+      // Add as a doc block:
+      w.addDocuments(docs);
+      if (group != null && random.nextInt(7) == 4) {
+        updateDocs.add(docs);
+      }
+    }
+
+    for(List<Document> docs : updateDocs) {
+      // Just replaces docs w/ same docs:
+      w.updateDocuments(new Term("group", docs.get(0).get("group")),
+                        docs);
+    }
+
+    final IndexReader r = w.getReader();
+    w.close();
+
+    return r;
+  }
+
   public void testRandom() throws Exception {
     for(int iter=0;iter<3;iter++) {
@@ -350,7 +427,7 @@ public class TestGrouping extends LuceneTestCase {
                                                     random,
                                                     dir,
                                                     newIndexWriterConfig(TEST_VERSION_CURRENT,
-                                                                         new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
+                                                                         new MockAnalyzer(random)));
 
     Document doc = new Document();
     Document docNoGroup = new Document();
@@ -405,152 +482,203 @@ public class TestGrouping extends LuceneTestCase {
       final IndexReader r = w.getReader();
       w.close();
 
+      // Build 2nd index, where docs are added in blocks by
+      // group, so we can use single pass collector
+      final Directory dir2 = newDirectory();
+      final IndexReader r2 = getDocBlockReader(dir2, groupDocs);
+      final Filter lastDocInBlock = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("groupend", "x"))));
+
       final IndexSearcher s = new IndexSearcher(r);
-
-      for(int searchIter=0;searchIter<100;searchIter++) {
-
-        if (VERBOSE) {
-          System.out.println("TEST: searchIter=" + searchIter);
-        }
-
-        final String searchTerm = contentStrings[random.nextInt(contentStrings.length)];
-        final boolean fillFields = random.nextBoolean();
-        final boolean getScores = random.nextBoolean();
-        final boolean getMaxScores = random.nextBoolean();
-        final Sort groupSort = getRandomSort();
-        // TODO: also test null (= sort by relevance)
-        final Sort docSort = getRandomSort();
-
-        final int topNGroups = _TestUtil.nextInt(random, 1, 30);
-        final int docsPerGroup = _TestUtil.nextInt(random, 1, 50);
-        final int groupOffset = _TestUtil.nextInt(random, 0, (topNGroups-1)/2);
-        //final int groupOffset = 0;
-
-        final int docOffset = _TestUtil.nextInt(random, 0, docsPerGroup-1);
-        //final int docOffset = 0;
-
-        final boolean doCache = random.nextBoolean();
-        final boolean doAllGroups = random.nextBoolean();
-        if (VERBOSE) {
-          System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup + " doAllGroups=" + doAllGroups);
-        }
-
-        final AllGroupsCollector allGroupsCollector;
-        if (doAllGroups) {
-          allGroupsCollector = new AllGroupsCollector("group");
-        } else {
-          allGroupsCollector = null;
-        }
-
-        final FirstPassGroupingCollector c1 = new FirstPassGroupingCollector("group", groupSort, groupOffset+topNGroups);
-        final CachingCollector cCache;
-        final Collector c;
-
-        final boolean useWrappingCollector = random.nextBoolean();
-
-        if (doCache) {
-          final double maxCacheMB = random.nextDouble();
-          if (VERBOSE) {
-            System.out.println("TEST: maxCacheMB=" + maxCacheMB);
-          }
-
-          if (useWrappingCollector) {
-            if (doAllGroups) {
-              cCache = CachingCollector.create(c1, true, maxCacheMB);
-              c = MultiCollector.wrap(cCache, allGroupsCollector);
-            } else {
-              c = cCache = CachingCollector.create(c1, true, maxCacheMB);
-            }
-          } else {
-            // Collect only into cache, then replay multiple times:
-            c = cCache = CachingCollector.create(false, true, maxCacheMB);
-          }
-        } else {
-          cCache = null;
-          if (doAllGroups) {
-            c = MultiCollector.wrap(c1, allGroupsCollector);
-          } else {
-            c = c1;
-          }
-        }
-
-        s.search(new TermQuery(new Term("content", searchTerm)), c);
-
-        if (doCache && !useWrappingCollector) {
-          if (cCache.isCached()) {
-            // Replay for first-pass grouping
-            cCache.replay(c1);
-            if (doAllGroups) {
-              // Replay for all groups:
-              cCache.replay(allGroupsCollector);
-            }
-          } else {
-            // Replay by re-running search:
-            s.search(new TermQuery(new Term("content", searchTerm)), c1);
-            if (doAllGroups) {
-              s.search(new TermQuery(new Term("content", searchTerm)), allGroupsCollector);
-            }
-          }
-        }
-
-        final Collection<SearchGroup> topGroups = c1.getTopGroups(groupOffset, fillFields);
-        final TopGroups groupsResult;
-
-        if (topGroups != null) {
-
-          if (VERBOSE) {
-            System.out.println("TEST: topGroups");
-            for (SearchGroup searchGroup : topGroups) {
-              System.out.println("  " + (searchGroup.groupValue == null ? "null" : searchGroup.groupValue.utf8ToString()) + ": " + Arrays.deepToString(searchGroup.sortValues));
-            }
-          }
-
-          final SecondPassGroupingCollector c2 = new SecondPassGroupingCollector("group", topGroups, groupSort, docSort, docOffset+docsPerGroup, getScores, getMaxScores, fillFields);
-          if (doCache) {
-            if (cCache.isCached()) {
-              if (VERBOSE) {
-                System.out.println("TEST: cache is intact");
-              }
-              cCache.replay(c2);
-            } else {
-              if (VERBOSE) {
-                System.out.println("TEST: cache was too large");
-              }
-              s.search(new TermQuery(new Term("content", searchTerm)), c2);
-            }
-          } else {
-            s.search(new TermQuery(new Term("content", searchTerm)), c2);
-          }
-
-          if (doAllGroups) {
-            TopGroups tempTopGroups = c2.getTopGroups(docOffset);
-            groupsResult = new TopGroups(tempTopGroups, allGroupsCollector.getGroupCount());
-          } else {
-            groupsResult = c2.getTopGroups(docOffset);
-          }
-        } else {
-          groupsResult = null;
-          if (VERBOSE) {
-            System.out.println("TEST: no results");
-          }
-        }
-
-        final TopGroups expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, doAllGroups, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset);
-
-        try {
-          // NOTE: intentional but temporary field cache insanity!
-          assertEquals(FieldCache.DEFAULT.getInts(r, "id"), expectedGroups, groupsResult);
-        } finally {
-          FieldCache.DEFAULT.purge(r);
-        }
-      }
+      final IndexSearcher s2 = new IndexSearcher(r2);
+
+      final int[] docIDToID = FieldCache.DEFAULT.getInts(r, "id");
+      final int[] docIDToID2 = FieldCache.DEFAULT.getInts(r2, "id");
+
+      try {
+        for(int searchIter=0;searchIter<100;searchIter++) {
+
+          if (VERBOSE) {
+            System.out.println("TEST: searchIter=" + searchIter);
+          }
+
+          final String searchTerm = contentStrings[random.nextInt(contentStrings.length)];
+          final boolean fillFields = random.nextBoolean();
+          final boolean getScores = random.nextBoolean();
+          final boolean getMaxScores = random.nextBoolean();
+          final Sort groupSort = getRandomSort();
+          //final Sort groupSort = new Sort(new SortField[] {new SortField("sort1", SortField.STRING), new SortField("id", SortField.INT)});
+          // TODO: also test null (= sort by relevance)
+          final Sort docSort = getRandomSort();
+
+          final int topNGroups = _TestUtil.nextInt(random, 1, 30);
+          //final int topNGroups = 4;
+          final int docsPerGroup = _TestUtil.nextInt(random, 1, 50);
+          final int groupOffset = _TestUtil.nextInt(random, 0, (topNGroups-1)/2);
+          //final int groupOffset = 0;
+
+          final int docOffset = _TestUtil.nextInt(random, 0, docsPerGroup-1);
+          //final int docOffset = 0;
+
+          final boolean doCache = random.nextBoolean();
+          final boolean doAllGroups = random.nextBoolean();
+          if (VERBOSE) {
+            System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup + " doAllGroups=" + doAllGroups);
+          }
+
+          final AllGroupsCollector allGroupsCollector;
+          if (doAllGroups) {
+            allGroupsCollector = new AllGroupsCollector("group");
+          } else {
+            allGroupsCollector = null;
+          }
+
+          final FirstPassGroupingCollector c1 = new FirstPassGroupingCollector("group", groupSort, groupOffset+topNGroups);
+          final CachingCollector cCache;
+          final Collector c;
+
+          final boolean useWrappingCollector = random.nextBoolean();
+
+          if (doCache) {
+            final double maxCacheMB = random.nextDouble();
+            if (VERBOSE) {
+              System.out.println("TEST: maxCacheMB=" + maxCacheMB);
+            }
+
+            if (useWrappingCollector) {
+              if (doAllGroups) {
+                cCache = CachingCollector.create(c1, true, maxCacheMB);
+                c = MultiCollector.wrap(cCache, allGroupsCollector);
+              } else {
+                c = cCache = CachingCollector.create(c1, true, maxCacheMB);
+              }
+            } else {
+              // Collect only into cache, then replay multiple times:
+              c = cCache = CachingCollector.create(false, true, maxCacheMB);
+            }
+          } else {
+            cCache = null;
+            if (doAllGroups) {
+              c = MultiCollector.wrap(c1, allGroupsCollector);
+            } else {
+              c = c1;
+            }
+          }
+
+          s.search(new TermQuery(new Term("content", searchTerm)), c);
+
+          if (doCache && !useWrappingCollector) {
+            if (cCache.isCached()) {
+              // Replay for first-pass grouping
+              cCache.replay(c1);
+              if (doAllGroups) {
+                // Replay for all groups:
+                cCache.replay(allGroupsCollector);
+              }
+            } else {
+              // Replay by re-running search:
+              s.search(new TermQuery(new Term("content", searchTerm)), c1);
+              if (doAllGroups) {
+                s.search(new TermQuery(new Term("content", searchTerm)), allGroupsCollector);
+              }
+            }
+          }
+
+          final Collection<SearchGroup> topGroups = c1.getTopGroups(groupOffset, fillFields);
+          final TopGroups groupsResult;
+
+          if (topGroups != null) {
+
+            if (VERBOSE) {
+              System.out.println("TEST: topGroups");
+              for (SearchGroup searchGroup : topGroups) {
+                System.out.println("  " + (searchGroup.groupValue == null ? "null" : searchGroup.groupValue.utf8ToString()) + ": " + Arrays.deepToString(searchGroup.sortValues));
+              }
+            }
+
+            final SecondPassGroupingCollector c2 = new SecondPassGroupingCollector("group", topGroups, groupSort, docSort, docOffset+docsPerGroup, getScores, getMaxScores, fillFields);
+            if (doCache) {
+              if (cCache.isCached()) {
+                if (VERBOSE) {
+                  System.out.println("TEST: cache is intact");
+                }
+                cCache.replay(c2);
+              } else {
+                if (VERBOSE) {
+                  System.out.println("TEST: cache was too large");
+                }
+                s.search(new TermQuery(new Term("content", searchTerm)), c2);
+              }
+            } else {
+              s.search(new TermQuery(new Term("content", searchTerm)), c2);
+            }
+
+            if (doAllGroups) {
+              TopGroups tempTopGroups = c2.getTopGroups(docOffset);
+              groupsResult = new TopGroups(tempTopGroups, allGroupsCollector.getGroupCount());
+            } else {
+              groupsResult = c2.getTopGroups(docOffset);
+            }
+          } else {
+            groupsResult = null;
+            if (VERBOSE) {
+              System.out.println("TEST: no results");
+            }
+          }
+
+          final TopGroups expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, doAllGroups, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset);
+
+          if (VERBOSE) {
+            if (expectedGroups == null) {
+              System.out.println("TEST: no expected groups");
+            } else {
+              System.out.println("TEST: expected groups");
+              for(GroupDocs gd : expectedGroups.groups) {
+                System.out.println("  group=" + (gd.groupValue == null ? "null" : gd.groupValue.utf8ToString()));
+                for(ScoreDoc sd : gd.scoreDocs) {
+                  System.out.println("    id=" + sd.doc);
+                }
+              }
+            }
+          }
+
+          // NOTE: intentional but temporary field cache insanity!
+          assertEquals(docIDToID, expectedGroups, groupsResult, true);
+
+          final boolean needsScores = getScores || getMaxScores || docSort == null;
+          final BlockGroupingCollector c3 = new BlockGroupingCollector(groupSort, groupOffset+topNGroups, needsScores, lastDocInBlock);
+          final AllGroupsCollector allGroupsCollector2;
+          final Collector c4;
+          if (doAllGroups) {
+            allGroupsCollector2 = new AllGroupsCollector("group");
+            c4 = MultiCollector.wrap(c3, allGroupsCollector2);
+          } else {
+            allGroupsCollector2 = null;
+            c4 = c3;
+          }
+          s2.search(new TermQuery(new Term("content", searchTerm)), c4);
+          final TopGroups tempTopGroups2 = c3.getTopGroups(docSort, groupOffset, docOffset, docOffset+docsPerGroup, fillFields);
+          final TopGroups groupsResult2;
+          if (doAllGroups && tempTopGroups2 != null) {
+            assertEquals((int) tempTopGroups2.totalGroupCount, allGroupsCollector2.getGroupCount());
+            groupsResult2 = new TopGroups(tempTopGroups2, allGroupsCollector2.getGroupCount());
+          } else {
+            groupsResult2 = tempTopGroups2;
+          }
+          assertEquals(docIDToID2, expectedGroups, groupsResult2, false);
+        }
+      } finally {
+        FieldCache.DEFAULT.purge(r);
+        FieldCache.DEFAULT.purge(r2);
+      }
 
       r.close();
       dir.close();
+
+      r2.close();
+      dir2.close();
     }
   }
 
-  private void assertEquals(int[] docIDtoID, TopGroups expected, TopGroups actual) {
+  private void assertEquals(int[] docIDtoID, TopGroups expected, TopGroups actual, boolean verifyGroupValues) {
     if (expected == null) {
       assertNull(actual);
       return;
@@ -570,7 +698,9 @@ public class TestGrouping extends LuceneTestCase {
       }
       final GroupDocs expectedGroup = expected.groups[groupIDX];
       final GroupDocs actualGroup = actual.groups[groupIDX];
-      assertEquals(expectedGroup.groupValue, actualGroup.groupValue);
+      if (verifyGroupValues) {
+        assertEquals(expectedGroup.groupValue, actualGroup.groupValue);
+      }
       assertArrayEquals(expectedGroup.groupSortValues, actualGroup.groupSortValues);
 
       // TODO