mirror of https://github.com/apache/lucene.git
LUCENE-3099: allow subclasses to determine the group value
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1130858 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
710e630e91
commit
d1548ca30a
|
@ -75,6 +75,10 @@ API Changes
|
|||
* LUCENE-3141: add getter method to access fragInfos in FieldFragList.
|
||||
(Sujit Pal via Koji Sekiguchi)
|
||||
|
||||
* LUCENE-3099: Allow subclasses to determine the group value for
|
||||
First/SecondPassGroupingCollector. (Martijn van Groningen, Mike
|
||||
McCandless)
|
||||
|
||||
Build
|
||||
|
||||
* LUCENE-3149: Upgrade contrib/icu's ICU jar file to ICU 4.8.
|
||||
|
|
|
@ -0,0 +1,67 @@
|
|||
package org.apache.lucene.search.grouping;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.Collector;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
|
||||
/**
|
||||
* A collector that collects all groups that match the
|
||||
* query. Only the group value is collected, and the order
|
||||
* is undefined. This collector does not determine
|
||||
* the most relevant document of a group.
|
||||
*
|
||||
* <p/>
|
||||
* This is an abstract version. Concrete implementations define
|
||||
* what a group actually is and how it is internally collected.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class AbstractAllGroupsCollector<GROUP_VALUE_TYPE> extends Collector {
|
||||
|
||||
/**
|
||||
* Returns the total number of groups for the executed search.
|
||||
* This is a convenience method. The following code snippet has the same effect: <pre>getGroups().size()</pre>
|
||||
*
|
||||
* @return The total number of groups for the executed search
|
||||
*/
|
||||
public int getGroupCount() {
|
||||
return getGroups().size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the group values
|
||||
* <p/>
|
||||
* This is an unordered collections of group values. For each group that matched the query there is a {@link BytesRef}
|
||||
* representing a group value.
|
||||
*
|
||||
* @return the group values
|
||||
*/
|
||||
public abstract Collection<GROUP_VALUE_TYPE> getGroups();
|
||||
|
||||
// Empty not necessary
|
||||
public void setScorer(Scorer scorer) throws IOException {}
|
||||
|
||||
public boolean acceptsDocsOutOfOrder() {
|
||||
return true;
|
||||
}
|
||||
}
|
|
@ -17,56 +17,39 @@ package org.apache.lucene.search.grouping;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.search.Collector;
|
||||
import org.apache.lucene.search.FieldCache;
|
||||
import org.apache.lucene.search.FieldComparator;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.search.SortField;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.search.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/** FirstPassGroupingCollector is the first of two passes necessary
|
||||
* to collect grouped hits. This pass gathers the top N sorted
|
||||
* groups.
|
||||
* groups. Concrete subclasses define what a group is and how it
|
||||
* is internally collected.
|
||||
*
|
||||
* <p>See {@link org.apache.lucene.search.grouping} for more
|
||||
* details including a full code example.</p>
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
abstract public class AbstractFirstPassGroupingCollector<GROUP_VALUE_TYPE> extends Collector {
|
||||
|
||||
public class FirstPassGroupingCollector extends Collector {
|
||||
|
||||
private final String groupField;
|
||||
private final Sort groupSort;
|
||||
private final FieldComparator[] comparators;
|
||||
private final int[] reversed;
|
||||
private final int topNGroups;
|
||||
private final HashMap<BytesRef, CollectedSearchGroup> groupMap;
|
||||
private final BytesRef scratchBytesRef = new BytesRef();
|
||||
private final HashMap<GROUP_VALUE_TYPE, CollectedSearchGroup<GROUP_VALUE_TYPE>> groupMap;
|
||||
private final int compIDXEnd;
|
||||
|
||||
// Set once we reach topNGroups unique groups:
|
||||
private TreeSet<CollectedSearchGroup> orderedGroups;
|
||||
private TreeSet<CollectedSearchGroup<GROUP_VALUE_TYPE>> orderedGroups;
|
||||
private int docBase;
|
||||
private int spareSlot;
|
||||
private FieldCache.DocTermsIndex index;
|
||||
|
||||
/**
|
||||
* Create the first pass collector.
|
||||
*
|
||||
* @param groupField The field used to group
|
||||
* documents. This field must be single-valued and
|
||||
* indexed (FieldCache is used to access its value
|
||||
* per-document).
|
||||
* @param groupSort The {@link Sort} used to sort the
|
||||
* groups. The top sorted document within each group
|
||||
* according to groupSort, determines how that group
|
||||
|
@ -74,13 +57,13 @@ public class FirstPassGroupingCollector extends Collector {
|
|||
* ie, if you want to groupSort by relevance use
|
||||
* Sort.RELEVANCE.
|
||||
* @param topNGroups How many top groups to keep.
|
||||
* @throws IOException If I/O related errors occur
|
||||
*/
|
||||
public FirstPassGroupingCollector(String groupField, Sort groupSort, int topNGroups) throws IOException {
|
||||
public AbstractFirstPassGroupingCollector(Sort groupSort, int topNGroups) throws IOException {
|
||||
if (topNGroups < 1) {
|
||||
throw new IllegalArgumentException("topNGroups must be >= 1 (got " + topNGroups + ")");
|
||||
}
|
||||
|
||||
this.groupField = groupField;
|
||||
// TODO: allow null groupSort to mean "by relevance",
|
||||
// and specialize it?
|
||||
this.groupSort = groupSort;
|
||||
|
@ -100,13 +83,19 @@ public class FirstPassGroupingCollector extends Collector {
|
|||
}
|
||||
|
||||
spareSlot = topNGroups;
|
||||
groupMap = new HashMap<BytesRef, CollectedSearchGroup>(topNGroups);
|
||||
groupMap = new HashMap<GROUP_VALUE_TYPE, CollectedSearchGroup<GROUP_VALUE_TYPE>>(topNGroups);
|
||||
}
|
||||
|
||||
/** Returns top groups, starting from offset. This may
|
||||
* return null, if no groups were collected, or if the
|
||||
* number of unique groups collected is <= offset. */
|
||||
public Collection<SearchGroup> getTopGroups(int groupOffset, boolean fillFields) {
|
||||
/**
|
||||
* Returns top groups, starting from offset. This may
|
||||
* return null, if no groups were collected, or if the
|
||||
* number of unique groups collected is <= offset.
|
||||
*
|
||||
* @param groupOffset The offset in the collected groups
|
||||
* @param fillFields Whether to fill {@link SearchGroup#sortValues}
|
||||
* @return top groups, starting from offset
|
||||
*/
|
||||
public Collection<SearchGroup<GROUP_VALUE_TYPE>> getTopGroups(int groupOffset, boolean fillFields) {
|
||||
|
||||
//System.out.println("FP.getTopGroups groupOffset=" + groupOffset + " fillFields=" + fillFields + " groupMap.size()=" + groupMap.size());
|
||||
|
||||
|
@ -122,15 +111,15 @@ public class FirstPassGroupingCollector extends Collector {
|
|||
buildSortedSet();
|
||||
}
|
||||
|
||||
final Collection<SearchGroup> result = new ArrayList<SearchGroup>();
|
||||
final Collection<SearchGroup<GROUP_VALUE_TYPE>> result = new ArrayList<SearchGroup<GROUP_VALUE_TYPE>>();
|
||||
int upto = 0;
|
||||
final int sortFieldCount = groupSort.getSort().length;
|
||||
for(CollectedSearchGroup group : orderedGroups) {
|
||||
for(CollectedSearchGroup<GROUP_VALUE_TYPE> group : orderedGroups) {
|
||||
if (upto++ < groupOffset) {
|
||||
continue;
|
||||
}
|
||||
//System.out.println(" group=" + (group.groupValue == null ? "null" : group.groupValue.utf8ToString()));
|
||||
SearchGroup searchGroup = new SearchGroup();
|
||||
SearchGroup<GROUP_VALUE_TYPE> searchGroup = new SearchGroup<GROUP_VALUE_TYPE>();
|
||||
searchGroup.groupValue = group.groupValue;
|
||||
if (fillFields) {
|
||||
searchGroup.sortValues = new Comparable[sortFieldCount];
|
||||
|
@ -144,10 +133,6 @@ public class FirstPassGroupingCollector extends Collector {
|
|||
return result;
|
||||
}
|
||||
|
||||
public String getGroupField() {
|
||||
return groupField;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setScorer(Scorer scorer) throws IOException {
|
||||
for (FieldComparator comparator : comparators) {
|
||||
|
@ -189,13 +174,9 @@ public class FirstPassGroupingCollector extends Collector {
|
|||
// TODO: should we add option to mean "ignore docs that
|
||||
// don't have the group field" (instead of stuffing them
|
||||
// under null group)?
|
||||
final int ord = index.getOrd(doc);
|
||||
//System.out.println(" ord=" + ord);
|
||||
final GROUP_VALUE_TYPE groupValue = getDocGroupValue(doc);
|
||||
|
||||
final BytesRef br = ord == 0 ? null : index.lookup(ord, scratchBytesRef);
|
||||
//System.out.println(" group=" + (br == null ? "null" : br.utf8ToString()));
|
||||
|
||||
final CollectedSearchGroup group = groupMap.get(br);
|
||||
final CollectedSearchGroup<GROUP_VALUE_TYPE> group = groupMap.get(groupValue);
|
||||
|
||||
if (group == null) {
|
||||
|
||||
|
@ -210,8 +191,8 @@ public class FirstPassGroupingCollector extends Collector {
|
|||
// just keep collecting them
|
||||
|
||||
// Add a new CollectedSearchGroup:
|
||||
CollectedSearchGroup sg = new CollectedSearchGroup();
|
||||
sg.groupValue = ord == 0 ? null : new BytesRef(scratchBytesRef);
|
||||
CollectedSearchGroup<GROUP_VALUE_TYPE> sg = new CollectedSearchGroup<GROUP_VALUE_TYPE>();
|
||||
sg.groupValue = copyDocGroupValue(groupValue, null);
|
||||
sg.comparatorSlot = groupMap.size();
|
||||
sg.topDoc = docBase + doc;
|
||||
for (FieldComparator fc : comparators) {
|
||||
|
@ -233,20 +214,14 @@ public class FirstPassGroupingCollector extends Collector {
|
|||
// the bottom group with this new group.
|
||||
|
||||
// java 6-only: final CollectedSearchGroup bottomGroup = orderedGroups.pollLast();
|
||||
final CollectedSearchGroup bottomGroup = orderedGroups.last();
|
||||
final CollectedSearchGroup<GROUP_VALUE_TYPE> bottomGroup = orderedGroups.last();
|
||||
orderedGroups.remove(bottomGroup);
|
||||
assert orderedGroups.size() == topNGroups -1;
|
||||
|
||||
groupMap.remove(bottomGroup.groupValue);
|
||||
|
||||
// reuse the removed CollectedSearchGroup
|
||||
if (br == null) {
|
||||
bottomGroup.groupValue = null;
|
||||
} else if (bottomGroup.groupValue != null) {
|
||||
bottomGroup.groupValue.copy(br);
|
||||
} else {
|
||||
bottomGroup.groupValue = new BytesRef(br);
|
||||
}
|
||||
bottomGroup.groupValue = copyDocGroupValue(groupValue, bottomGroup.groupValue);
|
||||
bottomGroup.topDoc = docBase + doc;
|
||||
|
||||
for (FieldComparator fc : comparators) {
|
||||
|
@ -291,7 +266,7 @@ public class FirstPassGroupingCollector extends Collector {
|
|||
// Remove before updating the group since lookup is done via comparators
|
||||
// TODO: optimize this
|
||||
|
||||
final CollectedSearchGroup prevLast;
|
||||
final CollectedSearchGroup<GROUP_VALUE_TYPE> prevLast;
|
||||
if (orderedGroups != null) {
|
||||
prevLast = orderedGroups.last();
|
||||
orderedGroups.remove(group);
|
||||
|
@ -336,7 +311,7 @@ public class FirstPassGroupingCollector extends Collector {
|
|||
}
|
||||
};
|
||||
|
||||
orderedGroups = new TreeSet<CollectedSearchGroup>(comparator);
|
||||
orderedGroups = new TreeSet<CollectedSearchGroup<GROUP_VALUE_TYPE>>(comparator);
|
||||
orderedGroups.addAll(groupMap.values());
|
||||
assert orderedGroups.size() > 0;
|
||||
|
||||
|
@ -353,15 +328,31 @@ public class FirstPassGroupingCollector extends Collector {
|
|||
@Override
|
||||
public void setNextReader(AtomicReaderContext readerContext) throws IOException {
|
||||
docBase = readerContext.docBase;
|
||||
index = FieldCache.DEFAULT.getTermsIndex(readerContext.reader, groupField);
|
||||
|
||||
for (int i=0; i<comparators.length; i++) {
|
||||
comparators[i] = comparators[i].setNextReader(readerContext);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the group value for the specified doc.
|
||||
*
|
||||
* @param doc The specified doc
|
||||
* @return the group value for the specified doc
|
||||
*/
|
||||
protected abstract GROUP_VALUE_TYPE getDocGroupValue(int doc);
|
||||
|
||||
/**
|
||||
* Returns a copy of the specified group value by creating a new instance and copying the value from the specified
|
||||
* groupValue into the new instance. Optionally, the reuse argument can be supplied so the group value is copied into it instead.
|
||||
*
|
||||
* @param groupValue The group value to copy
|
||||
* @param reuse Optionally a reuse instance to prevent a new instance creation
|
||||
* @return a copy of the specified group value
|
||||
*/
|
||||
protected abstract GROUP_VALUE_TYPE copyDocGroupValue(GROUP_VALUE_TYPE groupValue, GROUP_VALUE_TYPE reuse);
|
||||
}
|
||||
|
||||
class CollectedSearchGroup extends SearchGroup {
|
||||
class CollectedSearchGroup<T> extends SearchGroup<T> {
|
||||
int topDoc;
|
||||
int comparatorSlot;
|
||||
}
|
|
@ -0,0 +1,156 @@
|
|||
package org.apache.lucene.search.grouping;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.search.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* SecondPassGroupingCollector is the second of two passes
|
||||
* necessary to collect grouped docs. This pass gathers the
|
||||
* top N documents per top group computed from the
|
||||
* first pass. Concrete subclasses define what a group is and how it
|
||||
* is internally collected.
|
||||
*
|
||||
* <p>See {@link org.apache.lucene.search.grouping} for more
|
||||
* details including a full code example.</p>
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class AbstractSecondPassGroupingCollector<GROUP_VALUE_TYPE> extends Collector {
|
||||
|
||||
protected final Map<GROUP_VALUE_TYPE, SearchGroupDocs<GROUP_VALUE_TYPE>> groupMap;
|
||||
private final int maxDocsPerGroup;
|
||||
protected SearchGroupDocs<GROUP_VALUE_TYPE>[] groupDocs;
|
||||
private final Collection<SearchGroup<GROUP_VALUE_TYPE>> groups;
|
||||
private final Sort withinGroupSort;
|
||||
private final Sort groupSort;
|
||||
|
||||
private int totalHitCount;
|
||||
private int totalGroupedHitCount;
|
||||
|
||||
public AbstractSecondPassGroupingCollector(Collection<SearchGroup<GROUP_VALUE_TYPE>> groups, Sort groupSort, Sort withinGroupSort,
|
||||
int maxDocsPerGroup, boolean getScores, boolean getMaxScores, boolean fillSortFields)
|
||||
throws IOException {
|
||||
|
||||
//System.out.println("SP init");
|
||||
if (groups.size() == 0) {
|
||||
throw new IllegalArgumentException("no groups to collect (groups.size() is 0)");
|
||||
}
|
||||
|
||||
this.groupSort = groupSort;
|
||||
this.withinGroupSort = withinGroupSort;
|
||||
this.groups = groups;
|
||||
this.maxDocsPerGroup = maxDocsPerGroup;
|
||||
groupMap = new HashMap<GROUP_VALUE_TYPE, SearchGroupDocs<GROUP_VALUE_TYPE>>(groups.size());
|
||||
|
||||
for (SearchGroup<GROUP_VALUE_TYPE> group : groups) {
|
||||
//System.out.println(" prep group=" + (group.groupValue == null ? "null" : group.groupValue.utf8ToString()));
|
||||
final TopDocsCollector collector;
|
||||
if (withinGroupSort == null) {
|
||||
// Sort by score
|
||||
collector = TopScoreDocCollector.create(maxDocsPerGroup, true);
|
||||
} else {
|
||||
// Sort by fields
|
||||
collector = TopFieldCollector.create(withinGroupSort, maxDocsPerGroup, fillSortFields, getScores, getMaxScores, true);
|
||||
}
|
||||
groupMap.put(group.groupValue,
|
||||
new SearchGroupDocs<GROUP_VALUE_TYPE>(group.groupValue,
|
||||
collector));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setScorer(Scorer scorer) throws IOException {
|
||||
for (SearchGroupDocs<GROUP_VALUE_TYPE> group : groupMap.values()) {
|
||||
group.collector.setScorer(scorer);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void collect(int doc) throws IOException {
|
||||
totalHitCount++;
|
||||
SearchGroupDocs<GROUP_VALUE_TYPE> group = retrieveGroup(doc);
|
||||
if (group != null) {
|
||||
totalGroupedHitCount++;
|
||||
group.collector.collect(doc);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the group the specified doc belongs to or <code>null</code> if no group could be retrieved.
|
||||
*
|
||||
* @param doc The specified doc
|
||||
* @return the group the specified doc belongs to or <code>null</code> if no group could be retrieved
|
||||
* @throws IOException If an I/O related error occurred
|
||||
*/
|
||||
protected abstract SearchGroupDocs<GROUP_VALUE_TYPE> retrieveGroup(int doc) throws IOException;
|
||||
|
||||
@Override
|
||||
public void setNextReader(AtomicReaderContext readerContext) throws IOException {
|
||||
//System.out.println("SP.setNextReader");
|
||||
for (SearchGroupDocs<GROUP_VALUE_TYPE> group : groupMap.values()) {
|
||||
group.collector.setNextReader(readerContext);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean acceptsDocsOutOfOrder() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public TopGroups<GROUP_VALUE_TYPE> getTopGroups(int withinGroupOffset) {
|
||||
@SuppressWarnings("unchecked")
|
||||
final GroupDocs<GROUP_VALUE_TYPE>[] groupDocsResult = (GroupDocs<GROUP_VALUE_TYPE>[]) new GroupDocs[groups.size()];
|
||||
|
||||
int groupIDX = 0;
|
||||
for(SearchGroup group : groups) {
|
||||
final SearchGroupDocs<GROUP_VALUE_TYPE> groupDocs = groupMap.get(group.groupValue);
|
||||
final TopDocs topDocs = groupDocs.collector.topDocs(withinGroupOffset, maxDocsPerGroup);
|
||||
groupDocsResult[groupIDX++] = new GroupDocs<GROUP_VALUE_TYPE>(topDocs.getMaxScore(),
|
||||
topDocs.totalHits,
|
||||
topDocs.scoreDocs,
|
||||
groupDocs.groupValue,
|
||||
group.sortValues);
|
||||
}
|
||||
|
||||
return new TopGroups<GROUP_VALUE_TYPE>(groupSort.getSort(),
|
||||
withinGroupSort == null ? null : withinGroupSort.getSort(),
|
||||
totalHitCount, totalGroupedHitCount, groupDocsResult);
|
||||
}
|
||||
|
||||
|
||||
// TODO: merge with SearchGroup or not?
|
||||
// ad: don't need to build a new hashmap
|
||||
// disad: blows up the size of SearchGroup if we need many of them, and couples implementations
|
||||
public class SearchGroupDocs<GROUP_VALUE_TYPE> {
|
||||
|
||||
public final GROUP_VALUE_TYPE groupValue;
|
||||
public final TopDocsCollector collector;
|
||||
|
||||
public SearchGroupDocs(GROUP_VALUE_TYPE groupValue, TopDocsCollector collector) {
|
||||
this.groupValue = groupValue;
|
||||
this.collector = collector;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -49,7 +49,7 @@ import org.apache.lucene.util.PriorityQueue;
|
|||
* being that the documents in each group must always be
|
||||
* indexed as a block. This collector also fills in
|
||||
* TopGroups.totalGroupCount without requiring the separate
|
||||
* {@link AllGroupsCollector}. However, this collector does
|
||||
* {@link TermAllGroupsCollector}. However, this collector does
|
||||
* not fill in the groupValue of each group; this field
|
||||
* will always be null.
|
||||
*
|
||||
|
@ -317,7 +317,8 @@ public class BlockGroupingCollector extends Collector {
|
|||
|
||||
final FakeScorer fakeScorer = new FakeScorer();
|
||||
|
||||
final GroupDocs[] groups = new GroupDocs[groupQueue.size() - groupOffset];
|
||||
@SuppressWarnings("unchecked")
|
||||
final GroupDocs<Object>[] groups = new GroupDocs[groupQueue.size() - groupOffset];
|
||||
for(int downTo=groupQueue.size()-groupOffset-1;downTo>=0;downTo--) {
|
||||
final OneGroup og = groupQueue.pop();
|
||||
|
||||
|
@ -360,7 +361,7 @@ public class BlockGroupingCollector extends Collector {
|
|||
|
||||
final TopDocs topDocs = collector.topDocs(withinGroupOffset, maxDocsPerGroup);
|
||||
|
||||
groups[downTo] = new GroupDocs(topDocs.getMaxScore(),
|
||||
groups[downTo] = new GroupDocs<Object>(topDocs.getMaxScore(),
|
||||
og.count,
|
||||
topDocs.scoreDocs,
|
||||
null,
|
||||
|
@ -375,7 +376,7 @@ public class BlockGroupingCollector extends Collector {
|
|||
}
|
||||
*/
|
||||
|
||||
return new TopGroups(new TopGroups(groupSort.getSort(),
|
||||
return new TopGroups<Object>(new TopGroups<Object>(groupSort.getSort(),
|
||||
withinGroupSort == null ? null : withinGroupSort.getSort(),
|
||||
totalHitCount, totalGroupedHitCount, groups),
|
||||
totalGroupCount);
|
||||
|
|
|
@ -18,15 +18,14 @@ package org.apache.lucene.search.grouping;
|
|||
*/
|
||||
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/** Represents one group in the results.
|
||||
*
|
||||
* @lucene.experimental */
|
||||
public class GroupDocs {
|
||||
public class GroupDocs<GROUP_VALUE_TYPE> {
|
||||
/** The groupField value for all docs in this group; this
|
||||
* may be null if hits did not have the groupField. */
|
||||
public final BytesRef groupValue;
|
||||
public final GROUP_VALUE_TYPE groupValue;
|
||||
|
||||
/** Max score in this group */
|
||||
public final float maxScore;
|
||||
|
@ -40,13 +39,13 @@ public class GroupDocs {
|
|||
public final int totalHits;
|
||||
|
||||
/** Matches the groupSort passed to {@link
|
||||
* FirstPassGroupingCollector}. */
|
||||
* AbstractFirstPassGroupingCollector}. */
|
||||
public final Comparable[] groupSortValues;
|
||||
|
||||
public GroupDocs(float maxScore,
|
||||
int totalHits,
|
||||
ScoreDoc[] scoreDocs,
|
||||
BytesRef groupValue,
|
||||
GROUP_VALUE_TYPE groupValue,
|
||||
Comparable[] groupSortValues) {
|
||||
this.maxScore = maxScore;
|
||||
this.totalHits = totalHits;
|
||||
|
|
|
@ -17,10 +17,16 @@ package org.apache.lucene.search.grouping;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
/**
|
||||
* Represents a group that is found during the first pass search.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class SearchGroup<GROUP_VALUE_TYPE> {
|
||||
|
||||
/** @lucene.experimental */
|
||||
public class SearchGroup {
|
||||
public BytesRef groupValue;
|
||||
/** The value that defines this group */
|
||||
public GROUP_VALUE_TYPE groupValue;
|
||||
|
||||
/** The sort values used during sorting. Can be <code>null</code>. */
|
||||
public Comparable[] sortValues;
|
||||
}
|
||||
|
|
|
@ -1,172 +0,0 @@
|
|||
package org.apache.lucene.search.grouping;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.search.Collector;
|
||||
import org.apache.lucene.search.FieldCache;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.search.TopDocsCollector;
|
||||
import org.apache.lucene.search.TopFieldCollector;
|
||||
import org.apache.lucene.search.TopScoreDocCollector;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/**
|
||||
* SecondPassGroupingCollector is the second of two passes
|
||||
* necessary to collect grouped docs. This pass gathers the
|
||||
* top N documents per top group computed from the
|
||||
* first pass.
|
||||
*
|
||||
* <p>See {@link org.apache.lucene.search.grouping} for more
|
||||
* details including a full code example.</p>
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class SecondPassGroupingCollector extends Collector {
|
||||
private final HashMap<BytesRef, SearchGroupDocs> groupMap;
|
||||
|
||||
private FieldCache.DocTermsIndex index;
|
||||
private final String groupField;
|
||||
private final int maxDocsPerGroup;
|
||||
private final SentinelIntSet ordSet;
|
||||
private final SearchGroupDocs[] groupDocs;
|
||||
private final BytesRef spareBytesRef = new BytesRef();
|
||||
private final Collection<SearchGroup> groups;
|
||||
private final Sort withinGroupSort;
|
||||
private final Sort groupSort;
|
||||
|
||||
private int totalHitCount;
|
||||
private int totalGroupedHitCount;
|
||||
|
||||
public SecondPassGroupingCollector(String groupField, Collection<SearchGroup> groups, Sort groupSort, Sort withinGroupSort,
|
||||
int maxDocsPerGroup, boolean getScores, boolean getMaxScores, boolean fillSortFields)
|
||||
throws IOException {
|
||||
|
||||
//System.out.println("SP init");
|
||||
if (groups.size() == 0) {
|
||||
throw new IllegalArgumentException("no groups to collect (groups.size() is 0)");
|
||||
}
|
||||
|
||||
this.groupSort = groupSort;
|
||||
this.withinGroupSort = withinGroupSort;
|
||||
this.groups = groups;
|
||||
this.groupField = groupField;
|
||||
this.maxDocsPerGroup = maxDocsPerGroup;
|
||||
|
||||
groupMap = new HashMap<BytesRef, SearchGroupDocs>(groups.size());
|
||||
|
||||
for (SearchGroup group : groups) {
|
||||
//System.out.println(" prep group=" + (group.groupValue == null ? "null" : group.groupValue.utf8ToString()));
|
||||
final TopDocsCollector collector;
|
||||
if (withinGroupSort == null) {
|
||||
// Sort by score
|
||||
collector = TopScoreDocCollector.create(maxDocsPerGroup, true);
|
||||
} else {
|
||||
// Sort by fields
|
||||
collector = TopFieldCollector.create(withinGroupSort, maxDocsPerGroup, fillSortFields, getScores, getMaxScores, true);
|
||||
}
|
||||
groupMap.put(group.groupValue,
|
||||
new SearchGroupDocs(group.groupValue,
|
||||
collector));
|
||||
}
|
||||
|
||||
ordSet = new SentinelIntSet(groupMap.size(), -1);
|
||||
groupDocs = new SearchGroupDocs[ordSet.keys.length];
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setScorer(Scorer scorer) throws IOException {
|
||||
for (SearchGroupDocs group : groupMap.values()) {
|
||||
group.collector.setScorer(scorer);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void collect(int doc) throws IOException {
|
||||
final int slot = ordSet.find(index.getOrd(doc));
|
||||
//System.out.println("SP.collect doc=" + doc + " slot=" + slot);
|
||||
totalHitCount++;
|
||||
if (slot >= 0) {
|
||||
totalGroupedHitCount++;
|
||||
groupDocs[slot].collector.collect(doc);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setNextReader(AtomicReaderContext readerContext) throws IOException {
|
||||
//System.out.println("SP.setNextReader");
|
||||
for (SearchGroupDocs group : groupMap.values()) {
|
||||
group.collector.setNextReader(readerContext);
|
||||
}
|
||||
index = FieldCache.DEFAULT.getTermsIndex(readerContext.reader, groupField);
|
||||
|
||||
// Rebuild ordSet
|
||||
ordSet.clear();
|
||||
for (SearchGroupDocs group : groupMap.values()) {
|
||||
//System.out.println(" group=" + (group.groupValue == null ? "null" : group.groupValue.utf8ToString()));
|
||||
int ord = group.groupValue == null ? 0 : index.binarySearchLookup(group.groupValue, spareBytesRef);
|
||||
if (ord >= 0) {
|
||||
groupDocs[ordSet.put(ord)] = group;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean acceptsDocsOutOfOrder() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public TopGroups getTopGroups(int withinGroupOffset) {
|
||||
final GroupDocs[] groupDocsResult = new GroupDocs[groups.size()];
|
||||
|
||||
int groupIDX = 0;
|
||||
for(SearchGroup group : groups) {
|
||||
final SearchGroupDocs groupDocs = groupMap.get(group.groupValue);
|
||||
final TopDocs topDocs = groupDocs.collector.topDocs(withinGroupOffset, maxDocsPerGroup);
|
||||
groupDocsResult[groupIDX++] = new GroupDocs(topDocs.getMaxScore(),
|
||||
topDocs.totalHits,
|
||||
topDocs.scoreDocs,
|
||||
groupDocs.groupValue,
|
||||
group.sortValues);
|
||||
}
|
||||
|
||||
return new TopGroups(groupSort.getSort(),
|
||||
withinGroupSort == null ? null : withinGroupSort.getSort(),
|
||||
totalHitCount, totalGroupedHitCount, groupDocsResult);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// TODO: merge with SearchGroup or not?
|
||||
// ad: don't need to build a new hashmap
|
||||
// disad: blows up the size of SearchGroup if we need many of them, and couples implementations
|
||||
class SearchGroupDocs {
|
||||
public final BytesRef groupValue;
|
||||
public final TopDocsCollector collector;
|
||||
|
||||
public SearchGroupDocs(BytesRef groupValue, TopDocsCollector collector) {
|
||||
this.groupValue = groupValue;
|
||||
this.collector = collector;
|
||||
}
|
||||
}
|
|
@ -18,9 +18,7 @@ package org.apache.lucene.search.grouping;
|
|||
*/
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.search.Collector;
|
||||
import org.apache.lucene.search.FieldCache;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
import java.io.IOException;
|
||||
|
@ -43,47 +41,44 @@ import java.util.List;
|
|||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class AllGroupsCollector extends Collector {
|
||||
public class TermAllGroupsCollector extends AbstractAllGroupsCollector<BytesRef> {
|
||||
|
||||
private static final int DEFAULT_INITIAL_SIZE = 128;
|
||||
|
||||
private final String groupField;
|
||||
private final SentinelIntSet ordSet;
|
||||
private final List<BytesRef> groups;
|
||||
private final BytesRef spareBytesRef = new BytesRef();
|
||||
|
||||
private FieldCache.DocTermsIndex index;
|
||||
private final BytesRef spareBytesRef = new BytesRef();
|
||||
|
||||
/**
|
||||
* Expert: Constructs a {@link AllGroupsCollector}
|
||||
* Expert: Constructs a {@link AbstractAllGroupsCollector}
|
||||
*
|
||||
* @param groupField The field to group by
|
||||
* @param initialSize The initial allocation size of the
|
||||
* internal int set and group list
|
||||
* which should roughly match the total
|
||||
* number of expected unique groups. Be aware that the
|
||||
* heap usage is 4 bytes * initialSize.
|
||||
* internal int set and group list
|
||||
* which should roughly match the total
|
||||
* number of expected unique groups. Be aware that the
|
||||
* heap usage is 4 bytes * initialSize.
|
||||
*/
|
||||
public AllGroupsCollector(String groupField, int initialSize) {
|
||||
this.groupField = groupField;
|
||||
public TermAllGroupsCollector(String groupField, int initialSize) {
|
||||
ordSet = new SentinelIntSet(initialSize, -1);
|
||||
groups = new ArrayList<BytesRef>(initialSize);
|
||||
this.groupField = groupField;
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a {@link AllGroupsCollector}. This sets the
|
||||
* Constructs a {@link AbstractAllGroupsCollector}. This sets the
|
||||
* initial allocation size for the internal int set and group
|
||||
* list to 128.
|
||||
*
|
||||
* @param groupField The field to group by
|
||||
*/
|
||||
public AllGroupsCollector(String groupField) {
|
||||
public TermAllGroupsCollector(String groupField) {
|
||||
this(groupField, DEFAULT_INITIAL_SIZE);
|
||||
}
|
||||
|
||||
public void setScorer(Scorer scorer) throws IOException {
|
||||
}
|
||||
|
||||
public void collect(int doc) throws IOException {
|
||||
int key = index.getOrd(doc);
|
||||
if (!ordSet.exists(key)) {
|
||||
|
@ -94,22 +89,7 @@ public class AllGroupsCollector extends Collector {
|
|||
}
|
||||
|
||||
/**
|
||||
* Returns the total number of groups for the executed search.
|
||||
* This is a convenience method. The following code snippet has the same effect: <pre>getGroups().size()</pre>
|
||||
*
|
||||
* @return The total number of groups for the executed search
|
||||
*/
|
||||
public int getGroupCount() {
|
||||
return groups.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the group values
|
||||
* <p/>
|
||||
* This is an unordered collections of group values. For each group that matched the query there is a {@link BytesRef}
|
||||
* representing a group value.
|
||||
*
|
||||
* @return the group values
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
public Collection<BytesRef> getGroups() {
|
||||
return groups;
|
||||
|
@ -128,7 +108,4 @@ public class AllGroupsCollector extends Collector {
|
|||
}
|
||||
}
|
||||
|
||||
public boolean acceptsDocsOutOfOrder() {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,85 @@
|
|||
package org.apache.lucene.search.grouping;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.search.FieldCache;
|
||||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Concrete implementation of {@link AbstractFirstPassGroupingCollector} that groups based on
|
||||
* field values and more specifically uses {@link org.apache.lucene.search.FieldCache.DocTerms}
|
||||
* to collect groups.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class TermFirstPassGroupingCollector extends AbstractFirstPassGroupingCollector<BytesRef> {
|
||||
|
||||
private final BytesRef scratchBytesRef = new BytesRef();
|
||||
private FieldCache.DocTermsIndex index;
|
||||
|
||||
private String groupField;
|
||||
|
||||
/**
|
||||
* Create the first pass collector.
|
||||
*
|
||||
* @param groupField The field used to group
|
||||
* documents. This field must be single-valued and
|
||||
* indexed (FieldCache is used to access its value
|
||||
* per-document).
|
||||
* @param groupSort The {@link Sort} used to sort the
|
||||
* groups. The top sorted document within each group
|
||||
* according to groupSort, determines how that group
|
||||
* sorts against other groups. This must be non-null,
|
||||
* ie, if you want to groupSort by relevance use
|
||||
* Sort.RELEVANCE.
|
||||
* @param topNGroups How many top groups to keep.
|
||||
* @throws IOException When I/O related errors occur
|
||||
*/
|
||||
public TermFirstPassGroupingCollector(String groupField, Sort groupSort, int topNGroups) throws IOException {
|
||||
super(groupSort, topNGroups);
|
||||
this.groupField = groupField;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected BytesRef getDocGroupValue(int doc) {
|
||||
final int ord = index.getOrd(doc);
|
||||
return ord == 0 ? null : index.lookup(ord, scratchBytesRef);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected BytesRef copyDocGroupValue(BytesRef groupValue, BytesRef reuse) {
|
||||
if (groupValue == null) {
|
||||
return null;
|
||||
} else if (reuse != null) {
|
||||
reuse.copy(groupValue);
|
||||
return reuse;
|
||||
} else {
|
||||
return new BytesRef(groupValue);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setNextReader(AtomicReaderContext readerContext) throws IOException {
|
||||
super.setNextReader(readerContext);
|
||||
index = FieldCache.DEFAULT.getTermsIndex(readerContext.reader, groupField);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,76 @@
|
|||
package org.apache.lucene.search.grouping;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.search.FieldCache;
|
||||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
|
||||
/**
|
||||
* Concrete implementation of {@link AbstractSecondPassGroupingCollector} that groups based on
|
||||
* field values and more specifically uses {@link org.apache.lucene.search.FieldCache.DocTerms}
|
||||
* to collect grouped docs.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class TermSecondPassGroupingCollector extends AbstractSecondPassGroupingCollector<BytesRef> {
|
||||
|
||||
private final SentinelIntSet ordSet;
|
||||
private FieldCache.DocTermsIndex index;
|
||||
private final BytesRef spareBytesRef = new BytesRef();
|
||||
private final String groupField;
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public TermSecondPassGroupingCollector(String groupField, Collection<SearchGroup<BytesRef>> groups, Sort groupSort, Sort withinGroupSort,
|
||||
int maxDocsPerGroup, boolean getScores, boolean getMaxScores, boolean fillSortFields)
|
||||
throws IOException {
|
||||
super(groups, groupSort, withinGroupSort, maxDocsPerGroup, getScores, getMaxScores, fillSortFields);
|
||||
ordSet = new SentinelIntSet(groupMap.size(), -1);
|
||||
this.groupField = groupField;
|
||||
groupDocs = (SearchGroupDocs<BytesRef>[]) new SearchGroupDocs[ordSet.keys.length];
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setNextReader(AtomicReaderContext readerContext) throws IOException {
|
||||
super.setNextReader(readerContext);
|
||||
index = FieldCache.DEFAULT.getTermsIndex(readerContext.reader, groupField);
|
||||
|
||||
// Rebuild ordSet
|
||||
ordSet.clear();
|
||||
for (SearchGroupDocs<BytesRef> group : groupMap.values()) {
|
||||
// System.out.println(" group=" + (group.groupValue == null ? "null" : group.groupValue.utf8ToString()));
|
||||
int ord = group.groupValue == null ? 0 : index.binarySearchLookup(group.groupValue, spareBytesRef);
|
||||
if (ord >= 0) {
|
||||
groupDocs[ordSet.put(ord)] = group;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected SearchGroupDocs<BytesRef> retrieveGroup(int doc) throws IOException {
|
||||
int slot = ordSet.find(index.getOrd(doc));
|
||||
if (slot >= 0) {
|
||||
return groupDocs[slot];
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
|
@ -22,7 +22,7 @@ import org.apache.lucene.search.SortField;
|
|||
/** Represents result returned by a grouping search.
|
||||
*
|
||||
* @lucene.experimental */
|
||||
public class TopGroups {
|
||||
public class TopGroups<GROUP_VALUE_TYPE> {
|
||||
/** Number of documents matching the search */
|
||||
public final int totalHitCount;
|
||||
|
||||
|
@ -33,7 +33,7 @@ public class TopGroups {
|
|||
public final Integer totalGroupCount;
|
||||
|
||||
/** Group results in groupSort order */
|
||||
public final GroupDocs[] groups;
|
||||
public final GroupDocs<GROUP_VALUE_TYPE>[] groups;
|
||||
|
||||
/** How groups are sorted against each other */
|
||||
public final SortField[] groupSort;
|
||||
|
@ -41,7 +41,7 @@ public class TopGroups {
|
|||
/** How docs are sorted within each group */
|
||||
public final SortField[] withinGroupSort;
|
||||
|
||||
public TopGroups(SortField[] groupSort, SortField[] withinGroupSort, int totalHitCount, int totalGroupedHitCount, GroupDocs[] groups) {
|
||||
public TopGroups(SortField[] groupSort, SortField[] withinGroupSort, int totalHitCount, int totalGroupedHitCount, GroupDocs<GROUP_VALUE_TYPE>[] groups) {
|
||||
this.groupSort = groupSort;
|
||||
this.withinGroupSort = withinGroupSort;
|
||||
this.totalHitCount = totalHitCount;
|
||||
|
@ -50,7 +50,7 @@ public class TopGroups {
|
|||
this.totalGroupCount = null;
|
||||
}
|
||||
|
||||
public TopGroups(TopGroups oldTopGroups, Integer totalGroupCount) {
|
||||
public TopGroups(TopGroups<GROUP_VALUE_TYPE> oldTopGroups, Integer totalGroupCount) {
|
||||
this.groupSort = oldTopGroups.groupSort;
|
||||
this.withinGroupSort = oldTopGroups.withinGroupSort;
|
||||
this.totalHitCount = oldTopGroups.totalHitCount;
|
||||
|
|
|
@ -43,55 +43,37 @@ field fall into a single group.</p>
|
|||
|
||||
</ul>
|
||||
|
||||
<p>The implementation is two-pass: the first pass ({@link
|
||||
org.apache.lucene.search.grouping.TermFirstPassGroupingCollector})
|
||||
gathers the top groups, and the second pass ({@link
|
||||
org.apache.lucene.search.grouping.TermSecondPassGroupingCollector})
|
||||
gathers documents within those groups. If the search is costly to
|
||||
run you may want to use the {@link
|
||||
org.apache.lucene.search.CachingCollector} class, which
|
||||
caches hits and can (quickly) replay them for the second pass. This
|
||||
way you only run the query once, but you pay a RAM cost to (briefly)
|
||||
hold all hits. Results are returned as a {@link
|
||||
org.apache.lucene.search.grouping.TopGroups} instance.</p>
|
||||
|
||||
<p>
|
||||
There are two grouping implementations here:
|
||||
<ul>
|
||||
<li>
|
||||
Arbitrary grouping that can group by any single-valued indexed
|
||||
field, implemented as a two-pass collector: the first pass ({@link
|
||||
org.apache.lucene.search.grouping.FirstPassGroupingCollector})
|
||||
gathers the top groups, and the second pass ({@link
|
||||
org.apache.lucene.search.grouping.SecondPassGroupingCollector})
|
||||
gathers documents within those groups. If the search is costly to
|
||||
run you may want to use the {@link
|
||||
org.apache.lucene.search.CachingCollector} class, which caches
|
||||
hits and can (quickly) replay them for the second pass. This way
|
||||
you only run the query once, but you pay a RAM cost to (briefly)
|
||||
hold all hits. Results are returned as a {@link
|
||||
org.apache.lucene.search.grouping.TopGroups} instance.</p>
|
||||
</li>
|
||||
<li>
|
||||
Indexed groups, using a single pass collector (<code>BlockGroupingCollectorDoc</code>) that
|
||||
is able to group according to the doc blocks created during
|
||||
indexing using <code>IndexWriter</code>'s <code>add/updateDocuments</code> API.
|
||||
This is faster (~25% faster QPS) than the generic two-pass
|
||||
collector, but it only works for doc blocks so you must statically
|
||||
commit (during indexing) to which grouping you'll need at search
|
||||
time.
|
||||
This module abstracts away what defines a group and how it is collected. All grouping collectors
|
||||
are abstract and currently have term-based implementations. One can implement
|
||||
collectors that for example group on multiple fields.
|
||||
</p>
|
||||
|
||||
<p>This implementation does not rely on a single valued grouping
|
||||
field; rather, the blocks in the index define the groups, so your
|
||||
application is free to determine what the grouping criteria is.
|
||||
At search time, you must provide a <code>Filter</code> that marks
|
||||
the last document in each group. This is a substantial memory
|
||||
savings because this collector does not load
|
||||
a <code>DocTermsIndex</code> from the
|
||||
<code>FieldCache</code>.
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
<p>The benefit of the arbitrary grouping implementation is you don't have
|
||||
to commit at indexing time to a static grouping of your documents.
|
||||
But the downside is it's somewhat slower to run, and requires more RAM
|
||||
(a <code>FieldCache.DocTermsIndex</code> entry is created).
|
||||
<p>
|
||||
This module abstracts away what defines a group and how it is collected. All grouping collectors
|
||||
are abstract and currently have term-based implementations. One can implement
|
||||
collectors that for example group on multiple fields.
|
||||
</p>
|
||||
|
||||
<p>Known limitations:</p>
|
||||
<ul>
|
||||
<li> For the two-pass grouping collector, the group field must be a
|
||||
single-valued indexed field.
|
||||
{@link org.apache.lucene.search.FieldCache} is used to load the {@link org.apache.lucene.search.FieldCache.DocTermsIndex} for this field.
|
||||
<li> Unlike Solr's implementation, this module cannot group by
|
||||
function query values nor by arbitrary queries.
|
||||
<li> Although Solr supports grouping by function and this module has an abstraction of what a group is, there are currently only
|
||||
implementations for grouping based on terms.
|
||||
<li> Sharding is not directly supported, though is not too
|
||||
difficult, if you can merge the top groups and top documents per
|
||||
group yourself.
|
||||
|
@ -101,14 +83,14 @@ But the downside is it's somewhat slower to run, and requires more RAM
|
|||
(using the {@link org.apache.lucene.search.CachingCollector}):</p>
|
||||
|
||||
<pre class="prettyprint">
|
||||
FirstPassGroupingCollector c1 = new FirstPassGroupingCollector("author", groupSort, groupOffset+topNGroups);
|
||||
TermFirstPassGroupingCollector c1 = new TermFirstPassGroupingCollector("author", groupSort, groupOffset+topNGroups);
|
||||
|
||||
boolean cacheScores = true;
|
||||
double maxCacheRAMMB = 4.0;
|
||||
CachingCollector cachedCollector = CachingCollector.create(c1, cacheScores, maxCacheRAMMB);
|
||||
s.search(new TermQuery(new Term("content", searchTerm)), cachedCollector);
|
||||
|
||||
Collection<SearchGroup> topGroups = c1.getTopGroups(groupOffset, fillFields);
|
||||
Collection<SearchGroup<BytesRef>> topGroups = c1.getTopGroups(groupOffset, fillFields);
|
||||
|
||||
if (topGroups == null) {
|
||||
// No groups matched
|
||||
|
@ -118,12 +100,12 @@ But the downside is it's somewhat slower to run, and requires more RAM
|
|||
boolean getScores = true;
|
||||
boolean getMaxScores = true;
|
||||
boolean fillFields = true;
|
||||
SecondPassGroupingCollector c2 = new SecondPassGroupingCollector("author", topGroups, groupSort, docSort, docOffset+docsPerGroup, getScores, getMaxScores, fillFields);
|
||||
TermSecondPassGroupingCollector c2 = new TermSecondPassGroupingCollector("author", topGroups, groupSort, docSort, docOffset+docsPerGroup, getScores, getMaxScores, fillFields);
|
||||
|
||||
//Optionally compute total group count
|
||||
AllGroupsCollector allGroupsCollector = null;
|
||||
TermAllGroupsCollector allGroupsCollector = null;
|
||||
if (requiredTotalGroupCount) {
|
||||
allGroupsCollector = new AllGroupsCollector("author");
|
||||
allGroupsCollector = new TermAllGroupsCollector("author");
|
||||
c2 = MultiCollector.wrap(c2, allGroupsCollector);
|
||||
}
|
||||
|
||||
|
@ -135,9 +117,9 @@ But the downside is it's somewhat slower to run, and requires more RAM
|
|||
s.search(new TermQuery(new Term("content", searchTerm)), c2);
|
||||
}
|
||||
|
||||
TopGroups groupsResult = c2.getTopGroups(docOffset);
|
||||
TopGroups<BytesRef> groupsResult = c2.getTopGroups(docOffset);
|
||||
if (requiredTotalGroupCount) {
|
||||
groupResult = new TopGroups(groupsResult, allGroupsCollector.getGroupCount());
|
||||
groupsResult = new TopGroups<BytesRef>(groupsResult, allGroupsCollector.getGroupCount());
|
||||
}
|
||||
|
||||
// Render groupsResult...
|
||||
|
|
|
@ -17,9 +17,6 @@
|
|||
|
||||
package org.apache.lucene.search.grouping;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
|
@ -33,6 +30,9 @@ import org.apache.lucene.util.BytesRef;
|
|||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
// TODO
|
||||
// - should test relevance sort too
|
||||
// - test null
|
||||
|
@ -103,10 +103,10 @@ public class TestGrouping extends LuceneTestCase {
|
|||
w.close();
|
||||
|
||||
final Sort groupSort = Sort.RELEVANCE;
|
||||
final FirstPassGroupingCollector c1 = new FirstPassGroupingCollector(groupField, groupSort, 10);
|
||||
final TermFirstPassGroupingCollector c1 = new TermFirstPassGroupingCollector(groupField, groupSort, 10);
|
||||
indexSearcher.search(new TermQuery(new Term("content", "random")), c1);
|
||||
|
||||
final SecondPassGroupingCollector c2 = new SecondPassGroupingCollector(groupField, c1.getTopGroups(0, true), groupSort, null, 5, true, false, true);
|
||||
final TermSecondPassGroupingCollector c2 = new TermSecondPassGroupingCollector(groupField, c1.getTopGroups(0, true), groupSort, null, 5, true, false, true);
|
||||
indexSearcher.search(new TermQuery(new Term("content", "random")), c2);
|
||||
|
||||
final TopGroups groups = c2.getTopGroups(0);
|
||||
|
@ -236,7 +236,7 @@ public class TestGrouping extends LuceneTestCase {
|
|||
}
|
||||
*/
|
||||
|
||||
private TopGroups slowGrouping(GroupDoc[] groupDocs,
|
||||
private TopGroups<BytesRef> slowGrouping(GroupDoc[] groupDocs,
|
||||
String searchTerm,
|
||||
boolean fillFields,
|
||||
boolean getScores,
|
||||
|
@ -296,7 +296,8 @@ public class TestGrouping extends LuceneTestCase {
|
|||
final int limit = Math.min(groupOffset + topNGroups, groups.size());
|
||||
|
||||
final Comparator<GroupDoc> docSortComp = getComparator(docSort);
|
||||
final GroupDocs[] result = new GroupDocs[limit-groupOffset];
|
||||
@SuppressWarnings("unchecked")
|
||||
final GroupDocs<BytesRef>[] result = new GroupDocs[limit-groupOffset];
|
||||
int totalGroupedHitCount = 0;
|
||||
for(int idx=groupOffset;idx < limit;idx++) {
|
||||
final BytesRef group = sortedGroups.get(idx);
|
||||
|
@ -321,7 +322,7 @@ public class TestGrouping extends LuceneTestCase {
|
|||
hits = new ScoreDoc[0];
|
||||
}
|
||||
|
||||
result[idx-groupOffset] = new GroupDocs(0.0f,
|
||||
result[idx-groupOffset] = new GroupDocs<BytesRef>(0.0f,
|
||||
docs.size(),
|
||||
hits,
|
||||
group,
|
||||
|
@ -329,12 +330,12 @@ public class TestGrouping extends LuceneTestCase {
|
|||
}
|
||||
|
||||
if (doAllGroups) {
|
||||
return new TopGroups(
|
||||
new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result),
|
||||
return new TopGroups<BytesRef>(
|
||||
new TopGroups<BytesRef>(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result),
|
||||
knownGroups.size()
|
||||
);
|
||||
} else {
|
||||
return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result);
|
||||
return new TopGroups<BytesRef>(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -525,14 +526,14 @@ public class TestGrouping extends LuceneTestCase {
|
|||
System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup + " doAllGroups=" + doAllGroups);
|
||||
}
|
||||
|
||||
final AllGroupsCollector allGroupsCollector;
|
||||
final TermAllGroupsCollector allGroupsCollector;
|
||||
if (doAllGroups) {
|
||||
allGroupsCollector = new AllGroupsCollector("group");
|
||||
allGroupsCollector = new TermAllGroupsCollector("group");
|
||||
} else {
|
||||
allGroupsCollector = null;
|
||||
}
|
||||
|
||||
final FirstPassGroupingCollector c1 = new FirstPassGroupingCollector("group", groupSort, groupOffset+topNGroups);
|
||||
final TermFirstPassGroupingCollector c1 = new TermFirstPassGroupingCollector("group", groupSort, groupOffset+topNGroups);
|
||||
final CachingCollector cCache;
|
||||
final Collector c;
|
||||
|
||||
|
@ -583,19 +584,19 @@ public class TestGrouping extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
final Collection<SearchGroup> topGroups = c1.getTopGroups(groupOffset, fillFields);
|
||||
final Collection<SearchGroup<BytesRef>> topGroups = c1.getTopGroups(groupOffset, fillFields);
|
||||
final TopGroups groupsResult;
|
||||
|
||||
if (topGroups != null) {
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: topGroups");
|
||||
for (SearchGroup searchGroup : topGroups) {
|
||||
for (SearchGroup<BytesRef> searchGroup : topGroups) {
|
||||
System.out.println(" " + (searchGroup.groupValue == null ? "null" : searchGroup.groupValue.utf8ToString()) + ": " + Arrays.deepToString(searchGroup.sortValues));
|
||||
}
|
||||
}
|
||||
|
||||
final SecondPassGroupingCollector c2 = new SecondPassGroupingCollector("group", topGroups, groupSort, docSort, docOffset+docsPerGroup, getScores, getMaxScores, fillFields);
|
||||
final TermSecondPassGroupingCollector c2 = new TermSecondPassGroupingCollector("group", topGroups, groupSort, docSort, docOffset+docsPerGroup, getScores, getMaxScores, fillFields);
|
||||
if (doCache) {
|
||||
if (cCache.isCached()) {
|
||||
if (VERBOSE) {
|
||||
|
@ -613,8 +614,8 @@ public class TestGrouping extends LuceneTestCase {
|
|||
}
|
||||
|
||||
if (doAllGroups) {
|
||||
TopGroups tempTopGroups = c2.getTopGroups(docOffset);
|
||||
groupsResult = new TopGroups(tempTopGroups, allGroupsCollector.getGroupCount());
|
||||
TopGroups<BytesRef> tempTopGroups = c2.getTopGroups(docOffset);
|
||||
groupsResult = new TopGroups<BytesRef>(tempTopGroups, allGroupsCollector.getGroupCount());
|
||||
} else {
|
||||
groupsResult = c2.getTopGroups(docOffset);
|
||||
}
|
||||
|
@ -625,14 +626,14 @@ public class TestGrouping extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
final TopGroups expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, doAllGroups, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset);
|
||||
final TopGroups<BytesRef> expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, doAllGroups, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset);
|
||||
|
||||
if (VERBOSE) {
|
||||
if (expectedGroups == null) {
|
||||
System.out.println("TEST: no expected groups");
|
||||
} else {
|
||||
System.out.println("TEST: expected groups");
|
||||
for(GroupDocs gd : expectedGroups.groups) {
|
||||
for(GroupDocs<BytesRef> gd : expectedGroups.groups) {
|
||||
System.out.println(" group=" + (gd.groupValue == null ? "null" : gd.groupValue.utf8ToString()));
|
||||
for(ScoreDoc sd : gd.scoreDocs) {
|
||||
System.out.println(" id=" + sd.doc);
|
||||
|
@ -645,21 +646,22 @@ public class TestGrouping extends LuceneTestCase {
|
|||
|
||||
final boolean needsScores = getScores || getMaxScores || docSort == null;
|
||||
final BlockGroupingCollector c3 = new BlockGroupingCollector(groupSort, groupOffset+topNGroups, needsScores, lastDocInBlock);
|
||||
final AllGroupsCollector allGroupsCollector2;
|
||||
final TermAllGroupsCollector allGroupsCollector2;
|
||||
final Collector c4;
|
||||
if (doAllGroups) {
|
||||
allGroupsCollector2 = new AllGroupsCollector("group");
|
||||
allGroupsCollector2 = new TermAllGroupsCollector("group");
|
||||
c4 = MultiCollector.wrap(c3, allGroupsCollector2);
|
||||
} else {
|
||||
allGroupsCollector2 = null;
|
||||
c4 = c3;
|
||||
}
|
||||
s2.search(new TermQuery(new Term("content", searchTerm)), c4);
|
||||
final TopGroups tempTopGroups2 = c3.getTopGroups(docSort, groupOffset, docOffset, docOffset+docsPerGroup, fillFields);
|
||||
@SuppressWarnings("unchecked")
|
||||
final TopGroups<BytesRef> tempTopGroups2 = c3.getTopGroups(docSort, groupOffset, docOffset, docOffset+docsPerGroup, fillFields);
|
||||
final TopGroups groupsResult2;
|
||||
if (doAllGroups && tempTopGroups2 != null) {
|
||||
assertEquals((int) tempTopGroups2.totalGroupCount, allGroupsCollector2.getGroupCount());
|
||||
groupsResult2 = new TopGroups(tempTopGroups2, allGroupsCollector2.getGroupCount());
|
||||
groupsResult2 = new TopGroups<BytesRef>(tempTopGroups2, allGroupsCollector2.getGroupCount());
|
||||
} else {
|
||||
groupsResult2 = tempTopGroups2;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue