mirror of https://github.com/apache/lucene.git
LUCENE-3778: Added a grouping utility class that makes it easier to use result grouping for pure Lucene apps.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1303871 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
55515820f9
commit
89b55566d8
|
@ -77,6 +77,9 @@ New Features
|
|||
* LUCENE-3444: Added a second pass grouping collector that keeps track of distinct
|
||||
values for a specified field for the top N group. (Martijn van Groningen)
|
||||
|
||||
* LUCENE-3778: Added a grouping utility class that makes it easier to use result
|
||||
grouping for pure Lucene apps. (Martijn van Groningen)
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-2606: Changed RegexCapabilities interface to fix thread
|
||||
|
|
|
@ -0,0 +1,469 @@
|
|||
package org.apache.lucene.search.grouping;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.document.DocValuesField;
|
||||
import org.apache.lucene.index.DocValues;
|
||||
import org.apache.lucene.queries.function.ValueSource;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.search.grouping.dv.DVAllGroupHeadsCollector;
|
||||
import org.apache.lucene.search.grouping.dv.DVAllGroupsCollector;
|
||||
import org.apache.lucene.search.grouping.dv.DVFirstPassGroupingCollector;
|
||||
import org.apache.lucene.search.grouping.dv.DVSecondPassGroupingCollector;
|
||||
import org.apache.lucene.search.grouping.function.FunctionAllGroupHeadsCollector;
|
||||
import org.apache.lucene.search.grouping.function.FunctionAllGroupsCollector;
|
||||
import org.apache.lucene.search.grouping.function.FunctionFirstPassGroupingCollector;
|
||||
import org.apache.lucene.search.grouping.function.FunctionSecondPassGroupingCollector;
|
||||
import org.apache.lucene.search.grouping.term.TermAllGroupHeadsCollector;
|
||||
import org.apache.lucene.search.grouping.term.TermAllGroupsCollector;
|
||||
import org.apache.lucene.search.grouping.term.TermFirstPassGroupingCollector;
|
||||
import org.apache.lucene.search.grouping.term.TermSecondPassGroupingCollector;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.mutable.MutableValue;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Convenience class to perform grouping in a non distributed environment.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class GroupingSearch {
|
||||
|
||||
private final String groupField;
|
||||
private final ValueSource groupFunction;
|
||||
private final Map<?, ?> valueSourceContext;
|
||||
private final Filter groupEndDocs;
|
||||
private final DocValues.Type docValuesType;
|
||||
private final boolean diskResidentDocValues;
|
||||
|
||||
private Sort groupSort = Sort.RELEVANCE;
|
||||
private Sort sortWithinGroup;
|
||||
|
||||
private int groupDocsOffset;
|
||||
private int groupDocsLimit = 1;
|
||||
private boolean fillSortFields;
|
||||
private boolean includeScores = true;
|
||||
private boolean includeMaxScore = true;
|
||||
|
||||
private Double maxCacheRAMMB;
|
||||
private Integer maxDocsToCache;
|
||||
private boolean cacheScores;
|
||||
private boolean allGroups;
|
||||
private boolean allGroupHeads;
|
||||
private int initialSize = 128;
|
||||
|
||||
private Collection<?> matchingGroups;
|
||||
private Bits matchingGroupHeads;
|
||||
|
||||
/**
|
||||
* Constructs a <code>GroupingSearch</code> instance that groups documents by index terms using the {@link FieldCache}.
|
||||
* The group field can only have one token per document. This means that the field must not be analysed.
|
||||
*
|
||||
* @param groupField The name of the field to group by.
|
||||
*/
|
||||
public GroupingSearch(String groupField) {
|
||||
this(groupField, null, null, null, null, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a <code>GroupingSearch</code> instance that groups documents by doc values.
|
||||
* This constructor can only be used when the groupField is a {@link DocValuesField}.
|
||||
*
|
||||
* @param groupField The name of the field to group by that contains doc values
|
||||
* @param docValuesType The doc values type of the specified groupField
|
||||
* @param diskResidentDocValues Whether the values to group by should be disk resident
|
||||
*/
|
||||
public GroupingSearch(String groupField, DocValues.Type docValuesType, boolean diskResidentDocValues) {
|
||||
this(groupField, null, null, null, docValuesType, diskResidentDocValues);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a <code>GroupingSearch</code> instance that groups documents by function using a {@link ValueSource}
|
||||
* instance.
|
||||
*
|
||||
* @param groupFunction The function to group by specified as {@link ValueSource}
|
||||
* @param valueSourceContext The context of the specified groupFunction
|
||||
*/
|
||||
public GroupingSearch(ValueSource groupFunction, Map<?, ?> valueSourceContext) {
|
||||
this(null, groupFunction, valueSourceContext, null, null, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor for grouping documents by doc block.
|
||||
* This constructor can only be used when documents belonging in a group are indexed in one block.
|
||||
*
|
||||
* @param groupEndDocs The filter that marks the last document in all doc blocks
|
||||
*/
|
||||
public GroupingSearch(Filter groupEndDocs) {
|
||||
this(null, null, null, groupEndDocs, null, false);
|
||||
}
|
||||
|
||||
private GroupingSearch(String groupField, ValueSource groupFunction, Map<?, ?> valueSourceContext, Filter groupEndDocs, DocValues.Type docValuesType, boolean diskResidentDocValues) {
|
||||
this.groupField = groupField;
|
||||
this.groupFunction = groupFunction;
|
||||
this.valueSourceContext = valueSourceContext;
|
||||
this.groupEndDocs = groupEndDocs;
|
||||
this.docValuesType = docValuesType;
|
||||
this.diskResidentDocValues = diskResidentDocValues;
|
||||
}
|
||||
|
||||
/**
|
||||
* Executes a grouped search. Both the first pass and second pass are executed on the specified searcher.
|
||||
*
|
||||
* @param searcher The {@link org.apache.lucene.search.IndexSearcher} instance to execute the grouped search on.
|
||||
* @param query The query to execute with the grouping
|
||||
* @param groupOffset The group offset
|
||||
* @param groupLimit The number of groups to return from the specified group offset
|
||||
* @return the grouped result as a {@link TopGroups} instance
|
||||
* @throws IOException If any I/O related errors occur
|
||||
*/
|
||||
public <T> TopGroups<T> search(IndexSearcher searcher, Query query, int groupOffset, int groupLimit) throws IOException {
|
||||
return search(searcher, null, query, groupOffset, groupLimit);
|
||||
}
|
||||
|
||||
/**
|
||||
* Executes a grouped search. Both the first pass and second pass are executed on the specified searcher.
|
||||
*
|
||||
* @param searcher The {@link org.apache.lucene.search.IndexSearcher} instance to execute the grouped search on.
|
||||
* @param filter The filter to execute with the grouping
|
||||
* @param query The query to execute with the grouping
|
||||
* @param groupOffset The group offset
|
||||
* @param groupLimit The number of groups to return from the specified group offset
|
||||
* @return the grouped result as a {@link TopGroups} instance
|
||||
* @throws IOException If any I/O related errors occur
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
public <T> TopGroups<T> search(IndexSearcher searcher, Filter filter, Query query, int groupOffset, int groupLimit) throws IOException {
|
||||
if (groupField != null || groupFunction != null) {
|
||||
return groupByFieldOrFunction(searcher, filter, query, groupOffset, groupLimit);
|
||||
} else if (groupEndDocs != null) {
|
||||
return (TopGroups<T>) groupByDocBlock(searcher, filter, query, groupOffset, groupLimit);
|
||||
} else {
|
||||
throw new IllegalStateException("Either groupField, groupFunction or groupEndDocs must be set."); // This can't happen...
|
||||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings({"unchecked", "rawtypes"})
|
||||
protected TopGroups groupByFieldOrFunction(IndexSearcher searcher, Filter filter, Query query, int groupOffset, int groupLimit) throws IOException {
|
||||
int topN = groupOffset + groupLimit;
|
||||
final AbstractFirstPassGroupingCollector firstPassCollector;
|
||||
final AbstractAllGroupsCollector allGroupsCollector;
|
||||
final AbstractAllGroupHeadsCollector allGroupHeadsCollector;
|
||||
if (groupFunction != null) {
|
||||
firstPassCollector = new FunctionFirstPassGroupingCollector(groupFunction, valueSourceContext, groupSort, topN);
|
||||
if (allGroups) {
|
||||
allGroupsCollector = new FunctionAllGroupsCollector(groupFunction, valueSourceContext);
|
||||
} else {
|
||||
allGroupsCollector = null;
|
||||
}
|
||||
if (allGroupHeads) {
|
||||
allGroupHeadsCollector = new FunctionAllGroupHeadsCollector(groupFunction, valueSourceContext, sortWithinGroup);
|
||||
} else {
|
||||
allGroupHeadsCollector = null;
|
||||
}
|
||||
} else if (docValuesType != null) {
|
||||
firstPassCollector = DVFirstPassGroupingCollector.create(groupSort, topN, groupField, docValuesType, diskResidentDocValues);
|
||||
if (allGroups) {
|
||||
allGroupsCollector = DVAllGroupsCollector.create(groupField, docValuesType, diskResidentDocValues, initialSize);
|
||||
} else {
|
||||
allGroupsCollector = null;
|
||||
}
|
||||
if (allGroupHeads) {
|
||||
allGroupHeadsCollector = DVAllGroupHeadsCollector.create(groupField, sortWithinGroup, docValuesType, diskResidentDocValues);
|
||||
} else {
|
||||
allGroupHeadsCollector = null;
|
||||
}
|
||||
} else {
|
||||
firstPassCollector = new TermFirstPassGroupingCollector(groupField, groupSort, topN);
|
||||
if (allGroups) {
|
||||
allGroupsCollector = new TermAllGroupsCollector(groupField, initialSize);
|
||||
} else {
|
||||
allGroupsCollector = null;
|
||||
}
|
||||
if (allGroupHeads) {
|
||||
allGroupHeadsCollector = TermAllGroupHeadsCollector.create(groupField, sortWithinGroup, initialSize);
|
||||
} else {
|
||||
allGroupHeadsCollector = null;
|
||||
}
|
||||
}
|
||||
|
||||
final Collector firstRound;
|
||||
if (allGroupHeads || allGroups) {
|
||||
List<Collector> collectors = new ArrayList<Collector>();
|
||||
collectors.add(firstPassCollector);
|
||||
if (allGroupHeads) {
|
||||
collectors.add(allGroupsCollector);
|
||||
}
|
||||
if (allGroupHeads) {
|
||||
collectors.add(allGroupHeadsCollector);
|
||||
}
|
||||
firstRound = MultiCollector.wrap(collectors.toArray(new Collector[collectors.size()]));
|
||||
} else {
|
||||
firstRound = firstPassCollector;
|
||||
}
|
||||
|
||||
CachingCollector cachedCollector = null;
|
||||
if (maxCacheRAMMB != null || maxDocsToCache != null) {
|
||||
if (maxCacheRAMMB != null) {
|
||||
cachedCollector = CachingCollector.create(firstRound, cacheScores, maxCacheRAMMB);
|
||||
} else {
|
||||
cachedCollector = CachingCollector.create(firstRound, cacheScores, maxDocsToCache);
|
||||
}
|
||||
searcher.search(query, filter, cachedCollector);
|
||||
} else {
|
||||
searcher.search(query, filter, firstRound);
|
||||
}
|
||||
|
||||
if (allGroups) {
|
||||
matchingGroups = allGroupsCollector.getGroups();
|
||||
} else {
|
||||
matchingGroups = Collections.emptyList();
|
||||
}
|
||||
if (allGroupHeads) {
|
||||
matchingGroupHeads = allGroupHeadsCollector.retrieveGroupHeads(searcher.getIndexReader().maxDoc());
|
||||
} else {
|
||||
matchingGroupHeads = new Bits.MatchNoBits(searcher.getIndexReader().maxDoc());
|
||||
}
|
||||
|
||||
Collection<SearchGroup> topSearchGroups = firstPassCollector.getTopGroups(groupOffset, fillSortFields);
|
||||
if (topSearchGroups == null) {
|
||||
return new TopGroups(new SortField[0], new SortField[0], 0, 0, new GroupDocs[0]);
|
||||
}
|
||||
|
||||
int topNInsideGroup = groupDocsOffset + groupDocsLimit;
|
||||
AbstractSecondPassGroupingCollector secondPassCollector;
|
||||
if (groupFunction != null) {
|
||||
secondPassCollector = new FunctionSecondPassGroupingCollector((Collection) topSearchGroups, groupSort, sortWithinGroup, topNInsideGroup, includeScores, includeMaxScore, fillSortFields, groupFunction, valueSourceContext);
|
||||
} else if (docValuesType != null) {
|
||||
secondPassCollector = DVSecondPassGroupingCollector.create(groupField, diskResidentDocValues, docValuesType, (Collection) topSearchGroups, groupSort, sortWithinGroup, topNInsideGroup, includeScores, includeMaxScore, fillSortFields);
|
||||
} else {
|
||||
secondPassCollector = new TermSecondPassGroupingCollector(groupField, (Collection) topSearchGroups, groupSort, sortWithinGroup, topNInsideGroup, includeScores, includeMaxScore, fillSortFields);
|
||||
}
|
||||
|
||||
if (cachedCollector != null && cachedCollector.isCached()) {
|
||||
cachedCollector.replay(secondPassCollector);
|
||||
} else {
|
||||
searcher.search(query, filter, secondPassCollector);
|
||||
}
|
||||
|
||||
if (allGroups) {
|
||||
return new TopGroups(secondPassCollector.getTopGroups(groupDocsOffset), matchingGroups.size());
|
||||
} else {
|
||||
return secondPassCollector.getTopGroups(groupDocsOffset);
|
||||
}
|
||||
}
|
||||
|
||||
protected TopGroups<?> groupByDocBlock(IndexSearcher searcher, Filter filter, Query query, int groupOffset, int groupLimit) throws IOException {
|
||||
int topN = groupOffset + groupLimit;
|
||||
BlockGroupingCollector c = new BlockGroupingCollector(groupSort, topN, includeScores, groupEndDocs);
|
||||
searcher.search(query, filter, c);
|
||||
int topNInsideGroup = groupDocsOffset + groupDocsLimit;
|
||||
return c.getTopGroups(sortWithinGroup, groupOffset, groupDocsOffset, topNInsideGroup, fillSortFields);
|
||||
}
|
||||
|
||||
/**
|
||||
* Enables caching for the second pass search. The cache will not grow over a specified limit in MB.
|
||||
* The cache is filled during the first pass searched and then replayed during the second pass searched.
|
||||
* If the cache grows beyond the specified limit, then the cache is purged and not used in the second pass search.
|
||||
*
|
||||
* @param maxCacheRAMMB The maximum amount in MB the cache is allowed to hold
|
||||
* @param cacheScores Whether to cache the scores
|
||||
* @return <code>this</code>
|
||||
*/
|
||||
public GroupingSearch setCachingInMB(double maxCacheRAMMB, boolean cacheScores) {
|
||||
this.maxCacheRAMMB = maxCacheRAMMB;
|
||||
this.maxDocsToCache = null;
|
||||
this.cacheScores = cacheScores;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Enables caching for the second pass search. The cache will not contain more than the maximum specified documents.
|
||||
* The cache is filled during the first pass searched and then replayed during the second pass searched.
|
||||
* If the cache grows beyond the specified limit, then the cache is purged and not used in the second pass search.
|
||||
*
|
||||
* @param maxDocsToCache The maximum number of documents the cache is allowed to hold
|
||||
* @param cacheScores Whether to cache the scores
|
||||
* @return <code>this</code>
|
||||
*/
|
||||
public GroupingSearch setCaching(int maxDocsToCache, boolean cacheScores) {
|
||||
this.maxDocsToCache = maxDocsToCache;
|
||||
this.maxCacheRAMMB = null;
|
||||
this.cacheScores = cacheScores;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Disables any enabled cache.
|
||||
*
|
||||
* @return <code>this</code>
|
||||
*/
|
||||
public GroupingSearch disableCaching() {
|
||||
this.maxCacheRAMMB = null;
|
||||
this.maxDocsToCache = null;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Specifies how groups are sorted.
|
||||
* Defaults to {@link Sort#RELEVANCE}.
|
||||
*
|
||||
* @param groupSort The sort for the groups.
|
||||
* @return <code>this</code>
|
||||
*/
|
||||
public GroupingSearch setGroupSort(Sort groupSort) {
|
||||
this.groupSort = groupSort;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Specified how documents inside a group are sorted.
|
||||
* Defaults to {@link Sort#RELEVANCE}.
|
||||
*
|
||||
* @param sortWithinGroup The sort for documents inside a group
|
||||
* @return <code>this</code>
|
||||
*/
|
||||
public GroupingSearch setSortWithinGroup(Sort sortWithinGroup) {
|
||||
this.sortWithinGroup = sortWithinGroup;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Specifies the offset for documents inside a group.
|
||||
*
|
||||
* @param groupDocsOffset The offset for documents inside a
|
||||
* @return <code>this</code>
|
||||
*/
|
||||
public GroupingSearch setGroupDocsOffset(int groupDocsOffset) {
|
||||
this.groupDocsOffset = groupDocsOffset;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Specifies the number of documents to return inside a group from the specified groupDocsOffset.
|
||||
*
|
||||
* @param groupDocsLimit The number of documents to return inside a group
|
||||
* @return <code>this</code>
|
||||
*/
|
||||
public GroupingSearch setGroupDocsLimit(int groupDocsLimit) {
|
||||
this.groupDocsLimit = groupDocsLimit;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether to also fill the sort fields per returned group and groups docs.
|
||||
*
|
||||
* @param fillSortFields Whether to also fill the sort fields per returned group and groups docs
|
||||
* @return <code>this</code>
|
||||
*/
|
||||
public GroupingSearch setFillSortFields(boolean fillSortFields) {
|
||||
this.fillSortFields = fillSortFields;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether to include the scores per doc inside a group.
|
||||
*
|
||||
* @param includeScores Whether to include the scores per doc inside a group
|
||||
* @return <code>this</code>
|
||||
*/
|
||||
public GroupingSearch setIncludeScores(boolean includeScores) {
|
||||
this.includeScores = includeScores;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether to include the score of the most relevant document per group.
|
||||
*
|
||||
* @param includeMaxScore Whether to include the score of the most relevant document per group
|
||||
* @return <code>this</code>
|
||||
*/
|
||||
public GroupingSearch setIncludeMaxScore(boolean includeMaxScore) {
|
||||
this.includeMaxScore = includeMaxScore;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether to also co0.0mpute all groups matching the query.
|
||||
* This can be used to determine the number of groups, which can be used for accurate pagination.
|
||||
* <p/>
|
||||
* When grouping by doc block the number of groups are automatically included in the {@link TopGroups} and this
|
||||
* option doesn't have any influence.
|
||||
*
|
||||
* @param allGroups to also compute all groups matching the query
|
||||
* @return <code>this</code>
|
||||
*/
|
||||
public GroupingSearch setAllGroups(boolean allGroups) {
|
||||
this.allGroups = allGroups;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* If {@link #setAllGroups(boolean)} was set to <code>true</code> then all matching groups are returned, otherwise
|
||||
* an empty collection is returned.
|
||||
*
|
||||
* @param <T> The group value type. This can be a {@link BytesRef} or a {@link MutableValue} instance. If grouping
|
||||
* by doc block this the group value is always <code>null</code>.
|
||||
* @return all matching groups are returned, or an empty collection
|
||||
*/
|
||||
@SuppressWarnings({"unchecked", "rawtypes"})
|
||||
public <T> Collection<T> getAllMatchingGroups() {
|
||||
return (Collection<T>) matchingGroups;
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether to compute all group heads (most relevant document per group) matching the query.
|
||||
* <p/>
|
||||
* This feature isn't enabled when grouping by doc block.
|
||||
*
|
||||
* @param allGroupHeads Whether to compute all group heads (most relevant document per group) matching the query
|
||||
* @return <code>this</code>
|
||||
*/
|
||||
public GroupingSearch setAllGroupHeads(boolean allGroupHeads) {
|
||||
this.allGroupHeads = allGroupHeads;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the matching group heads if {@link #setAllGroupHeads(boolean)} was set to true or an empty bit set.
|
||||
*
|
||||
* @return The matching group heads if {@link #setAllGroupHeads(boolean)} was set to true or an empty bit set
|
||||
*/
|
||||
public Bits getAllGroupHeads() {
|
||||
return matchingGroupHeads;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the initial size of some internal used data structures.
|
||||
* This prevents growing data structures many times. This can improve the performance of the grouping at the cost of
|
||||
* more initial RAM.
|
||||
* <p/>
|
||||
* The {@link #allGroups} and {@link #allGroupHeads} features use this option.
|
||||
* Defaults to 128.
|
||||
*
|
||||
* @param initialSize The initial size of some internal used data structures
|
||||
* @return <code>this</code>
|
||||
*/
|
||||
public GroupingSearch setInitialSize(int initialSize) {
|
||||
this.initialSize = initialSize;
|
||||
return this;
|
||||
}
|
||||
}
|
|
@ -63,7 +63,7 @@ field fall into a single group.</p>
|
|||
|
||||
<p>Known limitations:</p>
|
||||
<ul>
|
||||
<li> For the two-pass grouping collector, the group field must be a
|
||||
<li> For the two-pass grouping search, the group field must be a
|
||||
single-valued indexed field.
|
||||
{@link org.apache.lucene.search.FieldCache} is used to load the {@link org.apache.lucene.search.FieldCache.DocTermsIndex} for this field.
|
||||
<li> Although Solr support grouping by function and this module has abstraction of what a group is, there are currently only
|
||||
|
@ -73,50 +73,30 @@ field fall into a single group.</p>
|
|||
group yourself.
|
||||
</ul>
|
||||
|
||||
<p>Typical usage for the generic two-pass collector looks like this
|
||||
(using the {@link org.apache.lucene.search.CachingCollector}):</p>
|
||||
<p>Typical usage for the generic two-pass grouping search looks like this using the grouping convenience utility
|
||||
(optionally using caching for the second pass search):</p>
|
||||
|
||||
<pre class="prettyprint">
|
||||
TermFirstPassGroupingCollector c1 = new TermFirstPassGroupingCollector("author", groupSort, groupOffset+topNGroups);
|
||||
GroupingSearch groupingSearch = new GroupingSearch("author");
|
||||
groupingSearch.setGroupSort(groupSort);
|
||||
groupingSearch.setFillSortFields(fillFields);
|
||||
|
||||
boolean cacheScores = true;
|
||||
double maxCacheRAMMB = 4.0;
|
||||
CachingCollector cachedCollector = CachingCollector.create(c1, cacheScores, maxCacheRAMMB);
|
||||
s.search(new TermQuery(new Term("content", searchTerm)), cachedCollector);
|
||||
|
||||
Collection<SearchGroup<BytesRef>> topGroups = c1.getTopGroups(groupOffset, fillFields);
|
||||
|
||||
if (topGroups == null) {
|
||||
// No groups matched
|
||||
return;
|
||||
if (useCache) {
|
||||
// Sets cache in MB
|
||||
groupingSearch.setCachingInMB(4.0, true);
|
||||
}
|
||||
|
||||
boolean getScores = true;
|
||||
boolean getMaxScores = true;
|
||||
boolean fillFields = true;
|
||||
TermSecondPassGroupingCollector c2 = new TermSecondPassGroupingCollector("author", topGroups, groupSort, docSort, docOffset+docsPerGroup, getScores, getMaxScores, fillFields);
|
||||
|
||||
//Optionally compute total group count
|
||||
TermAllGroupsCollector allGroupsCollector = null;
|
||||
if (requiredTotalGroupCount) {
|
||||
allGroupsCollector = new TermAllGroupsCollector("author");
|
||||
c2 = MultiCollector.wrap(c2, allGroupsCollector);
|
||||
groupingSearch.setAllGroups(true);
|
||||
}
|
||||
|
||||
if (cachedCollector.isCached()) {
|
||||
// Cache fit within maxCacheRAMMB, so we can replay it:
|
||||
cachedCollector.replay(c2);
|
||||
} else {
|
||||
// Cache was too large; must re-execute query:
|
||||
s.search(new TermQuery(new Term("content", searchTerm)), c2);
|
||||
}
|
||||
|
||||
TopGroups<BytesRef> groupsResult = c2.getTopGroups(docOffset);
|
||||
if (requiredTotalGroupCount) {
|
||||
groupsResult = new TopGroups<BytesRef>(groupsResult, allGroupsCollector.getGroupCount());
|
||||
}
|
||||
TermQuery query = new TermQuery(new Term("content", searchTerm));
|
||||
TopGroups<BytesRef> result = groupingSearch.search(indexSearcher, query, groupOffset, groupLimit);
|
||||
|
||||
// Render groupsResult...
|
||||
if (requiredTotalGroupCount) {
|
||||
int totalGroupCount = result.totalGroupCount;
|
||||
}
|
||||
</pre>
|
||||
|
||||
<p>To use the single-pass <code>BlockGroupingCollector</code>,
|
||||
|
@ -159,6 +139,19 @@ Finally, do this per search:
|
|||
// Render groupsResult...
|
||||
</pre>
|
||||
|
||||
Or alternatively use the <code>GroupingSearch</code> convenience utility:
|
||||
|
||||
<pre class="prettyprint">
|
||||
// Per search:
|
||||
GroupingSearch groupingSearch = new GroupingSearch(groupEndDocs);
|
||||
groupingSearch.setGroupSort(groupSort);
|
||||
groupingSearch.setIncludeScores(needsScores);
|
||||
TermQuery query = new TermQuery(new Term("content", searchTerm));
|
||||
TopGroups groupsResult = groupingSearch.search(indexSearcher, query, groupOffset, groupLimit);
|
||||
|
||||
// Render groupsResult...
|
||||
</pre>
|
||||
|
||||
Note that the <code>groupValue</code> of each <code>GroupDocs</code>
|
||||
will be <code>null</code>, so if you need to present this value you'll
|
||||
have to separately retrieve it (for example using stored
|
||||
|
@ -167,7 +160,8 @@ fields, <code>FieldCache</code>, etc.).
|
|||
<p>Another collector is the <code>TermAllGroupHeadsCollector</code> that can be used to retrieve all most relevant
|
||||
documents per group. Also known as group heads. This can be useful in situations when one wants to compute group
|
||||
based facets / statistics on the complete query result. The collector can be executed during the first or second
|
||||
phase.</p>
|
||||
phase. This collector can also be used with the <code>GroupingSearch</code> convenience utility, but when if one only
|
||||
wants to compute the most relevant documents per group it is better to just use the collector as done here below.</p>
|
||||
|
||||
<pre class="prettyprint">
|
||||
AbstractAllGroupHeadsCollector c = TermAllGroupHeadsCollector.create(groupField, sortWithinGroup);
|
||||
|
@ -182,39 +176,27 @@ fields, <code>FieldCache</code>, etc.).
|
|||
<p>For each of the above collector types there is also a variant that works with <code>ValueSource</code> instead of
|
||||
of fields. Concretely this means that these variants can work with functions. These variants are slower than
|
||||
there term based counter parts. These implementations are located in the
|
||||
<code>org.apache.lucene.search.grouping.function</code> package.
|
||||
<code>org.apache.lucene.search.grouping.function</code> package, but can also be used with the
|
||||
<code>GroupingSearch</code> convenience utility
|
||||
</p>
|
||||
|
||||
<p>
|
||||
There are also DocValues based implementations available for the group collectors. There are factory methods
|
||||
available for creating dv based instances. A typical example using dv based grouping collectors:
|
||||
available for creating dv based instances. A typical example using dv based grouping with the
|
||||
<code>GroupingSearch</code> convenience utility:
|
||||
</p>
|
||||
|
||||
<pre class="prettyprint">
|
||||
boolean diskResident = true; // Whether values should fetched directly from disk by passing the Java heap space.
|
||||
AbstractFirstPassGroupingCollector c1 = DVFirstPassGroupingCollector.create(
|
||||
groupSort, groupOffset+topNGroups, "author", DocValues.Type.BYTES_VAR_SORTED, diskResident
|
||||
);
|
||||
DocValues.Type docValuesType = DocValues.Type.BYTES_VAR_SORTED;
|
||||
GroupingSearch groupingSearch = new GroupingSearch("author", docValuesType, diskResident);
|
||||
groupingSearch.setGroupSort(groupSort);
|
||||
groupingSearch.setFillSortFields(fillFields);
|
||||
|
||||
s.search(new TermQuery(new Term("content", searchTerm)), c1);
|
||||
TermQuery query = new TermQuery(new Term("content", searchTerm));
|
||||
// The docValuesType variable decides the generic type. When float is used this Double and in case of int this is Long
|
||||
TopGroups<BytesRef> result = groupingSearch.search(indexSearcher, query, groupOffset, groupLimit);
|
||||
|
||||
Collection<SearchGroup<BytesRef>> topGroups = c1.getTopGroups(groupOffset, fillFields);
|
||||
|
||||
if (topGroups == null) {
|
||||
// No groups matched
|
||||
return;
|
||||
}
|
||||
|
||||
boolean getScores = true;
|
||||
boolean getMaxScores = true;
|
||||
boolean fillFields = true;
|
||||
AbstractSecondPassGroupingCollector<BytesRef> c2 = DVSecondPassGroupingCollector.create(
|
||||
"author", diskResident, DocValues.Type.BYTES_VAR_SORTED, topGroups, groupSort, docSort,
|
||||
docOffset+docsPerGroup, getScores, getMaxScores, fillFields
|
||||
);
|
||||
|
||||
s.search(new TermQuery(new Term("content", searchTerm)), c2);
|
||||
TopGroups<BytesRef> groupsResult = c2.getTopGroups(docOffset);
|
||||
// Render groupsResult...
|
||||
</pre>
|
||||
|
||||
|
|
|
@ -0,0 +1,222 @@
|
|||
package org.apache.lucene.search.grouping;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.document.*;
|
||||
import org.apache.lucene.index.DocValues;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.queries.function.ValueSource;
|
||||
import org.apache.lucene.queries.function.valuesource.BytesRefFieldSource;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.mutable.MutableValueStr;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
||||
public class GroupingSearchTest extends LuceneTestCase {
|
||||
|
||||
// Tests some very basic usages...
|
||||
public void testBasic() throws Exception {
|
||||
|
||||
final String groupField = "author";
|
||||
|
||||
FieldType customType = new FieldType();
|
||||
customType.setStored(true);
|
||||
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter w = new RandomIndexWriter(
|
||||
random,
|
||||
dir,
|
||||
newIndexWriterConfig(TEST_VERSION_CURRENT,
|
||||
new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
|
||||
boolean canUseIDV = !"Lucene3x".equals(w.w.getConfig().getCodec().getName());
|
||||
List<Document> documents = new ArrayList<Document>();
|
||||
// 0
|
||||
Document doc = new Document();
|
||||
addGroupField(doc, groupField, "author1", canUseIDV);
|
||||
doc.add(new Field("content", "random text", TextField.TYPE_STORED));
|
||||
doc.add(new Field("id", "1", customType));
|
||||
documents.add(doc);
|
||||
|
||||
// 1
|
||||
doc = new Document();
|
||||
addGroupField(doc, groupField, "author1", canUseIDV);
|
||||
doc.add(new Field("content", "some more random text", TextField.TYPE_STORED));
|
||||
doc.add(new Field("id", "2", customType));
|
||||
documents.add(doc);
|
||||
|
||||
// 2
|
||||
doc = new Document();
|
||||
addGroupField(doc, groupField, "author1", canUseIDV);
|
||||
doc.add(new Field("content", "some more random textual data", TextField.TYPE_STORED));
|
||||
doc.add(new Field("id", "3", customType));
|
||||
doc.add(new Field("groupend", "x", StringField.TYPE_UNSTORED));
|
||||
documents.add(doc);
|
||||
w.addDocuments(documents);
|
||||
documents.clear();
|
||||
|
||||
// 3
|
||||
doc = new Document();
|
||||
addGroupField(doc, groupField, "author2", canUseIDV);
|
||||
doc.add(new Field("content", "some random text", TextField.TYPE_STORED));
|
||||
doc.add(new Field("id", "4", customType));
|
||||
doc.add(new Field("groupend", "x", StringField.TYPE_UNSTORED));
|
||||
w.addDocument(doc);
|
||||
|
||||
// 4
|
||||
doc = new Document();
|
||||
addGroupField(doc, groupField, "author3", canUseIDV);
|
||||
doc.add(new Field("content", "some more random text", TextField.TYPE_STORED));
|
||||
doc.add(new Field("id", "5", customType));
|
||||
documents.add(doc);
|
||||
|
||||
// 5
|
||||
doc = new Document();
|
||||
addGroupField(doc, groupField, "author3", canUseIDV);
|
||||
doc.add(new Field("content", "random", TextField.TYPE_STORED));
|
||||
doc.add(new Field("id", "6", customType));
|
||||
doc.add(new Field("groupend", "x", StringField.TYPE_UNSTORED));
|
||||
documents.add(doc);
|
||||
w.addDocuments(documents);
|
||||
documents.clear();
|
||||
|
||||
// 6 -- no author field
|
||||
doc = new Document();
|
||||
doc.add(new Field("content", "random word stuck in alot of other text", TextField.TYPE_STORED));
|
||||
doc.add(new Field("id", "6", customType));
|
||||
doc.add(new Field("groupend", "x", StringField.TYPE_UNSTORED));
|
||||
|
||||
w.addDocument(doc);
|
||||
|
||||
IndexSearcher indexSearcher = new IndexSearcher(w.getReader());
|
||||
w.close();
|
||||
|
||||
Sort groupSort = Sort.RELEVANCE;
|
||||
GroupingSearch groupingSearch = createRandomGroupingSearch(groupField, groupSort, 5, canUseIDV);
|
||||
|
||||
TopGroups<?> groups = groupingSearch.search(indexSearcher, null, new TermQuery(new Term("content", "random")), 0, 10);
|
||||
|
||||
assertEquals(7, groups.totalHitCount);
|
||||
assertEquals(7, groups.totalGroupedHitCount);
|
||||
assertEquals(4, groups.groups.length);
|
||||
|
||||
// relevance order: 5, 0, 3, 4, 1, 2, 6
|
||||
|
||||
// the later a document is added the higher this docId
|
||||
// value
|
||||
GroupDocs<?> group = groups.groups[0];
|
||||
compareGroupValue("author3", group);
|
||||
assertEquals(2, group.scoreDocs.length);
|
||||
assertEquals(5, group.scoreDocs[0].doc);
|
||||
assertEquals(4, group.scoreDocs[1].doc);
|
||||
assertTrue(group.scoreDocs[0].score > group.scoreDocs[1].score);
|
||||
|
||||
group = groups.groups[1];
|
||||
compareGroupValue("author1", group);
|
||||
assertEquals(3, group.scoreDocs.length);
|
||||
assertEquals(0, group.scoreDocs[0].doc);
|
||||
assertEquals(1, group.scoreDocs[1].doc);
|
||||
assertEquals(2, group.scoreDocs[2].doc);
|
||||
assertTrue(group.scoreDocs[0].score > group.scoreDocs[1].score);
|
||||
assertTrue(group.scoreDocs[1].score > group.scoreDocs[2].score);
|
||||
|
||||
group = groups.groups[2];
|
||||
compareGroupValue("author2", group);
|
||||
assertEquals(1, group.scoreDocs.length);
|
||||
assertEquals(3, group.scoreDocs[0].doc);
|
||||
|
||||
group = groups.groups[3];
|
||||
compareGroupValue(null, group);
|
||||
assertEquals(1, group.scoreDocs.length);
|
||||
assertEquals(6, group.scoreDocs[0].doc);
|
||||
|
||||
Filter lastDocInBlock = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("groupend", "x"))));
|
||||
groupingSearch = new GroupingSearch(lastDocInBlock);
|
||||
groups = groupingSearch.search(indexSearcher, null, new TermQuery(new Term("content", "random")), 0, 10);
|
||||
|
||||
assertEquals(7, groups.totalHitCount);
|
||||
assertEquals(7, groups.totalGroupedHitCount);
|
||||
assertEquals(4, groups.totalGroupCount.longValue());
|
||||
assertEquals(4, groups.groups.length);
|
||||
|
||||
indexSearcher.getIndexReader().close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
private void addGroupField(Document doc, String groupField, String value, boolean canUseIDV) {
|
||||
doc.add(new Field(groupField, value, TextField.TYPE_STORED));
|
||||
if (canUseIDV) {
|
||||
doc.add(new DocValuesField(groupField, new BytesRef(value), DocValues.Type.BYTES_VAR_SORTED));
|
||||
}
|
||||
}
|
||||
|
||||
private void compareGroupValue(String expected, GroupDocs<?> group) {
|
||||
if (expected == null) {
|
||||
if (group.groupValue == null) {
|
||||
return;
|
||||
} else if (group.groupValue.getClass().isAssignableFrom(MutableValueStr.class)) {
|
||||
return;
|
||||
} else if (((BytesRef) group.groupValue).length == 0) {
|
||||
return;
|
||||
}
|
||||
fail();
|
||||
}
|
||||
|
||||
if (group.groupValue.getClass().isAssignableFrom(BytesRef.class)) {
|
||||
assertEquals(new BytesRef(expected), group.groupValue);
|
||||
} else if (group.groupValue.getClass().isAssignableFrom(MutableValueStr.class)) {
|
||||
MutableValueStr v = new MutableValueStr();
|
||||
v.value = new BytesRef(expected);
|
||||
assertEquals(v, group.groupValue);
|
||||
} else {
|
||||
fail();
|
||||
}
|
||||
}
|
||||
|
||||
private GroupingSearch createRandomGroupingSearch(String groupField, Sort groupSort, int docsInGroup, boolean canUseIDV) throws IOException {
|
||||
GroupingSearch groupingSearch;
|
||||
if (random.nextBoolean()) {
|
||||
ValueSource vs = new BytesRefFieldSource(groupField);
|
||||
groupingSearch = new GroupingSearch(vs, new HashMap<Object, Object>());
|
||||
} else {
|
||||
if (canUseIDV && random.nextBoolean()) {
|
||||
boolean diskResident = random.nextBoolean();
|
||||
groupingSearch = new GroupingSearch(groupField, DocValues.Type.BYTES_VAR_SORTED, diskResident);
|
||||
} else {
|
||||
groupingSearch = new GroupingSearch(groupField);
|
||||
}
|
||||
}
|
||||
|
||||
groupingSearch.setGroupSort(groupSort);
|
||||
groupingSearch.setGroupDocsLimit(docsInGroup);
|
||||
|
||||
if (random.nextBoolean()) {
|
||||
groupingSearch.setCachingInMB(4.0, true);
|
||||
}
|
||||
|
||||
return groupingSearch;
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue