LUCENE-3778: Added a grouping utility class that makes it easier to use result grouping for pure Lucene apps.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1303871 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Martijn van Groningen 2012-03-22 16:15:42 +00:00
parent 55515820f9
commit 89b55566d8
4 changed files with 735 additions and 59 deletions

View File

@ -77,6 +77,9 @@ New Features
* LUCENE-3444: Added a second pass grouping collector that keeps track of distinct * LUCENE-3444: Added a second pass grouping collector that keeps track of distinct
values for a specified field for the top N group. (Martijn van Groningen) values for a specified field for the top N group. (Martijn van Groningen)
* LUCENE-3778: Added a grouping utility class that makes it easier to use result
grouping for pure Lucene apps. (Martijn van Groningen)
API Changes API Changes
* LUCENE-2606: Changed RegexCapabilities interface to fix thread * LUCENE-2606: Changed RegexCapabilities interface to fix thread

View File

@ -0,0 +1,469 @@
package org.apache.lucene.search.grouping;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.document.DocValuesField;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.search.*;
import org.apache.lucene.search.grouping.dv.DVAllGroupHeadsCollector;
import org.apache.lucene.search.grouping.dv.DVAllGroupsCollector;
import org.apache.lucene.search.grouping.dv.DVFirstPassGroupingCollector;
import org.apache.lucene.search.grouping.dv.DVSecondPassGroupingCollector;
import org.apache.lucene.search.grouping.function.FunctionAllGroupHeadsCollector;
import org.apache.lucene.search.grouping.function.FunctionAllGroupsCollector;
import org.apache.lucene.search.grouping.function.FunctionFirstPassGroupingCollector;
import org.apache.lucene.search.grouping.function.FunctionSecondPassGroupingCollector;
import org.apache.lucene.search.grouping.term.TermAllGroupHeadsCollector;
import org.apache.lucene.search.grouping.term.TermAllGroupsCollector;
import org.apache.lucene.search.grouping.term.TermFirstPassGroupingCollector;
import org.apache.lucene.search.grouping.term.TermSecondPassGroupingCollector;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.mutable.MutableValue;
import java.io.IOException;
import java.util.*;
/**
* Convenience class to perform grouping in a non distributed environment.
*
* @lucene.experimental
*/
public class GroupingSearch {
private final String groupField;
private final ValueSource groupFunction;
private final Map<?, ?> valueSourceContext;
private final Filter groupEndDocs;
private final DocValues.Type docValuesType;
private final boolean diskResidentDocValues;
private Sort groupSort = Sort.RELEVANCE;
private Sort sortWithinGroup;
private int groupDocsOffset;
private int groupDocsLimit = 1;
private boolean fillSortFields;
private boolean includeScores = true;
private boolean includeMaxScore = true;
private Double maxCacheRAMMB;
private Integer maxDocsToCache;
private boolean cacheScores;
private boolean allGroups;
private boolean allGroupHeads;
private int initialSize = 128;
private Collection<?> matchingGroups;
private Bits matchingGroupHeads;
/**
* Constructs a <code>GroupingSearch</code> instance that groups documents by index terms using the {@link FieldCache}.
* The group field can only have one token per document. This means that the field must not be analysed.
*
* @param groupField The name of the field to group by.
*/
public GroupingSearch(String groupField) {
this(groupField, null, null, null, null, false);
}
/**
* Constructs a <code>GroupingSearch</code> instance that groups documents by doc values.
* This constructor can only be used when the groupField is a {@link DocValuesField}.
*
* @param groupField The name of the field to group by that contains doc values
* @param docValuesType The doc values type of the specified groupField
* @param diskResidentDocValues Whether the values to group by should be disk resident
*/
public GroupingSearch(String groupField, DocValues.Type docValuesType, boolean diskResidentDocValues) {
this(groupField, null, null, null, docValuesType, diskResidentDocValues);
}
/**
* Constructs a <code>GroupingSearch</code> instance that groups documents by function using a {@link ValueSource}
* instance.
*
* @param groupFunction The function to group by specified as {@link ValueSource}
* @param valueSourceContext The context of the specified groupFunction
*/
public GroupingSearch(ValueSource groupFunction, Map<?, ?> valueSourceContext) {
this(null, groupFunction, valueSourceContext, null, null, false);
}
/**
* Constructor for grouping documents by doc block.
* This constructor can only be used when documents belonging in a group are indexed in one block.
*
* @param groupEndDocs The filter that marks the last document in all doc blocks
*/
public GroupingSearch(Filter groupEndDocs) {
this(null, null, null, groupEndDocs, null, false);
}
private GroupingSearch(String groupField, ValueSource groupFunction, Map<?, ?> valueSourceContext, Filter groupEndDocs, DocValues.Type docValuesType, boolean diskResidentDocValues) {
this.groupField = groupField;
this.groupFunction = groupFunction;
this.valueSourceContext = valueSourceContext;
this.groupEndDocs = groupEndDocs;
this.docValuesType = docValuesType;
this.diskResidentDocValues = diskResidentDocValues;
}
/**
* Executes a grouped search. Both the first pass and second pass are executed on the specified searcher.
*
* @param searcher The {@link org.apache.lucene.search.IndexSearcher} instance to execute the grouped search on.
* @param query The query to execute with the grouping
* @param groupOffset The group offset
* @param groupLimit The number of groups to return from the specified group offset
* @return the grouped result as a {@link TopGroups} instance
* @throws IOException If any I/O related errors occur
*/
public <T> TopGroups<T> search(IndexSearcher searcher, Query query, int groupOffset, int groupLimit) throws IOException {
return search(searcher, null, query, groupOffset, groupLimit);
}
/**
* Executes a grouped search. Both the first pass and second pass are executed on the specified searcher.
*
* @param searcher The {@link org.apache.lucene.search.IndexSearcher} instance to execute the grouped search on.
* @param filter The filter to execute with the grouping
* @param query The query to execute with the grouping
* @param groupOffset The group offset
* @param groupLimit The number of groups to return from the specified group offset
* @return the grouped result as a {@link TopGroups} instance
* @throws IOException If any I/O related errors occur
*/
@SuppressWarnings("unchecked")
public <T> TopGroups<T> search(IndexSearcher searcher, Filter filter, Query query, int groupOffset, int groupLimit) throws IOException {
if (groupField != null || groupFunction != null) {
return groupByFieldOrFunction(searcher, filter, query, groupOffset, groupLimit);
} else if (groupEndDocs != null) {
return (TopGroups<T>) groupByDocBlock(searcher, filter, query, groupOffset, groupLimit);
} else {
throw new IllegalStateException("Either groupField, groupFunction or groupEndDocs must be set."); // This can't happen...
}
}
@SuppressWarnings({"unchecked", "rawtypes"})
protected TopGroups groupByFieldOrFunction(IndexSearcher searcher, Filter filter, Query query, int groupOffset, int groupLimit) throws IOException {
int topN = groupOffset + groupLimit;
final AbstractFirstPassGroupingCollector firstPassCollector;
final AbstractAllGroupsCollector allGroupsCollector;
final AbstractAllGroupHeadsCollector allGroupHeadsCollector;
if (groupFunction != null) {
firstPassCollector = new FunctionFirstPassGroupingCollector(groupFunction, valueSourceContext, groupSort, topN);
if (allGroups) {
allGroupsCollector = new FunctionAllGroupsCollector(groupFunction, valueSourceContext);
} else {
allGroupsCollector = null;
}
if (allGroupHeads) {
allGroupHeadsCollector = new FunctionAllGroupHeadsCollector(groupFunction, valueSourceContext, sortWithinGroup);
} else {
allGroupHeadsCollector = null;
}
} else if (docValuesType != null) {
firstPassCollector = DVFirstPassGroupingCollector.create(groupSort, topN, groupField, docValuesType, diskResidentDocValues);
if (allGroups) {
allGroupsCollector = DVAllGroupsCollector.create(groupField, docValuesType, diskResidentDocValues, initialSize);
} else {
allGroupsCollector = null;
}
if (allGroupHeads) {
allGroupHeadsCollector = DVAllGroupHeadsCollector.create(groupField, sortWithinGroup, docValuesType, diskResidentDocValues);
} else {
allGroupHeadsCollector = null;
}
} else {
firstPassCollector = new TermFirstPassGroupingCollector(groupField, groupSort, topN);
if (allGroups) {
allGroupsCollector = new TermAllGroupsCollector(groupField, initialSize);
} else {
allGroupsCollector = null;
}
if (allGroupHeads) {
allGroupHeadsCollector = TermAllGroupHeadsCollector.create(groupField, sortWithinGroup, initialSize);
} else {
allGroupHeadsCollector = null;
}
}
final Collector firstRound;
if (allGroupHeads || allGroups) {
List<Collector> collectors = new ArrayList<Collector>();
collectors.add(firstPassCollector);
if (allGroupHeads) {
collectors.add(allGroupsCollector);
}
if (allGroupHeads) {
collectors.add(allGroupHeadsCollector);
}
firstRound = MultiCollector.wrap(collectors.toArray(new Collector[collectors.size()]));
} else {
firstRound = firstPassCollector;
}
CachingCollector cachedCollector = null;
if (maxCacheRAMMB != null || maxDocsToCache != null) {
if (maxCacheRAMMB != null) {
cachedCollector = CachingCollector.create(firstRound, cacheScores, maxCacheRAMMB);
} else {
cachedCollector = CachingCollector.create(firstRound, cacheScores, maxDocsToCache);
}
searcher.search(query, filter, cachedCollector);
} else {
searcher.search(query, filter, firstRound);
}
if (allGroups) {
matchingGroups = allGroupsCollector.getGroups();
} else {
matchingGroups = Collections.emptyList();
}
if (allGroupHeads) {
matchingGroupHeads = allGroupHeadsCollector.retrieveGroupHeads(searcher.getIndexReader().maxDoc());
} else {
matchingGroupHeads = new Bits.MatchNoBits(searcher.getIndexReader().maxDoc());
}
Collection<SearchGroup> topSearchGroups = firstPassCollector.getTopGroups(groupOffset, fillSortFields);
if (topSearchGroups == null) {
return new TopGroups(new SortField[0], new SortField[0], 0, 0, new GroupDocs[0]);
}
int topNInsideGroup = groupDocsOffset + groupDocsLimit;
AbstractSecondPassGroupingCollector secondPassCollector;
if (groupFunction != null) {
secondPassCollector = new FunctionSecondPassGroupingCollector((Collection) topSearchGroups, groupSort, sortWithinGroup, topNInsideGroup, includeScores, includeMaxScore, fillSortFields, groupFunction, valueSourceContext);
} else if (docValuesType != null) {
secondPassCollector = DVSecondPassGroupingCollector.create(groupField, diskResidentDocValues, docValuesType, (Collection) topSearchGroups, groupSort, sortWithinGroup, topNInsideGroup, includeScores, includeMaxScore, fillSortFields);
} else {
secondPassCollector = new TermSecondPassGroupingCollector(groupField, (Collection) topSearchGroups, groupSort, sortWithinGroup, topNInsideGroup, includeScores, includeMaxScore, fillSortFields);
}
if (cachedCollector != null && cachedCollector.isCached()) {
cachedCollector.replay(secondPassCollector);
} else {
searcher.search(query, filter, secondPassCollector);
}
if (allGroups) {
return new TopGroups(secondPassCollector.getTopGroups(groupDocsOffset), matchingGroups.size());
} else {
return secondPassCollector.getTopGroups(groupDocsOffset);
}
}
protected TopGroups<?> groupByDocBlock(IndexSearcher searcher, Filter filter, Query query, int groupOffset, int groupLimit) throws IOException {
int topN = groupOffset + groupLimit;
BlockGroupingCollector c = new BlockGroupingCollector(groupSort, topN, includeScores, groupEndDocs);
searcher.search(query, filter, c);
int topNInsideGroup = groupDocsOffset + groupDocsLimit;
return c.getTopGroups(sortWithinGroup, groupOffset, groupDocsOffset, topNInsideGroup, fillSortFields);
}
/**
* Enables caching for the second pass search. The cache will not grow over a specified limit in MB.
* The cache is filled during the first pass searched and then replayed during the second pass searched.
* If the cache grows beyond the specified limit, then the cache is purged and not used in the second pass search.
*
* @param maxCacheRAMMB The maximum amount in MB the cache is allowed to hold
* @param cacheScores Whether to cache the scores
* @return <code>this</code>
*/
public GroupingSearch setCachingInMB(double maxCacheRAMMB, boolean cacheScores) {
this.maxCacheRAMMB = maxCacheRAMMB;
this.maxDocsToCache = null;
this.cacheScores = cacheScores;
return this;
}
/**
* Enables caching for the second pass search. The cache will not contain more than the maximum specified documents.
* The cache is filled during the first pass searched and then replayed during the second pass searched.
* If the cache grows beyond the specified limit, then the cache is purged and not used in the second pass search.
*
* @param maxDocsToCache The maximum number of documents the cache is allowed to hold
* @param cacheScores Whether to cache the scores
* @return <code>this</code>
*/
public GroupingSearch setCaching(int maxDocsToCache, boolean cacheScores) {
this.maxDocsToCache = maxDocsToCache;
this.maxCacheRAMMB = null;
this.cacheScores = cacheScores;
return this;
}
/**
* Disables any enabled cache.
*
* @return <code>this</code>
*/
public GroupingSearch disableCaching() {
this.maxCacheRAMMB = null;
this.maxDocsToCache = null;
return this;
}
/**
* Specifies how groups are sorted.
* Defaults to {@link Sort#RELEVANCE}.
*
* @param groupSort The sort for the groups.
* @return <code>this</code>
*/
public GroupingSearch setGroupSort(Sort groupSort) {
this.groupSort = groupSort;
return this;
}
/**
* Specified how documents inside a group are sorted.
* Defaults to {@link Sort#RELEVANCE}.
*
* @param sortWithinGroup The sort for documents inside a group
* @return <code>this</code>
*/
public GroupingSearch setSortWithinGroup(Sort sortWithinGroup) {
this.sortWithinGroup = sortWithinGroup;
return this;
}
/**
* Specifies the offset for documents inside a group.
*
* @param groupDocsOffset The offset for documents inside a
* @return <code>this</code>
*/
public GroupingSearch setGroupDocsOffset(int groupDocsOffset) {
this.groupDocsOffset = groupDocsOffset;
return this;
}
/**
* Specifies the number of documents to return inside a group from the specified groupDocsOffset.
*
* @param groupDocsLimit The number of documents to return inside a group
* @return <code>this</code>
*/
public GroupingSearch setGroupDocsLimit(int groupDocsLimit) {
this.groupDocsLimit = groupDocsLimit;
return this;
}
/**
* Whether to also fill the sort fields per returned group and groups docs.
*
* @param fillSortFields Whether to also fill the sort fields per returned group and groups docs
* @return <code>this</code>
*/
public GroupingSearch setFillSortFields(boolean fillSortFields) {
this.fillSortFields = fillSortFields;
return this;
}
/**
* Whether to include the scores per doc inside a group.
*
* @param includeScores Whether to include the scores per doc inside a group
* @return <code>this</code>
*/
public GroupingSearch setIncludeScores(boolean includeScores) {
this.includeScores = includeScores;
return this;
}
/**
* Whether to include the score of the most relevant document per group.
*
* @param includeMaxScore Whether to include the score of the most relevant document per group
* @return <code>this</code>
*/
public GroupingSearch setIncludeMaxScore(boolean includeMaxScore) {
this.includeMaxScore = includeMaxScore;
return this;
}
/**
* Whether to also co0.0mpute all groups matching the query.
* This can be used to determine the number of groups, which can be used for accurate pagination.
* <p/>
* When grouping by doc block the number of groups are automatically included in the {@link TopGroups} and this
* option doesn't have any influence.
*
* @param allGroups to also compute all groups matching the query
* @return <code>this</code>
*/
public GroupingSearch setAllGroups(boolean allGroups) {
this.allGroups = allGroups;
return this;
}
/**
* If {@link #setAllGroups(boolean)} was set to <code>true</code> then all matching groups are returned, otherwise
* an empty collection is returned.
*
* @param <T> The group value type. This can be a {@link BytesRef} or a {@link MutableValue} instance. If grouping
* by doc block this the group value is always <code>null</code>.
* @return all matching groups are returned, or an empty collection
*/
@SuppressWarnings({"unchecked", "rawtypes"})
public <T> Collection<T> getAllMatchingGroups() {
return (Collection<T>) matchingGroups;
}
/**
* Whether to compute all group heads (most relevant document per group) matching the query.
* <p/>
* This feature isn't enabled when grouping by doc block.
*
* @param allGroupHeads Whether to compute all group heads (most relevant document per group) matching the query
* @return <code>this</code>
*/
public GroupingSearch setAllGroupHeads(boolean allGroupHeads) {
this.allGroupHeads = allGroupHeads;
return this;
}
/**
* Returns the matching group heads if {@link #setAllGroupHeads(boolean)} was set to true or an empty bit set.
*
* @return The matching group heads if {@link #setAllGroupHeads(boolean)} was set to true or an empty bit set
*/
public Bits getAllGroupHeads() {
return matchingGroupHeads;
}
/**
* Sets the initial size of some internal used data structures.
* This prevents growing data structures many times. This can improve the performance of the grouping at the cost of
* more initial RAM.
* <p/>
* The {@link #allGroups} and {@link #allGroupHeads} features use this option.
* Defaults to 128.
*
* @param initialSize The initial size of some internal used data structures
* @return <code>this</code>
*/
public GroupingSearch setInitialSize(int initialSize) {
this.initialSize = initialSize;
return this;
}
}

View File

@ -63,7 +63,7 @@ field fall into a single group.</p>
<p>Known limitations:</p> <p>Known limitations:</p>
<ul> <ul>
<li> For the two-pass grouping collector, the group field must be a <li> For the two-pass grouping search, the group field must be a
single-valued indexed field. single-valued indexed field.
{@link org.apache.lucene.search.FieldCache} is used to load the {@link org.apache.lucene.search.FieldCache.DocTermsIndex} for this field. {@link org.apache.lucene.search.FieldCache} is used to load the {@link org.apache.lucene.search.FieldCache.DocTermsIndex} for this field.
<li> Although Solr support grouping by function and this module has abstraction of what a group is, there are currently only <li> Although Solr support grouping by function and this module has abstraction of what a group is, there are currently only
@ -73,50 +73,30 @@ field fall into a single group.</p>
group yourself. group yourself.
</ul> </ul>
<p>Typical usage for the generic two-pass collector looks like this <p>Typical usage for the generic two-pass grouping search looks like this using the grouping convenience utility
(using the {@link org.apache.lucene.search.CachingCollector}):</p> (optionally using caching for the second pass search):</p>
<pre class="prettyprint"> <pre class="prettyprint">
TermFirstPassGroupingCollector c1 = new TermFirstPassGroupingCollector("author", groupSort, groupOffset+topNGroups); GroupingSearch groupingSearch = new GroupingSearch("author");
groupingSearch.setGroupSort(groupSort);
groupingSearch.setFillSortFields(fillFields);
boolean cacheScores = true; if (useCache) {
double maxCacheRAMMB = 4.0; // Sets cache in MB
CachingCollector cachedCollector = CachingCollector.create(c1, cacheScores, maxCacheRAMMB); groupingSearch.setCachingInMB(4.0, true);
s.search(new TermQuery(new Term("content", searchTerm)), cachedCollector);
Collection&lt;SearchGroup&lt;BytesRef&gt;&gt; topGroups = c1.getTopGroups(groupOffset, fillFields);
if (topGroups == null) {
// No groups matched
return;
} }
boolean getScores = true;
boolean getMaxScores = true;
boolean fillFields = true;
TermSecondPassGroupingCollector c2 = new TermSecondPassGroupingCollector("author", topGroups, groupSort, docSort, docOffset+docsPerGroup, getScores, getMaxScores, fillFields);
//Optionally compute total group count
TermAllGroupsCollector allGroupsCollector = null;
if (requiredTotalGroupCount) { if (requiredTotalGroupCount) {
allGroupsCollector = new TermAllGroupsCollector("author"); groupingSearch.setAllGroups(true);
c2 = MultiCollector.wrap(c2, allGroupsCollector);
} }
if (cachedCollector.isCached()) { TermQuery query = new TermQuery(new Term("content", searchTerm));
// Cache fit within maxCacheRAMMB, so we can replay it: TopGroups&lt;BytesRef&gt; result = groupingSearch.search(indexSearcher, query, groupOffset, groupLimit);
cachedCollector.replay(c2);
} else {
// Cache was too large; must re-execute query:
s.search(new TermQuery(new Term("content", searchTerm)), c2);
}
TopGroups&lt;BytesRef&gt; groupsResult = c2.getTopGroups(docOffset);
if (requiredTotalGroupCount) {
groupsResult = new TopGroups&lt;BytesRef&gt;(groupsResult, allGroupsCollector.getGroupCount());
}
// Render groupsResult... // Render groupsResult...
if (requiredTotalGroupCount) {
int totalGroupCount = result.totalGroupCount;
}
</pre> </pre>
<p>To use the single-pass <code>BlockGroupingCollector</code>, <p>To use the single-pass <code>BlockGroupingCollector</code>,
@ -159,6 +139,19 @@ Finally, do this per search:
// Render groupsResult... // Render groupsResult...
</pre> </pre>
Or alternatively use the <code>GroupingSearch</code> convenience utility:
<pre class="prettyprint">
// Per search:
GroupingSearch groupingSearch = new GroupingSearch(groupEndDocs);
groupingSearch.setGroupSort(groupSort);
groupingSearch.setIncludeScores(needsScores);
TermQuery query = new TermQuery(new Term("content", searchTerm));
TopGroups groupsResult = groupingSearch.search(indexSearcher, query, groupOffset, groupLimit);
// Render groupsResult...
</pre>
Note that the <code>groupValue</code> of each <code>GroupDocs</code> Note that the <code>groupValue</code> of each <code>GroupDocs</code>
will be <code>null</code>, so if you need to present this value you'll will be <code>null</code>, so if you need to present this value you'll
have to separately retrieve it (for example using stored have to separately retrieve it (for example using stored
@ -167,7 +160,8 @@ fields, <code>FieldCache</code>, etc.).
<p>Another collector is the <code>TermAllGroupHeadsCollector</code> that can be used to retrieve all most relevant <p>Another collector is the <code>TermAllGroupHeadsCollector</code> that can be used to retrieve all most relevant
documents per group. Also known as group heads. This can be useful in situations when one wants to compute group documents per group. Also known as group heads. This can be useful in situations when one wants to compute group
based facets / statistics on the complete query result. The collector can be executed during the first or second based facets / statistics on the complete query result. The collector can be executed during the first or second
phase.</p> phase. This collector can also be used with the <code>GroupingSearch</code> convenience utility, but when if one only
wants to compute the most relevant documents per group it is better to just use the collector as done here below.</p>
<pre class="prettyprint"> <pre class="prettyprint">
AbstractAllGroupHeadsCollector c = TermAllGroupHeadsCollector.create(groupField, sortWithinGroup); AbstractAllGroupHeadsCollector c = TermAllGroupHeadsCollector.create(groupField, sortWithinGroup);
@ -182,39 +176,27 @@ fields, <code>FieldCache</code>, etc.).
<p>For each of the above collector types there is also a variant that works with <code>ValueSource</code> instead of <p>For each of the above collector types there is also a variant that works with <code>ValueSource</code> instead of
of fields. Concretely this means that these variants can work with functions. These variants are slower than of fields. Concretely this means that these variants can work with functions. These variants are slower than
there term based counter parts. These implementations are located in the there term based counter parts. These implementations are located in the
<code>org.apache.lucene.search.grouping.function</code> package. <code>org.apache.lucene.search.grouping.function</code> package, but can also be used with the
<code>GroupingSearch</code> convenience utility
</p> </p>
<p> <p>
There are also DocValues based implementations available for the group collectors. There are factory methods There are also DocValues based implementations available for the group collectors. There are factory methods
available for creating dv based instances. A typical example using dv based grouping collectors: available for creating dv based instances. A typical example using dv based grouping with the
<code>GroupingSearch</code> convenience utility:
</p> </p>
<pre class="prettyprint"> <pre class="prettyprint">
boolean diskResident = true; // Whether values should fetched directly from disk by passing the Java heap space. boolean diskResident = true; // Whether values should fetched directly from disk by passing the Java heap space.
AbstractFirstPassGroupingCollector c1 = DVFirstPassGroupingCollector.create( DocValues.Type docValuesType = DocValues.Type.BYTES_VAR_SORTED;
groupSort, groupOffset+topNGroups, "author", DocValues.Type.BYTES_VAR_SORTED, diskResident GroupingSearch groupingSearch = new GroupingSearch("author", docValuesType, diskResident);
); groupingSearch.setGroupSort(groupSort);
groupingSearch.setFillSortFields(fillFields);
s.search(new TermQuery(new Term("content", searchTerm)), c1); TermQuery query = new TermQuery(new Term("content", searchTerm));
// The docValuesType variable decides the generic type. When float is used this Double and in case of int this is Long
TopGroups&lt;BytesRef&gt; result = groupingSearch.search(indexSearcher, query, groupOffset, groupLimit);
Collection&lt;SearchGroup&lt;BytesRef&gt;&gt; topGroups = c1.getTopGroups(groupOffset, fillFields);
if (topGroups == null) {
// No groups matched
return;
}
boolean getScores = true;
boolean getMaxScores = true;
boolean fillFields = true;
AbstractSecondPassGroupingCollector&lt;BytesRef&gt; c2 = DVSecondPassGroupingCollector.create(
"author", diskResident, DocValues.Type.BYTES_VAR_SORTED, topGroups, groupSort, docSort,
docOffset+docsPerGroup, getScores, getMaxScores, fillFields
);
s.search(new TermQuery(new Term("content", searchTerm)), c2);
TopGroups&lt;BytesRef&gt; groupsResult = c2.getTopGroups(docOffset);
// Render groupsResult... // Render groupsResult...
</pre> </pre>

View File

@ -0,0 +1,222 @@
package org.apache.lucene.search.grouping;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.queries.function.valuesource.BytesRefFieldSource;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.mutable.MutableValueStr;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
public class GroupingSearchTest extends LuceneTestCase {
// Tests some very basic usages...
public void testBasic() throws Exception {
final String groupField = "author";
FieldType customType = new FieldType();
customType.setStored(true);
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(
random,
dir,
newIndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
boolean canUseIDV = !"Lucene3x".equals(w.w.getConfig().getCodec().getName());
List<Document> documents = new ArrayList<Document>();
// 0
Document doc = new Document();
addGroupField(doc, groupField, "author1", canUseIDV);
doc.add(new Field("content", "random text", TextField.TYPE_STORED));
doc.add(new Field("id", "1", customType));
documents.add(doc);
// 1
doc = new Document();
addGroupField(doc, groupField, "author1", canUseIDV);
doc.add(new Field("content", "some more random text", TextField.TYPE_STORED));
doc.add(new Field("id", "2", customType));
documents.add(doc);
// 2
doc = new Document();
addGroupField(doc, groupField, "author1", canUseIDV);
doc.add(new Field("content", "some more random textual data", TextField.TYPE_STORED));
doc.add(new Field("id", "3", customType));
doc.add(new Field("groupend", "x", StringField.TYPE_UNSTORED));
documents.add(doc);
w.addDocuments(documents);
documents.clear();
// 3
doc = new Document();
addGroupField(doc, groupField, "author2", canUseIDV);
doc.add(new Field("content", "some random text", TextField.TYPE_STORED));
doc.add(new Field("id", "4", customType));
doc.add(new Field("groupend", "x", StringField.TYPE_UNSTORED));
w.addDocument(doc);
// 4
doc = new Document();
addGroupField(doc, groupField, "author3", canUseIDV);
doc.add(new Field("content", "some more random text", TextField.TYPE_STORED));
doc.add(new Field("id", "5", customType));
documents.add(doc);
// 5
doc = new Document();
addGroupField(doc, groupField, "author3", canUseIDV);
doc.add(new Field("content", "random", TextField.TYPE_STORED));
doc.add(new Field("id", "6", customType));
doc.add(new Field("groupend", "x", StringField.TYPE_UNSTORED));
documents.add(doc);
w.addDocuments(documents);
documents.clear();
// 6 -- no author field
doc = new Document();
doc.add(new Field("content", "random word stuck in alot of other text", TextField.TYPE_STORED));
doc.add(new Field("id", "6", customType));
doc.add(new Field("groupend", "x", StringField.TYPE_UNSTORED));
w.addDocument(doc);
IndexSearcher indexSearcher = new IndexSearcher(w.getReader());
w.close();
Sort groupSort = Sort.RELEVANCE;
GroupingSearch groupingSearch = createRandomGroupingSearch(groupField, groupSort, 5, canUseIDV);
TopGroups<?> groups = groupingSearch.search(indexSearcher, null, new TermQuery(new Term("content", "random")), 0, 10);
assertEquals(7, groups.totalHitCount);
assertEquals(7, groups.totalGroupedHitCount);
assertEquals(4, groups.groups.length);
// relevance order: 5, 0, 3, 4, 1, 2, 6
// the later a document is added the higher this docId
// value
GroupDocs<?> group = groups.groups[0];
compareGroupValue("author3", group);
assertEquals(2, group.scoreDocs.length);
assertEquals(5, group.scoreDocs[0].doc);
assertEquals(4, group.scoreDocs[1].doc);
assertTrue(group.scoreDocs[0].score > group.scoreDocs[1].score);
group = groups.groups[1];
compareGroupValue("author1", group);
assertEquals(3, group.scoreDocs.length);
assertEquals(0, group.scoreDocs[0].doc);
assertEquals(1, group.scoreDocs[1].doc);
assertEquals(2, group.scoreDocs[2].doc);
assertTrue(group.scoreDocs[0].score > group.scoreDocs[1].score);
assertTrue(group.scoreDocs[1].score > group.scoreDocs[2].score);
group = groups.groups[2];
compareGroupValue("author2", group);
assertEquals(1, group.scoreDocs.length);
assertEquals(3, group.scoreDocs[0].doc);
group = groups.groups[3];
compareGroupValue(null, group);
assertEquals(1, group.scoreDocs.length);
assertEquals(6, group.scoreDocs[0].doc);
Filter lastDocInBlock = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("groupend", "x"))));
groupingSearch = new GroupingSearch(lastDocInBlock);
groups = groupingSearch.search(indexSearcher, null, new TermQuery(new Term("content", "random")), 0, 10);
assertEquals(7, groups.totalHitCount);
assertEquals(7, groups.totalGroupedHitCount);
assertEquals(4, groups.totalGroupCount.longValue());
assertEquals(4, groups.groups.length);
indexSearcher.getIndexReader().close();
dir.close();
}
private void addGroupField(Document doc, String groupField, String value, boolean canUseIDV) {
doc.add(new Field(groupField, value, TextField.TYPE_STORED));
if (canUseIDV) {
doc.add(new DocValuesField(groupField, new BytesRef(value), DocValues.Type.BYTES_VAR_SORTED));
}
}
private void compareGroupValue(String expected, GroupDocs<?> group) {
if (expected == null) {
if (group.groupValue == null) {
return;
} else if (group.groupValue.getClass().isAssignableFrom(MutableValueStr.class)) {
return;
} else if (((BytesRef) group.groupValue).length == 0) {
return;
}
fail();
}
if (group.groupValue.getClass().isAssignableFrom(BytesRef.class)) {
assertEquals(new BytesRef(expected), group.groupValue);
} else if (group.groupValue.getClass().isAssignableFrom(MutableValueStr.class)) {
MutableValueStr v = new MutableValueStr();
v.value = new BytesRef(expected);
assertEquals(v, group.groupValue);
} else {
fail();
}
}
private GroupingSearch createRandomGroupingSearch(String groupField, Sort groupSort, int docsInGroup, boolean canUseIDV) throws IOException {
GroupingSearch groupingSearch;
if (random.nextBoolean()) {
ValueSource vs = new BytesRefFieldSource(groupField);
groupingSearch = new GroupingSearch(vs, new HashMap<Object, Object>());
} else {
if (canUseIDV && random.nextBoolean()) {
boolean diskResident = random.nextBoolean();
groupingSearch = new GroupingSearch(groupField, DocValues.Type.BYTES_VAR_SORTED, diskResident);
} else {
groupingSearch = new GroupingSearch(groupField);
}
}
groupingSearch.setGroupSort(groupSort);
groupingSearch.setGroupDocsLimit(docsInGroup);
if (random.nextBoolean()) {
groupingSearch.setCachingInMB(4.0, true);
}
return groupingSearch;
}
}