From 89b55566d8392657be86e1441a329373e9132257 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Thu, 22 Mar 2012 16:15:42 +0000 Subject: [PATCH] LUCENE-3778: Added a grouping utility class that makes it easier to use result grouping for pure Lucene apps. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1303871 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/contrib/CHANGES.txt | 3 + .../search/grouping/GroupingSearch.java | 469 ++++++++++++++++++ .../lucene/search/grouping/package.html | 100 ++-- .../search/grouping/GroupingSearchTest.java | 222 +++++++++ 4 files changed, 735 insertions(+), 59 deletions(-) create mode 100644 modules/grouping/src/java/org/apache/lucene/search/grouping/GroupingSearch.java create mode 100644 modules/grouping/src/test/org/apache/lucene/search/grouping/GroupingSearchTest.java diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index 8edd349fd15..ec35d87d0b1 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -77,6 +77,9 @@ New Features * LUCENE-3444: Added a second pass grouping collector that keeps track of distinct values for a specified field for the top N group. (Martijn van Groningen) + * LUCENE-3778: Added a grouping utility class that makes it easier to use result + grouping for pure Lucene apps. (Martijn van Groningen) + API Changes * LUCENE-2606: Changed RegexCapabilities interface to fix thread diff --git a/modules/grouping/src/java/org/apache/lucene/search/grouping/GroupingSearch.java b/modules/grouping/src/java/org/apache/lucene/search/grouping/GroupingSearch.java new file mode 100644 index 00000000000..e2d0210f464 --- /dev/null +++ b/modules/grouping/src/java/org/apache/lucene/search/grouping/GroupingSearch.java @@ -0,0 +1,469 @@ +package org.apache.lucene.search.grouping; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.document.DocValuesField; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.queries.function.ValueSource; +import org.apache.lucene.search.*; +import org.apache.lucene.search.grouping.dv.DVAllGroupHeadsCollector; +import org.apache.lucene.search.grouping.dv.DVAllGroupsCollector; +import org.apache.lucene.search.grouping.dv.DVFirstPassGroupingCollector; +import org.apache.lucene.search.grouping.dv.DVSecondPassGroupingCollector; +import org.apache.lucene.search.grouping.function.FunctionAllGroupHeadsCollector; +import org.apache.lucene.search.grouping.function.FunctionAllGroupsCollector; +import org.apache.lucene.search.grouping.function.FunctionFirstPassGroupingCollector; +import org.apache.lucene.search.grouping.function.FunctionSecondPassGroupingCollector; +import org.apache.lucene.search.grouping.term.TermAllGroupHeadsCollector; +import org.apache.lucene.search.grouping.term.TermAllGroupsCollector; +import org.apache.lucene.search.grouping.term.TermFirstPassGroupingCollector; +import org.apache.lucene.search.grouping.term.TermSecondPassGroupingCollector; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.mutable.MutableValue; + +import java.io.IOException; +import java.util.*; + +/** + * Convenience class to perform grouping in a non distributed environment. + * + * @lucene.experimental + */ +public class GroupingSearch { + + private final String groupField; + private final ValueSource groupFunction; + private final Map valueSourceContext; + private final Filter groupEndDocs; + private final DocValues.Type docValuesType; + private final boolean diskResidentDocValues; + + private Sort groupSort = Sort.RELEVANCE; + private Sort sortWithinGroup; + + private int groupDocsOffset; + private int groupDocsLimit = 1; + private boolean fillSortFields; + private boolean includeScores = true; + private boolean includeMaxScore = true; + + private Double maxCacheRAMMB; + private Integer maxDocsToCache; + private boolean cacheScores; + private boolean allGroups; + private boolean allGroupHeads; + private int initialSize = 128; + + private Collection matchingGroups; + private Bits matchingGroupHeads; + + /** + * Constructs a GroupingSearch instance that groups documents by index terms using the {@link FieldCache}. + * The group field can only have one token per document. This means that the field must not be analysed. + * + * @param groupField The name of the field to group by. + */ + public GroupingSearch(String groupField) { + this(groupField, null, null, null, null, false); + } + + /** + * Constructs a GroupingSearch instance that groups documents by doc values. + * This constructor can only be used when the groupField is a {@link DocValuesField}. + * + * @param groupField The name of the field to group by that contains doc values + * @param docValuesType The doc values type of the specified groupField + * @param diskResidentDocValues Whether the values to group by should be disk resident + */ + public GroupingSearch(String groupField, DocValues.Type docValuesType, boolean diskResidentDocValues) { + this(groupField, null, null, null, docValuesType, diskResidentDocValues); + } + + /** + * Constructs a GroupingSearch instance that groups documents by function using a {@link ValueSource} + * instance. 
+ * + * @param groupFunction The function to group by specified as {@link ValueSource} + * @param valueSourceContext The context of the specified groupFunction + */ + public GroupingSearch(ValueSource groupFunction, Map valueSourceContext) { + this(null, groupFunction, valueSourceContext, null, null, false); + } + + /** + * Constructor for grouping documents by doc block. + * This constructor can only be used when documents belonging in a group are indexed in one block. + * + * @param groupEndDocs The filter that marks the last document in all doc blocks + */ + public GroupingSearch(Filter groupEndDocs) { + this(null, null, null, groupEndDocs, null, false); + } + + private GroupingSearch(String groupField, ValueSource groupFunction, Map valueSourceContext, Filter groupEndDocs, DocValues.Type docValuesType, boolean diskResidentDocValues) { + this.groupField = groupField; + this.groupFunction = groupFunction; + this.valueSourceContext = valueSourceContext; + this.groupEndDocs = groupEndDocs; + this.docValuesType = docValuesType; + this.diskResidentDocValues = diskResidentDocValues; + } + + /** + * Executes a grouped search. Both the first pass and second pass are executed on the specified searcher. + * + * @param searcher The {@link org.apache.lucene.search.IndexSearcher} instance to execute the grouped search on. + * @param query The query to execute with the grouping + * @param groupOffset The group offset + * @param groupLimit The number of groups to return from the specified group offset + * @return the grouped result as a {@link TopGroups} instance + * @throws IOException If any I/O related errors occur + */ + public TopGroups search(IndexSearcher searcher, Query query, int groupOffset, int groupLimit) throws IOException { + return search(searcher, null, query, groupOffset, groupLimit); + } + + /** + * Executes a grouped search. Both the first pass and second pass are executed on the specified searcher. + * + * @param searcher The {@link org.apache.lucene.search.IndexSearcher} instance to execute the grouped search on. + * @param filter The filter to execute with the grouping + * @param query The query to execute with the grouping + * @param groupOffset The group offset + * @param groupLimit The number of groups to return from the specified group offset + * @return the grouped result as a {@link TopGroups} instance + * @throws IOException If any I/O related errors occur + */ + @SuppressWarnings("unchecked") + public TopGroups search(IndexSearcher searcher, Filter filter, Query query, int groupOffset, int groupLimit) throws IOException { + if (groupField != null || groupFunction != null) { + return groupByFieldOrFunction(searcher, filter, query, groupOffset, groupLimit); + } else if (groupEndDocs != null) { + return (TopGroups) groupByDocBlock(searcher, filter, query, groupOffset, groupLimit); + } else { + throw new IllegalStateException("Either groupField, groupFunction or groupEndDocs must be set."); // This can't happen... 
+    }
+  }
+
+  @SuppressWarnings({"unchecked", "rawtypes"})
+  protected TopGroups groupByFieldOrFunction(IndexSearcher searcher, Filter filter, Query query, int groupOffset, int groupLimit) throws IOException {
+    int topN = groupOffset + groupLimit;
+    final AbstractFirstPassGroupingCollector firstPassCollector;
+    final AbstractAllGroupsCollector allGroupsCollector;
+    final AbstractAllGroupHeadsCollector allGroupHeadsCollector;
+    if (groupFunction != null) {
+      firstPassCollector = new FunctionFirstPassGroupingCollector(groupFunction, valueSourceContext, groupSort, topN);
+      if (allGroups) {
+        allGroupsCollector = new FunctionAllGroupsCollector(groupFunction, valueSourceContext);
+      } else {
+        allGroupsCollector = null;
+      }
+      if (allGroupHeads) {
+        allGroupHeadsCollector = new FunctionAllGroupHeadsCollector(groupFunction, valueSourceContext, sortWithinGroup);
+      } else {
+        allGroupHeadsCollector = null;
+      }
+    } else if (docValuesType != null) {
+      firstPassCollector = DVFirstPassGroupingCollector.create(groupSort, topN, groupField, docValuesType, diskResidentDocValues);
+      if (allGroups) {
+        allGroupsCollector = DVAllGroupsCollector.create(groupField, docValuesType, diskResidentDocValues, initialSize);
+      } else {
+        allGroupsCollector = null;
+      }
+      if (allGroupHeads) {
+        allGroupHeadsCollector = DVAllGroupHeadsCollector.create(groupField, sortWithinGroup, docValuesType, diskResidentDocValues);
+      } else {
+        allGroupHeadsCollector = null;
+      }
+    } else {
+      firstPassCollector = new TermFirstPassGroupingCollector(groupField, groupSort, topN);
+      if (allGroups) {
+        allGroupsCollector = new TermAllGroupsCollector(groupField, initialSize);
+      } else {
+        allGroupsCollector = null;
+      }
+      if (allGroupHeads) {
+        allGroupHeadsCollector = TermAllGroupHeadsCollector.create(groupField, sortWithinGroup, initialSize);
+      } else {
+        allGroupHeadsCollector = null;
+      }
+    }
+
+    final Collector firstRound;
+    if (allGroupHeads || allGroups) {
+      List collectors = new ArrayList();
+      collectors.add(firstPassCollector);
+      if (allGroups) {
+        collectors.add(allGroupsCollector);
+      }
+      if (allGroupHeads) {
+        collectors.add(allGroupHeadsCollector);
+      }
+      firstRound = MultiCollector.wrap(collectors.toArray(new Collector[collectors.size()]));
+    } else {
+      firstRound = firstPassCollector;
+    }
+
+    CachingCollector cachedCollector = null;
+    if (maxCacheRAMMB != null || maxDocsToCache != null) {
+      if (maxCacheRAMMB != null) {
+        cachedCollector = CachingCollector.create(firstRound, cacheScores, maxCacheRAMMB);
+      } else {
+        cachedCollector = CachingCollector.create(firstRound, cacheScores, maxDocsToCache);
+      }
+      searcher.search(query, filter, cachedCollector);
+    } else {
+      searcher.search(query, filter, firstRound);
+    }
+
+    if (allGroups) {
+      matchingGroups = allGroupsCollector.getGroups();
+    } else {
+      matchingGroups = Collections.emptyList();
+    }
+    if (allGroupHeads) {
+      matchingGroupHeads = allGroupHeadsCollector.retrieveGroupHeads(searcher.getIndexReader().maxDoc());
+    } else {
+      matchingGroupHeads = new Bits.MatchNoBits(searcher.getIndexReader().maxDoc());
+    }
+
+    Collection topSearchGroups = firstPassCollector.getTopGroups(groupOffset, fillSortFields);
+    if (topSearchGroups == null) {
+      return new TopGroups(new SortField[0], new SortField[0], 0, 0, new GroupDocs[0]);
+    }
+
+    int topNInsideGroup = groupDocsOffset + groupDocsLimit;
+    AbstractSecondPassGroupingCollector secondPassCollector;
+    if (groupFunction != null) {
+      secondPassCollector = new FunctionSecondPassGroupingCollector((Collection) topSearchGroups, groupSort,
+          sortWithinGroup, topNInsideGroup, includeScores, includeMaxScore, fillSortFields, groupFunction, valueSourceContext);
+    } else if (docValuesType != null) {
+      secondPassCollector = DVSecondPassGroupingCollector.create(groupField, diskResidentDocValues, docValuesType, (Collection) topSearchGroups, groupSort, sortWithinGroup, topNInsideGroup, includeScores, includeMaxScore, fillSortFields);
+    } else {
+      secondPassCollector = new TermSecondPassGroupingCollector(groupField, (Collection) topSearchGroups, groupSort, sortWithinGroup, topNInsideGroup, includeScores, includeMaxScore, fillSortFields);
+    }
+
+    if (cachedCollector != null && cachedCollector.isCached()) {
+      cachedCollector.replay(secondPassCollector);
+    } else {
+      searcher.search(query, filter, secondPassCollector);
+    }
+
+    if (allGroups) {
+      return new TopGroups(secondPassCollector.getTopGroups(groupDocsOffset), matchingGroups.size());
+    } else {
+      return secondPassCollector.getTopGroups(groupDocsOffset);
+    }
+  }
+
+  protected TopGroups groupByDocBlock(IndexSearcher searcher, Filter filter, Query query, int groupOffset, int groupLimit) throws IOException {
+    int topN = groupOffset + groupLimit;
+    BlockGroupingCollector c = new BlockGroupingCollector(groupSort, topN, includeScores, groupEndDocs);
+    searcher.search(query, filter, c);
+    int topNInsideGroup = groupDocsOffset + groupDocsLimit;
+    return c.getTopGroups(sortWithinGroup, groupOffset, groupDocsOffset, topNInsideGroup, fillSortFields);
+  }
+
+  /**
+   * Enables caching for the second pass search. The cache will not grow over a specified limit in MB.
+   * The cache is filled during the first pass search and then replayed during the second pass search.
+   * If the cache grows beyond the specified limit, then the cache is purged and not used in the second pass search.
+   *
+   * @param maxCacheRAMMB The maximum amount in MB the cache is allowed to hold
+   * @param cacheScores Whether to cache the scores
+   * @return this
+   */
+  public GroupingSearch setCachingInMB(double maxCacheRAMMB, boolean cacheScores) {
+    this.maxCacheRAMMB = maxCacheRAMMB;
+    this.maxDocsToCache = null;
+    this.cacheScores = cacheScores;
+    return this;
+  }
+
+  /**
+   * Enables caching for the second pass search. The cache will not contain more than the maximum specified documents.
+   * The cache is filled during the first pass search and then replayed during the second pass search.
+   * If the cache grows beyond the specified limit, then the cache is purged and not used in the second pass search.
+   *
+   * @param maxDocsToCache The maximum number of documents the cache is allowed to hold
+   * @param cacheScores Whether to cache the scores
+   * @return this
+   */
+  public GroupingSearch setCaching(int maxDocsToCache, boolean cacheScores) {
+    this.maxDocsToCache = maxDocsToCache;
+    this.maxCacheRAMMB = null;
+    this.cacheScores = cacheScores;
+    return this;
+  }
+
+  /**
+   * Disables any enabled cache.
+   *
+   * @return this
+   */
+  public GroupingSearch disableCaching() {
+    this.maxCacheRAMMB = null;
+    this.maxDocsToCache = null;
+    return this;
+  }
+
+  /**
+   * Specifies how groups are sorted.
+   * Defaults to {@link Sort#RELEVANCE}.
+   *
+   * @param groupSort The sort for the groups.
+   * @return this
+   */
+  public GroupingSearch setGroupSort(Sort groupSort) {
+    this.groupSort = groupSort;
+    return this;
+  }
+
+  /**
+   * Specifies how documents inside a group are sorted.
+   * Defaults to {@link Sort#RELEVANCE}.
+   *
+   * @param sortWithinGroup The sort for documents inside a group
+   * @return this
+   */
+  public GroupingSearch setSortWithinGroup(Sort sortWithinGroup) {
+    this.sortWithinGroup = sortWithinGroup;
+    return this;
+  }
+
+  /**
+   * Specifies the offset for documents inside a group.
+   *
+   * @param groupDocsOffset The offset for documents inside a group
+   * @return this
+   */
+  public GroupingSearch setGroupDocsOffset(int groupDocsOffset) {
+    this.groupDocsOffset = groupDocsOffset;
+    return this;
+  }
+
+  /**
+   * Specifies the number of documents to return inside a group from the specified groupDocsOffset.
+   *
+   * @param groupDocsLimit The number of documents to return inside a group
+   * @return this
+   */
+  public GroupingSearch setGroupDocsLimit(int groupDocsLimit) {
+    this.groupDocsLimit = groupDocsLimit;
+    return this;
+  }
+
+  /**
+   * Whether to also fill the sort fields per returned group and group docs.
+   *
+   * @param fillSortFields Whether to also fill the sort fields per returned group and group docs
+   * @return this
+   */
+  public GroupingSearch setFillSortFields(boolean fillSortFields) {
+    this.fillSortFields = fillSortFields;
+    return this;
+  }
+
+  /**
+   * Whether to include the scores per doc inside a group.
+   *
+   * @param includeScores Whether to include the scores per doc inside a group
+   * @return this
+   */
+  public GroupingSearch setIncludeScores(boolean includeScores) {
+    this.includeScores = includeScores;
+    return this;
+  }
+
+  /**
+   * Whether to include the score of the most relevant document per group.
+   *
+   * @param includeMaxScore Whether to include the score of the most relevant document per group
+   * @return this
+   */
+  public GroupingSearch setIncludeMaxScore(boolean includeMaxScore) {
+    this.includeMaxScore = includeMaxScore;
+    return this;
+  }
+
+  /**
+   * Whether to also compute all groups matching the query.
+   * This can be used to determine the number of groups, which can be used for accurate pagination.
+   *

+   * When grouping by doc block the number of groups is automatically included in the {@link TopGroups} and this
+   * option doesn't have any influence.
+   *
+   * @param allGroups Whether to also compute all groups matching the query
+   * @return this
+   */
+  public GroupingSearch setAllGroups(boolean allGroups) {
+    this.allGroups = allGroups;
+    return this;
+  }
+
+  /**
+   * If {@link #setAllGroups(boolean)} was set to true then all matching groups are returned, otherwise
+   * an empty collection is returned.
+   *
+   * @param The group value type. This can be a {@link BytesRef} or a {@link MutableValue} instance. If grouping
+   *        by doc block the group value is always null.
+   * @return all matching groups, or an empty collection
+   */
+  @SuppressWarnings({"unchecked", "rawtypes"})
+  public Collection getAllMatchingGroups() {
+    return (Collection) matchingGroups;
+  }
+
+  /**
+   * Whether to compute all group heads (most relevant document per group) matching the query.
+   *

+   * This feature isn't enabled when grouping by doc block.
+   *
+   * @param allGroupHeads Whether to compute all group heads (most relevant document per group) matching the query
+   * @return this
+   */
+  public GroupingSearch setAllGroupHeads(boolean allGroupHeads) {
+    this.allGroupHeads = allGroupHeads;
+    return this;
+  }
+
+  /**
+   * Returns the matching group heads if {@link #setAllGroupHeads(boolean)} was set to true, otherwise an empty bit set.
+   *
+   * @return the matching group heads if {@link #setAllGroupHeads(boolean)} was set to true, otherwise an empty bit set
+   */
+  public Bits getAllGroupHeads() {
+    return matchingGroupHeads;
+  }
+
+  /**
+   * Sets the initial size of some internally used data structures.
+   * This prevents the data structures from having to grow many times and can improve the performance of grouping
+   * at the cost of more initial RAM.
+   *

+   * The {@link #allGroups} and {@link #allGroupHeads} features use this option.
+   * Defaults to 128.
+   *
+   * @param initialSize The initial size of some internally used data structures
+   * @return this
+   */
+  public GroupingSearch setInitialSize(int initialSize) {
+    this.initialSize = initialSize;
+    return this;
+  }
+}
diff --git a/modules/grouping/src/java/org/apache/lucene/search/grouping/package.html b/modules/grouping/src/java/org/apache/lucene/search/grouping/package.html
index 0b5d5e7f14c..327c6c0dff5 100644
--- a/modules/grouping/src/java/org/apache/lucene/search/grouping/package.html
+++ b/modules/grouping/src/java/org/apache/lucene/search/grouping/package.html
@@ -63,7 +63,7 @@ field fall into a single group.

Known limitations:

    -
-  • For the two-pass grouping collector, the group field must be a
+  • For the two-pass grouping search, the group field must be a
     single-valued indexed field. {@link org.apache.lucene.search.FieldCache} is used to load the
     {@link org.apache.lucene.search.FieldCache.DocTermsIndex} for this field.
  • Although Solr supports grouping by function and this module has an abstraction of what a group is, there are currently only
@@ -73,50 +73,30 @@ field fall into a single group.

    group yourself.
-
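The first limitation above also means the group field should be indexed as a single token per document. A minimal indexing sketch, modelled on the test that accompanies this patch (the "author" field name is illustrative, the writer variable is an assumed IndexWriter, and the DocValuesField is only needed for the doc values based collectors):

  Document doc = new Document();
  // StringField is not analyzed, so the whole value ends up as a single term
  doc.add(new Field("author", "author1", StringField.TYPE_STORED));
  // optional: also store the group value as doc values for the DV based collectors
  doc.add(new DocValuesField("author", new BytesRef("author1"), DocValues.Type.BYTES_VAR_SORTED));
  doc.add(new Field("content", "some random text", TextField.TYPE_STORED));
  writer.addDocument(doc);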

-Typical usage for the generic two-pass collector looks like this
-  (using the {@link org.apache.lucene.search.CachingCollector}):

+

+Typical usage for the generic two-pass grouping search looks like this, using the GroupingSearch convenience utility
+  (optionally using caching for the second pass search):

-  TermFirstPassGroupingCollector c1 = new TermFirstPassGroupingCollector("author", groupSort, groupOffset+topNGroups);
+  GroupingSearch groupingSearch = new GroupingSearch("author");
+  groupingSearch.setGroupSort(groupSort);
+  groupingSearch.setFillSortFields(fillFields);
 
-  boolean cacheScores = true;
-  double maxCacheRAMMB = 4.0;
-  CachingCollector cachedCollector = CachingCollector.create(c1, cacheScores, maxCacheRAMMB);
-  s.search(new TermQuery(new Term("content", searchTerm)), cachedCollector);
-
-  Collection<SearchGroup<BytesRef>> topGroups = c1.getTopGroups(groupOffset, fillFields);
-
-  if (topGroups == null) {
-    // No groups matched
-    return;
+  if (useCache) {
+    // Sets cache in MB
+    groupingSearch.setCachingInMB(4.0, true);
   }
 
-  boolean getScores = true;
-  boolean getMaxScores = true;
-  boolean fillFields = true;
-  TermSecondPassGroupingCollector c2 = new TermSecondPassGroupingCollector("author", topGroups, groupSort, docSort, docOffset+docsPerGroup, getScores, getMaxScores, fillFields);
-
-  //Optionally compute total group count
-  TermAllGroupsCollector allGroupsCollector = null;
   if (requiredTotalGroupCount) {
-    allGroupsCollector = new TermAllGroupsCollector("author");
-    c2 = MultiCollector.wrap(c2, allGroupsCollector);
+    groupingSearch.setAllGroups(true);
   }
 
-  if (cachedCollector.isCached()) {
-    // Cache fit within maxCacheRAMMB, so we can replay it:
-    cachedCollector.replay(c2);
-  } else {
-    // Cache was too large; must re-execute query:
-    s.search(new TermQuery(new Term("content", searchTerm)), c2);
-  }
-        
-  TopGroups<BytesRef> groupsResult = c2.getTopGroups(docOffset);
-  if (requiredTotalGroupCount) {
-    groupsResult = new TopGroups<BytesRef>(groupsResult, allGroupsCollector.getGroupCount());
-  }
+  TermQuery query = new TermQuery(new Term("content", searchTerm));
+  TopGroups<BytesRef> result = groupingSearch.search(indexSearcher, query, groupOffset, groupLimit);
 
   // Render groupsResult...
+  if (requiredTotalGroupCount) {
+    int totalGroupCount = result.totalGroupCount;
+  }
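The "Render groupsResult" step is left open above. A minimal sketch of walking the returned TopGroups, assuming each document carries a stored "id" field (the field name is illustrative, not part of the grouping API):

  for (GroupDocs<BytesRef> groupDocs : result.groups) {
    BytesRef groupValue = groupDocs.groupValue; // null for documents that have no value in the group field
    System.out.println("group=" + (groupValue == null ? "<none>" : groupValue.utf8ToString())
        + " totalHits=" + groupDocs.totalHits);
    for (ScoreDoc scoreDoc : groupDocs.scoreDocs) {
      Document doc = indexSearcher.doc(scoreDoc.doc);
      System.out.println("  id=" + doc.get("id") + " score=" + scoreDoc.score);
    }
  }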
 

To use the single-pass BlockGroupingCollector,
@@ -159,6 +139,19 @@ Finally, do this per search:
   // Render groupsResult...
+Or alternatively use the GroupingSearch convenience utility:
+

+  // Per search:
+  GroupingSearch groupingSearch = new GroupingSearch(groupEndDocs);
+  groupingSearch.setGroupSort(groupSort);
+  groupingSearch.setIncludeScores(needsScores);
+  TermQuery query = new TermQuery(new Term("content", searchTerm));
+  TopGroups groupsResult = groupingSearch.search(indexSearcher, query, groupOffset, groupLimit);
+
+  // Render groupsResult...
+
+
 Note that the groupValue of each GroupDocs will be null, so if you need to
 present this value you'll have to separately retrieve it (for example using stored
@@ -167,7 +160,8 @@ fields, FieldCache, etc.).
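One way to get a displayable group value back when grouping by doc block is to read a stored field from the most relevant document of each group; a minimal sketch, assuming the group value was also indexed as a stored "author" field (an assumption, not something the block grouping requires):

  for (GroupDocs<?> groupDocs : groupsResult.groups) {
    // groupDocs.groupValue is null for doc block grouping, so read a stored field instead
    String author = indexSearcher.doc(groupDocs.scoreDocs[0].doc).get("author");
  }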

Another collector is the TermAllGroupHeadsCollector that can be used to retrieve the most relevant document per group,
 also known as the group heads. This can be useful in situations when one wants to compute group based facets /
 statistics on the complete query result. The collector can be executed during the first or second
-  phase.

+  phase. This collector can also be used with the GroupingSearch convenience utility, but if one only
+  wants to compute the most relevant documents per group it is better to just use the collector directly, as shown below.

   AbstractAllGroupHeadsCollector c = TermAllGroupHeadsCollector.create(groupField, sortWithinGroup);
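If the GroupingSearch utility is used anyway, the same information can be requested through its setAllGroupHeads and setAllGroups options. A minimal sketch (the field name, query and sort variables are the same illustrative ones as in the examples above):

  GroupingSearch groupingSearch = new GroupingSearch("author");
  groupingSearch.setAllGroups(true);
  groupingSearch.setAllGroupHeads(true);
  groupingSearch.setSortWithinGroup(sortWithinGroup);
  TopGroups<BytesRef> result = groupingSearch.search(indexSearcher, new TermQuery(new Term("content", searchTerm)), 0, 10);

  Collection<BytesRef> matchingGroups = groupingSearch.getAllMatchingGroups();
  Bits groupHeads = groupingSearch.getAllGroupHeads(); // one set bit per most relevant document of each matching group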
@@ -182,39 +176,27 @@ fields, FieldCache, etc.).
 

For each of the above collector types there is also a variant that works with ValueSource instead of fields.
 Concretely this means that these variants can work with functions. These variants are slower than their term
 based counterparts. These implementations are located in the
-  org.apache.lucene.search.grouping.function package.
+  org.apache.lucene.search.grouping.function package, but can also be used with the
+  GroupingSearch convenience utility.
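A minimal sketch of grouping by function through the GroupingSearch utility, using the BytesRefFieldSource that the test accompanying this patch also uses (any other ValueSource works the same way; for function based grouping the group values are MutableValue instances rather than BytesRef):

  ValueSource groupFunction = new BytesRefFieldSource("author");
  GroupingSearch groupingSearch = new GroupingSearch(groupFunction, new HashMap());
  groupingSearch.setGroupSort(groupSort);
  TermQuery query = new TermQuery(new Term("content", searchTerm));
  TopGroups<MutableValue> result = groupingSearch.search(indexSearcher, query, groupOffset, groupLimit);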

There are also DocValues based implementations available for the group collectors. There are factory methods
-  available for creating dv based instances. A typical example using dv based grouping collectors:
+  available for creating dv based instances. A typical example using dv based grouping with the
+  GroupingSearch convenience utility:

  boolean diskResident = true; // Whether values should be fetched directly from disk, bypassing the Java heap space.
-  AbstractFirstPassGroupingCollector c1 = DVFirstPassGroupingCollector.create(
-        groupSort, groupOffset+topNGroups, "author", DocValues.Type.BYTES_VAR_SORTED, diskResident
-  );
+  DocValues.Type docValuesType = DocValues.Type.BYTES_VAR_SORTED;
+  GroupingSearch groupingSearch = new GroupingSearch("author", docValuesType, diskResident);
+  groupingSearch.setGroupSort(groupSort);
+  groupingSearch.setFillSortFields(fillFields);
 
-  s.search(new TermQuery(new Term("content", searchTerm)), c1);
+  TermQuery query = new TermQuery(new Term("content", searchTerm));
+  // The docValuesType variable decides the generic type. When float is used this is Double and in case of int this is Long
+  TopGroups<BytesRef> result = groupingSearch.search(indexSearcher, query, groupOffset, groupLimit);
 
-  Collection<SearchGroup<BytesRef>> topGroups = c1.getTopGroups(groupOffset, fillFields);
-
-  if (topGroups == null) {
-    // No groups matched
-    return;
-  }
-
-  boolean getScores = true;
-  boolean getMaxScores = true;
-  boolean fillFields = true;
-  AbstractSecondPassGroupingCollector<BytesRef> c2 = DVSecondPassGroupingCollector.create(
-        "author", diskResident, DocValues.Type.BYTES_VAR_SORTED, topGroups, groupSort, docSort,
-        docOffset+docsPerGroup, getScores, getMaxScores, fillFields
-  );
-
-  s.search(new TermQuery(new Term("content", searchTerm)), c2);
-  TopGroups<BytesRef> groupsResult = c2.getTopGroups(docOffset);
   // Render groupsResult...
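Paging works the same way for every GroupingSearch flavour: the groupOffset and groupLimit arguments of search select the slice of groups, while setGroupDocsOffset and setGroupDocsLimit select the slice of documents inside each returned group. A minimal sketch for a hypothetical second page of 10 groups with 5 documents per group (field and variable names are illustrative):

  GroupingSearch groupingSearch = new GroupingSearch("author");
  groupingSearch.setGroupDocsOffset(0); // start at the first document inside each group
  groupingSearch.setGroupDocsLimit(5);  // return at most 5 documents per group
  int groupOffset = 10;                 // skip the first 10 groups
  int groupLimit = 10;                  // return the next 10 groups
  TopGroups<BytesRef> secondPage = groupingSearch.search(indexSearcher, query, groupOffset, groupLimit);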
 
diff --git a/modules/grouping/src/test/org/apache/lucene/search/grouping/GroupingSearchTest.java b/modules/grouping/src/test/org/apache/lucene/search/grouping/GroupingSearchTest.java new file mode 100644 index 00000000000..74bf47eb299 --- /dev/null +++ b/modules/grouping/src/test/org/apache/lucene/search/grouping/GroupingSearchTest.java @@ -0,0 +1,222 @@ +package org.apache.lucene.search.grouping; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.*; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.queries.function.ValueSource; +import org.apache.lucene.queries.function.valuesource.BytesRefFieldSource; +import org.apache.lucene.search.*; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.mutable.MutableValueStr; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +public class GroupingSearchTest extends LuceneTestCase { + + // Tests some very basic usages... 
+ public void testBasic() throws Exception { + + final String groupField = "author"; + + FieldType customType = new FieldType(); + customType.setStored(true); + + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter( + random, + dir, + newIndexWriterConfig(TEST_VERSION_CURRENT, + new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy())); + boolean canUseIDV = !"Lucene3x".equals(w.w.getConfig().getCodec().getName()); + List documents = new ArrayList(); + // 0 + Document doc = new Document(); + addGroupField(doc, groupField, "author1", canUseIDV); + doc.add(new Field("content", "random text", TextField.TYPE_STORED)); + doc.add(new Field("id", "1", customType)); + documents.add(doc); + + // 1 + doc = new Document(); + addGroupField(doc, groupField, "author1", canUseIDV); + doc.add(new Field("content", "some more random text", TextField.TYPE_STORED)); + doc.add(new Field("id", "2", customType)); + documents.add(doc); + + // 2 + doc = new Document(); + addGroupField(doc, groupField, "author1", canUseIDV); + doc.add(new Field("content", "some more random textual data", TextField.TYPE_STORED)); + doc.add(new Field("id", "3", customType)); + doc.add(new Field("groupend", "x", StringField.TYPE_UNSTORED)); + documents.add(doc); + w.addDocuments(documents); + documents.clear(); + + // 3 + doc = new Document(); + addGroupField(doc, groupField, "author2", canUseIDV); + doc.add(new Field("content", "some random text", TextField.TYPE_STORED)); + doc.add(new Field("id", "4", customType)); + doc.add(new Field("groupend", "x", StringField.TYPE_UNSTORED)); + w.addDocument(doc); + + // 4 + doc = new Document(); + addGroupField(doc, groupField, "author3", canUseIDV); + doc.add(new Field("content", "some more random text", TextField.TYPE_STORED)); + doc.add(new Field("id", "5", customType)); + documents.add(doc); + + // 5 + doc = new Document(); + addGroupField(doc, groupField, "author3", canUseIDV); + doc.add(new Field("content", "random", TextField.TYPE_STORED)); + doc.add(new Field("id", "6", customType)); + doc.add(new Field("groupend", "x", StringField.TYPE_UNSTORED)); + documents.add(doc); + w.addDocuments(documents); + documents.clear(); + + // 6 -- no author field + doc = new Document(); + doc.add(new Field("content", "random word stuck in alot of other text", TextField.TYPE_STORED)); + doc.add(new Field("id", "6", customType)); + doc.add(new Field("groupend", "x", StringField.TYPE_UNSTORED)); + + w.addDocument(doc); + + IndexSearcher indexSearcher = new IndexSearcher(w.getReader()); + w.close(); + + Sort groupSort = Sort.RELEVANCE; + GroupingSearch groupingSearch = createRandomGroupingSearch(groupField, groupSort, 5, canUseIDV); + + TopGroups groups = groupingSearch.search(indexSearcher, null, new TermQuery(new Term("content", "random")), 0, 10); + + assertEquals(7, groups.totalHitCount); + assertEquals(7, groups.totalGroupedHitCount); + assertEquals(4, groups.groups.length); + + // relevance order: 5, 0, 3, 4, 1, 2, 6 + + // the later a document is added the higher this docId + // value + GroupDocs group = groups.groups[0]; + compareGroupValue("author3", group); + assertEquals(2, group.scoreDocs.length); + assertEquals(5, group.scoreDocs[0].doc); + assertEquals(4, group.scoreDocs[1].doc); + assertTrue(group.scoreDocs[0].score > group.scoreDocs[1].score); + + group = groups.groups[1]; + compareGroupValue("author1", group); + assertEquals(3, group.scoreDocs.length); + assertEquals(0, group.scoreDocs[0].doc); + assertEquals(1, group.scoreDocs[1].doc); + assertEquals(2, 
group.scoreDocs[2].doc); + assertTrue(group.scoreDocs[0].score > group.scoreDocs[1].score); + assertTrue(group.scoreDocs[1].score > group.scoreDocs[2].score); + + group = groups.groups[2]; + compareGroupValue("author2", group); + assertEquals(1, group.scoreDocs.length); + assertEquals(3, group.scoreDocs[0].doc); + + group = groups.groups[3]; + compareGroupValue(null, group); + assertEquals(1, group.scoreDocs.length); + assertEquals(6, group.scoreDocs[0].doc); + + Filter lastDocInBlock = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("groupend", "x")))); + groupingSearch = new GroupingSearch(lastDocInBlock); + groups = groupingSearch.search(indexSearcher, null, new TermQuery(new Term("content", "random")), 0, 10); + + assertEquals(7, groups.totalHitCount); + assertEquals(7, groups.totalGroupedHitCount); + assertEquals(4, groups.totalGroupCount.longValue()); + assertEquals(4, groups.groups.length); + + indexSearcher.getIndexReader().close(); + dir.close(); + } + + private void addGroupField(Document doc, String groupField, String value, boolean canUseIDV) { + doc.add(new Field(groupField, value, TextField.TYPE_STORED)); + if (canUseIDV) { + doc.add(new DocValuesField(groupField, new BytesRef(value), DocValues.Type.BYTES_VAR_SORTED)); + } + } + + private void compareGroupValue(String expected, GroupDocs group) { + if (expected == null) { + if (group.groupValue == null) { + return; + } else if (group.groupValue.getClass().isAssignableFrom(MutableValueStr.class)) { + return; + } else if (((BytesRef) group.groupValue).length == 0) { + return; + } + fail(); + } + + if (group.groupValue.getClass().isAssignableFrom(BytesRef.class)) { + assertEquals(new BytesRef(expected), group.groupValue); + } else if (group.groupValue.getClass().isAssignableFrom(MutableValueStr.class)) { + MutableValueStr v = new MutableValueStr(); + v.value = new BytesRef(expected); + assertEquals(v, group.groupValue); + } else { + fail(); + } + } + + private GroupingSearch createRandomGroupingSearch(String groupField, Sort groupSort, int docsInGroup, boolean canUseIDV) throws IOException { + GroupingSearch groupingSearch; + if (random.nextBoolean()) { + ValueSource vs = new BytesRefFieldSource(groupField); + groupingSearch = new GroupingSearch(vs, new HashMap()); + } else { + if (canUseIDV && random.nextBoolean()) { + boolean diskResident = random.nextBoolean(); + groupingSearch = new GroupingSearch(groupField, DocValues.Type.BYTES_VAR_SORTED, diskResident); + } else { + groupingSearch = new GroupingSearch(groupField); + } + } + + groupingSearch.setGroupSort(groupSort); + groupingSearch.setGroupDocsLimit(docsInGroup); + + if (random.nextBoolean()) { + groupingSearch.setCachingInMB(4.0, true); + } + + return groupingSearch; + } + +}