From 89b55566d8392657be86e1441a329373e9132257 Mon Sep 17 00:00:00 2001
From: Martijn van Groningen GroupingSearch
instance that groups documents by index terms using the {@link FieldCache}.
+ * The group field can only have one token per document. This means that the field must not be analysed.
+ *
+ * @param groupField The name of the field to group by.
+ */
+ public GroupingSearch(String groupField) {
+ // Only the group field is supplied; the function, value source context, doc block filter and doc values type are left null.
+ this(groupField, null, null, null, null, false);
+ }
+
+ /**
+ * Constructs a {@code GroupingSearch} instance that groups documents by doc values.
+ * This constructor can only be used when the groupField is a {@link DocValuesField}.
+ *
+ * @param groupField The name of the field to group by that contains doc values
+ * @param docValuesType The doc values type of the specified groupField
+ * @param diskResidentDocValues Whether the values to group by should be disk resident
+ */
+ public GroupingSearch(String groupField, DocValues.Type docValuesType, boolean diskResidentDocValues) {
+ this(groupField, null, null, null, docValuesType, diskResidentDocValues);
+ }
+
+ /**
+ * Constructs a {@code GroupingSearch} instance that groups documents by function using a {@link ValueSource}
+ * instance.
+ *
+ * @param groupFunction The function to group by specified as {@link ValueSource}
+ * @param valueSourceContext The context of the specified groupFunction
+ */
+ public GroupingSearch(ValueSource groupFunction, Map<?, ?> valueSourceContext) {
+ // Only the function and its context are supplied; group field, doc block filter and doc values type are left null.
+ this(null, groupFunction, valueSourceContext, null, null, false);
+ }
+
+ /**
+ * Constructs a {@code GroupingSearch} instance that groups documents by doc block.
+ * This constructor can only be used when the documents belonging to a group are indexed in one block.
+ *
+ * @param groupEndDocs The filter that marks the last document in all doc blocks
+ */
+ public GroupingSearch(Filter groupEndDocs) {
+ this(null, null, null, groupEndDocs, null, false);
+ }
+
+ /**
+ * Shared constructor that the public constructors of this class delegate to. It only stores the supplied
+ * grouping configuration; callers pass null (or false for diskResidentDocValues) for the arguments that
+ * do not apply to their way of grouping.
+ *
+ * @param groupField The name of the field to group by, or null
+ * @param groupFunction The function to group by, or null
+ * @param valueSourceContext The context of the specified groupFunction, or null
+ * @param groupEndDocs The filter that marks the last document in all doc blocks, or null
+ * @param docValuesType The doc values type of the specified groupField, or null
+ * @param diskResidentDocValues Whether the values to group by should be disk resident
+ */
+ private GroupingSearch(String groupField, ValueSource groupFunction, Map<?, ?> valueSourceContext, Filter groupEndDocs, DocValues.Type docValuesType, boolean diskResidentDocValues) {
+ this.groupField = groupField;
+ this.groupFunction = groupFunction;
+ this.valueSourceContext = valueSourceContext;
+ this.groupEndDocs = groupEndDocs;
+ this.docValuesType = docValuesType;
+ this.diskResidentDocValues = diskResidentDocValues;
+ }
+
+ /**
+ * Executes a grouped search. Both the first pass and second pass are executed on the specified searcher.
+ *
+ * @param searcher The {@link org.apache.lucene.search.IndexSearcher} instance to execute the grouped search on.
+ * @param query The query to execute with the grouping
+ * @param groupOffset The group offset
+ * @param groupLimit The number of groups to return from the specified group offset
+ * @return the grouped result as a {@link TopGroups} instance
+ * @throws IOException If any I/O related errors occur
+ */
+ public this
+ */
+ public GroupingSearch setCachingInMB(double maxCacheRAMMB, boolean cacheScores) {
+ this.maxCacheRAMMB = maxCacheRAMMB;
+ // Clear the document count limit so that only the RAM based limit remains configured.
+ this.maxDocsToCache = null;
+ this.cacheScores = cacheScores;
+ return this;
+ }
+
+ /**
+ * Enables caching for the second pass search. The cache will not contain more than the maximum specified documents.
+ * The cache is filled during the first pass search and then replayed during the second pass search.
+ * If the cache grows beyond the specified limit, then the cache is purged and not used in the second pass search.
+ *
+ * @param maxDocsToCache The maximum number of documents the cache is allowed to hold
+ * @param cacheScores Whether to cache the scores
+ * @return this
+ */
+ public GroupingSearch setCaching(int maxDocsToCache, boolean cacheScores) {
+ this.maxDocsToCache = maxDocsToCache;
+ this.maxCacheRAMMB = null;
+ this.cacheScores = cacheScores;
+ return this;
+ }
+
+ /**
+ * Disables any enabled cache by clearing both the RAM based and the document count based cache limit.
+ *
+ * @return this
+ */
+ public GroupingSearch disableCaching() {
+ this.maxCacheRAMMB = null;
+ this.maxDocsToCache = null;
+ return this;
+ }
+
+ /**
+ * Specifies how groups are sorted.
+ * Defaults to {@link Sort#RELEVANCE}.
+ *
+ * @param groupSort The sort for the groups
+ * @return this instance, so that calls can be chained
+ */
+ public GroupingSearch setGroupSort(Sort groupSort) {
+ this.groupSort = groupSort;
+ return this;
+ }
+
+ /**
+ * Specifies how documents inside a group are sorted.
+ * Defaults to {@link Sort#RELEVANCE}.
+ *
+ * @param sortWithinGroup The sort for documents inside a group
+ * @return this
+ */
+ public GroupingSearch setSortWithinGroup(Sort sortWithinGroup) {
+ this.sortWithinGroup = sortWithinGroup;
+ return this;
+ }
+
+ /**
+ * Specifies the offset for documents inside a group.
+ *
+ * @param groupDocsOffset The offset for documents inside a group
+ * @return this
+ */
+ public GroupingSearch setGroupDocsOffset(int groupDocsOffset) {
+ this.groupDocsOffset = groupDocsOffset;
+ return this;
+ }
+
+ /**
+ * Specifies the number of documents to return inside a group, counted from the specified groupDocsOffset.
+ *
+ * @param groupDocsLimit The number of documents to return inside a group
+ * @return this instance, so that calls can be chained
+ */
+ public GroupingSearch setGroupDocsLimit(int groupDocsLimit) {
+ this.groupDocsLimit = groupDocsLimit;
+ return this;
+ }
+
+ /**
+ * Sets whether to also fill the sort fields for each returned group and for the documents inside each group.
+ *
+ * @param fillSortFields Whether to also fill the sort fields per returned group and group docs
+ * @return this
+ */
+ public GroupingSearch setFillSortFields(boolean fillSortFields) {
+ this.fillSortFields = fillSortFields;
+ return this;
+ }
+
+ /**
+ * Sets whether to include the scores per doc inside a group.
+ *
+ * @param includeScores Whether to include the scores per doc inside a group
+ * @return this
+ */
+ public GroupingSearch setIncludeScores(boolean includeScores) {
+ this.includeScores = includeScores;
+ return this;
+ }
+
+ /**
+ * Sets whether to include the score of the most relevant document per group.
+ *
+ * @param includeMaxScore Whether to include the score of the most relevant document per group
+ * @return this
+ */
+ public GroupingSearch setIncludeMaxScore(boolean includeMaxScore) {
+ this.includeMaxScore = includeMaxScore;
+ return this;
+ }
+
+ /**
+ * Sets whether to also compute all groups matching the query.
+ * This can be used to determine the number of groups, which can be used for accurate pagination.
+ *
+ * When grouping by doc block the number of groups is automatically included in the {@link TopGroups} and this
+ * option doesn't have any influence.
+ *
+ * @param allGroups Whether to also compute all groups matching the query
+ * @return this
+ */
+ public GroupingSearch setAllGroups(boolean allGroups) {
+ this.allGroups = allGroups;
+ return this;
+ }
+
+ /**
+ * If {@link #setAllGroups(boolean)} was set to true
then all matching groups are returned, otherwise
+ * an empty collection is returned.
+ *
+ * @param null
.
+ * @return all matching groups are returned, or an empty collection
+ */
+ @SuppressWarnings({"unchecked", "rawtypes"})
+ public this
+ */
+ public GroupingSearch setAllGroupHeads(boolean allGroupHeads) {
+ // Only the flag is recorded here; nothing is computed by this setter.
+ this.allGroupHeads = allGroupHeads;
+ return this;
+ }
+
+ /**
+ * Returns the matching group heads if {@link #setAllGroupHeads(boolean)} was set to true or an empty bit set.
+ *
+ * @return The matching group heads if {@link #setAllGroupHeads(boolean)} was set to true or an empty bit set
+ * @see #setAllGroupHeads(boolean)
+ */
+ public Bits getAllGroupHeads() {
+ return matchingGroupHeads;
+ }
+
+ /**
+ * Sets the initial size of some internally used data structures.
+ * This prevents growing data structures many times. This can improve the performance of the grouping at the cost of
+ * more initial RAM.
+ *
+ * The {@link #setAllGroups(boolean)} and {@link #setAllGroupHeads(boolean)} features use this option.
+ * Defaults to 128.
+ *
+ * @param initialSize The initial size of some internally used data structures
+ * @return this
+ */
+ public GroupingSearch setInitialSize(int initialSize) {
+ this.initialSize = initialSize;
+ return this;
+ }
+}
diff --git a/modules/grouping/src/java/org/apache/lucene/search/grouping/package.html b/modules/grouping/src/java/org/apache/lucene/search/grouping/package.html
index 0b5d5e7f14c..327c6c0dff5 100644
--- a/modules/grouping/src/java/org/apache/lucene/search/grouping/package.html
+++ b/modules/grouping/src/java/org/apache/lucene/search/grouping/package.html
@@ -63,7 +63,7 @@ field fall into a single group.
Known limitations:
Typical usage for the generic two-pass collector looks like this - (using the {@link org.apache.lucene.search.CachingCollector}):
+Typical usage for the generic two-pass grouping search looks like this using the grouping convenience utility + (optionally using caching for the second pass search):
- TermFirstPassGroupingCollector c1 = new TermFirstPassGroupingCollector("author", groupSort, groupOffset+topNGroups); + GroupingSearch groupingSearch = new GroupingSearch("author"); + groupingSearch.setGroupSort(groupSort); + groupingSearch.setFillSortFields(fillFields); - boolean cacheScores = true; - double maxCacheRAMMB = 4.0; - CachingCollector cachedCollector = CachingCollector.create(c1, cacheScores, maxCacheRAMMB); - s.search(new TermQuery(new Term("content", searchTerm)), cachedCollector); - - Collection<SearchGroup<BytesRef>> topGroups = c1.getTopGroups(groupOffset, fillFields); - - if (topGroups == null) { - // No groups matched - return; + if (useCache) { + // Sets cache in MB + groupingSearch.setCachingInMB(4.0, true); } - boolean getScores = true; - boolean getMaxScores = true; - boolean fillFields = true; - TermSecondPassGroupingCollector c2 = new TermSecondPassGroupingCollector("author", topGroups, groupSort, docSort, docOffset+docsPerGroup, getScores, getMaxScores, fillFields); - - //Optionally compute total group count - TermAllGroupsCollector allGroupsCollector = null; if (requiredTotalGroupCount) { - allGroupsCollector = new TermAllGroupsCollector("author"); - c2 = MultiCollector.wrap(c2, allGroupsCollector); + groupingSearch.setAllGroups(true); } - if (cachedCollector.isCached()) { - // Cache fit within maxCacheRAMMB, so we can replay it: - cachedCollector.replay(c2); - } else { - // Cache was too large; must re-execute query: - s.search(new TermQuery(new Term("content", searchTerm)), c2); - } - - TopGroups<BytesRef> groupsResult = c2.getTopGroups(docOffset); - if (requiredTotalGroupCount) { - groupsResult = new TopGroups<BytesRef>(groupsResult, allGroupsCollector.getGroupCount()); - } + TermQuery query = new TermQuery(new Term("content", searchTerm)); + TopGroups<BytesRef> result = groupingSearch.search(indexSearcher, query, groupOffset, groupLimit); // Render groupsResult... 
+ if (requiredTotalGroupCount) { + int totalGroupCount = result.totalGroupCount; + }
To use the single-pass BlockGroupingCollector
,
@@ -159,6 +139,19 @@ Finally, do this per search:
// Render groupsResult...
+Or alternatively use the GroupingSearch
convenience utility:
+
+
+ // Per search: + GroupingSearch groupingSearch = new GroupingSearch(groupEndDocs); + groupingSearch.setGroupSort(groupSort); + groupingSearch.setIncludeScores(needsScores); + TermQuery query = new TermQuery(new Term("content", searchTerm)); + TopGroups groupsResult = groupingSearch.search(indexSearcher, query, groupOffset, groupLimit); + + // Render groupsResult... ++ Note that the
groupValue
of each GroupDocs
will be null
, so if you need to present this value you'll
have to separately retrieve it (for example using stored
@@ -167,7 +160,8 @@ fields, FieldCache
, etc.).
Another collector is the TermAllGroupHeadsCollector
that can be used to retrieve all most relevant
documents per group. Also known as group heads. This can be useful in situations when one wants to compute group
based facets / statistics on the complete query result. The collector can be executed during the first or second
- phase.
GroupingSearch
convenience utility, but if one only
+ wants to compute the most relevant documents per group it is better to just use the collector as done here below.
AbstractAllGroupHeadsCollector c = TermAllGroupHeadsCollector.create(groupField, sortWithinGroup); @@ -182,39 +176,27 @@ fields,FieldCache
, etc.).For each of the above collector types there is also a variant that works with
ValueSource
instead of fields. Concretely this means that these variants can work with functions. These variants are slower than their term based counterparts. These implementations are located in the -org.apache.lucene.search.grouping.function
package. +org.apache.lucene.search.grouping.function
package, but can also be used with the +GroupingSearch
convenience utilityThere are also DocValues based implementations available for the group collectors. There are factory methods - available for creating dv based instances. A typical example using dv based grouping collectors: + available for creating dv based instances. A typical example using dv based grouping with the +
GroupingSearch
convenience utility:boolean diskResident = true; // Whether values should fetched directly from disk by passing the Java heap space. - AbstractFirstPassGroupingCollector c1 = DVFirstPassGroupingCollector.create( - groupSort, groupOffset+topNGroups, "author", DocValues.Type.BYTES_VAR_SORTED, diskResident - ); + DocValues.Type docValuesType = DocValues.Type.BYTES_VAR_SORTED; + GroupingSearch groupingSearch = new GroupingSearch("author", docValuesType, diskResident); + groupingSearch.setGroupSort(groupSort); + groupingSearch.setFillSortFields(fillFields); - s.search(new TermQuery(new Term("content", searchTerm)), c1); + TermQuery query = new TermQuery(new Term("content", searchTerm)); + // The docValuesType variable decides the generic type. When float is used this Double and in case of int this is Long + TopGroups<BytesRef> result = groupingSearch.search(indexSearcher, query, groupOffset, groupLimit); - Collection<SearchGroup<BytesRef>> topGroups = c1.getTopGroups(groupOffset, fillFields); - - if (topGroups == null) { - // No groups matched - return; - } - - boolean getScores = true; - boolean getMaxScores = true; - boolean fillFields = true; - AbstractSecondPassGroupingCollector<BytesRef> c2 = DVSecondPassGroupingCollector.create( - "author", diskResident, DocValues.Type.BYTES_VAR_SORTED, topGroups, groupSort, docSort, - docOffset+docsPerGroup, getScores, getMaxScores, fillFields - ); - - s.search(new TermQuery(new Term("content", searchTerm)), c2); - TopGroups<BytesRef> groupsResult = c2.getTopGroups(docOffset); // Render groupsResult...diff --git a/modules/grouping/src/test/org/apache/lucene/search/grouping/GroupingSearchTest.java b/modules/grouping/src/test/org/apache/lucene/search/grouping/GroupingSearchTest.java new file mode 100644 index 00000000000..74bf47eb299 --- /dev/null +++ b/modules/grouping/src/test/org/apache/lucene/search/grouping/GroupingSearchTest.java @@ -0,0 +1,222 @@ +package org.apache.lucene.search.grouping; + +/* + * Licensed to the Apache 
Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.*; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.queries.function.ValueSource; +import org.apache.lucene.queries.function.valuesource.BytesRefFieldSource; +import org.apache.lucene.search.*; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.mutable.MutableValueStr; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +public class GroupingSearchTest extends LuceneTestCase { + + // Tests some very basic usages... 
+ public void testBasic() throws Exception { + + final String groupField = "author"; + + FieldType customType = new FieldType(); + customType.setStored(true); + + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter( + random, + dir, + newIndexWriterConfig(TEST_VERSION_CURRENT, + new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy())); + boolean canUseIDV = !"Lucene3x".equals(w.w.getConfig().getCodec().getName()); + Listdocuments = new ArrayList (); + // 0 + Document doc = new Document(); + addGroupField(doc, groupField, "author1", canUseIDV); + doc.add(new Field("content", "random text", TextField.TYPE_STORED)); + doc.add(new Field("id", "1", customType)); + documents.add(doc); + + // 1 + doc = new Document(); + addGroupField(doc, groupField, "author1", canUseIDV); + doc.add(new Field("content", "some more random text", TextField.TYPE_STORED)); + doc.add(new Field("id", "2", customType)); + documents.add(doc); + + // 2 + doc = new Document(); + addGroupField(doc, groupField, "author1", canUseIDV); + doc.add(new Field("content", "some more random textual data", TextField.TYPE_STORED)); + doc.add(new Field("id", "3", customType)); + doc.add(new Field("groupend", "x", StringField.TYPE_UNSTORED)); + documents.add(doc); + w.addDocuments(documents); + documents.clear(); + + // 3 + doc = new Document(); + addGroupField(doc, groupField, "author2", canUseIDV); + doc.add(new Field("content", "some random text", TextField.TYPE_STORED)); + doc.add(new Field("id", "4", customType)); + doc.add(new Field("groupend", "x", StringField.TYPE_UNSTORED)); + w.addDocument(doc); + + // 4 + doc = new Document(); + addGroupField(doc, groupField, "author3", canUseIDV); + doc.add(new Field("content", "some more random text", TextField.TYPE_STORED)); + doc.add(new Field("id", "5", customType)); + documents.add(doc); + + // 5 + doc = new Document(); + addGroupField(doc, groupField, "author3", canUseIDV); + doc.add(new Field("content", "random", 
TextField.TYPE_STORED)); + doc.add(new Field("id", "6", customType)); + doc.add(new Field("groupend", "x", StringField.TYPE_UNSTORED)); + documents.add(doc); + w.addDocuments(documents); + documents.clear(); + + // 6 -- no author field + doc = new Document(); + doc.add(new Field("content", "random word stuck in alot of other text", TextField.TYPE_STORED)); + doc.add(new Field("id", "6", customType)); + doc.add(new Field("groupend", "x", StringField.TYPE_UNSTORED)); + + w.addDocument(doc); + + IndexSearcher indexSearcher = new IndexSearcher(w.getReader()); + w.close(); + + Sort groupSort = Sort.RELEVANCE; + GroupingSearch groupingSearch = createRandomGroupingSearch(groupField, groupSort, 5, canUseIDV); + + TopGroups> groups = groupingSearch.search(indexSearcher, null, new TermQuery(new Term("content", "random")), 0, 10); + + assertEquals(7, groups.totalHitCount); + assertEquals(7, groups.totalGroupedHitCount); + assertEquals(4, groups.groups.length); + + // relevance order: 5, 0, 3, 4, 1, 2, 6 + + // the later a document is added the higher this docId + // value + GroupDocs> group = groups.groups[0]; + compareGroupValue("author3", group); + assertEquals(2, group.scoreDocs.length); + assertEquals(5, group.scoreDocs[0].doc); + assertEquals(4, group.scoreDocs[1].doc); + assertTrue(group.scoreDocs[0].score > group.scoreDocs[1].score); + + group = groups.groups[1]; + compareGroupValue("author1", group); + assertEquals(3, group.scoreDocs.length); + assertEquals(0, group.scoreDocs[0].doc); + assertEquals(1, group.scoreDocs[1].doc); + assertEquals(2, group.scoreDocs[2].doc); + assertTrue(group.scoreDocs[0].score > group.scoreDocs[1].score); + assertTrue(group.scoreDocs[1].score > group.scoreDocs[2].score); + + group = groups.groups[2]; + compareGroupValue("author2", group); + assertEquals(1, group.scoreDocs.length); + assertEquals(3, group.scoreDocs[0].doc); + + group = groups.groups[3]; + compareGroupValue(null, group); + assertEquals(1, group.scoreDocs.length); + 
assertEquals(6, group.scoreDocs[0].doc); + + Filter lastDocInBlock = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("groupend", "x")))); + groupingSearch = new GroupingSearch(lastDocInBlock); + groups = groupingSearch.search(indexSearcher, null, new TermQuery(new Term("content", "random")), 0, 10); + + assertEquals(7, groups.totalHitCount); + assertEquals(7, groups.totalGroupedHitCount); + assertEquals(4, groups.totalGroupCount.longValue()); + assertEquals(4, groups.groups.length); + + indexSearcher.getIndexReader().close(); + dir.close(); + } + + private void addGroupField(Document doc, String groupField, String value, boolean canUseIDV) { + doc.add(new Field(groupField, value, TextField.TYPE_STORED)); + if (canUseIDV) { + doc.add(new DocValuesField(groupField, new BytesRef(value), DocValues.Type.BYTES_VAR_SORTED)); + } + } + + private void compareGroupValue(String expected, GroupDocs> group) { + if (expected == null) { + if (group.groupValue == null) { + return; + } else if (group.groupValue.getClass().isAssignableFrom(MutableValueStr.class)) { + return; + } else if (((BytesRef) group.groupValue).length == 0) { + return; + } + fail(); + } + + if (group.groupValue.getClass().isAssignableFrom(BytesRef.class)) { + assertEquals(new BytesRef(expected), group.groupValue); + } else if (group.groupValue.getClass().isAssignableFrom(MutableValueStr.class)) { + MutableValueStr v = new MutableValueStr(); + v.value = new BytesRef(expected); + assertEquals(v, group.groupValue); + } else { + fail(); + } + } + + private GroupingSearch createRandomGroupingSearch(String groupField, Sort groupSort, int docsInGroup, boolean canUseIDV) throws IOException { + GroupingSearch groupingSearch; + if (random.nextBoolean()) { + ValueSource vs = new BytesRefFieldSource(groupField); + groupingSearch = new GroupingSearch(vs, new HashMap