LUCENE-3098: add AllGroupsCollector

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1104421 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2011-05-17 17:20:54 +00:00
parent 381461d3c9
commit 1c464e6dcc
5 changed files with 315 additions and 28 deletions

View File

@ -0,0 +1,131 @@
package org.apache.lucene.search.grouping;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.util.BytesRef;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
/**
* A collector that collects all groups that match the
* query. Only the group value is collected, and the order
* is undefined. This collector does not determine
* the most relevant document of a group.
*
* <p/>
* Internally, {@link SentinelIntSet} is used to detect
* if a group is already added to the total count. For each
* segment the {@link SentinelIntSet} is cleared and filled
* with previous counted groups that occur in the new
* segment.
*
* @lucene.experimental
*/
public class AllGroupsCollector extends Collector {

  private static final int DEFAULT_INITIAL_SIZE = 128;

  private final String groupField;
  private final SentinelIntSet ordSet;
  private final List<BytesRef> groups;
  private final BytesRef spare = new BytesRef();

  // Per-segment term/ord view of the group field; refreshed in setNextReader.
  private FieldCache.DocTermsIndex docTermsIndex;

  /**
   * Expert: Constructs a {@link AllGroupsCollector}.
   *
   * @param groupField  The field to group by
   * @param initialSize The initial size of the {@link SentinelIntSet} and the groups list.
   *                    Should roughly match the expected number of unique groups; note that
   *                    memory is pre-allocated proportionally to this value.
   */
  public AllGroupsCollector(String groupField, int initialSize) {
    this.groupField = groupField;
    this.ordSet = new SentinelIntSet(initialSize, -1);
    this.groups = new ArrayList<BytesRef>(initialSize);
  }

  /**
   * Constructs a {@link AllGroupsCollector} using a default initial size of 128
   * for the {@link SentinelIntSet} and the groups list
   * (see {@link #AllGroupsCollector(String, int)}).
   *
   * @param groupField The field to group by
   */
  public AllGroupsCollector(String groupField) {
    this(groupField, DEFAULT_INITIAL_SIZE);
  }

  public void setScorer(Scorer scorer) throws IOException {
    // Scores are never used by this collector.
  }

  public void collect(int doc) throws IOException {
    final int ord = docTermsIndex.getOrd(doc);
    if (ordSet.exists(ord)) {
      return; // group already counted in this segment
    }
    ordSet.put(ord);
    // Ord 0 denotes a document without a value for the group field;
    // record that group as null rather than materializing a term.
    final BytesRef groupValue;
    if (ord == 0) {
      groupValue = null;
    } else {
      groupValue = docTermsIndex.getTerm(doc, new BytesRef());
    }
    groups.add(groupValue);
  }

  /**
   * Returns the total number of groups for the executed search.
   * Convenience for <pre>getGroups().size()</pre>.
   *
   * @return The total number of groups for the executed search
   */
  public int getGroupCount() {
    return groups.size();
  }

  /**
   * Returns the group values.
   * <p/>
   * An unordered collection holding one {@link BytesRef} per group that
   * matched the query (null represents the "no value" group).
   *
   * @return the group values
   */
  public Collection<BytesRef> getGroups() {
    return groups;
  }

  public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException {
    docTermsIndex = FieldCache.DEFAULT.getTermsIndex(context.reader, groupField);

    // Ords are segment-local, so re-seed the set with the segment ords of
    // all groups counted so far that also occur in this segment.
    ordSet.clear();
    for (BytesRef countedGroup : groups) {
      final int segmentOrd = docTermsIndex.binarySearchLookup(countedGroup, spare);
      if (segmentOrd >= 0) {
        ordSet.put(segmentOrd);
      }
    }
  }

  public boolean acceptsDocsOutOfOrder() {
    return true;
  }
}

View File

@ -21,9 +21,6 @@ import org.apache.lucene.search.SortField;
/** Represents result returned by a grouping search.
*
* Note that we do not return the total number of unique
* groups; doing so would be costly.
*
* @lucene.experimental */
public class TopGroups {
/** Number of documents matching the search */
@ -32,6 +29,9 @@ public class TopGroups {
/** Number of documents grouped into the topN groups */
public final int totalGroupedHitCount;
/** The total number of unique groups. If <code>null</code> this value is not computed. */
public final Integer totalGroupCount;
/** Group results in groupSort order */
public final GroupDocs[] groups;
@ -47,5 +47,15 @@ public class TopGroups {
this.totalHitCount = totalHitCount;
this.totalGroupedHitCount = totalGroupedHitCount;
this.groups = groups;
this.totalGroupCount = null;
}
public TopGroups(TopGroups oldTopGroups, Integer totalGroupCount) {
this.groupSort = oldTopGroups.groupSort;
this.withinGroupSort = oldTopGroups.withinGroupSort;
this.totalHitCount = oldTopGroups.totalHitCount;
this.totalGroupedHitCount = oldTopGroups.totalGroupedHitCount;
this.groups = oldTopGroups.groups;
this.totalGroupCount = totalGroupCount;
}
}

View File

@ -88,6 +88,13 @@ field fall into a single group.</p>
boolean fillFields = true;
SecondPassGroupingCollector c2 = new SecondPassGroupingCollector("author", topGroups, groupSort, docSort, docOffset+docsPerGroup, getScores, getMaxScores, fillFields);
//Optionally compute total group count
AllGroupsCollector allGroupsCollector = null;
if (requiredTotalGroupCount) {
allGroupsCollector = new AllGroupsCollector("author");
c2 = MultiCollector.wrap(c2, allGroupsCollector);
}
if (cachedCollector.isCached()) {
// Cache fit within maxCacheRAMMB, so we can replay it:
cachedCollector.replay(c2);
@ -97,6 +104,9 @@ field fall into a single group.</p>
}
TopGroups groupsResult = c2.getTopGroups(docOffset);
if (requiredTotalGroupCount) {
groupsResult = new TopGroups(groupsResult, allGroupsCollector.getGroupCount());
}
// Render groupsResult...
</pre>

View File

@ -0,0 +1,109 @@
package org.apache.lucene.search.grouping;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
public class AllGroupsCollectorTest extends LuceneTestCase {
public void testTotalGroupCount() throws Exception {
final String groupField = "author";
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(
random,
dir,
newIndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
// 0
Document doc = new Document();
doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("content", "random text", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("id", "1", Field.Store.YES, Field.Index.NO));
w.addDocument(doc);
// 1
doc = new Document();
doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("content", "some more random text blob", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("id", "2", Field.Store.YES, Field.Index.NO));
w.addDocument(doc);
// 2
doc = new Document();
doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("content", "some more random textual data", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("id", "3", Field.Store.YES, Field.Index.NO));
w.addDocument(doc);
w.commit(); // To ensure a second segment
// 3
doc = new Document();
doc.add(new Field(groupField, "author2", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("content", "some random text", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("id", "4", Field.Store.YES, Field.Index.NO));
w.addDocument(doc);
// 4
doc = new Document();
doc.add(new Field(groupField, "author3", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("content", "some more random text", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("id", "5", Field.Store.YES, Field.Index.NO));
w.addDocument(doc);
// 5
doc = new Document();
doc.add(new Field(groupField, "author3", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("content", "random blob", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("id", "6", Field.Store.YES, Field.Index.NO));
w.addDocument(doc);
// 6 -- no author field
doc = new Document();
doc.add(new Field("content", "random word stuck in alot of other text", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("id", "6", Field.Store.YES, Field.Index.NO));
w.addDocument(doc);
IndexSearcher indexSearcher = new IndexSearcher(w.getReader());
w.close();
AllGroupsCollector c1 = new AllGroupsCollector(groupField);
indexSearcher.search(new TermQuery(new Term("content", "random")), c1);
assertEquals(4, c1.getGroupCount());
AllGroupsCollector c2 = new AllGroupsCollector(groupField);
indexSearcher.search(new TermQuery(new Term("content", "some")), c2);
assertEquals(3, c2.getGroupCount());
AllGroupsCollector c3 = new AllGroupsCollector(groupField);
indexSearcher.search(new TermQuery(new Term("content", "blob")), c3);
assertEquals(2, c3.getGroupCount());
indexSearcher.getIndexReader().close();
dir.close();
}
}

View File

@ -17,13 +17,7 @@
package org.apache.lucene.search.grouping;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.*;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
@ -32,15 +26,7 @@ import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.CachingCollector;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
@ -121,7 +107,7 @@ public class TestGrouping extends LuceneTestCase {
final SecondPassGroupingCollector c2 = new SecondPassGroupingCollector(groupField, c1.getTopGroups(0, true), groupSort, null, 5, true, false, true);
indexSearcher.search(new TermQuery(new Term("content", "random")), c2);
final TopGroups groups = c2.getTopGroups(0);
assertEquals(7, groups.totalHitCount);
@ -243,6 +229,7 @@ public class TestGrouping extends LuceneTestCase {
boolean fillFields,
boolean getScores,
boolean getMaxScores,
boolean doAllGroups,
Sort groupSort,
Sort docSort,
int topNGroups,
@ -258,6 +245,7 @@ public class TestGrouping extends LuceneTestCase {
final List<Comparable<?>[]> sortedGroupFields = new ArrayList<Comparable<?>[]>();
int totalHitCount = 0;
Set<BytesRef> knownGroups = new HashSet<BytesRef>();
for(GroupDoc d : groupDocs) {
// TODO: would be better to filter by searchTerm before sorting!
@ -265,6 +253,13 @@ public class TestGrouping extends LuceneTestCase {
continue;
}
totalHitCount++;
if (doAllGroups) {
if (!knownGroups.contains(d.group)) {
knownGroups.add(d.group);
}
}
List<GroupDoc> l = groups.get(d.group);
if (l == null) {
sortedGroups.add(d.group);
@ -317,7 +312,14 @@ public class TestGrouping extends LuceneTestCase {
fillFields ? sortedGroupFields.get(idx) : null);
}
return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result);
if (doAllGroups) {
return new TopGroups(
new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result),
knownGroups.size()
);
} else {
return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result);
}
}
public void testRandom() throws Exception {
@ -335,7 +337,7 @@ public class TestGrouping extends LuceneTestCase {
if (VERBOSE) {
System.out.println("TEST: numDocs=" + numDocs + " numGroups=" + numGroups);
}
final List<BytesRef> groups = new ArrayList<BytesRef>();
for(int i=0;i<numGroups;i++) {
groups.add(new BytesRef(_TestUtil.randomRealisticUnicodeString(random)));
@ -428,8 +430,16 @@ public class TestGrouping extends LuceneTestCase {
//final int docOffset = 0;
final boolean doCache = random.nextBoolean();
final boolean doAllGroups = random.nextBoolean();
if (VERBOSE) {
System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup);
System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup + " doAllGroups=" + doAllGroups);
}
final AllGroupsCollector allGroupsCollector;
if (doAllGroups) {
allGroupsCollector = new AllGroupsCollector("group");
} else {
allGroupsCollector = null;
}
final FirstPassGroupingCollector c1 = new FirstPassGroupingCollector("group", groupSort, groupOffset+topNGroups);
@ -440,7 +450,16 @@ public class TestGrouping extends LuceneTestCase {
if (VERBOSE) {
System.out.println("TEST: maxCacheMB=" + maxCacheMB);
}
c = cCache = new CachingCollector(c1, true, maxCacheMB);
if (doAllGroups) {
cCache = new CachingCollector(c1, true, maxCacheMB);
c = MultiCollector.wrap(cCache, allGroupsCollector);
} else {
c = cCache = new CachingCollector(c1, true, maxCacheMB);
}
} else if (doAllGroups) {
c = MultiCollector.wrap(c1, allGroupsCollector);
cCache = null;
} else {
c = c1;
cCache = null;
@ -475,8 +494,13 @@ public class TestGrouping extends LuceneTestCase {
} else {
s.search(new TermQuery(new Term("content", searchTerm)), c2);
}
groupsResult = c2.getTopGroups(docOffset);
if (doAllGroups) {
TopGroups tempTopGroups = c2.getTopGroups(docOffset);
groupsResult = new TopGroups(tempTopGroups, allGroupsCollector.getGroupCount());
} else {
groupsResult = c2.getTopGroups(docOffset);
}
} else {
groupsResult = null;
if (VERBOSE) {
@ -484,7 +508,7 @@ public class TestGrouping extends LuceneTestCase {
}
}
final TopGroups expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset);
final TopGroups expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, doAllGroups, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset);
try {
// NOTE: intentional but temporary field cache insanity!
@ -509,7 +533,10 @@ public class TestGrouping extends LuceneTestCase {
assertEquals(expected.groups.length, actual.groups.length);
assertEquals(expected.totalHitCount, actual.totalHitCount);
assertEquals(expected.totalGroupedHitCount, actual.totalGroupedHitCount);
if (expected.totalGroupCount != null) {
assertEquals(expected.totalGroupCount, actual.totalGroupCount);
}
for(int groupIDX=0;groupIDX<expected.groups.length;groupIDX++) {
if (VERBOSE) {
System.out.println(" check groupIDX=" + groupIDX);