mirror of https://github.com/apache/lucene.git
LUCENE-3098: add AllGroupsCollector
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1104421 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
381461d3c9
commit
1c464e6dcc
|
@ -0,0 +1,131 @@
|
|||
package org.apache.lucene.search.grouping;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.search.Collector;
|
||||
import org.apache.lucene.search.FieldCache;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* A collector that collects all groups that match the
|
||||
* query. Only the group value is collected, and the order
|
||||
* is undefined. This collector does not determine
|
||||
* the most relevant document of a group.
|
||||
*
|
||||
* <p/>
|
||||
* Internally, {@link SentinelIntSet} is used to detect
|
||||
* if a group is already added to the total count. For each
|
||||
* segment the {@link SentinelIntSet} is cleared and filled
|
||||
* with previous counted groups that occur in the new
|
||||
* segment.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class AllGroupsCollector extends Collector {
|
||||
|
||||
private static final int DEFAULT_INITIAL_SIZE = 128;
|
||||
|
||||
private final String groupField;
|
||||
private final SentinelIntSet ordSet;
|
||||
private final List<BytesRef> groups;
|
||||
private final BytesRef spareBytesRef = new BytesRef();
|
||||
|
||||
private FieldCache.DocTermsIndex index;
|
||||
|
||||
/**
|
||||
* Expert: Constructs a {@link AllGroupsCollector}
|
||||
*
|
||||
* @param groupField The field to group by
|
||||
* @param initialSize The initial size of the {@link SentinelIntSet} and groups list. The initial size should
|
||||
* roughly match the total number of expected unique groups. Be aware that the heap usage
|
||||
* is 4 bytes * initialSize.
|
||||
*/
|
||||
public AllGroupsCollector(String groupField, int initialSize) {
|
||||
this.groupField = groupField;
|
||||
ordSet = new SentinelIntSet(initialSize, -1);
|
||||
groups = new ArrayList<BytesRef>(initialSize);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a {@link AllGroupsCollector}. This sets the initialSize for the {@link SentinelIntSet} and group list
|
||||
* to 128 in the {@link #AllGroupsCollector(String, int)} constructor.
|
||||
*
|
||||
* @param groupField The field to group by
|
||||
*/
|
||||
public AllGroupsCollector(String groupField) {
|
||||
this(groupField, DEFAULT_INITIAL_SIZE);
|
||||
}
|
||||
|
||||
public void setScorer(Scorer scorer) throws IOException {
|
||||
}
|
||||
|
||||
public void collect(int doc) throws IOException {
|
||||
int key = index.getOrd(doc);
|
||||
if (!ordSet.exists(key)) {
|
||||
ordSet.put(key);
|
||||
BytesRef term = key == 0 ? null : index.getTerm(doc, new BytesRef());
|
||||
groups.add(term);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the total number of groups for the executed search.
|
||||
* This is a convenience method. The following code snippet has the same effect: <pre>getGroups().size()</pre>
|
||||
*
|
||||
* @return The total number of groups for the executed search
|
||||
*/
|
||||
public int getGroupCount() {
|
||||
return groups.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the group values
|
||||
* <p/>
|
||||
* This is an unordered collections of group values. For each group that matched the query there is a {@link BytesRef}
|
||||
* representing a group value.
|
||||
*
|
||||
* @return the group values
|
||||
*/
|
||||
public Collection<BytesRef> getGroups() {
|
||||
return groups;
|
||||
}
|
||||
|
||||
public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException {
|
||||
index = FieldCache.DEFAULT.getTermsIndex(context.reader, groupField);
|
||||
|
||||
// Clear ordSet and fill it with previous encountered groups that can occur in the current segment.
|
||||
ordSet.clear();
|
||||
for (BytesRef countedGroup : groups) {
|
||||
int ord = index.binarySearchLookup(countedGroup, spareBytesRef);
|
||||
if (ord >= 0) {
|
||||
ordSet.put(ord);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public boolean acceptsDocsOutOfOrder() {
|
||||
return true;
|
||||
}
|
||||
}
|
|
@ -21,9 +21,6 @@ import org.apache.lucene.search.SortField;
|
|||
|
||||
/** Represents result returned by a grouping search.
|
||||
*
|
||||
* Note that we do not return the total number of unique
|
||||
* groups; doing so would be costly.
|
||||
*
|
||||
* @lucene.experimental */
|
||||
public class TopGroups {
|
||||
/** Number of documents matching the search */
|
||||
|
@ -32,6 +29,9 @@ public class TopGroups {
|
|||
/** Number of documents grouped into the topN groups */
|
||||
public final int totalGroupedHitCount;
|
||||
|
||||
/** The total number of unique groups. If <code>null</code> this value is not computed. */
|
||||
public final Integer totalGroupCount;
|
||||
|
||||
/** Group results in groupSort order */
|
||||
public final GroupDocs[] groups;
|
||||
|
||||
|
@ -47,5 +47,15 @@ public class TopGroups {
|
|||
this.totalHitCount = totalHitCount;
|
||||
this.totalGroupedHitCount = totalGroupedHitCount;
|
||||
this.groups = groups;
|
||||
this.totalGroupCount = null;
|
||||
}
|
||||
|
||||
public TopGroups(TopGroups oldTopGroups, Integer totalGroupCount) {
|
||||
this.groupSort = oldTopGroups.groupSort;
|
||||
this.withinGroupSort = oldTopGroups.withinGroupSort;
|
||||
this.totalHitCount = oldTopGroups.totalHitCount;
|
||||
this.totalGroupedHitCount = oldTopGroups.totalGroupedHitCount;
|
||||
this.groups = oldTopGroups.groups;
|
||||
this.totalGroupCount = totalGroupCount;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -88,6 +88,13 @@ field fall into a single group.</p>
|
|||
boolean fillFields = true;
|
||||
SecondPassGroupingCollector c2 = new SecondPassGroupingCollector("author", topGroups, groupSort, docSort, docOffset+docsPerGroup, getScores, getMaxScores, fillFields);
|
||||
|
||||
//Optionally compute total group count
|
||||
AllGroupsCollector allGroupsCollector = null;
|
||||
if (requiredTotalGroupCount) {
|
||||
allGroupsCollector = new AllGroupsCollector("author");
|
||||
c2 = MultiCollector.wrap(c2, allGroupsCollector);
|
||||
}
|
||||
|
||||
if (cachedCollector.isCached()) {
|
||||
// Cache fit within maxCacheRAMMB, so we can replay it:
|
||||
cachedCollector.replay(c2);
|
||||
|
@ -97,6 +104,9 @@ field fall into a single group.</p>
|
|||
}
|
||||
|
||||
TopGroups groupsResult = c2.getTopGroups(docOffset);
|
||||
if (requiredTotalGroupCount) {
|
||||
groupResult = new TopGroups(groupsResult, allGroupsCollector.getGroupCount());
|
||||
}
|
||||
|
||||
// Render groupsResult...
|
||||
</pre>
|
||||
|
|
|
@ -0,0 +1,109 @@
|
|||
package org.apache.lucene.search.grouping;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
public class AllGroupsCollectorTest extends LuceneTestCase {
|
||||
|
||||
public void testTotalGroupCount() throws Exception {
|
||||
|
||||
final String groupField = "author";
|
||||
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter w = new RandomIndexWriter(
|
||||
random,
|
||||
dir,
|
||||
newIndexWriterConfig(TEST_VERSION_CURRENT,
|
||||
new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
|
||||
// 0
|
||||
Document doc = new Document();
|
||||
doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED));
|
||||
doc.add(new Field("content", "random text", Field.Store.YES, Field.Index.ANALYZED));
|
||||
doc.add(new Field("id", "1", Field.Store.YES, Field.Index.NO));
|
||||
w.addDocument(doc);
|
||||
|
||||
// 1
|
||||
doc = new Document();
|
||||
doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED));
|
||||
doc.add(new Field("content", "some more random text blob", Field.Store.YES, Field.Index.ANALYZED));
|
||||
doc.add(new Field("id", "2", Field.Store.YES, Field.Index.NO));
|
||||
w.addDocument(doc);
|
||||
|
||||
// 2
|
||||
doc = new Document();
|
||||
doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED));
|
||||
doc.add(new Field("content", "some more random textual data", Field.Store.YES, Field.Index.ANALYZED));
|
||||
doc.add(new Field("id", "3", Field.Store.YES, Field.Index.NO));
|
||||
w.addDocument(doc);
|
||||
w.commit(); // To ensure a second segment
|
||||
|
||||
// 3
|
||||
doc = new Document();
|
||||
doc.add(new Field(groupField, "author2", Field.Store.YES, Field.Index.ANALYZED));
|
||||
doc.add(new Field("content", "some random text", Field.Store.YES, Field.Index.ANALYZED));
|
||||
doc.add(new Field("id", "4", Field.Store.YES, Field.Index.NO));
|
||||
w.addDocument(doc);
|
||||
|
||||
// 4
|
||||
doc = new Document();
|
||||
doc.add(new Field(groupField, "author3", Field.Store.YES, Field.Index.ANALYZED));
|
||||
doc.add(new Field("content", "some more random text", Field.Store.YES, Field.Index.ANALYZED));
|
||||
doc.add(new Field("id", "5", Field.Store.YES, Field.Index.NO));
|
||||
w.addDocument(doc);
|
||||
|
||||
// 5
|
||||
doc = new Document();
|
||||
doc.add(new Field(groupField, "author3", Field.Store.YES, Field.Index.ANALYZED));
|
||||
doc.add(new Field("content", "random blob", Field.Store.YES, Field.Index.ANALYZED));
|
||||
doc.add(new Field("id", "6", Field.Store.YES, Field.Index.NO));
|
||||
w.addDocument(doc);
|
||||
|
||||
// 6 -- no author field
|
||||
doc = new Document();
|
||||
doc.add(new Field("content", "random word stuck in alot of other text", Field.Store.YES, Field.Index.ANALYZED));
|
||||
doc.add(new Field("id", "6", Field.Store.YES, Field.Index.NO));
|
||||
w.addDocument(doc);
|
||||
|
||||
IndexSearcher indexSearcher = new IndexSearcher(w.getReader());
|
||||
w.close();
|
||||
|
||||
AllGroupsCollector c1 = new AllGroupsCollector(groupField);
|
||||
indexSearcher.search(new TermQuery(new Term("content", "random")), c1);
|
||||
assertEquals(4, c1.getGroupCount());
|
||||
|
||||
AllGroupsCollector c2 = new AllGroupsCollector(groupField);
|
||||
indexSearcher.search(new TermQuery(new Term("content", "some")), c2);
|
||||
assertEquals(3, c2.getGroupCount());
|
||||
|
||||
AllGroupsCollector c3 = new AllGroupsCollector(groupField);
|
||||
indexSearcher.search(new TermQuery(new Term("content", "blob")), c3);
|
||||
assertEquals(2, c3.getGroupCount());
|
||||
|
||||
indexSearcher.getIndexReader().close();
|
||||
dir.close();
|
||||
}
|
||||
}
|
|
@ -17,13 +17,7 @@
|
|||
|
||||
package org.apache.lucene.search.grouping;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
|
@ -32,15 +26,7 @@ import org.apache.lucene.document.NumericField;
|
|||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.CachingCollector;
|
||||
import org.apache.lucene.search.Collector;
|
||||
import org.apache.lucene.search.FieldCache;
|
||||
import org.apache.lucene.search.FieldDoc;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.search.SortField;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
@ -121,7 +107,7 @@ public class TestGrouping extends LuceneTestCase {
|
|||
|
||||
final SecondPassGroupingCollector c2 = new SecondPassGroupingCollector(groupField, c1.getTopGroups(0, true), groupSort, null, 5, true, false, true);
|
||||
indexSearcher.search(new TermQuery(new Term("content", "random")), c2);
|
||||
|
||||
|
||||
final TopGroups groups = c2.getTopGroups(0);
|
||||
|
||||
assertEquals(7, groups.totalHitCount);
|
||||
|
@ -243,6 +229,7 @@ public class TestGrouping extends LuceneTestCase {
|
|||
boolean fillFields,
|
||||
boolean getScores,
|
||||
boolean getMaxScores,
|
||||
boolean doAllGroups,
|
||||
Sort groupSort,
|
||||
Sort docSort,
|
||||
int topNGroups,
|
||||
|
@ -258,6 +245,7 @@ public class TestGrouping extends LuceneTestCase {
|
|||
final List<Comparable<?>[]> sortedGroupFields = new ArrayList<Comparable<?>[]>();
|
||||
|
||||
int totalHitCount = 0;
|
||||
Set<BytesRef> knownGroups = new HashSet<BytesRef>();
|
||||
|
||||
for(GroupDoc d : groupDocs) {
|
||||
// TODO: would be better to filter by searchTerm before sorting!
|
||||
|
@ -265,6 +253,13 @@ public class TestGrouping extends LuceneTestCase {
|
|||
continue;
|
||||
}
|
||||
totalHitCount++;
|
||||
|
||||
if (doAllGroups) {
|
||||
if (!knownGroups.contains(d.group)) {
|
||||
knownGroups.add(d.group);
|
||||
}
|
||||
}
|
||||
|
||||
List<GroupDoc> l = groups.get(d.group);
|
||||
if (l == null) {
|
||||
sortedGroups.add(d.group);
|
||||
|
@ -317,7 +312,14 @@ public class TestGrouping extends LuceneTestCase {
|
|||
fillFields ? sortedGroupFields.get(idx) : null);
|
||||
}
|
||||
|
||||
return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result);
|
||||
if (doAllGroups) {
|
||||
return new TopGroups(
|
||||
new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result),
|
||||
knownGroups.size()
|
||||
);
|
||||
} else {
|
||||
return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result);
|
||||
}
|
||||
}
|
||||
|
||||
public void testRandom() throws Exception {
|
||||
|
@ -335,7 +337,7 @@ public class TestGrouping extends LuceneTestCase {
|
|||
if (VERBOSE) {
|
||||
System.out.println("TEST: numDocs=" + numDocs + " numGroups=" + numGroups);
|
||||
}
|
||||
|
||||
|
||||
final List<BytesRef> groups = new ArrayList<BytesRef>();
|
||||
for(int i=0;i<numGroups;i++) {
|
||||
groups.add(new BytesRef(_TestUtil.randomRealisticUnicodeString(random)));
|
||||
|
@ -428,8 +430,16 @@ public class TestGrouping extends LuceneTestCase {
|
|||
//final int docOffset = 0;
|
||||
|
||||
final boolean doCache = random.nextBoolean();
|
||||
final boolean doAllGroups = random.nextBoolean();
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup);
|
||||
System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup + " doAllGroups=" + doAllGroups);
|
||||
}
|
||||
|
||||
final AllGroupsCollector allGroupsCollector;
|
||||
if (doAllGroups) {
|
||||
allGroupsCollector = new AllGroupsCollector("group");
|
||||
} else {
|
||||
allGroupsCollector = null;
|
||||
}
|
||||
|
||||
final FirstPassGroupingCollector c1 = new FirstPassGroupingCollector("group", groupSort, groupOffset+topNGroups);
|
||||
|
@ -440,7 +450,16 @@ public class TestGrouping extends LuceneTestCase {
|
|||
if (VERBOSE) {
|
||||
System.out.println("TEST: maxCacheMB=" + maxCacheMB);
|
||||
}
|
||||
c = cCache = new CachingCollector(c1, true, maxCacheMB);
|
||||
|
||||
if (doAllGroups) {
|
||||
cCache = new CachingCollector(c1, true, maxCacheMB);
|
||||
c = MultiCollector.wrap(cCache, allGroupsCollector);
|
||||
} else {
|
||||
c = cCache = new CachingCollector(c1, true, maxCacheMB);
|
||||
}
|
||||
} else if (doAllGroups) {
|
||||
c = MultiCollector.wrap(c1, allGroupsCollector);
|
||||
cCache = null;
|
||||
} else {
|
||||
c = c1;
|
||||
cCache = null;
|
||||
|
@ -475,8 +494,13 @@ public class TestGrouping extends LuceneTestCase {
|
|||
} else {
|
||||
s.search(new TermQuery(new Term("content", searchTerm)), c2);
|
||||
}
|
||||
|
||||
groupsResult = c2.getTopGroups(docOffset);
|
||||
|
||||
if (doAllGroups) {
|
||||
TopGroups tempTopGroups = c2.getTopGroups(docOffset);
|
||||
groupsResult = new TopGroups(tempTopGroups, allGroupsCollector.getGroupCount());
|
||||
} else {
|
||||
groupsResult = c2.getTopGroups(docOffset);
|
||||
}
|
||||
} else {
|
||||
groupsResult = null;
|
||||
if (VERBOSE) {
|
||||
|
@ -484,7 +508,7 @@ public class TestGrouping extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
final TopGroups expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset);
|
||||
final TopGroups expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, doAllGroups, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset);
|
||||
|
||||
try {
|
||||
// NOTE: intentional but temporary field cache insanity!
|
||||
|
@ -509,7 +533,10 @@ public class TestGrouping extends LuceneTestCase {
|
|||
assertEquals(expected.groups.length, actual.groups.length);
|
||||
assertEquals(expected.totalHitCount, actual.totalHitCount);
|
||||
assertEquals(expected.totalGroupedHitCount, actual.totalGroupedHitCount);
|
||||
|
||||
if (expected.totalGroupCount != null) {
|
||||
assertEquals(expected.totalGroupCount, actual.totalGroupCount);
|
||||
}
|
||||
|
||||
for(int groupIDX=0;groupIDX<expected.groups.length;groupIDX++) {
|
||||
if (VERBOSE) {
|
||||
System.out.println(" check groupIDX=" + groupIDX);
|
||||
|
|
Loading…
Reference in New Issue