LUCENE-3098: add AllGroupsCollector

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1104421 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2011-05-17 17:20:54 +00:00
parent 381461d3c9
commit 1c464e6dcc
5 changed files with 315 additions and 28 deletions

View File

@ -0,0 +1,131 @@
package org.apache.lucene.search.grouping;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.util.BytesRef;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
/**
* A collector that collects all groups that match the
* query. Only the group value is collected, and the order
* is undefined. This collector does not determine
* the most relevant document of a group.
*
* <p/>
* Internally, {@link SentinelIntSet} is used to detect
* if a group is already added to the total count. For each
* segment the {@link SentinelIntSet} is cleared and filled
* with previous counted groups that occur in the new
* segment.
*
* @lucene.experimental
*/
public class AllGroupsCollector extends Collector {

  private static final int DEFAULT_INITIAL_SIZE = 128;

  private final String groupField;
  private final SentinelIntSet ordSet;
  private final List<BytesRef> groups;
  private final BytesRef spare = new BytesRef();

  // Per-segment term/ord view of the group field; refreshed in setNextReader.
  private FieldCache.DocTermsIndex docTermsIndex;

  /**
   * Expert: Constructs a {@link AllGroupsCollector}.
   *
   * @param groupField  The field to group by
   * @param initialSize The initial size of the {@link SentinelIntSet} and the groups list.
   *                    Should roughly match the expected number of unique groups; note that
   *                    memory is pre-allocated proportionally to this value.
   */
  public AllGroupsCollector(String groupField, int initialSize) {
    this.groupField = groupField;
    this.ordSet = new SentinelIntSet(initialSize, -1);
    this.groups = new ArrayList<BytesRef>(initialSize);
  }

  /**
   * Constructs a {@link AllGroupsCollector} using a default initial size of 128
   * for the {@link SentinelIntSet} and the groups list
   * (see {@link #AllGroupsCollector(String, int)}).
   *
   * @param groupField The field to group by
   */
  public AllGroupsCollector(String groupField) {
    this(groupField, DEFAULT_INITIAL_SIZE);
  }

  public void setScorer(Scorer scorer) throws IOException {
    // Scores are never used by this collector.
  }

  public void collect(int doc) throws IOException {
    final int ord = docTermsIndex.getOrd(doc);
    if (ordSet.exists(ord)) {
      return; // group already counted in this segment
    }
    ordSet.put(ord);
    // Ord 0 denotes a document without a value for the group field;
    // record that group as null rather than materializing a term.
    final BytesRef groupValue;
    if (ord == 0) {
      groupValue = null;
    } else {
      groupValue = docTermsIndex.getTerm(doc, new BytesRef());
    }
    groups.add(groupValue);
  }

  /**
   * Returns the total number of groups for the executed search.
   * Convenience for <pre>getGroups().size()</pre>.
   *
   * @return The total number of groups for the executed search
   */
  public int getGroupCount() {
    return groups.size();
  }

  /**
   * Returns the group values.
   * <p/>
   * An unordered collection holding one {@link BytesRef} per group that
   * matched the query (null represents the "no value" group).
   *
   * @return the group values
   */
  public Collection<BytesRef> getGroups() {
    return groups;
  }

  public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException {
    docTermsIndex = FieldCache.DEFAULT.getTermsIndex(context.reader, groupField);

    // Ords are segment-local, so re-seed the set with the segment ords of
    // all groups counted so far that also occur in this segment.
    ordSet.clear();
    for (BytesRef countedGroup : groups) {
      final int segmentOrd = docTermsIndex.binarySearchLookup(countedGroup, spare);
      if (segmentOrd >= 0) {
        ordSet.put(segmentOrd);
      }
    }
  }

  public boolean acceptsDocsOutOfOrder() {
    return true;
  }
}

View File

@ -21,9 +21,6 @@ import org.apache.lucene.search.SortField;
/** Represents result returned by a grouping search.
*
* Note that we do not return the total number of unique
* groups; doing so would be costly.
*
* @lucene.experimental */
public class TopGroups {
/** Number of documents matching the search */
@ -32,6 +29,9 @@ public class TopGroups {
/** Number of documents grouped into the topN groups */
public final int totalGroupedHitCount;
/** The total number of unique groups. If <code>null</code> this value is not computed. */
public final Integer totalGroupCount;
/** Group results in groupSort order */
public final GroupDocs[] groups;
@ -47,5 +47,15 @@ public class TopGroups {
this.totalHitCount = totalHitCount;
this.totalGroupedHitCount = totalGroupedHitCount;
this.groups = groups;
this.totalGroupCount = null;
}
public TopGroups(TopGroups oldTopGroups, Integer totalGroupCount) {
this.groupSort = oldTopGroups.groupSort;
this.withinGroupSort = oldTopGroups.withinGroupSort;
this.totalHitCount = oldTopGroups.totalHitCount;
this.totalGroupedHitCount = oldTopGroups.totalGroupedHitCount;
this.groups = oldTopGroups.groups;
this.totalGroupCount = totalGroupCount;
}
}

View File

@ -88,6 +88,13 @@ field fall into a single group.</p>
boolean fillFields = true;
SecondPassGroupingCollector c2 = new SecondPassGroupingCollector("author", topGroups, groupSort, docSort, docOffset+docsPerGroup, getScores, getMaxScores, fillFields);
//Optionally compute total group count
AllGroupsCollector allGroupsCollector = null;
if (requiredTotalGroupCount) {
allGroupsCollector = new AllGroupsCollector("author");
c2 = MultiCollector.wrap(c2, allGroupsCollector);
}
if (cachedCollector.isCached()) {
// Cache fit within maxCacheRAMMB, so we can replay it:
cachedCollector.replay(c2);
@ -97,6 +104,9 @@ field fall into a single group.</p>
}
TopGroups groupsResult = c2.getTopGroups(docOffset);
if (requiredTotalGroupCount) {
groupsResult = new TopGroups(groupsResult, allGroupsCollector.getGroupCount());
}
// Render groupsResult...
</pre>

View File

@ -0,0 +1,109 @@
package org.apache.lucene.search.grouping;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
public class AllGroupsCollectorTest extends LuceneTestCase {
public void testTotalGroupCount() throws Exception {
final String groupField = "author";
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(
random,
dir,
newIndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
// 0
Document doc = new Document();
doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("content", "random text", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("id", "1", Field.Store.YES, Field.Index.NO));
w.addDocument(doc);
// 1
doc = new Document();
doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("content", "some more random text blob", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("id", "2", Field.Store.YES, Field.Index.NO));
w.addDocument(doc);
// 2
doc = new Document();
doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("content", "some more random textual data", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("id", "3", Field.Store.YES, Field.Index.NO));
w.addDocument(doc);
w.commit(); // To ensure a second segment
// 3
doc = new Document();
doc.add(new Field(groupField, "author2", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("content", "some random text", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("id", "4", Field.Store.YES, Field.Index.NO));
w.addDocument(doc);
// 4
doc = new Document();
doc.add(new Field(groupField, "author3", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("content", "some more random text", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("id", "5", Field.Store.YES, Field.Index.NO));
w.addDocument(doc);
// 5
doc = new Document();
doc.add(new Field(groupField, "author3", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("content", "random blob", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("id", "6", Field.Store.YES, Field.Index.NO));
w.addDocument(doc);
// 6 -- no author field
doc = new Document();
doc.add(new Field("content", "random word stuck in alot of other text", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("id", "6", Field.Store.YES, Field.Index.NO));
w.addDocument(doc);
IndexSearcher indexSearcher = new IndexSearcher(w.getReader());
w.close();
AllGroupsCollector c1 = new AllGroupsCollector(groupField);
indexSearcher.search(new TermQuery(new Term("content", "random")), c1);
assertEquals(4, c1.getGroupCount());
AllGroupsCollector c2 = new AllGroupsCollector(groupField);
indexSearcher.search(new TermQuery(new Term("content", "some")), c2);
assertEquals(3, c2.getGroupCount());
AllGroupsCollector c3 = new AllGroupsCollector(groupField);
indexSearcher.search(new TermQuery(new Term("content", "blob")), c3);
assertEquals(2, c3.getGroupCount());
indexSearcher.getIndexReader().close();
dir.close();
}
}

View File

@ -17,13 +17,7 @@
package org.apache.lucene.search.grouping;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.*;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
@ -32,15 +26,7 @@ import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.CachingCollector;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
@ -121,7 +107,7 @@ public class TestGrouping extends LuceneTestCase {
final SecondPassGroupingCollector c2 = new SecondPassGroupingCollector(groupField, c1.getTopGroups(0, true), groupSort, null, 5, true, false, true);
indexSearcher.search(new TermQuery(new Term("content", "random")), c2);
final TopGroups groups = c2.getTopGroups(0);
assertEquals(7, groups.totalHitCount);
@ -243,6 +229,7 @@ public class TestGrouping extends LuceneTestCase {
boolean fillFields,
boolean getScores,
boolean getMaxScores,
boolean doAllGroups,
Sort groupSort,
Sort docSort,
int topNGroups,
@ -258,6 +245,7 @@ public class TestGrouping extends LuceneTestCase {
final List<Comparable<?>[]> sortedGroupFields = new ArrayList<Comparable<?>[]>();
int totalHitCount = 0;
Set<BytesRef> knownGroups = new HashSet<BytesRef>();
for(GroupDoc d : groupDocs) {
// TODO: would be better to filter by searchTerm before sorting!
@ -265,6 +253,13 @@ public class TestGrouping extends LuceneTestCase {
continue;
}
totalHitCount++;
if (doAllGroups) {
if (!knownGroups.contains(d.group)) {
knownGroups.add(d.group);
}
}
List<GroupDoc> l = groups.get(d.group);
if (l == null) {
sortedGroups.add(d.group);
@ -317,7 +312,14 @@ public class TestGrouping extends LuceneTestCase {
fillFields ? sortedGroupFields.get(idx) : null);
}
return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result);
if (doAllGroups) {
return new TopGroups(
new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result),
knownGroups.size()
);
} else {
return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result);
}
}
public void testRandom() throws Exception {
@ -335,7 +337,7 @@ public class TestGrouping extends LuceneTestCase {
if (VERBOSE) {
System.out.println("TEST: numDocs=" + numDocs + " numGroups=" + numGroups);
}
final List<BytesRef> groups = new ArrayList<BytesRef>();
for(int i=0;i<numGroups;i++) {
groups.add(new BytesRef(_TestUtil.randomRealisticUnicodeString(random)));
@ -428,8 +430,16 @@ public class TestGrouping extends LuceneTestCase {
//final int docOffset = 0;
final boolean doCache = random.nextBoolean();
final boolean doAllGroups = random.nextBoolean();
if (VERBOSE) {
System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup);
System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup + " doAllGroups=" + doAllGroups);
}
final AllGroupsCollector allGroupsCollector;
if (doAllGroups) {
allGroupsCollector = new AllGroupsCollector("group");
} else {
allGroupsCollector = null;
}
final FirstPassGroupingCollector c1 = new FirstPassGroupingCollector("group", groupSort, groupOffset+topNGroups);
@ -440,7 +450,16 @@ public class TestGrouping extends LuceneTestCase {
if (VERBOSE) {
System.out.println("TEST: maxCacheMB=" + maxCacheMB);
}
c = cCache = new CachingCollector(c1, true, maxCacheMB);
if (doAllGroups) {
cCache = new CachingCollector(c1, true, maxCacheMB);
c = MultiCollector.wrap(cCache, allGroupsCollector);
} else {
c = cCache = new CachingCollector(c1, true, maxCacheMB);
}
} else if (doAllGroups) {
c = MultiCollector.wrap(c1, allGroupsCollector);
cCache = null;
} else {
c = c1;
cCache = null;
@ -475,8 +494,13 @@ public class TestGrouping extends LuceneTestCase {
} else {
s.search(new TermQuery(new Term("content", searchTerm)), c2);
}
groupsResult = c2.getTopGroups(docOffset);
if (doAllGroups) {
TopGroups tempTopGroups = c2.getTopGroups(docOffset);
groupsResult = new TopGroups(tempTopGroups, allGroupsCollector.getGroupCount());
} else {
groupsResult = c2.getTopGroups(docOffset);
}
} else {
groupsResult = null;
if (VERBOSE) {
@ -484,7 +508,7 @@ public class TestGrouping extends LuceneTestCase {
}
}
final TopGroups expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset);
final TopGroups expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, doAllGroups, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset);
try {
// NOTE: intentional but temporary field cache insanity!
@ -509,7 +533,10 @@ public class TestGrouping extends LuceneTestCase {
assertEquals(expected.groups.length, actual.groups.length);
assertEquals(expected.totalHitCount, actual.totalHitCount);
assertEquals(expected.totalGroupedHitCount, actual.totalGroupedHitCount);
if (expected.totalGroupCount != null) {
assertEquals(expected.totalGroupCount, actual.totalGroupCount);
}
for(int groupIDX=0;groupIDX<expected.groups.length;groupIDX++) {
if (VERBOSE) {
System.out.println(" check groupIDX=" + groupIDX);