SOLR-2564: Integrating grouping module into Solr 4.0

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1137037 13f79535-47bb-0310-9956-ffa450edef68
Martijn van Groningen 2011-06-17 21:25:59 +00:00
parent 4c6cc4ac18
commit d805da14c2
9 changed files with 1095 additions and 790 deletions

View File

@@ -17,22 +17,22 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.util.RamUsageEstimator;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.util.RamUsageEstimator;
/**
* Caches all docs, and optionally also scores, coming from
* a search, and is then able to replay them to another
* collector. You specify the max RAM this class may use.
* Once the collection is done, call {@link #isCached}. If
* this returns true, you can use {@link #replay} against a
* new collector. If it returns false, this means too much
* RAM was required and you must instead re-run the original
* search.
* Once the collection is done, call {@link #isCached}. If
* this returns true, you can use {@link #replay(Collector)}
* against a new collector. If it returns false, this means
* too much RAM was required and you must instead re-run the
* original search.
*
* <p><b>NOTE</b>: this class consumes 4 bytes (or 8 bytes, if
* scoring is cached) per collected document. If the result
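A minimal sketch (editorial, not part of the patch) of the collect-then-replay contract this javadoc describes; the searcher, query, and hit count are assumed for illustration:

import java.io.IOException;
import org.apache.lucene.search.CachingCollector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;

class ReplaySketch {
  // Run the search once through a caching wrapper; if everything fit within
  // the RAM budget, replay the cached docs/scores instead of re-searching.
  static TopDocs firstAndSecondPass(IndexSearcher searcher, Query query) throws IOException {
    TopScoreDocCollector first = TopScoreDocCollector.create(10, true);
    CachingCollector cache = CachingCollector.create(first, true /* cacheScores */, 16.0 /* maxRAMMB */);
    searcher.search(query, cache);

    TopScoreDocCollector second = TopScoreDocCollector.create(10, true);
    if (cache.isCached()) {
      cache.replay(second);           // cheap: feed cached docs and scores to the new collector
    } else {
      searcher.search(query, second); // too much RAM was required; re-run the original search
    }
    return second.topDocs();
  }
}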
@@ -105,7 +105,16 @@ public abstract class CachingCollector extends Collector {
cachedScorer = new CachedScorer();
cachedScores = new ArrayList<float[]>();
curScores = new float[128];
curScores = new float[INITIAL_ARRAY_SIZE];
cachedScores.add(curScores);
}
ScoreCachingCollector(Collector other, int maxDocsToCache) {
super(other, maxDocsToCache);
cachedScorer = new CachedScorer();
cachedScores = new ArrayList<float[]>();
curScores = new float[INITIAL_ARRAY_SIZE];
cachedScores.add(curScores);
}
@@ -210,7 +219,11 @@ public abstract class CachingCollector extends Collector {
NoScoreCachingCollector(Collector other, double maxRAMMB) {
super(other, maxRAMMB, false);
}
NoScoreCachingCollector(Collector other, int maxDocsToCache) {
super(other, maxDocsToCache);
}
@Override
public void collect(int doc) throws IOException {
@@ -353,7 +366,25 @@ public abstract class CachingCollector extends Collector {
*/
public static CachingCollector create(Collector other, boolean cacheScores, double maxRAMMB) {
return cacheScores ? new ScoreCachingCollector(other, maxRAMMB) : new NoScoreCachingCollector(other, maxRAMMB);
}
}
/**
* Create a new {@link CachingCollector} that wraps the given collector and
* caches documents and scores up to the specified max docs threshold.
*
* @param other
* the Collector to wrap and delegate calls to.
* @param cacheScores
* whether to cache scores in addition to document IDs. Note that
* this increases the RAM consumed per doc.
* @param maxDocsToCache
* the maximum number of documents to cache (the documents and,
* possibly, the scores). If the collector exceeds this threshold,
* no documents or scores are cached.
*/
public static CachingCollector create(Collector other, boolean cacheScores, int maxDocsToCache) {
return cacheScores ? new ScoreCachingCollector(other, maxDocsToCache) : new NoScoreCachingCollector(other, maxDocsToCache);
}
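A hedged sketch (not from the patch) contrasting the two factory overloads; note that a bare int literal now binds to the new doc-count variant, which is why the test change below switches 1 to 1.0:

import org.apache.lucene.search.CachingCollector;
import org.apache.lucene.search.Collector;

class OverloadSketch {
  static void createBoth(Collector other) {
    // RAM bound: caching aborts once roughly 16 MB would be consumed.
    CachingCollector byRam = CachingCollector.create(other, true, 16.0);
    // Doc-count bound (new in this commit): caching aborts beyond 100000 docs.
    CachingCollector byDocs = CachingCollector.create(other, true, 100000);
    // Pitfall: create(other, true, 1) caches at most one document,
    // while create(other, true, 1.0) caches up to 1 MB worth of documents.
  }
}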
// Prevent extension from non-internal classes
private CachingCollector(Collector other, double maxRAMMB, boolean cacheScores) {
@@ -369,6 +400,15 @@ public abstract class CachingCollector extends Collector {
}
maxDocsToCache = (int) ((maxRAMMB * 1024 * 1024) / bytesPerDoc);
}
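As a quick check of this formula, using the per-doc costs from the class javadoc (4 bytes without scores, 8 bytes with):

// maxRAMMB = 16, cacheScores = false (bytesPerDoc = 4):
//   maxDocsToCache = (16 * 1024 * 1024) / 4 = 4194304 docs
// maxRAMMB = 16, cacheScores = true  (bytesPerDoc = 8):
//   maxDocsToCache = (16 * 1024 * 1024) / 8 = 2097152 docs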
private CachingCollector(Collector other, int maxDocsToCache) {
this.other = other;
cachedDocs = new ArrayList<int[]>();
curDocs = new int[INITIAL_ARRAY_SIZE];
cachedDocs.add(curDocs);
this.maxDocsToCache = maxDocsToCache;
}
@Override
public boolean acceptsDocsOutOfOrder() {

View File

@@ -17,15 +17,11 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.CachingCollector;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.LuceneTestCase;
import java.io.IOException;
public class TestCachingCollector extends LuceneTestCase {
private static final double ONE_BYTE = 1.0 / (1024 * 1024); // 1 byte expressed in MB
@@ -76,7 +72,7 @@ public class TestCachingCollector extends LuceneTestCase {
public void testBasic() throws Exception {
for (boolean cacheScores : new boolean[] { false, true }) {
CachingCollector cc = CachingCollector.create(new NoOpCollector(false), cacheScores, 1);
CachingCollector cc = CachingCollector.create(new NoOpCollector(false), cacheScores, 1.0);
cc.setScorer(new MockScorer());
// collect 1000 docs

View File

@@ -78,6 +78,8 @@ New Features
Bojan Smid, Charles Hornberger, Dieter Grad, Dmitry Lihachev, Doug Steigerwald,
Karsten Sperling, Michael Gundlach, Oleg Gnatovskiy, Thomas Traeger,
Harish Agarwal, yonik)
* SOLR-2564: Integrate grouping module into Solr. Also adds the ability to return the
  number of groups that match a query. (Martijn van Groningen)
* SOLR-1665: Add debug component options for timings, results and query info only (gsingers, hossman, yonik)

View File

@@ -198,6 +198,7 @@
<pathelement location="${common-solr.dir}/../lucene/build/contrib/misc/classes/java" />
<pathelement location="${common-solr.dir}/../lucene/build/contrib/queries/classes/java" />
<pathelement location="${common-solr.dir}/../lucene/build/contrib/spatial/classes/java" />
<pathelement location="${common-solr.dir}/../modules/grouping/build/classes/java" />
</path>
<target name="prep-lucene-jars">
@@ -214,6 +215,7 @@
<fileset dir="../lucene/contrib/misc" includes="build.xml" />
<fileset dir="../lucene/contrib/queries" includes="build.xml" />
<fileset dir="../lucene/contrib/spatial" includes="build.xml" />
<fileset dir="../modules/grouping" includes="build.xml" />
</subant>
</sequential>
</target>
@@ -248,6 +250,9 @@
<fileset dir="../lucene/build/contrib/spatial">
<include name="lucene-spatial-${version}.jar" />
</fileset>
<fileset dir="../modules/grouping/build">
<include name="lucene-grouping-${version}.jar" />
</fileset>
</copy>
</target>
@@ -262,6 +267,7 @@
<fileset dir="../lucene/contrib/misc" includes="build.xml"/>
<fileset dir="../lucene/contrib/queries" includes="build.xml"/>
<fileset dir="../lucene/contrib/spatial" includes="build.xml"/>
<fileset dir="../modules/grouping" includes="build.xml"/>
</subant>
</target>

View File

@@ -38,5 +38,20 @@ public interface GroupParams {
/** The format of the grouped response: "grouped" (the default) or "simple". */
public static final String GROUP_FORMAT = GROUP + ".format";
/**
* Whether to cache the first-pass search (doc ids and scores) for the second-pass search.
* Also defines the maximum size of the group cache as a percentage of maxdoc.
* Values are integers from 0 to 100. A value of 0 disables the group cache.
* The default is 0.
*/
public static final String GROUP_CACHE_PERCENTAGE = GROUP + ".cache.percent";
// Note: since you can supply multiple fields to group on but only have facets for the whole result,
// it only makes sense to support these parameters for the first group.
/** Whether the docSet (for example, for faceting) should be based on plain documents (a.k.a. UNGROUPED) or on the groups (a.k.a. GROUPED). */
public static final String GROUP_COLLAPSE = GROUP + ".collapse";
/** Whether the group count should be included in the response. */
public static final String GROUP_TOTAL_COUNT = GROUP + ".ngroups";
}
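A hedged sketch of building a request that exercises the two new parameters (the query and field values are illustrative only):

import org.apache.solr.common.params.GroupParams;
import org.apache.solr.common.params.ModifiableSolrParams;

class GroupParamsSketch {
  static ModifiableSolrParams groupedRequestParams() {
    ModifiableSolrParams p = new ModifiableSolrParams();
    p.set("q", "title:title");
    p.set("group", "true");
    p.set("group.field", "name");
    p.set(GroupParams.GROUP_TOTAL_COUNT, "true");    // include an "ngroups" count in the response
    p.set(GroupParams.GROUP_CACHE_PERCENTAGE, "20"); // cache the first pass, capped at 20% of maxdoc
    return p;
  }
}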

View File

@@ -45,8 +45,6 @@ import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.*;
import org.apache.solr.search.function.FunctionQuery;
import org.apache.solr.search.function.QueryValueSource;
import org.apache.solr.util.SolrPluginUtils;
import java.io.IOException;
@@ -315,16 +313,25 @@ public class QueryComponent extends SearchComponent
boolean doGroup = params.getBool(GroupParams.GROUP, false);
if (doGroup) {
try {
Grouping grouping = new Grouping(searcher, result, cmd);
int maxDocsPercentageToCache = params.getInt(GroupParams.GROUP_CACHE_PERCENTAGE, 0);
boolean cacheSecondPassSearch = maxDocsPercentageToCache >= 1 && maxDocsPercentageToCache <= 100;
String[] fields = params.getParams(GroupParams.GROUP_FIELD);
String[] funcs = params.getParams(GroupParams.GROUP_FUNC);
String[] queries = params.getParams(GroupParams.GROUP_QUERY);
String groupSortStr = params.get(GroupParams.GROUP_SORT);
boolean main = params.getBool(GroupParams.GROUP_MAIN, false);
String format = params.get(GroupParams.GROUP_FORMAT);
Grouping.Format defaultFormat = "simple".equals(format) ? Grouping.Format.Simple : Grouping.Format.Grouped;
String formatStr = params.get(GroupParams.GROUP_FORMAT, Grouping.Format.grouped.name());
Grouping.Format defaultFormat;
try {
defaultFormat = Grouping.Format.valueOf(formatStr);
} catch (IllegalArgumentException e) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, String.format("Illegal %s parameter", GroupParams.GROUP_FORMAT));
}
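Since Format.valueOf is case-sensitive and the enum constants are now lowercase, only exact matches parse; an assumed illustration (the Grouping.java diff is suppressed in this commit view, so the constant names are inferred):

Grouping.Format a = Grouping.Format.valueOf("grouped"); // ok (the default)
Grouping.Format b = Grouping.Format.valueOf("simple");  // ok
Grouping.Format c = Grouping.Format.valueOf("Simple");  // IllegalArgumentException -> the 400 BAD_REQUEST above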
boolean includeTotalGroupCount = params.getBool(GroupParams.GROUP_TOTAL_COUNT, false);
Grouping.TotalCount defaultTotalCount = includeTotalGroupCount ? Grouping.TotalCount.grouped : Grouping.TotalCount.ungrouped;
Sort sort = cmd.getSort();
// groupSort defaults to sort
Sort groupSort = groupSortStr == null ? cmd.getSort() : QueryParsing.parseSort(groupSortStr, req);
@@ -332,95 +339,47 @@ public class QueryComponent extends SearchComponent
int groupOffsetDefault = params.getInt(GroupParams.GROUP_OFFSET, 0);
int docsPerGroupDefault = params.getInt(GroupParams.GROUP_LIMIT, 1);
// temporary: implement all group-by-field as group-by-func
if (funcs == null) {
funcs = fields;
} else if (fields != null) {
// catenate functions and fields
String[] both = new String[fields.length + funcs.length];
System.arraycopy(fields, 0, both, 0, fields.length);
System.arraycopy(funcs, 0, both, fields.length, funcs.length);
funcs = both;
}
Grouping grouping = new Grouping(searcher, result, cmd, cacheSecondPassSearch, maxDocsPercentageToCache, main);
grouping.setSort(sort)
.setGroupSort(groupSort)
.setDefaultFormat(defaultFormat)
.setLimitDefault(limitDefault)
.setDefaultTotalCount(defaultTotalCount)
.setDocsPerGroupDefault(docsPerGroupDefault)
.setGroupOffsetDefault(groupOffsetDefault);
if (fields != null) {
for (String field : fields) {
grouping.addFieldCommand(field, rb.req);
}
}
if (funcs != null) {
for (String groupByStr : funcs) {
QParser parser = QParser.getParser(groupByStr, "func", rb.req);
Query q = parser.getQuery();
Grouping.CommandFunc gc = grouping.new CommandFunc();
gc.groupSort = groupSort;
if (q instanceof FunctionQuery) {
gc.groupBy = ((FunctionQuery)q).getValueSource();
} else {
gc.groupBy = new QueryValueSource(q, 0.0f);
}
gc.key = groupByStr;
gc.numGroups = limitDefault;
gc.docsPerGroup = docsPerGroupDefault;
gc.groupOffset = groupOffsetDefault;
gc.offset = cmd.getOffset();
gc.sort = cmd.getSort();
gc.format = defaultFormat;
if (main) {
gc.main = true;
gc.format = Grouping.Format.Simple;
main = false;
}
if (gc.format == Grouping.Format.Simple) {
gc.groupOffset = 0; // doesn't make sense
}
grouping.add(gc);
grouping.addFunctionCommand(groupByStr, rb.req);
}
}
if (queries != null) {
for (String groupByStr : queries) {
QParser parser = QParser.getParser(groupByStr, null, rb.req);
Query gq = parser.getQuery();
Grouping.CommandQuery gc = grouping.new CommandQuery();
gc.query = gq;
gc.groupSort = groupSort;
gc.key = groupByStr;
gc.numGroups = limitDefault;
gc.docsPerGroup = docsPerGroupDefault;
gc.groupOffset = groupOffsetDefault;
// these two params will only be used if this is for the main result set
gc.offset = cmd.getOffset();
gc.numGroups = limitDefault;
gc.format = defaultFormat;
if (main) {
gc.main = true;
gc.format = Grouping.Format.Simple;
main = false;
}
if (gc.format == Grouping.Format.Simple) {
gc.docsPerGroup = gc.numGroups; // doesn't make sense to limit to one
gc.groupOffset = gc.offset;
}
grouping.add(gc);
grouping.addQueryCommand(groupByStr, rb.req);
}
}
if (rb.doHighlights || rb.isDebug()) {
// we need a single list of the returned docs
cmd.setFlags(SolrIndexSearcher.GET_DOCLIST);
}
// searcher.search(result,cmd);
grouping.execute();
rb.setResult( result );
if (grouping.isSignalCacheWarning()) {
rsp.add(
"cacheWarning",
String.format("Cache limit of %d percent relative to maxdoc has exceeded. Please increase cache size or disable caching.", maxDocsPercentageToCache)
);
}
rb.setResult(result);
rsp.add("grouped", result.groupedResults);
// TODO: get "hits" a different way to log
if (grouping.mainResult != null) {
ResultContext ctx = new ResultContext();
@@ -428,10 +387,10 @@ public class QueryComponent extends SearchComponent
ctx.query = null; // TODO? add the query?
rsp.add("response", ctx);
rsp.getToLog().add("hits", grouping.mainResult.matches());
} else if (!grouping.getCommands().isEmpty()) { // Can never be empty since grouping.execute() checks for this.
rsp.getToLog().add("hits", grouping.getCommands().get(0).getMatches());
}
return;
} catch (ParseException e) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
}

File diff suppressed because it is too large

View File

@@ -37,6 +37,10 @@ public abstract class FieldCacheSource extends ValueSource {
return cache;
}
public String getField() {
return field;
}
@Override
public String description() {
return field;

View File

@@ -20,6 +20,7 @@ package org.apache.solr;
import org.apache.lucene.search.FieldCache;
import org.apache.noggit.JSONUtil;
import org.apache.noggit.ObjectBuilder;
import org.apache.solr.common.params.GroupParams;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
import org.junit.Before;
@@ -88,6 +89,49 @@ public class TestGroupingSearch extends SolrTestCaseJ4 {
);
}
@Test
public void testGroupingGroupSortingScore_withTotalGroupCount() {
assertU(add(doc("id", "1","name", "author1", "title", "a book title", "group_si", "1")));
assertU(add(doc("id", "2","name", "author1", "title", "the title", "group_si", "2")));
assertU(add(doc("id", "3","name", "author2", "title", "a book title", "group_si", "1")));
assertU(add(doc("id", "4","name", "author2", "title", "title", "group_si", "2")));
assertU(add(doc("id", "5","name", "author3", "title", "the title of a title", "group_si", "1")));
assertU(commit());
assertQ(req("q","title:title", "group", "true", "group.field","name", "group.ngroups", "true")
,"//lst[@name='grouped']/lst[@name='name']"
,"//lst[@name='grouped']/lst[@name='name']/int[@name='matches'][.='5']"
,"//lst[@name='grouped']/lst[@name='name']/int[@name='ngroups'][.='3']"
,"*[count(//arr[@name='groups']/lst) = 3]"
,"//arr[@name='groups']/lst[1]/str[@name='groupValue'][.='author2']"
,"//arr[@name='groups']/lst[1]/result[@numFound='2']"
,"//arr[@name='groups']/lst[1]/result/doc/*[@name='id'][.='4']"
,"//arr[@name='groups']/lst[2]/str[@name='groupValue'][.='author1']"
,"//arr[@name='groups']/lst[2]/result[@numFound='2']"
,"//arr[@name='groups']/lst[2]/result/doc/*[@name='id'][.='2']"
,"//arr[@name='groups']/lst[3]/str[@name='groupValue'][.='author3']"
,"//arr[@name='groups']/lst[3]/result[@numFound='1']"
,"//arr[@name='groups']/lst[3]/result/doc/*[@name='id'][.='5']"
);
assertQ(req("q","title:title", "group", "true", "group.field","group_si", "group.ngroups", "true")
,"//lst[@name='grouped']/lst[@name='group_si']/int[@name='matches'][.='5']"
,"//lst[@name='grouped']/lst[@name='group_si']/int[@name='ngroups'][.='2']"
,"*[count(//arr[@name='groups']/lst) = 2]"
,"//arr[@name='groups']/lst[1]/int[@name='groupValue'][.='2']"
,"//arr[@name='groups']/lst[1]/result[@numFound='2']"
,"//arr[@name='groups']/lst[1]/result/doc/*[@name='id'][.='4']"
,"//arr[@name='groups']/lst[2]/int[@name='groupValue'][.='1']"
,"//arr[@name='groups']/lst[2]/result[@numFound='3']"
,"//arr[@name='groups']/lst[2]/result/doc/*[@name='id'][.='5']"
);
}
@Test
public void testGroupingGroupSortingScore_basicWithGroupSortEqualToSort() {
assertU(add(doc("id", "1","name", "author1", "title", "a book title")));
@@ -353,7 +397,7 @@ public class TestGroupingSearch extends SolrTestCaseJ4 {
, "/grouped/foo_i=={'matches':10,'doclist':"
+"{'numFound':10,'start':1,'docs':[{'id':'10'},{'id':'3'},{'id':'6'}]}}"
);
};
}
@@ -476,14 +520,16 @@ public class TestGroupingSearch extends SolrTestCaseJ4 {
List<Grp> sortedGroups = new ArrayList(groups.values());
Collections.sort(sortedGroups, groupComparator==sortComparator ? createFirstDocComparator(sortComparator) : createMaxDocComparator(sortComparator));
Object modelResponse = buildGroupedResult(h.getCore().getSchema(), sortedGroups, start, rows, group_offset, group_limit);
boolean includeNGroups = random.nextBoolean();
Object modelResponse = buildGroupedResult(h.getCore().getSchema(), sortedGroups, start, rows, group_offset, group_limit, includeNGroups);
int randomPercentage = random.nextInt(101);
// TODO: create a random filter too
SolrQueryRequest req = req("group","true","wt","json","indent","true", "echoParams","all", "q","{!func}score_f", "group.field",groupField
,sortStr==null ? "nosort":"sort", sortStr ==null ? "": sortStr
,(groupSortStr==null || groupSortStr==sortStr) ? "nosort":"group.sort", groupSortStr==null ? "": groupSortStr
,"rows",""+rows, "start",""+start, "group.offset",""+group_offset, "group.limit",""+group_limit
,(groupSortStr==null || groupSortStr==sortStr) ? "noGroupsort":"group.sort", groupSortStr==null ? "": groupSortStr
,"rows",""+rows, "start",""+start, "group.offset",""+group_offset, "group.limit",""+group_limit,
GroupParams.GROUP_CACHE_PERCENTAGE, Integer.toString(randomPercentage), GroupParams.GROUP_TOTAL_COUNT, includeNGroups ? "true" : "false"
);
String strResponse = h.query(req);
@@ -508,7 +554,7 @@ public class TestGroupingSearch extends SolrTestCaseJ4 {
}
public static Object buildGroupedResult(IndexSchema schema, List<Grp> sortedGroups, int start, int rows, int group_offset, int group_limit) {
public static Object buildGroupedResult(IndexSchema schema, List<Grp> sortedGroups, int start, int rows, int group_offset, int group_limit, boolean includeNGroups) {
Map<String,Object> result = new LinkedHashMap<String,Object>();
long matches = 0;
@@ -516,6 +562,9 @@
matches += grp.docs.size();
}
result.put("matches", matches);
if (includeNGroups) {
result.put("ngroups", sortedGroups.size());
}
List groupList = new ArrayList();
result.put("groups", groupList);