From d805da14c26e80b1361ccf178d2e9b24c566fbac Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Fri, 17 Jun 2011 21:25:59 +0000 Subject: [PATCH] SOLR-2564: Integrating grouping module into Solr 4.0 git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1137037 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/search/CachingCollector.java | 62 +- .../lucene/search/TestCachingCollector.java | 10 +- solr/CHANGES.txt | 2 + solr/common-build.xml | 6 + .../solr/common/params/GroupParams.java | 15 + .../handler/component/QueryComponent.java | 115 +- .../java/org/apache/solr/search/Grouping.java | 1610 ++++++++++------- .../search/function/FieldCacheSource.java | 4 + .../org/apache/solr/TestGroupingSearch.java | 61 +- 9 files changed, 1095 insertions(+), 790 deletions(-) diff --git a/lucene/src/java/org/apache/lucene/search/CachingCollector.java b/lucene/src/java/org/apache/lucene/search/CachingCollector.java index c17602794fb..ca57b7f52a7 100644 --- a/lucene/src/java/org/apache/lucene/search/CachingCollector.java +++ b/lucene/src/java/org/apache/lucene/search/CachingCollector.java @@ -17,22 +17,22 @@ package org.apache.lucene.search; * limitations under the License. */ +import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.util.RamUsageEstimator; + import java.io.IOException; import java.util.ArrayList; import java.util.List; -import org.apache.lucene.index.IndexReader.AtomicReaderContext; -import org.apache.lucene.util.RamUsageEstimator; - /** * Caches all docs, and optionally also scores, coming from * a search, and is then able to replay them to another * collector. You specify the max RAM this class may use. - * Once the collection is done, call {@link #isCached}. If - * this returns true, you can use {@link #replay} against a - * new collector. If it returns false, this means too much - * RAM was required and you must instead re-run the original - * search. + * Once the collection is done, call {@link #isCached}. 
If + * this returns true, you can use {@link #replay(Collector)} + * against a new collector. If it returns false, this means + * too much RAM was required and you must instead re-run the + * original search. * *

NOTE: this class consumes 4 (or 8 bytes, if * scoring is cached) per collected document. If the result @@ -105,7 +105,16 @@ public abstract class CachingCollector extends Collector { cachedScorer = new CachedScorer(); cachedScores = new ArrayList(); - curScores = new float[128]; + curScores = new float[INITIAL_ARRAY_SIZE]; + cachedScores.add(curScores); + } + + ScoreCachingCollector(Collector other, int maxDocsToCache) { + super(other, maxDocsToCache); + + cachedScorer = new CachedScorer(); + cachedScores = new ArrayList(); + curScores = new float[INITIAL_ARRAY_SIZE]; cachedScores.add(curScores); } @@ -210,7 +219,11 @@ public abstract class CachingCollector extends Collector { NoScoreCachingCollector(Collector other, double maxRAMMB) { super(other, maxRAMMB, false); } - + + NoScoreCachingCollector(Collector other, int maxDocsToCache) { + super(other, maxDocsToCache); + } + @Override public void collect(int doc) throws IOException { @@ -353,7 +366,25 @@ public abstract class CachingCollector extends Collector { */ public static CachingCollector create(Collector other, boolean cacheScores, double maxRAMMB) { return cacheScores ? new ScoreCachingCollector(other, maxRAMMB) : new NoScoreCachingCollector(other, maxRAMMB); - } + } + + /** + * Create a new {@link CachingCollector} that wraps the given collector and + * caches documents and scores up to the specified max docs threshold. + * + * @param other + * the Collector to wrap and delegate calls to. + * @param cacheScores + * whether to cache scores in addition to document IDs. Note that + * this increases the RAM consumed per doc + * @param maxDocsToCache + * the maximum number of documents for caching the documents and + * possible the scores. If the collector exceeds the threshold, + * no documents and scores are cached. + */ + public static CachingCollector create(Collector other, boolean cacheScores, int maxDocsToCache) { + return cacheScores ? 
new ScoreCachingCollector(other, maxDocsToCache) : new NoScoreCachingCollector(other, maxDocsToCache); + } // Prevent extension from non-internal classes private CachingCollector(Collector other, double maxRAMMB, boolean cacheScores) { @@ -369,6 +400,15 @@ public abstract class CachingCollector extends Collector { } maxDocsToCache = (int) ((maxRAMMB * 1024 * 1024) / bytesPerDoc); } + + private CachingCollector(Collector other, int maxDocsToCache) { + this.other = other; + + cachedDocs = new ArrayList(); + curDocs = new int[INITIAL_ARRAY_SIZE]; + cachedDocs.add(curDocs); + this.maxDocsToCache = maxDocsToCache; + } @Override public boolean acceptsDocsOutOfOrder() { diff --git a/lucene/src/test/org/apache/lucene/search/TestCachingCollector.java b/lucene/src/test/org/apache/lucene/search/TestCachingCollector.java index e812f25d207..050ce0772e4 100755 --- a/lucene/src/test/org/apache/lucene/search/TestCachingCollector.java +++ b/lucene/src/test/org/apache/lucene/search/TestCachingCollector.java @@ -17,15 +17,11 @@ package org.apache.lucene.search; * limitations under the License. 
*/ -import java.io.IOException; - import org.apache.lucene.index.IndexReader.AtomicReaderContext; -import org.apache.lucene.search.CachingCollector; -import org.apache.lucene.search.Collector; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.Weight; import org.apache.lucene.util.LuceneTestCase; +import java.io.IOException; + public class TestCachingCollector extends LuceneTestCase { private static final double ONE_BYTE = 1.0 / (1024 * 1024); // 1 byte out of MB @@ -76,7 +72,7 @@ public class TestCachingCollector extends LuceneTestCase { public void testBasic() throws Exception { for (boolean cacheScores : new boolean[] { false, true }) { - CachingCollector cc = CachingCollector.create(new NoOpCollector(false), cacheScores, 1); + CachingCollector cc = CachingCollector.create(new NoOpCollector(false), cacheScores, 1.0); cc.setScorer(new MockScorer()); // collect 1000 docs diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 0c6e3b08089..230baa08fd3 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -78,6 +78,8 @@ New Features Bojan Smid, Charles Hornberger, Dieter Grad, Dmitry Lihachev, Doug Steigerwald, Karsten Sperling, Michael Gundlach, Oleg Gnatovskiy, Thomas Traeger, Harish Agarwal, yonik) + SOLR-2564: Integrate grouping module into Solr. Also adds the ability to return the number of + groups that match a query. 
* SOLR-1665: Add debug component options for timings, results and query info only (gsingers, hossman, yonik) diff --git a/solr/common-build.xml b/solr/common-build.xml index 4edc6a585e0..76b8fded9f7 100644 --- a/solr/common-build.xml +++ b/solr/common-build.xml @@ -198,6 +198,7 @@ + @@ -214,6 +215,7 @@ + @@ -248,6 +250,9 @@ + + + @@ -262,6 +267,7 @@ + diff --git a/solr/src/common/org/apache/solr/common/params/GroupParams.java b/solr/src/common/org/apache/solr/common/params/GroupParams.java index 8e75d1beb0b..806e147845f 100755 --- a/solr/src/common/org/apache/solr/common/params/GroupParams.java +++ b/solr/src/common/org/apache/solr/common/params/GroupParams.java @@ -38,5 +38,20 @@ public interface GroupParams { /** treat the first group result as the main result. true/false */ public static final String GROUP_FORMAT = GROUP + ".format"; + + /** + * Whether to cache the first pass search (doc ids and score) for the second pass search. + * Also defines the maximum size of the group cache as a percentage of maxdoc. + * Values are integers from 0 to 100. A value of 0 will disable the group cache. + * The default is 0.*/ + public static final String GROUP_CACHE_PERCENTAGE = GROUP + ".cache.percent"; + + // Note: Since you can supply multiple fields to group on, but only have facets for the whole result, it only makes + // sense to support these parameters for the first group. + /** Whether the docSet (for example for faceting) should be based on plain documents (a.k.a UNGROUPED) or on the groups (a.k.a GROUPED). */ + public static final String GROUP_COLLAPSE = GROUP + ".collapse"; + + /** Whether the group count should be included in the response. 
*/ + public static final String GROUP_TOTAL_COUNT = GROUP + ".ngroups"; } diff --git a/solr/src/java/org/apache/solr/handler/component/QueryComponent.java b/solr/src/java/org/apache/solr/handler/component/QueryComponent.java index 98602d280a6..9af9b023573 100644 --- a/solr/src/java/org/apache/solr/handler/component/QueryComponent.java +++ b/solr/src/java/org/apache/solr/handler/component/QueryComponent.java @@ -45,8 +45,6 @@ import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.schema.FieldType; import org.apache.solr.schema.SchemaField; import org.apache.solr.search.*; -import org.apache.solr.search.function.FunctionQuery; -import org.apache.solr.search.function.QueryValueSource; import org.apache.solr.util.SolrPluginUtils; import java.io.IOException; @@ -315,16 +313,25 @@ public class QueryComponent extends SearchComponent boolean doGroup = params.getBool(GroupParams.GROUP, false); if (doGroup) { try { - Grouping grouping = new Grouping(searcher, result, cmd); - + int maxDocsPercentageToCache = params.getInt(GroupParams.GROUP_CACHE_PERCENTAGE, 0); + boolean cacheSecondPassSearch = maxDocsPercentageToCache >= 1 && maxDocsPercentageToCache <= 100; String[] fields = params.getParams(GroupParams.GROUP_FIELD); String[] funcs = params.getParams(GroupParams.GROUP_FUNC); String[] queries = params.getParams(GroupParams.GROUP_QUERY); String groupSortStr = params.get(GroupParams.GROUP_SORT); boolean main = params.getBool(GroupParams.GROUP_MAIN, false); - String format = params.get(GroupParams.GROUP_FORMAT); - Grouping.Format defaultFormat = "simple".equals(format) ? 
Grouping.Format.Simple : Grouping.Format.Grouped; + String formatStr = params.get(GroupParams.GROUP_FORMAT, Grouping.Format.grouped.name()); + Grouping.Format defaultFormat; + try { + defaultFormat = Grouping.Format.valueOf(formatStr); + } catch (IllegalArgumentException e) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, String.format("Illegal %s parameter", GroupParams.GROUP_FORMAT)); + } + + boolean includeTotalGroupCount = params.getBool(GroupParams.GROUP_TOTAL_COUNT, false); + Grouping.TotalCount defaultTotalCount = includeTotalGroupCount ? Grouping.TotalCount.grouped : Grouping.TotalCount.ungrouped; + Sort sort = cmd.getSort(); // groupSort defaults to sort Sort groupSort = groupSortStr == null ? cmd.getSort() : QueryParsing.parseSort(groupSortStr, req); @@ -332,95 +339,47 @@ public class QueryComponent extends SearchComponent int groupOffsetDefault = params.getInt(GroupParams.GROUP_OFFSET, 0); int docsPerGroupDefault = params.getInt(GroupParams.GROUP_LIMIT, 1); - // temporary: implement all group-by-field as group-by-func - if (funcs == null) { - funcs = fields; - } else if (fields != null) { - // catenate functions and fields - String[] both = new String[fields.length + funcs.length]; - System.arraycopy(fields, 0, both, 0, fields.length); - System.arraycopy(funcs, 0, both, fields.length, funcs.length); - funcs = both; - } + Grouping grouping = new Grouping(searcher, result, cmd, cacheSecondPassSearch, maxDocsPercentageToCache, main); + grouping.setSort(sort) + .setGroupSort(groupSort) + .setDefaultFormat(defaultFormat) + .setLimitDefault(limitDefault) + .setDefaultTotalCount(defaultTotalCount) + .setDocsPerGroupDefault(docsPerGroupDefault) + .setGroupOffsetDefault(groupOffsetDefault); + if (fields != null) { + for (String field : fields) { + grouping.addFieldCommand(field, rb.req); + } + } if (funcs != null) { for (String groupByStr : funcs) { - QParser parser = QParser.getParser(groupByStr, "func", rb.req); - Query q = parser.getQuery(); - 
Grouping.CommandFunc gc = grouping.new CommandFunc(); - gc.groupSort = groupSort; - - if (q instanceof FunctionQuery) { - gc.groupBy = ((FunctionQuery)q).getValueSource(); - } else { - gc.groupBy = new QueryValueSource(q, 0.0f); - } - gc.key = groupByStr; - gc.numGroups = limitDefault; - gc.docsPerGroup = docsPerGroupDefault; - gc.groupOffset = groupOffsetDefault; - gc.offset = cmd.getOffset(); - gc.sort = cmd.getSort(); - gc.format = defaultFormat; - - if (main) { - gc.main = true; - gc.format = Grouping.Format.Simple; - main = false; - } - - if (gc.format == Grouping.Format.Simple) { - gc.groupOffset = 0; // doesn't make sense - } - - grouping.add(gc); + grouping.addFunctionCommand(groupByStr, rb.req); } } if (queries != null) { for (String groupByStr : queries) { - QParser parser = QParser.getParser(groupByStr, null, rb.req); - Query gq = parser.getQuery(); - Grouping.CommandQuery gc = grouping.new CommandQuery(); - gc.query = gq; - gc.groupSort = groupSort; - gc.key = groupByStr; - gc.numGroups = limitDefault; - gc.docsPerGroup = docsPerGroupDefault; - gc.groupOffset = groupOffsetDefault; - - // these two params will only be used if this is for the main result set - gc.offset = cmd.getOffset(); - gc.numGroups = limitDefault; - - gc.format = defaultFormat; - - if (main) { - gc.main = true; - gc.format = Grouping.Format.Simple; - main = false; - } - if (gc.format == Grouping.Format.Simple) { - gc.docsPerGroup = gc.numGroups; // doesn't make sense to limit to one - gc.groupOffset = gc.offset; - } - - grouping.add(gc); + grouping.addQueryCommand(groupByStr, rb.req); } } - if (rb.doHighlights || rb.isDebug()) { // we need a single list of the returned docs cmd.setFlags(SolrIndexSearcher.GET_DOCLIST); } - // searcher.search(result,cmd); grouping.execute(); - rb.setResult( result ); + if (grouping.isSignalCacheWarning()) { + rsp.add( + "cacheWarning", + String.format("Cache limit of %d percent relative to maxdoc has exceeded. 
Please increase cache size or disable caching.", maxDocsPercentageToCache) + ); + } + rb.setResult(result); rsp.add("grouped", result.groupedResults); - // TODO: get "hits" a different way to log if (grouping.mainResult != null) { ResultContext ctx = new ResultContext(); @@ -428,10 +387,10 @@ public class QueryComponent extends SearchComponent ctx.query = null; // TODO? add the query? rsp.add("response", ctx); rsp.getToLog().add("hits", grouping.mainResult.matches()); + } else if (!grouping.getCommands().isEmpty()) { // Can never be empty since grouping.execute() checks for this. + rsp.getToLog().add("hits", grouping.getCommands().get(0).getMatches()); } - return; - } catch (ParseException e) { throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e); } diff --git a/solr/src/java/org/apache/solr/search/Grouping.java b/solr/src/java/org/apache/solr/search/Grouping.java index 3075ac4700c..32637b3f69b 100755 --- a/solr/src/java/org/apache/solr/search/Grouping.java +++ b/solr/src/java/org/apache/solr/search/Grouping.java @@ -17,355 +17,287 @@ package org.apache.solr.search; +import org.apache.commons.lang.ArrayUtils; +import org.apache.lucene.document.Fieldable; import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.search.*; +import org.apache.lucene.search.grouping.*; import org.apache.lucene.util.BytesRef; +import org.apache.solr.common.SolrException; import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.SimpleOrderedMap; -import org.apache.solr.schema.StrFieldSource; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.schema.*; import org.apache.solr.search.function.DocValues; -import org.apache.solr.search.function.StringIndexDocValues; +import org.apache.solr.search.function.FunctionQuery; +import org.apache.solr.search.function.QueryValueSource; import org.apache.solr.search.function.ValueSource; -import 
org.apache.solr.util.SentinelIntSet; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.*; +/** + * Basic Solr Grouping infrastructure. + * Warning NOT thread save! + * + * @lucene.experimental + */ public class Grouping { - public enum Format {Grouped, Simple} + private final static Logger logger = LoggerFactory.getLogger(Grouping.class); - public abstract class Command { - public String key; // the name to use for this group in the response - public Sort groupSort; // the sort of the documents *within* a single group. - public Sort sort; // the sort between groups - public int docsPerGroup; // how many docs in each group - from "group.limit" param, default=1 - public int groupOffset; // the offset within each group (for paging within each group) - public int numGroups; // how many groups - defaults to the "rows" parameter - public int offset; // offset into the list of groups - public Format format; - public boolean main; // use as the main result in simple format (grouped.main=true param) + private final SolrIndexSearcher searcher; + private final SolrIndexSearcher.QueryResult qr; + private final SolrIndexSearcher.QueryCommand cmd; + private final List commands = new ArrayList(); + private final boolean main; + private final boolean cacheSecondPassSearch; + private final int maxDocsPercentageToCache; + private Sort sort; + private Sort groupSort; + private int limitDefault; + private int docsPerGroupDefault; + private int groupOffsetDefault; + private Format defaultFormat; + private TotalCount defaultTotalCount; - abstract void prepare() throws IOException; - abstract Collector createCollector() throws IOException; - Collector createNextCollector() throws IOException { - return null; - } - abstract void finish() throws IOException; + private int maxDoc; + private boolean needScores; + private boolean getDocSet; + private boolean getDocList; // doclist needed for debugging or highlighting + private Query query; 
+ private DocSet filter; + private Filter luceneFilter; + private NamedList grouped = new SimpleOrderedMap(); + private Set idSet = new LinkedHashSet(); // used for tracking unique docs when we need a doclist + private int maxMatches; // max number of matches from any grouping command + private float maxScore = Float.NEGATIVE_INFINITY; // max score seen in any doclist + private boolean signalCacheWarning = false; - abstract int getMatches(); - - NamedList commonResponse() { - NamedList groupResult = new SimpleOrderedMap(); - grouped.add(key, groupResult); // grouped={ key={ - - int this_matches = getMatches(); - groupResult.add("matches", this_matches); - maxMatches = Math.max(maxMatches, this_matches); - return groupResult; - } - - DocList getDocList(TopDocsCollector collector) { - int max = collector.getTotalHits(); - int off = groupOffset; - int len = docsPerGroup; - if (format == Format.Simple) { - off = offset; - len = numGroups; - } - int docsToCollect = getMax(off, len, max); - - // TODO: implement a DocList impl that doesn't need to start at offset=0 - TopDocs topDocs = collector.topDocs(0, Math.max(docsToCollect,1)); // 0 isn't supported as a valid value - int docsCollected = Math.min(docsToCollect, topDocs.scoreDocs.length); - - int ids[] = new int[docsCollected]; - float[] scores = needScores ? 
new float[docsCollected] : null; - for (int i=0; i 0) { - skipCount--; - continue; - } - NamedList nl = new SimpleOrderedMap(); - groupList.add(nl); // grouped={ key={ groups=[ { - - nl.add("groupValue", group.groupValue.toObject()); - - SearchGroupDocs groupDocs = collector2.groupMap.get(group.groupValue); - addDocList(nl, groupDocs.collector); - } - } - - private DocList createSimpleResponse() { - int docCount = numGroups; - int docOffset = offset; - int docsToGather = getMax(docOffset, docCount, maxDoc); - - float maxScore = Float.NEGATIVE_INFINITY; - List topDocsList = new ArrayList(); - int numDocs = 0; - for (SearchGroup group : collector.orderedGroups) { - SearchGroupDocs groupDocs = collector2.groupMap.get(group.groupValue); - - TopDocsCollector collector = groupDocs.collector; - int hits = collector.getTotalHits(); - - int num = Math.min(docsPerGroup, hits - groupOffset); // how many docs are in this group - if (num <= 0) continue; - - TopDocs topDocs = collector.topDocs(groupOffset, Math.min(docsPerGroup,docsToGather-numDocs)); - topDocsList.add(topDocs); - numDocs += topDocs.scoreDocs.length; - - float score = topDocs.getMaxScore(); - maxScore = Math.max(maxScore, score); - - if (numDocs >= docsToGather) break; - } - assert numDocs <= docsToGather; // make sure we didn't gather too many - - int[] ids = new int[numDocs]; - float[] scores = needScores ? 
new float[numDocs] : null; - int pos = 0; - - for (TopDocs topDocs : topDocsList) { - for (ScoreDoc sd : topDocs.scoreDocs) { - ids[pos] = sd.doc; - if (scores != null) scores[pos] = sd.score; - pos++; - } - } - - DocSlice docs = new DocSlice(docOffset, Math.max(0, ids.length - docOffset), ids, scores, getMatches(), maxScore); - - if (getDocList) { - DocIterator iter = docs.iterator(); - while (iter.hasNext()) - idSet.add(iter.nextDoc()); - } - - return docs; - } - - @Override - int getMatches() { - return collector.getMatches(); - } - } - - - - static Sort byScoreDesc = new Sort(); - - static boolean compareSorts(Sort sort1, Sort sort2) { - return sort1 == sort2 || normalizeSort(sort1).equals(normalizeSort(sort2)); - } - - /** returns a sort by score desc if null */ - static Sort normalizeSort(Sort sort) { - return sort==null ? byScoreDesc : sort; - } - - static int getMax(int offset, int len, int max) { - int v = len<0 ? max : offset + len; - if (v < 0 || v > max) v = max; - return v; - } - - TopDocsCollector newCollector(Sort sort, int numHits, boolean fillFields, boolean needScores) throws IOException { - if (sort==null || sort==byScoreDesc) { - return TopScoreDocCollector.create(numHits, true); - } else { - return TopFieldCollector.create(searcher.weightSort(sort), numHits, false, needScores, needScores, true); - } - } - - - final SolrIndexSearcher searcher; - final SolrIndexSearcher.QueryResult qr; - final SolrIndexSearcher.QueryCommand cmd; - final List commands = new ArrayList(); public DocList mainResult; // output if one of the grouping commands should be used as the main result. - public Grouping(SolrIndexSearcher searcher, SolrIndexSearcher.QueryResult qr, SolrIndexSearcher.QueryCommand cmd) { + /** + * @param searcher + * @param qr + * @param cmd + * @param cacheSecondPassSearch Whether to cache the documents and scores from the first pass search for the second + * pass search. 
+ * @param maxDocsPercentageToCache The maximum number of documents in a percentage relative from maxdoc + * that is allowed in the cache. When this threshold is met, + * the cache is not used in the second pass search. + */ + public Grouping(SolrIndexSearcher searcher, + SolrIndexSearcher.QueryResult qr, + SolrIndexSearcher.QueryCommand cmd, + boolean cacheSecondPassSearch, + int maxDocsPercentageToCache, + boolean main) { this.searcher = searcher; this.qr = qr; this.cmd = cmd; + this.cacheSecondPassSearch = cacheSecondPassSearch; + this.maxDocsPercentageToCache = maxDocsPercentageToCache; + this.main = main; } public void add(Grouping.Command groupingCommand) { commands.add(groupingCommand); } - int maxDoc; - boolean needScores; - boolean getDocSet; - boolean getDocList; // doclist needed for debugging or highlighting - Query query; - DocSet filter; - Filter luceneFilter; - NamedList grouped = new SimpleOrderedMap(); - Set idSet = new LinkedHashSet(); // used for tracking unique docs when we need a doclist - int maxMatches; // max number of matches from any grouping command - float maxScore = Float.NEGATIVE_INFINITY; // max score seen in any doclist - + /** + * Adds a field command based on the specified field. + * If the field is not compatible with {@link CommandField} it invokes the + * {@link #addFunctionCommand(String, org.apache.solr.request.SolrQueryRequest)} method. + * + * @param field The fieldname to group by. + */ + public void addFieldCommand(String field, SolrQueryRequest request) throws ParseException { + SchemaField schemaField = searcher.getSchema().getField(field); // Throws an exception when field doesn't exist. Bad request. 
+ FieldType fieldType = schemaField.getType(); + ValueSource valueSource = fieldType.getValueSource(schemaField, null); + if (!(valueSource instanceof StrFieldSource)) { + addFunctionCommand(field, request); + return; + } + + Grouping.CommandField gc = new CommandField(); + gc.groupSort = groupSort; + gc.groupBy = field; + gc.key = field; + gc.numGroups = limitDefault; + gc.docsPerGroup = docsPerGroupDefault; + gc.groupOffset = groupOffsetDefault; + gc.offset = cmd.getOffset(); + gc.sort = sort; + gc.format = defaultFormat; + gc.totalCount = defaultTotalCount; + + if (main) { + gc.main = true; + gc.format = Grouping.Format.simple; + } + + if (gc.format == Grouping.Format.simple) { + gc.groupOffset = 0; // doesn't make sense + } + commands.add(gc); + } + + public void addFunctionCommand(String groupByStr, SolrQueryRequest request) throws ParseException { + QParser parser = QParser.getParser(groupByStr, "func", request); + Query q = parser.getQuery(); + final Grouping.Command gc; + if (q instanceof FunctionQuery) { + ValueSource valueSource = ((FunctionQuery)q).getValueSource(); + if (valueSource instanceof StrFieldSource) { + String field = ((StrFieldSource) valueSource).getField(); + CommandField commandField = new CommandField(); + commandField.groupBy = field; + gc = commandField; + } else { + CommandFunc commandFunc = new CommandFunc(); + commandFunc.groupBy = valueSource; + gc = commandFunc; + } + } else { + CommandFunc commandFunc = new CommandFunc(); + commandFunc.groupBy = new QueryValueSource(q, 0.0f); + gc = commandFunc; + } + gc.groupSort = groupSort; + gc.key = groupByStr; + gc.numGroups = limitDefault; + gc.docsPerGroup = docsPerGroupDefault; + gc.groupOffset = groupOffsetDefault; + gc.offset = cmd.getOffset(); + gc.sort = sort; + gc.format = defaultFormat; + gc.totalCount = defaultTotalCount; + + if (main) { + gc.main = true; + gc.format = Grouping.Format.simple; + } + + if (gc.format == Grouping.Format.simple) { + gc.groupOffset = 0; // doesn't make 
sense + } + + commands.add(gc); + } + + public void addQueryCommand(String groupByStr, SolrQueryRequest request) throws ParseException { + QParser parser = QParser.getParser(groupByStr, null, request); + Query gq = parser.getQuery(); + Grouping.CommandQuery gc = new CommandQuery(); + gc.query = gq; + gc.groupSort = groupSort; + gc.key = groupByStr; + gc.numGroups = limitDefault; + gc.docsPerGroup = docsPerGroupDefault; + gc.groupOffset = groupOffsetDefault; + + // these two params will only be used if this is for the main result set + gc.offset = cmd.getOffset(); + gc.numGroups = limitDefault; + gc.format = defaultFormat; + + if (main) { + gc.main = true; + gc.format = Grouping.Format.simple; + } + if (gc.format == Grouping.Format.simple) { + gc.docsPerGroup = gc.numGroups; // doesn't make sense to limit to one + gc.groupOffset = gc.offset; + } + + commands.add(gc); + } + + public Grouping setSort(Sort sort) { + this.sort = sort; + return this; + } + + public Grouping setGroupSort(Sort groupSort) { + this.groupSort = groupSort; + return this; + } + + public Grouping setLimitDefault(int limitDefault) { + this.limitDefault = limitDefault; + return this; + } + + public Grouping setDocsPerGroupDefault(int docsPerGroupDefault) { + this.docsPerGroupDefault = docsPerGroupDefault; + return this; + } + + public Grouping setGroupOffsetDefault(int groupOffsetDefault) { + this.groupOffsetDefault = groupOffsetDefault; + return this; + } + + public Grouping setDefaultFormat(Format defaultFormat) { + this.defaultFormat = defaultFormat; + return this; + } + + public Grouping setDefaultTotalCount(TotalCount defaultTotalCount) { + this.defaultTotalCount = defaultTotalCount; + return this; + } + + public List getCommands() { + return commands; + } + public void execute() throws IOException { + if (commands.isEmpty()) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Specify at least on field, function or query to group by."); + } + DocListAndSet out = new 
DocListAndSet(); qr.setDocListAndSet(out); - filter = cmd.getFilter()!=null ? cmd.getFilter() : searcher.getDocSet(cmd.getFilterList()); + filter = cmd.getFilter() != null ? cmd.getFilter() : searcher.getDocSet(cmd.getFilterList()); luceneFilter = filter == null ? null : filter.getTopFilter(); - maxDoc = searcher.maxDoc(); needScores = (cmd.getFlags() & SolrIndexSearcher.GET_SCORES) != 0; + boolean cacheScores = false; + // NOTE: Change this when groupSort can be specified per group + if (!needScores && !commands.isEmpty()) { + if (commands.get(0).groupSort == null) { + cacheScores = true; + } else { + for (SortField field : commands.get(0).groupSort.getSort()) { + if (field.getType() == SortField.SCORE) { + cacheScores = true; + break; + } + } + } + } else if (needScores) { + cacheScores = needScores; + } getDocSet = (cmd.getFlags() & SolrIndexSearcher.GET_DOCSET) != 0; - getDocList = (cmd.getFlags() & SolrIndexSearcher.GET_DOCLIST) != 0; // doclist needed for debugging or highlighting + getDocList = (cmd.getFlags() & SolrIndexSearcher.GET_DOCLIST) != 0; query = QueryUtils.makeQueryable(cmd.getQuery()); for (Command cmd : commands) { cmd.prepare(); } - + List collectors = new ArrayList(commands.size()); for (Command cmd : commands) { - Collector collector = cmd.createCollector(); + Collector collector = cmd.createFirstPassCollector(); if (collector != null) collectors.add(collector); } @@ -373,11 +305,23 @@ public class Grouping { Collector allCollectors = MultiCollector.wrap(collectors.toArray(new Collector[collectors.size()])); DocSetCollector setCollector = null; if (getDocSet) { - setCollector = new DocSetDelegateCollector(maxDoc>>6, maxDoc, allCollectors); + setCollector = new DocSetDelegateCollector(maxDoc >> 6, maxDoc, allCollectors); allCollectors = setCollector; } - searcher.search(query, luceneFilter, allCollectors); + CachingCollector cachedCollector = null; + if (cacheSecondPassSearch && allCollectors != null) { + int maxDocsToCache = (int) 
Math.round(maxDoc * (maxDocsPercentageToCache / 100.0d)); + // Only makes sense to cache if we cache more than zero. + // Maybe we should have a minimum and a maximum, that defines the window we would like caching for. + if (maxDocsToCache > 0) { + allCollectors = cachedCollector = CachingCollector.create(allCollectors, cacheScores, maxDocsToCache); + } + } + + if (allCollectors != null) { + searcher.search(query, luceneFilter, allCollectors); + } if (getDocSet) { qr.setDocSet(setCollector.getDocSet()); @@ -385,13 +329,27 @@ public class Grouping { collectors.clear(); for (Command cmd : commands) { - Collector collector = cmd.createNextCollector(); + Collector collector = cmd.createSecondPassCollector(); if (collector != null) collectors.add(collector); } - if (collectors.size() > 0) { - searcher.search(query, luceneFilter, MultiCollector.wrap(collectors.toArray(new Collector[collectors.size()]))); + if (!collectors.isEmpty()) { + Collector secondPhaseCollectors = MultiCollector.wrap(collectors.toArray(new Collector[collectors.size()])); + if (collectors.size() > 0) { + if (cachedCollector != null) { + if (cachedCollector.isCached()) { + cachedCollector.replay(secondPhaseCollectors); + } else { + signalCacheWarning = true; + logger.warn(String.format("The grouping cache is active, but not used because it exceeded the max cache limit of %d percent", maxDocsPercentageToCache)); + logger.warn("Please increase cache size or disable group caching."); + searcher.search(query, luceneFilter, secondPhaseCollectors); + } + } else { + searcher.search(query, luceneFilter, secondPhaseCollectors); + } + } } for (Command cmd : commands) { @@ -411,429 +369,705 @@ public class Grouping { } } -} - - -class SearchGroup { - public MutableValue groupValue; - int matches; - int topDoc; - // float topDocScore; // currently unused - int comparatorSlot; - - /*** - @Override - public int hashCode() { - return super.hashCode(); + /** + * Returns offset + len if len equals zero or higher. 
Otherwise returns max. + * + * @param offset The offset + * @param len The number of documents to return + * @param max The number of documents to return if len < 0, if offset + len < 0 or if offset + len > max + * @return offset + len if len equals zero or higher and the sum lies within [0, max]. Otherwise returns max + */ + int getMax(int offset, int len, int max) { + int v = len < 0 ? max : offset + len; + if (v < 0 || v > max) v = max; + return v; } - @Override - public boolean equals(Object obj) { - return groupValue.equalsSameType(((SearchGroup)obj).groupValue); - } - ***/ -} - -abstract class GroupCollector extends Collector { - /** get the number of matches before grouping or limiting have been applied */ - public abstract int getMatches(); -} - -class FilterCollector extends GroupCollector { - private final DocSet filter; - private final Collector collector; - private int docBase; - private int matches; - - public FilterCollector(DocSet filter, Collector collector) throws IOException { - this.filter = filter; - this.collector = collector; + /** + * Returns whether a cache warning should be sent to the client. + * The value true is returned when the cache is emptied because the caching limits were met, otherwise + * false is returned. + * + * @return whether a cache warning should be sent to the client + */ + public boolean isSignalCacheWarning() { + return signalCacheWarning; } - @Override - public void setScorer(Scorer scorer) throws IOException { - collector.setScorer(scorer); + //====================================== Inner classes ============================================================= + + public static enum Format { + + /** + * Grouped result. Each group has its own result set. + */ + grouped, + + /** + * Flat result. All documents of all groups are put in one list. 
+ */ + simple } - @Override - public void collect(int doc) throws IOException { - matches++; - if (filter.exists(doc + docBase)) { - collector.collect(doc); + public static enum TotalCount { + /** + * Computations should be based on groups. + */ + grouped, + + /** + * Computations should be based on plain documents, so not taking grouping into account. + */ + ungrouped + } + + /** + * General group command. A group command is responsible for creating the first and second pass collectors. + * A group command is also responsible for creating the response structure. + *

+ * Note: Maybe creating the response structure should be done in something like a ResponseBuilder??? + * Warning: NOT thread safe! + */ + public abstract class Command { + + public String key; // the name to use for this group in the response + public Sort groupSort; // the sort of the documents *within* a single group. + public Sort sort; // the sort between groups + public int docsPerGroup; // how many docs in each group - from "group.limit" param, default=1 + public int groupOffset; // the offset within each group (for paging within each group) + public int numGroups; // how many groups - defaults to the "rows" parameter + int actualGroupsToFind; // How many groups should actually be found. Based on offset and numGroups. + public int offset; // offset into the list of groups + public Format format; + public boolean main; // use as the main result in simple format (grouped.main=true param) + public TotalCount totalCount = TotalCount.ungrouped; + + TopGroups result; + + + /** + * Prepare this Command for execution. + * + * @throws IOException If I/O related errors occur + */ + protected abstract void prepare() throws IOException; + + /** + * Returns one or more {@link Collector} instances that are needed to perform the first pass search. + * If multiple Collectors are returned then these are wrapped in a {@link org.apache.lucene.search.MultiCollector}. + * + * @return one or more {@link Collector} instances that are needed to perform the first pass search + * @throws IOException If I/O related errors occur + */ + protected abstract Collector createFirstPassCollector() throws IOException; + + /** + * Returns zero or more {@link Collector} instances that are needed to perform the second pass search. + * In the case when no {@link Collector} instances are created null is returned. + * If multiple Collectors are returned then these are wrapped in a {@link org.apache.lucene.search.MultiCollector}. 
+ * + * @return zero or more {@link Collector} instances that are needed to perform the second pass search + * @throws IOException If I/O related errors occur + */ + protected Collector createSecondPassCollector() throws IOException { + return null; } - } - @Override - public void setNextReader(AtomicReaderContext context) throws IOException { - docBase = context.docBase; - collector.setNextReader(context); - } + /** + * Performs any necessary post actions to prepare the response. + * + * @throws IOException If I/O related errors occur + */ + protected abstract void finish() throws IOException; - @Override - public boolean acceptsDocsOutOfOrder() { - return collector.acceptsDocsOutOfOrder(); - } + /** + * Returns the number of matches for this Command. + * + * @return the number of matches for this Command + */ + public abstract int getMatches(); - @Override - public int getMatches() { - return matches; - } - - Collector getCollector() { - return collector; - } -} - - - - -/** Finds the top set of groups, grouped by groupByVS when sort == group.sort */ -class TopGroupCollector extends GroupCollector { - final int nGroups; - final HashMap groupMap; - TreeSet orderedGroups; - final ValueSource vs; - final Map context; - final FieldComparator[] comparators; - final int[] reversed; - - DocValues docValues; - DocValues.ValueFiller filler; - MutableValue mval; - Scorer scorer; - int docBase; - int spareSlot; - - int matches; - - public TopGroupCollector(ValueSource groupByVS, Map vsContext, Sort weightedSort, int nGroups) throws IOException { - this.vs = groupByVS; - this.context = vsContext; - this.nGroups = nGroups = Math.max(1,nGroups); // we need a minimum of 1 for this collector - - SortField[] sortFields = weightedSort.getSort(); - this.comparators = new FieldComparator[sortFields.length]; - this.reversed = new int[sortFields.length]; - for (int i = 0; i < sortFields.length; i++) { - SortField sortField = sortFields[i]; - reversed[i] = sortField.getReverse() ? 
-1 : 1; - // use nGroups + 1 so we have a spare slot to use for comparing (tracked by this.spareSlot) - comparators[i] = sortField.getComparator(nGroups + 1, i); + /** + * Returns the number of groups found for this Command. + * If the command doesn't support counting the groups null is returned. + * + * @return the number of groups found for this Command + */ + protected Integer getNumberOfGroups() { + return null; } - this.spareSlot = nGroups; - this.groupMap = new HashMap(nGroups); - } + protected NamedList commonResponse() { + NamedList groupResult = new SimpleOrderedMap(); + grouped.add(key, groupResult); // grouped={ key={ - @Override - public void setScorer(Scorer scorer) throws IOException { - this.scorer = scorer; - for (FieldComparator fc : comparators) - fc.setScorer(scorer); - } + int matches = getMatches(); + groupResult.add("matches", matches); + if (totalCount == TotalCount.grouped) { + Integer totalNrOfGroups = getNumberOfGroups(); + groupResult.add("ngroups", totalNrOfGroups == null ? 0 : totalNrOfGroups); + } + maxMatches = Math.max(maxMatches, matches); + return groupResult; + } - @Override - public void collect(int doc) throws IOException { - matches++; + protected DocList getDocList(GroupDocs groups) { + int max = groups.totalHits; + int off = groupOffset; + int len = docsPerGroup; + if (format == Format.simple) { + off = offset; + len = numGroups; + } + int docsToCollect = getMax(off, len, max); - // if orderedGroups != null, then we already have collected N groups and - // can short circuit by comparing this document to the smallest group - // without having to even find what group this document belongs to. - // Even if this document belongs to a group in the top N, we know that - // we don't have to update that group. - // - // Downside: if the number of unique groups is very low, this is - // wasted effort as we will most likely be updating an existing group. 
- if (orderedGroups != null) { - for (int i = 0;; i++) { - final int c = reversed[i] * comparators[i].compareBottom(doc); - if (c < 0) { - // Definitely not competitive. So don't even bother to continue - return; - } else if (c > 0) { - // Definitely competitive. - break; - } else if (i == comparators.length - 1) { - // Here c=0. If we're at the last comparator, this doc is not - // competitive, since docs are visited in doc Id order, which means - // this doc cannot compete with any other document in the queue. - return; + // TODO: implement a DocList impl that doesn't need to start at offset=0 + int docsCollected = Math.min(docsToCollect, groups.scoreDocs.length); + + int ids[] = new int[docsCollected]; + float[] scores = needScores ? new float[docsCollected] : null; + for (int i = 0; i < ids.length; i++) { + ids[i] = groups.scoreDocs[i].doc; + if (scores != null) + scores[i] = groups.scoreDocs[i].score; + } + + float score = groups.maxScore; + maxScore = Math.max(maxScore, score); + DocSlice docs = new DocSlice(off, Math.max(0, ids.length - off), ids, scores, groups.totalHits, score); + + if (getDocList) { + DocIterator iter = docs.iterator(); + while (iter.hasNext()) + idSet.add(iter.nextDoc()); + } + return docs; + } + + protected void addDocList(NamedList rsp, GroupDocs groups) { + rsp.add("doclist", getDocList(groups)); + } + + // Flatten the groups and get up offset + rows documents + protected DocList createSimpleResponse() { + GroupDocs[] groups = result != null ? 
result.groups : new GroupDocs[0]; + + List ids = new ArrayList(); + List scores = new ArrayList(); + int docsToGather = getMax(offset, numGroups, maxDoc); + int docsGathered = 0; + float maxScore = Float.NEGATIVE_INFINITY; + + outer: + for (GroupDocs group : groups) { + if (group.maxScore > maxScore) { + maxScore = group.maxScore; } - } - } - filler.fillValue(doc); - SearchGroup group = groupMap.get(mval); - if (group == null) { - int num = groupMap.size(); - if (groupMap.size() < nGroups) { - SearchGroup sg = new SearchGroup(); - sg.groupValue = mval.duplicate(); - sg.comparatorSlot = num++; - sg.matches = 1; - sg.topDoc = docBase + doc; - // sg.topDocScore = scorer.score(); - for (FieldComparator fc : comparators) - fc.copy(sg.comparatorSlot, doc); - groupMap.put(sg.groupValue, sg); - if (groupMap.size() == nGroups) { - buildSet(); - } - return; - } - - // we already tested that the document is competitive, so replace - // the smallest group with this new group. - - // remove current smallest group - SearchGroup smallest = orderedGroups.pollLast(); - assert orderedGroups.size() == nGroups -1; - - groupMap.remove(smallest.groupValue); - - // reuse the removed SearchGroup - smallest.groupValue.copy(mval); - smallest.matches = 1; - smallest.topDoc = docBase + doc; - // smallest.topDocScore = scorer.score(); - for (FieldComparator fc : comparators) - fc.copy(smallest.comparatorSlot, doc); - - groupMap.put(smallest.groupValue, smallest); - orderedGroups.add(smallest); - assert orderedGroups.size() == nGroups; - - for (FieldComparator fc : comparators) - fc.setBottom(orderedGroups.last().comparatorSlot); - - return; - } - - // - // update existing group - // - - group.matches++; // TODO: these aren't valid if the group is every discarded then re-added. keep track if there have been discards? 
- - for (int i = 0;; i++) { - FieldComparator fc = comparators[i]; - fc.copy(spareSlot, doc); - - final int c = reversed[i] * fc.compare(group.comparatorSlot, spareSlot); - if (c < 0) { - // Definitely not competitive. - return; - } else if (c > 0) { - // Definitely competitive. - // Set remaining comparators - for (int j=i+1; j comparator = new Comparator() { - public int compare(SearchGroup o1, SearchGroup o2) { - for (int i = 0;; i++) { - FieldComparator fc = comparators[i]; - int c = reversed[i] * fc.compare(o1.comparatorSlot, o2.comparatorSlot); - if (c != 0) { - return c; - } else if (i == comparators.length - 1) { - return o1.topDoc - o2.topDoc; + for (ScoreDoc scoreDoc : group.scoreDocs) { + if (docsGathered >= docsToGather) { + break outer; } + + ids.add(scoreDoc.doc); + scores.add(scoreDoc.score); + docsGathered++; } } - }; - orderedGroups = new TreeSet(comparator); - orderedGroups.addAll(groupMap.values()); - if (orderedGroups.size() == 0) return; - for (FieldComparator fc : comparators) - fc.setBottom(orderedGroups.last().comparatorSlot); - } - - @Override - public void setNextReader(AtomicReaderContext readerContext) throws IOException { - this.docBase = readerContext.docBase; - docValues = vs.getValues(context, readerContext); - filler = docValues.getValueFiller(); - mval = filler.getValue(); - for (int i=0; i groupMap; - final ValueSource vs; - final Map context; - - DocValues docValues; - DocValues.ValueFiller filler; - MutableValue mval; - Scorer scorer; - int docBase; - - // TODO: may want to decouple from the phase1 collector - public Phase2GroupCollector(TopGroupCollector topGroups, ValueSource groupByVS, Map vsContext, Sort weightedSort, int docsPerGroup, boolean getScores, int offset) throws IOException { - boolean getSortFields = false; - - if (topGroups.orderedGroups == null) - topGroups.buildSet(); - - groupMap = new HashMap(topGroups.groupMap.size()); - for (SearchGroup group : topGroups.orderedGroups) { - if (offset > 0) { - offset--; - 
continue; + int len = Math.min(numGroups, docsGathered); + if (offset > len) { + len = 0; } - SearchGroupDocs groupDocs = new SearchGroupDocs(); - groupDocs.groupValue = group.groupValue; - if (weightedSort==null) - groupDocs.collector = TopScoreDocCollector.create(docsPerGroup, true); - else - groupDocs.collector = TopFieldCollector.create(weightedSort, docsPerGroup, getSortFields, getScores, getScores, true); - groupMap.put(groupDocs.groupValue, groupDocs); + + int[] docs = ArrayUtils.toPrimitive(ids.toArray(new Integer[ids.size()])); + float[] docScores = ArrayUtils.toPrimitive(scores.toArray(new Float[scores.size()])); + DocSlice docSlice = new DocSlice(offset, len, docs, docScores, getMatches(), maxScore); + + if (getDocList) { + for (int i = offset; i < docs.length; i++) { + idSet.add(docs[i]); + } + } + + return docSlice; } - this.vs = groupByVS; - this.context = vsContext; } - @Override - public void setScorer(Scorer scorer) throws IOException { - this.scorer = scorer; - for (SearchGroupDocs group : groupMap.values()) - group.collector.setScorer(scorer); - } + /** + * A group command for grouping on a field. 
+ */ + public class CommandField extends Command { - @Override - public void collect(int doc) throws IOException { - filler.fillValue(doc); - SearchGroupDocs group = groupMap.get(mval); - if (group == null) return; - group.collector.collect(doc); - } + public String groupBy; + TermFirstPassGroupingCollector firstPass; + TermSecondPassGroupingCollector secondPass; - @Override - public void setNextReader(AtomicReaderContext readerContext) throws IOException { - this.docBase = readerContext.docBase; - docValues = vs.getValues(context, readerContext); - filler = docValues.getValueFiller(); - mval = filler.getValue(); - for (SearchGroupDocs group : groupMap.values()) - group.collector.setNextReader(readerContext); - } + TermAllGroupsCollector allGroupsCollector; - @Override - public boolean acceptsDocsOutOfOrder() { - return false; - } -} + // If offset falls outside the number of documents a group can provide use this collector instead of secondPass + TotalHitCountCollector fallBackCollector; + Collection> topGroups; -// TODO: merge with SearchGroup or not? -// ad: don't need to build a new hashmap -// disad: blows up the size of SearchGroup if we need many of them, and couples implementations -class SearchGroupDocs { - public MutableValue groupValue; - TopDocsCollector collector; -} + /** + * {@inheritDoc} + */ + protected void prepare() throws IOException { + actualGroupsToFind = getMax(offset, numGroups, maxDoc); + } + + /** + * {@inheritDoc} + */ + protected Collector createFirstPassCollector() throws IOException { + // Ok we don't want groups, but do want a total count + if (actualGroupsToFind <= 0) { + fallBackCollector = new TotalHitCountCollector(); + return fallBackCollector; + } + + sort = sort == null ? 
Sort.RELEVANCE : sort; + firstPass = new TermFirstPassGroupingCollector(groupBy, sort, actualGroupsToFind); + return firstPass; + } + + /** + * {@inheritDoc} + */ + protected Collector createSecondPassCollector() throws IOException { + if (actualGroupsToFind <= 0) { + allGroupsCollector = new TermAllGroupsCollector(groupBy); + return totalCount == TotalCount.grouped ? allGroupsCollector : null; + } + + topGroups = format == Format.grouped ? firstPass.getTopGroups(offset, false) : firstPass.getTopGroups(0, false); + if (topGroups == null) { + if (totalCount == TotalCount.grouped) { + allGroupsCollector = new TermAllGroupsCollector(groupBy); + fallBackCollector = new TotalHitCountCollector(); + return MultiCollector.wrap(allGroupsCollector, fallBackCollector); + } else { + fallBackCollector = new TotalHitCountCollector(); + return fallBackCollector; + } + } + + int groupedDocsToCollect = getMax(groupOffset, docsPerGroup, maxDoc); + groupedDocsToCollect = Math.max(groupedDocsToCollect, 1); + secondPass = new TermSecondPassGroupingCollector( + groupBy, topGroups, sort, groupSort, groupedDocsToCollect, needScores, needScores, false + ); + + if (totalCount == TotalCount.grouped) { + allGroupsCollector = new TermAllGroupsCollector(groupBy); + return MultiCollector.wrap(secondPass, allGroupsCollector); + } else { + return secondPass; + } + } + + /** + * {@inheritDoc} + */ + protected void finish() throws IOException { + result = secondPass != null ? 
secondPass.getTopGroups(0) : null; + if (main) { + mainResult = createSimpleResponse(); + return; + } + + NamedList groupResult = commonResponse(); + + if (format == Format.simple) { + groupResult.add("doclist", createSimpleResponse()); + return; + } + + List groupList = new ArrayList(); + groupResult.add("groups", groupList); // grouped={ key={ groups=[ + + if (result == null) { + return; + } + + // handle case of rows=0 + if (numGroups == 0) return; + + for (GroupDocs group : result.groups) { + NamedList nl = new SimpleOrderedMap(); + groupList.add(nl); // grouped={ key={ groups=[ { + // To keep the response format compatible with trunk. + // In trunk MutableValue can convert an indexed value to its native type. E.g. string to int + // The only option I currently see is to use the FieldType for this + if (group.groupValue != null) { + SchemaField schemaField = searcher.getSchema().getField(groupBy); + FieldType fieldType = schemaField.getType(); + String readableValue = fieldType.indexedToReadable(group.groupValue.utf8ToString()); + Fieldable field = schemaField.createField(readableValue, 0.0f); + nl.add("groupValue", fieldType.toObject(field)); + } else { + nl.add("groupValue", null); + } -class Phase2StringGroupCollector extends Phase2GroupCollector { - FieldCache.DocTermsIndex index; - final SentinelIntSet ordSet; - final SearchGroupDocs[] groups; - final BytesRef spare = new BytesRef(); + addDocList(nl, group); + } + } - public Phase2StringGroupCollector(TopGroupCollector topGroups, ValueSource groupByVS, Map vsContext, Sort weightedSort, int docsPerGroup, boolean getScores, int offset) throws IOException { - super(topGroups, groupByVS, vsContext,weightedSort,docsPerGroup,getScores,offset); - ordSet = new SentinelIntSet(groupMap.size(), -1); - groups = new SearchGroupDocs[ordSet.keys.length]; - } + /** + * {@inheritDoc} + */ + public int getMatches() { + if (result == null && fallBackCollector == null) { + return 0; + } - @Override - public void 
setScorer(Scorer scorer) throws IOException { - this.scorer = scorer; - for (SearchGroupDocs group : groupMap.values()) - group.collector.setScorer(scorer); - } + return result != null ? result.totalHitCount : fallBackCollector.getTotalHits(); + } - @Override - public void collect(int doc) throws IOException { - int slot = ordSet.find(index.getOrd(doc)); - if (slot >= 0) { - groups[slot].collector.collect(doc); + /** + * {@inheritDoc} + */ + protected Integer getNumberOfGroups() { + return allGroupsCollector == null ? null : allGroupsCollector.getGroupCount(); } } - @Override - public void setNextReader(AtomicReaderContext context) throws IOException { - super.setNextReader(context); - index = ((StringIndexDocValues)docValues).getDocTermsIndex(); + /** + * A group command for grouping on a query. + */ + //NOTE: doesn't need to be generic. Maybe Command interface --> First / Second pass abstract impl. + public class CommandQuery extends Command { - ordSet.clear(); - for (SearchGroupDocs group : groupMap.values()) { - MutableValueStr gv = (MutableValueStr)group.groupValue; - int ord = 0; - if (gv.exists) { - ord = index.binarySearchLookup(((MutableValueStr)group.groupValue).value, spare); + public Query query; + TopDocsCollector topCollector; + FilterCollector collector; + + /** + * {@inheritDoc} + */ + protected void prepare() throws IOException { + actualGroupsToFind = getMax(offset, numGroups, maxDoc); + } + + /** + * {@inheritDoc} + */ + protected Collector createFirstPassCollector() throws IOException { + DocSet groupFilt = searcher.getDocSet(query); + topCollector = newCollector(groupSort, needScores); + collector = new FilterCollector(groupFilt, topCollector); + return collector; + } + + TopDocsCollector newCollector(Sort sort, boolean needScores) throws IOException { + int groupDocsToCollect = getMax(groupOffset, docsPerGroup, maxDoc); + if (sort == null || sort == Sort.RELEVANCE) { + return TopScoreDocCollector.create(groupDocsToCollect, true); + } else { + 
return TopFieldCollector.create(searcher.weightSort(sort), groupDocsToCollect, false, needScores, needScores, true); } - if (ord >= 0) { - int slot = ordSet.put(ord); - groups[slot] = group; + } + + /** + * {@inheritDoc} + */ + protected void finish() throws IOException { + TopDocsCollector topDocsCollector = (TopDocsCollector) collector.collector; + TopDocs topDocs = topDocsCollector.topDocs(); + GroupDocs groupDocs = new GroupDocs(topDocs.getMaxScore(), topDocs.totalHits, topDocs.scoreDocs, query.toString(), null); + if (main) { + mainResult = getDocList(groupDocs); + } else { + NamedList rsp = commonResponse(); + addDocList(rsp, groupDocs); } } + + /** + * {@inheritDoc} + */ + public int getMatches() { + return collector.matches; + } } - @Override - public boolean acceptsDocsOutOfOrder() { - return false; + /** + * A command for grouping on a function. + */ + public class CommandFunc extends Command { + + public ValueSource groupBy; + Map context; + + FunctionFirstPassGroupingCollector firstPass; + FunctionSecondPassGroupingCollector secondPass; + // If offset falls outside the number of documents a group can provide use this collector instead of secondPass + TotalHitCountCollector fallBackCollector; + FunctionAllGroupsCollector allGroupsCollector; + Collection> topGroups; + + /** + * {@inheritDoc} + */ + protected void prepare() throws IOException { + context = ValueSource.newContext(searcher); // assign the field (not a shadowing local): the collectors created later are handed this.context + groupBy.createWeight(context, searcher); + actualGroupsToFind = getMax(offset, numGroups, maxDoc); + } + + /** + * {@inheritDoc} + */ + protected Collector createFirstPassCollector() throws IOException { + // Ok we don't want groups, but do want a total count + if (actualGroupsToFind <= 0) { + fallBackCollector = new TotalHitCountCollector(); + return fallBackCollector; + } + + sort = sort == null ? 
Sort.RELEVANCE : sort; + firstPass = new FunctionFirstPassGroupingCollector(groupBy, context, searcher.weightSort(sort), actualGroupsToFind); + return firstPass; + } + + /** + * {@inheritDoc} + */ + protected Collector createSecondPassCollector() throws IOException { + if (actualGroupsToFind <= 0) { + allGroupsCollector = new FunctionAllGroupsCollector(groupBy, context); + return totalCount == TotalCount.grouped ? allGroupsCollector : null; + } + + topGroups = format == Format.grouped ? firstPass.getTopGroups(offset, false) : firstPass.getTopGroups(0, false); + if (topGroups == null) { + if (totalCount == TotalCount.grouped) { + allGroupsCollector = new FunctionAllGroupsCollector(groupBy, context); + fallBackCollector = new TotalHitCountCollector(); + return MultiCollector.wrap(allGroupsCollector, fallBackCollector); + } else { + fallBackCollector = new TotalHitCountCollector(); + return fallBackCollector; + } + } + + int groupdDocsToCollect = getMax(groupOffset, docsPerGroup, maxDoc); + groupdDocsToCollect = Math.max(groupdDocsToCollect, 1); + secondPass = new FunctionSecondPassGroupingCollector( + topGroups, sort, groupSort, groupdDocsToCollect, needScores, needScores, false, groupBy, context + ); + + if (totalCount == TotalCount.grouped) { + allGroupsCollector = new FunctionAllGroupsCollector(groupBy, context); + return MultiCollector.wrap(secondPass, allGroupsCollector); + } else { + return secondPass; + } + } + + /** + * {@inheritDoc} + */ + protected void finish() throws IOException { + result = secondPass != null ? 
secondPass.getTopGroups(0) : null; + if (main) { + mainResult = createSimpleResponse(); + return; + } + + NamedList groupResult = commonResponse(); + + if (format == Format.simple) { + groupResult.add("doclist", createSimpleResponse()); + return; + } + + List groupList = new ArrayList(); + groupResult.add("groups", groupList); // grouped={ key={ groups=[ + + if (result == null) { + return; + } + + // handle case of rows=0 + if (numGroups == 0) return; + + for (GroupDocs group : result.groups) { + NamedList nl = new SimpleOrderedMap(); + groupList.add(nl); // grouped={ key={ groups=[ { + nl.add("groupValue", group.groupValue.toObject()); + addDocList(nl, group); + } + } + + /** + * {@inheritDoc} + */ + public int getMatches() { + if (result == null && fallBackCollector == null) { + return 0; + } + + return result != null ? result.totalHitCount : fallBackCollector.getTotalHits(); + } + + /** + * {@inheritDoc} + */ + protected Integer getNumberOfGroups() { + return allGroupsCollector == null ? 
null : allGroupsCollector.getGroupCount(); + } + } + + /** + * A collector that filters incoming doc ids that are not in the filter + */ + static class FilterCollector extends Collector { + + final DocSet filter; + final Collector collector; + int docBase; + int matches; + + public FilterCollector(DocSet filter, Collector collector) throws IOException { + this.filter = filter; + this.collector = collector; + } + + public void setScorer(Scorer scorer) throws IOException { + collector.setScorer(scorer); + } + + public void collect(int doc) throws IOException { + matches++; + if (filter.exists(doc + docBase)) { + collector.collect(doc); + } + } + + public void setNextReader(AtomicReaderContext context) throws IOException { + this.docBase = context.docBase; + collector.setNextReader(context); + } + + public boolean acceptsDocsOutOfOrder() { + return collector.acceptsDocsOutOfOrder(); + } + } + + static class FunctionFirstPassGroupingCollector extends AbstractFirstPassGroupingCollector { + + private final ValueSource groupByVS; + private final Map vsContext; + + private DocValues docValues; + private DocValues.ValueFiller filler; + private MutableValue mval; + + FunctionFirstPassGroupingCollector(ValueSource groupByVS, Map vsContext, Sort groupSort, int topNGroups) throws IOException { + super(groupSort, topNGroups); + this.groupByVS = groupByVS; + this.vsContext = vsContext; + } + + @Override + protected MutableValue getDocGroupValue(int doc) { + filler.fillValue(doc); + return mval; + } + + @Override + protected MutableValue copyDocGroupValue(MutableValue groupValue, MutableValue reuse) { + if (reuse != null) { + reuse.copy(groupValue); + return reuse; + } + return groupValue.duplicate(); + } + + @Override + public void setNextReader(AtomicReaderContext readerContext) throws IOException { + super.setNextReader(readerContext); + docValues = groupByVS.getValues(vsContext, readerContext); + filler = docValues.getValueFiller(); + mval = filler.getValue(); + } + } + + 
static class FunctionSecondPassGroupingCollector extends AbstractSecondPassGroupingCollector<MutableValue> {
+    // Second-pass grouping collector whose group value for a document is read from
+    // groupByVS through a DocValues.ValueFiller.
+
+    private final ValueSource groupByVS;
+    private final Map vsContext;
+
+    // Re-assigned for every reader in setNextReader.
+    private DocValues docValues;
+    private DocValues.ValueFiller filler;
+    private MutableValue mval;
+
+    /**
+     * All arguments up to and including {@code fillSortFields} are passed unchanged to the
+     * superclass constructor.
+     *
+     * @param groupByVS value source supplying the group value of each document
+     * @param vsContext context map handed to {@code groupByVS.getValues}
+     */
+    FunctionSecondPassGroupingCollector(Collection<SearchGroup<MutableValue>> searchGroups, Sort groupSort, Sort withinGroupSort, int maxDocsPerGroup, boolean getScores, boolean getMaxScores, boolean fillSortFields, ValueSource groupByVS, Map vsContext) throws IOException {
+      super(searchGroups, groupSort, withinGroupSort, maxDocsPerGroup, getScores, getMaxScores, fillSortFields);
+      this.groupByVS = groupByVS;
+      this.vsContext = vsContext;
+    }
+
+    /**
+     * {@inheritDoc}
+     * <p>
+     * Fills {@code mval} for {@code doc} and returns the {@code groupMap} entry for that value.
+     * NOTE(review): presumably {@code null} for documents outside the selected groups -
+     * confirm against the superclass.
+     */
+    @Override
+    protected SearchGroupDocs<MutableValue> retrieveGroup(int doc) throws IOException {
+      filler.fillValue(doc);
+      return groupMap.get(mval);
+    }
+
+    /**
+     * {@inheritDoc}
+     * <p>
+     * After calling the superclass, obtains the {@code DocValues}, its filler and the
+     * filler's value object for the new reader.
+     */
+    @Override
+    public void setNextReader(AtomicReaderContext readerContext) throws IOException {
+      super.setNextReader(readerContext);
+      docValues = groupByVS.getValues(vsContext, readerContext);
+      filler = docValues.getValueFiller();
+      mval = filler.getValue();
+    }
+  }
+
+
+  /**
+   * Collects the distinct group values that {@code groupBy} produces for the collected
+   * documents, keeping them in a sorted set.
+   */
+  static class FunctionAllGroupsCollector extends AbstractAllGroupsCollector<MutableValue> {
+
+    private final Map vsContext;
+    private final ValueSource groupBy;
+    private final SortedSet<MutableValue> groups = new TreeSet<MutableValue>();
+
+    // Re-assigned for every reader in setNextReader.
+    private DocValues docValues;
+    private DocValues.ValueFiller filler;
+    private MutableValue mval;
+
+    /**
+     * @param groupBy   value source supplying the group value of each document
+     * @param vsContext context map handed to {@code groupBy.getValues}
+     */
+    FunctionAllGroupsCollector(ValueSource groupBy, Map vsContext) {
+      this.vsContext = vsContext;
+      this.groupBy = groupBy;
+    }
+
+    /** Returns the internal set of group values collected so far (not a copy). */
+    @Override
+    public Collection<MutableValue> getGroups() {
+      return groups;
+    }
+
+    /**
+     * Fills {@code mval} for {@code doc} and, if that value is not yet in the set, stores a
+     * duplicate of it. A duplicate is stored because {@code mval} is the object filled again
+     * on the next call; checking first avoids duplicating values that are already present.
+     */
+    @Override
+    public void collect(int doc) throws IOException {
+      filler.fillValue(doc);
+      if (!groups.contains(mval)) {
+        groups.add(mval.duplicate());
+      }
+    }
+
+    /**
+     * {@inheritDoc}
+     * <p>
+     * Obtains the {@code DocValues}, its filler and the filler's value object for the new reader.
+     */
+    @Override
+    public void setNextReader(AtomicReaderContext context) throws IOException {
+      docValues = groupBy.getValues(vsContext, context);
+      filler = docValues.getValueFiller();
+      mval = filler.getValue();
+    }
+
+  }
+}
\ No newline at end of file
diff --git a/solr/src/java/org/apache/solr/search/function/FieldCacheSource.java b/solr/src/java/org/apache/solr/search/function/FieldCacheSource.java
index 2dd1d8703ef..b717550a3e9 100644
--- a/solr/src/java/org/apache/solr/search/function/FieldCacheSource.java
+++ b/solr/src/java/org/apache/solr/search/function/FieldCacheSource.java
@@ -37,6 +37,10 @@ public abstract class FieldCacheSource extends ValueSource {
     return cache;
   }
 
+  public String getField() {
+    return field;
+  }
+
   @Override
   public String description() {
     return field;
diff --git a/solr/src/test/org/apache/solr/TestGroupingSearch.java b/solr/src/test/org/apache/solr/TestGroupingSearch.java
index f0b53bc32fe..f5f91b67844 100644
--- a/solr/src/test/org/apache/solr/TestGroupingSearch.java
+++ b/solr/src/test/org/apache/solr/TestGroupingSearch.java
@@ -20,6 +20,7 @@ package org.apache.solr;
 import org.apache.lucene.search.FieldCache;
 import org.apache.noggit.JSONUtil;
 import org.apache.noggit.ObjectBuilder;
+import org.apache.solr.common.params.GroupParams;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.schema.IndexSchema;
 import org.junit.Before;
@@ -88,6 +89,49 @@ public class TestGroupingSearch extends SolrTestCaseJ4 {
     );
   }
 
+  @Test
+  public void testGroupingGroupSortingScore_withTotalGroupCount() {
+    assertU(add(doc("id", "1","name", "author1", "title", "a book title", "group_si", "1")));
+    assertU(add(doc("id", "2","name", "author1", "title", "the title", "group_si", "2")));
+    assertU(add(doc("id", "3","name", "author2", "title", "a book title", "group_si", "1")));
+    assertU(add(doc("id", "4","name", "author2", "title", "title", "group_si", "2")));
+    assertU(add(doc("id", "5","name", "author3", "title", "the title of a title", "group_si", "1")));
+    assertU(commit());
+
+    assertQ(req("q","title:title", "group", "true", "group.field","name", "group.ngroups", "true")
+            ,"//lst[@name='grouped']/lst[@name='name']"
+            ,"//lst[@name='grouped']/lst[@name='name']/int[@name='matches'][.='5']"
+            ,"//lst[@name='grouped']/lst[@name='name']/int[@name='ngroups'][.='3']"
+            ,"*[count(//arr[@name='groups']/lst) = 3]"
+
+            ,"//arr[@name='groups']/lst[1]/str[@name='groupValue'][.='author2']"
+            ,"//arr[@name='groups']/lst[1]/result[@numFound='2']"
+            ,"//arr[@name='groups']/lst[1]/result/doc/*[@name='id'][.='4']"
+
+            ,"//arr[@name='groups']/lst[2]/str[@name='groupValue'][.='author1']"
+            ,"//arr[@name='groups']/lst[2]/result[@numFound='2']"
+            ,"//arr[@name='groups']/lst[2]/result/doc/*[@name='id'][.='2']"
+
+            ,"//arr[@name='groups']/lst[3]/str[@name='groupValue'][.='author3']"
+            ,"//arr[@name='groups']/lst[3]/result[@numFound='1']"
+            ,"//arr[@name='groups']/lst[3]/result/doc/*[@name='id'][.='5']"
+            );
+
+    assertQ(req("q","title:title", "group", "true", "group.field","group_si", "group.ngroups", "true")
+            ,"//lst[@name='grouped']/lst[@name='group_si']/int[@name='matches'][.='5']"
+            ,"//lst[@name='grouped']/lst[@name='group_si']/int[@name='ngroups'][.='2']"
+            ,"*[count(//arr[@name='groups']/lst) = 2]"
+
+            ,"//arr[@name='groups']/lst[1]/int[@name='groupValue'][.='2']"
+            ,"//arr[@name='groups']/lst[1]/result[@numFound='2']"
+            ,"//arr[@name='groups']/lst[1]/result/doc/*[@name='id'][.='4']"
+
+            ,"//arr[@name='groups']/lst[2]/int[@name='groupValue'][.='1']"
+            ,"//arr[@name='groups']/lst[2]/result[@numFound='3']"
+            ,"//arr[@name='groups']/lst[2]/result/doc/*[@name='id'][.='5']"
+            );
+  }
+
   @Test
   public void testGroupingGroupSortingScore_basicWithGroupSortEqualToSort() {
     assertU(add(doc("id", "1","name", "author1", "title", "a book title")));
@@ -353,7 +397,7 @@ public class TestGroupingSearch extends SolrTestCaseJ4 {
       , "/grouped/foo_i=={'matches':10,'doclist':"
           +"{'numFound':10,'start':1,'docs':[{'id':'10'},{'id':'3'},{'id':'6'}]}}"
     );
-  };
+  }
 
 
 
@@ -476,14 +520,16 @@ public class TestGroupingSearch extends SolrTestCaseJ4 {
       List sortedGroups = new ArrayList(groups.values());
       Collections.sort(sortedGroups, groupComparator==sortComparator ? createFirstDocComparator(sortComparator) : createMaxDocComparator(sortComparator));
 
-      Object modelResponse = buildGroupedResult(h.getCore().getSchema(), sortedGroups, start, rows, group_offset, group_limit);
+      boolean includeNGroups = random.nextBoolean();
+      Object modelResponse = buildGroupedResult(h.getCore().getSchema(), sortedGroups, start, rows, group_offset, group_limit, includeNGroups);
 
+      int randomPercentage = random.nextInt(101);
       // TODO: create a random filter too
-
       SolrQueryRequest req = req("group","true","wt","json","indent","true", "echoParams","all", "q","{!func}score_f", "group.field",groupField
           ,sortStr==null ? "nosort":"sort", sortStr ==null ? "": sortStr
-          ,(groupSortStr==null || groupSortStr==sortStr) ? "nosort":"group.sort", groupSortStr==null ? "": groupSortStr
-          ,"rows",""+rows, "start",""+start, "group.offset",""+group_offset, "group.limit",""+group_limit
+          ,(groupSortStr==null || groupSortStr==sortStr) ? "noGroupsort":"group.sort", groupSortStr==null ? "": groupSortStr
+          ,"rows",""+rows, "start",""+start, "group.offset",""+group_offset, "group.limit",""+group_limit,
+          GroupParams.GROUP_CACHE_PERCENTAGE, Integer.toString(randomPercentage), GroupParams.GROUP_TOTAL_COUNT, includeNGroups ? "true" : "false"
       );
 
       String strResponse = h.query(req);
@@ -508,7 +554,7 @@ public class TestGroupingSearch extends SolrTestCaseJ4 {
   }
 
 
-  public static Object buildGroupedResult(IndexSchema schema, List sortedGroups, int start, int rows, int group_offset, int group_limit) {
+  public static Object buildGroupedResult(IndexSchema schema, List sortedGroups, int start, int rows, int group_offset, int group_limit, boolean includeNGroups) {
     Map result = new LinkedHashMap();
 
     long matches = 0;
@@ -516,6 +562,9 @@ public class TestGroupingSearch extends SolrTestCaseJ4 {
       matches += grp.docs.size();
     }
     result.put("matches", matches);
+    if (includeNGroups) {
+      result.put("ngroups", sortedGroups.size());
+    }
     List groupList = new ArrayList();
     result.put("groups", groupList);