SOLR-2564: Integrating grouping module into Solr 4.0

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1137037 13f79535-47bb-0310-9956-ffa450edef68
Martijn van Groningen 2011-06-17 21:25:59 +00:00
parent 4c6cc4ac18
commit d805da14c2
9 changed files with 1095 additions and 790 deletions

View File

@@ -17,22 +17,22 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.util.RamUsageEstimator;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.util.RamUsageEstimator;
/**
* Caches all docs, and optionally also scores, coming from
* a search, and is then able to replay them to another
* collector. You specify the max RAM this class may use.
* Once the collection is done, call {@link #isCached}. If
* this returns true, you can use {@link #replay} against a
* new collector. If it returns false, this means too much
* RAM was required and you must instead re-run the original
* search.
* Once the collection is done, call {@link #isCached}. If
* this returns true, you can use {@link #replay(Collector)}
* against a new collector. If it returns false, this means
* too much RAM was required and you must instead re-run the
* original search.
*
* <p><b>NOTE</b>: this class consumes 4 bytes (or 8 bytes, if
* scoring is cached) per collected document. If the result
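A minimal sketch (editorial, not part of the patch) of the collect-then-replay contract this javadoc describes; the searcher, query, and hit count are assumed for illustration:

import java.io.IOException;
import org.apache.lucene.search.CachingCollector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;

class ReplaySketch {
  // Run the search once through a caching wrapper; if everything fit within
  // the RAM budget, replay the cached docs/scores instead of re-searching.
  static TopDocs firstAndSecondPass(IndexSearcher searcher, Query query) throws IOException {
    TopScoreDocCollector first = TopScoreDocCollector.create(10, true);
    CachingCollector cache = CachingCollector.create(first, true /* cacheScores */, 16.0 /* maxRAMMB */);
    searcher.search(query, cache);

    TopScoreDocCollector second = TopScoreDocCollector.create(10, true);
    if (cache.isCached()) {
      cache.replay(second);           // cheap: feed cached docs and scores to the new collector
    } else {
      searcher.search(query, second); // too much RAM was required; re-run the original search
    }
    return second.topDocs();
  }
}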
@@ -105,7 +105,16 @@ public abstract class CachingCollector extends Collector {
cachedScorer = new CachedScorer();
cachedScores = new ArrayList<float[]>();
curScores = new float[128];
curScores = new float[INITIAL_ARRAY_SIZE];
cachedScores.add(curScores);
}
ScoreCachingCollector(Collector other, int maxDocsToCache) {
super(other, maxDocsToCache);
cachedScorer = new CachedScorer();
cachedScores = new ArrayList<float[]>();
curScores = new float[INITIAL_ARRAY_SIZE];
cachedScores.add(curScores);
}
@@ -210,7 +219,11 @@ public abstract class CachingCollector extends Collector {
NoScoreCachingCollector(Collector other, double maxRAMMB) {
super(other, maxRAMMB, false);
}
NoScoreCachingCollector(Collector other, int maxDocsToCache) {
super(other, maxDocsToCache);
}
@Override
public void collect(int doc) throws IOException {
@@ -353,7 +366,25 @@ public abstract class CachingCollector extends Collector {
*/
public static CachingCollector create(Collector other, boolean cacheScores, double maxRAMMB) {
return cacheScores ? new ScoreCachingCollector(other, maxRAMMB) : new NoScoreCachingCollector(other, maxRAMMB);
}
}
/**
* Create a new {@link CachingCollector} that wraps the given collector and
* caches documents and scores up to the specified max docs threshold.
*
* @param other
* the Collector to wrap and delegate calls to.
* @param cacheScores
* whether to cache scores in addition to document IDs. Note that
* this increases the RAM consumed per doc.
* @param maxDocsToCache
* the maximum number of documents to cache (the documents and,
* possibly, the scores). If the collector exceeds this threshold,
* no documents or scores are cached.
*/
public static CachingCollector create(Collector other, boolean cacheScores, int maxDocsToCache) {
return cacheScores ? new ScoreCachingCollector(other, maxDocsToCache) : new NoScoreCachingCollector(other, maxDocsToCache);
}
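A hedged sketch (not from the patch) contrasting the two factory overloads; note that a bare int literal now binds to the new doc-count variant, which is why the test change below switches 1 to 1.0:

import org.apache.lucene.search.CachingCollector;
import org.apache.lucene.search.Collector;

class OverloadSketch {
  static void createBoth(Collector other) {
    // RAM bound: caching aborts once roughly 16 MB would be consumed.
    CachingCollector byRam = CachingCollector.create(other, true, 16.0);
    // Doc-count bound (new in this commit): caching aborts beyond 100000 docs.
    CachingCollector byDocs = CachingCollector.create(other, true, 100000);
    // Pitfall: create(other, true, 1) caches at most one document,
    // while create(other, true, 1.0) caches up to 1 MB worth of documents.
  }
}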
// Prevent extension from non-internal classes
private CachingCollector(Collector other, double maxRAMMB, boolean cacheScores) {
@@ -369,6 +400,15 @@ public abstract class CachingCollector extends Collector {
}
maxDocsToCache = (int) ((maxRAMMB * 1024 * 1024) / bytesPerDoc);
}
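As a quick check of this formula, using the per-doc costs from the class javadoc (4 bytes without scores, 8 bytes with):

// maxRAMMB = 16, cacheScores = false (bytesPerDoc = 4):
//   maxDocsToCache = (16 * 1024 * 1024) / 4 = 4194304 docs
// maxRAMMB = 16, cacheScores = true  (bytesPerDoc = 8):
//   maxDocsToCache = (16 * 1024 * 1024) / 8 = 2097152 docs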
private CachingCollector(Collector other, int maxDocsToCache) {
this.other = other;
cachedDocs = new ArrayList<int[]>();
curDocs = new int[INITIAL_ARRAY_SIZE];
cachedDocs.add(curDocs);
this.maxDocsToCache = maxDocsToCache;
}
@Override
public boolean acceptsDocsOutOfOrder() {

View File

@@ -17,15 +17,11 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.CachingCollector;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.LuceneTestCase;
import java.io.IOException;
public class TestCachingCollector extends LuceneTestCase {
private static final double ONE_BYTE = 1.0 / (1024 * 1024); // 1 byte expressed in MB
@@ -76,7 +72,7 @@ public class TestCachingCollector extends LuceneTestCase {
public void testBasic() throws Exception {
for (boolean cacheScores : new boolean[] { false, true }) {
CachingCollector cc = CachingCollector.create(new NoOpCollector(false), cacheScores, 1);
CachingCollector cc = CachingCollector.create(new NoOpCollector(false), cacheScores, 1.0);
cc.setScorer(new MockScorer());
// collect 1000 docs

View File

@@ -78,6 +78,8 @@ New Features
Bojan Smid, Charles Hornberger, Dieter Grad, Dmitry Lihachev, Doug Steigerwald,
Karsten Sperling, Michael Gundlach, Oleg Gnatovskiy, Thomas Traeger,
Harish Agarwal, yonik)
* SOLR-2564: Integrate grouping module into Solr. Also adds the ability to return the
  number of groups that match a query. (Martijn van Groningen)
* SOLR-1665: Add debug component options for timings, results and query info only (gsingers, hossman, yonik)

View File

@@ -198,6 +198,7 @@
<pathelement location="${common-solr.dir}/../lucene/build/contrib/misc/classes/java" />
<pathelement location="${common-solr.dir}/../lucene/build/contrib/queries/classes/java" />
<pathelement location="${common-solr.dir}/../lucene/build/contrib/spatial/classes/java" />
<pathelement location="${common-solr.dir}/../modules/grouping/build/classes/java" />
</path>
<target name="prep-lucene-jars">
@@ -214,6 +215,7 @@
<fileset dir="../lucene/contrib/misc" includes="build.xml" />
<fileset dir="../lucene/contrib/queries" includes="build.xml" />
<fileset dir="../lucene/contrib/spatial" includes="build.xml" />
<fileset dir="../modules/grouping" includes="build.xml" />
</subant>
</sequential>
</target>
@@ -248,6 +250,9 @@
<fileset dir="../lucene/build/contrib/spatial">
<include name="lucene-spatial-${version}.jar" />
</fileset>
<fileset dir="../modules/grouping/build">
<include name="lucene-grouping-${version}.jar" />
</fileset>
</copy>
</target>
@@ -262,6 +267,7 @@
<fileset dir="../lucene/contrib/misc" includes="build.xml"/>
<fileset dir="../lucene/contrib/queries" includes="build.xml"/>
<fileset dir="../lucene/contrib/spatial" includes="build.xml"/>
<fileset dir="../modules/grouping" includes="build.xml"/>
</subant>
</target>

View File

@@ -38,5 +38,20 @@ public interface GroupParams {
/** The format of the grouped response: "grouped" (the default) or "simple". */
public static final String GROUP_FORMAT = GROUP + ".format";
/**
* Whether to cache the first-pass search (doc ids and scores) for the second-pass search.
* Also defines the maximum size of the group cache as a percentage of maxdoc.
* Values are integers from 0 to 100. A value of 0 disables the group cache.
* The default is 0.
*/
public static final String GROUP_CACHE_PERCENTAGE = GROUP + ".cache.percent";
// Note: since you can supply multiple fields to group on but only have facets for the whole result,
// it only makes sense to support these parameters for the first group.
/** Whether the docSet (for example, for faceting) should be based on plain documents (a.k.a. UNGROUPED) or on the groups (a.k.a. GROUPED). */
public static final String GROUP_COLLAPSE = GROUP + ".collapse";
/** Whether the group count should be included in the response. */
public static final String GROUP_TOTAL_COUNT = GROUP + ".ngroups";
}
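A hedged sketch of building a request that exercises the two new parameters (the query and field values are illustrative only):

import org.apache.solr.common.params.GroupParams;
import org.apache.solr.common.params.ModifiableSolrParams;

class GroupParamsSketch {
  static ModifiableSolrParams groupedRequestParams() {
    ModifiableSolrParams p = new ModifiableSolrParams();
    p.set("q", "title:title");
    p.set("group", "true");
    p.set("group.field", "name");
    p.set(GroupParams.GROUP_TOTAL_COUNT, "true");    // include an "ngroups" count in the response
    p.set(GroupParams.GROUP_CACHE_PERCENTAGE, "20"); // cache the first pass, capped at 20% of maxdoc
    return p;
  }
}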

View File

@@ -45,8 +45,6 @@ import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.*;
import org.apache.solr.search.function.FunctionQuery;
import org.apache.solr.search.function.QueryValueSource;
import org.apache.solr.util.SolrPluginUtils;
import java.io.IOException;
@@ -315,16 +313,25 @@ public class QueryComponent extends SearchComponent
boolean doGroup = params.getBool(GroupParams.GROUP, false);
if (doGroup) {
try {
Grouping grouping = new Grouping(searcher, result, cmd);
int maxDocsPercentageToCache = params.getInt(GroupParams.GROUP_CACHE_PERCENTAGE, 0);
boolean cacheSecondPassSearch = maxDocsPercentageToCache >= 1 && maxDocsPercentageToCache <= 100;
String[] fields = params.getParams(GroupParams.GROUP_FIELD);
String[] funcs = params.getParams(GroupParams.GROUP_FUNC);
String[] queries = params.getParams(GroupParams.GROUP_QUERY);
String groupSortStr = params.get(GroupParams.GROUP_SORT);
boolean main = params.getBool(GroupParams.GROUP_MAIN, false);
String format = params.get(GroupParams.GROUP_FORMAT);
Grouping.Format defaultFormat = "simple".equals(format) ? Grouping.Format.Simple : Grouping.Format.Grouped;
String formatStr = params.get(GroupParams.GROUP_FORMAT, Grouping.Format.grouped.name());
Grouping.Format defaultFormat;
try {
defaultFormat = Grouping.Format.valueOf(formatStr);
} catch (IllegalArgumentException e) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, String.format("Illegal %s parameter", GroupParams.GROUP_FORMAT));
}
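Since Format.valueOf is case-sensitive and the enum constants are now lowercase, only exact matches parse; an assumed illustration (the Grouping.java diff is suppressed in this commit view, so the constant names are inferred):

Grouping.Format a = Grouping.Format.valueOf("grouped"); // ok (the default)
Grouping.Format b = Grouping.Format.valueOf("simple");  // ok
Grouping.Format c = Grouping.Format.valueOf("Simple");  // IllegalArgumentException -> the 400 BAD_REQUEST above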
boolean includeTotalGroupCount = params.getBool(GroupParams.GROUP_TOTAL_COUNT, false);
Grouping.TotalCount defaultTotalCount = includeTotalGroupCount ? Grouping.TotalCount.grouped : Grouping.TotalCount.ungrouped;
Sort sort = cmd.getSort();
// groupSort defaults to sort
Sort groupSort = groupSortStr == null ? cmd.getSort() : QueryParsing.parseSort(groupSortStr, req);
@@ -332,95 +339,47 @@ public class QueryComponent extends SearchComponent
int groupOffsetDefault = params.getInt(GroupParams.GROUP_OFFSET, 0);
int docsPerGroupDefault = params.getInt(GroupParams.GROUP_LIMIT, 1);
// temporary: implement all group-by-field as group-by-func
if (funcs == null) {
funcs = fields;
} else if (fields != null) {
// catenate functions and fields
String[] both = new String[fields.length + funcs.length];
System.arraycopy(fields, 0, both, 0, fields.length);
System.arraycopy(funcs, 0, both, fields.length, funcs.length);
funcs = both;
}
Grouping grouping = new Grouping(searcher, result, cmd, cacheSecondPassSearch, maxDocsPercentageToCache, main);
grouping.setSort(sort)
.setGroupSort(groupSort)
.setDefaultFormat(defaultFormat)
.setLimitDefault(limitDefault)
.setDefaultTotalCount(defaultTotalCount)
.setDocsPerGroupDefault(docsPerGroupDefault)
.setGroupOffsetDefault(groupOffsetDefault);
if (fields != null) {
for (String field : fields) {
grouping.addFieldCommand(field, rb.req);
}
}
if (funcs != null) {
for (String groupByStr : funcs) {
QParser parser = QParser.getParser(groupByStr, "func", rb.req);
Query q = parser.getQuery();
Grouping.CommandFunc gc = grouping.new CommandFunc();
gc.groupSort = groupSort;
if (q instanceof FunctionQuery) {
gc.groupBy = ((FunctionQuery)q).getValueSource();
} else {
gc.groupBy = new QueryValueSource(q, 0.0f);
}
gc.key = groupByStr;
gc.numGroups = limitDefault;
gc.docsPerGroup = docsPerGroupDefault;
gc.groupOffset = groupOffsetDefault;
gc.offset = cmd.getOffset();
gc.sort = cmd.getSort();
gc.format = defaultFormat;
if (main) {
gc.main = true;
gc.format = Grouping.Format.Simple;
main = false;
}
if (gc.format == Grouping.Format.Simple) {
gc.groupOffset = 0; // doesn't make sense
}
grouping.add(gc);
grouping.addFunctionCommand(groupByStr, rb.req);
}
}
if (queries != null) {
for (String groupByStr : queries) {
QParser parser = QParser.getParser(groupByStr, null, rb.req);
Query gq = parser.getQuery();
Grouping.CommandQuery gc = grouping.new CommandQuery();
gc.query = gq;
gc.groupSort = groupSort;
gc.key = groupByStr;
gc.numGroups = limitDefault;
gc.docsPerGroup = docsPerGroupDefault;
gc.groupOffset = groupOffsetDefault;
// these two params will only be used if this is for the main result set
gc.offset = cmd.getOffset();
gc.numGroups = limitDefault;
gc.format = defaultFormat;
if (main) {
gc.main = true;
gc.format = Grouping.Format.Simple;
main = false;
}
if (gc.format == Grouping.Format.Simple) {
gc.docsPerGroup = gc.numGroups; // doesn't make sense to limit to one
gc.groupOffset = gc.offset;
}
grouping.add(gc);
grouping.addQueryCommand(groupByStr, rb.req);
}
}
if (rb.doHighlights || rb.isDebug()) {
// we need a single list of the returned docs
cmd.setFlags(SolrIndexSearcher.GET_DOCLIST);
}
// searcher.search(result,cmd);
grouping.execute();
rb.setResult( result );
if (grouping.isSignalCacheWarning()) {
rsp.add(
"cacheWarning",
String.format("Cache limit of %d percent relative to maxdoc has exceeded. Please increase cache size or disable caching.", maxDocsPercentageToCache)
);
}
rb.setResult(result);
rsp.add("grouped", result.groupedResults);
// TODO: get "hits" a different way to log
if (grouping.mainResult != null) {
ResultContext ctx = new ResultContext();
@@ -428,10 +387,10 @@ public class QueryComponent extends SearchComponent
ctx.query = null; // TODO? add the query?
rsp.add("response", ctx);
rsp.getToLog().add("hits", grouping.mainResult.matches());
} else if (!grouping.getCommands().isEmpty()) { // Can never be empty since grouping.execute() checks for this.
rsp.getToLog().add("hits", grouping.getCommands().get(0).getMatches());
}
return;
} catch (ParseException e) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
}

File diff suppressed because it is too large

View File

@@ -37,6 +37,10 @@ public abstract class FieldCacheSource extends ValueSource {
return cache;
}
public String getField() {
return field;
}
@Override
public String description() {
return field;

View File

@@ -20,6 +20,7 @@ package org.apache.solr;
import org.apache.lucene.search.FieldCache;
import org.apache.noggit.JSONUtil;
import org.apache.noggit.ObjectBuilder;
import org.apache.solr.common.params.GroupParams;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
import org.junit.Before;
@@ -88,6 +89,49 @@ public class TestGroupingSearch extends SolrTestCaseJ4 {
);
}
@Test
public void testGroupingGroupSortingScore_withTotalGroupCount() {
assertU(add(doc("id", "1","name", "author1", "title", "a book title", "group_si", "1")));
assertU(add(doc("id", "2","name", "author1", "title", "the title", "group_si", "2")));
assertU(add(doc("id", "3","name", "author2", "title", "a book title", "group_si", "1")));
assertU(add(doc("id", "4","name", "author2", "title", "title", "group_si", "2")));
assertU(add(doc("id", "5","name", "author3", "title", "the title of a title", "group_si", "1")));
assertU(commit());
assertQ(req("q","title:title", "group", "true", "group.field","name", "group.ngroups", "true")
,"//lst[@name='grouped']/lst[@name='name']"
,"//lst[@name='grouped']/lst[@name='name']/int[@name='matches'][.='5']"
,"//lst[@name='grouped']/lst[@name='name']/int[@name='ngroups'][.='3']"
,"*[count(//arr[@name='groups']/lst) = 3]"
,"//arr[@name='groups']/lst[1]/str[@name='groupValue'][.='author2']"
,"//arr[@name='groups']/lst[1]/result[@numFound='2']"
,"//arr[@name='groups']/lst[1]/result/doc/*[@name='id'][.='4']"
,"//arr[@name='groups']/lst[2]/str[@name='groupValue'][.='author1']"
,"//arr[@name='groups']/lst[2]/result[@numFound='2']"
,"//arr[@name='groups']/lst[2]/result/doc/*[@name='id'][.='2']"
,"//arr[@name='groups']/lst[3]/str[@name='groupValue'][.='author3']"
,"//arr[@name='groups']/lst[3]/result[@numFound='1']"
,"//arr[@name='groups']/lst[3]/result/doc/*[@name='id'][.='5']"
);
assertQ(req("q","title:title", "group", "true", "group.field","group_si", "group.ngroups", "true")
,"//lst[@name='grouped']/lst[@name='group_si']/int[@name='matches'][.='5']"
,"//lst[@name='grouped']/lst[@name='group_si']/int[@name='ngroups'][.='2']"
,"*[count(//arr[@name='groups']/lst) = 2]"
,"//arr[@name='groups']/lst[1]/int[@name='groupValue'][.='2']"
,"//arr[@name='groups']/lst[1]/result[@numFound='2']"
,"//arr[@name='groups']/lst[1]/result/doc/*[@name='id'][.='4']"
,"//arr[@name='groups']/lst[2]/int[@name='groupValue'][.='1']"
,"//arr[@name='groups']/lst[2]/result[@numFound='3']"
,"//arr[@name='groups']/lst[2]/result/doc/*[@name='id'][.='5']"
);
}
@Test
public void testGroupingGroupSortingScore_basicWithGroupSortEqualToSort() {
assertU(add(doc("id", "1","name", "author1", "title", "a book title")));
@@ -353,7 +397,7 @@ public class TestGroupingSearch extends SolrTestCaseJ4 {
, "/grouped/foo_i=={'matches':10,'doclist':"
+"{'numFound':10,'start':1,'docs':[{'id':'10'},{'id':'3'},{'id':'6'}]}}"
);
};
}
@@ -476,14 +520,16 @@ public class TestGroupingSearch extends SolrTestCaseJ4 {
List<Grp> sortedGroups = new ArrayList(groups.values());
Collections.sort(sortedGroups, groupComparator==sortComparator ? createFirstDocComparator(sortComparator) : createMaxDocComparator(sortComparator));
Object modelResponse = buildGroupedResult(h.getCore().getSchema(), sortedGroups, start, rows, group_offset, group_limit);
boolean includeNGroups = random.nextBoolean();
Object modelResponse = buildGroupedResult(h.getCore().getSchema(), sortedGroups, start, rows, group_offset, group_limit, includeNGroups);
int randomPercentage = random.nextInt(101);
// TODO: create a random filter too
SolrQueryRequest req = req("group","true","wt","json","indent","true", "echoParams","all", "q","{!func}score_f", "group.field",groupField
,sortStr==null ? "nosort":"sort", sortStr ==null ? "": sortStr
,(groupSortStr==null || groupSortStr==sortStr) ? "nosort":"group.sort", groupSortStr==null ? "": groupSortStr
,"rows",""+rows, "start",""+start, "group.offset",""+group_offset, "group.limit",""+group_limit
,(groupSortStr==null || groupSortStr==sortStr) ? "noGroupsort":"group.sort", groupSortStr==null ? "": groupSortStr
,"rows",""+rows, "start",""+start, "group.offset",""+group_offset, "group.limit",""+group_limit,
GroupParams.GROUP_CACHE_PERCENTAGE, Integer.toString(randomPercentage), GroupParams.GROUP_TOTAL_COUNT, includeNGroups ? "true" : "false"
);
String strResponse = h.query(req);
@@ -508,7 +554,7 @@ public class TestGroupingSearch extends SolrTestCaseJ4 {
}
public static Object buildGroupedResult(IndexSchema schema, List<Grp> sortedGroups, int start, int rows, int group_offset, int group_limit) {
public static Object buildGroupedResult(IndexSchema schema, List<Grp> sortedGroups, int start, int rows, int group_offset, int group_limit, boolean includeNGroups) {
Map<String,Object> result = new LinkedHashMap<String,Object>();
long matches = 0;
@@ -516,6 +562,9 @@
matches += grp.docs.size();
}
result.put("matches", matches);
if (includeNGroups) {
result.put("ngroups", sortedGroups.size());
}
List groupList = new ArrayList();
result.put("groups", groupList);