SOLR-7452: add refine param to json facets, implement for array field faceting

This commit is contained in:
yonik 2017-03-17 12:13:43 -04:00 committed by Shalin Shekhar Mangar
parent 2d32832bcb
commit 78de123645
9 changed files with 254 additions and 69 deletions

View File

@ -31,6 +31,7 @@ import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.PriorityQueue;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocSet;
@ -310,7 +311,7 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
if (freq.missing) {
// TODO: it would be more efficient to build up a missing DocSet if we need it here anyway.
SimpleOrderedMap<Object> missingBucket = new SimpleOrderedMap<>();
fillBucket(missingBucket, getFieldMissingQuery(fcontext.searcher, freq.field), null);
fillBucket(missingBucket, getFieldMissingQuery(fcontext.searcher, freq.field), null, false);
res.add("missing", missingBucket);
}
@ -378,7 +379,7 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
}
}
processSubs(target, filter, subDomain);
processSubs(target, filter, subDomain, false);
}
@Override
@ -510,4 +511,41 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
}
}
}
protected SimpleOrderedMap<Object> refineFacets() throws IOException {
List leaves = (List)fcontext.facetInfo.get("_l");
// For leaf refinements, we do full faceting for each leaf bucket. Any sub-facets of these buckets will be fully evaluated. Because of this, we should never
// encounter leaf refinements that have sub-facets that return partial results.
SimpleOrderedMap<Object> res = new SimpleOrderedMap<>();
List<SimpleOrderedMap> bucketList = new ArrayList<>(leaves.size());
res.add("buckets", bucketList);
// TODO: an alternate implementations can fill all accs at once
createAccs(-1, 1);
FieldType ft = sf.getType();
for (Object bucketVal : leaves) {
SimpleOrderedMap<Object> bucket = new SimpleOrderedMap<>();
bucketList.add(bucket);
bucket.add("val", bucketVal);
// String internal = ft.toInternal( tobj.toString() ); // TODO - we need a better way to get from object to query...
Query domainQ = ft.getFieldQuery(null, sf, bucketVal.toString());
fillBucket(bucket, domainQ, null, false);
}
// If there are just a couple of leaves, and if the domain is large, then
// going by term is likely the most efficient?
// If the domain is small, or if the number of leaves is large, then doing
// the normal collection method may be best.
return res;
}
}

View File

@ -57,6 +57,10 @@ abstract class FacetFieldProcessorByArray extends FacetFieldProcessor {
}
private SimpleOrderedMap<Object> calcFacets() throws IOException {
if (fcontext.facetInfo != null) {
return refineFacets();
}
String prefix = freq.prefix;
if (prefix == null || prefix.length() == 0) {
prefixRef = null;

View File

@ -333,7 +333,7 @@ class FacetFieldProcessorByEnumTermsStream extends FacetFieldProcessor implement
bucket.add("val", bucketVal);
addStats(bucket, 0);
if (hasSubFacets) {
processSubs(bucket, bucketQuery, termSet);
processSubs(bucket, bucketQuery, termSet, false);
}
// TODO... termSet needs to stick around for streaming sub-facets?

View File

@ -39,6 +39,7 @@ import org.apache.solr.search.QueryContext;
import org.apache.solr.search.SyntaxError;
import org.apache.solr.util.RTimer;
import org.noggit.JSONUtil;
import org.noggit.ObjectBuilder;
public class FacetModule extends SearchComponent {
@ -52,7 +53,7 @@ public class FacetModule extends SearchComponent {
public final static int PURPOSE_REFINE_JSON_FACETS = 0x00200000;
// Internal information passed down from the top level to shards for distributed faceting.
private final static String FACET_STATE = "_facet_";
private final static String FACET_INFO = "_facet_";
private final static String FACET_REFINE = "refine";
@ -62,6 +63,58 @@ public class FacetModule extends SearchComponent {
return (FacetComponentState) rb.req.getContext().get(FacetComponentState.class);
}
@Override
public void prepare(ResponseBuilder rb) throws IOException {
Map<String,Object> json = rb.req.getJSON();
Map<String,Object> jsonFacet = null;
if (json == null) {
int version = rb.req.getParams().getInt("facet.version",1);
if (version <= 1) return;
boolean facetsEnabled = rb.req.getParams().getBool(FacetParams.FACET, false);
if (!facetsEnabled) return;
jsonFacet = new LegacyFacet(rb.req.getParams()).getLegacy();
} else {
jsonFacet = (Map<String, Object>) json.get("facet");
}
if (jsonFacet == null) return;
SolrParams params = rb.req.getParams();
boolean isShard = params.getBool(ShardParams.IS_SHARD, false);
Map<String,Object> facetInfo = null;
if (isShard) {
String jfacet = params.get(FACET_INFO);
if (jfacet == null) {
// if this is a shard request, but there is no _facet_ info, then don't do anything.
return;
}
facetInfo = (Map<String,Object>) ObjectBuilder.fromJSON(jfacet);
}
// At this point, we know we need to do something. Create and save the state.
rb.setNeedDocSet(true);
// Parse the facet in the prepare phase?
FacetParser parser = new FacetTopParser(rb.req);
FacetRequest facetRequest = null;
try {
facetRequest = parser.parse(jsonFacet);
} catch (SyntaxError syntaxError) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, syntaxError);
}
FacetComponentState fcState = new FacetComponentState();
fcState.rb = rb;
fcState.isShard = isShard;
fcState.facetInfo = facetInfo;
fcState.facetCommands = jsonFacet;
fcState.facetRequest = facetRequest;
rb.req.getContext().put(FacetComponentState.class, fcState);
}
@Override
public void process(ResponseBuilder rb) throws IOException {
// if this is null, faceting is not enabled
@ -77,6 +130,11 @@ public class FacetModule extends SearchComponent {
fcontext.qcontext = QueryContext.newContext(fcontext.searcher);
if (isShard) {
fcontext.flags |= FacetContext.IS_SHARD;
fcontext.facetInfo = facetState.facetInfo.isEmpty() ? null : (Map<String,Object>)facetState.facetInfo.get(FACET_REFINE);
if (fcontext.facetInfo != null) {
fcontext.flags |= FacetContext.IS_REFINEMENT;
fcontext.flags |= FacetContext.SKIP_FACET; // the root bucket should have been received from all shards previously
}
}
FacetProcessor fproc = facetState.facetRequest.createFacetProcessor(fcontext);
@ -100,60 +158,14 @@ public class FacetModule extends SearchComponent {
}
@Override
public void prepare(ResponseBuilder rb) throws IOException {
Map<String,Object> json = rb.req.getJSON();
Map<String,Object> jsonFacet = null;
if (json == null) {
int version = rb.req.getParams().getInt("facet.version",1);
if (version <= 1) return;
boolean facetsEnabled = rb.req.getParams().getBool(FacetParams.FACET, false);
if (!facetsEnabled) return;
jsonFacet = new LegacyFacet(rb.req.getParams()).getLegacy();
} else {
jsonFacet = (Map<String, Object>) json.get("facet");
}
if (jsonFacet == null) return;
SolrParams params = rb.req.getParams();
boolean isShard = params.getBool(ShardParams.IS_SHARD, false);
if (isShard) {
String jfacet = params.get(FACET_STATE);
if (jfacet == null) {
// if this is a shard request, but there is no facet state, then don't do anything.
return;
}
}
// At this point, we know we need to do something. Create and save the state.
rb.setNeedDocSet(true);
// Parse the facet in the prepare phase?
FacetParser parser = new FacetTopParser(rb.req);
FacetRequest facetRequest = null;
try {
facetRequest = parser.parse(jsonFacet);
} catch (SyntaxError syntaxError) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, syntaxError);
}
FacetComponentState fcState = new FacetComponentState();
fcState.rb = rb;
fcState.isShard = isShard;
fcState.facetCommands = jsonFacet;
fcState.facetRequest = facetRequest;
rb.req.getContext().put(FacetComponentState.class, fcState);
}
private void clearFaceting(List<ShardRequest> outgoing) {
// turn off faceting for requests not marked as being for faceting refinements
for (ShardRequest sreq : outgoing) {
if ((sreq.purpose & PURPOSE_REFINE_JSON_FACETS) != 0) continue;
sreq.params.remove("json.facet"); // this just saves space... the presence of FACET_STATE really control the faceting
sreq.params.remove(FACET_STATE);
sreq.params.remove("json.facet"); // this just saves space... the presence of FACET_INFO is enough to control the faceting
sreq.params.remove(FACET_INFO);
}
}
@ -215,16 +227,15 @@ public class FacetModule extends SearchComponent {
// don't request any documents
shardsRefineRequest.params.remove(CommonParams.START);
shardsRefineRequest.params.set(CommonParams.ROWS, "0");
shardsRefineRequest.params.set(CommonParams.ROWS, "0");
shardsRefineRequest.params.set(FacetParams.FACET, false);
}
shardsRefineRequest.purpose |= PURPOSE_REFINE_JSON_FACETS;
Map<String,Object> fstate = new HashMap<>(1);
fstate.put(FACET_REFINE, refinement);
String fstateString = JSONUtil.toJSON(fstate);
shardsRefineRequest.params.add(FACET_STATE, fstateString);
Map<String,Object> finfo = new HashMap<>(1);
finfo.put(FACET_REFINE, refinement);
String finfoStr = JSONUtil.toJSON(finfo);
shardsRefineRequest.params.add(FACET_INFO, finfoStr);
if (newRequest) {
rb.addRequest(this, shardsRefineRequest);
@ -242,12 +253,12 @@ public class FacetModule extends SearchComponent {
if ((sreq.purpose & ShardRequest.PURPOSE_GET_TOP_IDS) != 0) {
sreq.purpose |= FacetModule.PURPOSE_GET_JSON_FACETS;
sreq.params.set(FACET_STATE, "{}"); // The presence of FACET_STATE (_facet_) turns on json faceting
sreq.params.set(FACET_INFO, "{}"); // The presence of FACET_INFO (_facet_) turns on json faceting
} else {
// turn off faceting on other requests
/*** distributedProcess will need to use other requests for refinement
sreq.params.remove("json.facet"); // this just saves space... the presence of FACET_STATE really control the faceting
sreq.params.remove(FACET_STATE);
sreq.params.remove("json.facet"); // this just saves space... the presence of FACET_INFO really control the faceting
sreq.params.remove(FACET_INFO);
**/
}
}
@ -267,6 +278,18 @@ public class FacetModule extends SearchComponent {
facetState.merger = facetState.facetRequest.createFacetMerger(facet);
facetState.mcontext = new FacetMerger.Context( sreq.responses.size() );
}
if ((sreq.purpose & PURPOSE_REFINE_JSON_FACETS) != 0) {
System.err.println("REFINE FACET RESULT FROM SHARD = " + facet);
// call merge again with a diff flag set on the context???
// throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "WORK IN PROGRESS, MERGING FACET REFINEMENT NOT SUPPORTED YET!");
facetState.mcontext.root = facet;
facetState.mcontext.setShard(shardRsp.getShard()); // TODO: roll newShard into setShard?
facetState.merger.merge(facet , facetState.mcontext);
return;
}
facetState.mcontext.root = facet;
facetState.mcontext.newShard(shardRsp.getShard());
facetState.merger.merge(facet , facetState.mcontext);
@ -304,11 +327,15 @@ public class FacetModule extends SearchComponent {
}
// TODO: perhaps factor out some sort of root/parent facet object that doesn't depend
// on stuff like ResponseBuilder, but contains request parameters,
// root filter lists (for filter exclusions), etc?
class FacetComponentState {
ResponseBuilder rb;
Map<String,Object> facetCommands;
FacetRequest facetRequest;
boolean isShard;
Map<String,Object> facetInfo; // _facet_ param: contains out-of-band facet info, mainly for refinement requests
//
// Only used for distributed search

View File

@ -366,10 +366,13 @@ public abstract class FacetProcessor<FacetRequestT extends FacetRequest> {
}
}
void fillBucket(SimpleOrderedMap<Object> bucket, Query q, DocSet result) throws IOException {
// TODO: rather than just have a raw "response", perhaps we should model as a bucket object that contains the response plus extra info?
void fillBucket(SimpleOrderedMap<Object> bucket, Query q, DocSet result, boolean skip) throws IOException {
// TODO: we don't need the DocSet if we've already calculated everything during the first phase
boolean needDocSet = freq.getFacetStats().size() > 0 || freq.getSubFacets().size() > 0;
// TODO: always collect counts or not???
// TODO: put info in for the merger (like "skip=true"?) Maybe we don't need to if we leave out all extraneous info?
int count;
@ -382,7 +385,7 @@ public abstract class FacetProcessor<FacetRequestT extends FacetRequest> {
} else {
result = fcontext.searcher.getDocSet(q, fcontext.base);
}
count = result.size();
count = result.size(); // don't really need this if we are skipping, but it's free.
} else {
if (q == null) {
count = fcontext.base.size();
@ -392,8 +395,10 @@ public abstract class FacetProcessor<FacetRequestT extends FacetRequest> {
}
try {
processStats(bucket, result, count);
processSubs(bucket, q, result);
if (!skip) {
processStats(bucket, result, count);
}
processSubs(bucket, q, result, skip);
} finally {
if (result != null) {
// result.decref(); // OFF-HEAP
@ -402,7 +407,7 @@ public abstract class FacetProcessor<FacetRequestT extends FacetRequest> {
}
}
void processSubs(SimpleOrderedMap<Object> response, Query filter, DocSet domain) throws IOException {
void processSubs(SimpleOrderedMap<Object> response, Query filter, DocSet domain, boolean skip) throws IOException {
boolean emptyDomain = domain == null || domain.size() == 0;
@ -417,8 +422,18 @@ public abstract class FacetProcessor<FacetRequestT extends FacetRequest> {
continue;
}
Map<String,Object>facetInfoSub = null;
if (fcontext.facetInfo != null) {
facetInfoSub = (Map<String,Object>)fcontext.facetInfo.get(sub.getKey());
}
// If we're skipping this node, then we only need to process sub-facets that have facet info specified.
if (skip && facetInfoSub == null) continue;
// make a new context for each sub-facet since they can change the domain
FacetContext subContext = fcontext.sub(filter, domain);
subContext.facetInfo = facetInfoSub;
if (!skip) subContext.flags &= ~FacetContext.SKIP_FACET; // turn off the skip flag if we're not skipping this bucket
FacetProcessor subProcessor = subRequest.createFacetProcessor(subContext);
if (fcontext.getDebugInfo() != null) { // if fcontext.debugInfo != null, it means rb.debug() == true

View File

@ -56,8 +56,12 @@ class FacetQueryProcessor extends FacetProcessor<FacetQuery> {
@Override
public void process() throws IOException {
super.process();
if (fcontext.facetInfo != null) {
// FIXME - what needs to be done here?
}
response = new SimpleOrderedMap<>();
fillBucket(response, freq.q, null);
fillBucket(response, freq.q, null, (fcontext.flags & FacetContext.SKIP_FACET)!=0);
}

View File

@ -350,7 +350,7 @@ class FacetRangeProcessor extends FacetProcessor<FacetRange> {
if (freq.getSubFacets().size() > 0) {
DocSet subBase = intersections[slot];
try {
processSubs(bucket, filters[slot], subBase);
processSubs(bucket, filters[slot], subBase, false);
} finally {
// subContext.base.decref(); // OFF-HEAP
// subContext.base = null; // do not modify context after creation... there may be deferred execution (i.e. streaming)
@ -367,7 +367,7 @@ class FacetRangeProcessor extends FacetProcessor<FacetRange> {
}
Query rangeQ = sf.getType().getRangeQuery(null, sf, range.low == null ? null : calc.formatValue(range.low), range.high==null ? null : calc.formatValue(range.high), range.includeLower, range.includeUpper);
fillBucket(bucket, rangeQ, null);
fillBucket(bucket, rangeQ, null, false);
return bucket;
}

View File

@ -168,7 +168,10 @@ public abstract class FacetRequest {
class FacetContext {
// Context info for actually executing a local facet command
public static final int IS_SHARD=0x01;
public static final int IS_REFINEMENT=0x02;
public static final int SKIP_FACET=0x04; // refinement: skip calculating this immediate facet, but proceed to specific sub-facets based on facetInfo
Map<String,Object> facetInfo; // refinement info for this node
QueryContext qcontext;
SolrQueryRequest req; // TODO: replace with params?
SolrIndexSearcher searcher;

View File

@ -18,9 +18,12 @@
package org.apache.solr.search.facet;
import java.io.IOException;
import java.util.List;
import org.apache.solr.JSONTestUtil;
import org.apache.solr.SolrTestCaseHS;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.request.SolrQueryRequest;
import org.junit.AfterClass;
@ -209,6 +212,97 @@ public class TestJsonFacetRefinement extends SolrTestCaseHS {
}
@Test
public void testBasicRefinement() throws Exception {
initServers();
Client client = servers.getClient(random().nextInt());
client.queryDefaults().set( "shards", servers.getShards(), "debugQuery", Boolean.toString(random().nextBoolean()) );
List<SolrClient> clients = client.getClientProvider().all();
assertTrue(clients.size() >= 3);
client.deleteByQuery("*:*", null);
ModifiableSolrParams p = params("cat_s", "cat_s", "num_d", "num_d");
String cat_s = p.get("cat_s");
String num_d = p.get("num_d");
clients.get(0).add( sdoc("id", "01", cat_s, "A", num_d, -1) ); // A wins count tie
clients.get(0).add( sdoc("id", "02", cat_s, "B", num_d, 3) );
clients.get(1).add( sdoc("id", "11", cat_s, "B", num_d, -5) ); // B highest count
clients.get(1).add( sdoc("id", "12", cat_s, "B", num_d, -11) );
clients.get(1).add( sdoc("id", "13", cat_s, "A", num_d, 7) );
clients.get(2).add( sdoc("id", "21", cat_s, "A", num_d, 17) ); // A highest count
clients.get(2).add( sdoc("id", "22", cat_s, "A", num_d, -19) );
clients.get(2).add( sdoc("id", "23", cat_s, "B", num_d, 11) );
client.commit();
// Shard responses should be A=1, B=2, A=2, merged should be "A=3, B=2"
// One shard will have _facet_={"refine":{"cat0":{"_l":["A"]}}} on the second phase
/****
// fake a refinement request... good for development/debugging
assertJQ(clients.get(1),
params(p, "q", "*:*", "_facet_","{refine:{cat0:{_l:[A]}}}", "isShard","true", "distrib","false", "shards.purpose","2097216", "ids","11,12,13",
"json.facet", "{" +
"cat0:{type:terms, field:cat_s, sort:'count desc', limit:1, overrequest:0, refine:true}" +
"}"
)
, "facets=={foo:555}"
);
****/
client.testJQ(params(p, "q", "*:*",
"json.facet", "{" +
"cat0:{type:terms, field:${cat_s}, sort:'count desc', limit:1, overrequest:0, refine:false}" +
"}"
)
, "facets=={ count:8" +
", cat0:{ buckets:[ {val:A,count:3} ] }" + // w/o overrequest and refinement, count is lower than it should be (we don't see the A from the middle shard)
"}"
);
client.testJQ(params(p, "q", "*:*",
"json.facet", "{" +
"cat0:{type:terms, field:${cat_s}, sort:'count desc', limit:1, overrequest:0, refine:true}" +
"}"
)
, "facets=={ count:8" +
", cat0:{ buckets:[ {val:A,count:4} ] }" + // w/o overrequest, we need refining to get the correct count.
"}"
);
// test that basic stats work for refinement
client.testJQ(params(p, "q", "*:*",
"json.facet", "{" +
"cat0:{type:terms, field:${cat_s}, sort:'count desc', limit:1, overrequest:0, refine:true, facet:{ stat1:'sum(${num_d})'} }" +
"}"
)
, "facets=={ count:8" +
", cat0:{ buckets:[ {val:A,count:4, stat1:4.0} ] }" +
"}"
);
// test sorting buckets by a different stat
client.testJQ(params(p, "q", "*:*",
"json.facet", "{" +
" cat0:{type:terms, field:${cat_s}, sort:'min1 asc', limit:1, overrequest:0, refine:false, facet:{ min1:'min(${num_d})'} }" +
",cat1:{type:terms, field:${cat_s}, sort:'min1 asc', limit:1, overrequest:0, refine:true, facet:{ min1:'min(${num_d})'} }" +
",sum1:'sum(num_d)'" + // make sure that root bucket stats aren't affected by refinement
"}"
)
, "facets=={ count:8" +
", cat0:{ buckets:[ {val:A,count:3, min1:-19.0} ] }" + // B wins in shard2, so we're missing the "A" count for that shar w/o refinement.
", cat1:{ buckets:[ {val:A,count:4, min1:-19.0} ] }" + // with refinement, we get the right count
", sum1:2.0" +
"}"
);
}
}