From 7fbd55f67949836ca051333937e8b4a7acb231ec Mon Sep 17 00:00:00 2001 From: Erick Erickson Date: Mon, 8 Sep 2014 16:34:23 +0000 Subject: [PATCH] SOLR-6187: facet.mincount ignored in range faceting using distributed search git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1623429 13f79535-47bb-0310-9956-ffa450edef68 --- solr/CHANGES.txt | 4 + .../handler/component/FacetComponent.java | 169 +++++++++++++++--- .../apache/solr/TestDistributedSearch.java | 122 ++++++++++++- 3 files changed, 267 insertions(+), 28 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 1cdee048ee9..ad4beb62026 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -179,6 +179,10 @@ Bug Fixes * SOLR-6467: bin/solr script should direct stdout/stderr when starting in the background to the solr-PORT-console.log in the logs directory instead of bin. (Timothy Potter) +* SOLR-6187: SOLR-6154: facet.mincount ignored in range faceting using distributed search + NOTE: This does NOT fixed for the (deprecated) facet.date idiom, use facet.range + instead. (Erick Erickson, Zacchio Bagnati, Ronald Matamoros, Vamsee Yalargadda) + Other Changes --------------------- diff --git a/solr/core/src/java/org/apache/solr/handler/component/FacetComponent.java b/solr/core/src/java/org/apache/solr/handler/component/FacetComponent.java index 305f25a8ef2..5a9b86ec492 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/FacetComponent.java +++ b/solr/core/src/java/org/apache/solr/handler/component/FacetComponent.java @@ -135,7 +135,7 @@ public class FacetComponent extends SearchComponent { // only other required phase). // We do this in distributedProcess so we can look at all of the // requests in the outgoing queue at once. - + for (int shardNum = 0; shardNum < rb.shards.length; shardNum++) { List distribFieldFacetRefinements = null; @@ -164,7 +164,7 @@ public class FacetComponent extends SearchComponent { if (distribFieldFacetRefinements == null) { distribFieldFacetRefinements = new ArrayList<>(); } - + distribFieldFacetRefinements.add(facetCommand); distribFieldFacetRefinements.add(termsKey); distribFieldFacetRefinements.add(termsVal); @@ -175,7 +175,7 @@ public class FacetComponent extends SearchComponent { if (distribFieldFacetRefinements == null && !pivotFacetRefinementRequestsExistForShard) { - // nothing to refine, short circut out + // nothing to refine, short circuit out continue; } @@ -214,22 +214,22 @@ public class FacetComponent extends SearchComponent { shardsRefineRequest.params.set(FacetParams.FACET, "true"); shardsRefineRequest.params.remove(FacetParams.FACET_FIELD); shardsRefineRequest.params.remove(FacetParams.FACET_QUERY); - + for (int i = 0; i < distribFieldFacetRefinements.size();) { String facetCommand = distribFieldFacetRefinements.get(i++); String termsKey = distribFieldFacetRefinements.get(i++); String termsVal = distribFieldFacetRefinements.get(i++); - + shardsRefineRequest.params.add(FacetParams.FACET_FIELD, facetCommand); shardsRefineRequest.params.set(termsKey, termsVal); } } - + if (newRequest) { rb.addRequest(this, shardsRefineRequest); } - + // PivotFacetAdditions if (pivotFacetRefinementRequestsExistForShard) { if (newRequest) { @@ -314,6 +314,8 @@ public class FacetComponent extends SearchComponent { } modifyRequestForFieldFacets(rb, sreq, fi); + + modifyRequestForRangeFacets(sreq, fi); modifyRequestForPivotFacets(rb, sreq, fi.pivotFacets); @@ -326,7 +328,24 @@ public class FacetComponent extends SearchComponent { // we could optionally remove faceting params } } - + + // we must get all the range buckets back in order to have coherent lists at the end, see SOLR-6154 + private void modifyRequestForRangeFacets(ShardRequest sreq, FacetInfo fi) { + // Collect all the range fields. + if (sreq.params.getParams(FacetParams.FACET_RANGE) == null) { + return; + } + List rangeFields = new ArrayList<>(); + for (String field : sreq.params.getParams(FacetParams.FACET_RANGE)) { + rangeFields.add(field); + } + + for (String field : rangeFields) { + sreq.params.remove("f." + field + ".facet.mincount"); + sreq.params.add("f." + field + ".facet.mincount", "0"); + } + } + private void modifyRequestForFieldFacets(ResponseBuilder rb, ShardRequest sreq, FacetInfo fi) { for (DistribFieldFacet dff : fi.facets.values()) { @@ -372,7 +391,7 @@ public class FacetComponent extends SearchComponent { dff.initialMincount = (int) Math.ceil((double) dff.minCount / rb.slices.length); } } - + // Currently this is for testing only and allows overriding of the // facet.limit set to the shards dff.initialLimit = rb.req.getParams().getInt("facet.shard.limit", dff.initialLimit); @@ -517,7 +536,7 @@ public class FacetComponent extends SearchComponent { dff.add(shardNum, (NamedList) facet_fields.get(dff.getKey()), dff.initialLimit); } } - + // Distributed facet_dates doDistribDates(fi, facet_counts); @@ -546,14 +565,14 @@ public class FacetComponent extends SearchComponent { for (DistribFieldFacet dff : fi.facets.values()) { // no need to check these facets for refinement if (dff.initialLimit <= 0 && dff.initialMincount <= 1) continue; - + // only other case where index-sort doesn't need refinement is if minCount==0 if (dff.minCount <= 1 && dff.sort.equals(FacetParams.FACET_SORT_INDEX)) continue; - + @SuppressWarnings("unchecked") // generic array's are annoying List[] tmp = (List[]) new List[rb.shards.length]; dff._toRefine = tmp; - + ShardFacetCount[] counts = dff.getCountSorted(); int ntop = Math.min(counts.length, dff.limit >= 0 ? dff.offset + dff.limit : Integer.MAX_VALUE); @@ -562,7 +581,7 @@ public class FacetComponent extends SearchComponent { for (int i = 0; i < counts.length; i++) { ShardFacetCount sfc = counts[i]; boolean needRefinement = false; - + if (i < ntop) { // automatically flag the top values for refinement // this should always be true for facet.sort=index @@ -586,14 +605,14 @@ public class FacetComponent extends SearchComponent { needRefinement = true; } } - + if (needRefinement) { // add a query for each shard missing the term that needs refinement for (int shardNum = 0; shardNum < rb.shards.length; shardNum++) { FixedBitSet fbs = dff.counted[shardNum]; // fbs can be null if a shard request failed - if (fbs != null && - (sfc.termNum >= fbs.length() || !fbs.get(sfc.termNum)) && + if (fbs != null && + (sfc.termNum >= fbs.length() || !fbs.get(sfc.termNum)) && dff.maxPossible(sfc, shardNum) > 0) { dff.needRefinements = true; @@ -607,6 +626,100 @@ public class FacetComponent extends SearchComponent { } } } + removeFieldFacetsUnderLimits(rb); + removeRangeFacetsUnderLimits(rb); + removeQueryFacetsUnderLimits(rb); + + } + + private void removeQueryFacetsUnderLimits(ResponseBuilder rb) { + if (rb.stage != ResponseBuilder.STAGE_EXECUTE_QUERY) { + return; + } + FacetInfo fi = rb._facetInfo; + Map query_facets = fi.queryFacets; + if (query_facets == null) { + return; + } + LinkedHashMap newQueryFacets = new LinkedHashMap<>(); + + // The + int minCount = rb.req.getParams().getInt(FacetParams.FACET_MINCOUNT, 0); + boolean replace = false; + for (Map.Entry ent : query_facets.entrySet()) { + if (ent.getValue().count >= minCount) { + newQueryFacets.put(ent.getKey(), ent.getValue()); + } else { + log.trace("Removing facetQuery/key: " + ent.getKey() + "/" + ent.getValue().toString() + " mincount=" + minCount); + replace = true; + } + } + if (replace) { + fi.queryFacets = newQueryFacets; + } + } + + private void removeRangeFacetsUnderLimits(ResponseBuilder rb) { + if (rb.stage != ResponseBuilder.STAGE_EXECUTE_QUERY) { + return; + } + + FacetInfo fi = rb._facetInfo; + + @SuppressWarnings("unchecked") + SimpleOrderedMap> facet_ranges = + (SimpleOrderedMap>) + fi.rangeFacets; + + if (facet_ranges == null) { + return; + } + + // go through each facet_range + for (Map.Entry> entry : facet_ranges) { + boolean replace = false; + final String field = entry.getKey(); + int minCount = rb.req.getParams().getFieldInt(field, FacetParams.FACET_MINCOUNT, 0); + if (minCount == 0) { + continue; + } + + @SuppressWarnings("unchecked") + NamedList vals + = (NamedList) facet_ranges.get(field).get("counts"); + NamedList newList = new NamedList(); + for (Map.Entry pair : vals) { + if (pair.getValue() >= minCount) { + newList.add(pair.getKey(), pair.getValue()); + } else { + log.trace("Removing facet/key: " + pair.getKey() + "/" + pair.getValue().toString() + " mincount=" + minCount); + replace = true; + } + } + if (replace) { + vals.clear(); + vals.addAll(newList); + } + } + } + private void removeFieldFacetsUnderLimits(ResponseBuilder rb) { + if (rb.stage != ResponseBuilder.STAGE_DONE) { + return; + } + + FacetInfo fi = rb._facetInfo; + if (fi.facets == null) { + return; + } + // Do field facets + for (Entry ent : fi.facets.entrySet()) { + String field = ent.getKey(); + int minCount = rb.req.getParams().getFieldInt(field, FacetParams.FACET_MINCOUNT, 0); + if (minCount == 0) { // return them all + continue; + } + ent.getValue().respectMinCount(minCount); + } } // The implementation below uses the first encountered shard's @@ -763,7 +876,7 @@ public class FacetComponent extends SearchComponent { private void refineFacets(ResponseBuilder rb, ShardRequest sreq) { FacetInfo fi = rb._facetInfo; - + for (ShardResponse srsp : sreq.responses) { // int shardNum = rb.getShardNum(srsp.shard); NamedList facet_counts = (NamedList) srsp.getSolrResponse().getResponse().get("facet_counts"); @@ -775,7 +888,7 @@ public class FacetComponent extends SearchComponent { String key = facet_fields.getName(i); DistribFieldFacet dff = fi.facets.get(key); if (dff == null) continue; - + NamedList shardCounts = (NamedList) facet_fields.getVal(i); for (int j = 0; j < shardCounts.size(); j++) { @@ -1179,7 +1292,7 @@ public class FacetComponent extends SearchComponent { int numReceived = sz; FixedBitSet terms = new FixedBitSet(termNum + sz); - + long last = 0; for (int i = 0; i < sz; i++) { String name = shardCounts.getName(i); @@ -1248,6 +1361,22 @@ public class FacetComponent extends SearchComponent { // TODO: could store the last term in the shard to tell if this term // comes before or after it. If it comes before, we could subtract 1 } + + public void respectMinCount(long minCount) { + HashMap newOne = new HashMap<>(); + boolean replace = false; + for (Map.Entry ent : counts.entrySet()) { + if (ent.getValue().count >= minCount) { + newOne.put(ent.getKey(), ent.getValue()); + } else { + log.trace("Removing facet/key: " + ent.getKey() + "/" + ent.getValue().toString() + " mincount=" + minCount); + replace = true; + } + } + if (replace) { + counts = newOne; + } + } } /** diff --git a/solr/core/src/test/org/apache/solr/TestDistributedSearch.java b/solr/core/src/test/org/apache/solr/TestDistributedSearch.java index 2348591e8a2..42a6542dac7 100644 --- a/solr/core/src/test/org/apache/solr/TestDistributedSearch.java +++ b/solr/core/src/test/org/apache/solr/TestDistributedSearch.java @@ -28,7 +28,9 @@ import org.apache.solr.client.solrj.SolrServer; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.embedded.JettySolrRunner; import org.apache.solr.client.solrj.impl.HttpSolrServer; +import org.apache.solr.client.solrj.response.FacetField; import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.client.solrj.response.RangeFacet; import org.apache.solr.cloud.ChaosMonkey; import org.apache.solr.common.SolrException; import org.apache.solr.common.params.CommonParams; @@ -66,14 +68,14 @@ public class TestDistributedSearch extends BaseDistributedSearchTestCase { del("*:*"); - indexr(id,1, i1, 100, tlong, 100,t1,"now is the time for all good men", + indexr(id,1, i1, 100, tlong, 100,t1,"now is the time for all good men", tdate_a, "2010-04-20T11:00:00Z", tdate_b, "2009-08-20T11:00:00Z", "foo_f", 1.414f, "foo_b", "true", "foo_d", 1.414d); - indexr(id,2, i1, 50 , tlong, 50,t1,"to come to the aid of their country.", + indexr(id,2, i1, 50 , tlong, 50,t1,"to come to the aid of their country.", tdate_a, "2010-05-02T11:00:00Z", tdate_b, "2009-11-02T11:00:00Z"); - indexr(id,3, i1, 2, tlong, 2,t1,"how now brown cow", + indexr(id,3, i1, 2, tlong, 2,t1,"how now brown cow", tdate_a, "2010-05-03T11:00:00Z"); indexr(id,4, i1, -100 ,tlong, 101, t1,"the quick fox jumped over the lazy dog", @@ -175,13 +177,13 @@ public class TestDistributedSearch extends BaseDistributedSearchTestCase { // a facet query to test out chars out of the ascii range query("q","*:*", "rows",0, "facet","true", "facet.query","{!term f=foo_s}international\u00ff\u01ff\u2222\u3333"); - + // simple field facet on date fields - rsp = query("q","*:*", "rows", 0, + rsp = query("q","*:*", "rows", 0, "facet","true", "facet.limit", 1, // TODO: limit shouldn't be needed: SOLR-6386 "facet.field", tdate_a); assertEquals(1, rsp.getFacetFields().size()); - rsp = query("q","*:*", "rows", 0, + rsp = query("q","*:*", "rows", 0, "facet","true", "facet.limit", 1, // TODO: limit shouldn't be needed: SOLR-6386 "facet.field", tdate_b, "facet.field", tdate_a); assertEquals(2, rsp.getFacetFields().size()); @@ -225,7 +227,85 @@ public class TestDistributedSearch extends BaseDistributedSearchTestCase { "facet.range.start",200, "facet.range.gap",100, "f."+tlong+".facet.range.end",900); - + + // Test mincounts. Do NOT want to go through all the stuff where with validateControlData in query() method + // Purposely packing a _bunch_ of stuff together here to insure that the proper level of mincount is used for + // each + ModifiableSolrParams minParams = new ModifiableSolrParams(); + minParams.set("q","*:*"); + minParams.set("rows", 1); + minParams.set("facet", "true"); + minParams.set("facet.missing", "true"); + minParams.set("facet.field", i1); + minParams.set("facet.missing", "true"); + minParams.set("facet.mincount", 2); + + // Return a separate section of ranges over i1. Should respect global range mincount + minParams.set("facet.range", i1); + minParams.set("f." + i1 + ".facet.range.start", 0); + minParams.set("f." + i1 + ".facet.range.gap", 200); + minParams.set("f." + i1 + ".facet.range.end", 1200); + minParams.set("f." + i1 + ".facet.mincount", 4); + + + // Return a separate section of ranges over tlong Should respect facet.mincount + minParams.add("facet.range", tlong); + minParams.set("f." + tlong + ".facet.range.start", 0); + minParams.set("f." + tlong + ".facet.range.gap", 100); + minParams.set("f." + tlong + ".facet.range.end", 1200); + // Repeat with a range type of date + minParams.add("facet.range", tdate_b); + minParams.set("f." + tdate_b + ".facet.range.start", "2009-02-01T00:00:00Z"); + minParams.set("f." + tdate_b + ".facet.range.gap", "+1YEAR"); + minParams.set("f." + tdate_b + ".facet.range.end", "2011-01-01T00:00:00Z"); + minParams.set("f." + tdate_b + ".facet.mincount", 3); + + // Insure that global mincount is respected for facet queries + minParams.set("facet.query", tdate_a + ":[2010-01-01T00:00:00Z TO 2011-01-01T00:00:00Z]"); // Should return some counts + //minParams.set("facet.query", tdate_a + ":[* TO *]"); // Should be removed + minParams.add("facet.query", tdate_b + ":[2008-01-01T00:00:00Z TO 2009-09-01T00:00:00Z]"); // Should be removed from response + + + setDistributedParams(minParams); + QueryResponse minResp = queryServer(minParams); + + ModifiableSolrParams eParams = new ModifiableSolrParams(); + eParams.set("q",tdate_b + ":[* TO *]"); + eParams.set("rows", 1000); + eParams.set("fl", tdate_b); + setDistributedParams(eParams); + QueryResponse eResp = queryServer(eParams); + + // Check that exactly the right numbers of counts came through + assertEquals("Should be exactly 2 range facets returned after minCounts taken into account ", 3, minResp.getFacetRanges().size()); + assertEquals("Should only be 1 query facets returned after minCounts taken into account ", 1, minResp.getFacetQuery().size()); + + checkMinCountsField(minResp.getFacetField(i1).getValues(), new Object[]{null, 55L}); // Should just be the null entries for field + + checkMinCountsRange(minResp.getFacetRanges().get(0).getCounts(), new Object[]{"0", 5L}); // range on i1 + checkMinCountsRange(minResp.getFacetRanges().get(1).getCounts(), new Object[]{"0", 3L, "100", 3L}); // range on tlong + checkMinCountsRange(minResp.getFacetRanges().get(2).getCounts(), new Object[]{"2009-02-01T00:00:00Z", 3L}); // date (range) on tvh + + assertTrue("Should have a facet for tdate_a", minResp.getFacetQuery().containsKey("a_n_tdt:[2010-01-01T00:00:00Z TO 2011-01-01T00:00:00Z]")); + int qCount = minResp.getFacetQuery().get("a_n_tdt:[2010-01-01T00:00:00Z TO 2011-01-01T00:00:00Z]"); + assertEquals("tdate_a should be 5", qCount, 5); + + // Now let's do some queries, the above is getting too complex + minParams = new ModifiableSolrParams(); + minParams.set("q","*:*"); + minParams.set("rows", 1); + minParams.set("facet", "true"); + minParams.set("facet.mincount", 3); + + minParams.set("facet.query", tdate_a + ":[2010-01-01T00:00:00Z TO 2010-05-04T00:00:00Z]"); + minParams.add("facet.query", tdate_b + ":[2009-01-01T00:00:00Z TO 2010-01-01T00:00:00Z]"); // Should be removed + setDistributedParams(minParams); + minResp = queryServer(minParams); + + assertEquals("Should only be 1 query facets returned after minCounts taken into account ", 1, minResp.getFacetQuery().size()); + assertTrue("Should be an entry for a_n_tdt", minResp.getFacetQuery().containsKey("a_n_tdt:[2010-01-01T00:00:00Z TO 2010-05-04T00:00:00Z]")); + qCount = minResp.getFacetQuery().get("a_n_tdt:[2010-01-01T00:00:00Z TO 2010-05-04T00:00:00Z]"); + assertEquals("a_n_tdt should have a count of 4 ", qCount, 4); // variations of fl query("q","*:*", "fl","score","sort",i1 + " desc"); query("q","*:*", "fl",i1 + ",score","sort",i1 + " desc"); @@ -454,7 +534,33 @@ public class TestDistributedSearch extends BaseDistributedSearchTestCase { } } } - + + protected void checkMinCountsField(List counts, Object[] pairs) { + assertEquals("There should be exactly " + pairs.length / 2 + " returned counts. There were: " + counts.size(), counts.size(), pairs.length / 2); + assertTrue("Variable len param must be an even number, it was: " + pairs.length, (pairs.length % 2) == 0); + for (int pairs_idx = 0, counts_idx = 0; pairs_idx < pairs.length; pairs_idx += 2, counts_idx++) { + String act_name = counts.get(counts_idx).getName(); + long act_count = counts.get(counts_idx).getCount(); + String exp_name = (String) pairs[pairs_idx]; + long exp_count = (long) pairs[pairs_idx + 1]; + assertEquals("Expected ordered entry " + exp_name + " at position " + counts_idx + " got " + act_name, act_name, exp_name); + assertEquals("Expected count for entry: " + exp_name + " at position " + counts_idx + " got " + act_count, act_count, exp_count); + } + } + + protected void checkMinCountsRange(List counts, Object[] pairs) { + assertEquals("There should be exactly " + pairs.length / 2 + " returned counts. There were: " + counts.size(), counts.size(), pairs.length / 2); + assertTrue("Variable len param must be an even number, it was: " + pairs.length, (pairs.length % 2) == 0); + for (int pairs_idx = 0, counts_idx = 0; pairs_idx < pairs.length; pairs_idx += 2, counts_idx++) { + String act_name = counts.get(counts_idx).getValue(); + long act_count = counts.get(counts_idx).getCount(); + String exp_name = (String) pairs[pairs_idx]; + long exp_count = (long) pairs[pairs_idx + 1]; + assertEquals("Expected ordered entry " + exp_name + " at position " + counts_idx + " got " + act_name, act_name, exp_name); + assertEquals("Expected count for entry: " + exp_name + " at position " + counts_idx + " got " + act_count, act_count, exp_count); + } + } + protected void queryPartialResults(final List upShards, final List upClients, Object... q) throws Exception {