From c5ff4a4444fe95001bc1baf29f64fb2406fd2ca3 Mon Sep 17 00:00:00 2001 From: Chris Hostetter Date: Fri, 2 Nov 2018 20:32:32 -0700 Subject: [PATCH] SOLR-12954: fix facet.pivot refinement bugs when using facet.sort=index and facet.mincount>1 --- solr/CHANGES.txt | 2 + .../handler/component/FacetComponent.java | 3 +- .../PivotFacetFieldValueCollection.java | 34 +++++-- .../DistributedFacetPivotLargeTest.java | 88 +++++++++++++++++++ 4 files changed, 119 insertions(+), 8 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 6f8c449eae2..90cce5b27f9 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -233,6 +233,8 @@ Bug Fixes * SOLR-12875: fix ArrayIndexOutOfBoundsException when unique(field) or uniqueBlock(_root_) is used with DVHASH method in json.facet. (Tim Underwood via Mikhail Khludnev) +* SOLR-12954: fix facet.pivot refinement bugs when using facet.sort=index and facet.mincount>1 (hossman) + Improvements ---------------------- diff --git a/solr/core/src/java/org/apache/solr/handler/component/FacetComponent.java b/solr/core/src/java/org/apache/solr/handler/component/FacetComponent.java index 01f88758f33..e01c9582f19 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/FacetComponent.java +++ b/solr/core/src/java/org/apache/solr/handler/component/FacetComponent.java @@ -650,7 +650,7 @@ public class FacetComponent extends SearchComponent { (fieldToOverRequest, FacetParams.FACET_SORT, defaultSort); int shardLimit = requestedLimit + offset; - int shardMinCount = requestedMinCount; + int shardMinCount = Math.min(requestedMinCount, 1); // per-shard mincount & overrequest if ( FacetParams.FACET_SORT_INDEX.equals(sort) && @@ -670,7 +670,6 @@ public class FacetComponent extends SearchComponent { if ( 0 < requestedLimit ) { shardLimit = doOverRequestMath(shardLimit, overRequestRatio, overRequestCount); } - shardMinCount = Math.min(requestedMinCount, 1); } sreq.params.set(paramStart + FacetParams.FACET_LIMIT, shardLimit); sreq.params.set(paramStart + FacetParams.FACET_PIVOT_MINCOUNT, shardMinCount); diff --git a/solr/core/src/java/org/apache/solr/handler/component/PivotFacetFieldValueCollection.java b/solr/core/src/java/org/apache/solr/handler/component/PivotFacetFieldValueCollection.java index 5c2b07f61d1..d9aeaeaef90 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/PivotFacetFieldValueCollection.java +++ b/solr/core/src/java/org/apache/solr/handler/component/PivotFacetFieldValueCollection.java @@ -118,14 +118,36 @@ public class PivotFacetFieldValueCollection implements Iterable */ public List getNextLevelValuesToRefine() { final int numRefinableValues = getExplicitValuesListSize(); - if (facetFieldOffset < numRefinableValues) { - final int offsetPlusCount = (facetFieldLimit >= 0) - ? Math.min(facetFieldLimit + facetFieldOffset, numRefinableValues) - : numRefinableValues; - return getExplicitValuesList().subList(facetFieldOffset, offsetPlusCount); - } else { + if (numRefinableValues < facetFieldOffset) { return Collections.emptyList(); } + + final int offsetPlusCount = (facetFieldLimit >= 0) + ? Math.min(facetFieldLimit + facetFieldOffset, numRefinableValues) + : numRefinableValues; + + if (1 < facetFieldMinimumCount && facetFieldSort.equals(FacetParams.FACET_SORT_INDEX)) { + // we have to skip any values that (still) don't meet the mincount + // + // TODO: in theory we could avoid this extra check by trimming sooner (SOLR-6331) + // but since that's a destructive op that blows away the `valuesMap` which we (might?) still need + // (and pre-emptively skips the offsets) we're avoiding re-working that optimization + // for now until/unless someone gives it more careful thought... + final List results = new ArrayList<>(numRefinableValues); + for (PivotFacetValue pivotValue : explicitValues) { + if (pivotValue.getCount() >= facetFieldMinimumCount) { + results.add(pivotValue); + if (numRefinableValues <= results.size()) { + break; + } + } + } + return results; + } + + // in the non "sort==count OR mincount==1" situation, we can just return the first N values + // because any viable candidate is already in the top N + return getExplicitValuesList().subList(facetFieldOffset, offsetPlusCount); } /** diff --git a/solr/core/src/test/org/apache/solr/handler/component/DistributedFacetPivotLargeTest.java b/solr/core/src/test/org/apache/solr/handler/component/DistributedFacetPivotLargeTest.java index eb6f54d5db4..cc311358b90 100644 --- a/solr/core/src/test/org/apache/solr/handler/component/DistributedFacetPivotLargeTest.java +++ b/solr/core/src/test/org/apache/solr/handler/component/DistributedFacetPivotLargeTest.java @@ -17,6 +17,7 @@ package org.apache.solr.handler.component; import java.io.IOException; +import java.util.Arrays; import java.util.Date; import java.util.List; @@ -264,6 +265,93 @@ public class DistributedFacetPivotLargeTest extends BaseDistributedSearchTestCas } } + // check of a single level pivot using sort=index w/mincount big enough + // to triggers per-shard mincount > num docs on one shard + // (beefed up test of same with nested pivot below) + for (int limit : Arrays.asList(4, 444444, -1)) { + SolrParams p = params("q", "*:*", + "rows", "0", + // skip place_s:Nplaceholder buckets + "fq","-hiredate_dt:\"2012-10-01T12:30:00Z\"", + // skip company_t:compHolderN buckets from twoShard + "fq","-(+company_t:compHolder* +real_b:true)", + "facet","true", + "facet.pivot","place_s", + FacetParams.FACET_PIVOT_MINCOUNT, "50", + FacetParams.FACET_LIMIT, ""+limit, + "facet.sort", "index"); + rsp = null; + try { + rsp = query( p ); + assertPivot("place_s", "cardiff", 107, rsp.getFacetPivot().get("place_s").get(0)); + // - zeroShard = 50 ... above per-shard min of 50/(numShards=4) + // - oneShard = 5 ... below per-shard min of 50/(numShards=4) .. should be refined + // - twoShard = 52 ... above per-shard min of 50/(numShards=4) + // = threeShard = 0 ... should be refined and still match nothing + } catch (AssertionError ae) { + throw new AssertionError(ae.getMessage() + ": " + p.toString() + " ==> " + rsp, ae); + } + } + + // test permutations of mincount & limit with sort=index + // (there is a per-shard optimization on mincount when sort=index is used) + for (int limit : Arrays.asList(4, 444444, -1)) { + SolrParams p = params("q", "*:*", + "rows", "0", + // skip place_s:Nplaceholder buckets + "fq","-hiredate_dt:\"2012-10-01T12:30:00Z\"", + // skip company_t:compHolderN buckets from twoShard + "fq","-(+company_t:compHolder* +real_b:true)", + "facet","true", + "facet.pivot","place_s,company_t", + FacetParams.FACET_PIVOT_MINCOUNT, "50", + FacetParams.FACET_LIMIT, ""+limit, + "facet.sort", "index"); + rsp = null; + try { + rsp = query( p ); + pivots = rsp.getFacetPivot().get("place_s,company_t"); + firstPlace = pivots.get(0); + assertPivot("place_s", "cardiff", 107, firstPlace); + // + assertPivot("company_t", "bbc", 101, firstPlace.getPivot().get(0)); + assertPivot("company_t", "honda", 50, firstPlace.getPivot().get(1)); + assertPivot("company_t", "microsoft", 56, firstPlace.getPivot().get(2)); + assertPivot("company_t", "polecat", 52, firstPlace.getPivot().get(3)); + } catch (AssertionError ae) { + throw new AssertionError(ae.getMessage() + ": " + p.toString() + " ==> " + rsp, ae); + } + } + + { // similar to the test above, but now force a restriction on the over request and allow + // terms that are early in index sort -- but don't meet the mincount overall -- to be considered + // in the first phase. (SOLR-12954) + SolrParams p = params("q", "*:*", + "rows", "0", + // skip company_t:compHolderN buckets from twoShard + "fq","-(+company_t:compHolder* +real_b:true)", + "facet","true", + "facet.pivot","place_s,company_t", + // the (50) Nplaceholder place_s values exist in 6 each on oneShard + FacetParams.FACET_PIVOT_MINCOUNT, ""+(6 * shardsArr.length), + FacetParams.FACET_LIMIT, "4", + "facet.sort", "index"); + rsp = null; + try { + rsp = query( p ); + pivots = rsp.getFacetPivot().get("place_s,company_t"); + firstPlace = pivots.get(0); + assertPivot("place_s", "cardiff", 107, firstPlace); + // + assertPivot("company_t", "bbc", 101, firstPlace.getPivot().get(0)); + assertPivot("company_t", "honda", 50, firstPlace.getPivot().get(1)); + assertPivot("company_t", "microsoft", 56, firstPlace.getPivot().get(2)); + assertPivot("company_t", "polecat", 52, firstPlace.getPivot().get(3)); + } catch (AssertionError ae) { + throw new AssertionError(ae.getMessage() + ": " + p.toString() + " ==> " + rsp, ae); + } + } + // Pivot Faceting (combined wtih Field Faceting) for (SolrParams facetParams : // with and w/o an excluded fq