LUCENE-7506: FastTaxonomyFacetCounts use ConjunctionDISI so cost is in proportion to size of intersected set of documents

2016-10-19 10:04:39 -04:00 · 2016-10-19 10:04:39 -04:00 · d03cc92b22
parent 731c5f9316
commit d03cc92b22
2 changed files with 24 additions and 22 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -40,6 +40,11 @@ Optimizations
  in the sets of SHOULD and FILTER clauses, or both in MUST/FILTER and MUST_NOT
  clauses. (Spyros Kapnissis via Adrien Grand, Uwe Schindler)

+* LUCENE-7506: FastTaxonomyFacetCounts should use CPU in proportion to
+  the size of the intersected set of hits from the query and documents
+  that have a facet value, so sparse faceting works as expected
+  (Adrien Grand via Mike McCandless)
+
 Other

 * LUCENE-7328: Remove LegacyNumericEncoding from GeoPointField. (Nick Knize)
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java
@ -17,12 +17,14 @@
 package org.apache.lucene.facet.taxonomy;

 import java.io.IOException;
+import java.util.Arrays;
 import java.util.List;

 import org.apache.lucene.facet.FacetsCollector.MatchingDocs;
 import org.apache.lucene.facet.FacetsCollector;
 import org.apache.lucene.facet.FacetsConfig;
 import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.search.ConjunctionDISI;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.util.BytesRef;

@ -55,29 +57,24 @@ public class FastTaxonomyFacetCounts extends IntTaxonomyFacets {
        continue;
      }

-      DocIdSetIterator docs = hits.bits.iterator();
+      DocIdSetIterator it = ConjunctionDISI.intersectIterators(Arrays.asList(
+          hits.bits.iterator(), dv));
      
-      int doc;
-      while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
-        if (dv.docID() < doc) {
-          dv.advance(doc);
-        }
-        if (dv.docID() == doc) {
-          final BytesRef bytesRef = dv.binaryValue();
-          byte[] bytes = bytesRef.bytes;
-          int end = bytesRef.offset + bytesRef.length;
-          int ord = 0;
-          int offset = bytesRef.offset;
-          int prev = 0;
-          while (offset < end) {
-            byte b = bytes[offset++];
-            if (b >= 0) {
-              prev = ord = ((ord << 7) | b) + prev;
-              ++values[ord];
-              ord = 0;
-            } else {
-              ord = (ord << 7) | (b & 0x7F);
-            }
+      for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+        final BytesRef bytesRef = dv.binaryValue();
+        byte[] bytes = bytesRef.bytes;
+        int end = bytesRef.offset + bytesRef.length;
+        int ord = 0;
+        int offset = bytesRef.offset;
+        int prev = 0;
+        while (offset < end) {
+          byte b = bytes[offset++];
+          if (b >= 0) {
+            prev = ord = ((ord << 7) | b) + prev;
+            ++values[ord];
+            ord = 0;
+          } else {
+            ord = (ord << 7) | (b & 0x7F);
          }
        }
      }