SOLR-8222: optimize method=dv faceting for counts

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1712608 13f79535-47bb-0310-9956-ffa450edef68
Yonik Seeley 2015-11-04 18:27:55 +00:00
parent b5b4292afc
commit 2bbbc4d42d
4 changed files with 303 additions and 152 deletions


@@ -346,6 +346,12 @@ Optimizations
* SOLR-7983: Utils.toUTF8 uses larger buffer than necessary for holding UTF8 data. (shalin)
* SOLR-8222: JSON Facet API optimization for faceting by count on docValues fields (or indexed fields
with method=dv) when there are multiple hits expected for enough buckets. For example, this
more than doubled the performance of faceting 5M documents over a field with 1M unique values.
(yonik)
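A representative request shape that exercises this path; the collection, facet, and field names below are illustrative, not taken from the commit:
curl http://localhost:8983/solr/mycollection/query -d 'q=*:*&json.facet={
  categories : { type:terms, field:cat, method:dv, limit:10 }
}'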
Other Changes
----------------------


@@ -73,6 +73,9 @@ public class FacetField extends FacetRequest {
FacetMethod method;
int cacheDf; // 0 means "default", -1 means "never cache"
// experimental - force perSeg collection when using dv method, currently for testing purposes only.
Boolean perSeg;
// TODO: put this somewhere more generic?
public static enum SortDirection {
asc(-1) ,
@@ -692,158 +695,6 @@ abstract class FacetFieldProcessorFCBase extends FacetFieldProcessor {
}
class FacetFieldProcessorDV extends FacetFieldProcessorFCBase {
static boolean unwrap_singleValued_multiDv = true; // only set to false for test coverage
boolean multiValuedField;
SortedSetDocValues si; // only used for term lookups (for both single and multi-valued)
MultiDocValues.OrdinalMap ordinalMap = null; // maps per-segment ords to global ords
public FacetFieldProcessorDV(FacetContext fcontext, FacetField freq, SchemaField sf) {
super(fcontext, freq, sf);
multiValuedField = sf.multiValued() || sf.getType().multiValuedFieldCache();
}
protected BytesRef lookupOrd(int ord) throws IOException {
return si.lookupOrd(ord);
}
protected void findStartAndEndOrds() throws IOException {
if (multiValuedField) {
si = FieldUtil.getSortedSetDocValues(fcontext.qcontext, sf, null);
if (si instanceof MultiDocValues.MultiSortedSetDocValues) {
ordinalMap = ((MultiDocValues.MultiSortedSetDocValues)si).mapping;
}
} else {
SortedDocValues single = FieldUtil.getSortedDocValues(fcontext.qcontext, sf, null);
si = DocValues.singleton(single); // multi-valued view
if (single instanceof MultiDocValues.MultiSortedDocValues) {
ordinalMap = ((MultiDocValues.MultiSortedDocValues)single).mapping;
}
}
if (si.getValueCount() >= Integer.MAX_VALUE) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Field has too many unique values. field=" + sf + " nterms= " + si.getValueCount());
}
if (prefixRef != null) {
startTermIndex = (int)si.lookupTerm(prefixRef.get());
if (startTermIndex < 0) startTermIndex = -startTermIndex - 1;
prefixRef.append(UnicodeUtil.BIG_TERM);
endTermIndex = (int)si.lookupTerm(prefixRef.get());
assert endTermIndex < 0;
endTermIndex = -endTermIndex - 1;
} else {
startTermIndex = 0;
endTermIndex = (int)si.getValueCount();
}
nTerms = endTermIndex - startTermIndex;
}
@Override
protected void collectDocs() throws IOException {
if (nTerms <= 0 || fcontext.base.size() < effectiveMincount) { // TODO: what about allBuckets? missing bucket?
return;
}
final List<LeafReaderContext> leaves = fcontext.searcher.getIndexReader().leaves();
Filter filter = fcontext.base.getTopFilter();
for (int subIdx = 0; subIdx < leaves.size(); subIdx++) {
LeafReaderContext subCtx = leaves.get(subIdx);
setNextReaderFirstPhase(subCtx);
DocIdSet dis = filter.getDocIdSet(subCtx, null); // solr docsets already exclude any deleted docs
DocIdSetIterator disi = dis.iterator();
SortedDocValues singleDv = null;
SortedSetDocValues multiDv = null;
if (multiValuedField) {
// TODO: get sub from multi?
multiDv = subCtx.reader().getSortedSetDocValues(sf.getName());
if (multiDv == null) {
multiDv = DocValues.emptySortedSet();
}
// some codecs may optimize SortedSet storage for single-valued fields
// this will be null if this is not a wrapped single valued docvalues.
if (unwrap_singleValued_multiDv) {
singleDv = DocValues.unwrapSingleton(multiDv);
}
} else {
singleDv = subCtx.reader().getSortedDocValues(sf.getName());
if (singleDv == null) {
singleDv = DocValues.emptySorted();
}
}
LongValues toGlobal = ordinalMap == null ? null : ordinalMap.getGlobalOrds(subIdx);
if (singleDv != null) {
collectDocs(singleDv, disi, toGlobal);
} else {
collectDocs(multiDv, disi, toGlobal);
}
}
}
protected void collectDocs(SortedDocValues singleDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
int doc;
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
int segOrd = singleDv.getOrd(doc);
if (segOrd < 0) continue;
collect(doc, segOrd, toGlobal);
}
}
protected void collectDocs(SortedSetDocValues multiDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
int doc;
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
multiDv.setDocument(doc);
for(;;) {
int segOrd = (int)multiDv.nextOrd();
if (segOrd < 0) break;
collect(doc, segOrd, toGlobal);
}
}
}
private void collect(int doc, int segOrd, LongValues toGlobal) throws IOException {
int ord = (toGlobal != null && segOrd >= 0) ? (int)toGlobal.get(segOrd) : segOrd;
int arrIdx = ord - startTermIndex;
if (arrIdx >= 0 && arrIdx < nTerms) {
countAcc.incrementCount(arrIdx, 1);
if (collectAcc != null) {
collectAcc.collect(doc, arrIdx);
}
if (allBucketsAcc != null) {
allBucketsAcc.collect(doc, arrIdx);
}
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////


@@ -0,0 +1,292 @@
package org.apache.solr.search.facet;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LongValues;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.solr.common.SolrException;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.Filter;
class FacetFieldProcessorDV extends FacetFieldProcessorFCBase {
static boolean unwrap_singleValued_multiDv = true; // only set to false for test coverage
boolean multiValuedField;
SortedSetDocValues si; // only used for term lookups (for both single and multi-valued)
MultiDocValues.OrdinalMap ordinalMap = null; // maps per-segment ords to global ords
public FacetFieldProcessorDV(FacetContext fcontext, FacetField freq, SchemaField sf) {
super(fcontext, freq, sf);
multiValuedField = sf.multiValued() || sf.getType().multiValuedFieldCache();
}
protected BytesRef lookupOrd(int ord) throws IOException {
return si.lookupOrd(ord);
}
protected void findStartAndEndOrds() throws IOException {
if (multiValuedField) {
si = FieldUtil.getSortedSetDocValues(fcontext.qcontext, sf, null);
if (si instanceof MultiDocValues.MultiSortedSetDocValues) {
ordinalMap = ((MultiDocValues.MultiSortedSetDocValues)si).mapping;
}
} else {
SortedDocValues single = FieldUtil.getSortedDocValues(fcontext.qcontext, sf, null);
si = DocValues.singleton(single); // multi-valued view
if (single instanceof MultiDocValues.MultiSortedDocValues) {
ordinalMap = ((MultiDocValues.MultiSortedDocValues)single).mapping;
}
}
if (si.getValueCount() >= Integer.MAX_VALUE) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Field has too many unique values. field=" + sf + " nterms= " + si.getValueCount());
}
if (prefixRef != null) {
startTermIndex = (int)si.lookupTerm(prefixRef.get());
if (startTermIndex < 0) startTermIndex = -startTermIndex - 1;
prefixRef.append(UnicodeUtil.BIG_TERM);
endTermIndex = (int)si.lookupTerm(prefixRef.get());
assert endTermIndex < 0;
endTermIndex = -endTermIndex - 1;
} else {
startTermIndex = 0;
endTermIndex = (int)si.getValueCount();
}
nTerms = endTermIndex - startTermIndex;
}
@Override
protected void collectDocs() throws IOException {
int domainSize = fcontext.base.size();
if (nTerms <= 0 || domainSize < effectiveMincount) { // TODO: what about allBuckets? missing bucket?
return;
}
// TODO: refactor some of this logic into a base class
boolean countOnly = collectAcc==null && allBucketsAcc==null;
boolean fullRange = startTermIndex == 0 && endTermIndex == si.getValueCount();
// Are we expecting many hits per bucket?
// FUTURE: pro-rate for nTerms?
// FUTURE: better take into account number of values in multi-valued fields. This info is available for indexed fields.
// FUTURE: take into account that bigger ord maps are more expensive than smaller ones
// One test: 5M doc index, faceting on a single-valued field with almost 1M unique values, crossover point where global counting was slower
// than per-segment counting was a domain of 658k docs. At that point, top 10 buckets had 6-7 matches each.
// this was for heap docvalues produced by UninvertingReader
// Since these values were randomly distributed, lets round our domain multiplier up to account for less random real world data.
long domainMultiplier = multiValuedField ? 4L : 2L;
boolean manyHitsPerBucket = domainSize * domainMultiplier > (si.getValueCount() + 3); // +3 to increase test coverage with small tests
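// Rough worked example using the test numbers above: a domain of ~658K docs on a single-valued field with ~1M unique
// values gives 658K * 2 = ~1.3M > ~1M + 3, so manyHitsPerBucket is true right around the measured crossover point.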
// If we're only calculating counts, we're not prefixing, and we expect to collect many documents per unique value,
// then collect per-segment before mapping to global ords at the end. This will save redundant seg->global ord mappings.
// FUTURE: there are probably some other non "countOnly" cases where we can use this as well (i.e. those where
// the docid is not used)
boolean canDoPerSeg = countOnly && fullRange;
boolean accumSeg = manyHitsPerBucket && canDoPerSeg;
if (freq.perSeg != null) accumSeg = canDoPerSeg && freq.perSeg; // internal - override perSeg heuristic
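// Three collection strategies follow:
// - accumSeg: count into a per-segment array (collectPerSeg), then remap seg ords to global ords once per unique value.
// - canDoPerSeg && toGlobal != null: increment countAcc directly on global ords (collectCounts), skipping the generic collect().
// - otherwise: the generic collect() path, which also feeds collectAcc / allBucketsAcc and honors the prefix ord range.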
final List<LeafReaderContext> leaves = fcontext.searcher.getIndexReader().leaves();
Filter filter = fcontext.base.getTopFilter();
for (int subIdx = 0; subIdx < leaves.size(); subIdx++) {
LeafReaderContext subCtx = leaves.get(subIdx);
setNextReaderFirstPhase(subCtx);
DocIdSet dis = filter.getDocIdSet(subCtx, null); // solr docsets already exclude any deleted docs
DocIdSetIterator disi = dis.iterator();
SortedDocValues singleDv = null;
SortedSetDocValues multiDv = null;
if (multiValuedField) {
// TODO: get sub from multi?
multiDv = subCtx.reader().getSortedSetDocValues(sf.getName());
if (multiDv == null) {
multiDv = DocValues.emptySortedSet();
}
// some codecs may optimize SortedSet storage for single-valued fields
// this will be null if this is not a wrapped single valued docvalues.
if (unwrap_singleValued_multiDv) {
singleDv = DocValues.unwrapSingleton(multiDv);
}
} else {
singleDv = subCtx.reader().getSortedDocValues(sf.getName());
if (singleDv == null) {
singleDv = DocValues.emptySorted();
}
}
LongValues toGlobal = ordinalMap == null ? null : ordinalMap.getGlobalOrds(subIdx);
if (singleDv != null) {
if (accumSeg) {
collectPerSeg(singleDv, disi, toGlobal);
} else {
if (canDoPerSeg && toGlobal != null) {
collectCounts(singleDv, disi, toGlobal);
} else {
collectDocs(singleDv, disi, toGlobal);
}
}
} else {
if (accumSeg) {
collectPerSeg(multiDv, disi, toGlobal);
} else {
if (canDoPerSeg && toGlobal != null) {
collectCounts(multiDv, disi, toGlobal);
} else {
collectDocs(multiDv, disi, toGlobal);
}
}
}
}
reuse = null; // better GC
}
private int[] reuse;
private int[] getCountArr(int maxNeeded) {
if (reuse == null) {
// make the count array large enough for any segment
// FUTURE: (optionally) directly use the array of the CountAcc for an optimized index..
reuse = new int[(int) si.getValueCount() + 1];
} else {
Arrays.fill(reuse, 0, maxNeeded, 0);
}
return reuse;
}
private void collectPerSeg(SortedDocValues singleDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
int segMax = singleDv.getValueCount() + 1;
final int[] counts = getCountArr( segMax );
int doc;
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
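// seg ord is shifted by +1 so that ord -1 (doc has no value for this field) lands in slot 0, which the remap loop below skips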
counts[ singleDv.getOrd(doc) + 1 ]++;
}
for (int i=1; i<segMax; i++) {
int segCount = counts[i];
if (segCount > 0) {
int slot = toGlobal == null ? (i - 1) : (int) toGlobal.get(i - 1);
countAcc.incrementCount(slot, segCount);
}
}
}
private void collectPerSeg(SortedSetDocValues multiDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
int segMax = (int)multiDv.getValueCount();
final int[] counts = getCountArr( segMax );
int doc;
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
multiDv.setDocument(doc);
for(;;) {
int segOrd = (int)multiDv.nextOrd();
if (segOrd < 0) break;
counts[segOrd]++;
}
}
for (int i=0; i<segMax; i++) {
int segCount = counts[i];
if (segCount > 0) {
int slot = toGlobal == null ? (i) : (int) toGlobal.get(i);
countAcc.incrementCount(slot, segCount);
}
}
}
private void collectDocs(SortedDocValues singleDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
int doc;
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
int segOrd = singleDv.getOrd(doc);
if (segOrd < 0) continue;
collect(doc, segOrd, toGlobal);
}
}
private void collectCounts(SortedDocValues singleDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
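// only reached when toGlobal != null (see the canDoPerSeg && toGlobal != null dispatch above), so segment ords always map to global slots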
int doc;
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
int segOrd = singleDv.getOrd(doc);
if (segOrd < 0) continue;
int ord = (int)toGlobal.get(segOrd);
countAcc.incrementCount(ord, 1);
}
}
private void collectDocs(SortedSetDocValues multiDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
int doc;
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
multiDv.setDocument(doc);
for(;;) {
int segOrd = (int)multiDv.nextOrd();
if (segOrd < 0) break;
collect(doc, segOrd, toGlobal);
}
}
}
private void collectCounts(SortedSetDocValues multiDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
int doc;
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
multiDv.setDocument(doc);
for(;;) {
int segOrd = (int)multiDv.nextOrd();
if (segOrd < 0) break;
int ord = (int)toGlobal.get(segOrd);
countAcc.incrementCount(ord, 1);
}
}
}
private void collect(int doc, int segOrd, LongValues toGlobal) throws IOException {
int ord = (toGlobal != null && segOrd >= 0) ? (int)toGlobal.get(segOrd) : segOrd;
int arrIdx = ord - startTermIndex;
if (arrIdx >= 0 && arrIdx < nTerms) {
countAcc.incrementCount(arrIdx, 1);
if (collectAcc != null) {
collectAcc.collect(doc, arrIdx);
}
if (allBucketsAcc != null) {
allBucketsAcc.collect(doc, arrIdx);
}
}
}
}


@@ -532,6 +532,8 @@ class FacetFieldParser extends FacetParser<FacetField> {
facet.method = FacetField.FacetMethod.fromString(getString(m, "method", null));
facet.cacheDf = (int)getLong(m, "cacheDf", facet.cacheDf);
facet.perSeg = (Boolean)m.get("perSeg");
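// e.g. a facet entry like {type:terms, field:cat, method:dv, perSeg:true} (field name illustrative) forces per-segment
// counting regardless of the manyHitsPerBucket heuristic, provided the countOnly/fullRange preconditions still hold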
// facet.sort may depend on a facet stat...
// should we be parsing / validating this here, or in the execution environment?
Object o = m.get("facet");