LUCENE-5666: fix StatsComponent insanity

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5666@1594441 13f79535-47bb-0310-9956-ffa450edef68
2014-05-14 01:25:48 +00:00 · 2014-05-14 01:25:48 +00:00 · fe8dd29a74
parent 9374fcc6e9
commit fe8dd29a74
2 changed files with 201 additions and 3 deletions
--- a/solr/core/src/java/org/apache/solr/handler/component/StatsComponent.java
+++ b/solr/core/src/java/org/apache/solr/handler/component/StatsComponent.java
@ -37,6 +37,7 @@ import org.apache.solr.common.params.StatsParams;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.SimpleOrderedMap;
 import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.request.DocValuesStats;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.request.UnInvertedField;
 import org.apache.solr.schema.FieldType;
@ -315,9 +316,8 @@ class SimpleStats {
        NamedList<?> stv;

        if (sf.multiValued() || ft.multiValuedFieldCache()) {
-          //use UnInvertedField for multivalued fields
-          UnInvertedField uif = UnInvertedField.getUnInvertedField(statsField, searcher);
-          stv = uif.getStats(searcher, docs, calcDistinct, facets).getStatsValues();
+          // TODO: should this also be used for single-valued string fields? (should work fine)
+          stv = DocValuesStats.getCounts(searcher, sf.getName(), docs, calcDistinct, facets).getStatsValues();
        } else {
          stv = getFieldCacheStats(statsField, calcDistinct, facets);
        }
--- a/solr/core/src/java/org/apache/solr/request/DocValuesStats.java
+++ b/solr/core/src/java/org/apache/solr/request/DocValuesStats.java
@ -0,0 +1,198 @@
+package org.apache.solr.request;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.MultiDocValues.MultiSortedDocValues;
+import org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues;
+import org.apache.lucene.index.MultiDocValues.OrdinalMap;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TermRangeQuery;
+import org.apache.lucene.util.BytesRef;
+import org.apache.solr.handler.component.FieldFacetStats;
+import org.apache.solr.handler.component.StatsValues;
+import org.apache.solr.handler.component.StatsValuesFactory;
+import org.apache.solr.schema.FieldType;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.search.DocSet;
+import org.apache.solr.search.SolrIndexSearcher;
+
+/**
+ * Computes term stats for docvalues field (single or multivalued).
+ * <p>
+ * Instead of working on a top-level reader view (binary-search per docid),
+ * it collects per-segment, but maps ordinals to global ordinal space using
+ * MultiDocValues' OrdinalMap.
+ */
+public class DocValuesStats {
+  private DocValuesStats() {}
+  /** Computes stats over the values of {@code fieldName} for the docs in {@code docs} by counting docvalues ordinals per segment; {@code facet} (non-null, may be empty) names fields to additionally break the stats down by. */
+  public static StatsValues getCounts(SolrIndexSearcher searcher, String fieldName, DocSet docs, boolean calcDistinct, String[] facet) throws IOException {
+    SchemaField schemaField = searcher.getSchema().getField(fieldName);
+    FieldType ft = schemaField.getType();
+    StatsValues res = StatsValuesFactory.createStatsValues(schemaField, calcDistinct);
+    
+    // Initialize one FieldFacetStats per requested facet field (the array is empty when no facets were passed in)
+    final FieldFacetStats[] facetStats = new FieldFacetStats[facet.length];
+    int upto = 0;
+    for (String facetField : facet) {
+      SchemaField facetSchemaField = searcher.getSchema().getField(facetField);
+      facetStats[upto++] = new FieldFacetStats(searcher, facetField, schemaField, facetSchemaField, calcDistinct);
+    }
+    
+    // TODO: remove multiValuedFieldCache(), check dv type / uninversion type?
+    final boolean multiValued = schemaField.multiValued() || ft.multiValuedFieldCache();
+
+    final SortedSetDocValues si; // top-level view, used only for the value count and ord -> term lookups
+    OrdinalMap ordinalMap = null; // for mapping per-segment ords to global ones; stays null unless the top-level view is a Multi* wrapper
+    if (multiValued) {
+      si = searcher.getAtomicReader().getSortedSetDocValues(fieldName);
+      if (si instanceof MultiSortedSetDocValues) {
+        ordinalMap = ((MultiSortedSetDocValues)si).mapping;
+      }
+    } else {
+      SortedDocValues single = searcher.getAtomicReader().getSortedDocValues(fieldName);
+      si = single == null ? null : DocValues.singleton(single); // wrapped so both branches expose the same SortedSetDocValues type
+      if (single instanceof MultiSortedDocValues) {
+        ordinalMap = ((MultiSortedDocValues)single).mapping;
+      }
+    }
+    if (si == null) { // the top-level reader returned no docvalues for this field: hand back the untouched stats
+      return res;
+    }
+    if (si.getValueCount() >= Integer.MAX_VALUE) { // the counts array below is an int-indexed Java array
+      throw new UnsupportedOperationException("Currently this stats method is limited to " + Integer.MAX_VALUE + " unique terms");
+    }
+    // docs of the set that an unbounded range query on the field does not match; reported as "missing" further down
+    DocSet missing = docs.andNot( searcher.getDocSet(new TermRangeQuery(fieldName, null, null, false, false)));
+
+    final int nTerms = (int) si.getValueCount();   // cast is safe: the value count was checked against Integer.MAX_VALUE above
+    
+    // count collection array only needs to be as big as the number of terms we are
+    // going to collect counts for.
+    final int[] counts = new int[nTerms];
+    // visit every segment, restricted to the docs of the set that fall into it
+    Filter filter = docs.getTopFilter();
+    List<AtomicReaderContext> leaves = searcher.getTopReaderContext().leaves();
+    for (int subIndex = 0; subIndex < leaves.size(); subIndex++) {
+      AtomicReaderContext leaf = leaves.get(subIndex);
+      DocIdSet dis = filter.getDocIdSet(leaf, null); // solr docsets already exclude any deleted docs, so null is passed as the second argument
+      DocIdSetIterator disi = null;
+      if (dis != null) {
+        disi = dis.iterator();
+      }
+      if (disi != null) { // a null set or a null iterator means this segment is simply skipped
+        int docBase = leaf.docBase; // added to segment-local doc ids before they are handed to the facet stats
+        if (multiValued) {
+          SortedSetDocValues sub = leaf.reader().getSortedSetDocValues(fieldName);
+          if (sub == null) { // reader returned nothing for this segment: substitute the empty instance
+            sub = DocValues.EMPTY_SORTED_SET;
+          }
+          final SortedDocValues singleton = DocValues.unwrapSingleton(sub);
+          if (singleton != null) {
+            // some codecs may optimize SORTED_SET storage for single-valued fields
+            accumSingle(counts, docBase, facetStats, singleton, disi, subIndex, ordinalMap);
+          } else {
+            accumMulti(counts, docBase, facetStats, sub, disi, subIndex, ordinalMap);
+          }
+        } else {
+          SortedDocValues sub = leaf.reader().getSortedDocValues(fieldName);
+          if (sub == null) { // reader returned nothing for this segment: substitute the empty instance
+            sub = DocValues.EMPTY_SORTED;
+          }
+          accumSingle(counts, docBase, facetStats, sub, disi, subIndex, ordinalMap);
+        }
+      }
+    }
+    
+    // add results in index order (ascending ordinal of the top-level view)
+    BytesRef value = new BytesRef();
+    for (int ord = 0; ord < counts.length; ord++) {
+      int count = counts[ord];
+      if (count > 0) { // ordinals that were never counted contribute nothing
+        si.lookupOrd(ord, value);
+        res.accumulate(value, count);
+        for (FieldFacetStats f : facetStats) {
+          f.accumulateTermNum(ord, value);
+        }
+      }
+    }
+
+    res.addMissing(missing.size());
+    if (facetStats.length > 0) {
+      for (FieldFacetStats f : facetStats) {
+        Map<String, StatsValues> facetStatsValues = f.facetStatsValues;
+        FieldType facetType = searcher.getSchema().getFieldType(f.name);
+        for (Map.Entry<String,StatsValues> entry : facetStatsValues.entrySet()) {
+          String termLabel = entry.getKey(); // map key; converted with toInternal() before it is used in the term query below
+          int missingCount = searcher.numDocs(new TermQuery(new Term(f.name, facetType.toInternal(termLabel))), missing);
+          entry.getValue().addMissing(missingCount);
+        }
+        res.addFacet(f.name, facetStatsValues);
+      }
+    }
+    return res;
+  }
+
+  /** accumulates per-segment single-valued stats */
+  static void accumSingle(int counts[], int docBase, FieldFacetStats[] facetStats, SortedDocValues si, DocIdSetIterator disi, int subIndex, OrdinalMap map) throws IOException {
+    int doc;
+    while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
+      int term = si.getOrd(doc);
+      if (term >= 0) {
+        if (map != null) {
+          term = (int) map.getGlobalOrd(subIndex, term);
+        }
+        counts[term]++;
+        for (FieldFacetStats f : facetStats) {
+          f.facetTermNum(docBase + doc, term);
+        }
+      }
+    }
+  }
+  
+  /** accumulates per-segment multi-valued stats */
+  static void accumMulti(int counts[], int docBase, FieldFacetStats[] facetStats, SortedSetDocValues si, DocIdSetIterator disi, int subIndex, OrdinalMap map) throws IOException {
+    int doc;
+    while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
+      si.setDocument(doc);
+      long ord;
+      while ((ord = si.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
+        int term = (int) ord;
+        if (map != null) {
+          term = (int) map.getGlobalOrd(subIndex, term);
+        }
+        counts[term]++;
+        for (FieldFacetStats f : facetStats) {
+          f.facetTermNum(docBase + doc, term);
+        }
+      }
+    }
+  }
+}