SOLR-6452: StatsComponent's stat 'missing' will work on fields with docValues=true and indexed=false

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1624091 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Tomas Eduardo Fernandez Lobbe 2014-09-10 18:36:27 +00:00
parent 031fc7d167
commit e51bce7c68
4 changed files with 123 additions and 40 deletions

View File

@ -51,7 +51,7 @@ public class FieldFacetStats {
final boolean calcDistinct;
public final Map<String, StatsValues> facetStatsValues;
private final Map<Integer, Integer> missingStats;
List<HashMap<String, Integer>> facetStatsTerms;
final AtomicReader topLevelReader;
@ -73,6 +73,7 @@ public class FieldFacetStats {
facetStatsValues = new HashMap<>();
facetStatsTerms = new ArrayList<>();
missingStats = new HashMap<>();
}
private StatsValues getStatsValues(String key) throws IOException {
@ -100,8 +101,10 @@ public class FieldFacetStats {
if (topLevelSortedValues == null) {
topLevelSortedValues = DocValues.getSorted(topLevelReader, name);
}
int term = topLevelSortedValues.getOrd(docID);
int arrIdx = term;
if (arrIdx >= 0 && arrIdx < topLevelSortedValues.getValueCount()) {
final String key;
@ -113,6 +116,8 @@ public class FieldFacetStats {
while (facetStatsTerms.size() <= statsTermNum) {
facetStatsTerms.add(new HashMap<String, Integer>());
}
final Map<String, Integer> statsTermCounts = facetStatsTerms.get(statsTermNum);
Integer statsTermCount = statsTermCounts.get(key);
if (statsTermCount == null) {
@ -122,6 +127,7 @@ public class FieldFacetStats {
}
return true;
}
return false;
}
@ -132,8 +138,7 @@ public class FieldFacetStats {
while (facetStatsTerms.size() <= statsTermNum) {
facetStatsTerms.add(new HashMap<String, Integer>());
}
for (Map.Entry<String, Integer> stringIntegerEntry : facetStatsTerms.get(statsTermNum).entrySet()) {
Map.Entry pairs = (Map.Entry) stringIntegerEntry;
for (Map.Entry<String, Integer> pairs : facetStatsTerms.get(statsTermNum).entrySet()) {
String key = (String) pairs.getKey();
StatsValues facetStats = facetStatsValues.get(key);
if (facetStats == null) {
@ -156,6 +161,35 @@ public class FieldFacetStats {
}
}
public void facetMissingNum(int docID) throws IOException {
if (topLevelSortedValues == null) {
topLevelSortedValues = DocValues.getSorted(topLevelReader, name);
}
int ord = topLevelSortedValues.getOrd(docID);
if (ord != -1) {
Integer missingCount = missingStats.get(ord);
if (missingCount == null) {
missingStats.put(ord, 1);
} else {
missingStats.put(ord, missingCount + 1);
}
}
}
public void accumulateMissing() throws IOException {
StatsValues statsValue;
for (Map.Entry<Integer, Integer> entry : missingStats.entrySet()) {
if (entry.getKey() >= 0) {
String key = topLevelSortedValues.lookupOrd(entry.getKey()).utf8ToString();
if ((statsValue = facetStatsValues.get(key)) != null) {
statsValue.addMissing(entry.getValue());
}
}
}
return;
}
}

View File

@ -39,7 +39,6 @@ import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.request.DocValuesStats;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.UnInvertedField;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;

View File

@ -23,7 +23,6 @@ import java.util.Map;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.MultiDocValues.MultiSortedDocValues;
import org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues;
import org.apache.lucene.index.MultiDocValues.OrdinalMap;
@ -32,10 +31,9 @@ import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LongValues;
import org.apache.solr.common.SolrException;
import org.apache.solr.handler.component.FieldFacetStats;
import org.apache.solr.handler.component.StatsValues;
import org.apache.solr.handler.component.StatsValuesFactory;
@ -62,11 +60,17 @@ public class DocValuesStats {
//Initialize facetstats, if facets have been passed in
final FieldFacetStats[] facetStats = new FieldFacetStats[facet.length];
int upto = 0;
for (String facetField : facet) {
SchemaField fsf = searcher.getSchema().getField(facetField);
if ( fsf.multiValued()) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"Stats can only facet on single-valued fields, not: " + facetField );
}
SchemaField facetSchemaField = searcher.getSchema().getField(facetField);
facetStats[upto++] = new FieldFacetStats(searcher, facetField, schemaField, facetSchemaField, calcDistinct);
}
// TODO: remove multiValuedFieldCache(), check dv type / uninversion type?
final boolean multiValued = schemaField.multiValued() || ft.multiValuedFieldCache();
@ -74,6 +78,7 @@ public class DocValuesStats {
OrdinalMap ordinalMap = null; // for mapping per-segment ords to global ones
if (multiValued) {
si = searcher.getAtomicReader().getSortedSetDocValues(fieldName);
if (si instanceof MultiSortedSetDocValues) {
ordinalMap = ((MultiSortedSetDocValues)si).mapping;
}
@ -90,26 +95,27 @@ public class DocValuesStats {
if (si.getValueCount() >= Integer.MAX_VALUE) {
throw new UnsupportedOperationException("Currently this stats method is limited to " + Integer.MAX_VALUE + " unique terms");
}
DocSet missing = docs.andNot( searcher.getDocSet(new TermRangeQuery(fieldName, null, null, false, false)));
final int nTerms = (int) si.getValueCount();
int missingDocCountTotal = 0;
final int nTerms = (int) si.getValueCount();
// count collection array only needs to be as big as the number of terms we are
// going to collect counts for.
final int[] counts = new int[nTerms];
Filter filter = docs.getTopFilter();
List<AtomicReaderContext> leaves = searcher.getTopReaderContext().leaves();
for (int subIndex = 0; subIndex < leaves.size(); subIndex++) {
AtomicReaderContext leaf = leaves.get(subIndex);
DocIdSet dis = filter.getDocIdSet(leaf, null); // solr docsets already exclude any deleted docs
DocIdSetIterator disi = null;
if (dis != null) {
disi = dis.iterator();
}
if (disi != null) {
int docBase = leaf.docBase;
if (multiValued) {
SortedSetDocValues sub = leaf.reader().getSortedSetDocValues(fieldName);
if (sub == null) {
@ -118,23 +124,23 @@ public class DocValuesStats {
final SortedDocValues singleton = DocValues.unwrapSingleton(sub);
if (singleton != null) {
// some codecs may optimize SORTED_SET storage for single-valued fields
accumSingle(counts, docBase, facetStats, singleton, disi, subIndex, ordinalMap);
missingDocCountTotal += accumSingle(counts, docBase, facetStats, singleton, disi, subIndex, ordinalMap);
} else {
accumMulti(counts, docBase, facetStats, sub, disi, subIndex, ordinalMap);
missingDocCountTotal += accumMulti(counts, docBase, facetStats, sub, disi, subIndex, ordinalMap);
}
} else {
SortedDocValues sub = leaf.reader().getSortedDocValues(fieldName);
if (sub == null) {
sub = DocValues.emptySorted();
}
accumSingle(counts, docBase, facetStats, sub, disi, subIndex, ordinalMap);
missingDocCountTotal += accumSingle(counts, docBase, facetStats, sub, disi, subIndex, ordinalMap);
}
}
}
// add results in index order
for (int ord = 0; ord < counts.length; ord++) {
int count = counts[ord];
if (count > 0) {
final BytesRef value = si.lookupOrd(ord);
res.accumulate(value, count);
@ -143,26 +149,23 @@ public class DocValuesStats {
}
}
}
res.addMissing(missing.size());
res.addMissing(missingDocCountTotal);
if (facetStats.length > 0) {
for (FieldFacetStats f : facetStats) {
Map<String, StatsValues> facetStatsValues = f.facetStatsValues;
FieldType facetType = searcher.getSchema().getFieldType(f.name);
for (Map.Entry<String,StatsValues> entry : facetStatsValues.entrySet()) {
String termLabel = entry.getKey();
int missingCount = searcher.numDocs(new TermQuery(new Term(f.name, facetType.toInternal(termLabel))), missing);
entry.getValue().addMissing(missingCount);
}
Map<String,StatsValues> facetStatsValues = f.facetStatsValues;
f.accumulateMissing();
res.addFacet(f.name, facetStatsValues);
}
}
return res;
}
/** accumulates per-segment single-valued stats */
static void accumSingle(int counts[], int docBase, FieldFacetStats[] facetStats, SortedDocValues si, DocIdSetIterator disi, int subIndex, OrdinalMap map) throws IOException {
static int accumSingle(int counts[], int docBase, FieldFacetStats[] facetStats, SortedDocValues si, DocIdSetIterator disi, int subIndex, OrdinalMap map) throws IOException {
final LongValues ordMap = map == null ? null : map.getGlobalOrds(subIndex);
int missingDocCount = 0;
int doc;
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
int term = si.getOrd(doc);
@ -174,18 +177,29 @@ public class DocValuesStats {
for (FieldFacetStats f : facetStats) {
f.facetTermNum(docBase + doc, term);
}
}else{
for (FieldFacetStats f : facetStats) {
f.facetMissingNum(docBase + doc);
}
missingDocCount++;
}
}
return missingDocCount;
}
/** accumulates per-segment multi-valued stats */
static void accumMulti(int counts[], int docBase, FieldFacetStats[] facetStats, SortedSetDocValues si, DocIdSetIterator disi, int subIndex, OrdinalMap map) throws IOException {
static int accumMulti(int counts[], int docBase, FieldFacetStats[] facetStats, SortedSetDocValues si, DocIdSetIterator disi, int subIndex, OrdinalMap map) throws IOException {
final LongValues ordMap = map == null ? null : map.getGlobalOrds(subIndex);
int missingDocCount = 0;
int doc;
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
si.setDocument(doc);
long ord;
boolean emptyTerm = true;
while ((ord = si.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
emptyTerm = false;
int term = (int) ord;
if (map != null) {
term = (int) ordMap.get(term);
@ -195,6 +209,15 @@ public class DocValuesStats {
f.facetTermNum(docBase + doc, term);
}
}
if (emptyTerm){
for (FieldFacetStats f : facetStats) {
f.facetMissingNum(docBase + doc);
}
missingDocCount++;
}
}
return missingDocCount;
}
}

View File

@ -28,10 +28,9 @@ import java.util.Map;
import java.util.TimeZone;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.StatsParams;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.LocalSolrQueryRequest;
@ -64,9 +63,8 @@ public class StatsComponentTest extends AbstractSolrTestCase {
for (String f : new String[] {
"stats_i","stats_l","stats_f","stats_d",
"stats_ti","stats_tl","stats_tf","stats_td",
"stats_ti_dv","stats_tl_dv","stats_tf_dv","stats_td_dv"
// , TODO: enable this test after SOLR-6452 is fixed
// "stats_ti_ni_dv","stats_tl_ni_dv","stats_tf_ni_dv","stats_td_ni_dv"
"stats_ti_dv","stats_tl_dv","stats_tf_dv","stats_td_dv",
"stats_ti_ni_dv","stats_tl_ni_dv","stats_tf_ni_dv","stats_td_ni_dv"
}) {
// all of our checks should work with all of these params
@ -91,9 +89,8 @@ public class StatsComponentTest extends AbstractSolrTestCase {
for (String f : new String[] {"stats_ii",
"stats_tis","stats_tfs","stats_tls","stats_tds", // trie fields
"stats_tis_dv","stats_tfs_dv","stats_tls_dv","stats_tds_dv" // Doc Values
// , TODO: enable this test after SOLR-6452 is fixed
//"stats_tis_ni_dv","stats_tfs_ni_dv","stats_tls_ni_dv","stats_tds_ni_dv" // Doc Values Not indexed
"stats_tis_dv","stats_tfs_dv","stats_tls_dv","stats_tds_dv", // Doc Values
"stats_tis_ni_dv","stats_tfs_ni_dv","stats_tls_ni_dv","stats_tds_ni_dv" // Doc Values Not indexed
}) {
doTestMVFieldStatisticsResult(f);
@ -153,6 +150,9 @@ public class StatsComponentTest extends AbstractSolrTestCase {
assertU(adoc("id", "3", f, "-30", f, "-1", "active_s", "false"));
assertU(adoc("id", "4", f, "-40", f, "10", "active_s", "false"));
assertU(adoc("id", "5", "active_s", "false"));
assertU(adoc("id", "6", "active_s", "false"));
assertU(adoc("id", "7", "active_s", "true"));
assertU(commit());
// with or w/o these excluded filters, results should be the same
@ -171,7 +171,7 @@ public class StatsComponentTest extends AbstractSolrTestCase {
, "//double[@name='max'][.='200.0']"
, "//double[@name='sum'][.='9.0']"
, "//long[@name='count'][.='8']"
, "//long[@name='missing'][.='1']"
, "//long[@name='missing'][.='3']"
, "//long[@name='countDistinct'][.='8']"
, "count(//arr[@name='distinctValues']/*)=8"
, "//double[@name='sumOfSquares'][.='53101.0']"
@ -186,7 +186,7 @@ public class StatsComponentTest extends AbstractSolrTestCase {
, "//double[@name='max'][.='200.0']"
, "//double[@name='sum'][.='119.0']"
, "//long[@name='count'][.='6']"
, "//long[@name='missing'][.='1']"
, "//long[@name='missing'][.='3']"
, "//long[@name='countDistinct'][.='6']"
, "count(//arr[@name='distinctValues']/*)=6"
, "//double[@name='sumOfSquares'][.='43001.0']"
@ -202,7 +202,7 @@ public class StatsComponentTest extends AbstractSolrTestCase {
, "//double[@name='max'][.='200.0']"
, "//double[@name='sum'][.='9.0']"
, "//long[@name='count'][.='8']"
, "//long[@name='missing'][.='1']"
, "//long[@name='missing'][.='3']"
, "//long[@name='countDistinct'][.='8']"
, "count(//lst[@name='" + f + "']/arr[@name='distinctValues']/*)=8"
, "//double[@name='sumOfSquares'][.='53101.0']"
@ -216,7 +216,7 @@ public class StatsComponentTest extends AbstractSolrTestCase {
, "//lst[@name='true']/double[@name='max'][.='200.0']"
, "//lst[@name='true']/double[@name='sum'][.='70.0']"
, "//lst[@name='true']/long[@name='count'][.='4']"
, "//lst[@name='true']/long[@name='missing'][.='0']"
, "//lst[@name='true']/long[@name='missing'][.='1']"
, "//lst[@name='true']//long[@name='countDistinct'][.='4']"
, "count(//lst[@name='true']/arr[@name='distinctValues']/*)=4"
, "//lst[@name='true']/double[@name='sumOfSquares'][.='50500.0']"
@ -230,7 +230,7 @@ public class StatsComponentTest extends AbstractSolrTestCase {
, "//lst[@name='false']/double[@name='max'][.='10.0']"
, "//lst[@name='false']/double[@name='sum'][.='-61.0']"
, "//lst[@name='false']/long[@name='count'][.='4']"
, "//lst[@name='false']/long[@name='missing'][.='1']"
, "//lst[@name='false']/long[@name='missing'][.='2']"
, "//lst[@name='true']//long[@name='countDistinct'][.='4']"
, "count(//lst[@name='true']/arr[@name='distinctValues']/*)=4"
, "//lst[@name='false']/double[@name='sumOfSquares'][.='2601.0']"
@ -711,4 +711,31 @@ public class StatsComponentTest extends AbstractSolrTestCase {
Collections.addAll(cat_docValues, comparables);
return cat_docValues;
}
// public void testOtherFacetStatsResult() throws Exception {
//
// assertU(adoc("id", "1", "stats_tls_dv", "10", "active_i", "1"));
// assertU(adoc("id", "2", "stats_tls_dv", "20", "active_i", "1"));
// assertU(commit());
// assertU(adoc("id", "3", "stats_tls_dv", "30", "active_i", "2"));
// assertU(adoc("id", "4", "stats_tls_dv", "40", "active_i", "2"));
// assertU(commit());
//
// final String pre = "//lst[@name='stats_fields']/lst[@name='stats_tls_dv']/lst[@name='facets']/lst[@name='active_i']";
//
// assertQ("test value for active_s=true", req("q", "*:*", "stats", "true", "stats.field", "stats_tls_dv", "stats.facet", "active_i","indent", "true")
// , "*[count("+pre+")=1]"
// , pre+"/lst[@name='1']/double[@name='min'][.='10.0']"
// , pre+"/lst[@name='1']/double[@name='max'][.='20.0']"
// , pre+"/lst[@name='1']/double[@name='sum'][.='30.0']"
// , pre+"/lst[@name='1']/long[@name='count'][.='2']"
// , pre+"/lst[@name='1']/long[@name='missing'][.='0']"
// , pre + "/lst[@name='true']/long[@name='countDistinct'][.='2']"
// , "count(" + pre + "/lst[@name='true']/arr[@name='distinctValues']/*)=2"
// , pre+"/lst[@name='1']/double[@name='sumOfSquares'][.='500.0']"
// , pre+"/lst[@name='1']/double[@name='mean'][.='15.0']"
// , pre+"/lst[@name='1']/double[@name='stddev'][.='7.0710678118654755']"
// );
// }
}