mirror of https://github.com/apache/lucene.git
SOLR-6452: StatsComponent's stat 'missing' will work on fields with docValues=true and indexed=false
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1624091 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
031fc7d167
commit
e51bce7c68
|
@ -51,7 +51,7 @@ public class FieldFacetStats {
|
|||
final boolean calcDistinct;
|
||||
|
||||
public final Map<String, StatsValues> facetStatsValues;
|
||||
|
||||
private final Map<Integer, Integer> missingStats;
|
||||
List<HashMap<String, Integer>> facetStatsTerms;
|
||||
|
||||
final AtomicReader topLevelReader;
|
||||
|
@ -73,6 +73,7 @@ public class FieldFacetStats {
|
|||
|
||||
facetStatsValues = new HashMap<>();
|
||||
facetStatsTerms = new ArrayList<>();
|
||||
missingStats = new HashMap<>();
|
||||
}
|
||||
|
||||
private StatsValues getStatsValues(String key) throws IOException {
|
||||
|
@ -100,8 +101,10 @@ public class FieldFacetStats {
|
|||
if (topLevelSortedValues == null) {
|
||||
topLevelSortedValues = DocValues.getSorted(topLevelReader, name);
|
||||
}
|
||||
|
||||
|
||||
int term = topLevelSortedValues.getOrd(docID);
|
||||
|
||||
int arrIdx = term;
|
||||
if (arrIdx >= 0 && arrIdx < topLevelSortedValues.getValueCount()) {
|
||||
final String key;
|
||||
|
@ -113,6 +116,8 @@ public class FieldFacetStats {
|
|||
while (facetStatsTerms.size() <= statsTermNum) {
|
||||
facetStatsTerms.add(new HashMap<String, Integer>());
|
||||
}
|
||||
|
||||
|
||||
final Map<String, Integer> statsTermCounts = facetStatsTerms.get(statsTermNum);
|
||||
Integer statsTermCount = statsTermCounts.get(key);
|
||||
if (statsTermCount == null) {
|
||||
|
@ -122,6 +127,7 @@ public class FieldFacetStats {
|
|||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -132,8 +138,7 @@ public class FieldFacetStats {
|
|||
while (facetStatsTerms.size() <= statsTermNum) {
|
||||
facetStatsTerms.add(new HashMap<String, Integer>());
|
||||
}
|
||||
for (Map.Entry<String, Integer> stringIntegerEntry : facetStatsTerms.get(statsTermNum).entrySet()) {
|
||||
Map.Entry pairs = (Map.Entry) stringIntegerEntry;
|
||||
for (Map.Entry<String, Integer> pairs : facetStatsTerms.get(statsTermNum).entrySet()) {
|
||||
String key = (String) pairs.getKey();
|
||||
StatsValues facetStats = facetStatsValues.get(key);
|
||||
if (facetStats == null) {
|
||||
|
@ -156,6 +161,35 @@ public class FieldFacetStats {
|
|||
}
|
||||
}
|
||||
|
||||
public void facetMissingNum(int docID) throws IOException {
|
||||
if (topLevelSortedValues == null) {
|
||||
topLevelSortedValues = DocValues.getSorted(topLevelReader, name);
|
||||
}
|
||||
|
||||
int ord = topLevelSortedValues.getOrd(docID);
|
||||
if (ord != -1) {
|
||||
Integer missingCount = missingStats.get(ord);
|
||||
if (missingCount == null) {
|
||||
missingStats.put(ord, 1);
|
||||
} else {
|
||||
missingStats.put(ord, missingCount + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void accumulateMissing() throws IOException {
|
||||
StatsValues statsValue;
|
||||
|
||||
for (Map.Entry<Integer, Integer> entry : missingStats.entrySet()) {
|
||||
if (entry.getKey() >= 0) {
|
||||
String key = topLevelSortedValues.lookupOrd(entry.getKey()).utf8ToString();
|
||||
if ((statsValue = facetStatsValues.get(key)) != null) {
|
||||
statsValue.addMissing(entry.getValue());
|
||||
}
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -39,7 +39,6 @@ import org.apache.solr.common.util.SimpleOrderedMap;
|
|||
import org.apache.solr.common.util.StrUtils;
|
||||
import org.apache.solr.request.DocValuesStats;
|
||||
import org.apache.solr.request.SolrQueryRequest;
|
||||
import org.apache.solr.request.UnInvertedField;
|
||||
import org.apache.solr.schema.FieldType;
|
||||
import org.apache.solr.schema.IndexSchema;
|
||||
import org.apache.solr.schema.SchemaField;
|
||||
|
|
|
@ -23,7 +23,6 @@ import java.util.Map;
|
|||
|
||||
import org.apache.lucene.index.AtomicReaderContext;
|
||||
import org.apache.lucene.index.DocValues;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.MultiDocValues.MultiSortedDocValues;
|
||||
import org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues;
|
||||
import org.apache.lucene.index.MultiDocValues.OrdinalMap;
|
||||
|
@ -32,10 +31,9 @@ import org.apache.lucene.index.SortedSetDocValues;
|
|||
import org.apache.lucene.search.DocIdSet;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.Filter;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.TermRangeQuery;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LongValues;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.handler.component.FieldFacetStats;
|
||||
import org.apache.solr.handler.component.StatsValues;
|
||||
import org.apache.solr.handler.component.StatsValuesFactory;
|
||||
|
@ -62,11 +60,17 @@ public class DocValuesStats {
|
|||
//Initialize facetstats, if facets have been passed in
|
||||
final FieldFacetStats[] facetStats = new FieldFacetStats[facet.length];
|
||||
int upto = 0;
|
||||
|
||||
for (String facetField : facet) {
|
||||
SchemaField fsf = searcher.getSchema().getField(facetField);
|
||||
if ( fsf.multiValued()) {
|
||||
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
|
||||
"Stats can only facet on single-valued fields, not: " + facetField );
|
||||
}
|
||||
|
||||
SchemaField facetSchemaField = searcher.getSchema().getField(facetField);
|
||||
facetStats[upto++] = new FieldFacetStats(searcher, facetField, schemaField, facetSchemaField, calcDistinct);
|
||||
}
|
||||
|
||||
// TODO: remove multiValuedFieldCache(), check dv type / uninversion type?
|
||||
final boolean multiValued = schemaField.multiValued() || ft.multiValuedFieldCache();
|
||||
|
||||
|
@ -74,6 +78,7 @@ public class DocValuesStats {
|
|||
OrdinalMap ordinalMap = null; // for mapping per-segment ords to global ones
|
||||
if (multiValued) {
|
||||
si = searcher.getAtomicReader().getSortedSetDocValues(fieldName);
|
||||
|
||||
if (si instanceof MultiSortedSetDocValues) {
|
||||
ordinalMap = ((MultiSortedSetDocValues)si).mapping;
|
||||
}
|
||||
|
@ -90,26 +95,27 @@ public class DocValuesStats {
|
|||
if (si.getValueCount() >= Integer.MAX_VALUE) {
|
||||
throw new UnsupportedOperationException("Currently this stats method is limited to " + Integer.MAX_VALUE + " unique terms");
|
||||
}
|
||||
|
||||
DocSet missing = docs.andNot( searcher.getDocSet(new TermRangeQuery(fieldName, null, null, false, false)));
|
||||
|
||||
final int nTerms = (int) si.getValueCount();
|
||||
|
||||
int missingDocCountTotal = 0;
|
||||
final int nTerms = (int) si.getValueCount();
|
||||
// count collection array only needs to be as big as the number of terms we are
|
||||
// going to collect counts for.
|
||||
final int[] counts = new int[nTerms];
|
||||
|
||||
Filter filter = docs.getTopFilter();
|
||||
List<AtomicReaderContext> leaves = searcher.getTopReaderContext().leaves();
|
||||
|
||||
for (int subIndex = 0; subIndex < leaves.size(); subIndex++) {
|
||||
AtomicReaderContext leaf = leaves.get(subIndex);
|
||||
DocIdSet dis = filter.getDocIdSet(leaf, null); // solr docsets already exclude any deleted docs
|
||||
DocIdSetIterator disi = null;
|
||||
|
||||
if (dis != null) {
|
||||
disi = dis.iterator();
|
||||
}
|
||||
if (disi != null) {
|
||||
int docBase = leaf.docBase;
|
||||
|
||||
if (multiValued) {
|
||||
SortedSetDocValues sub = leaf.reader().getSortedSetDocValues(fieldName);
|
||||
if (sub == null) {
|
||||
|
@ -118,23 +124,23 @@ public class DocValuesStats {
|
|||
final SortedDocValues singleton = DocValues.unwrapSingleton(sub);
|
||||
if (singleton != null) {
|
||||
// some codecs may optimize SORTED_SET storage for single-valued fields
|
||||
accumSingle(counts, docBase, facetStats, singleton, disi, subIndex, ordinalMap);
|
||||
missingDocCountTotal += accumSingle(counts, docBase, facetStats, singleton, disi, subIndex, ordinalMap);
|
||||
} else {
|
||||
accumMulti(counts, docBase, facetStats, sub, disi, subIndex, ordinalMap);
|
||||
missingDocCountTotal += accumMulti(counts, docBase, facetStats, sub, disi, subIndex, ordinalMap);
|
||||
}
|
||||
} else {
|
||||
SortedDocValues sub = leaf.reader().getSortedDocValues(fieldName);
|
||||
if (sub == null) {
|
||||
sub = DocValues.emptySorted();
|
||||
}
|
||||
accumSingle(counts, docBase, facetStats, sub, disi, subIndex, ordinalMap);
|
||||
missingDocCountTotal += accumSingle(counts, docBase, facetStats, sub, disi, subIndex, ordinalMap);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// add results in index order
|
||||
for (int ord = 0; ord < counts.length; ord++) {
|
||||
int count = counts[ord];
|
||||
|
||||
if (count > 0) {
|
||||
final BytesRef value = si.lookupOrd(ord);
|
||||
res.accumulate(value, count);
|
||||
|
@ -143,26 +149,23 @@ public class DocValuesStats {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
res.addMissing(missing.size());
|
||||
res.addMissing(missingDocCountTotal);
|
||||
|
||||
if (facetStats.length > 0) {
|
||||
for (FieldFacetStats f : facetStats) {
|
||||
Map<String, StatsValues> facetStatsValues = f.facetStatsValues;
|
||||
FieldType facetType = searcher.getSchema().getFieldType(f.name);
|
||||
for (Map.Entry<String,StatsValues> entry : facetStatsValues.entrySet()) {
|
||||
String termLabel = entry.getKey();
|
||||
int missingCount = searcher.numDocs(new TermQuery(new Term(f.name, facetType.toInternal(termLabel))), missing);
|
||||
entry.getValue().addMissing(missingCount);
|
||||
}
|
||||
Map<String,StatsValues> facetStatsValues = f.facetStatsValues;
|
||||
f.accumulateMissing();
|
||||
res.addFacet(f.name, facetStatsValues);
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
/** accumulates per-segment single-valued stats */
|
||||
static void accumSingle(int counts[], int docBase, FieldFacetStats[] facetStats, SortedDocValues si, DocIdSetIterator disi, int subIndex, OrdinalMap map) throws IOException {
|
||||
static int accumSingle(int counts[], int docBase, FieldFacetStats[] facetStats, SortedDocValues si, DocIdSetIterator disi, int subIndex, OrdinalMap map) throws IOException {
|
||||
final LongValues ordMap = map == null ? null : map.getGlobalOrds(subIndex);
|
||||
int missingDocCount = 0;
|
||||
int doc;
|
||||
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
int term = si.getOrd(doc);
|
||||
|
@ -174,18 +177,29 @@ public class DocValuesStats {
|
|||
for (FieldFacetStats f : facetStats) {
|
||||
f.facetTermNum(docBase + doc, term);
|
||||
}
|
||||
}else{
|
||||
for (FieldFacetStats f : facetStats) {
|
||||
f.facetMissingNum(docBase + doc);
|
||||
}
|
||||
|
||||
missingDocCount++;
|
||||
}
|
||||
}
|
||||
return missingDocCount;
|
||||
}
|
||||
|
||||
/** accumulates per-segment multi-valued stats */
|
||||
static void accumMulti(int counts[], int docBase, FieldFacetStats[] facetStats, SortedSetDocValues si, DocIdSetIterator disi, int subIndex, OrdinalMap map) throws IOException {
|
||||
|
||||
static int accumMulti(int counts[], int docBase, FieldFacetStats[] facetStats, SortedSetDocValues si, DocIdSetIterator disi, int subIndex, OrdinalMap map) throws IOException {
|
||||
final LongValues ordMap = map == null ? null : map.getGlobalOrds(subIndex);
|
||||
int missingDocCount = 0;
|
||||
int doc;
|
||||
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
si.setDocument(doc);
|
||||
long ord;
|
||||
boolean emptyTerm = true;
|
||||
while ((ord = si.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
|
||||
emptyTerm = false;
|
||||
int term = (int) ord;
|
||||
if (map != null) {
|
||||
term = (int) ordMap.get(term);
|
||||
|
@ -195,6 +209,15 @@ public class DocValuesStats {
|
|||
f.facetTermNum(docBase + doc, term);
|
||||
}
|
||||
}
|
||||
if (emptyTerm){
|
||||
for (FieldFacetStats f : facetStats) {
|
||||
f.facetMissingNum(docBase + doc);
|
||||
}
|
||||
|
||||
missingDocCount++;
|
||||
}
|
||||
}
|
||||
|
||||
return missingDocCount;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -28,10 +28,9 @@ import java.util.Map;
|
|||
import java.util.TimeZone;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.apache.solr.common.params.SolrParams;
|
||||
import org.apache.solr.common.params.CommonParams;
|
||||
import org.apache.solr.common.params.MapSolrParams;
|
||||
import org.apache.solr.common.params.SolrParams;
|
||||
import org.apache.solr.common.params.StatsParams;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.request.LocalSolrQueryRequest;
|
||||
|
@ -64,9 +63,8 @@ public class StatsComponentTest extends AbstractSolrTestCase {
|
|||
for (String f : new String[] {
|
||||
"stats_i","stats_l","stats_f","stats_d",
|
||||
"stats_ti","stats_tl","stats_tf","stats_td",
|
||||
"stats_ti_dv","stats_tl_dv","stats_tf_dv","stats_td_dv"
|
||||
// , TODO: enable this test after SOLR-6452 is fixed
|
||||
// "stats_ti_ni_dv","stats_tl_ni_dv","stats_tf_ni_dv","stats_td_ni_dv"
|
||||
"stats_ti_dv","stats_tl_dv","stats_tf_dv","stats_td_dv",
|
||||
"stats_ti_ni_dv","stats_tl_ni_dv","stats_tf_ni_dv","stats_td_ni_dv"
|
||||
}) {
|
||||
|
||||
// all of our checks should work with all of these params
|
||||
|
@ -91,9 +89,8 @@ public class StatsComponentTest extends AbstractSolrTestCase {
|
|||
|
||||
for (String f : new String[] {"stats_ii",
|
||||
"stats_tis","stats_tfs","stats_tls","stats_tds", // trie fields
|
||||
"stats_tis_dv","stats_tfs_dv","stats_tls_dv","stats_tds_dv" // Doc Values
|
||||
// , TODO: enable this test after SOLR-6452 is fixed
|
||||
//"stats_tis_ni_dv","stats_tfs_ni_dv","stats_tls_ni_dv","stats_tds_ni_dv" // Doc Values Not indexed
|
||||
"stats_tis_dv","stats_tfs_dv","stats_tls_dv","stats_tds_dv", // Doc Values
|
||||
"stats_tis_ni_dv","stats_tfs_ni_dv","stats_tls_ni_dv","stats_tds_ni_dv" // Doc Values Not indexed
|
||||
}) {
|
||||
|
||||
doTestMVFieldStatisticsResult(f);
|
||||
|
@ -153,6 +150,9 @@ public class StatsComponentTest extends AbstractSolrTestCase {
|
|||
assertU(adoc("id", "3", f, "-30", f, "-1", "active_s", "false"));
|
||||
assertU(adoc("id", "4", f, "-40", f, "10", "active_s", "false"));
|
||||
assertU(adoc("id", "5", "active_s", "false"));
|
||||
assertU(adoc("id", "6", "active_s", "false"));
|
||||
assertU(adoc("id", "7", "active_s", "true"));
|
||||
|
||||
assertU(commit());
|
||||
|
||||
// with or w/o these excluded filters, results should be the same
|
||||
|
@ -171,7 +171,7 @@ public class StatsComponentTest extends AbstractSolrTestCase {
|
|||
, "//double[@name='max'][.='200.0']"
|
||||
, "//double[@name='sum'][.='9.0']"
|
||||
, "//long[@name='count'][.='8']"
|
||||
, "//long[@name='missing'][.='1']"
|
||||
, "//long[@name='missing'][.='3']"
|
||||
, "//long[@name='countDistinct'][.='8']"
|
||||
, "count(//arr[@name='distinctValues']/*)=8"
|
||||
, "//double[@name='sumOfSquares'][.='53101.0']"
|
||||
|
@ -186,7 +186,7 @@ public class StatsComponentTest extends AbstractSolrTestCase {
|
|||
, "//double[@name='max'][.='200.0']"
|
||||
, "//double[@name='sum'][.='119.0']"
|
||||
, "//long[@name='count'][.='6']"
|
||||
, "//long[@name='missing'][.='1']"
|
||||
, "//long[@name='missing'][.='3']"
|
||||
, "//long[@name='countDistinct'][.='6']"
|
||||
, "count(//arr[@name='distinctValues']/*)=6"
|
||||
, "//double[@name='sumOfSquares'][.='43001.0']"
|
||||
|
@ -202,7 +202,7 @@ public class StatsComponentTest extends AbstractSolrTestCase {
|
|||
, "//double[@name='max'][.='200.0']"
|
||||
, "//double[@name='sum'][.='9.0']"
|
||||
, "//long[@name='count'][.='8']"
|
||||
, "//long[@name='missing'][.='1']"
|
||||
, "//long[@name='missing'][.='3']"
|
||||
, "//long[@name='countDistinct'][.='8']"
|
||||
, "count(//lst[@name='" + f + "']/arr[@name='distinctValues']/*)=8"
|
||||
, "//double[@name='sumOfSquares'][.='53101.0']"
|
||||
|
@ -216,7 +216,7 @@ public class StatsComponentTest extends AbstractSolrTestCase {
|
|||
, "//lst[@name='true']/double[@name='max'][.='200.0']"
|
||||
, "//lst[@name='true']/double[@name='sum'][.='70.0']"
|
||||
, "//lst[@name='true']/long[@name='count'][.='4']"
|
||||
, "//lst[@name='true']/long[@name='missing'][.='0']"
|
||||
, "//lst[@name='true']/long[@name='missing'][.='1']"
|
||||
, "//lst[@name='true']//long[@name='countDistinct'][.='4']"
|
||||
, "count(//lst[@name='true']/arr[@name='distinctValues']/*)=4"
|
||||
, "//lst[@name='true']/double[@name='sumOfSquares'][.='50500.0']"
|
||||
|
@ -230,7 +230,7 @@ public class StatsComponentTest extends AbstractSolrTestCase {
|
|||
, "//lst[@name='false']/double[@name='max'][.='10.0']"
|
||||
, "//lst[@name='false']/double[@name='sum'][.='-61.0']"
|
||||
, "//lst[@name='false']/long[@name='count'][.='4']"
|
||||
, "//lst[@name='false']/long[@name='missing'][.='1']"
|
||||
, "//lst[@name='false']/long[@name='missing'][.='2']"
|
||||
, "//lst[@name='true']//long[@name='countDistinct'][.='4']"
|
||||
, "count(//lst[@name='true']/arr[@name='distinctValues']/*)=4"
|
||||
, "//lst[@name='false']/double[@name='sumOfSquares'][.='2601.0']"
|
||||
|
@ -711,4 +711,31 @@ public class StatsComponentTest extends AbstractSolrTestCase {
|
|||
Collections.addAll(cat_docValues, comparables);
|
||||
return cat_docValues;
|
||||
}
|
||||
|
||||
|
||||
// public void testOtherFacetStatsResult() throws Exception {
|
||||
//
|
||||
// assertU(adoc("id", "1", "stats_tls_dv", "10", "active_i", "1"));
|
||||
// assertU(adoc("id", "2", "stats_tls_dv", "20", "active_i", "1"));
|
||||
// assertU(commit());
|
||||
// assertU(adoc("id", "3", "stats_tls_dv", "30", "active_i", "2"));
|
||||
// assertU(adoc("id", "4", "stats_tls_dv", "40", "active_i", "2"));
|
||||
// assertU(commit());
|
||||
//
|
||||
// final String pre = "//lst[@name='stats_fields']/lst[@name='stats_tls_dv']/lst[@name='facets']/lst[@name='active_i']";
|
||||
//
|
||||
// assertQ("test value for active_s=true", req("q", "*:*", "stats", "true", "stats.field", "stats_tls_dv", "stats.facet", "active_i","indent", "true")
|
||||
// , "*[count("+pre+")=1]"
|
||||
// , pre+"/lst[@name='1']/double[@name='min'][.='10.0']"
|
||||
// , pre+"/lst[@name='1']/double[@name='max'][.='20.0']"
|
||||
// , pre+"/lst[@name='1']/double[@name='sum'][.='30.0']"
|
||||
// , pre+"/lst[@name='1']/long[@name='count'][.='2']"
|
||||
// , pre+"/lst[@name='1']/long[@name='missing'][.='0']"
|
||||
// , pre + "/lst[@name='true']/long[@name='countDistinct'][.='2']"
|
||||
// , "count(" + pre + "/lst[@name='true']/arr[@name='distinctValues']/*)=2"
|
||||
// , pre+"/lst[@name='1']/double[@name='sumOfSquares'][.='500.0']"
|
||||
// , pre+"/lst[@name='1']/double[@name='mean'][.='15.0']"
|
||||
// , pre+"/lst[@name='1']/double[@name='stddev'][.='7.0710678118654755']"
|
||||
// );
|
||||
// }
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue