SOLR-1492: add missing support for multivalued stats.facet, add facet and stats support for multi-part trie fields

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@822286 13f79535-47bb-0310-9956-ffa450edef68
Yonik Seeley 2009-10-06 14:13:20 +00:00
parent 72a154af8b
commit a1614ae8d8
9 changed files with 282 additions and 121 deletions
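Illustratively, the behavior this commit enables, sketched in the style of the test changes below (field names such as stats_tis and active_s are taken from the test schema; the XPaths are abbreviated, not definitive):

    // stats over a multivalued trie int field, bucketed per term of a
    // string facet field -- the combination this commit makes work
    assertQ(req("q", "*:*",
                "stats", "true",
                "stats.field", "stats_tis",
                "stats.facet", "active_s"),
            "//lst[@name='stats_tis']/double[@name='mean']",
            "//lst[@name='active_s']/lst[@name='true']/long[@name='count']");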

FieldFacetStats.java

@@ -103,7 +103,8 @@ public class FieldFacetStats {
   }
 
-  //function to keep track of facet counts for term number
+  // Function to keep track of facet counts for term number.
+  // Currently only used by UnInvertedField stats
   public boolean facetTermNum(int docID, int statsTermNum) {
     int term = termNum[docID];

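For orientation, the UnInvertedField stats loop later in this commit drives facetTermNum once per (document, stats-term) pair it decodes; a condensed sketch of the calling pattern (variable names as they appear in that loop):

    // for each doc in the base set, and each stats term `tnum` the doc holds:
    for (FieldFacetStats f : finfo) {
      // f resolves doc -> facet term via its FieldCache.StringIndex, then
      // bumps the counter for (facet term, tnum)
      f.facetTermNum(doc, tnum);
    }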
StatsComponent.java (class SimpleStats)

@@ -34,6 +34,7 @@ import org.apache.solr.handler.component.FieldFacetStats;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.schema.FieldType;
 import org.apache.solr.schema.SchemaField;
+import org.apache.solr.schema.TrieField;
 import org.apache.solr.search.DocIterator;
 import org.apache.solr.search.DocSet;
 import org.apache.solr.search.SolrIndexSearcher;
@@ -224,7 +225,11 @@
     SchemaField sf = searcher.getSchema().getField(f);
     FieldType ft = sf.getType();
     NamedList stv;
-    if (ft.isTokenized() || sf.multiValued()) {
+
+    // Currently, only UnInvertedField can deal with multi-part trie fields
+    String prefix = TrieField.getMainValuePrefix(ft);
+
+    if (sf.multiValued() || ft.multiValuedFieldCache() || prefix!=null) {
       //use UnInvertedField for multivalued fields
       UnInvertedField uif = UnInvertedField.getUnInvertedField(f, searcher);
       stv = uif.getStats(searcher, docs, facets).getStatsValues();

SimpleFacets.java

@@ -233,7 +233,14 @@ public class SimpleFacets {
       // Always use filters for booleans... we know the number of values is very small.
       enumMethod = true;
     }
-    boolean multiToken = sf.multiValued() || ft.isTokenized();
+
+    boolean multiToken = sf.multiValued() || ft.multiValuedFieldCache();
+    if (TrieField.getMainValuePrefix(ft) != null) {
+      // A TrieField with multiple parts indexed per value... currently only
+      // UnInvertedField can handle this case, so force its use.
+      enumMethod = false;
+      multiToken = true;
+    }
 
     // unless the enum method is explicitly specified, use a counting method.
     if (enumMethod) {

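Why trie fields need this special-casing: with a precisionStep in (0, 64), a single numeric value is indexed as several terms, one per precision level, so the usual one-term-per-value faceting assumption breaks. A minimal sketch against the Lucene 2.9 NumericUtils API (assumed available here):

    import org.apache.lucene.util.NumericUtils;

    public class TriePrefixDemo {
      public static void main(String[] args) {
        // one int at precisionStep=8 -> four indexed terms (shifts 0,8,16,24);
        // only the shift==0 term carries the full-precision "main" value
        for (int shift = 0; shift < 32; shift += 8) {
          String term = NumericUtils.intToPrefixCoded(42, shift);
          // the first char encodes the shift: SHIFT_START_INT + shift
          System.out.println(shift + " -> 0x" + Integer.toHexString(term.charAt(0)));
        }
      }
    }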
UnInvertedField.java

@@ -23,12 +23,14 @@ import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermDocs;
 import org.apache.lucene.index.TermEnum;
 import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TermRangeQuery;
 import org.apache.solr.common.params.FacetParams;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.core.SolrCore;
 import org.apache.solr.schema.FieldType;
+import org.apache.solr.schema.TrieField;
 import org.apache.solr.search.*;
 import org.apache.solr.util.BoundedTreeSet;
 import org.apache.solr.handler.component.StatsValues;
@@ -171,7 +173,8 @@ public class UnInvertedField {
 
   public UnInvertedField(String field, SolrIndexSearcher searcher) throws IOException {
     this.field = field;
-    this.ti = new TermIndex(field);
+    this.ti = new TermIndex(field,
+                            TrieField.getMainValuePrefix(searcher.getSchema().getFieldType(field)));
     uninvert(searcher);
   }
@@ -641,141 +644,150 @@
     int baseSize = docs.size();
     int maxDoc = searcher.maxDoc();
 
-    if (baseSize > 0) {
-      FieldType ft = searcher.getSchema().getFieldType(field);
+    if (baseSize <= 0) return allstats;
+
+    FieldType ft = searcher.getSchema().getFieldType(field);
+
+    DocSet missing = docs.andNot( searcher.getDocSet(new TermRangeQuery(field, null, null, false, false)) );
 
     int i = 0;
     final FieldFacetStats[] finfo = new FieldFacetStats[facet.length];
     //Initialize facetstats, if facets have been passed in
     FieldCache.StringIndex si;
     for (String f : facet) {
-      ft = searcher.getSchema().getFieldType(f);
+      FieldType facet_ft = searcher.getSchema().getFieldType(f);
       try {
         si = FieldCache.DEFAULT.getStringIndex(searcher.getReader(), f);
       }
       catch (IOException e) {
         throw new RuntimeException("failed to open field cache for: " + f, e);
       }
-      finfo[i++] = new FieldFacetStats(f, si, ft, numTermsInField);
+      finfo[i] = new FieldFacetStats(f, si, facet_ft, numTermsInField);
+      i++;
     }
 
     final int[] index = this.index;
     final int[] counts = new int[numTermsInField]; //keep track of the number of times we see each word in the field for all the documents in the docset
 
     NumberedTermEnum te = ti.getEnumerator(searcher.getReader());
 
     boolean doNegative = false;
     if (finfo.length == 0) {
       //if we're collecting statistics with a facet field, can't do inverted counting
       doNegative = baseSize > maxDoc >> 1 && termInstances > 0
               && docs instanceof BitDocSet;
     }
 
     if (doNegative) {
       OpenBitSet bs = (OpenBitSet) ((BitDocSet) docs).getBits().clone();
       bs.flip(0, maxDoc);
       // TODO: when iterator across negative elements is available, use that
       // instead of creating a new bitset and inverting.
       docs = new BitDocSet(bs, maxDoc - baseSize);
       // simply negating will mean that we have deleted docs in the set.
       // that should be OK, as their entries in our table should be empty.
     }
 
     // For the biggest terms, do straight set intersections
     for (TopTerm tt : bigTerms.values()) {
       // TODO: counts could be deferred if sorted==false
       if (tt.termNum >= 0 && tt.termNum < numTermsInField) {
         if (finfo.length == 0) {
           counts[tt.termNum] = searcher.numDocs(new TermQuery(tt.term), docs);
         } else {
           //COULD BE VERY SLOW
           //if we're collecting stats for facet fields, we need to iterate on all matching documents
           DocSet bigTermDocSet = searcher.getDocSet(new TermQuery(tt.term)).intersection(docs);
           DocIterator iter = bigTermDocSet.iterator();
           while (iter.hasNext()) {
             int doc = iter.nextDoc();
             counts[tt.termNum]++;
             for (FieldFacetStats f : finfo) {
               f.facetTermNum(doc, tt.termNum);
             }
           }
         }
       }
     }
 
     if (termInstances > 0) {
       DocIterator iter = docs.iterator();
       while (iter.hasNext()) {
         int doc = iter.nextDoc();
         int code = index[doc];
 
         if ((code & 0xff) == 1) {
           int pos = code >>> 8;
           int whichArray = (doc >>> 16) & 0xff;
           byte[] arr = tnums[whichArray];
           int tnum = 0;
           for (; ;) {
             int delta = 0;
             for (; ;) {
               byte b = arr[pos++];
               delta = (delta << 7) | (b & 0x7f);
               if ((b & 0x80) == 0) break;
             }
             if (delta == 0) break;
             tnum += delta - TNUM_OFFSET;
             counts[tnum]++;
             for (FieldFacetStats f : finfo) {
               f.facetTermNum(doc, tnum);
             }
           }
         } else {
           int tnum = 0;
           int delta = 0;
           for (; ;) {
             delta = (delta << 7) | (code & 0x7f);
             if ((code & 0x80) == 0) {
               if (delta == 0) break;
               tnum += delta - TNUM_OFFSET;
               counts[tnum]++;
               for (FieldFacetStats f : finfo) {
                 f.facetTermNum(doc, tnum);
               }
               delta = 0;
             }
             code >>>= 8;
           }
         }
       }
     }
 
     // add results in index order
     for (i = 0; i < numTermsInField; i++) {
       int c = doNegative ? maxTermCounts[i] - counts[i] : counts[i];
-      if (c == 0) {
-        continue;
-      }
+      if (c == 0) continue;
       Double value = Double.parseDouble(ft.indexedToReadable(getTermText(te, i)));
       allstats.accumulate(value, c);
       //as we've parsed the termnum into a value, lets also accumulate fieldfacet statistics
       for (FieldFacetStats f : finfo) {
         f.accumulateTermNum(i, value);
       }
     }
     te.close();
 
-    int c = SimpleFacets.getFieldMissingCount(searcher, baseDocs, field);
-    if (c > 0) {
-      allstats.addMissing(c);
-    }
+    int c = missing.size();
+    allstats.addMissing(c);
 
     if (finfo.length > 0) {
       allstats.facets = new HashMap<String, Map<String, StatsValues>>();
       for (FieldFacetStats f : finfo) {
-        allstats.facets.put(f.name, f.facetStatsValues);
+        Map<String, StatsValues> facetStatsValues = f.facetStatsValues;
+        FieldType facetType = searcher.getSchema().getFieldType(f.name);
+        for (Map.Entry<String,StatsValues> entry : facetStatsValues.entrySet()) {
+          String termLabel = entry.getKey();
+          int missingCount = searcher.numDocs(new TermQuery(new Term(f.name, facetType.toInternal(termLabel))), missing);
+          entry.getValue().addMissing(missingCount);
+        }
+        allstats.facets.put(f.name, facetStatsValues);
       }
     }
-    }
 
     return allstats;
   }
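The missing-value accounting above is the heart of the fix. Previously the code asked SimpleFacets.getFieldMissingCount(...); the new version derives a reusable missing DocSet up front, which also lets per-facet-term missing counts be computed at the end. A condensed sketch of the idea, taken from the lines above:

    // An open-ended TermRangeQuery (null..null) matches every document with
    // at least one indexed term in the field, so subtracting that set from
    // the base DocSet leaves exactly the docs where the field is missing --
    // correct even for multivalued and trie fields.
    DocSet missing = docs.andNot(searcher.getDocSet(
        new TermRangeQuery(field, null, null, false, false)));
    allstats.addMissing(missing.size());
    // ...and per facet term: count the docs in `missing` holding that term.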
@@ -879,7 +891,10 @@ class NumberedTermEnum extends TermEnum {
 
   protected boolean setTerm() {
     t = tenum.term();
-    if (t==null || t.field() != tindex.fterm.field()) { // intern'd compare
+    if (t==null
+            || t.field() != tindex.fterm.field()  // intern'd compare
+            || (tindex.prefix != null && !t.text().startsWith(tindex.prefix,0)) )
+    {
       t = null;
       return false;
     }
@@ -1004,12 +1019,18 @@ class TermIndex {
   final static int interval = 1 << intervalBits;
 
   final Term fterm; // prototype to be used in term construction w/o String.intern overhead
+  final String prefix;
   String[] index;
   int nTerms;
   long sizeOfStrings;
 
   TermIndex(String field) {
+    this(field, null);
+  }
+
+  TermIndex(String field, String prefix) {
     this.fterm = new Term(field, "");
+    this.prefix = prefix;
   }
 
   Term createTerm(String termVal) {
@@ -1027,7 +1048,7 @@ class TermIndex {
      will be built.
    */
   NumberedTermEnum getEnumerator(IndexReader reader) throws IOException {
-    if (index==null) return new NumberedTermEnum(reader,this,"",0) {
+    if (index==null) return new NumberedTermEnum(reader,this, prefix==null?"":prefix, 0) {
       ArrayList<String> lst;
       protected boolean setTerm() {

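Taken together, the TermIndex prefix and the setTerm() early-out above confine enumeration to main-value terms. A condensed sketch of the effective loop (Lucene 2.9 TermEnum API; simplified from what NumberedTermEnum actually does):

    TermEnum tenum = reader.terms(new Term(field, prefix == null ? "" : prefix));
    Term t = tenum.term();
    String fieldInterned = field.intern();
    while (t != null && t.field() == fieldInterned          // intern'd compare
           && (prefix == null || t.text().startsWith(prefix))) {
      // number this term and uninvert its postings...
      t = tenum.next() ? tenum.term() : null;
    }
    tenum.close();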
FieldType.java

@@ -72,6 +72,14 @@ public abstract class FieldType extends FieldProperties {
     return (properties & MULTIVALUED) != 0;
   }
 
+  /** Returns true if a single field value of this type has multiple logical values
+   * for the purposes of faceting, sorting, etc. Text fields normally return
+   * true since each token/word is a logical value.
+   */
+  public boolean multiValuedFieldCache() {
+    return isTokenized();
+  }
+
   /** subclasses should initialize themselves with the args provided
    * and remove valid arguments. leftover arguments will cause an exception.
    * Common boolean properties have already been handled.
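The distinction matters because FieldCache-backed faceting and stats assume one indexed term per document. Illustrative expectations once the whole patch is applied (type names are assumptions from a typical schema; TrieField's override appears in the next file):

    IndexSchema schema = h.getCore().getSchema();   // as the tests below do
    assert schema.getFieldType("text").multiValuedFieldCache();   // tokenized text: many tokens per value
    assert !schema.getFieldType("tint").multiValuedFieldCache();  // trie int: one logical value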

TrieField.java

@@ -207,6 +207,11 @@ public class TrieField extends FieldType {
     return true;
   }
 
+  @Override
+  public boolean multiValuedFieldCache() {
+    return false;
+  }
+
   /**
    * @return the precisionStep used to index values into the field
    */
@@ -465,6 +470,38 @@ public class TrieField extends FieldType {
     DOUBLE,
     DATE
   }
 
+  static final String INT_PREFIX = new String(new char[]{NumericUtils.SHIFT_START_INT});
+  static final String LONG_PREFIX = new String(new char[]{NumericUtils.SHIFT_START_LONG});
+
+  /** expert internal use, subject to change.
+   * Returns null if no prefix or prefix not needed, or the prefix of the main value of a trie field
+   * that indexes multiple precisions per value.
+   */
+  public static String getMainValuePrefix(FieldType ft) {
+    if (ft instanceof TrieDateField) {
+      int step = ((TrieDateField)ft).getPrecisionStep();
+      if (step <= 0 || step >= 64) return null;
+      return LONG_PREFIX;
+    } else if (ft instanceof TrieField) {
+      TrieField trie = (TrieField)ft;
+      if (trie.precisionStep == Integer.MAX_VALUE) return null;
+      switch (trie.type) {
+        case INTEGER:
+        case FLOAT:
+          return INT_PREFIX;
+        case LONG:
+        case DOUBLE:
+        case DATE:
+          return LONG_PREFIX;
+        default:
+          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + trie.type);
+      }
+    }
+    return null;
+  }
 }
 
 class TrieDateFieldSource extends LongFieldSource {

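A worked example of the decision table above (field names are assumptions: "tint" a trie int with precisionStep=8, "pint" one with precisionStep=Integer.MAX_VALUE):

    String p = TrieField.getMainValuePrefix(schema.getFieldType("tint"));
    // several precisions are indexed per value, so main-value terms must be
    // filtered by their shift==0 prefix char
    assert p != null && p.charAt(0) == NumericUtils.SHIFT_START_INT;

    // precisionStep == Integer.MAX_VALUE indexes one term per value:
    // nothing to filter out, so no prefix is returned
    assert TrieField.getMainValuePrefix(schema.getFieldType("pint")) == null;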
TestTrie.java

@@ -18,6 +18,8 @@ package org.apache.solr;
 
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.schema.DateField;
+import org.apache.solr.schema.FieldType;
+import org.apache.solr.schema.TrieField;
 import org.apache.solr.util.AbstractSolrTestCase;
 import org.apache.solr.util.DateMathParser;
@@ -194,4 +196,78 @@ public class TestTrie extends AbstractSolrTestCase {
     String fq = "tdouble4:[" + Integer.MAX_VALUE * 2.33d + " TO " + (5l + Integer.MAX_VALUE) * 2.33d + "]";
     assertQ("Range filter must match only 5 documents", req("q", "*:*", "fq", fq), "//*[@numFound='6']");
   }
+
+  public void testTrieFacet_PrecisionStep() throws Exception {
+    // Future protect - assert 0<precisionStep<64
+    checkPrecisionSteps("tint");
+    checkPrecisionSteps("tfloat");
+    checkPrecisionSteps("tdouble");
+    checkPrecisionSteps("tlong");
+    checkPrecisionSteps("tdate");
+
+    // For tdate tests
+    SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
+    format.setTimeZone(TimeZone.getTimeZone("UTC"));
+    DateMathParser dmp = new DateMathParser(DateField.UTC, Locale.US);
+
+    for (int i = 0; i < 10; i++) {
+      long l = Integer.MAX_VALUE + i*1L;
+      // index 10 days starting with today
+      String d = format.format(i == 0 ? dmp.parseMath("/DAY") : dmp.parseMath("/DAY+" + i + "DAYS"));
+      assertU(adoc("id", String.valueOf(i), "tint", String.valueOf(i),
+              "tlong", String.valueOf(l),
+              "tfloat", String.valueOf(i * i * 31.11f),
+              "tdouble", String.valueOf(i * 2.33d),
+              "tdate", d));
+    }
+
+    for (int i = 0; i < 5; i++) {
+      long l = Integer.MAX_VALUE + i*1L;
+      String d = format.format(i == 0 ? dmp.parseMath("/DAY") : dmp.parseMath("/DAY+" + i + "DAYS"));
+      assertU(adoc("id", String.valueOf((i+1)*10), "tint", String.valueOf(i),
+              "tlong", String.valueOf(l),
+              "tfloat", String.valueOf(i * i * 31.11f),
+              "tdouble", String.valueOf(i * 2.33d),
+              "tdate", d));
+    }
+    assertU(commit());
+
+    SolrQueryRequest req = req("q", "*:*", "facet", "true", "rows", "15",
+            "facet.field", "tint",
+            "facet.field", "tlong",
+            "facet.field", "tfloat",
+            "facet.field", "tdouble",
+            "facet.date", "tdate",
+            "facet.date.start", "NOW/DAY",
+            "facet.date.end", "NOW/DAY+6DAYS",
+            "facet.date.gap", "+1DAY");
+
+    testFacetField(req, "tint", "0", "2");
+    testFacetField(req, "tint", "5", "1");
+    testFacetField(req, "tlong", String.valueOf(Integer.MAX_VALUE), "2");
+    testFacetField(req, "tlong", String.valueOf(Integer.MAX_VALUE+5L), "1");
+    testFacetField(req, "tfloat", String.valueOf(31.11f), "2");
+    testFacetField(req, "tfloat", String.valueOf(5*5*31.11f), "1");
+    testFacetField(req, "tdouble", String.valueOf(2.33d), "2");
+    testFacetField(req, "tdouble", String.valueOf(5*2.33d), "1");
+    testFacetDate(req, "tdate", format.format(dmp.parseMath("/DAY")), "4");
+    testFacetDate(req, "tdate", format.format(dmp.parseMath("/DAY+5DAYS")), "2");
+  }
+
+  private void checkPrecisionSteps(String fieldType) {
+    FieldType type = h.getCore().getSchema().getFieldType(fieldType);
+    if (type instanceof TrieField) {
+      TrieField field = (TrieField) type;
+      assertTrue(field.getPrecisionStep() > 0 && field.getPrecisionStep() < 64);
+    }
+  }
+
+  private void testFacetField(SolrQueryRequest req, String field, String value, String count) {
+    String xpath = "//lst[@name='facet_fields']/lst[@name='" + field + "']/int[@name='" + value + "'][.='" + count + "']";
+    assertQ(req, xpath);
+  }
+
+  private void testFacetDate(SolrQueryRequest req, String field, String value, String count) {
+    String xpath = "//lst[@name='facet_dates']/lst[@name='" + field + "']/int[@name='" + value + "'][.='" + count + "']";
+    assertQ(req, xpath);
+  }
 }

StatsComponentTest.java

@@ -50,14 +50,20 @@ public class StatsComponentTest extends AbstractSolrTestCase {
   }
 
   public void testStats() throws Exception {
-    for (String f : new String[] {"stats_i"}) {
+    for (String f : new String[] {
+            "stats_i","stats_l","stats_f","stats_d",
+            "stats_ti","stats_tl","stats_tf","stats_td"
+    }) {
       doTestFieldStatisticsResult(f);
       doTestFieldStatisticsMissingResult(f);
       doTestFacetStatisticsResult(f);
       doTestFacetStatisticsMissingResult(f);
     }
 
-    for (String f : new String[] {"stats_ii"}) {
+    for (String f : new String[] {"stats_ii", // plain int
+            "stats_is", // sortable int
+            "stats_tis","stats_tfs","stats_tls","stats_tds" // trie fields
+    }) {
       doTestMVFieldStatisticsResult(f);
     }
@@ -86,7 +92,6 @@ public class StatsComponentTest extends AbstractSolrTestCase {
   public void doTestMVFieldStatisticsResult(String f) throws Exception {
     assertU(adoc("id", "1", f, "-10", f, "-100", "active_s", "true"));
     assertU(adoc("id", "2", f, "-20", f, "200", "active_s", "true"));
-
     assertU(adoc("id", "3", f, "-30", f, "-1", "active_s", "false"));
     assertU(adoc("id", "4", f, "-40", f, "10", "active_s", "false"));
     assertU(adoc("id", "5", "active_s", "false"));
@@ -124,17 +129,17 @@ public class StatsComponentTest extends AbstractSolrTestCase {
             , "//lst[@name='true']/double[@name='mean'][.='17.5']"
             , "//lst[@name='true']/double[@name='stddev'][.='128.16005617976296']"
     );
-    //Test for fixing multivalued missing
-    /*assertQ("test value for active_s=false", req
+
+    assertQ("test value for active_s=false", req("q","*:*", "stats","true", "stats.field",f, "stats.facet","active_s", "indent","true")
             , "//lst[@name='false']/double[@name='min'][.='-40.0']"
             , "//lst[@name='false']/double[@name='max'][.='10.0']"
             , "//lst[@name='false']/double[@name='sum'][.='-61.0']"
             , "//lst[@name='false']/long[@name='count'][.='4']"
             , "//lst[@name='false']/long[@name='missing'][.='1']"
             , "//lst[@name='false']/double[@name='sumOfSquares'][.='2601.0']"
-            , "//lst[@name='false']/double[@name='mean'][.='-15.22']"
+            , "//lst[@name='false']/double[@name='mean'][.='-15.25']"
             , "//lst[@name='false']/double[@name='stddev'][.='23.59908190304586']"
-    );*/
+    );
   }
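A note on the re-enabled assertion: the expected mean follows directly from the documents indexed above. For active_s=false the multivalued values are {-30, -1, -40, 10}, so sum = -61 over count = 4 gives mean = -61 / 4 = -15.25; the previously commented-out test expected -15.22, which was simply wrong. Doc 5, which has no value in the stats field, accounts for missing = 1.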

schema.xml (test config)

@@ -288,6 +288,7 @@
    <dynamicField name="*_ss" type="string" indexed="true" stored="true" multiValued="true"/>
    <dynamicField name="*_ii" type="integer" indexed="true" stored="true" multiValued="true"/>
    <dynamicField name="*_i" type="sint" indexed="true" stored="true"/>
+   <dynamicField name="*_is" type="sint" indexed="true" stored="true" multiValued="true"/>
    <dynamicField name="*_l" type="slong" indexed="true" stored="true"/>
    <dynamicField name="*_f" type="sfloat" indexed="true" stored="true"/>
    <dynamicField name="*_d" type="sdouble" indexed="true" stored="true"/>