SOLR-3855: Fix faceting on numeric fields with precisionStep < Integer.MAX_VALUE, facet.mincount=0 and facet.method=fcs.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1449360 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Adrien Grand 2013-02-23 16:28:13 +00:00
parent 69c559c00b
commit e6231ad585
4 changed files with 142 additions and 47 deletions

View File

@ -35,7 +35,6 @@ import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.UnInvertedField;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.TrieField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.SolrIndexSearcher;
@ -214,10 +213,7 @@ class SimpleStats {
FieldType ft = sf.getType();
NamedList<?> stv;
// Currently, only UnInvertedField can deal with multi-part trie fields
String prefix = TrieField.getMainValuePrefix(ft);
if (sf.multiValued() || ft.multiValuedFieldCache() || prefix!=null) {
if (sf.multiValued() || ft.multiValuedFieldCache()) {
//use UnInvertedField for multivalued fields
UnInvertedField uif = UnInvertedField.getUnInvertedField(f, searcher);
stv = uif.getStats(searcher, docs, facets).getStatsValues();

View File

@ -40,10 +40,12 @@ import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.StringHelper;
import org.apache.solr.common.params.FacetParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.TrieField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.SolrIndexSearcher;
@ -266,10 +268,28 @@ final class NumericFacets {
}
final Terms terms = searcher.getAtomicReader().terms(fieldName);
if (terms != null) {
final String prefixStr = TrieField.getMainValuePrefix(ft);
final BytesRef prefix;
if (prefixStr != null) {
prefix = new BytesRef(prefixStr);
} else {
prefix = new BytesRef();
}
final TermsEnum termsEnum = terms.iterator(null);
BytesRef term = termsEnum.next();
BytesRef term;
switch (termsEnum.seekCeil(prefix)) {
case FOUND:
case NOT_FOUND:
term = termsEnum.term();
break;
case END:
term = null;
break;
default:
throw new AssertionError();
}
final CharsRef spare = new CharsRef();
for (int skipped = hashTable.size; skipped < offset && term != null; ) {
for (int skipped = hashTable.size; skipped < offset && term != null && StringHelper.startsWith(term, prefix); ) {
ft.indexedToReadable(term, spare);
final String termStr = spare.toString();
if (!alreadySeen.contains(termStr)) {
@ -277,7 +297,7 @@ final class NumericFacets {
}
term = termsEnum.next();
}
for ( ; term != null && (limit < 0 || result.size() < limit); term = termsEnum.next()) {
for ( ; term != null && StringHelper.startsWith(term, prefix) && (limit < 0 || result.size() < limit); term = termsEnum.next()) {
ft.indexedToReadable(term, spare);
final String termStr = spare.toString();
if (!alreadySeen.contains(termStr)) {
@ -301,13 +321,31 @@ final class NumericFacets {
}
final Terms terms = searcher.getAtomicReader().terms(fieldName);
if (terms != null) {
final String prefixStr = TrieField.getMainValuePrefix(ft);
final BytesRef prefix;
if (prefixStr != null) {
prefix = new BytesRef(prefixStr);
} else {
prefix = new BytesRef();
}
final TermsEnum termsEnum = terms.iterator(null);
BytesRef term;
switch (termsEnum.seekCeil(prefix)) {
case FOUND:
case NOT_FOUND:
term = termsEnum.term();
break;
case END:
term = null;
break;
default:
throw new AssertionError();
}
final CharsRef spare = new CharsRef();
BytesRef term = termsEnum.next();
for (int i = 0; i < offset && term != null; ++i) {
for (int i = 0; i < offset && term != null && StringHelper.startsWith(term, prefix); ++i) {
term = termsEnum.next();
}
for ( ; term != null && (limit < 0 || result.size() < limit); term = termsEnum.next()) {
for ( ; term != null && StringHelper.startsWith(term, prefix) && (limit < 0 || result.size() < limit); term = termsEnum.next()) {
ft.indexedToReadable(term, spare);
final String termStr = spare.toString();
Integer count = counts.get(termStr);

View File

@ -320,6 +320,10 @@ public class SimpleFacets {
return collector.getGroupCount();
}
enum FacetMethod {
ENUM, FC, FCS;
}
public NamedList<Integer> getTermCounts(String field) throws IOException {
int offset = params.getFieldInt(field, FacetParams.FACET_OFFSET, 0);
int limit = params.getFieldInt(field, FacetParams.FACET_LIMIT, 100);
@ -342,57 +346,84 @@ public class SimpleFacets {
FieldType ft = sf.getType();
// determine what type of faceting method to use
String method = params.getFieldParam(field, FacetParams.FACET_METHOD);
boolean enumMethod = FacetParams.FACET_METHOD_enum.equals(method);
final String methodStr = params.getFieldParam(field, FacetParams.FACET_METHOD);
FacetMethod method = null;
if (FacetParams.FACET_METHOD_enum.equals(methodStr)) {
method = FacetMethod.ENUM;
} else if (FacetParams.FACET_METHOD_fcs.equals(methodStr)) {
method = FacetMethod.FCS;
} else if (FacetParams.FACET_METHOD_fc.equals(methodStr)) {
method = FacetMethod.FC;
}
// TODO: default to per-segment or not?
boolean per_segment = FacetParams.FACET_METHOD_fcs.equals(method) // explicit
|| (ft.getNumericType() != null && sf.hasDocValues()); // numeric doc values are per-segment by default
if (method == FacetMethod.ENUM && TrieField.getMainValuePrefix(ft) != null) {
// enum can't deal with trie fields that index several terms per value
method = sf.multiValued() ? FacetMethod.FC : FacetMethod.FCS;
}
if (method == null && ft instanceof BoolField) {
// Always use filters for booleans... we know the number of values is very small.
enumMethod = true;
method = FacetMethod.ENUM;
}
boolean multiToken = sf.multiValued() || ft.multiValuedFieldCache();
if (TrieField.getMainValuePrefix(ft) != null) {
// A TrieField with multiple parts indexed per value... currently only
// UnInvertedField can handle this case, so force it's use.
enumMethod = false;
multiToken = true;
final boolean multiToken = sf.multiValued() || ft.multiValuedFieldCache();
if (method == null && ft.getNumericType() != null && !sf.multiValued()) {
// the per-segment approach is optimal for numeric field types since there
// are no global ords to merge and no need to create an expensive
// top-level reader
method = FacetMethod.FCS;
}
if (ft.getNumericType() != null && sf.hasDocValues()) {
// only fcs is able to leverage the numeric field caches
method = FacetMethod.FCS;
}
if (method == null) {
// TODO: default to per-segment or not?
method = FacetMethod.FC;
}
if (method == FacetMethod.FCS && multiToken) {
// only fc knows how to deal with multi-token fields
method = FacetMethod.FC;
}
if (params.getFieldBool(field, GroupParams.GROUP_FACET, false)) {
counts = getGroupedCounts(searcher, docs, field, multiToken, offset,limit, mincount, missing, sort, prefix);
} else {
// unless the enum method is explicitly specified, use a counting method.
if (enumMethod) {
counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount,missing,sort,prefix);
} else {
if (multiToken) {
UnInvertedField uif = UnInvertedField.getUnInvertedField(field, searcher);
counts = uif.getCounts(searcher, docs, offset, limit, mincount,missing,sort,prefix);
} else {
// TODO: future logic could use filters instead of the fieldcache if
// the number of terms in the field is small enough.
if (per_segment) {
if (ft.getNumericType() != null && !sf.multiValued()) {
// force numeric faceting
if (prefix != null && !prefix.isEmpty()) {
throw new SolrException(ErrorCode.BAD_REQUEST, FacetParams.FACET_PREFIX + " is not supported on numeric types");
}
counts = NumericFacets.getCounts(searcher, docs, field, offset, limit, mincount, missing, sort);
} else {
PerSegmentSingleValuedFaceting ps = new PerSegmentSingleValuedFaceting(searcher, docs, field, offset,limit, mincount, missing, sort, prefix);
Executor executor = threads == 0 ? directExecutor : facetExecutor;
ps.setNumThreads(threads);
counts = ps.getFacetCounts(executor);
assert method != null;
switch (method) {
case ENUM:
assert TrieField.getMainValuePrefix(ft) == null;
counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount,missing,sort,prefix);
break;
case FCS:
assert !multiToken;
if (ft.getNumericType() != null && !sf.multiValued()) {
// force numeric faceting
if (prefix != null && !prefix.isEmpty()) {
throw new SolrException(ErrorCode.BAD_REQUEST, FacetParams.FACET_PREFIX + " is not supported on numeric types");
}
counts = NumericFacets.getCounts(searcher, docs, field, offset, limit, mincount, missing, sort);
} else {
PerSegmentSingleValuedFaceting ps = new PerSegmentSingleValuedFaceting(searcher, docs, field, offset,limit, mincount, missing, sort, prefix);
Executor executor = threads == 0 ? directExecutor : facetExecutor;
ps.setNumThreads(threads);
counts = ps.getFacetCounts(executor);
}
break;
case FC:
if (multiToken || TrieField.getMainValuePrefix(ft) != null) {
UnInvertedField uif = UnInvertedField.getUnInvertedField(field, searcher);
counts = uif.getCounts(searcher, docs, offset, limit, mincount,missing,sort,prefix);
} else {
counts = getFieldCacheCounts(searcher, docs, field, offset,limit, mincount, missing, sort, prefix);
}
}
break;
default:
throw new AssertionError();
}
}

View File

@ -17,6 +17,8 @@
package org.apache.solr.request;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Random;
@ -25,6 +27,7 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.params.FacetParams;
import org.junit.After;
import org.junit.BeforeClass;
import org.junit.Test;
@ -263,5 +266,32 @@ public class TestFaceting extends SolrTestCaseJ4 {
);
}
@Test
public void testTrieFields() {
// make sure that terms are correctly filtered even for trie fields that index several
// terms for a single value
List<String> fields = new ArrayList<String>();
fields.add("id");
fields.add("7");
final String[] suffixes = new String[] {"ti", "tis", "tf", "tfs", "tl", "tls", "td", "tds"};
for (String suffix : suffixes) {
fields.add("f_" + suffix);
fields.add("42");
}
assertU(adoc(fields.toArray(new String[0])));
assertU(commit());
for (String suffix : suffixes) {
for (String facetMethod : new String[] {FacetParams.FACET_METHOD_enum, FacetParams.FACET_METHOD_fc, FacetParams.FACET_METHOD_fcs}) {
for (String facetSort : new String[] {FacetParams.FACET_SORT_COUNT, FacetParams.FACET_SORT_INDEX}) {
for (String value : new String[] {"42", "43"}) { // match or not
final String field = "f_" + suffix;
assertQ("field=" + field + ",method=" + facetMethod + ",sort=" + facetSort,
req("q", field + ":" + value, FacetParams.FACET, "true", FacetParams.FACET_FIELD, field, FacetParams.FACET_MINCOUNT, "0", FacetParams.FACET_SORT, facetSort, FacetParams.FACET_METHOD, facetMethod),
"*[count(//lst[@name='" + field + "']/int)=1]"); // exactly 1 facet count
}
}
}
}
}
}