diff --git a/CHANGES.txt b/CHANGES.txt index 79d4b3f6baf..43a82e154fd 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -37,17 +37,24 @@ Detailed Change List New Features 1. SOLR-82: Default field values can be specified in the schema.xml. (Ryan McKinley via hossman) + 2. SOLR-89: Two new TokenFilters with corresponding Factories... * TrimFilter - Trims leading and trailing whitespace from Tokens * PatternReplaceFilter - applies a Pattern to each token in the stream, replacing match occurances with a specified replacement. (hossman) + 3. SOLR-91: allow configuration of a limit of the number of searchers that can be warming in the background. This can be used to avoid out-of-memory errors, or contention caused by more and more searchers warming in the background. An error is thrown if the limit specified by maxWarmingSearchers in solrconfig.xml is exceeded. (yonik) + 4. SOLR-106: New faceting parameters that allow specification of a + minimum count for returned facets (facet.mincount), paging through facets + (facet.offset, facet.limit), and explicit sorting (facet.sort). + facet.zeros is now deprecated. (yonik) + Changes in runtime behavior 1. Highlighting using DisMax will only pick up terms from the main user query, not boost or filter queries (klaas). @@ -58,9 +65,11 @@ Optimizations Bug Fixes 1. SOLR-87: Parsing of synonym files did not correctly handle escaped whitespace such as \r\n\t\b\f. (yonik) + 2. SOLR-92: DOMUtils.getText (used when parsing config files) did not work properly with many DOM implementations when dealing with "Attributes". (Ryan McKinley via hossman) + 3. SOLR-9,SOLR-99: Tighten up sort specification error checking, throw exceptions for missing sort specifications or a sort on a non-indexed field. (Ryan McKinley via yonik) diff --git a/src/java/org/apache/solr/request/SimpleFacets.java b/src/java/org/apache/solr/request/SimpleFacets.java index 44aae8bed76..4944b8b4edd 100644 --- a/src/java/org/apache/solr/request/SimpleFacets.java +++ b/src/java/org/apache/solr/request/SimpleFacets.java @@ -20,10 +20,12 @@ package org.apache.solr.request; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermEnum; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.TermDocs; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.search.*; import org.apache.solr.core.SolrCore; import org.apache.solr.core.SolrException; +import org.apache.solr.core.SolrConfig; import org.apache.solr.request.SolrParams; import org.apache.solr.schema.IndexSchema; import org.apache.solr.schema.FieldType; @@ -118,20 +120,29 @@ public class SimpleFacets { public NamedList getTermCounts(String field) throws IOException { + int offset = params.getFieldInt(field, params.FACET_OFFSET, 0); int limit = params.getFieldInt(field, params.FACET_LIMIT, 100); - boolean zeros = params.getFieldBool(field, params.FACET_ZEROS, true); + Integer mincount = params.getFieldInt(field, params.FACET_MINCOUNT); + if (mincount==null) { + Boolean zeros = params.getFieldBool(field, params.FACET_ZEROS); + // mincount = (zeros!=null && zeros) ? 0 : 1; + mincount = (zeros!=null && !zeros) ? 1 : 0; + // current default is to include zeros. + } boolean missing = params.getFieldBool(field, params.FACET_MISSING, false); + // default to sorting if there is a limit. + boolean sort = params.getFieldBool(field, params.FACET_SORT, limit>0); NamedList counts; SchemaField sf = searcher.getSchema().getField(field); FieldType ft = sf.getType(); if (sf.multiValued() || ft.isTokenized() || ft instanceof BoolField) { // Always use filters for booleans... we know the number of values is very small. - counts = getFacetTermEnumCounts(searcher,docs,field,limit,zeros,missing); + counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount,missing,sort); } else { // TODO: future logic could use filters instead of the fieldcache if // the number of terms in the field is small enough. - counts = getFieldCacheCounts(searcher, docs, field, limit, zeros, missing); + counts = getFieldCacheCounts(searcher, docs, field, offset,limit, mincount, missing, sort); } return counts; @@ -177,7 +188,7 @@ public class SimpleFacets { * Use the Lucene FieldCache to get counts for each unique field value in docs. * The field must have at most one indexed token per document. */ - public static NamedList getFieldCacheCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int limit, boolean zeros, boolean missing) throws IOException { + public static NamedList getFieldCacheCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, boolean sort) throws IOException { // TODO: If the number of terms is high compared to docs.size(), and zeros==false, // we should use an alternate strategy to avoid // 1) creating another huge int[] for the counts @@ -188,7 +199,7 @@ public class SimpleFacets { // FieldCache.StringIndex si = FieldCache.DEFAULT.getStringIndex(searcher.getReader(), fieldName); - int[] count = new int[si.lookup.length]; + final int[] count = new int[si.lookup.length]; DocIterator iter = docs.iterator(); while (iter.hasNext()) { count[si.order[iter.nextDoc()]]++; @@ -200,42 +211,51 @@ public class SimpleFacets { // IDEA: we could also maintain a count of "other"... everything that fell outside // of the top 'N' - BoundedTreeSet> queue=null; + int off=offset; + int lim=limit>=0 ? limit : Integer.MAX_VALUE; - if (limit>=0) { + if (sort) { // TODO: compare performance of BoundedTreeSet compare against priority queue? - queue = new BoundedTreeSet>(limit); - } - - int min=-1; // the smallest value in the top 'N' values - for (int i=1; imin) { - // NOTE: we use c>min rather than c>=min as an optimization because we are going in - // index order, so we already know that the keys are ordered. This can be very - // important if a lot of the counts are repeated (like zero counts would be). - queue.add(new CountPair(ft.indexedToReadable(si.lookup[i]), c)); - if (queue.size()>=limit) min=queue.last().val; + int maxsize = limit>0 ? offset+limit : Integer.MAX_VALUE-1; + final BoundedTreeSet> queue = new BoundedTreeSet>(maxsize); + int min=mincount-1; // the smallest value in the top 'N' values + for (int i=1; imin) { + // NOTE: we use c>min rather than c>=min as an optimization because we are going in + // index order, so we already know that the keys are ordered. This can be very + // important if a lot of the counts are repeated (like zero counts would be). + queue.add(new CountPair(ft.indexedToReadable(si.lookup[i]), c)); + if (queue.size()>=maxsize) min=queue.last().val; + } } - } - - if (limit>=0) { for (CountPair p : queue) { + if (--off>=0) continue; + if (--lim<0) break; res.add(p.key, p.val); } + } else if (mincount<=0) { + // This is an optimization... if mincount<=0 and we aren't sorting then + // we know exactly where to start and end in the fieldcache. + for (int i=offset+1; i=0) continue; + if (--lim<0) break; + res.add(ft.indexedToReadable(si.lookup[i]), c); + } } - if (missing) res.add(null, count[0]); return res; } /** * Returns a list of terms in the specified field along with the - * corrisponding count of documents in the set that match that constraint. + * corresponding count of documents in the set that match that constraint. * This method uses the FilterCache to get the intersection count between docs * and the DocSet for each term in the filter. * @@ -243,7 +263,7 @@ public class SimpleFacets { * @see SolrParams#FACET_ZEROS * @see SolrParams#FACET_MISSING */ - public NamedList getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int limit, boolean zeros, boolean missing) + public NamedList getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing, boolean sort) throws IOException { /* :TODO: potential optimization... @@ -255,13 +275,13 @@ public class SimpleFacets { IndexReader r = searcher.getReader(); FieldType ft = schema.getFieldType(field); - Set> counts - = new HashSet>(); - - if (0 <= limit) { - counts = new BoundedTreeSet>(limit); - } + final int maxsize = limit>=0 ? offset+limit : Integer.MAX_VALUE-1; + final BoundedTreeSet> queue = sort ? new BoundedTreeSet>(maxsize) : null; + final NamedList res = new NamedList(); + int min=mincount-1; // the smallest value in the top 'N' values + int off=offset; + int lim=limit>=0 ? limit : Integer.MAX_VALUE; TermEnum te = r.terms(new Term(field,"")); do { Term t = te.term(); @@ -269,26 +289,37 @@ public class SimpleFacets { if (null == t || ! t.field().equals(field)) break; - if (0 < te.docFreq()) { /* all docs may be deleted */ - int count = searcher.numDocs(new TermQuery(t), - docs); + int df = te.docFreq(); - if (zeros || 0 < count) - counts.add(new CountPair - (t.text(), count)); + if (df>0) { /* check df since all docs may be deleted */ + int c = searcher.numDocs(new TermQuery(t), docs); + if (sort) { + if (c>min) { + queue.add(new CountPair(t.text(), c)); + if (queue.size()>=maxsize) min=queue.last().val; + } + } else { + if (c >= mincount && --off<0) { + if (--lim<0) break; + res.add(ft.indexedToReadable(t.text()), c); + } + } } } while (te.next()); - NamedList res = new NamedList(); - for (CountPair p : counts) { - res.add(ft.indexedToReadable(p.key), p.val); + if (sort) { + for (CountPair p : queue) { + if (--off>=0) continue; + if (--lim<0) break; + res.add(ft.indexedToReadable(p.key), p.val); + } } if (missing) { res.add(null, getFieldMissingCount(searcher,docs,field)); } - + return res; } diff --git a/src/java/org/apache/solr/request/SolrParams.java b/src/java/org/apache/solr/request/SolrParams.java index 37356809554..8a878902ebb 100644 --- a/src/java/org/apache/solr/request/SolrParams.java +++ b/src/java/org/apache/solr/request/SolrParams.java @@ -83,17 +83,32 @@ public abstract class SolrParams { * Facet Contraint Counts (multi-value) */ public static final String FACET_FIELD = "facet.field"; + + /** + * The offset into the list of facets. + * Can be overriden on a per field basis. + */ + public static final String FACET_OFFSET = "facet.offset"; + /** * Numeric option indicating the maximum number of facet field counts * be included in the response for each field - in descending order of count. * Can be overriden on a per field basis. */ public static final String FACET_LIMIT = "facet.limit"; + + /** + * Numeric option indicating the minimum number of hits before a facet should + * be included in the response. Can be overriden on a per field basis. + */ + public static final String FACET_MINCOUNT = "facet.mincount"; + /** * Boolean option indicating whether facet field counts of "0" should * be included in the response. Can be overriden on a per field basis. */ public static final String FACET_ZEROS = "facet.zeros"; + /** * Boolean option indicating whether the response should include a * facet field count for all records which have no value for the @@ -101,6 +116,11 @@ public abstract class SolrParams { */ public static final String FACET_MISSING = "facet.missing"; + /** + * Boolean option: true causes facets to be sorted + * by the count, false results in natural index order. + */ + public static final String FACET_SORT = "facet.sort"; /** returns the String value of a param, or null if not set */ public abstract String get(String param); @@ -166,6 +186,13 @@ public abstract class SolrParams { String val = get(param); return val==null ? def : Integer.parseInt(val); } + + /** Returns the int value of the field param, + or the value for param, or def if neither is set. */ + public Integer getFieldInt(String field, String param) { + String val = getFieldParam(field, param); + return val==null ? null : Integer.parseInt(val); + } /** Returns the int value of the field param, or the value for param, or def if neither is set. */ @@ -174,6 +201,7 @@ public abstract class SolrParams { return val==null ? def : Integer.parseInt(val); } + /** Returns the Float value of the param, or null if not set */ public Float getFloat(String param) { String val = get(param); diff --git a/src/test/org/apache/solr/BasicFunctionalityTest.java b/src/test/org/apache/solr/BasicFunctionalityTest.java index b614ca72d1d..8f0c6484b91 100644 --- a/src/test/org/apache/solr/BasicFunctionalityTest.java +++ b/src/test/org/apache/solr/BasicFunctionalityTest.java @@ -240,7 +240,7 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase { ); } - /** @see TestRemoveDuplicatesTokenFilter */ + /** @see org.apache.solr.analysis.TestRemoveDuplicatesTokenFilter */ public void testRemoveDuplicatesTokenFilter() { Query q = QueryParsing.parseQuery("TV", "dedup", h.getCore().getSchema()); @@ -508,76 +508,240 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase { ,"//lst[@name='trait_s']/int[@name='Chauvinist'][.='1']" ,"//lst[@name='trait_s']/int[not(@name)][.='1']" ); - + + assertQ("check counts with facet.mincount=1&facet.missing=true using fq", + req("q", "id:[42 TO 47]" + ,"facet", "true" + ,"facet.mincount", "1" + ,"f.trait_s.facet.missing", "true" + ,"fq", "id:[42 TO 45]" + ,"facet.field", "trait_s" + ) + ,"*[count(//doc)=4]" + ,"*[count(//lst[@name='trait_s']/int)=4]" + ,"//lst[@name='trait_s']/int[@name='Tool'][.='2']" + ,"//lst[@name='trait_s']/int[@name='Obnoxious'][.='1']" + ,"//lst[@name='trait_s']/int[@name='Chauvinist'][.='1']" + ,"//lst[@name='trait_s']/int[not(@name)][.='1']" + ); + + assertQ("check counts with facet.mincount=2&facet.missing=true using fq", + req("q", "id:[42 TO 47]" + ,"facet", "true" + ,"facet.mincount", "2" + ,"f.trait_s.facet.missing", "true" + ,"fq", "id:[42 TO 45]" + ,"facet.field", "trait_s" + ) + ,"*[count(//doc)=4]" + ,"*[count(//lst[@name='trait_s']/int)=2]" + ,"//lst[@name='trait_s']/int[@name='Tool'][.='2']" + ,"//lst[@name='trait_s']/int[not(@name)][.='1']" + ); + + assertQ("check sorted paging", + req("q", "id:[42 TO 47]" + ,"facet", "true" + ,"fq", "id:[42 TO 45]" + ,"facet.field", "trait_s" + ,"facet.mincount","0" + ,"facet.offset","0" + ,"facet.limit","4" + ) + ,"*[count(//lst[@name='trait_s']/int)=4]" + ,"//lst[@name='trait_s']/int[@name='Tool'][.='2']" + ,"//lst[@name='trait_s']/int[@name='Obnoxious'][.='1']" + ,"//lst[@name='trait_s']/int[@name='Chauvinist'][.='1']" + ,"//lst[@name='trait_s']/int[@name='Pig'][.='0']" + ); + + assertQ("check sorted paging", + req("q", "id:[42 TO 47]" + ,"facet", "true" + ,"fq", "id:[42 TO 45]" + ,"facet.field", "trait_s" + ,"facet.mincount","0" + ,"facet.offset","0" + ,"facet.limit","3" + ,"sort","true" + ) + ,"*[count(//lst[@name='trait_s']/int)=3]" + ,"//lst[@name='trait_s']/int[@name='Tool'][.='2']" + ,"//lst[@name='trait_s']/int[@name='Obnoxious'][.='1']" + ,"//lst[@name='trait_s']/int[@name='Chauvinist'][.='1']" + ); + } - public void testSimpleFacetCountsWithLimits() { - assertU(adoc("id", "1", "t_s", "A")); - assertU(adoc("id", "2", "t_s", "B")); - assertU(adoc("id", "3", "t_s", "C")); - assertU(adoc("id", "4", "t_s", "C")); - assertU(adoc("id", "5", "t_s", "D")); - assertU(adoc("id", "6", "t_s", "E")); - assertU(adoc("id", "7", "t_s", "E")); - assertU(adoc("id", "8", "t_s", "E")); - assertU(adoc("id", "9", "t_s", "F")); - assertU(adoc("id", "10", "t_s", "G")); - assertU(adoc("id", "11", "t_s", "G")); - assertU(adoc("id", "12", "t_s", "G")); - assertU(adoc("id", "13", "t_s", "G")); - assertU(adoc("id", "14", "t_s", "G")); + + public void testFacetMultiValued() { + doSimpleFacetCountsWithLimits("t_s"); + } + + public void testFacetSingleValued() { + doSimpleFacetCountsWithLimits("t_s1"); + } + + public void doSimpleFacetCountsWithLimits(String f) { + String pre = "//lst[@name='"+f+"']"; + String notc = "id:[* TO *] -"+f+":C"; + + assertU(adoc("id", "1", f, "A")); + assertU(adoc("id", "2", f, "B")); + assertU(adoc("id", "3", f, "C")); + assertU(adoc("id", "4", f, "C")); + assertU(adoc("id", "5", f, "D")); + assertU(adoc("id", "6", f, "E")); + assertU(adoc("id", "7", f, "E")); + assertU(adoc("id", "8", f, "E")); + assertU(adoc("id", "9", f, "F")); + assertU(adoc("id", "10", f, "G")); + assertU(adoc("id", "11", f, "G")); + assertU(adoc("id", "12", f, "G")); + assertU(adoc("id", "13", f, "G")); + assertU(adoc("id", "14", f, "G")); assertU(commit()); - + assertQ("check counts for unlimited facet", req("q", "id:[* TO *]" ,"facet", "true" - ,"facet.field", "t_s" + ,"facet.field", f ) - ,"*[count(//lst[@name='facet_fields']/lst[@name='t_s']/int)=7]" - - ,"//lst[@name='t_s']/int[@name='G'][.='5']" - ,"//lst[@name='t_s']/int[@name='E'][.='3']" - ,"//lst[@name='t_s']/int[@name='C'][.='2']" - - ,"//lst[@name='t_s']/int[@name='A'][.='1']" - ,"//lst[@name='t_s']/int[@name='B'][.='1']" - ,"//lst[@name='t_s']/int[@name='D'][.='1']" - ,"//lst[@name='t_s']/int[@name='F'][.='1']" + ,"*[count(//lst[@name='facet_fields']/lst/int)=7]" + + ,pre+"/int[@name='G'][.='5']" + ,pre+"/int[@name='E'][.='3']" + ,pre+"/int[@name='C'][.='2']" + + ,pre+"/int[@name='A'][.='1']" + ,pre+"/int[@name='B'][.='1']" + ,pre+"/int[@name='D'][.='1']" + ,pre+"/int[@name='F'][.='1']" ); - + assertQ("check counts for facet with generous limit", req("q", "id:[* TO *]" ,"facet", "true" ,"facet.limit", "100" - ,"facet.field", "t_s" + ,"facet.field", f ) - ,"*[count(//lst[@name='facet_fields']/lst[@name='t_s']/int)=7]" - - ,"//lst[@name='t_s']/int[1][@name='G'][.='5']" - ,"//lst[@name='t_s']/int[2][@name='E'][.='3']" - ,"//lst[@name='t_s']/int[3][@name='C'][.='2']" - - ,"//lst[@name='t_s']/int[@name='A'][.='1']" - ,"//lst[@name='t_s']/int[@name='B'][.='1']" - ,"//lst[@name='t_s']/int[@name='D'][.='1']" - ,"//lst[@name='t_s']/int[@name='F'][.='1']" + ,"*[count(//lst[@name='facet_fields']/lst/int)=7]" + + ,pre+"/int[1][@name='G'][.='5']" + ,pre+"/int[2][@name='E'][.='3']" + ,pre+"/int[3][@name='C'][.='2']" + + ,pre+"/int[@name='A'][.='1']" + ,pre+"/int[@name='B'][.='1']" + ,pre+"/int[@name='D'][.='1']" + ,pre+"/int[@name='F'][.='1']" ); - + assertQ("check counts for limited facet", req("q", "id:[* TO *]" ,"facet", "true" ,"facet.limit", "2" - ,"facet.field", "t_s" + ,"facet.field", f ) - ,"*[count(//lst[@name='facet_fields']/lst[@name='t_s']/int)=2]" - - ,"//lst[@name='t_s']/int[1][@name='G'][.='5']" - ,"//lst[@name='t_s']/int[2][@name='E'][.='3']" - ); - - } - + ,"*[count(//lst[@name='facet_fields']/lst/int)=2]" + ,pre+"/int[1][@name='G'][.='5']" + ,pre+"/int[2][@name='E'][.='3']" + ); + + assertQ("check offset", + req("q", "id:[* TO *]" + ,"facet", "true" + ,"facet.offset", "1" + ,"facet.limit", "1" + ,"facet.field", f + ) + ,"*[count(//lst[@name='facet_fields']/lst/int)=1]" + + ,pre+"/int[1][@name='E'][.='3']" + ); + + assertQ("test sorted facet paging with zero (don't count in limit)", + req("q", "id:[* TO *]" + ,"fq",notc + ,"facet", "true" + ,"facet.field", f + ,"facet.mincount","1" + ,"facet.offset","0" + ,"facet.limit","6" + ) + ,"*[count(//lst[@name='facet_fields']/lst/int)=6]" + ,pre+"/int[1][@name='G'][.='5']" + ,pre+"/int[2][@name='E'][.='3']" + ,pre+"/int[3][@name='A'][.='1']" + ,pre+"/int[4][@name='B'][.='1']" + ,pre+"/int[5][@name='D'][.='1']" + ,pre+"/int[6][@name='F'][.='1']" + ); + + assertQ("test sorted facet paging with zero (test offset correctness)", + req("q", "id:[* TO *]" + ,"fq",notc + ,"facet", "true" + ,"facet.field", f + ,"facet.mincount","1" + ,"facet.offset","3" + ,"facet.limit","2" + ,"facet.sort","true" + ) + ,"*[count(//lst[@name='facet_fields']/lst/int)=2]" + ,pre+"/int[1][@name='B'][.='1']" + ,pre+"/int[2][@name='D'][.='1']" + ); + + assertQ("test facet unsorted paging", + req("q", "id:[* TO *]" + ,"fq",notc + ,"facet", "true" + ,"facet.field", f + ,"facet.mincount","1" + ,"facet.offset","0" + ,"facet.limit","6" + ,"facet.sort","false" + ) + ,"*[count(//lst[@name='facet_fields']/lst/int)=6]" + ,pre+"/int[1][@name='A'][.='1']" + ,pre+"/int[2][@name='B'][.='1']" + ,pre+"/int[3][@name='D'][.='1']" + ,pre+"/int[4][@name='E'][.='3']" + ,pre+"/int[5][@name='F'][.='1']" + ,pre+"/int[6][@name='G'][.='5']" + ); + + assertQ("test facet unsorted paging", + req("q", "id:[* TO *]" + ,"fq",notc + ,"facet", "true" + ,"facet.field", f + ,"facet.mincount","1" + ,"facet.offset","3" + ,"facet.limit","2" + ,"facet.sort","false" + ) + ,"*[count(//lst[@name='facet_fields']/lst/int)=2]" + ,pre+"/int[1][@name='E'][.='3']" + ,pre+"/int[2][@name='F'][.='1']" + ); + + assertQ("test facet unsorted paging, mincount=2", + req("q", "id:[* TO *]" + ,"fq",notc + ,"facet", "true" + ,"facet.field", f + ,"facet.mincount","2" + ,"facet.offset","1" + ,"facet.limit","2" + ,"facet.sort","false" + ) + ,"*[count(//lst[@name='facet_fields']/lst/int)=1]" + ,pre+"/int[1][@name='G'][.='5']" + ); + } private String mkstr(int len) { StringBuilder sb = new StringBuilder(len); diff --git a/src/test/test-files/solr/conf/schema.xml b/src/test/test-files/solr/conf/schema.xml index fdc0e312ede..af064873853 100644 --- a/src/test/test-files/solr/conf/schema.xml +++ b/src/test/test-files/solr/conf/schema.xml @@ -386,6 +386,7 @@ --> +