From 55afc2031c7aeb75f23e66f9817131cd3798a5b2 Mon Sep 17 00:00:00 2001 From: Christine Poerschke Date: Thu, 9 Feb 2017 11:19:21 +0000 Subject: [PATCH] SOLR-9914: SimpleFacets: refactor "contains" check into "SubstringBytesRefFilter" class. (Jonny Marks, David Smiley, Christine Poerschke) --- solr/CHANGES.txt | 3 + .../apache/solr/request/DocValuesFacets.java | 20 +++-- .../PerSegmentSingleValuedFaceting.java | 14 ++-- .../org/apache/solr/request/SimpleFacets.java | 80 ++++++++++--------- .../solr/request/SubstringBytesRefFilter.java | 52 ++++++++++++ .../apache/solr/request/SimpleFacetsTest.java | 20 ----- .../request/SubstringBytesRefFilterTest.java | 51 ++++++++++++ 7 files changed, 170 insertions(+), 70 deletions(-) create mode 100644 solr/core/src/java/org/apache/solr/request/SubstringBytesRefFilter.java create mode 100644 solr/core/src/test/org/apache/solr/request/SubstringBytesRefFilterTest.java diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index d6d97243bcc..4d8aa0163e4 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -177,6 +177,9 @@ Other Changes * SOLR-9800: Factor out FacetComponent.newSimpleFacets method. (Jonny Marks via Christine Poerschke) +* SOLR-9914: SimpleFacets: refactor "contains" check into "SubstringBytesRefFilter" class. + (Jonny Marks, David Smiley, Christine Poerschke) + ================== 6.4.1 ================== Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release. diff --git a/solr/core/src/java/org/apache/solr/request/DocValuesFacets.java b/solr/core/src/java/org/apache/solr/request/DocValuesFacets.java index 3714bf1b9f6..e9498f8a7d9 100644 --- a/solr/core/src/java/org/apache/solr/request/DocValuesFacets.java +++ b/solr/core/src/java/org/apache/solr/request/DocValuesFacets.java @@ -18,6 +18,7 @@ package org.apache.solr.request; import java.io.IOException; import java.util.List; +import java.util.function.Predicate; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.LeafReaderContext; @@ -57,8 +58,13 @@ import org.apache.solr.util.LongPriorityQueue; */ public class DocValuesFacets { private DocValuesFacets() {} - + public static NamedList getCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, String contains, boolean ignoreCase, FacetDebugInfo fdebug) throws IOException { + final Predicate termFilter = new SubstringBytesRefFilter(contains, ignoreCase); + return getCounts(searcher, docs, fieldName, offset, limit, mincount, missing, sort, prefix, termFilter, fdebug); + } + + public static NamedList getCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, Predicate termFilter, FacetDebugInfo fdebug) throws IOException { SchemaField schemaField = searcher.getSchema().getField(fieldName); FieldType ft = schemaField.getType(); NamedList res = new NamedList<>(); @@ -178,9 +184,9 @@ public class DocValuesFacets { // index order, so we already know that the keys are ordered. This can be very // important if a lot of the counts are repeated (like zero counts would be). - if (contains != null) { + if (termFilter != null) { final BytesRef term = si.lookupOrd(startTermIndex+i); - if (!SimpleFacets.contains(term.utf8ToString(), contains, ignoreCase)) { + if (!termFilter.test(term)) { continue; } } @@ -213,8 +219,8 @@ public class DocValuesFacets { } else { // add results in index order int i=(startTermIndex==-1)?1:0; - if (mincount<=0 && contains == null) { - // if mincount<=0 and we're not examining the values for contains, then + if (mincount<=0 && termFilter == null) { + // if mincount<=0 and we're not examining the values for the term filter, then // we won't discard any terms and we know exactly where to start. i+=off; off=0; @@ -224,9 +230,9 @@ public class DocValuesFacets { int c = counts[i]; if (c termFilter; Filter baseSet; int nThreads; public PerSegmentSingleValuedFaceting(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, String contains, boolean ignoreCase) { + this(searcher, docs, fieldName, offset, limit, mincount, missing, sort, prefix, new SubstringBytesRefFilter(contains, ignoreCase)); + } + + public PerSegmentSingleValuedFaceting(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, Predicate filter) { this.searcher = searcher; this.docs = docs; this.fieldName = fieldName; @@ -79,8 +83,7 @@ class PerSegmentSingleValuedFaceting { this.missing = missing; this.sort = sort; this.prefix = prefix; - this.contains = contains; - this.ignoreCase = ignoreCase; + this.termFilter = filter; } public void setNumThreads(int threads) { @@ -183,8 +186,7 @@ class PerSegmentSingleValuedFaceting { while (queue.size() > 0) { SegFacet seg = queue.top(); - // if facet.contains specified, only actually collect the count if substring contained - boolean collect = contains == null || SimpleFacets.contains(seg.tempBR.utf8ToString(), contains, ignoreCase); + boolean collect = termFilter == null || termFilter.test(seg.tempBR); // we will normally end up advancing the term enum for this segment // while still using "val", so we need to make a copy since the BytesRef diff --git a/solr/core/src/java/org/apache/solr/request/SimpleFacets.java b/solr/core/src/java/org/apache/solr/request/SimpleFacets.java index bc2877c6f05..ac8e9109516 100644 --- a/solr/core/src/java/org/apache/solr/request/SimpleFacets.java +++ b/solr/core/src/java/org/apache/solr/request/SimpleFacets.java @@ -33,6 +33,7 @@ import java.util.concurrent.RunnableFuture; import java.util.concurrent.Semaphore; import java.util.concurrent.SynchronousQueue; import java.util.concurrent.TimeUnit; +import java.util.function.Predicate; import org.apache.commons.lang.StringUtils; import org.apache.lucene.index.Fields; @@ -169,26 +170,6 @@ public class SimpleFacets { this.fdebugParent = fdebugParent; } - /** - * Returns true if a String contains the given substring. Otherwise - * false. - * - * @param ref - * the {@link String} to test - * @param substring - * the substring to look for - * @param ignoreCase - * whether the comparison should be case-insensitive - * @return Returns true iff the String contains the given substring. - * Otherwise false. - */ - public static boolean contains(String ref, String substring, boolean ignoreCase) { - if (ignoreCase) - return StringUtils.containsIgnoreCase(ref, substring); - return StringUtils.contains(ref, substring); - } - - protected ParsedParams parseParams(String type, String param) throws SyntaxError, IOException { SolrParams localParams = QueryParsing.getLocalParams(param, req.getParams()); DocSet docs = docsOrig; @@ -362,6 +343,17 @@ public class SimpleFacets { ENUM, FC, FCS, UIF; } + protected Predicate newBytesRefFilter(String field, SolrParams params) { + final String contains = params.getFieldParam(field, FacetParams.FACET_CONTAINS); + final boolean ignoreCase = params.getFieldBool(field, FacetParams.FACET_CONTAINS_IGNORE_CASE, false); + + if (contains == null) { + return null; + } + + return new SubstringBytesRefFilter(contains, ignoreCase); + } + /** * Term counts for use in pivot faceting that resepcts the appropriate mincount * @see FacetParams#FACET_PIVOT_MINCOUNT @@ -405,8 +397,9 @@ public class SimpleFacets { // default to sorting if there is a limit. String sort = params.getFieldParam(field, FacetParams.FACET_SORT, limit>0 ? FacetParams.FACET_SORT_COUNT : FacetParams.FACET_SORT_INDEX); String prefix = params.getFieldParam(field, FacetParams.FACET_PREFIX); - String contains = params.getFieldParam(field, FacetParams.FACET_CONTAINS); - boolean ignoreCase = params.getFieldBool(field, FacetParams.FACET_CONTAINS_IGNORE_CASE, false); + + final Predicate termFilter = newBytesRefFilter(field, params); + boolean exists = params.getFieldBool(field, FacetParams.FACET_EXISTS, false); NamedList counts; @@ -448,14 +441,13 @@ public class SimpleFacets { } if (params.getFieldBool(field, GroupParams.GROUP_FACET, false)) { - counts = getGroupedCounts(searcher, docs, field, multiToken, offset,limit, mincount, missing, sort, prefix, contains, ignoreCase); + counts = getGroupedCounts(searcher, docs, field, multiToken, offset,limit, mincount, missing, sort, prefix, termFilter); } else { assert appliedFacetMethod != null; switch (appliedFacetMethod) { case ENUM: assert TrieField.getMainValuePrefix(ft) == null; - counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount,missing,sort,prefix, contains, ignoreCase, - exists); + counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount,missing,sort,prefix, termFilter, exists); break; case FCS: assert !multiToken; @@ -464,12 +456,15 @@ public class SimpleFacets { if (prefix != null && !prefix.isEmpty()) { throw new SolrException(ErrorCode.BAD_REQUEST, FacetParams.FACET_PREFIX + " is not supported on numeric types"); } - if (contains != null && !contains.isEmpty()) { - throw new SolrException(ErrorCode.BAD_REQUEST, FacetParams.FACET_CONTAINS + " is not supported on numeric types"); + if (termFilter != null) { + final boolean supportedOperation = (termFilter instanceof SubstringBytesRefFilter) && ((SubstringBytesRefFilter) termFilter).substring().isEmpty(); + if (!supportedOperation) { + throw new SolrException(ErrorCode.BAD_REQUEST, FacetParams.FACET_CONTAINS + " is not supported on numeric types"); + } } counts = NumericFacets.getCounts(searcher, docs, field, offset, limit, mincount, missing, sort); } else { - PerSegmentSingleValuedFaceting ps = new PerSegmentSingleValuedFaceting(searcher, docs, field, offset, limit, mincount, missing, sort, prefix, contains, ignoreCase); + PerSegmentSingleValuedFaceting ps = new PerSegmentSingleValuedFaceting(searcher, docs, field, offset, limit, mincount, missing, sort, prefix, termFilter); Executor executor = threads == 0 ? directExecutor : facetExecutor; ps.setNumThreads(threads); counts = ps.getFacetCounts(executor); @@ -532,7 +527,7 @@ public class SimpleFacets { } break; case FC: - counts = DocValuesFacets.getCounts(searcher, docs, field, offset,limit, mincount, missing, sort, prefix, contains, ignoreCase, fdebug); + counts = DocValuesFacets.getCounts(searcher, docs, field, offset,limit, mincount, missing, sort, prefix, termFilter, fdebug); break; default: throw new AssertionError(); @@ -644,8 +639,7 @@ public class SimpleFacets { boolean missing, String sort, String prefix, - String contains, - boolean ignoreCase) throws IOException { + Predicate termFilter) throws IOException { GroupingSpecification groupingSpecification = rb.getGroupingSpec(); final String groupField = groupingSpecification != null ? groupingSpecification.getFields()[0] : null; if (groupField == null) { @@ -675,8 +669,8 @@ public class SimpleFacets { List scopedEntries = result.getFacetEntries(offset, limit < 0 ? Integer.MAX_VALUE : limit); for (TermGroupFacetCollector.FacetEntry facetEntry : scopedEntries) { - //:TODO:can we do contains earlier than this to make it more efficient? - if (contains != null && !contains(facetEntry.getValue().utf8ToString(), contains, ignoreCase)) { + //:TODO:can we filter earlier than this to make it more efficient? + if (termFilter != null && !termFilter.test(facetEntry.getValue())) { continue; } facetFieldType.indexedToReadable(facetEntry.getValue(), charsRef); @@ -851,6 +845,18 @@ public class SimpleFacets { return docs.andNotSize(hasVal); } + /** + * Works like {@link #getFacetTermEnumCounts(SolrIndexSearcher, DocSet, String, int, int, int, boolean, String, String, Predicate, boolean)} + * but takes a substring directly for the contains check rather than a {@link Predicate} instance. + */ + public NamedList getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing, + String sort, String prefix, String contains, boolean ignoreCase, boolean intersectsCheck) + throws IOException { + + final Predicate termFilter = new SubstringBytesRefFilter(contains, ignoreCase); + return getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount, missing, sort, prefix, termFilter, intersectsCheck); + } + /** * Returns a list of terms in the specified field along with the * corresponding count of documents in the set that match that constraint. @@ -861,8 +867,8 @@ public class SimpleFacets { * @see FacetParams#FACET_ZEROS * @see FacetParams#FACET_MISSING */ - public NamedList getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing, - String sort, String prefix, String contains, boolean ignoreCase, boolean intersectsCheck) + public NamedList getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing, + String sort, String prefix, Predicate termFilter, boolean intersectsCheck) throws IOException { /* :TODO: potential optimization... @@ -934,7 +940,7 @@ public class SimpleFacets { if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes)) break; - if (contains == null || contains(term.utf8ToString(), contains, ignoreCase)) { + if (termFilter == null || termFilter.test(term)) { int df = termsEnum.docFreq(); // If we are sorting, we can use df>min (rather than >=) since we @@ -1148,4 +1154,4 @@ public class SimpleFacets { public ResponseBuilder getResponseBuilder() { return rb; } -} \ No newline at end of file +} diff --git a/solr/core/src/java/org/apache/solr/request/SubstringBytesRefFilter.java b/solr/core/src/java/org/apache/solr/request/SubstringBytesRefFilter.java new file mode 100644 index 00000000000..623cb550859 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/request/SubstringBytesRefFilter.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.request; + +import java.util.function.Predicate; + +import org.apache.lucene.util.BytesRef; +import org.apache.commons.lang.StringUtils; + +/** + * An implementation of {@link Predicate} which returns true if the BytesRef contains a given substring. + */ +public class SubstringBytesRefFilter implements Predicate { + final private String contains; + final private boolean ignoreCase; + + public SubstringBytesRefFilter(String contains, boolean ignoreCase) { + this.contains = contains; + this.ignoreCase = ignoreCase; + } + + public String substring() { + return contains; + } + + protected boolean includeString(String term) { + if (ignoreCase) { + return StringUtils.containsIgnoreCase(term, contains); + } + + return StringUtils.contains(term, contains); + } + + @Override + public boolean test(BytesRef term) { + return includeString(term.utf8ToString()); + } +} diff --git a/solr/core/src/test/org/apache/solr/request/SimpleFacetsTest.java b/solr/core/src/test/org/apache/solr/request/SimpleFacetsTest.java index a21135e113c..bec3aa0dfdc 100644 --- a/solr/core/src/test/org/apache/solr/request/SimpleFacetsTest.java +++ b/solr/core/src/test/org/apache/solr/request/SimpleFacetsTest.java @@ -2684,26 +2684,6 @@ public class SimpleFacetsTest extends SolrTestCaseJ4 { "facet.range.gap", "0.0003"), 400); } - - public void testContainsAtStart() { - assertTrue(SimpleFacets.contains("foobar", "foo", false)); - } - - public void testContains() { - assertTrue(SimpleFacets.contains("foobar", "ooba", false)); - } - - public void testContainsAtEnd() { - assertTrue(SimpleFacets.contains("foobar", "bar", false)); - } - - public void testContainsWhole() { - assertTrue(SimpleFacets.contains("foobar", "foobar", false)); - } - - public void testContainsIgnoreCase() { - assertTrue(SimpleFacets.contains("FooBar", "bar", true)); - } public void testRangeQueryHardEndParamFilter() { doTestRangeQueryHardEndParam("range_facet_l", FacetRangeMethod.FILTER); diff --git a/solr/core/src/test/org/apache/solr/request/SubstringBytesRefFilterTest.java b/solr/core/src/test/org/apache/solr/request/SubstringBytesRefFilterTest.java new file mode 100644 index 00000000000..a5b304882f8 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/request/SubstringBytesRefFilterTest.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.request; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.BytesRef; +import org.junit.Test; + +public class SubstringBytesRefFilterTest extends LuceneTestCase { + + @Test + public void testSubstringBytesRefFilter() { + final List substrings = new ArrayList<>(4); + substrings.add("foo"); + substrings.add("ooba"); + substrings.add("bar"); + substrings.add("foobar"); + + final String contains = substrings.get(random().nextInt(substrings.size())); + final boolean ignoreCase = random().nextBoolean(); + final SubstringBytesRefFilter filter = new SubstringBytesRefFilter(contains, ignoreCase); + + assertTrue(filter.test(new BytesRef("foobar"))); + + if (ignoreCase) { + assertTrue(filter.test(new BytesRef("FooBar"))); + } else { + assertFalse(filter.test(new BytesRef("FooBar"))); + } + + assertFalse(filter.test(new BytesRef("qux"))); + } + +}