SOLR-9914: SimpleFacets: refactor "contains" check into "SubstringBytesRefFilter" class. (Jonny Marks, David Smiley, Christine Poerschke)

This commit is contained in:
Christine Poerschke 2017-02-09 11:19:21 +00:00
parent 6696eafaae
commit 55afc2031c
7 changed files with 170 additions and 70 deletions

View File

@ -177,6 +177,9 @@ Other Changes
* SOLR-9800: Factor out FacetComponent.newSimpleFacets method. (Jonny Marks via Christine Poerschke)
* SOLR-9914: SimpleFacets: refactor "contains" check into "SubstringBytesRefFilter" class.
(Jonny Marks, David Smiley, Christine Poerschke)
================== 6.4.1 ==================
Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.

View File

@ -18,6 +18,7 @@ package org.apache.solr.request;
import java.io.IOException;
import java.util.List;
import java.util.function.Predicate;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
@ -57,8 +58,13 @@ import org.apache.solr.util.LongPriorityQueue;
*/
public class DocValuesFacets {
private DocValuesFacets() {}
public static NamedList<Integer> getCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, String contains, boolean ignoreCase, FacetDebugInfo fdebug) throws IOException {
final Predicate<BytesRef> termFilter = new SubstringBytesRefFilter(contains, ignoreCase);
return getCounts(searcher, docs, fieldName, offset, limit, mincount, missing, sort, prefix, termFilter, fdebug);
}
public static NamedList<Integer> getCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, Predicate<BytesRef> termFilter, FacetDebugInfo fdebug) throws IOException {
SchemaField schemaField = searcher.getSchema().getField(fieldName);
FieldType ft = schemaField.getType();
NamedList<Integer> res = new NamedList<>();
@ -178,9 +184,9 @@ public class DocValuesFacets {
// index order, so we already know that the keys are ordered. This can be very
// important if a lot of the counts are repeated (like zero counts would be).
if (contains != null) {
if (termFilter != null) {
final BytesRef term = si.lookupOrd(startTermIndex+i);
if (!SimpleFacets.contains(term.utf8ToString(), contains, ignoreCase)) {
if (!termFilter.test(term)) {
continue;
}
}
@ -213,8 +219,8 @@ public class DocValuesFacets {
} else {
// add results in index order
int i=(startTermIndex==-1)?1:0;
if (mincount<=0 && contains == null) {
// if mincount<=0 and we're not examining the values for contains, then
if (mincount<=0 && termFilter == null) {
// if mincount<=0 and we're not examining the values for the term filter, then
// we won't discard any terms and we know exactly where to start.
i+=off;
off=0;
@ -224,9 +230,9 @@ public class DocValuesFacets {
int c = counts[i];
if (c<mincount) continue;
BytesRef term = null;
if (contains != null) {
if (termFilter != null) {
term = si.lookupOrd(startTermIndex+i);
if (!SimpleFacets.contains(term.utf8ToString(), contains, ignoreCase)) {
if (!termFilter.test(term)) {
continue;
}
}

View File

@ -25,6 +25,7 @@ import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.Future;
import java.util.function.Predicate;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
@ -62,14 +63,17 @@ class PerSegmentSingleValuedFaceting {
String sort;
String prefix;
private String contains;
private boolean ignoreCase;
private final Predicate<BytesRef> termFilter;
Filter baseSet;
int nThreads;
public PerSegmentSingleValuedFaceting(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, String contains, boolean ignoreCase) {
this(searcher, docs, fieldName, offset, limit, mincount, missing, sort, prefix, new SubstringBytesRefFilter(contains, ignoreCase));
}
public PerSegmentSingleValuedFaceting(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, Predicate<BytesRef> filter) {
this.searcher = searcher;
this.docs = docs;
this.fieldName = fieldName;
@ -79,8 +83,7 @@ class PerSegmentSingleValuedFaceting {
this.missing = missing;
this.sort = sort;
this.prefix = prefix;
this.contains = contains;
this.ignoreCase = ignoreCase;
this.termFilter = filter;
}
public void setNumThreads(int threads) {
@ -183,8 +186,7 @@ class PerSegmentSingleValuedFaceting {
while (queue.size() > 0) {
SegFacet seg = queue.top();
// if facet.contains specified, only actually collect the count if substring contained
boolean collect = contains == null || SimpleFacets.contains(seg.tempBR.utf8ToString(), contains, ignoreCase);
boolean collect = termFilter == null || termFilter.test(seg.tempBR);
// we will normally end up advancing the term enum for this segment
// while still using "val", so we need to make a copy since the BytesRef

View File

@ -33,6 +33,7 @@ import java.util.concurrent.RunnableFuture;
import java.util.concurrent.Semaphore;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.TimeUnit;
import java.util.function.Predicate;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.index.Fields;
@ -169,26 +170,6 @@ public class SimpleFacets {
this.fdebugParent = fdebugParent;
}
/**
* Returns <code>true</code> if a String contains the given substring. Otherwise
* <code>false</code>.
*
* @param ref
* the {@link String} to test
* @param substring
* the substring to look for
* @param ignoreCase
* whether the comparison should be case-insensitive
* @return Returns <code>true</code> iff the String contains the given substring.
* Otherwise <code>false</code>.
*/
public static boolean contains(String ref, String substring, boolean ignoreCase) {
if (ignoreCase)
return StringUtils.containsIgnoreCase(ref, substring);
return StringUtils.contains(ref, substring);
}
protected ParsedParams parseParams(String type, String param) throws SyntaxError, IOException {
SolrParams localParams = QueryParsing.getLocalParams(param, req.getParams());
DocSet docs = docsOrig;
@ -362,6 +343,17 @@ public class SimpleFacets {
ENUM, FC, FCS, UIF;
}
protected Predicate<BytesRef> newBytesRefFilter(String field, SolrParams params) {
final String contains = params.getFieldParam(field, FacetParams.FACET_CONTAINS);
final boolean ignoreCase = params.getFieldBool(field, FacetParams.FACET_CONTAINS_IGNORE_CASE, false);
if (contains == null) {
return null;
}
return new SubstringBytesRefFilter(contains, ignoreCase);
}
/**
* Term counts for use in pivot faceting that resepcts the appropriate mincount
* @see FacetParams#FACET_PIVOT_MINCOUNT
@ -405,8 +397,9 @@ public class SimpleFacets {
// default to sorting if there is a limit.
String sort = params.getFieldParam(field, FacetParams.FACET_SORT, limit>0 ? FacetParams.FACET_SORT_COUNT : FacetParams.FACET_SORT_INDEX);
String prefix = params.getFieldParam(field, FacetParams.FACET_PREFIX);
String contains = params.getFieldParam(field, FacetParams.FACET_CONTAINS);
boolean ignoreCase = params.getFieldBool(field, FacetParams.FACET_CONTAINS_IGNORE_CASE, false);
final Predicate<BytesRef> termFilter = newBytesRefFilter(field, params);
boolean exists = params.getFieldBool(field, FacetParams.FACET_EXISTS, false);
NamedList<Integer> counts;
@ -448,14 +441,13 @@ public class SimpleFacets {
}
if (params.getFieldBool(field, GroupParams.GROUP_FACET, false)) {
counts = getGroupedCounts(searcher, docs, field, multiToken, offset,limit, mincount, missing, sort, prefix, contains, ignoreCase);
counts = getGroupedCounts(searcher, docs, field, multiToken, offset,limit, mincount, missing, sort, prefix, termFilter);
} else {
assert appliedFacetMethod != null;
switch (appliedFacetMethod) {
case ENUM:
assert TrieField.getMainValuePrefix(ft) == null;
counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount,missing,sort,prefix, contains, ignoreCase,
exists);
counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount,missing,sort,prefix, termFilter, exists);
break;
case FCS:
assert !multiToken;
@ -464,12 +456,15 @@ public class SimpleFacets {
if (prefix != null && !prefix.isEmpty()) {
throw new SolrException(ErrorCode.BAD_REQUEST, FacetParams.FACET_PREFIX + " is not supported on numeric types");
}
if (contains != null && !contains.isEmpty()) {
throw new SolrException(ErrorCode.BAD_REQUEST, FacetParams.FACET_CONTAINS + " is not supported on numeric types");
if (termFilter != null) {
final boolean supportedOperation = (termFilter instanceof SubstringBytesRefFilter) && ((SubstringBytesRefFilter) termFilter).substring().isEmpty();
if (!supportedOperation) {
throw new SolrException(ErrorCode.BAD_REQUEST, FacetParams.FACET_CONTAINS + " is not supported on numeric types");
}
}
counts = NumericFacets.getCounts(searcher, docs, field, offset, limit, mincount, missing, sort);
} else {
PerSegmentSingleValuedFaceting ps = new PerSegmentSingleValuedFaceting(searcher, docs, field, offset, limit, mincount, missing, sort, prefix, contains, ignoreCase);
PerSegmentSingleValuedFaceting ps = new PerSegmentSingleValuedFaceting(searcher, docs, field, offset, limit, mincount, missing, sort, prefix, termFilter);
Executor executor = threads == 0 ? directExecutor : facetExecutor;
ps.setNumThreads(threads);
counts = ps.getFacetCounts(executor);
@ -532,7 +527,7 @@ public class SimpleFacets {
}
break;
case FC:
counts = DocValuesFacets.getCounts(searcher, docs, field, offset,limit, mincount, missing, sort, prefix, contains, ignoreCase, fdebug);
counts = DocValuesFacets.getCounts(searcher, docs, field, offset,limit, mincount, missing, sort, prefix, termFilter, fdebug);
break;
default:
throw new AssertionError();
@ -644,8 +639,7 @@ public class SimpleFacets {
boolean missing,
String sort,
String prefix,
String contains,
boolean ignoreCase) throws IOException {
Predicate<BytesRef> termFilter) throws IOException {
GroupingSpecification groupingSpecification = rb.getGroupingSpec();
final String groupField = groupingSpecification != null ? groupingSpecification.getFields()[0] : null;
if (groupField == null) {
@ -675,8 +669,8 @@ public class SimpleFacets {
List<TermGroupFacetCollector.FacetEntry> scopedEntries
= result.getFacetEntries(offset, limit < 0 ? Integer.MAX_VALUE : limit);
for (TermGroupFacetCollector.FacetEntry facetEntry : scopedEntries) {
//:TODO:can we do contains earlier than this to make it more efficient?
if (contains != null && !contains(facetEntry.getValue().utf8ToString(), contains, ignoreCase)) {
//:TODO:can we filter earlier than this to make it more efficient?
if (termFilter != null && !termFilter.test(facetEntry.getValue())) {
continue;
}
facetFieldType.indexedToReadable(facetEntry.getValue(), charsRef);
@ -851,6 +845,18 @@ public class SimpleFacets {
return docs.andNotSize(hasVal);
}
/**
* Works like {@link #getFacetTermEnumCounts(SolrIndexSearcher, DocSet, String, int, int, int, boolean, String, String, Predicate, boolean)}
* but takes a substring directly for the contains check rather than a {@link Predicate} instance.
*/
public NamedList<Integer> getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing,
String sort, String prefix, String contains, boolean ignoreCase, boolean intersectsCheck)
throws IOException {
final Predicate<BytesRef> termFilter = new SubstringBytesRefFilter(contains, ignoreCase);
return getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount, missing, sort, prefix, termFilter, intersectsCheck);
}
/**
* Returns a list of terms in the specified field along with the
* corresponding count of documents in the set that match that constraint.
@ -861,8 +867,8 @@ public class SimpleFacets {
* @see FacetParams#FACET_ZEROS
* @see FacetParams#FACET_MISSING
*/
public NamedList<Integer> getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing,
String sort, String prefix, String contains, boolean ignoreCase, boolean intersectsCheck)
public NamedList<Integer> getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing,
String sort, String prefix, Predicate<BytesRef> termFilter, boolean intersectsCheck)
throws IOException {
/* :TODO: potential optimization...
@ -934,7 +940,7 @@ public class SimpleFacets {
if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes))
break;
if (contains == null || contains(term.utf8ToString(), contains, ignoreCase)) {
if (termFilter == null || termFilter.test(term)) {
int df = termsEnum.docFreq();
// If we are sorting, we can use df>min (rather than >=) since we
@ -1148,4 +1154,4 @@ public class SimpleFacets {
public ResponseBuilder getResponseBuilder() {
return rb;
}
}
}

View File

@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.request;
import java.util.function.Predicate;
import org.apache.lucene.util.BytesRef;
import org.apache.commons.lang.StringUtils;
/**
* An implementation of {@link Predicate} which returns true if the BytesRef contains a given substring.
*/
public class SubstringBytesRefFilter implements Predicate<BytesRef> {
final private String contains;
final private boolean ignoreCase;
public SubstringBytesRefFilter(String contains, boolean ignoreCase) {
this.contains = contains;
this.ignoreCase = ignoreCase;
}
public String substring() {
return contains;
}
protected boolean includeString(String term) {
if (ignoreCase) {
return StringUtils.containsIgnoreCase(term, contains);
}
return StringUtils.contains(term, contains);
}
@Override
public boolean test(BytesRef term) {
return includeString(term.utf8ToString());
}
}

View File

@ -2684,26 +2684,6 @@ public class SimpleFacetsTest extends SolrTestCaseJ4 {
"facet.range.gap", "0.0003"),
400);
}
public void testContainsAtStart() {
assertTrue(SimpleFacets.contains("foobar", "foo", false));
}
public void testContains() {
assertTrue(SimpleFacets.contains("foobar", "ooba", false));
}
public void testContainsAtEnd() {
assertTrue(SimpleFacets.contains("foobar", "bar", false));
}
public void testContainsWhole() {
assertTrue(SimpleFacets.contains("foobar", "foobar", false));
}
public void testContainsIgnoreCase() {
assertTrue(SimpleFacets.contains("FooBar", "bar", true));
}
public void testRangeQueryHardEndParamFilter() {
doTestRangeQueryHardEndParam("range_facet_l", FacetRangeMethod.FILTER);

View File

@ -0,0 +1,51 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.request;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.BytesRef;
import org.junit.Test;
public class SubstringBytesRefFilterTest extends LuceneTestCase {
@Test
public void testSubstringBytesRefFilter() {
final List<String> substrings = new ArrayList<>(4);
substrings.add("foo");
substrings.add("ooba");
substrings.add("bar");
substrings.add("foobar");
final String contains = substrings.get(random().nextInt(substrings.size()));
final boolean ignoreCase = random().nextBoolean();
final SubstringBytesRefFilter filter = new SubstringBytesRefFilter(contains, ignoreCase);
assertTrue(filter.test(new BytesRef("foobar")));
if (ignoreCase) {
assertTrue(filter.test(new BytesRef("FooBar")));
} else {
assertFalse(filter.test(new BytesRef("FooBar")));
}
assertFalse(filter.test(new BytesRef("qux")));
}
}