mirror of https://github.com/apache/lucene.git
SOLR-9914: SimpleFacets: refactor "contains" check into "SubstringBytesRefFilter" class. (Jonny Marks, David Smiley, Christine Poerschke)
This commit is contained in:
parent
6696eafaae
commit
55afc2031c
|
@ -177,6 +177,9 @@ Other Changes
|
|||
|
||||
* SOLR-9800: Factor out FacetComponent.newSimpleFacets method. (Jonny Marks via Christine Poerschke)
|
||||
|
||||
* SOLR-9914: SimpleFacets: refactor "contains" check into "SubstringBytesRefFilter" class.
|
||||
(Jonny Marks, David Smiley, Christine Poerschke)
|
||||
|
||||
================== 6.4.1 ==================
|
||||
|
||||
Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.
|
||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.solr.request;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
import org.apache.lucene.index.DocValues;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
|
@ -57,8 +58,13 @@ import org.apache.solr.util.LongPriorityQueue;
|
|||
*/
|
||||
public class DocValuesFacets {
|
||||
private DocValuesFacets() {}
|
||||
|
||||
|
||||
public static NamedList<Integer> getCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, String contains, boolean ignoreCase, FacetDebugInfo fdebug) throws IOException {
|
||||
final Predicate<BytesRef> termFilter = new SubstringBytesRefFilter(contains, ignoreCase);
|
||||
return getCounts(searcher, docs, fieldName, offset, limit, mincount, missing, sort, prefix, termFilter, fdebug);
|
||||
}
|
||||
|
||||
public static NamedList<Integer> getCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, Predicate<BytesRef> termFilter, FacetDebugInfo fdebug) throws IOException {
|
||||
SchemaField schemaField = searcher.getSchema().getField(fieldName);
|
||||
FieldType ft = schemaField.getType();
|
||||
NamedList<Integer> res = new NamedList<>();
|
||||
|
@ -178,9 +184,9 @@ public class DocValuesFacets {
|
|||
// index order, so we already know that the keys are ordered. This can be very
|
||||
// important if a lot of the counts are repeated (like zero counts would be).
|
||||
|
||||
if (contains != null) {
|
||||
if (termFilter != null) {
|
||||
final BytesRef term = si.lookupOrd(startTermIndex+i);
|
||||
if (!SimpleFacets.contains(term.utf8ToString(), contains, ignoreCase)) {
|
||||
if (!termFilter.test(term)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
@ -213,8 +219,8 @@ public class DocValuesFacets {
|
|||
} else {
|
||||
// add results in index order
|
||||
int i=(startTermIndex==-1)?1:0;
|
||||
if (mincount<=0 && contains == null) {
|
||||
// if mincount<=0 and we're not examining the values for contains, then
|
||||
if (mincount<=0 && termFilter == null) {
|
||||
// if mincount<=0 and we're not examining the values for the term filter, then
|
||||
// we won't discard any terms and we know exactly where to start.
|
||||
i+=off;
|
||||
off=0;
|
||||
|
@ -224,9 +230,9 @@ public class DocValuesFacets {
|
|||
int c = counts[i];
|
||||
if (c<mincount) continue;
|
||||
BytesRef term = null;
|
||||
if (contains != null) {
|
||||
if (termFilter != null) {
|
||||
term = si.lookupOrd(startTermIndex+i);
|
||||
if (!SimpleFacets.contains(term.utf8ToString(), contains, ignoreCase)) {
|
||||
if (!termFilter.test(term)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -25,6 +25,7 @@ import java.util.concurrent.ExecutionException;
|
|||
import java.util.concurrent.Executor;
|
||||
import java.util.concurrent.ExecutorCompletionService;
|
||||
import java.util.concurrent.Future;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
import org.apache.lucene.index.DocValues;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
|
@ -62,14 +63,17 @@ class PerSegmentSingleValuedFaceting {
|
|||
String sort;
|
||||
String prefix;
|
||||
|
||||
private String contains;
|
||||
private boolean ignoreCase;
|
||||
private final Predicate<BytesRef> termFilter;
|
||||
|
||||
Filter baseSet;
|
||||
|
||||
int nThreads;
|
||||
|
||||
public PerSegmentSingleValuedFaceting(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, String contains, boolean ignoreCase) {
|
||||
this(searcher, docs, fieldName, offset, limit, mincount, missing, sort, prefix, new SubstringBytesRefFilter(contains, ignoreCase));
|
||||
}
|
||||
|
||||
public PerSegmentSingleValuedFaceting(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, Predicate<BytesRef> filter) {
|
||||
this.searcher = searcher;
|
||||
this.docs = docs;
|
||||
this.fieldName = fieldName;
|
||||
|
@ -79,8 +83,7 @@ class PerSegmentSingleValuedFaceting {
|
|||
this.missing = missing;
|
||||
this.sort = sort;
|
||||
this.prefix = prefix;
|
||||
this.contains = contains;
|
||||
this.ignoreCase = ignoreCase;
|
||||
this.termFilter = filter;
|
||||
}
|
||||
|
||||
public void setNumThreads(int threads) {
|
||||
|
@ -183,8 +186,7 @@ class PerSegmentSingleValuedFaceting {
|
|||
while (queue.size() > 0) {
|
||||
SegFacet seg = queue.top();
|
||||
|
||||
// if facet.contains specified, only actually collect the count if substring contained
|
||||
boolean collect = contains == null || SimpleFacets.contains(seg.tempBR.utf8ToString(), contains, ignoreCase);
|
||||
boolean collect = termFilter == null || termFilter.test(seg.tempBR);
|
||||
|
||||
// we will normally end up advancing the term enum for this segment
|
||||
// while still using "val", so we need to make a copy since the BytesRef
|
||||
|
|
|
@ -33,6 +33,7 @@ import java.util.concurrent.RunnableFuture;
|
|||
import java.util.concurrent.Semaphore;
|
||||
import java.util.concurrent.SynchronousQueue;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.lucene.index.Fields;
|
||||
|
@ -169,26 +170,6 @@ public class SimpleFacets {
|
|||
this.fdebugParent = fdebugParent;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns <code>true</code> if a String contains the given substring. Otherwise
|
||||
* <code>false</code>.
|
||||
*
|
||||
* @param ref
|
||||
* the {@link String} to test
|
||||
* @param substring
|
||||
* the substring to look for
|
||||
* @param ignoreCase
|
||||
* whether the comparison should be case-insensitive
|
||||
* @return Returns <code>true</code> iff the String contains the given substring.
|
||||
* Otherwise <code>false</code>.
|
||||
*/
|
||||
public static boolean contains(String ref, String substring, boolean ignoreCase) {
|
||||
if (ignoreCase)
|
||||
return StringUtils.containsIgnoreCase(ref, substring);
|
||||
return StringUtils.contains(ref, substring);
|
||||
}
|
||||
|
||||
|
||||
protected ParsedParams parseParams(String type, String param) throws SyntaxError, IOException {
|
||||
SolrParams localParams = QueryParsing.getLocalParams(param, req.getParams());
|
||||
DocSet docs = docsOrig;
|
||||
|
@ -362,6 +343,17 @@ public class SimpleFacets {
|
|||
ENUM, FC, FCS, UIF;
|
||||
}
|
||||
|
||||
protected Predicate<BytesRef> newBytesRefFilter(String field, SolrParams params) {
|
||||
final String contains = params.getFieldParam(field, FacetParams.FACET_CONTAINS);
|
||||
final boolean ignoreCase = params.getFieldBool(field, FacetParams.FACET_CONTAINS_IGNORE_CASE, false);
|
||||
|
||||
if (contains == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return new SubstringBytesRefFilter(contains, ignoreCase);
|
||||
}
|
||||
|
||||
/**
|
||||
* Term counts for use in pivot faceting that resepcts the appropriate mincount
|
||||
* @see FacetParams#FACET_PIVOT_MINCOUNT
|
||||
|
@ -405,8 +397,9 @@ public class SimpleFacets {
|
|||
// default to sorting if there is a limit.
|
||||
String sort = params.getFieldParam(field, FacetParams.FACET_SORT, limit>0 ? FacetParams.FACET_SORT_COUNT : FacetParams.FACET_SORT_INDEX);
|
||||
String prefix = params.getFieldParam(field, FacetParams.FACET_PREFIX);
|
||||
String contains = params.getFieldParam(field, FacetParams.FACET_CONTAINS);
|
||||
boolean ignoreCase = params.getFieldBool(field, FacetParams.FACET_CONTAINS_IGNORE_CASE, false);
|
||||
|
||||
final Predicate<BytesRef> termFilter = newBytesRefFilter(field, params);
|
||||
|
||||
boolean exists = params.getFieldBool(field, FacetParams.FACET_EXISTS, false);
|
||||
|
||||
NamedList<Integer> counts;
|
||||
|
@ -448,14 +441,13 @@ public class SimpleFacets {
|
|||
}
|
||||
|
||||
if (params.getFieldBool(field, GroupParams.GROUP_FACET, false)) {
|
||||
counts = getGroupedCounts(searcher, docs, field, multiToken, offset,limit, mincount, missing, sort, prefix, contains, ignoreCase);
|
||||
counts = getGroupedCounts(searcher, docs, field, multiToken, offset,limit, mincount, missing, sort, prefix, termFilter);
|
||||
} else {
|
||||
assert appliedFacetMethod != null;
|
||||
switch (appliedFacetMethod) {
|
||||
case ENUM:
|
||||
assert TrieField.getMainValuePrefix(ft) == null;
|
||||
counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount,missing,sort,prefix, contains, ignoreCase,
|
||||
exists);
|
||||
counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount,missing,sort,prefix, termFilter, exists);
|
||||
break;
|
||||
case FCS:
|
||||
assert !multiToken;
|
||||
|
@ -464,12 +456,15 @@ public class SimpleFacets {
|
|||
if (prefix != null && !prefix.isEmpty()) {
|
||||
throw new SolrException(ErrorCode.BAD_REQUEST, FacetParams.FACET_PREFIX + " is not supported on numeric types");
|
||||
}
|
||||
if (contains != null && !contains.isEmpty()) {
|
||||
throw new SolrException(ErrorCode.BAD_REQUEST, FacetParams.FACET_CONTAINS + " is not supported on numeric types");
|
||||
if (termFilter != null) {
|
||||
final boolean supportedOperation = (termFilter instanceof SubstringBytesRefFilter) && ((SubstringBytesRefFilter) termFilter).substring().isEmpty();
|
||||
if (!supportedOperation) {
|
||||
throw new SolrException(ErrorCode.BAD_REQUEST, FacetParams.FACET_CONTAINS + " is not supported on numeric types");
|
||||
}
|
||||
}
|
||||
counts = NumericFacets.getCounts(searcher, docs, field, offset, limit, mincount, missing, sort);
|
||||
} else {
|
||||
PerSegmentSingleValuedFaceting ps = new PerSegmentSingleValuedFaceting(searcher, docs, field, offset, limit, mincount, missing, sort, prefix, contains, ignoreCase);
|
||||
PerSegmentSingleValuedFaceting ps = new PerSegmentSingleValuedFaceting(searcher, docs, field, offset, limit, mincount, missing, sort, prefix, termFilter);
|
||||
Executor executor = threads == 0 ? directExecutor : facetExecutor;
|
||||
ps.setNumThreads(threads);
|
||||
counts = ps.getFacetCounts(executor);
|
||||
|
@ -532,7 +527,7 @@ public class SimpleFacets {
|
|||
}
|
||||
break;
|
||||
case FC:
|
||||
counts = DocValuesFacets.getCounts(searcher, docs, field, offset,limit, mincount, missing, sort, prefix, contains, ignoreCase, fdebug);
|
||||
counts = DocValuesFacets.getCounts(searcher, docs, field, offset,limit, mincount, missing, sort, prefix, termFilter, fdebug);
|
||||
break;
|
||||
default:
|
||||
throw new AssertionError();
|
||||
|
@ -644,8 +639,7 @@ public class SimpleFacets {
|
|||
boolean missing,
|
||||
String sort,
|
||||
String prefix,
|
||||
String contains,
|
||||
boolean ignoreCase) throws IOException {
|
||||
Predicate<BytesRef> termFilter) throws IOException {
|
||||
GroupingSpecification groupingSpecification = rb.getGroupingSpec();
|
||||
final String groupField = groupingSpecification != null ? groupingSpecification.getFields()[0] : null;
|
||||
if (groupField == null) {
|
||||
|
@ -675,8 +669,8 @@ public class SimpleFacets {
|
|||
List<TermGroupFacetCollector.FacetEntry> scopedEntries
|
||||
= result.getFacetEntries(offset, limit < 0 ? Integer.MAX_VALUE : limit);
|
||||
for (TermGroupFacetCollector.FacetEntry facetEntry : scopedEntries) {
|
||||
//:TODO:can we do contains earlier than this to make it more efficient?
|
||||
if (contains != null && !contains(facetEntry.getValue().utf8ToString(), contains, ignoreCase)) {
|
||||
//:TODO:can we filter earlier than this to make it more efficient?
|
||||
if (termFilter != null && !termFilter.test(facetEntry.getValue())) {
|
||||
continue;
|
||||
}
|
||||
facetFieldType.indexedToReadable(facetEntry.getValue(), charsRef);
|
||||
|
@ -851,6 +845,18 @@ public class SimpleFacets {
|
|||
return docs.andNotSize(hasVal);
|
||||
}
|
||||
|
||||
/**
|
||||
* Works like {@link #getFacetTermEnumCounts(SolrIndexSearcher, DocSet, String, int, int, int, boolean, String, String, Predicate, boolean)}
|
||||
* but takes a substring directly for the contains check rather than a {@link Predicate} instance.
|
||||
*/
|
||||
public NamedList<Integer> getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing,
|
||||
String sort, String prefix, String contains, boolean ignoreCase, boolean intersectsCheck)
|
||||
throws IOException {
|
||||
|
||||
final Predicate<BytesRef> termFilter = new SubstringBytesRefFilter(contains, ignoreCase);
|
||||
return getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount, missing, sort, prefix, termFilter, intersectsCheck);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a list of terms in the specified field along with the
|
||||
* corresponding count of documents in the set that match that constraint.
|
||||
|
@ -861,8 +867,8 @@ public class SimpleFacets {
|
|||
* @see FacetParams#FACET_ZEROS
|
||||
* @see FacetParams#FACET_MISSING
|
||||
*/
|
||||
public NamedList<Integer> getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing,
|
||||
String sort, String prefix, String contains, boolean ignoreCase, boolean intersectsCheck)
|
||||
public NamedList<Integer> getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing,
|
||||
String sort, String prefix, Predicate<BytesRef> termFilter, boolean intersectsCheck)
|
||||
throws IOException {
|
||||
|
||||
/* :TODO: potential optimization...
|
||||
|
@ -934,7 +940,7 @@ public class SimpleFacets {
|
|||
if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes))
|
||||
break;
|
||||
|
||||
if (contains == null || contains(term.utf8ToString(), contains, ignoreCase)) {
|
||||
if (termFilter == null || termFilter.test(term)) {
|
||||
int df = termsEnum.docFreq();
|
||||
|
||||
// If we are sorting, we can use df>min (rather than >=) since we
|
||||
|
@ -1148,4 +1154,4 @@ public class SimpleFacets {
|
|||
public ResponseBuilder getResponseBuilder() {
|
||||
return rb;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,52 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.request;
|
||||
|
||||
import java.util.function.Predicate;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
/**
|
||||
* An implementation of {@link Predicate} which returns true if the BytesRef contains a given substring.
|
||||
*/
|
||||
public class SubstringBytesRefFilter implements Predicate<BytesRef> {
|
||||
final private String contains;
|
||||
final private boolean ignoreCase;
|
||||
|
||||
public SubstringBytesRefFilter(String contains, boolean ignoreCase) {
|
||||
this.contains = contains;
|
||||
this.ignoreCase = ignoreCase;
|
||||
}
|
||||
|
||||
public String substring() {
|
||||
return contains;
|
||||
}
|
||||
|
||||
protected boolean includeString(String term) {
|
||||
if (ignoreCase) {
|
||||
return StringUtils.containsIgnoreCase(term, contains);
|
||||
}
|
||||
|
||||
return StringUtils.contains(term, contains);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean test(BytesRef term) {
|
||||
return includeString(term.utf8ToString());
|
||||
}
|
||||
}
|
|
@ -2684,26 +2684,6 @@ public class SimpleFacetsTest extends SolrTestCaseJ4 {
|
|||
"facet.range.gap", "0.0003"),
|
||||
400);
|
||||
}
|
||||
|
||||
public void testContainsAtStart() {
|
||||
assertTrue(SimpleFacets.contains("foobar", "foo", false));
|
||||
}
|
||||
|
||||
public void testContains() {
|
||||
assertTrue(SimpleFacets.contains("foobar", "ooba", false));
|
||||
}
|
||||
|
||||
public void testContainsAtEnd() {
|
||||
assertTrue(SimpleFacets.contains("foobar", "bar", false));
|
||||
}
|
||||
|
||||
public void testContainsWhole() {
|
||||
assertTrue(SimpleFacets.contains("foobar", "foobar", false));
|
||||
}
|
||||
|
||||
public void testContainsIgnoreCase() {
|
||||
assertTrue(SimpleFacets.contains("FooBar", "bar", true));
|
||||
}
|
||||
|
||||
public void testRangeQueryHardEndParamFilter() {
|
||||
doTestRangeQueryHardEndParam("range_facet_l", FacetRangeMethod.FILTER);
|
||||
|
|
|
@ -0,0 +1,51 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.request;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.junit.Test;
|
||||
|
||||
public class SubstringBytesRefFilterTest extends LuceneTestCase {
|
||||
|
||||
@Test
|
||||
public void testSubstringBytesRefFilter() {
|
||||
final List<String> substrings = new ArrayList<>(4);
|
||||
substrings.add("foo");
|
||||
substrings.add("ooba");
|
||||
substrings.add("bar");
|
||||
substrings.add("foobar");
|
||||
|
||||
final String contains = substrings.get(random().nextInt(substrings.size()));
|
||||
final boolean ignoreCase = random().nextBoolean();
|
||||
final SubstringBytesRefFilter filter = new SubstringBytesRefFilter(contains, ignoreCase);
|
||||
|
||||
assertTrue(filter.test(new BytesRef("foobar")));
|
||||
|
||||
if (ignoreCase) {
|
||||
assertTrue(filter.test(new BytesRef("FooBar")));
|
||||
} else {
|
||||
assertFalse(filter.test(new BytesRef("FooBar")));
|
||||
}
|
||||
|
||||
assertFalse(filter.test(new BytesRef("qux")));
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue