SOLR-9914: SimpleFacets: refactor "contains" check into "SubstringBytesRefFilter" class. (Jonny Marks, David Smiley, Christine Poerschke)

2017-02-09 11:19:21 +00:00 · 2017-02-09 11:19:21 +00:00 · 55afc2031c
parent 6696eafaae
commit 55afc2031c
7 changed files with 170 additions and 70 deletions
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -177,6 +177,9 @@ Other Changes
 * SOLR-9800: Factor out FacetComponent.newSimpleFacets method. (Jonny Marks via Christine Poerschke)
 * SOLR-9914: SimpleFacets: refactor "contains" check into "SubstringBytesRefFilter" class.
  (Jonny Marks, David Smiley, Christine Poerschke)
 ==================  6.4.1 ==================
 Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.
--- a/solr/core/src/java/org/apache/solr/request/DocValuesFacets.java
+++ b/solr/core/src/java/org/apache/solr/request/DocValuesFacets.java
@ -18,6 +18,7 @@ package org.apache.solr.request;
 import java.io.IOException;
 import java.util.List;
 import java.util.function.Predicate;
 import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.LeafReaderContext;
@ -59,6 +60,11 @@ public class DocValuesFacets {
  private DocValuesFacets() {}
  public static NamedList<Integer> getCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, String contains, boolean ignoreCase, FacetDebugInfo fdebug) throws IOException {
    final Predicate<BytesRef> termFilter = new SubstringBytesRefFilter(contains, ignoreCase);
    return getCounts(searcher, docs, fieldName, offset, limit, mincount, missing, sort, prefix, termFilter, fdebug);
  }
  public static NamedList<Integer> getCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, Predicate<BytesRef> termFilter, FacetDebugInfo fdebug) throws IOException {
    SchemaField schemaField = searcher.getSchema().getField(fieldName);
    FieldType ft = schemaField.getType();
    NamedList<Integer> res = new NamedList<>();
@ -178,9 +184,9 @@ public class DocValuesFacets {
            // index order, so we already know that the keys are ordered.  This can be very
            // important if a lot of the counts are repeated (like zero counts would be).
-            if (contains != null) {
+            if (termFilter != null) {
              final BytesRef term = si.lookupOrd(startTermIndex+i);
-              if (!SimpleFacets.contains(term.utf8ToString(), contains, ignoreCase)) {
+              if (!termFilter.test(term)) {
                continue;
              }
            }
@ -213,8 +219,8 @@ public class DocValuesFacets {
      } else {
        // add results in index order
        int i=(startTermIndex==-1)?1:0;
-        if (mincount<=0 && contains == null) {
+        if (mincount<=0 && termFilter == null) {
-          // if mincount<=0 and we're not examining the values for contains, then
+          // if mincount<=0 and we're not examining the values for the term filter, then
          // we won't discard any terms and we know exactly where to start.
          i+=off;
          off=0;
@ -224,9 +230,9 @@ public class DocValuesFacets {
          int c = counts[i];
          if (c<mincount) continue;
          BytesRef term = null;
-          if (contains != null) {
+          if (termFilter != null) {
            term = si.lookupOrd(startTermIndex+i);
-            if (!SimpleFacets.contains(term.utf8ToString(), contains, ignoreCase)) {
+            if (!termFilter.test(term)) {
              continue;
            }
          }
--- a/solr/core/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java
+++ b/solr/core/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java
@ -25,6 +25,7 @@ import java.util.concurrent.ExecutionException;
 import java.util.concurrent.Executor;
 import java.util.concurrent.ExecutorCompletionService;
 import java.util.concurrent.Future;
 import java.util.function.Predicate;
 import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.LeafReaderContext;
@ -62,14 +63,17 @@ class PerSegmentSingleValuedFaceting {
  String sort;
  String prefix;
-  private String contains;
+  private final Predicate<BytesRef> termFilter;
  private boolean ignoreCase;
  Filter baseSet;
  int nThreads;
  public PerSegmentSingleValuedFaceting(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, String contains, boolean ignoreCase) {
    this(searcher, docs, fieldName, offset, limit, mincount, missing, sort, prefix, new SubstringBytesRefFilter(contains, ignoreCase));
  }
  public PerSegmentSingleValuedFaceting(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, Predicate<BytesRef> filter) {
    this.searcher = searcher;
    this.docs = docs;
    this.fieldName = fieldName;
@ -79,8 +83,7 @@ class PerSegmentSingleValuedFaceting {
    this.missing = missing;
    this.sort = sort;
    this.prefix = prefix;
-    this.contains = contains;
+    this.termFilter = filter;
    this.ignoreCase = ignoreCase;
  }
  public void setNumThreads(int threads) {
@ -183,8 +186,7 @@ class PerSegmentSingleValuedFaceting {
    while (queue.size() > 0) {
      SegFacet seg = queue.top();
-      // if facet.contains specified, only actually collect the count if substring contained
+      boolean collect = termFilter == null || termFilter.test(seg.tempBR);
      boolean collect = contains == null || SimpleFacets.contains(seg.tempBR.utf8ToString(), contains, ignoreCase);
      // we will normally end up advancing the term enum for this segment
      // while still using "val", so we need to make a copy since the BytesRef
--- a/solr/core/src/java/org/apache/solr/request/SimpleFacets.java
+++ b/solr/core/src/java/org/apache/solr/request/SimpleFacets.java
@ -33,6 +33,7 @@ import java.util.concurrent.RunnableFuture;
 import java.util.concurrent.Semaphore;
 import java.util.concurrent.SynchronousQueue;
 import java.util.concurrent.TimeUnit;
 import java.util.function.Predicate;
 import org.apache.commons.lang.StringUtils;
 import org.apache.lucene.index.Fields;
@ -169,26 +170,6 @@ public class SimpleFacets {
    this.fdebugParent = fdebugParent;
  }
  /**
   * Returns <code>true</code> if a String contains the given substring. Otherwise
   * <code>false</code>.
   *
   * @param ref
   *          the {@link String} to test
   * @param substring
   *          the substring to look for
   * @param ignoreCase
   *          whether the comparison should be case-insensitive
   * @return Returns <code>true</code> iff the String contains the given substring.
   *         Otherwise <code>false</code>.
   */
  public static boolean contains(String ref, String substring, boolean ignoreCase) {
    if (ignoreCase)
      return StringUtils.containsIgnoreCase(ref, substring);
    return StringUtils.contains(ref, substring);
  }
  protected ParsedParams parseParams(String type, String param) throws SyntaxError, IOException {
    SolrParams localParams = QueryParsing.getLocalParams(param, req.getParams());
    DocSet docs = docsOrig;
@ -362,6 +343,17 @@ public class SimpleFacets {
    ENUM, FC, FCS, UIF;
  }
  protected Predicate<BytesRef> newBytesRefFilter(String field, SolrParams params) {
    final String contains = params.getFieldParam(field, FacetParams.FACET_CONTAINS);
    final boolean ignoreCase = params.getFieldBool(field, FacetParams.FACET_CONTAINS_IGNORE_CASE, false);
    if (contains == null) {
      return null;
    }
    return new SubstringBytesRefFilter(contains, ignoreCase);
  }
  /**
   * Term counts for use in pivot faceting that resepcts the appropriate mincount
   * @see FacetParams#FACET_PIVOT_MINCOUNT
@ -405,8 +397,9 @@ public class SimpleFacets {
    // default to sorting if there is a limit.
    String sort = params.getFieldParam(field, FacetParams.FACET_SORT, limit>0 ? FacetParams.FACET_SORT_COUNT : FacetParams.FACET_SORT_INDEX);
    String prefix = params.getFieldParam(field, FacetParams.FACET_PREFIX);
-    String contains = params.getFieldParam(field, FacetParams.FACET_CONTAINS);
+
-    boolean ignoreCase = params.getFieldBool(field, FacetParams.FACET_CONTAINS_IGNORE_CASE, false);
+    final Predicate<BytesRef> termFilter = newBytesRefFilter(field, params);
    boolean exists = params.getFieldBool(field, FacetParams.FACET_EXISTS, false);
    NamedList<Integer> counts;
@ -448,14 +441,13 @@ public class SimpleFacets {
    }
    if (params.getFieldBool(field, GroupParams.GROUP_FACET, false)) {
-      counts = getGroupedCounts(searcher, docs, field, multiToken, offset,limit, mincount, missing, sort, prefix, contains, ignoreCase);
+      counts = getGroupedCounts(searcher, docs, field, multiToken, offset,limit, mincount, missing, sort, prefix, termFilter);
    } else {
      assert appliedFacetMethod != null;
      switch (appliedFacetMethod) {
        case ENUM:
          assert TrieField.getMainValuePrefix(ft) == null;
-          counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount,missing,sort,prefix, contains, ignoreCase, 
+          counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount,missing,sort,prefix, termFilter, exists);
                                          exists);
          break;
        case FCS:
          assert !multiToken;
@ -464,12 +456,15 @@ public class SimpleFacets {
            if (prefix != null && !prefix.isEmpty()) {
              throw new SolrException(ErrorCode.BAD_REQUEST, FacetParams.FACET_PREFIX + " is not supported on numeric types");
            }
-            if (contains != null && !contains.isEmpty()) {
+            if (termFilter != null) {
              final boolean supportedOperation = (termFilter instanceof SubstringBytesRefFilter) && ((SubstringBytesRefFilter) termFilter).substring().isEmpty();
              if (!supportedOperation) {
                throw new SolrException(ErrorCode.BAD_REQUEST, FacetParams.FACET_CONTAINS + " is not supported on numeric types");
              }
            }
            counts = NumericFacets.getCounts(searcher, docs, field, offset, limit, mincount, missing, sort);
          } else {
-            PerSegmentSingleValuedFaceting ps = new PerSegmentSingleValuedFaceting(searcher, docs, field, offset, limit, mincount, missing, sort, prefix, contains, ignoreCase);
+            PerSegmentSingleValuedFaceting ps = new PerSegmentSingleValuedFaceting(searcher, docs, field, offset, limit, mincount, missing, sort, prefix, termFilter);
            Executor executor = threads == 0 ? directExecutor : facetExecutor;
            ps.setNumThreads(threads);
            counts = ps.getFacetCounts(executor);
@ -532,7 +527,7 @@ public class SimpleFacets {
            }
          break;
        case FC:
-          counts = DocValuesFacets.getCounts(searcher, docs, field, offset,limit, mincount, missing, sort, prefix, contains, ignoreCase, fdebug);
+          counts = DocValuesFacets.getCounts(searcher, docs, field, offset,limit, mincount, missing, sort, prefix, termFilter, fdebug);
          break;
        default:
          throw new AssertionError();
@ -644,8 +639,7 @@ public class SimpleFacets {
                                             boolean missing,
                                             String sort,
                                             String prefix,
-                                             String contains,
+                                             Predicate<BytesRef> termFilter) throws IOException {
                                             boolean ignoreCase) throws IOException {
    GroupingSpecification groupingSpecification = rb.getGroupingSpec();
    final String groupField  = groupingSpecification != null ? groupingSpecification.getFields()[0] : null;
    if (groupField == null) {
@ -675,8 +669,8 @@ public class SimpleFacets {
    List<TermGroupFacetCollector.FacetEntry> scopedEntries 
      = result.getFacetEntries(offset, limit < 0 ? Integer.MAX_VALUE : limit);
    for (TermGroupFacetCollector.FacetEntry facetEntry : scopedEntries) {
-      //:TODO:can we do contains earlier than this to make it more efficient?
+      //:TODO:can we filter earlier than this to make it more efficient?
-      if (contains != null && !contains(facetEntry.getValue().utf8ToString(), contains, ignoreCase)) {
+      if (termFilter != null && !termFilter.test(facetEntry.getValue())) {
        continue;
      }
      facetFieldType.indexedToReadable(facetEntry.getValue(), charsRef);
@ -851,6 +845,18 @@ public class SimpleFacets {
    return docs.andNotSize(hasVal);
  }
  /**
   *  Works like {@link #getFacetTermEnumCounts(SolrIndexSearcher, DocSet, String, int, int, int, boolean, String, String, Predicate, boolean)}
   *  but takes a substring directly for the contains check rather than a {@link Predicate} instance.
   */
  public NamedList<Integer> getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing,
                                                   String sort, String prefix, String contains, boolean ignoreCase, boolean intersectsCheck)
    throws IOException {
    final Predicate<BytesRef> termFilter = new SubstringBytesRefFilter(contains, ignoreCase);
    return getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount, missing, sort, prefix, termFilter, intersectsCheck);
  }
  /**
   * Returns a list of terms in the specified field along with the 
   * corresponding count of documents in the set that match that constraint.
@ -862,7 +868,7 @@ public class SimpleFacets {
   * @see FacetParams#FACET_MISSING
   */
  public NamedList<Integer> getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing,
-                                      String sort, String prefix, String contains, boolean ignoreCase, boolean intersectsCheck)
+                                                   String sort, String prefix, Predicate<BytesRef> termFilter, boolean intersectsCheck)
    throws IOException {
    /* :TODO: potential optimization...
@ -934,7 +940,7 @@ public class SimpleFacets {
        if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes))
          break;
-        if (contains == null || contains(term.utf8ToString(), contains, ignoreCase)) {
+        if (termFilter == null || termFilter.test(term)) {
          int df = termsEnum.docFreq();
          // If we are sorting, we can use df>min (rather than >=) since we
--- a/solr/core/src/java/org/apache/solr/request/SubstringBytesRefFilter.java
+++ b/solr/core/src/java/org/apache/solr/request/SubstringBytesRefFilter.java
@ -0,0 +1,52 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.solr.request;
 import java.util.function.Predicate;
 import org.apache.lucene.util.BytesRef;
 import org.apache.commons.lang.StringUtils;
 /**
 * An implementation of {@link Predicate} which returns true if the BytesRef contains a given substring.
 */
 public class SubstringBytesRefFilter implements Predicate<BytesRef> {
  final private String contains;
  final private boolean ignoreCase;
  public SubstringBytesRefFilter(String contains, boolean ignoreCase) {
    this.contains = contains;
    this.ignoreCase = ignoreCase;
  }
  public String substring() {
    return contains;
  }
  protected boolean includeString(String term) {
    if (ignoreCase) {
      return StringUtils.containsIgnoreCase(term, contains);
    }
    return StringUtils.contains(term, contains);
  }
  @Override
  public boolean test(BytesRef term) {
    return includeString(term.utf8ToString());
  }
 }
--- a/solr/core/src/test/org/apache/solr/request/SimpleFacetsTest.java
+++ b/solr/core/src/test/org/apache/solr/request/SimpleFacetsTest.java
@ -2685,26 +2685,6 @@ public class SimpleFacetsTest extends SolrTestCaseJ4 {
              400);
  }
  public void testContainsAtStart() {
    assertTrue(SimpleFacets.contains("foobar", "foo", false));
  }
  public void testContains() {
    assertTrue(SimpleFacets.contains("foobar", "ooba", false));
  }
  public void testContainsAtEnd() {
    assertTrue(SimpleFacets.contains("foobar", "bar", false));
  }
  public void testContainsWhole() {
    assertTrue(SimpleFacets.contains("foobar", "foobar", false));
  }
  public void testContainsIgnoreCase() {
    assertTrue(SimpleFacets.contains("FooBar", "bar", true));
  }
  public void testRangeQueryHardEndParamFilter() {
    doTestRangeQueryHardEndParam("range_facet_l", FacetRangeMethod.FILTER);
  }
--- a/solr/core/src/test/org/apache/solr/request/SubstringBytesRefFilterTest.java
+++ b/solr/core/src/test/org/apache/solr/request/SubstringBytesRefFilterTest.java
@ -0,0 +1,51 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.solr.request;
 import java.util.ArrayList;
 import java.util.List;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.BytesRef;
 import org.junit.Test;
 public class SubstringBytesRefFilterTest extends LuceneTestCase {
  @Test
  public void testSubstringBytesRefFilter() {
    final List<String> substrings = new ArrayList<>(4);
    substrings.add("foo");
    substrings.add("ooba");
    substrings.add("bar");
    substrings.add("foobar");
    final String contains = substrings.get(random().nextInt(substrings.size()));
    final boolean ignoreCase = random().nextBoolean();
    final SubstringBytesRefFilter filter = new SubstringBytesRefFilter(contains, ignoreCase);
    assertTrue(filter.test(new BytesRef("foobar")));
    if (ignoreCase) {
      assertTrue(filter.test(new BytesRef("FooBar")));
    } else {
      assertFalse(filter.test(new BytesRef("FooBar")));
    }
    assertFalse(filter.test(new BytesRef("qux")));
  }
 }