From 55afc2031c7aeb75f23e66f9817131cd3798a5b2 Mon Sep 17 00:00:00 2001
From: Christine Poerschke <cpoerschke@apache.org>
Date: Thu, 9 Feb 2017 11:19:21 +0000
Subject: [PATCH] SOLR-9914: SimpleFacets: refactor "contains" check into
 "SubstringBytesRefFilter" class. (Jonny Marks, David Smiley, Christine
 Poerschke)

---
 solr/CHANGES.txt                              |  3 +
 .../apache/solr/request/DocValuesFacets.java  | 20 +++--
 .../PerSegmentSingleValuedFaceting.java       | 14 ++--
 .../org/apache/solr/request/SimpleFacets.java | 80 ++++++++++---------
 .../solr/request/SubstringBytesRefFilter.java | 52 ++++++++++++
 .../apache/solr/request/SimpleFacetsTest.java | 20 -----
 .../request/SubstringBytesRefFilterTest.java  | 51 ++++++++++++
 7 files changed, 170 insertions(+), 70 deletions(-)
 create mode 100644 solr/core/src/java/org/apache/solr/request/SubstringBytesRefFilter.java
 create mode 100644 solr/core/src/test/org/apache/solr/request/SubstringBytesRefFilterTest.java
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index d6d97243bcc..4d8aa0163e4 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -177,6 +177,9 @@ Other Changes
 
 * SOLR-9800: Factor out FacetComponent.newSimpleFacets method. (Jonny Marks via Christine Poerschke)
 
+* SOLR-9914: SimpleFacets: refactor "contains" check into "SubstringBytesRefFilter" class.
+  (Jonny Marks, David Smiley, Christine Poerschke)
+
 ==================  6.4.1 ==================
 
 Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.
diff --git a/solr/core/src/java/org/apache/solr/request/DocValuesFacets.java b/solr/core/src/java/org/apache/solr/request/DocValuesFacets.java
index 3714bf1b9f6..e9498f8a7d9 100644
--- a/solr/core/src/java/org/apache/solr/request/DocValuesFacets.java
+++ b/solr/core/src/java/org/apache/solr/request/DocValuesFacets.java
@@ -18,6 +18,7 @@ package org.apache.solr.request;
 
 import java.io.IOException;
 import java.util.List;
+import java.util.function.Predicate;
 
 import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.LeafReaderContext;
@@ -57,8 +58,13 @@ import org.apache.solr.util.LongPriorityQueue;
  */
 public class DocValuesFacets {
   private DocValuesFacets() {}
-  
+
   public static NamedList<Integer> getCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, String contains, boolean ignoreCase, FacetDebugInfo fdebug) throws IOException {
+    final Predicate<BytesRef> termFilter = new SubstringBytesRefFilter(contains, ignoreCase);
+    return getCounts(searcher, docs, fieldName, offset, limit, mincount, missing, sort, prefix, termFilter, fdebug);
+  }
+  
+  public static NamedList<Integer> getCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, Predicate<BytesRef> termFilter, FacetDebugInfo fdebug) throws IOException {
     SchemaField schemaField = searcher.getSchema().getField(fieldName);
     FieldType ft = schemaField.getType();
     NamedList<Integer> res = new NamedList<>();
@@ -178,9 +184,9 @@ public class DocValuesFacets {
             // index order, so we already know that the keys are ordered.  This can be very
             // important if a lot of the counts are repeated (like zero counts would be).
 
-            if (contains != null) {
+            if (termFilter != null) {
               final BytesRef term = si.lookupOrd(startTermIndex+i);
-              if (!SimpleFacets.contains(term.utf8ToString(), contains, ignoreCase)) {
+              if (!termFilter.test(term)) {
                 continue;
               }
             }
@@ -213,8 +219,8 @@ public class DocValuesFacets {
       } else {
         // add results in index order
         int i=(startTermIndex==-1)?1:0;
-        if (mincount<=0 && contains == null) {
-          // if mincount<=0 and we're not examining the values for contains, then
+        if (mincount<=0 && termFilter == null) {
+          // if mincount<=0 and we're not examining the values for the term filter, then
           // we won't discard any terms and we know exactly where to start.
           i+=off;
           off=0;
@@ -224,9 +230,9 @@ public class DocValuesFacets {
           int c = counts[i];
           if (c<mincount) continue;
           BytesRef term = null;
-          if (contains != null) {
+          if (termFilter != null) {
             term = si.lookupOrd(startTermIndex+i);
-            if (!SimpleFacets.contains(term.utf8ToString(), contains, ignoreCase)) {
+            if (!termFilter.test(term)) {
               continue;
             }
           }
diff --git a/solr/core/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java b/solr/core/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java
index c729b3bd47c..48837e0c6ab 100644
--- a/solr/core/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java
+++ b/solr/core/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java
@@ -25,6 +25,7 @@ import java.util.concurrent.ExecutionException;
 import java.util.concurrent.Executor;
 import java.util.concurrent.ExecutorCompletionService;
 import java.util.concurrent.Future;
+import java.util.function.Predicate;
 
 import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.LeafReaderContext;
@@ -62,14 +63,17 @@ class PerSegmentSingleValuedFaceting {
   String sort;
   String prefix;
 
-  private String contains;
-  private boolean ignoreCase;
+  private final Predicate<BytesRef> termFilter;
 
   Filter baseSet;
 
   int nThreads;
 
   public PerSegmentSingleValuedFaceting(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, String contains, boolean ignoreCase) {
+    this(searcher, docs, fieldName, offset, limit, mincount, missing, sort, prefix, new SubstringBytesRefFilter(contains, ignoreCase));
+  }
+
+  public PerSegmentSingleValuedFaceting(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, Predicate<BytesRef> filter) {
     this.searcher = searcher;
     this.docs = docs;
     this.fieldName = fieldName;
@@ -79,8 +83,7 @@ class PerSegmentSingleValuedFaceting {
     this.missing = missing;
     this.sort = sort;
     this.prefix = prefix;
-    this.contains = contains;
-    this.ignoreCase = ignoreCase;
+    this.termFilter = filter;
   }
 
   public void setNumThreads(int threads) {
@@ -183,8 +186,7 @@ class PerSegmentSingleValuedFaceting {
     while (queue.size() > 0) {
       SegFacet seg = queue.top();
       
-      // if facet.contains specified, only actually collect the count if substring contained
-      boolean collect = contains == null || SimpleFacets.contains(seg.tempBR.utf8ToString(), contains, ignoreCase);
+      boolean collect = termFilter == null || termFilter.test(seg.tempBR);
       
       // we will normally end up advancing the term enum for this segment
       // while still using "val", so we need to make a copy since the BytesRef
diff --git a/solr/core/src/java/org/apache/solr/request/SimpleFacets.java b/solr/core/src/java/org/apache/solr/request/SimpleFacets.java
index bc2877c6f05..ac8e9109516 100644
--- a/solr/core/src/java/org/apache/solr/request/SimpleFacets.java
+++ b/solr/core/src/java/org/apache/solr/request/SimpleFacets.java
@@ -33,6 +33,7 @@ import java.util.concurrent.RunnableFuture;
 import java.util.concurrent.Semaphore;
 import java.util.concurrent.SynchronousQueue;
 import java.util.concurrent.TimeUnit;
+import java.util.function.Predicate;
 
 import org.apache.commons.lang.StringUtils;
 import org.apache.lucene.index.Fields;
@@ -169,26 +170,6 @@ public class SimpleFacets {
     this.fdebugParent = fdebugParent;
   }
 
-  /**
-   * Returns <code>true</code> if a String contains the given substring. Otherwise
-   * <code>false</code>.
-   *
-   * @param ref
-   *          the {@link String} to test
-   * @param substring
-   *          the substring to look for
-   * @param ignoreCase
-   *          whether the comparison should be case-insensitive
-   * @return Returns <code>true</code> iff the String contains the given substring.
-   *         Otherwise <code>false</code>.
-   */
-  public static boolean contains(String ref, String substring, boolean ignoreCase) {
-    if (ignoreCase)
-      return StringUtils.containsIgnoreCase(ref, substring);
-    return StringUtils.contains(ref, substring);
-  }
-
-
   protected ParsedParams parseParams(String type, String param) throws SyntaxError, IOException {
     SolrParams localParams = QueryParsing.getLocalParams(param, req.getParams());
     DocSet docs = docsOrig;
@@ -362,6 +343,17 @@ public class SimpleFacets {
     ENUM, FC, FCS, UIF;
   }
 
+  protected Predicate<BytesRef> newBytesRefFilter(String field, SolrParams params) {
+    final String contains = params.getFieldParam(field, FacetParams.FACET_CONTAINS);
+    final boolean ignoreCase = params.getFieldBool(field, FacetParams.FACET_CONTAINS_IGNORE_CASE, false);
+
+    if (contains == null) {
+      return null;
+    }
+
+    return new SubstringBytesRefFilter(contains, ignoreCase);
+  }
+
   /**
    * Term counts for use in pivot faceting that resepcts the appropriate mincount
    * @see FacetParams#FACET_PIVOT_MINCOUNT
@@ -405,8 +397,9 @@ public class SimpleFacets {
     // default to sorting if there is a limit.
     String sort = params.getFieldParam(field, FacetParams.FACET_SORT, limit>0 ? FacetParams.FACET_SORT_COUNT : FacetParams.FACET_SORT_INDEX);
     String prefix = params.getFieldParam(field, FacetParams.FACET_PREFIX);
-    String contains = params.getFieldParam(field, FacetParams.FACET_CONTAINS);
-    boolean ignoreCase = params.getFieldBool(field, FacetParams.FACET_CONTAINS_IGNORE_CASE, false);
+
+    final Predicate<BytesRef> termFilter = newBytesRefFilter(field, params);
+
     boolean exists = params.getFieldBool(field, FacetParams.FACET_EXISTS, false);
     
     NamedList<Integer> counts;
@@ -448,14 +441,13 @@ public class SimpleFacets {
     }
 
     if (params.getFieldBool(field, GroupParams.GROUP_FACET, false)) {
-      counts = getGroupedCounts(searcher, docs, field, multiToken, offset,limit, mincount, missing, sort, prefix, contains, ignoreCase);
+      counts = getGroupedCounts(searcher, docs, field, multiToken, offset,limit, mincount, missing, sort, prefix, termFilter);
     } else {
       assert appliedFacetMethod != null;
       switch (appliedFacetMethod) {
         case ENUM:
           assert TrieField.getMainValuePrefix(ft) == null;
-          counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount,missing,sort,prefix, contains, ignoreCase, 
-                                          exists);
+          counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount,missing,sort,prefix, termFilter, exists);
           break;
         case FCS:
           assert !multiToken;
@@ -464,12 +456,15 @@ public class SimpleFacets {
             if (prefix != null && !prefix.isEmpty()) {
               throw new SolrException(ErrorCode.BAD_REQUEST, FacetParams.FACET_PREFIX + " is not supported on numeric types");
             }
-            if (contains != null && !contains.isEmpty()) {
-              throw new SolrException(ErrorCode.BAD_REQUEST, FacetParams.FACET_CONTAINS + " is not supported on numeric types");
+            if (termFilter != null) {
+              final boolean supportedOperation = (termFilter instanceof SubstringBytesRefFilter) && ((SubstringBytesRefFilter) termFilter).substring().isEmpty();
+              if (!supportedOperation) {
+                throw new SolrException(ErrorCode.BAD_REQUEST, FacetParams.FACET_CONTAINS + " is not supported on numeric types");
+              }
             }
             counts = NumericFacets.getCounts(searcher, docs, field, offset, limit, mincount, missing, sort);
           } else {
-            PerSegmentSingleValuedFaceting ps = new PerSegmentSingleValuedFaceting(searcher, docs, field, offset, limit, mincount, missing, sort, prefix, contains, ignoreCase);
+            PerSegmentSingleValuedFaceting ps = new PerSegmentSingleValuedFaceting(searcher, docs, field, offset, limit, mincount, missing, sort, prefix, termFilter);
             Executor executor = threads == 0 ? directExecutor : facetExecutor;
             ps.setNumThreads(threads);
             counts = ps.getFacetCounts(executor);
@@ -532,7 +527,7 @@ public class SimpleFacets {
             }
           break;
         case FC:
-          counts = DocValuesFacets.getCounts(searcher, docs, field, offset,limit, mincount, missing, sort, prefix, contains, ignoreCase, fdebug);
+          counts = DocValuesFacets.getCounts(searcher, docs, field, offset,limit, mincount, missing, sort, prefix, termFilter, fdebug);
           break;
         default:
           throw new AssertionError();
@@ -644,8 +639,7 @@ public class SimpleFacets {
                                              boolean missing,
                                              String sort,
                                              String prefix,
-                                             String contains,
-                                             boolean ignoreCase) throws IOException {
+                                             Predicate<BytesRef> termFilter) throws IOException {
     GroupingSpecification groupingSpecification = rb.getGroupingSpec();
     final String groupField  = groupingSpecification != null ? groupingSpecification.getFields()[0] : null;
     if (groupField == null) {
@@ -675,8 +669,8 @@ public class SimpleFacets {
     List<TermGroupFacetCollector.FacetEntry> scopedEntries 
       = result.getFacetEntries(offset, limit < 0 ? Integer.MAX_VALUE : limit);
     for (TermGroupFacetCollector.FacetEntry facetEntry : scopedEntries) {
-      //:TODO:can we do contains earlier than this to make it more efficient?
-      if (contains != null && !contains(facetEntry.getValue().utf8ToString(), contains, ignoreCase)) {
+      //:TODO:can we filter earlier than this to make it more efficient?
+      if (termFilter != null && !termFilter.test(facetEntry.getValue())) {
         continue;
       }
       facetFieldType.indexedToReadable(facetEntry.getValue(), charsRef);
@@ -851,6 +845,18 @@ public class SimpleFacets {
     return docs.andNotSize(hasVal);
   }
 
+  /**
+   *  Works like {@link #getFacetTermEnumCounts(SolrIndexSearcher, DocSet, String, int, int, int, boolean, String, String, Predicate, boolean)}
+   *  but takes a substring directly for the contains check rather than a {@link Predicate} instance.
+   */
+  public NamedList<Integer> getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing,
+                                                   String sort, String prefix, String contains, boolean ignoreCase, boolean intersectsCheck)
+    throws IOException {
+
+    final Predicate<BytesRef> termFilter = new SubstringBytesRefFilter(contains, ignoreCase);
+    return getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount, missing, sort, prefix, termFilter, intersectsCheck);
+  }
+
   /**
    * Returns a list of terms in the specified field along with the 
    * corresponding count of documents in the set that match that constraint.
@@ -861,8 +867,8 @@ public class SimpleFacets {
    * @see FacetParams#FACET_ZEROS
    * @see FacetParams#FACET_MISSING
    */
-  public NamedList<Integer> getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing, 
-                                      String sort, String prefix, String contains, boolean ignoreCase, boolean intersectsCheck)
+  public NamedList<Integer> getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing,
+                                                   String sort, String prefix, Predicate<BytesRef> termFilter, boolean intersectsCheck)
     throws IOException {
     
     /* :TODO: potential optimization...
@@ -934,7 +940,7 @@ public class SimpleFacets {
         if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes))
           break;
 
-        if (contains == null || contains(term.utf8ToString(), contains, ignoreCase)) {
+        if (termFilter == null || termFilter.test(term)) {
           int df = termsEnum.docFreq();
 
           // If we are sorting, we can use df>min (rather than >=) since we
@@ -1148,4 +1154,4 @@ public class SimpleFacets {
   public ResponseBuilder getResponseBuilder() {
     return rb;
   }
-}
\ No newline at end of file
+}
diff --git a/solr/core/src/java/org/apache/solr/request/SubstringBytesRefFilter.java b/solr/core/src/java/org/apache/solr/request/SubstringBytesRefFilter.java
new file mode 100644
index 00000000000..623cb550859
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/request/SubstringBytesRefFilter.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.request;
+
+import java.util.function.Predicate;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.commons.lang.StringUtils;
+
+/**
+ * An implementation of {@link Predicate} which returns true if the BytesRef contains a given substring.
+ */
+public class SubstringBytesRefFilter implements Predicate<BytesRef> {
+  final private String contains;
+  final private boolean ignoreCase;
+  
+  public SubstringBytesRefFilter(String contains, boolean ignoreCase) {
+    this.contains = contains;
+    this.ignoreCase = ignoreCase;
+  }
+
+  public String substring() {
+    return contains;
+  }
+
+  protected boolean includeString(String term) {
+    if (ignoreCase) {
+      return StringUtils.containsIgnoreCase(term, contains);
+    }
+
+    return StringUtils.contains(term, contains);
+  }
+
+  @Override
+  public boolean test(BytesRef term) {
+    return includeString(term.utf8ToString());
+  }
+}
diff --git a/solr/core/src/test/org/apache/solr/request/SimpleFacetsTest.java b/solr/core/src/test/org/apache/solr/request/SimpleFacetsTest.java
index a21135e113c..bec3aa0dfdc 100644
--- a/solr/core/src/test/org/apache/solr/request/SimpleFacetsTest.java
+++ b/solr/core/src/test/org/apache/solr/request/SimpleFacetsTest.java
@@ -2684,26 +2684,6 @@ public class SimpleFacetsTest extends SolrTestCaseJ4 {
                   "facet.range.gap", "0.0003"),
               400);
   }
-
-  public void testContainsAtStart() {
-    assertTrue(SimpleFacets.contains("foobar", "foo", false));
-  }
-
-  public void testContains() {
-    assertTrue(SimpleFacets.contains("foobar", "ooba", false));
-  }
-
-  public void testContainsAtEnd() {
-    assertTrue(SimpleFacets.contains("foobar", "bar", false));
-  }
-
-  public void testContainsWhole() {
-    assertTrue(SimpleFacets.contains("foobar", "foobar", false));
-  }
-
-  public void testContainsIgnoreCase() {
-    assertTrue(SimpleFacets.contains("FooBar", "bar", true));
-  }
   
   public void testRangeQueryHardEndParamFilter() {
     doTestRangeQueryHardEndParam("range_facet_l", FacetRangeMethod.FILTER);
diff --git a/solr/core/src/test/org/apache/solr/request/SubstringBytesRefFilterTest.java b/solr/core/src/test/org/apache/solr/request/SubstringBytesRefFilterTest.java
new file mode 100644
index 00000000000..a5b304882f8
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/request/SubstringBytesRefFilterTest.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.request;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.BytesRef;
+import org.junit.Test;
+
+public class SubstringBytesRefFilterTest extends LuceneTestCase {
+    
+  @Test
+  public void testSubstringBytesRefFilter() {
+    final List<String> substrings = new ArrayList<>(4);
+    substrings.add("foo");
+    substrings.add("ooba");
+    substrings.add("bar");
+    substrings.add("foobar");
+
+    final String contains = substrings.get(random().nextInt(substrings.size()));
+    final boolean ignoreCase = random().nextBoolean();
+    final SubstringBytesRefFilter filter = new SubstringBytesRefFilter(contains, ignoreCase);
+
+    assertTrue(filter.test(new BytesRef("foobar")));
+
+    if (ignoreCase) {
+      assertTrue(filter.test(new BytesRef("FooBar")));
+    } else {
+      assertFalse(filter.test(new BytesRef("FooBar")));
+    }
+
+    assertFalse(filter.test(new BytesRef("qux")));
+  }
+
+}