SOLR-1387: Move contains() method to SimpleFacets

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1672106 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Alan Woodward 2015-04-08 15:28:13 +00:00
parent 3b8ea7b2e7
commit 3c0ff1de91
7 changed files with 69 additions and 92 deletions

View File

@ -22,7 +22,6 @@ import java.math.BigInteger;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Paths; import java.nio.file.Paths;
import java.util.Arrays; import java.util.Arrays;
import java.util.Locale;
import java.util.Properties; import java.util.Properties;
/** /**
@ -135,34 +134,6 @@ public abstract class StringHelper {
return sliceEquals(ref, suffix, ref.length - suffix.length); return sliceEquals(ref, suffix, ref.length - suffix.length);
} }
/**
* Returns <code>true</code> iff the ref contains the given slice. Otherwise
* <code>false</code>.
*
* @param ref
* the {@link BytesRef} to test
* @param slice
* the slice to look for
* @param ignoreCase
* whether the comparison should be case-insensitive
* @return Returns <code>true</code> iff the ref contains the given slice.
* Otherwise <code>false</code>.
*/
public static boolean contains(BytesRef ref, BytesRef slice, boolean ignoreCase) {
if (ignoreCase) {
String s1 = ref.utf8ToString();
String s2 = slice.utf8ToString();
return s1.toLowerCase(Locale.ENGLISH).contains(s2.toLowerCase(Locale.ENGLISH));
} else {
for (int pos = 0; pos <= ref.length - slice.length; ++pos) {
if (sliceEquals(ref, slice, pos)) {
return true;
}
}
}
return false;
}
private static boolean sliceEquals(BytesRef sliceToTest, BytesRef other, int pos) { private static boolean sliceEquals(BytesRef sliceToTest, BytesRef other, int pos) {
if (pos < 0 || sliceToTest.length - pos < other.length) { if (pos < 0 || sliceToTest.length - pos < other.length) {
return false; return false;

View File

@ -56,36 +56,6 @@ public class TestStringHelper extends LuceneTestCase {
assertTrue(StringHelper.endsWith(ref, slice)); assertTrue(StringHelper.endsWith(ref, slice));
} }
public void testContainsAtStart() {
BytesRef ref = new BytesRef("foobar");
BytesRef slice = new BytesRef("foo");
assertTrue(StringHelper.contains(ref, slice, false));
}
public void testContains() {
BytesRef ref = new BytesRef("foobar");
BytesRef slice = new BytesRef("ooba");
assertTrue(StringHelper.contains(ref, slice, false));
}
public void testContainsAtEnd() {
BytesRef ref = new BytesRef("foobar");
BytesRef slice = new BytesRef("bar");
assertTrue(StringHelper.contains(ref, slice, false));
}
public void testContainsWhole() {
BytesRef ref = new BytesRef("foobar");
BytesRef slice = new BytesRef("foobar");
assertTrue(StringHelper.contains(ref, slice, false));
}
public void testContainsIgnoreCase() {
BytesRef ref = new BytesRef("FooBar");
BytesRef slice = new BytesRef("bar");
assertTrue(StringHelper.contains(ref, slice, true));
}
public void testMurmurHash3() throws Exception { public void testMurmurHash3() throws Exception {
// Hashes computed using murmur3_32 from https://code.google.com/p/pyfasthash // Hashes computed using murmur3_32 from https://code.google.com/p/pyfasthash
assertEquals(0xf6a5c420, StringHelper.murmurhash3_x86_32(new BytesRef("foo"), 0)); assertEquals(0xf6a5c420, StringHelper.murmurhash3_x86_32(new BytesRef("foo"), 0));

View File

@ -17,12 +17,12 @@ package org.apache.lucene.demo.facet;
* limitations under the License. * limitations under the License.
*/ */
import java.util.List;
import org.apache.lucene.facet.FacetResult; import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test; import org.junit.Test;
import java.util.List;
public class TestSimpleFacetsExample extends LuceneTestCase { public class TestSimpleFacetsExample extends LuceneTestCase {
@Test @Test
@ -54,4 +54,5 @@ public class TestSimpleFacetsExample extends LuceneTestCase {
assertEquals("dim=Publish Date path=[] value=5 childCount=3\n 2010 (2)\n 2012 (2)\n 1999 (1)\n", result.get(0).toString()); assertEquals("dim=Publish Date path=[] value=5 childCount=3\n 2010 (2)\n 2012 (2)\n 1999 (1)\n", result.get(0).toString());
assertEquals("dim=Author path=[] value=2 childCount=2\n Bob (1)\n Lisa (1)\n", result.get(1).toString()); assertEquals("dim=Author path=[] value=2 childCount=2\n Bob (1)\n Lisa (1)\n", result.get(1).toString());
} }
} }

View File

@ -17,11 +17,8 @@ package org.apache.solr.request;
* limitations under the License. * limitations under the License.
*/ */
import java.io.IOException;
import java.util.List;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.DocValues; import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiDocValues.MultiSortedDocValues; import org.apache.lucene.index.MultiDocValues.MultiSortedDocValues;
import org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues; import org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues;
import org.apache.lucene.index.MultiDocValues.OrdinalMap; import org.apache.lucene.index.MultiDocValues.OrdinalMap;
@ -34,7 +31,6 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder; import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.LongValues; import org.apache.lucene.util.LongValues;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.UnicodeUtil;
import org.apache.solr.common.params.FacetParams; import org.apache.solr.common.params.FacetParams;
import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.NamedList;
@ -44,6 +40,9 @@ import org.apache.solr.search.DocSet;
import org.apache.solr.search.SolrIndexSearcher; import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.LongPriorityQueue; import org.apache.solr.util.LongPriorityQueue;
import java.io.IOException;
import java.util.List;
/** /**
* Computes term facets for docvalues field (single or multivalued). * Computes term facets for docvalues field (single or multivalued).
* <p> * <p>
@ -99,8 +98,6 @@ public class DocValuesFacets {
prefixRef.copyChars(prefix); prefixRef.copyChars(prefix);
} }
final BytesRef containsBR = contains != null ? new BytesRef(contains) : null;
int startTermIndex, endTermIndex; int startTermIndex, endTermIndex;
if (prefix!=null) { if (prefix!=null) {
startTermIndex = (int) si.lookupTerm(prefixRef.get()); startTermIndex = (int) si.lookupTerm(prefixRef.get());
@ -173,9 +170,9 @@ public class DocValuesFacets {
int min=mincount-1; // the smallest value in the top 'N' values int min=mincount-1; // the smallest value in the top 'N' values
for (int i=(startTermIndex==-1)?1:0; i<nTerms; i++) { for (int i=(startTermIndex==-1)?1:0; i<nTerms; i++) {
int c = counts[i]; int c = counts[i];
if (containsBR != null) { if (contains != null) {
final BytesRef term = si.lookupOrd(startTermIndex+i); final BytesRef term = si.lookupOrd(startTermIndex+i);
if (!StringHelper.contains(term, containsBR, ignoreCase)) { if (!SimpleFacets.contains(term.utf8ToString(), contains, ignoreCase)) {
continue; continue;
} }
} }
@ -212,7 +209,7 @@ public class DocValuesFacets {
} else { } else {
// add results in index order // add results in index order
int i=(startTermIndex==-1)?1:0; int i=(startTermIndex==-1)?1:0;
if (mincount<=0 && containsBR == null) { if (mincount<=0 && contains == null) {
// if mincount<=0 and we're not examining the values for contains, then // if mincount<=0 and we're not examining the values for contains, then
// we won't discard any terms and we know exactly where to start. // we won't discard any terms and we know exactly where to start.
i+=off; i+=off;
@ -223,9 +220,9 @@ public class DocValuesFacets {
int c = counts[i]; int c = counts[i];
if (c<mincount) continue; if (c<mincount) continue;
BytesRef term = null; BytesRef term = null;
if (containsBR != null) { if (contains != null) {
term = si.lookupOrd(startTermIndex+i); term = si.lookupOrd(startTermIndex+i);
if (!StringHelper.contains(term, containsBR, ignoreCase)) { if (!SimpleFacets.contains(term.utf8ToString(), contains, ignoreCase)) {
continue; continue;
} }
} }

View File

@ -17,12 +17,8 @@
package org.apache.solr.request; package org.apache.solr.request;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.*;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.DocValues; import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSet;
@ -32,7 +28,6 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder; import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.UnicodeUtil;
import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.FacetParams; import org.apache.solr.common.params.FacetParams;
@ -42,6 +37,16 @@ import org.apache.solr.search.DocSet;
import org.apache.solr.search.SolrIndexSearcher; import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.BoundedTreeSet; import org.apache.solr.util.BoundedTreeSet;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.Future;
class PerSegmentSingleValuedFaceting { class PerSegmentSingleValuedFaceting {
@ -55,7 +60,7 @@ class PerSegmentSingleValuedFaceting {
boolean missing; boolean missing;
String sort; String sort;
String prefix; String prefix;
BytesRef containsBR; String contains;
boolean ignoreCase; boolean ignoreCase;
Filter baseSet; Filter baseSet;
@ -72,7 +77,7 @@ class PerSegmentSingleValuedFaceting {
this.missing = missing; this.missing = missing;
this.sort = sort; this.sort = sort;
this.prefix = prefix; this.prefix = prefix;
this.containsBR = contains != null ? new BytesRef(contains) : null; this.contains = contains;
this.ignoreCase = ignoreCase; this.ignoreCase = ignoreCase;
} }
@ -180,7 +185,7 @@ class PerSegmentSingleValuedFaceting {
SegFacet seg = queue.top(); SegFacet seg = queue.top();
// if facet.contains specified, only actually collect the count if substring contained // if facet.contains specified, only actually collect the count if substring contained
boolean collect = containsBR == null || StringHelper.contains(seg.tempBR, containsBR, ignoreCase); boolean collect = contains == null || SimpleFacets.contains(seg.tempBR.utf8ToString(), contains, ignoreCase);
// we will normally end up advancing the term enum for this segment // we will normally end up advancing the term enum for this segment
// while still using "val", so we need to make a copy since the BytesRef // while still using "val", so we need to make a copy since the BytesRef

View File

@ -17,6 +17,7 @@
package org.apache.solr.request; package org.apache.solr.request;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.index.Fields; import org.apache.lucene.index.Fields;
import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.LeafReaderContext;
@ -143,6 +144,25 @@ public class SimpleFacets {
this.rb = rb; this.rb = rb;
} }
/**
* Returns <code>true</code> if a String contains the given substring. Otherwise
* <code>false</code>.
*
* @param ref
* the {@link String} to test
* @param substring
* the substring to look for
* @param ignoreCase
* whether the comparison should be case-insensitive
* @return Returns <code>true</code> iff the String contains the given substring.
* Otherwise <code>false</code>.
*/
public static boolean contains(String ref, String substring, boolean ignoreCase) {
if (ignoreCase)
return StringUtils.containsIgnoreCase(ref, substring);
return StringUtils.contains(ref, substring);
}
protected void parseParams(String type, String param) throws SyntaxError, IOException { protected void parseParams(String type, String param) throws SyntaxError, IOException {
localParams = QueryParsing.getLocalParams(param, req.getParams()); localParams = QueryParsing.getLocalParams(param, req.getParams());
@ -494,7 +514,6 @@ public class SimpleFacets {
} }
BytesRef prefixBytesRef = prefix != null ? new BytesRef(prefix) : null; BytesRef prefixBytesRef = prefix != null ? new BytesRef(prefix) : null;
BytesRef containsRef = contains != null ? new BytesRef(contains) : null;
final TermGroupFacetCollector collector = TermGroupFacetCollector.createTermGroupFacetCollector(groupField, field, multiToken, prefixBytesRef, 128); final TermGroupFacetCollector collector = TermGroupFacetCollector.createTermGroupFacetCollector(groupField, field, multiToken, prefixBytesRef, 128);
SchemaField sf = searcher.getSchema().getFieldOrNull(groupField); SchemaField sf = searcher.getSchema().getFieldOrNull(groupField);
@ -526,7 +545,7 @@ public class SimpleFacets {
= result.getFacetEntries(offset, limit < 0 ? Integer.MAX_VALUE : limit); = result.getFacetEntries(offset, limit < 0 ? Integer.MAX_VALUE : limit);
for (TermGroupFacetCollector.FacetEntry facetEntry : scopedEntries) { for (TermGroupFacetCollector.FacetEntry facetEntry : scopedEntries) {
//:TODO:can we do contains earlier than this to make it more efficient? //:TODO:can we do contains earlier than this to make it more efficient?
if (containsRef != null && !StringHelper.contains(facetEntry.getValue(), containsRef, ignoreCase)) { if (contains != null && !contains(facetEntry.getValue().utf8ToString(), contains, ignoreCase)) {
continue; continue;
} }
facetFieldType.indexedToReadable(facetEntry.getValue(), charsRef); facetFieldType.indexedToReadable(facetEntry.getValue(), charsRef);
@ -731,12 +750,6 @@ public class SimpleFacets {
prefixTermBytes = new BytesRef(indexedPrefix); prefixTermBytes = new BytesRef(indexedPrefix);
} }
BytesRef containsTermBytes = null;
if (contains != null) {
String indexedContains = ft.toInternal(contains);
containsTermBytes = new BytesRef(indexedContains);
}
Fields fields = r.fields(); Fields fields = r.fields();
Terms terms = fields==null ? null : fields.terms(field); Terms terms = fields==null ? null : fields.terms(field);
TermsEnum termsEnum = null; TermsEnum termsEnum = null;
@ -769,7 +782,7 @@ public class SimpleFacets {
if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes)) if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes))
break; break;
if (containsTermBytes == null || StringHelper.contains(term, containsTermBytes, ignoreCase)) { if (contains == null || contains(term.utf8ToString(), contains, ignoreCase)) {
int df = termsEnum.docFreq(); int df = termsEnum.docFreq();
// If we are sorting, we can use df>min (rather than >=) since we // If we are sorting, we can use df>min (rather than >=) since we

View File

@ -2258,4 +2258,24 @@ public class SimpleFacetsTest extends SolrTestCaseJ4 {
400); 400);
} }
public void testContainsAtStart() {
assertTrue(SimpleFacets.contains("foobar", "foo", false));
}
public void testContains() {
assertTrue(SimpleFacets.contains("foobar", "ooba", false));
}
public void testContainsAtEnd() {
assertTrue(SimpleFacets.contains("foobar", "bar", false));
}
public void testContainsWhole() {
assertTrue(SimpleFacets.contains("foobar", "foobar", false));
}
public void testContainsIgnoreCase() {
assertTrue(SimpleFacets.contains("FooBar", "bar", true));
}
} }