From 93e3d755d4414fc229a5c20aa3e260cc863eae10 Mon Sep 17 00:00:00 2001 From: Noble Paul Date: Fri, 11 Dec 2009 09:03:28 +0000 Subject: [PATCH] SOLR-1625 Add regexp support for TermsComponent git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@889537 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 5 +- .../solr/common/params/TermsParams.java | 28 +++++++- .../handler/component/TermsComponent.java | 27 +++++++- .../handler/component/TermsComponentTest.java | 68 +++++++++++++++++++ 4 files changed, 121 insertions(+), 7 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 189fb81bcc2..32afbfba682 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -53,8 +53,7 @@ New Features * SOLR-1571: Added unicode collation support though Lucene's CollationKeyFilter (Robert Muir via shalin) -* SOLR-785: Distributed Search support for SpellCheckComponent - (Matthew Woytowitz, shalin) +* SOLR-1625: Add regexp support for TermsComponent (Uri Boness via noble) Optimizations ---------------------- @@ -142,8 +141,6 @@ Other Changes * SOLR-1608: Extract base class from TestDistributedSearch to make it easy to write test cases for other distributed components. (shalin) -* Upgraded to Lucene 2.9-dev r888785 (shalin) - Build ---------------------- diff --git a/src/common/org/apache/solr/common/params/TermsParams.java b/src/common/org/apache/solr/common/params/TermsParams.java index d08dd183a28..153e987cf80 100644 --- a/src/common/org/apache/solr/common/params/TermsParams.java +++ b/src/common/org/apache/solr/common/params/TermsParams.java @@ -17,6 +17,7 @@ package org.apache.solr.common.params; +import java.util.regex.Pattern; /** * @@ -68,7 +69,32 @@ public interface TermsParams { public static final String TERMS_PREFIX_STR = TERMS_PREFIX + "prefix"; - /** + public static final String TERMS_REGEXP_STR = TERMS_PREFIX + "regex"; + + public static final String TERMS_REGEXP_FLAG = TERMS_REGEXP_STR + ".flag"; + + public static enum TermsRegexpFlag { + UNIX_LINES(Pattern.UNIX_LINES), + CASE_INSENSITIVE(Pattern.CASE_INSENSITIVE), + COMMENTS(Pattern.COMMENTS), + MULTILINE(Pattern.MULTILINE), + LITERAL(Pattern.LITERAL), + DOTALL(Pattern.DOTALL), + UNICODE_CASE(Pattern.UNICODE_CASE), + CANON_EQ(Pattern.CANON_EQ); + + int value; + + TermsRegexpFlag(int value) { + this.value = value; + } + + public int getValue() { + return value; + } + } + + /** * Optional. The minimum value of docFreq to be returned. 1 by default */ public static final String TERMS_MINCOUNT = TERMS_PREFIX + "mincount"; diff --git a/src/java/org/apache/solr/handler/component/TermsComponent.java b/src/java/org/apache/solr/handler/component/TermsComponent.java index 93c82cf3c49..c9f018d0b0a 100644 --- a/src/java/org/apache/solr/handler/component/TermsComponent.java +++ b/src/java/org/apache/solr/handler/component/TermsComponent.java @@ -20,7 +20,6 @@ import org.apache.lucene.index.Term; import org.apache.lucene.index.TermEnum; import org.apache.lucene.util.StringHelper; import org.apache.solr.common.SolrException; -import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.params.TermsParams; import org.apache.solr.common.util.NamedList; @@ -30,7 +29,7 @@ import org.apache.solr.request.SimpleFacets.CountPair; import org.apache.solr.util.BoundedTreeSet; import java.io.IOException; - +import java.util.regex.Pattern; /** * Return TermEnum information, useful for things like auto suggest. @@ -65,6 +64,9 @@ public class TermsComponent extends SearchComponent { freqmax = Integer.MAX_VALUE; } String prefix = params.get(TermsParams.TERMS_PREFIX_STR); + String regexp = params.get(TermsParams.TERMS_REGEXP_STR); + Pattern pattern = regexp != null ? Pattern.compile(regexp, resolveRegexpFlags(params)) : null; + boolean raw = params.getBool(TermsParams.TERMS_RAW, false); for (int j = 0; j < fields.length; j++) { String field = StringHelper.intern(fields[j]); @@ -105,6 +107,11 @@ public class TermsComponent extends SearchComponent { // stop if the prefix doesn't match if (prefix != null && !indexedText.startsWith(prefix)) break; + if (pattern != null && !pattern.matcher(indexedText).matches()) { + termEnum.next(); + continue; + } + if (upperTerm != null) { int upperCmp = theTerm.compareTo(upperTerm); // if we are past the upper term, or equal to it (when don't include upper) then stop. @@ -146,6 +153,22 @@ public class TermsComponent extends SearchComponent { } } + int resolveRegexpFlags(SolrParams params) { + String[] flagParams = params.getParams(TermsParams.TERMS_REGEXP_FLAG); + if (flagParams == null) { + return 0; + } + int flags = 0; + for (String flagParam : flagParams) { + try { + flags |= TermsParams.TermsRegexpFlag.valueOf(flagParam.toUpperCase()).getValue(); + } catch (IllegalArgumentException iae) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unknown terms regex flag '" + flagParam + "'"); + } + } + return flags; + } + public void prepare(ResponseBuilder rb) throws IOException { //nothing to do } diff --git a/src/test/org/apache/solr/handler/component/TermsComponentTest.java b/src/test/org/apache/solr/handler/component/TermsComponentTest.java index 00475f752f7..bf54f3ff585 100644 --- a/src/test/org/apache/solr/handler/component/TermsComponentTest.java +++ b/src/test/org/apache/solr/handler/component/TermsComponentTest.java @@ -26,6 +26,7 @@ import org.apache.solr.request.SolrRequestHandler; import org.apache.solr.request.LocalSolrQueryRequest; import org.apache.solr.request.SolrQueryResponse; +import java.util.regex.Pattern; /** * @@ -210,6 +211,73 @@ public class TermsComponentTest extends AbstractSolrTestCase { assertTrue("value is null and it shouldn't be", value != null); } + public void testRegexp() throws Exception { + SolrCore core = h.getCore(); + TermsComponent tc = (TermsComponent) core.getSearchComponent("termsComp"); + assertTrue("tc is null and it shouldn't be", tc != null); + + ModifiableSolrParams params = new ModifiableSolrParams(); + params.add(TermsParams.TERMS, "true"); + params.add(TermsParams.TERMS_FIELD, "standardfilt"); + params.add(TermsParams.TERMS_LOWER, "bb"); + params.add(TermsParams.TERMS_LOWER_INCLUSIVE, "false"); + params.add(TermsParams.TERMS_REGEXP_STR, "b.*"); + params.add(TermsParams.TERMS_UPPER, "bbbb"); + params.add(TermsParams.TERMS_UPPER_INCLUSIVE, "true"); + params.add(TermsParams.TERMS_LIMIT, String.valueOf(50)); + SolrRequestHandler handler; + SolrQueryResponse rsp; + NamedList values; + NamedList terms; + handler = core.getRequestHandler("/terms"); + assertTrue("handler is null and it shouldn't be", handler != null); + rsp = new SolrQueryResponse(); + rsp.add("responseHeader", new SimpleOrderedMap()); + handler.handleRequest(new LocalSolrQueryRequest(core, params), rsp); + values = rsp.getValues(); + terms = (NamedList) ((NamedList) values.get("terms")).get("standardfilt"); + assertEquals("terms Size: " + terms.size() + " is not: 1", 1, terms.size()); + } + + public void testRegexpFlagParsing() { + ModifiableSolrParams params = new ModifiableSolrParams(); + params.add(TermsParams.TERMS_REGEXP_FLAG, "case_insensitive", "literal", "comments", "multiline", "unix_lines", + "unicode_case", "dotall", "canon_eq"); + int flags = new TermsComponent().resolveRegexpFlags(params); + int expected = Pattern.CASE_INSENSITIVE | Pattern.LITERAL | Pattern.COMMENTS | Pattern.MULTILINE | Pattern.UNIX_LINES + | Pattern.UNICODE_CASE | Pattern.DOTALL | Pattern.CANON_EQ; + assertEquals(expected, flags); + } + + public void testRegexpWithFlags() throws Exception { + SolrCore core = h.getCore(); + TermsComponent tc = (TermsComponent) core.getSearchComponent("termsComp"); + assertTrue("tc is null and it shouldn't be", tc != null); + + ModifiableSolrParams params = new ModifiableSolrParams(); + params.add(TermsParams.TERMS, "true"); + params.add(TermsParams.TERMS_FIELD, "standardfilt"); + params.add(TermsParams.TERMS_LOWER, "bb"); + params.add(TermsParams.TERMS_LOWER_INCLUSIVE, "false"); + params.add(TermsParams.TERMS_REGEXP_STR, "B.*"); + params.add(TermsParams.TERMS_REGEXP_FLAG, "case_insensitive"); + params.add(TermsParams.TERMS_UPPER, "bbbb"); + params.add(TermsParams.TERMS_UPPER_INCLUSIVE, "true"); + params.add(TermsParams.TERMS_LIMIT, String.valueOf(50)); + SolrRequestHandler handler; + SolrQueryResponse rsp; + NamedList values; + NamedList terms; + handler = core.getRequestHandler("/terms"); + assertTrue("handler is null and it shouldn't be", handler != null); + rsp = new SolrQueryResponse(); + rsp.add("responseHeader", new SimpleOrderedMap()); + handler.handleRequest(new LocalSolrQueryRequest(core, params), rsp); + values = rsp.getValues(); + terms = (NamedList) ((NamedList) values.get("terms")).get("standardfilt"); + assertEquals("terms Size: " + terms.size() + " is not: 1", 1, terms.size()); + } + public void testSortCount() throws Exception { SolrCore core = h.getCore(); TermsComponent tc = (TermsComponent) core.getSearchComponent("termsComp");