From 0b0d13dffec1f80d8a0eb41c9d2791a57c176d45 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Fri, 31 Jul 2009 17:41:04 +0000 Subject: [PATCH] LUCENE-1745: allow passing matching flags to the underlying regexp engine git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@799667 13f79535-47bb-0310-9956-ffa450edef68 --- contrib/CHANGES.txt | 8 ++++ .../regex/JakartaRegexpCapabilities.java | 33 ++++++++++++++- .../regex/JavaUtilRegexCapabilities.java | 41 ++++++++++++++++++- .../lucene/search/regex/TestRegexQuery.java | 30 ++++++++++++-- 4 files changed, 105 insertions(+), 7 deletions(-) diff --git a/contrib/CHANGES.txt b/contrib/CHANGES.txt index 09d756e365b..7d88fd6aec0 100644 --- a/contrib/CHANGES.txt +++ b/contrib/CHANGES.txt @@ -87,6 +87,14 @@ New features 10. LUCENE-1272: Add get/setBoost to MoreLikeThis. (Jonathan Leibiusky via Mike McCandless) +11. LUCENE-1745: Added constructors to JakartaRegexpCapabilities and + JavaUtilRegexCapabilities as well as static flags to support + configuring a RegexCapabilities implementation with the + implementation-specific modifier flags. Allows for callers to + customize the RegexQuery using the implementation-specific options + and fine tune how regular expressions are compiled and + matched. (Marc Zampetti zampettim@aim.com via Mike McCandless) + Optimizations 1. LUCENE-1643: Re-use the collation key (RawCollationKey) for diff --git a/contrib/regex/src/java/org/apache/lucene/search/regex/JakartaRegexpCapabilities.java b/contrib/regex/src/java/org/apache/lucene/search/regex/JakartaRegexpCapabilities.java index 1a7de6509c2..71692f58f55 100644 --- a/contrib/regex/src/java/org/apache/lucene/search/regex/JakartaRegexpCapabilities.java +++ b/contrib/regex/src/java/org/apache/lucene/search/regex/JakartaRegexpCapabilities.java @@ -27,9 +27,40 @@ import org.apache.regexp.RegexpTunnel; */ public class JakartaRegexpCapabilities implements RegexCapabilities { private RE regexp; + + // Define the flags that are possible. Redefine them here + // to avoid exposign the RE class to the caller. + + private int flags = RE.MATCH_NORMAL; + /** + * Flag to specify normal, case-sensitive matching behaviour. This is the default. + */ + public static final int FLAG_MATCH_NORMAL = RE.MATCH_NORMAL; + + /** + * Flag to specify that matching should be case-independent (folded) + */ + public static final int FLAG_MATCH_CASEINDEPENDENT = RE.MATCH_CASEINDEPENDENT; + + /** + * Contructs a RegexCapabilities with the default MATCH_NORMAL match style. + */ + public JakartaRegexpCapabilities() {} + + /** + * Constructs a RegexCapabilities with the provided match flags. + * Multiple flags should be ORed together. + * + * @param flags The matching style + */ + public JakartaRegexpCapabilities(int flags) + { + this.flags = flags; + } + public void compile(String pattern) { - regexp = new RE(pattern); + regexp = new RE(pattern, this.flags); } public boolean match(String string) { diff --git a/contrib/regex/src/java/org/apache/lucene/search/regex/JavaUtilRegexCapabilities.java b/contrib/regex/src/java/org/apache/lucene/search/regex/JavaUtilRegexCapabilities.java index 4748e0a0bab..e950ab70b24 100644 --- a/contrib/regex/src/java/org/apache/lucene/search/regex/JavaUtilRegexCapabilities.java +++ b/contrib/regex/src/java/org/apache/lucene/search/regex/JavaUtilRegexCapabilities.java @@ -28,9 +28,46 @@ import java.util.regex.Pattern; */ public class JavaUtilRegexCapabilities implements RegexCapabilities { private Pattern pattern; - + private int flags = 0; + + // Define the optional flags from Pattern that can be used. + // Do this here to keep Pattern contained within this class. + + public static final int FLAG_CANON_EQ = Pattern.CANON_EQ; + public static final int FLAG_CASE_INSENSITIVE = Pattern.CASE_INSENSITIVE; + public static final int FLAG_COMMENTS = Pattern.COMMENTS; + public static final int FLAG_DOTALL = Pattern.DOTALL; + public static final int FLAG_LITERAL = Pattern.LITERAL; + public static final int FLAG_MULTILINE = Pattern.MULTILINE; + public static final int FLAG_UNICODE_CASE = Pattern.UNICODE_CASE; + public static final int FLAG_UNIX_LINES = Pattern.UNIX_LINES; + + /** + * Default constructor that uses java.util.regex.Pattern + * with its default flags. + */ + public JavaUtilRegexCapabilities() { + this.flags = 0; + } + + /** + * Constructor that allows for the modification of the flags that + * the java.util.regex.Pattern will use to compile the regular expression. + * This gives the user the ability to fine-tune how the regular expression + * to match the functionlity that they need. + * The {@link java.util.regex.Pattern Pattern} class supports specifying + * these fields via the regular expression text itself, but this gives the caller + * another option to modify the behavior. Useful in cases where the regular expression text + * cannot be modified, or if doing so is undesired. + * + * @flags The flags that are ORed together. + */ + public JavaUtilRegexCapabilities(int flags) { + this.flags = flags; + } + public void compile(String pattern) { - this.pattern = Pattern.compile(pattern); + this.pattern = Pattern.compile(pattern, this.flags); } public boolean match(String string) { diff --git a/contrib/regex/src/test/org/apache/lucene/search/regex/TestRegexQuery.java b/contrib/regex/src/test/org/apache/lucene/search/regex/TestRegexQuery.java index 1527d57f74f..1947263f05e 100644 --- a/contrib/regex/src/test/org/apache/lucene/search/regex/TestRegexQuery.java +++ b/contrib/regex/src/test/org/apache/lucene/search/regex/TestRegexQuery.java @@ -33,6 +33,7 @@ public class TestRegexQuery extends TestCase { private IndexSearcher searcher; private final String FN = "field"; + public void setUp() { RAMDirectory directory = new RAMDirectory(); try { @@ -59,8 +60,12 @@ public class TestRegexQuery extends TestCase { private Term newTerm(String value) { return new Term(FN, value); } - private int regexQueryNrHits(String regex) throws Exception { + private int regexQueryNrHits(String regex, RegexCapabilities capability) throws Exception { RegexQuery query = new RegexQuery( newTerm(regex)); + + if ( capability != null ) + query.setRegexImplementation(capability); + return searcher.search(query).length(); } @@ -68,19 +73,20 @@ public class TestRegexQuery extends TestCase { SpanRegexQuery srq1 = new SpanRegexQuery( newTerm(regex1)); SpanRegexQuery srq2 = new SpanRegexQuery( newTerm(regex2)); SpanNearQuery query = new SpanNearQuery( new SpanQuery[]{srq1, srq2}, slop, ordered); + return searcher.search(query).length(); } public void testRegex1() throws Exception { - assertEquals(1, regexQueryNrHits("^q.[aeiou]c.*$")); + assertEquals(1, regexQueryNrHits("^q.[aeiou]c.*$", null)); } public void testRegex2() throws Exception { - assertEquals(0, regexQueryNrHits("^.[aeiou]c.*$")); + assertEquals(0, regexQueryNrHits("^.[aeiou]c.*$", null)); } public void testRegex3() throws Exception { - assertEquals(0, regexQueryNrHits("^q.[aeiou]c$")); + assertEquals(0, regexQueryNrHits("^q.[aeiou]c$", null)); } public void testSpanRegex1() throws Exception { @@ -98,6 +104,22 @@ public class TestRegexQuery extends TestCase { RegexQuery query2 = new RegexQuery( newTerm("foo.*")); assertFalse(query1.equals(query2)); } + + public void testJakartaCaseSensativeFail() throws Exception { + assertEquals(0, regexQueryNrHits("^.*DOG.*$", null)); + } + + public void testJavaUtilCaseSensativeFail() throws Exception { + assertEquals(0, regexQueryNrHits("^.*DOG.*$", null)); + } + + public void testJakartaCaseInsensative() throws Exception { + assertEquals(1, regexQueryNrHits("^.*DOG.*$", new JakartaRegexpCapabilities(JakartaRegexpCapabilities.FLAG_MATCH_CASEINDEPENDENT))); + } + + public void testJavaUtilCaseInsensative() throws Exception { + assertEquals(1, regexQueryNrHits("^.*DOG.*$", new JavaUtilRegexCapabilities(JavaUtilRegexCapabilities.FLAG_CASE_INSENSITIVE))); + } }