From 396229f18d851203b44ca7c6c9de7b0935a37399 Mon Sep 17 00:00:00 2001 From: Erik Hatcher Date: Wed, 28 Dec 2005 13:57:40 +0000 Subject: [PATCH] Remove (Span)RegexQuery from core. Add completely refactored version to contrib/regex allowing pluggable regex implementations. contrib/regex is still a work in progress, and documentation is forthcoming git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@359526 13f79535-47bb-0310-9956-ffa450edef68 --- contrib/regex/build.xml | 19 ++++++++ contrib/regex/lib/jakarta-regexp-1.4.jar | 2 + .../regex/JakartaRegexpCapabilities.java | 21 +++++++++ .../regex/JavaUtilRegexCapabilities.java | 19 ++++++++ .../search/regex/RegexCapabilities.java | 7 +++ .../lucene/search/regex/RegexQuery.java | 47 +++++++++++++++++++ .../search/regex/RegexQueryCapable.java | 6 +++ .../lucene/search/regex/RegexTermEnum.java | 26 ++++------ .../lucene/search/regex/SpanRegexQuery.java | 32 ++++++++++--- .../java/org/apache/regexp/RegexpTunnel.java | 11 +++++ .../lucene/search/regex/TestRegexQuery.java | 27 ++++++----- .../search/regex/TestSpanRegexQuery.java | 4 -- .../lucene/search/regex/RegexQuery.java | 26 ---------- 13 files changed, 180 insertions(+), 67 deletions(-) create mode 100644 contrib/regex/build.xml create mode 100644 contrib/regex/lib/jakarta-regexp-1.4.jar create mode 100644 contrib/regex/src/java/org/apache/lucene/search/regex/JakartaRegexpCapabilities.java create mode 100644 contrib/regex/src/java/org/apache/lucene/search/regex/JavaUtilRegexCapabilities.java create mode 100644 contrib/regex/src/java/org/apache/lucene/search/regex/RegexCapabilities.java create mode 100644 contrib/regex/src/java/org/apache/lucene/search/regex/RegexQuery.java create mode 100644 contrib/regex/src/java/org/apache/lucene/search/regex/RegexQueryCapable.java rename {src => contrib/regex/src}/java/org/apache/lucene/search/regex/RegexTermEnum.java (63%) rename {src => contrib/regex/src}/java/org/apache/lucene/search/regex/SpanRegexQuery.java (71%) create mode 100644 contrib/regex/src/java/org/apache/regexp/RegexpTunnel.java rename {src => contrib/regex/src}/test/org/apache/lucene/search/regex/TestRegexQuery.java (79%) rename {src => contrib/regex/src}/test/org/apache/lucene/search/regex/TestSpanRegexQuery.java (90%) delete mode 100644 src/java/org/apache/lucene/search/regex/RegexQuery.java diff --git a/contrib/regex/build.xml b/contrib/regex/build.xml new file mode 100644 index 00000000000..cbdbf8f9a23 --- /dev/null +++ b/contrib/regex/build.xml @@ -0,0 +1,19 @@ + + + + + + Regular expression query + + + + + + + + + + diff --git a/contrib/regex/lib/jakarta-regexp-1.4.jar b/contrib/regex/lib/jakarta-regexp-1.4.jar new file mode 100644 index 00000000000..0366f231ea9 --- /dev/null +++ b/contrib/regex/lib/jakarta-regexp-1.4.jar @@ -0,0 +1,2 @@ +AnyObjectId[5d70c357a1e6c4c702af313c94aaf3168d300dcf] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/regex/src/java/org/apache/lucene/search/regex/JakartaRegexpCapabilities.java b/contrib/regex/src/java/org/apache/lucene/search/regex/JakartaRegexpCapabilities.java new file mode 100644 index 00000000000..d32554825d3 --- /dev/null +++ b/contrib/regex/src/java/org/apache/lucene/search/regex/JakartaRegexpCapabilities.java @@ -0,0 +1,21 @@ +package org.apache.lucene.search.regex; + +import org.apache.regexp.RE; +import org.apache.regexp.RegexpTunnel; + +public class JakartaRegexpCapabilities implements RegexCapabilities { + private RE regexp; + + public void compile(String pattern) { + regexp = new RE(pattern); + } + + public boolean match(String string) { + return regexp.match(string); + } + + public String prefix() { + char[] prefix = RegexpTunnel.getPrefix(regexp); + return prefix == null ? null : new String(prefix); + } +} diff --git a/contrib/regex/src/java/org/apache/lucene/search/regex/JavaUtilRegexCapabilities.java b/contrib/regex/src/java/org/apache/lucene/search/regex/JavaUtilRegexCapabilities.java new file mode 100644 index 00000000000..90720b55ab9 --- /dev/null +++ b/contrib/regex/src/java/org/apache/lucene/search/regex/JavaUtilRegexCapabilities.java @@ -0,0 +1,19 @@ +package org.apache.lucene.search.regex; + +import java.util.regex.Pattern; + +public class JavaUtilRegexCapabilities implements RegexCapabilities { + private Pattern pattern; + + public void compile(String pattern) { + this.pattern = Pattern.compile(pattern); + } + + public boolean match(String string) { + return pattern.matcher(string).lookingAt(); + } + + public String prefix() { + return null; + } +} diff --git a/contrib/regex/src/java/org/apache/lucene/search/regex/RegexCapabilities.java b/contrib/regex/src/java/org/apache/lucene/search/regex/RegexCapabilities.java new file mode 100644 index 00000000000..fb1224e5240 --- /dev/null +++ b/contrib/regex/src/java/org/apache/lucene/search/regex/RegexCapabilities.java @@ -0,0 +1,7 @@ +package org.apache.lucene.search.regex; + +public interface RegexCapabilities { + void compile(String pattern); + boolean match(String string); + String prefix(); +} diff --git a/contrib/regex/src/java/org/apache/lucene/search/regex/RegexQuery.java b/contrib/regex/src/java/org/apache/lucene/search/regex/RegexQuery.java new file mode 100644 index 00000000000..a0843bab97a --- /dev/null +++ b/contrib/regex/src/java/org/apache/lucene/search/regex/RegexQuery.java @@ -0,0 +1,47 @@ +package org.apache.lucene.search.regex; + +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.FilteredTermEnum; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.IndexReader; + +import java.io.IOException; + +public class RegexQuery extends MultiTermQuery implements RegexQueryCapable { + private RegexCapabilities regexImpl = new JavaUtilRegexCapabilities(); + + public RegexQuery(Term term) { + super(term); + } + + public void setRegexImplementation(RegexCapabilities impl) { + this.regexImpl = impl; + } + + public RegexCapabilities getRegexImplementation() { + return regexImpl; + } + + protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { + Term term = new Term(getTerm().field(), getTerm().text()); + return new RegexTermEnum(reader, term, regexImpl); + } + + /* generated by IntelliJ IDEA */ + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + if (!super.equals(o)) return false; + + final RegexQuery that = (RegexQuery) o; + + return regexImpl.equals(that.regexImpl); + } + + /* generated by IntelliJ IDEA */ + public int hashCode() { + int result = super.hashCode(); + result = 29 * result + regexImpl.hashCode(); + return result; + } +} diff --git a/contrib/regex/src/java/org/apache/lucene/search/regex/RegexQueryCapable.java b/contrib/regex/src/java/org/apache/lucene/search/regex/RegexQueryCapable.java new file mode 100644 index 00000000000..79ca08a4227 --- /dev/null +++ b/contrib/regex/src/java/org/apache/lucene/search/regex/RegexQueryCapable.java @@ -0,0 +1,6 @@ +package org.apache.lucene.search.regex; + +public interface RegexQueryCapable { + void setRegexImplementation(RegexCapabilities impl); + RegexCapabilities getRegexImplementation(); +} diff --git a/src/java/org/apache/lucene/search/regex/RegexTermEnum.java b/contrib/regex/src/java/org/apache/lucene/search/regex/RegexTermEnum.java similarity index 63% rename from src/java/org/apache/lucene/search/regex/RegexTermEnum.java rename to contrib/regex/src/java/org/apache/lucene/search/regex/RegexTermEnum.java index 528e6212f92..19b2739f160 100644 --- a/src/java/org/apache/lucene/search/regex/RegexTermEnum.java +++ b/contrib/regex/src/java/org/apache/lucene/search/regex/RegexTermEnum.java @@ -4,34 +4,24 @@ import org.apache.lucene.search.FilteredTermEnum; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; -import java.util.regex.Pattern; import java.io.IOException; public class RegexTermEnum extends FilteredTermEnum { private String field = ""; private String pre = ""; - boolean endEnum = false; - private Pattern pattern; + private boolean endEnum = false; + private RegexCapabilities regexImpl; - public RegexTermEnum(IndexReader reader, Term term) throws IOException { + public RegexTermEnum(IndexReader reader, Term term, RegexCapabilities regexImpl) throws IOException { super(); field = term.field(); String text = term.text(); + this.regexImpl = regexImpl; - pattern = Pattern.compile(text); + regexImpl.compile(text); - // Find the first regex character position, to find the - // maximum prefix to use for term enumeration - int index = 0; - while (index < text.length()) { - char c = text.charAt(index); - - if (!Character.isLetterOrDigit(c)) break; - - index++; - } - - pre = text.substring(0, index); + pre = regexImpl.prefix(); + if (pre == null) pre = ""; setEnum(reader.terms(new Term(term.field(), pre))); } @@ -40,7 +30,7 @@ public class RegexTermEnum extends FilteredTermEnum { if (field == term.field()) { String searchText = term.text(); if (searchText.startsWith(pre)) { - return pattern.matcher(searchText).matches(); + return regexImpl.match(searchText); } } endEnum = true; diff --git a/src/java/org/apache/lucene/search/regex/SpanRegexQuery.java b/contrib/regex/src/java/org/apache/lucene/search/regex/SpanRegexQuery.java similarity index 71% rename from src/java/org/apache/lucene/search/regex/SpanRegexQuery.java rename to contrib/regex/src/java/org/apache/lucene/search/regex/SpanRegexQuery.java index a679a5225bc..37c649e018c 100644 --- a/src/java/org/apache/lucene/search/regex/SpanRegexQuery.java +++ b/contrib/regex/src/java/org/apache/lucene/search/regex/SpanRegexQuery.java @@ -16,7 +16,8 @@ import java.io.IOException; import java.util.Collection; import java.util.ArrayList; -public class SpanRegexQuery extends SpanQuery { +public class SpanRegexQuery extends SpanQuery implements RegexQueryCapable { + private RegexCapabilities regexImpl = new JavaUtilRegexCapabilities(); private Term term; public SpanRegexQuery(Term term) { @@ -26,10 +27,11 @@ public class SpanRegexQuery extends SpanQuery { public Term getTerm() { return term; } public Query rewrite(IndexReader reader) throws IOException { - Query orig = new RegexQuery(term).rewrite(reader); + RegexQuery orig = new RegexQuery(term); + orig.setRegexImplementation(regexImpl); // RegexQuery (via MultiTermQuery).rewrite always returns a BooleanQuery - BooleanQuery bq = (BooleanQuery) orig; + BooleanQuery bq = (BooleanQuery) orig.rewrite(reader); BooleanClause[] clauses = bq.getClauses(); SpanQuery[] sqs = new SpanQuery[clauses.length]; @@ -63,15 +65,25 @@ public class SpanRegexQuery extends SpanQuery { return terms; } + /* generated by IntelliJ IDEA */ public boolean equals(Object o) { if (this == o) return true; - if (!(o instanceof SpanRegexQuery)) return false; + if (o == null || getClass() != o.getClass()) return false; + final SpanRegexQuery that = (SpanRegexQuery) o; - return term.equals(that.term) && getBoost() == that.getBoost(); + + if (!regexImpl.equals(that.regexImpl)) return false; + if (!term.equals(that.term)) return false; + + return true; } + /* generated by IntelliJ IDEA */ public int hashCode() { - return term.hashCode() ^ Float.floatToRawIntBits(getBoost()) ^ 0x4BCEF3A9; + int result; + result = regexImpl.hashCode(); + result = 29 * result + term.hashCode(); + return result; } public String toString(String field) { @@ -82,4 +94,12 @@ public class SpanRegexQuery extends SpanQuery { buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); } + + public void setRegexImplementation(RegexCapabilities impl) { + this.regexImpl = impl; + } + + public RegexCapabilities getRegexImplementation() { + return regexImpl; + } } diff --git a/contrib/regex/src/java/org/apache/regexp/RegexpTunnel.java b/contrib/regex/src/java/org/apache/regexp/RegexpTunnel.java new file mode 100644 index 00000000000..5861956beeb --- /dev/null +++ b/contrib/regex/src/java/org/apache/regexp/RegexpTunnel.java @@ -0,0 +1,11 @@ +package org.apache.regexp; + +/** + * This class exists as a gateway to access useful Jakarta Regexp package protected data. + */ +public class RegexpTunnel { + public static char[] getPrefix(RE regexp) { + REProgram program = regexp.getProgram(); + return program.prefix; + } +} diff --git a/src/test/org/apache/lucene/search/regex/TestRegexQuery.java b/contrib/regex/src/test/org/apache/lucene/search/regex/TestRegexQuery.java similarity index 79% rename from src/test/org/apache/lucene/search/regex/TestRegexQuery.java rename to contrib/regex/src/test/org/apache/lucene/search/regex/TestRegexQuery.java index caa9e326e00..74b105d231a 100644 --- a/src/test/org/apache/lucene/search/regex/TestRegexQuery.java +++ b/contrib/regex/src/test/org/apache/lucene/search/regex/TestRegexQuery.java @@ -24,7 +24,6 @@ import org.apache.lucene.analysis.SimpleAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanQuery; @@ -59,7 +58,7 @@ public class TestRegexQuery extends TestCase { private Term newTerm(String value) { return new Term(FN, value); } private int regexQueryNrHits(String regex) throws Exception { - Query query = new RegexQuery( newTerm(regex)); + RegexQuery query = new RegexQuery( newTerm(regex)); return searcher.search(query).length(); } @@ -71,30 +70,32 @@ public class TestRegexQuery extends TestCase { } public void testRegex1() throws Exception { - assertEquals(1, regexQueryNrHits("q.[aeiou]c.*")); + assertEquals(1, regexQueryNrHits("^q.[aeiou]c.*$")); } public void testRegex2() throws Exception { - assertEquals(0, regexQueryNrHits(".[aeiou]c.*")); + assertEquals(0, regexQueryNrHits("^.[aeiou]c.*$")); } public void testRegex3() throws Exception { - assertEquals(0, regexQueryNrHits("q.[aeiou]c")); + assertEquals(0, regexQueryNrHits("^q.[aeiou]c$")); } public void testSpanRegex1() throws Exception { - assertEquals(1, spanRegexQueryNrHits("q.[aeiou]c.*", "dog", 6, true)); + assertEquals(1, spanRegexQueryNrHits("^q.[aeiou]c.*$", "dog", 6, true)); } public void testSpanRegex2() throws Exception { - assertEquals(0, spanRegexQueryNrHits("q.[aeiou]c.*", "dog", 5, true)); + assertEquals(0, spanRegexQueryNrHits("^q.[aeiou]c.*$", "dog", 5, true)); + } + + public void testEquals() throws Exception { + RegexQuery query1 = new RegexQuery( newTerm("foo.*")); + query1.setRegexImplementation(new JakartaRegexpCapabilities()); + + RegexQuery query2 = new RegexQuery( newTerm("foo.*")); + assertFalse(query1.equals(query2)); } -// public void testPrefix() throws Exception { - // This test currently fails because RegexTermEnum picks "r" as the prefix - // but the following "?" makes the "r" optional and should be a hit for the - // document matching "over". -// assertEquals(1, regexQueryNrHits("r?over")); -// } } diff --git a/src/test/org/apache/lucene/search/regex/TestSpanRegexQuery.java b/contrib/regex/src/test/org/apache/lucene/search/regex/TestSpanRegexQuery.java similarity index 90% rename from src/test/org/apache/lucene/search/regex/TestSpanRegexQuery.java rename to contrib/regex/src/test/org/apache/lucene/search/regex/TestSpanRegexQuery.java index 8438532d240..6b662bcdcd4 100644 --- a/src/test/org/apache/lucene/search/regex/TestSpanRegexQuery.java +++ b/contrib/regex/src/test/org/apache/lucene/search/regex/TestSpanRegexQuery.java @@ -9,7 +9,6 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Hits; -import org.apache.lucene.search.QueryUtils; import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanQuery; @@ -30,8 +29,5 @@ public class TestSpanRegexQuery extends TestCase { SpanNearQuery query = new SpanNearQuery(new SpanQuery[] {srq, stq}, 6, true); Hits hits = searcher.search(query); assertEquals(1, hits.length()); - QueryUtils.check(srq); - QueryUtils.checkUnequal(srq,stq); - QueryUtils.checkUnequal(srq,query); } } diff --git a/src/java/org/apache/lucene/search/regex/RegexQuery.java b/src/java/org/apache/lucene/search/regex/RegexQuery.java deleted file mode 100644 index 38ce8a5d2e8..00000000000 --- a/src/java/org/apache/lucene/search/regex/RegexQuery.java +++ /dev/null @@ -1,26 +0,0 @@ -package org.apache.lucene.search.regex; - -import org.apache.lucene.search.MultiTermQuery; -import org.apache.lucene.search.FilteredTermEnum; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.IndexReader; - -import java.io.IOException; - -public class RegexQuery extends MultiTermQuery { - public RegexQuery(Term term) { - super(term); - } - - protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { - Term term = new Term(getTerm().field(), getTerm().text()); - return new RegexTermEnum(reader, term); - } - - public boolean equals(Object o) { - if (o instanceof RegexQuery) - return super.equals(o); - - return false; - } -}