Remove (Span)RegexQuery from core. Add completely refactored version to contrib/regex allowing pluggable regex implementations. contrib/regex is still a work in progress, and documentation is forthcoming

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@359526 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Erik Hatcher 2005-12-28 13:57:40 +00:00
parent aa07efc076
commit 396229f18d
13 changed files with 180 additions and 67 deletions

19
contrib/regex/build.xml Normal file
View File

@ -0,0 +1,19 @@
<?xml version="1.0"?>
<!-- Ant build file for the "regex" contrib module. No targets are defined here; the "default" target named below must come from the imported file. -->
<project name="regex" default="default">
<description>
Regular expression query
</description>
<!-- Jars in lib/ whose file names contain "-oro-" or "-regexp-". -->
<path id="additional.dependencies">
<fileset dir="lib" includes="*-oro-*.jar,*-regexp-*.jar"/>
</path>
<!-- Flattens the path above into the "project.classpath" property, formatted for Unix. NOTE(review): presumably read by ../contrib-build.xml; confirm there. -->
<pathconvert property="project.classpath"
targetos="unix"
refid="additional.dependencies"
/>
<!-- Imported after project.classpath is set. NOTE(review): presumably the build logic shared by contrib modules; confirm. -->
<import file="../contrib-build.xml"/>
</project>

View File

@ -0,0 +1,2 @@
AnyObjectId[5d70c357a1e6c4c702af313c94aaf3168d300dcf] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,21 @@
package org.apache.lucene.search.regex;
import org.apache.regexp.RE;
import org.apache.regexp.RegexpTunnel;
/**
 * {@link RegexCapabilities} implementation that delegates to a Jakarta Regexp
 * {@link RE} instance.
 *
 * <p>{@link #compile(String)} must be called before {@link #match(String)} or
 * {@link #prefix()}; until then no expression is held and both methods fail
 * with a <code>NullPointerException</code>.
 *
 * <p>Instances are compared by engine type only. The compiled expression is
 * deliberately left out of {@link #equals(Object)} and {@link #hashCode()} so
 * that queries holding separate instances of this engine can compare equal,
 * and so that compiling an expression does not change a query's identity.
 */
public class JakartaRegexpCapabilities implements RegexCapabilities {
  /** Expression built by the most recent {@link #compile(String)} call; null before that. */
  private RE regexp;

  /**
   * Builds a new {@link RE} from the given expression, replacing any
   * previously compiled one.
   *
   * @param pattern regular expression source text
   */
  public void compile(String pattern) {
    regexp = new RE(pattern);
  }

  /**
   * Tests a string against the compiled expression using {@link RE#match(String)}.
   *
   * @param string candidate text
   * @return the result of <code>RE.match</code>
   */
  public boolean match(String string) {
    return regexp.match(string);
  }

  /**
   * Returns the prefix Jakarta Regexp recorded for the compiled expression.
   *
   * @return the prefix as a string, or null when the engine recorded none
   */
  public String prefix() {
    char[] prefix = RegexpTunnel.getPrefix(regexp);
    return prefix == null ? null : new String(prefix);
  }

  /**
   * Two instances are equal when they are of exactly the same class.
   * Subclasses that add configuration should override this method and
   * {@link #hashCode()} together.
   */
  public boolean equals(Object o) {
    if (this == o) return true;
    return o != null && getClass() == o.getClass();
  }

  /** Hash code derived from the class name, consistent with {@link #equals(Object)}. */
  public int hashCode() {
    return getClass().getName().hashCode();
  }
}

View File

@ -0,0 +1,19 @@
package org.apache.lucene.search.regex;
import java.util.regex.Pattern;
/**
 * {@link RegexCapabilities} implementation backed by <code>java.util.regex</code>.
 *
 * <p>{@link #compile(String)} must be called before {@link #match(String)};
 * until then no pattern is held and <code>match</code> fails with a
 * <code>NullPointerException</code>.
 *
 * <p>Instances are compared by engine type only. The compiled pattern is
 * deliberately left out of {@link #equals(Object)} and {@link #hashCode()} so
 * that queries holding separate instances of this engine can compare equal,
 * and so that compiling a pattern does not change a query's identity.
 */
public class JavaUtilRegexCapabilities implements RegexCapabilities {
  /** Pattern built by the most recent {@link #compile(String)} call; null before that. */
  private Pattern pattern;

  /**
   * Compiles the expression with {@link Pattern#compile(String)}, replacing
   * any previously compiled pattern.
   *
   * @param pattern regular expression source text
   */
  public void compile(String pattern) {
    this.pattern = Pattern.compile(pattern);
  }

  /**
   * Tests a string with <code>Matcher.lookingAt()</code>: the match must begin
   * at the start of the string but need not extend to its end. Use an
   * explicit <code>$</code> in the expression to require a full match.
   *
   * @param string candidate text
   * @return true if the pattern matches a leading portion of the string
   */
  public boolean match(String string) {
    return pattern.matcher(string).lookingAt();
  }

  /**
   * This implementation does not derive a prefix from the pattern.
   *
   * @return always null
   */
  public String prefix() {
    return null;
  }

  /**
   * Two instances are equal when they are of exactly the same class.
   * Subclasses that add configuration should override this method and
   * {@link #hashCode()} together.
   */
  public boolean equals(Object o) {
    if (this == o) return true;
    return o != null && getClass() == o.getClass();
  }

  /** Hash code derived from the class name, consistent with {@link #equals(Object)}. */
  public int hashCode() {
    return getClass().getName().hashCode();
  }
}

View File

@ -0,0 +1,7 @@
package org.apache.lucene.search.regex;
/**
 * Pluggable regular expression engine used by the regex queries in this
 * package. An implementation is handed an expression through
 * {@link #compile(String)} and is then asked to test strings against it.
 */
public interface RegexCapabilities {
/**
 * Prepares the given expression for subsequent {@link #match(String)} and
 * {@link #prefix()} calls.
 *
 * @param pattern regular expression source text
 */
void compile(String pattern);
/**
 * Tests a string against the most recently compiled expression.
 *
 * @param string candidate text
 * @return true if the implementation considers the string a match
 */
boolean match(String string);
/**
 * Returns a prefix for the compiled expression, if the implementation can
 * supply one. RegexTermEnum seeks term enumeration to this prefix and only
 * passes terms that start with it to {@link #match(String)}; it treats null
 * as the empty prefix.
 *
 * @return the prefix, or null when none is available
 */
String prefix();
}

View File

@ -0,0 +1,47 @@
package org.apache.lucene.search.regex;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.FilteredTermEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexReader;
import java.io.IOException;
/**
 * Multi-term query whose term text is a regular expression. The expression
 * is evaluated by a pluggable {@link RegexCapabilities} engine, which
 * defaults to {@link JavaUtilRegexCapabilities}.
 */
public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {
  /** Engine used to compile and evaluate the expression; never null. */
  private RegexCapabilities regexImpl = new JavaUtilRegexCapabilities();

  /**
   * @param term field to search, with the regular expression as the term text
   */
  public RegexQuery(Term term) {
    super(term);
  }

  /**
   * Replaces the regular expression engine used by this query.
   *
   * @param impl engine to use; must not be null
   * @throws NullPointerException if <code>impl</code> is null. Failing here
   *         rather than later in {@link #equals(Object)}, {@link #hashCode()}
   *         or term enumeration points at the offending caller.
   */
  public void setRegexImplementation(RegexCapabilities impl) {
    if (impl == null) {
      throw new NullPointerException("impl must not be null");
    }
    this.regexImpl = impl;
  }

  /**
   * @return the regular expression engine currently in use
   */
  public RegexCapabilities getRegexImplementation() {
    return regexImpl;
  }

  /**
   * Creates the term enumeration for this query's field and expression,
   * handing it the configured engine.
   *
   * @param reader index to enumerate terms from
   * @return a {@link RegexTermEnum} over the query term
   * @throws IOException propagated from the {@link RegexTermEnum} constructor
   */
  protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
    Term term = new Term(getTerm().field(), getTerm().text());
    return new RegexTermEnum(reader, term, regexImpl);
  }

  /**
   * Equal when the other object has exactly this class, the superclass
   * considers the two equal, and both use equal regex engines.
   */
  public boolean equals(Object o) {
    if (this == o) return true;
    if (o == null || getClass() != o.getClass()) return false;
    if (!super.equals(o)) return false;

    final RegexQuery that = (RegexQuery) o;
    return regexImpl.equals(that.regexImpl);
  }

  /** Combines the superclass hash with the engine's hash, matching {@link #equals(Object)}. */
  public int hashCode() {
    int result = super.hashCode();
    result = 29 * result + regexImpl.hashCode();
    return result;
  }
}

View File

@ -0,0 +1,6 @@
package org.apache.lucene.search.regex;
/**
 * Implemented by queries whose regular expression engine can be swapped by
 * the caller, such as RegexQuery and SpanRegexQuery.
 */
public interface RegexQueryCapable {
/**
 * Selects the engine the query uses to evaluate its expression.
 *
 * @param impl regular expression engine to use
 */
void setRegexImplementation(RegexCapabilities impl);
/**
 * @return the regular expression engine the query currently uses
 */
RegexCapabilities getRegexImplementation();
}

View File

@ -4,34 +4,24 @@ import org.apache.lucene.search.FilteredTermEnum;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import java.util.regex.Pattern;
import java.io.IOException; import java.io.IOException;
public class RegexTermEnum extends FilteredTermEnum { public class RegexTermEnum extends FilteredTermEnum {
private String field = ""; private String field = "";
private String pre = ""; private String pre = "";
boolean endEnum = false; private boolean endEnum = false;
private Pattern pattern; private RegexCapabilities regexImpl;
public RegexTermEnum(IndexReader reader, Term term) throws IOException { public RegexTermEnum(IndexReader reader, Term term, RegexCapabilities regexImpl) throws IOException {
super(); super();
field = term.field(); field = term.field();
String text = term.text(); String text = term.text();
this.regexImpl = regexImpl;
pattern = Pattern.compile(text); regexImpl.compile(text);
// Find the first regex character position, to find the pre = regexImpl.prefix();
// maximum prefix to use for term enumeration if (pre == null) pre = "";
int index = 0;
while (index < text.length()) {
char c = text.charAt(index);
if (!Character.isLetterOrDigit(c)) break;
index++;
}
pre = text.substring(0, index);
setEnum(reader.terms(new Term(term.field(), pre))); setEnum(reader.terms(new Term(term.field(), pre)));
} }
@ -40,7 +30,7 @@ public class RegexTermEnum extends FilteredTermEnum {
if (field == term.field()) { if (field == term.field()) {
String searchText = term.text(); String searchText = term.text();
if (searchText.startsWith(pre)) { if (searchText.startsWith(pre)) {
return pattern.matcher(searchText).matches(); return regexImpl.match(searchText);
} }
} }
endEnum = true; endEnum = true;

View File

@ -16,7 +16,8 @@ import java.io.IOException;
import java.util.Collection; import java.util.Collection;
import java.util.ArrayList; import java.util.ArrayList;
public class SpanRegexQuery extends SpanQuery { public class SpanRegexQuery extends SpanQuery implements RegexQueryCapable {
private RegexCapabilities regexImpl = new JavaUtilRegexCapabilities();
private Term term; private Term term;
public SpanRegexQuery(Term term) { public SpanRegexQuery(Term term) {
@ -26,10 +27,11 @@ public class SpanRegexQuery extends SpanQuery {
public Term getTerm() { return term; } public Term getTerm() { return term; }
public Query rewrite(IndexReader reader) throws IOException { public Query rewrite(IndexReader reader) throws IOException {
Query orig = new RegexQuery(term).rewrite(reader); RegexQuery orig = new RegexQuery(term);
orig.setRegexImplementation(regexImpl);
// RegexQuery (via MultiTermQuery).rewrite always returns a BooleanQuery // RegexQuery (via MultiTermQuery).rewrite always returns a BooleanQuery
BooleanQuery bq = (BooleanQuery) orig; BooleanQuery bq = (BooleanQuery) orig.rewrite(reader);
BooleanClause[] clauses = bq.getClauses(); BooleanClause[] clauses = bq.getClauses();
SpanQuery[] sqs = new SpanQuery[clauses.length]; SpanQuery[] sqs = new SpanQuery[clauses.length];
@ -63,15 +65,25 @@ public class SpanRegexQuery extends SpanQuery {
return terms; return terms;
} }
/* generated by IntelliJ IDEA */
public boolean equals(Object o) { public boolean equals(Object o) {
if (this == o) return true; if (this == o) return true;
if (!(o instanceof SpanRegexQuery)) return false; if (o == null || getClass() != o.getClass()) return false;
final SpanRegexQuery that = (SpanRegexQuery) o; final SpanRegexQuery that = (SpanRegexQuery) o;
return term.equals(that.term) && getBoost() == that.getBoost();
if (!regexImpl.equals(that.regexImpl)) return false;
if (!term.equals(that.term)) return false;
return true;
} }
/* generated by IntelliJ IDEA */
public int hashCode() { public int hashCode() {
return term.hashCode() ^ Float.floatToRawIntBits(getBoost()) ^ 0x4BCEF3A9; int result;
result = regexImpl.hashCode();
result = 29 * result + term.hashCode();
return result;
} }
public String toString(String field) { public String toString(String field) {
@ -82,4 +94,12 @@ public class SpanRegexQuery extends SpanQuery {
buffer.append(ToStringUtils.boost(getBoost())); buffer.append(ToStringUtils.boost(getBoost()));
return buffer.toString(); return buffer.toString();
} }
public void setRegexImplementation(RegexCapabilities impl) {
this.regexImpl = impl;
}
public RegexCapabilities getRegexImplementation() {
return regexImpl;
}
} }

View File

@ -0,0 +1,11 @@
package org.apache.regexp;
/**
* This class exists as a gateway to access useful Jakarta Regexp package protected data.
*/
public class RegexpTunnel {
public static char[] getPrefix(RE regexp) {
REProgram program = regexp.getProgram();
return program.prefix;
}
}

View File

@ -24,7 +24,6 @@ import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanQuery;
@ -59,7 +58,7 @@ public class TestRegexQuery extends TestCase {
private Term newTerm(String value) { return new Term(FN, value); } private Term newTerm(String value) { return new Term(FN, value); }
private int regexQueryNrHits(String regex) throws Exception { private int regexQueryNrHits(String regex) throws Exception {
Query query = new RegexQuery( newTerm(regex)); RegexQuery query = new RegexQuery( newTerm(regex));
return searcher.search(query).length(); return searcher.search(query).length();
} }
@ -71,30 +70,32 @@ public class TestRegexQuery extends TestCase {
} }
public void testRegex1() throws Exception { public void testRegex1() throws Exception {
assertEquals(1, regexQueryNrHits("q.[aeiou]c.*")); assertEquals(1, regexQueryNrHits("^q.[aeiou]c.*$"));
} }
public void testRegex2() throws Exception { public void testRegex2() throws Exception {
assertEquals(0, regexQueryNrHits(".[aeiou]c.*")); assertEquals(0, regexQueryNrHits("^.[aeiou]c.*$"));
} }
public void testRegex3() throws Exception { public void testRegex3() throws Exception {
assertEquals(0, regexQueryNrHits("q.[aeiou]c")); assertEquals(0, regexQueryNrHits("^q.[aeiou]c$"));
} }
public void testSpanRegex1() throws Exception { public void testSpanRegex1() throws Exception {
assertEquals(1, spanRegexQueryNrHits("q.[aeiou]c.*", "dog", 6, true)); assertEquals(1, spanRegexQueryNrHits("^q.[aeiou]c.*$", "dog", 6, true));
} }
public void testSpanRegex2() throws Exception { public void testSpanRegex2() throws Exception {
assertEquals(0, spanRegexQueryNrHits("q.[aeiou]c.*", "dog", 5, true)); assertEquals(0, spanRegexQueryNrHits("^q.[aeiou]c.*$", "dog", 5, true));
}
public void testEquals() throws Exception {
RegexQuery query1 = new RegexQuery( newTerm("foo.*"));
query1.setRegexImplementation(new JakartaRegexpCapabilities());
RegexQuery query2 = new RegexQuery( newTerm("foo.*"));
assertFalse(query1.equals(query2));
} }
// public void testPrefix() throws Exception {
// This test currently fails because RegexTermEnum picks "r" as the prefix
// but the following "?" makes the "r" optional and should be a hit for the
// document matching "over".
// assertEquals(1, regexQueryNrHits("r?over"));
// }
} }

View File

@ -9,7 +9,6 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Hits; import org.apache.lucene.search.Hits;
import org.apache.lucene.search.QueryUtils;
import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanQuery;
@ -30,8 +29,5 @@ public class TestSpanRegexQuery extends TestCase {
SpanNearQuery query = new SpanNearQuery(new SpanQuery[] {srq, stq}, 6, true); SpanNearQuery query = new SpanNearQuery(new SpanQuery[] {srq, stq}, 6, true);
Hits hits = searcher.search(query); Hits hits = searcher.search(query);
assertEquals(1, hits.length()); assertEquals(1, hits.length());
QueryUtils.check(srq);
QueryUtils.checkUnequal(srq,stq);
QueryUtils.checkUnequal(srq,query);
} }
} }

View File

@ -1,26 +0,0 @@
package org.apache.lucene.search.regex;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.FilteredTermEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexReader;
import java.io.IOException;
/**
 * Multi-term query whose term text is a regular expression, evaluated by
 * {@link RegexTermEnum}.
 */
public class RegexQuery extends MultiTermQuery {
  /**
   * @param term field to search, with the regular expression as the term text
   */
  public RegexQuery(Term term) {
    super(term);
  }

  /**
   * Creates the term enumeration for this query's field and expression.
   *
   * @param reader index to enumerate terms from
   * @return a {@link RegexTermEnum} over a copy of the query term
   * @throws IOException propagated from the {@link RegexTermEnum} constructor
   */
  protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
    final String fieldName = getTerm().field();
    final String expression = getTerm().text();
    return new RegexTermEnum(reader, new Term(fieldName, expression));
  }

  /**
   * Equal only to another <code>RegexQuery</code> that the superclass also
   * considers equal.
   */
  public boolean equals(Object o) {
    return o instanceof RegexQuery && super.equals(o);
  }
}