mirror of https://github.com/apache/lucene.git
Remove (Span)RegexQuery from core. Add completely refactored version to contrib/regex allowing pluggable regex implementations. contrib/regex is still a work in progress, and documentation is forthcoming
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@359526 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
aa07efc076
commit
396229f18d
|
@ -0,0 +1,19 @@
|
|||
<?xml version="1.0"?>

<!-- Ant build for the "regex" contrib module. -->
<project name="regex" default="default">

  <description>
    Regular expression query
  </description>

  <!-- Third-party regex jars picked up from lib/ by file-name pattern. -->
  <path id="additional.dependencies">
    <fileset dir="lib" includes="*-oro-*.jar,*-regexp-*.jar"/>
  </path>

  <!--
    Flatten the path above into the project.classpath property using
    unix-style separators. Defined before the import below; presumably
    ../contrib-build.xml reads this property (confirm there).
  -->
  <pathconvert refid="additional.dependencies" property="project.classpath" targetos="unix"/>

  <import file="../contrib-build.xml"/>
</project>
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[5d70c357a1e6c4c702af313c94aaf3168d300dcf] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,21 @@
|
|||
package org.apache.lucene.search.regex;
|
||||
|
||||
import org.apache.regexp.RE;
|
||||
import org.apache.regexp.RegexpTunnel;
|
||||
|
||||
public class JakartaRegexpCapabilities implements RegexCapabilities {
|
||||
private RE regexp;
|
||||
|
||||
public void compile(String pattern) {
|
||||
regexp = new RE(pattern);
|
||||
}
|
||||
|
||||
public boolean match(String string) {
|
||||
return regexp.match(string);
|
||||
}
|
||||
|
||||
public String prefix() {
|
||||
char[] prefix = RegexpTunnel.getPrefix(regexp);
|
||||
return prefix == null ? null : new String(prefix);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,19 @@
|
|||
package org.apache.lucene.search.regex;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class JavaUtilRegexCapabilities implements RegexCapabilities {
|
||||
private Pattern pattern;
|
||||
|
||||
public void compile(String pattern) {
|
||||
this.pattern = Pattern.compile(pattern);
|
||||
}
|
||||
|
||||
public boolean match(String string) {
|
||||
return pattern.matcher(string).lookingAt();
|
||||
}
|
||||
|
||||
public String prefix() {
|
||||
return null;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,7 @@
|
|||
package org.apache.lucene.search.regex;
|
||||
|
||||
/**
 * Pluggable regular expression engine used by the regex queries.
 *
 * <p>An implementation is handed one pattern through
 * {@link #compile(String)} and is then asked to test candidate strings with
 * {@link #match(String)}. Whether a match must cover the whole candidate or
 * only part of it is decided by the implementation.
 */
public interface RegexCapabilities {
  /**
   * Prepares <code>pattern</code> for subsequent {@link #match(String)} and
   * {@link #prefix()} calls.
   *
   * @param pattern regular expression in the syntax of the implementation
   */
  void compile(String pattern);

  /**
   * Tests a candidate against the compiled pattern.
   *
   * @param string candidate text
   * @return true if the implementation considers <code>string</code> a match
   */
  boolean match(String string);

  /**
   * Reports a literal prefix of the compiled pattern, if the implementation
   * can determine one.
   *
   * @return the prefix, or null when none is known
   */
  String prefix();
}
|
|
@ -0,0 +1,47 @@
|
|||
package org.apache.lucene.search.regex;
|
||||
|
||||
import org.apache.lucene.search.MultiTermQuery;
|
||||
import org.apache.lucene.search.FilteredTermEnum;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {
|
||||
private RegexCapabilities regexImpl = new JavaUtilRegexCapabilities();
|
||||
|
||||
public RegexQuery(Term term) {
|
||||
super(term);
|
||||
}
|
||||
|
||||
public void setRegexImplementation(RegexCapabilities impl) {
|
||||
this.regexImpl = impl;
|
||||
}
|
||||
|
||||
public RegexCapabilities getRegexImplementation() {
|
||||
return regexImpl;
|
||||
}
|
||||
|
||||
protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
|
||||
Term term = new Term(getTerm().field(), getTerm().text());
|
||||
return new RegexTermEnum(reader, term, regexImpl);
|
||||
}
|
||||
|
||||
/* generated by IntelliJ IDEA */
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
if (!super.equals(o)) return false;
|
||||
|
||||
final RegexQuery that = (RegexQuery) o;
|
||||
|
||||
return regexImpl.equals(that.regexImpl);
|
||||
}
|
||||
|
||||
/* generated by IntelliJ IDEA */
|
||||
public int hashCode() {
|
||||
int result = super.hashCode();
|
||||
result = 29 * result + regexImpl.hashCode();
|
||||
return result;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,6 @@
|
|||
package org.apache.lucene.search.regex;
|
||||
|
||||
public interface RegexQueryCapable {
|
||||
void setRegexImplementation(RegexCapabilities impl);
|
||||
RegexCapabilities getRegexImplementation();
|
||||
}
|
|
@ -4,34 +4,24 @@ import org.apache.lucene.search.FilteredTermEnum;
|
|||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
import java.io.IOException;
|
||||
|
||||
public class RegexTermEnum extends FilteredTermEnum {
|
||||
private String field = "";
|
||||
private String pre = "";
|
||||
boolean endEnum = false;
|
||||
private Pattern pattern;
|
||||
private boolean endEnum = false;
|
||||
private RegexCapabilities regexImpl;
|
||||
|
||||
public RegexTermEnum(IndexReader reader, Term term) throws IOException {
|
||||
public RegexTermEnum(IndexReader reader, Term term, RegexCapabilities regexImpl) throws IOException {
|
||||
super();
|
||||
field = term.field();
|
||||
String text = term.text();
|
||||
this.regexImpl = regexImpl;
|
||||
|
||||
pattern = Pattern.compile(text);
|
||||
regexImpl.compile(text);
|
||||
|
||||
// Find the first regex character position, to find the
|
||||
// maximum prefix to use for term enumeration
|
||||
int index = 0;
|
||||
while (index < text.length()) {
|
||||
char c = text.charAt(index);
|
||||
|
||||
if (!Character.isLetterOrDigit(c)) break;
|
||||
|
||||
index++;
|
||||
}
|
||||
|
||||
pre = text.substring(0, index);
|
||||
pre = regexImpl.prefix();
|
||||
if (pre == null) pre = "";
|
||||
|
||||
setEnum(reader.terms(new Term(term.field(), pre)));
|
||||
}
|
||||
|
@ -40,7 +30,7 @@ public class RegexTermEnum extends FilteredTermEnum {
|
|||
if (field == term.field()) {
|
||||
String searchText = term.text();
|
||||
if (searchText.startsWith(pre)) {
|
||||
return pattern.matcher(searchText).matches();
|
||||
return regexImpl.match(searchText);
|
||||
}
|
||||
}
|
||||
endEnum = true;
|
|
@ -16,7 +16,8 @@ import java.io.IOException;
|
|||
import java.util.Collection;
|
||||
import java.util.ArrayList;
|
||||
|
||||
public class SpanRegexQuery extends SpanQuery {
|
||||
public class SpanRegexQuery extends SpanQuery implements RegexQueryCapable {
|
||||
private RegexCapabilities regexImpl = new JavaUtilRegexCapabilities();
|
||||
private Term term;
|
||||
|
||||
public SpanRegexQuery(Term term) {
|
||||
|
@ -26,10 +27,11 @@ public class SpanRegexQuery extends SpanQuery {
|
|||
public Term getTerm() { return term; }
|
||||
|
||||
public Query rewrite(IndexReader reader) throws IOException {
|
||||
Query orig = new RegexQuery(term).rewrite(reader);
|
||||
RegexQuery orig = new RegexQuery(term);
|
||||
orig.setRegexImplementation(regexImpl);
|
||||
|
||||
// RegexQuery (via MultiTermQuery).rewrite always returns a BooleanQuery
|
||||
BooleanQuery bq = (BooleanQuery) orig;
|
||||
BooleanQuery bq = (BooleanQuery) orig.rewrite(reader);
|
||||
|
||||
BooleanClause[] clauses = bq.getClauses();
|
||||
SpanQuery[] sqs = new SpanQuery[clauses.length];
|
||||
|
@ -63,15 +65,25 @@ public class SpanRegexQuery extends SpanQuery {
|
|||
return terms;
|
||||
}
|
||||
|
||||
/* generated by IntelliJ IDEA */
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (!(o instanceof SpanRegexQuery)) return false;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
|
||||
final SpanRegexQuery that = (SpanRegexQuery) o;
|
||||
return term.equals(that.term) && getBoost() == that.getBoost();
|
||||
|
||||
if (!regexImpl.equals(that.regexImpl)) return false;
|
||||
if (!term.equals(that.term)) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/* generated by IntelliJ IDEA */
|
||||
public int hashCode() {
|
||||
return term.hashCode() ^ Float.floatToRawIntBits(getBoost()) ^ 0x4BCEF3A9;
|
||||
int result;
|
||||
result = regexImpl.hashCode();
|
||||
result = 29 * result + term.hashCode();
|
||||
return result;
|
||||
}
|
||||
|
||||
public String toString(String field) {
|
||||
|
@ -82,4 +94,12 @@ public class SpanRegexQuery extends SpanQuery {
|
|||
buffer.append(ToStringUtils.boost(getBoost()));
|
||||
return buffer.toString();
|
||||
}
|
||||
|
||||
public void setRegexImplementation(RegexCapabilities impl) {
|
||||
this.regexImpl = impl;
|
||||
}
|
||||
|
||||
public RegexCapabilities getRegexImplementation() {
|
||||
return regexImpl;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,11 @@
|
|||
package org.apache.regexp;
|
||||
|
||||
/**
|
||||
* This class exists as a gateway to access useful Jakarta Regexp package protected data.
|
||||
*/
|
||||
public class RegexpTunnel {
|
||||
public static char[] getPrefix(RE regexp) {
|
||||
REProgram program = regexp.getProgram();
|
||||
return program.prefix;
|
||||
}
|
||||
}
|
|
@ -24,7 +24,6 @@ import org.apache.lucene.analysis.SimpleAnalyzer;
|
|||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
|
@ -59,7 +58,7 @@ public class TestRegexQuery extends TestCase {
|
|||
private Term newTerm(String value) { return new Term(FN, value); }
|
||||
|
||||
private int regexQueryNrHits(String regex) throws Exception {
|
||||
Query query = new RegexQuery( newTerm(regex));
|
||||
RegexQuery query = new RegexQuery( newTerm(regex));
|
||||
return searcher.search(query).length();
|
||||
}
|
||||
|
||||
|
@ -71,30 +70,32 @@ public class TestRegexQuery extends TestCase {
|
|||
}
|
||||
|
||||
public void testRegex1() throws Exception {
|
||||
assertEquals(1, regexQueryNrHits("q.[aeiou]c.*"));
|
||||
assertEquals(1, regexQueryNrHits("^q.[aeiou]c.*$"));
|
||||
}
|
||||
|
||||
public void testRegex2() throws Exception {
|
||||
assertEquals(0, regexQueryNrHits(".[aeiou]c.*"));
|
||||
assertEquals(0, regexQueryNrHits("^.[aeiou]c.*$"));
|
||||
}
|
||||
|
||||
public void testRegex3() throws Exception {
|
||||
assertEquals(0, regexQueryNrHits("q.[aeiou]c"));
|
||||
assertEquals(0, regexQueryNrHits("^q.[aeiou]c$"));
|
||||
}
|
||||
|
||||
public void testSpanRegex1() throws Exception {
|
||||
assertEquals(1, spanRegexQueryNrHits("q.[aeiou]c.*", "dog", 6, true));
|
||||
assertEquals(1, spanRegexQueryNrHits("^q.[aeiou]c.*$", "dog", 6, true));
|
||||
}
|
||||
|
||||
public void testSpanRegex2() throws Exception {
|
||||
assertEquals(0, spanRegexQueryNrHits("q.[aeiou]c.*", "dog", 5, true));
|
||||
assertEquals(0, spanRegexQueryNrHits("^q.[aeiou]c.*$", "dog", 5, true));
|
||||
}
|
||||
|
||||
public void testEquals() throws Exception {
|
||||
RegexQuery query1 = new RegexQuery( newTerm("foo.*"));
|
||||
query1.setRegexImplementation(new JakartaRegexpCapabilities());
|
||||
|
||||
RegexQuery query2 = new RegexQuery( newTerm("foo.*"));
|
||||
assertFalse(query1.equals(query2));
|
||||
}
|
||||
|
||||
// public void testPrefix() throws Exception {
|
||||
// This test currently fails because RegexTermEnum picks "r" as the prefix
|
||||
// but the following "?" makes the "r" optional and should be a hit for the
|
||||
// document matching "over".
|
||||
// assertEquals(1, regexQueryNrHits("r?over"));
|
||||
// }
|
||||
}
|
||||
|
|
@ -9,7 +9,6 @@ import org.apache.lucene.document.Document;
|
|||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Hits;
|
||||
import org.apache.lucene.search.QueryUtils;
|
||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
|
@ -30,8 +29,5 @@ public class TestSpanRegexQuery extends TestCase {
|
|||
SpanNearQuery query = new SpanNearQuery(new SpanQuery[] {srq, stq}, 6, true);
|
||||
Hits hits = searcher.search(query);
|
||||
assertEquals(1, hits.length());
|
||||
QueryUtils.check(srq);
|
||||
QueryUtils.checkUnequal(srq,stq);
|
||||
QueryUtils.checkUnequal(srq,query);
|
||||
}
|
||||
}
|
|
@ -1,26 +0,0 @@
|
|||
package org.apache.lucene.search.regex;
|
||||
|
||||
import org.apache.lucene.search.MultiTermQuery;
|
||||
import org.apache.lucene.search.FilteredTermEnum;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class RegexQuery extends MultiTermQuery {
|
||||
public RegexQuery(Term term) {
|
||||
super(term);
|
||||
}
|
||||
|
||||
protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
|
||||
Term term = new Term(getTerm().field(), getTerm().text());
|
||||
return new RegexTermEnum(reader, term);
|
||||
}
|
||||
|
||||
public boolean equals(Object o) {
|
||||
if (o instanceof RegexQuery)
|
||||
return super.equals(o);
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue