mirror of https://github.com/apache/lucene.git
Remove (Span)RegexQuery from core. Add a completely refactored version to contrib/regex that allows pluggable regex implementations. contrib/regex is still a work in progress, and documentation is forthcoming.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@359526 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
aa07efc076
commit
396229f18d
|
@ -0,0 +1,19 @@
|
||||||
|
<?xml version="1.0"?>
|
||||||
|
|
||||||
|
<project name="regex" default="default">

  <description>
    Regular expression query
  </description>

  <!-- Third-party regex engine jars picked up from this module's lib directory:
       any jar whose name contains "-oro-" or "-regexp-". -->
  <path id="additional.dependencies">
    <fileset dir="lib" includes="*-oro-*.jar,*-regexp-*.jar"/>
  </path>

  <!-- Flatten the path above into the project.classpath property using
       unix-style separators. Presumably read by the imported
       contrib-build.xml; confirm against that file. -->
  <pathconvert property="project.classpath" targetos="unix" refid="additional.dependencies"/>

  <!-- No targets are declared here, so the "default" target named on the
       project element is presumably supplied by this shared contrib build. -->
  <import file="../contrib-build.xml"/>
</project>
|
|
@ -0,0 +1,2 @@
|
||||||
|
AnyObjectId[5d70c357a1e6c4c702af313c94aaf3168d300dcf] was removed in git history.
|
||||||
|
Apache SVN contains full history.
|
|
@ -0,0 +1,21 @@
|
||||||
|
package org.apache.lucene.search.regex;
|
||||||
|
|
||||||
|
import org.apache.regexp.RE;
|
||||||
|
import org.apache.regexp.RegexpTunnel;
|
||||||
|
|
||||||
|
public class JakartaRegexpCapabilities implements RegexCapabilities {
|
||||||
|
private RE regexp;
|
||||||
|
|
||||||
|
public void compile(String pattern) {
|
||||||
|
regexp = new RE(pattern);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean match(String string) {
|
||||||
|
return regexp.match(string);
|
||||||
|
}
|
||||||
|
|
||||||
|
public String prefix() {
|
||||||
|
char[] prefix = RegexpTunnel.getPrefix(regexp);
|
||||||
|
return prefix == null ? null : new String(prefix);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,19 @@
|
||||||
|
package org.apache.lucene.search.regex;
|
||||||
|
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
public class JavaUtilRegexCapabilities implements RegexCapabilities {
|
||||||
|
private Pattern pattern;
|
||||||
|
|
||||||
|
public void compile(String pattern) {
|
||||||
|
this.pattern = Pattern.compile(pattern);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean match(String string) {
|
||||||
|
return pattern.matcher(string).lookingAt();
|
||||||
|
}
|
||||||
|
|
||||||
|
public String prefix() {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,7 @@
|
||||||
|
package org.apache.lucene.search.regex;
|
||||||
|
|
||||||
|
/**
 * Contract a regular expression engine must fulfil to be plugged into the
 * regex queries of this package.
 *
 * <p>Usage is stateful: {@link #compile(String)} is called with the
 * expression, after which {@link #match(String)} and {@link #prefix()} refer
 * to that compiled expression.
 */
public interface RegexCapabilities {

  /**
   * Prepares the given expression for subsequent {@link #match(String)} and
   * {@link #prefix()} calls.
   *
   * @param pattern the regular expression text
   */
  public void compile(String pattern);

  /**
   * Tests a candidate string against the compiled expression.
   *
   * @param string the text to test
   * @return true if the implementation considers the text a match
   */
  public boolean match(String string);

  /**
   * Reports a literal prefix for the compiled expression, if the
   * implementation can determine one.
   *
   * @return the prefix, or null when none is available
   */
  public String prefix();
}
|
|
@ -0,0 +1,47 @@
|
||||||
|
package org.apache.lucene.search.regex;
|
||||||
|
|
||||||
|
import org.apache.lucene.search.MultiTermQuery;
|
||||||
|
import org.apache.lucene.search.FilteredTermEnum;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {
|
||||||
|
private RegexCapabilities regexImpl = new JavaUtilRegexCapabilities();
|
||||||
|
|
||||||
|
public RegexQuery(Term term) {
|
||||||
|
super(term);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setRegexImplementation(RegexCapabilities impl) {
|
||||||
|
this.regexImpl = impl;
|
||||||
|
}
|
||||||
|
|
||||||
|
public RegexCapabilities getRegexImplementation() {
|
||||||
|
return regexImpl;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
|
||||||
|
Term term = new Term(getTerm().field(), getTerm().text());
|
||||||
|
return new RegexTermEnum(reader, term, regexImpl);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* generated by IntelliJ IDEA */
|
||||||
|
public boolean equals(Object o) {
|
||||||
|
if (this == o) return true;
|
||||||
|
if (o == null || getClass() != o.getClass()) return false;
|
||||||
|
if (!super.equals(o)) return false;
|
||||||
|
|
||||||
|
final RegexQuery that = (RegexQuery) o;
|
||||||
|
|
||||||
|
return regexImpl.equals(that.regexImpl);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* generated by IntelliJ IDEA */
|
||||||
|
public int hashCode() {
|
||||||
|
int result = super.hashCode();
|
||||||
|
result = 29 * result + regexImpl.hashCode();
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,6 @@
|
||||||
|
package org.apache.lucene.search.regex;
|
||||||
|
|
||||||
|
public interface RegexQueryCapable {
|
||||||
|
void setRegexImplementation(RegexCapabilities impl);
|
||||||
|
RegexCapabilities getRegexImplementation();
|
||||||
|
}
|
|
@ -4,34 +4,24 @@ import org.apache.lucene.search.FilteredTermEnum;
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
public class RegexTermEnum extends FilteredTermEnum {
|
public class RegexTermEnum extends FilteredTermEnum {
|
||||||
private String field = "";
|
private String field = "";
|
||||||
private String pre = "";
|
private String pre = "";
|
||||||
boolean endEnum = false;
|
private boolean endEnum = false;
|
||||||
private Pattern pattern;
|
private RegexCapabilities regexImpl;
|
||||||
|
|
||||||
public RegexTermEnum(IndexReader reader, Term term) throws IOException {
|
public RegexTermEnum(IndexReader reader, Term term, RegexCapabilities regexImpl) throws IOException {
|
||||||
super();
|
super();
|
||||||
field = term.field();
|
field = term.field();
|
||||||
String text = term.text();
|
String text = term.text();
|
||||||
|
this.regexImpl = regexImpl;
|
||||||
|
|
||||||
pattern = Pattern.compile(text);
|
regexImpl.compile(text);
|
||||||
|
|
||||||
// Find the first regex character position, to find the
|
pre = regexImpl.prefix();
|
||||||
// maximum prefix to use for term enumeration
|
if (pre == null) pre = "";
|
||||||
int index = 0;
|
|
||||||
while (index < text.length()) {
|
|
||||||
char c = text.charAt(index);
|
|
||||||
|
|
||||||
if (!Character.isLetterOrDigit(c)) break;
|
|
||||||
|
|
||||||
index++;
|
|
||||||
}
|
|
||||||
|
|
||||||
pre = text.substring(0, index);
|
|
||||||
|
|
||||||
setEnum(reader.terms(new Term(term.field(), pre)));
|
setEnum(reader.terms(new Term(term.field(), pre)));
|
||||||
}
|
}
|
||||||
|
@ -40,7 +30,7 @@ public class RegexTermEnum extends FilteredTermEnum {
|
||||||
if (field == term.field()) {
|
if (field == term.field()) {
|
||||||
String searchText = term.text();
|
String searchText = term.text();
|
||||||
if (searchText.startsWith(pre)) {
|
if (searchText.startsWith(pre)) {
|
||||||
return pattern.matcher(searchText).matches();
|
return regexImpl.match(searchText);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
endEnum = true;
|
endEnum = true;
|
|
@ -16,7 +16,8 @@ import java.io.IOException;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
|
||||||
public class SpanRegexQuery extends SpanQuery {
|
public class SpanRegexQuery extends SpanQuery implements RegexQueryCapable {
|
||||||
|
private RegexCapabilities regexImpl = new JavaUtilRegexCapabilities();
|
||||||
private Term term;
|
private Term term;
|
||||||
|
|
||||||
public SpanRegexQuery(Term term) {
|
public SpanRegexQuery(Term term) {
|
||||||
|
@ -26,10 +27,11 @@ public class SpanRegexQuery extends SpanQuery {
|
||||||
public Term getTerm() { return term; }
|
public Term getTerm() { return term; }
|
||||||
|
|
||||||
public Query rewrite(IndexReader reader) throws IOException {
|
public Query rewrite(IndexReader reader) throws IOException {
|
||||||
Query orig = new RegexQuery(term).rewrite(reader);
|
RegexQuery orig = new RegexQuery(term);
|
||||||
|
orig.setRegexImplementation(regexImpl);
|
||||||
|
|
||||||
// RegexQuery (via MultiTermQuery).rewrite always returns a BooleanQuery
|
// RegexQuery (via MultiTermQuery).rewrite always returns a BooleanQuery
|
||||||
BooleanQuery bq = (BooleanQuery) orig;
|
BooleanQuery bq = (BooleanQuery) orig.rewrite(reader);
|
||||||
|
|
||||||
BooleanClause[] clauses = bq.getClauses();
|
BooleanClause[] clauses = bq.getClauses();
|
||||||
SpanQuery[] sqs = new SpanQuery[clauses.length];
|
SpanQuery[] sqs = new SpanQuery[clauses.length];
|
||||||
|
@ -63,15 +65,25 @@ public class SpanRegexQuery extends SpanQuery {
|
||||||
return terms;
|
return terms;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* generated by IntelliJ IDEA */
|
||||||
public boolean equals(Object o) {
|
public boolean equals(Object o) {
|
||||||
if (this == o) return true;
|
if (this == o) return true;
|
||||||
if (!(o instanceof SpanRegexQuery)) return false;
|
if (o == null || getClass() != o.getClass()) return false;
|
||||||
|
|
||||||
final SpanRegexQuery that = (SpanRegexQuery) o;
|
final SpanRegexQuery that = (SpanRegexQuery) o;
|
||||||
return term.equals(that.term) && getBoost() == that.getBoost();
|
|
||||||
|
if (!regexImpl.equals(that.regexImpl)) return false;
|
||||||
|
if (!term.equals(that.term)) return false;
|
||||||
|
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* generated by IntelliJ IDEA */
|
||||||
public int hashCode() {
|
public int hashCode() {
|
||||||
return term.hashCode() ^ Float.floatToRawIntBits(getBoost()) ^ 0x4BCEF3A9;
|
int result;
|
||||||
|
result = regexImpl.hashCode();
|
||||||
|
result = 29 * result + term.hashCode();
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String toString(String field) {
|
public String toString(String field) {
|
||||||
|
@ -82,4 +94,12 @@ public class SpanRegexQuery extends SpanQuery {
|
||||||
buffer.append(ToStringUtils.boost(getBoost()));
|
buffer.append(ToStringUtils.boost(getBoost()));
|
||||||
return buffer.toString();
|
return buffer.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void setRegexImplementation(RegexCapabilities impl) {
|
||||||
|
this.regexImpl = impl;
|
||||||
|
}
|
||||||
|
|
||||||
|
public RegexCapabilities getRegexImplementation() {
|
||||||
|
return regexImpl;
|
||||||
|
}
|
||||||
}
|
}
|
|
@ -0,0 +1,11 @@
|
||||||
|
package org.apache.regexp;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class exists as a gateway to access useful Jakarta Regexp package protected data.
|
||||||
|
*/
|
||||||
|
public class RegexpTunnel {
|
||||||
|
public static char[] getPrefix(RE regexp) {
|
||||||
|
REProgram program = regexp.getProgram();
|
||||||
|
return program.prefix;
|
||||||
|
}
|
||||||
|
}
|
|
@ -24,7 +24,6 @@ import org.apache.lucene.analysis.SimpleAnalyzer;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
import org.apache.lucene.search.IndexSearcher;
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
import org.apache.lucene.search.Query;
|
|
||||||
|
|
||||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||||
import org.apache.lucene.search.spans.SpanQuery;
|
import org.apache.lucene.search.spans.SpanQuery;
|
||||||
|
@ -59,7 +58,7 @@ public class TestRegexQuery extends TestCase {
|
||||||
private Term newTerm(String value) { return new Term(FN, value); }
|
private Term newTerm(String value) { return new Term(FN, value); }
|
||||||
|
|
||||||
private int regexQueryNrHits(String regex) throws Exception {
|
private int regexQueryNrHits(String regex) throws Exception {
|
||||||
Query query = new RegexQuery( newTerm(regex));
|
RegexQuery query = new RegexQuery( newTerm(regex));
|
||||||
return searcher.search(query).length();
|
return searcher.search(query).length();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -71,30 +70,32 @@ public class TestRegexQuery extends TestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testRegex1() throws Exception {
|
public void testRegex1() throws Exception {
|
||||||
assertEquals(1, regexQueryNrHits("q.[aeiou]c.*"));
|
assertEquals(1, regexQueryNrHits("^q.[aeiou]c.*$"));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testRegex2() throws Exception {
|
public void testRegex2() throws Exception {
|
||||||
assertEquals(0, regexQueryNrHits(".[aeiou]c.*"));
|
assertEquals(0, regexQueryNrHits("^.[aeiou]c.*$"));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testRegex3() throws Exception {
|
public void testRegex3() throws Exception {
|
||||||
assertEquals(0, regexQueryNrHits("q.[aeiou]c"));
|
assertEquals(0, regexQueryNrHits("^q.[aeiou]c$"));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testSpanRegex1() throws Exception {
|
public void testSpanRegex1() throws Exception {
|
||||||
assertEquals(1, spanRegexQueryNrHits("q.[aeiou]c.*", "dog", 6, true));
|
assertEquals(1, spanRegexQueryNrHits("^q.[aeiou]c.*$", "dog", 6, true));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testSpanRegex2() throws Exception {
|
public void testSpanRegex2() throws Exception {
|
||||||
assertEquals(0, spanRegexQueryNrHits("q.[aeiou]c.*", "dog", 5, true));
|
assertEquals(0, spanRegexQueryNrHits("^q.[aeiou]c.*$", "dog", 5, true));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testEquals() throws Exception {
|
||||||
|
RegexQuery query1 = new RegexQuery( newTerm("foo.*"));
|
||||||
|
query1.setRegexImplementation(new JakartaRegexpCapabilities());
|
||||||
|
|
||||||
|
RegexQuery query2 = new RegexQuery( newTerm("foo.*"));
|
||||||
|
assertFalse(query1.equals(query2));
|
||||||
}
|
}
|
||||||
|
|
||||||
// public void testPrefix() throws Exception {
|
|
||||||
// This test currently fails because RegexTermEnum picks "r" as the prefix
|
|
||||||
// but the following "?" makes the "r" optional and should be a hit for the
|
|
||||||
// document matching "over".
|
|
||||||
// assertEquals(1, regexQueryNrHits("r?over"));
|
|
||||||
// }
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,7 +9,6 @@ import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
import org.apache.lucene.search.IndexSearcher;
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
import org.apache.lucene.search.Hits;
|
import org.apache.lucene.search.Hits;
|
||||||
import org.apache.lucene.search.QueryUtils;
|
|
||||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||||
import org.apache.lucene.search.spans.SpanQuery;
|
import org.apache.lucene.search.spans.SpanQuery;
|
||||||
|
@ -30,8 +29,5 @@ public class TestSpanRegexQuery extends TestCase {
|
||||||
SpanNearQuery query = new SpanNearQuery(new SpanQuery[] {srq, stq}, 6, true);
|
SpanNearQuery query = new SpanNearQuery(new SpanQuery[] {srq, stq}, 6, true);
|
||||||
Hits hits = searcher.search(query);
|
Hits hits = searcher.search(query);
|
||||||
assertEquals(1, hits.length());
|
assertEquals(1, hits.length());
|
||||||
QueryUtils.check(srq);
|
|
||||||
QueryUtils.checkUnequal(srq,stq);
|
|
||||||
QueryUtils.checkUnequal(srq,query);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -1,26 +0,0 @@
|
||||||
package org.apache.lucene.search.regex;
|
|
||||||
|
|
||||||
import org.apache.lucene.search.MultiTermQuery;
|
|
||||||
import org.apache.lucene.search.FilteredTermEnum;
|
|
||||||
import org.apache.lucene.index.Term;
|
|
||||||
import org.apache.lucene.index.IndexReader;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
public class RegexQuery extends MultiTermQuery {
|
|
||||||
public RegexQuery(Term term) {
|
|
||||||
super(term);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
|
|
||||||
Term term = new Term(getTerm().field(), getTerm().text());
|
|
||||||
return new RegexTermEnum(reader, term);
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean equals(Object o) {
|
|
||||||
if (o instanceof RegexQuery)
|
|
||||||
return super.equals(o);
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
Loading…
Reference in New Issue