mirror of https://github.com/apache/lucene.git
Add RegexQuery and SpanRegexQuery
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@332747 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a176028a7e
commit
6befc45334
|
@ -163,6 +163,11 @@ New features
|
||||||
highlighting entire documents or fields.
|
highlighting entire documents or fields.
|
||||||
(Erik Hatcher)
|
(Erik Hatcher)
|
||||||
|
|
||||||
|
23. Added regular expression queries, RegexQuery and SpanRegexQuery.
|
||||||
|
Note the same term enumeration caveats apply with these queries as
|
||||||
|
apply to WildcardQuery and other term expanding queries.
|
||||||
|
(Erik Hatcher)
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
|
||||||
1. Several methods and fields have been deprecated. The API documentation
|
1. Several methods and fields have been deprecated. The API documentation
|
||||||
|
|
|
@ -0,0 +1,26 @@
|
||||||
|
package org.apache.lucene.search.regex;
|
||||||
|
|
||||||
|
import org.apache.lucene.search.MultiTermQuery;
|
||||||
|
import org.apache.lucene.search.FilteredTermEnum;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
public class RegexQuery extends MultiTermQuery {
|
||||||
|
public RegexQuery(Term term) {
|
||||||
|
super(term);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
|
||||||
|
Term term = new Term(getTerm().field(), getTerm().text());
|
||||||
|
return new RegexTermEnum(reader, term);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean equals(Object o) {
|
||||||
|
if (o instanceof RegexQuery)
|
||||||
|
return super.equals(o);
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,65 @@
|
||||||
|
package org.apache.lucene.search.regex;
|
||||||
|
|
||||||
|
import org.apache.lucene.search.FilteredTermEnum;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
public class RegexTermEnum extends FilteredTermEnum {
|
||||||
|
private String field = "";
|
||||||
|
private String pre = "";
|
||||||
|
boolean endEnum = false;
|
||||||
|
private Pattern pattern;
|
||||||
|
|
||||||
|
public RegexTermEnum(IndexReader reader, Term term) throws IOException {
|
||||||
|
super();
|
||||||
|
field = term.field();
|
||||||
|
String text = term.text();
|
||||||
|
|
||||||
|
pattern = Pattern.compile(text);
|
||||||
|
|
||||||
|
// Find the first regex character position, to find the
|
||||||
|
// maximum prefix to use for term enumeration
|
||||||
|
int index = 0;
|
||||||
|
while (index < text.length()) {
|
||||||
|
char c = text.charAt(index);
|
||||||
|
|
||||||
|
// TODO: improve the logic here. There are other types of patterns
|
||||||
|
// that could break this, such as "\d*" and "\*abc"
|
||||||
|
if (c == '*' || c == '[' || c == '?' || c == '.') break;
|
||||||
|
|
||||||
|
index++;
|
||||||
|
}
|
||||||
|
|
||||||
|
pre = text.substring(0, index);
|
||||||
|
|
||||||
|
setEnum(reader.terms(new Term(term.field(), pre)));
|
||||||
|
}
|
||||||
|
|
||||||
|
protected final boolean termCompare(Term term) {
|
||||||
|
if (field == term.field()) {
|
||||||
|
String searchText = term.text();
|
||||||
|
if (searchText.startsWith(pre)) {
|
||||||
|
return pattern.matcher(searchText).matches();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
endEnum = true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public final float difference() {
|
||||||
|
// TODO: adjust difference based on distance of searchTerm.text() and term().text()
|
||||||
|
return 1.0f;
|
||||||
|
}
|
||||||
|
|
||||||
|
public final boolean endEnum() {
|
||||||
|
return endEnum;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void close() throws IOException {
|
||||||
|
super.close();
|
||||||
|
field = null;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,85 @@
|
||||||
|
package org.apache.lucene.search.regex;
|
||||||
|
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.search.Query;
|
||||||
|
import org.apache.lucene.search.BooleanQuery;
|
||||||
|
import org.apache.lucene.search.BooleanClause;
|
||||||
|
import org.apache.lucene.search.TermQuery;
|
||||||
|
import org.apache.lucene.search.spans.SpanOrQuery;
|
||||||
|
import org.apache.lucene.search.spans.SpanQuery;
|
||||||
|
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||||
|
import org.apache.lucene.search.spans.Spans;
|
||||||
|
import org.apache.lucene.util.ToStringUtils;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
|
||||||
|
public class SpanRegexQuery extends SpanQuery {
|
||||||
|
private Term term;
|
||||||
|
|
||||||
|
public SpanRegexQuery(Term term) {
|
||||||
|
this.term = term;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Query rewrite(IndexReader reader) throws IOException {
|
||||||
|
Query orig = new RegexQuery(term).rewrite(reader);
|
||||||
|
|
||||||
|
// RegexQuery (via MultiTermQuery).rewrite always returns a BooleanQuery
|
||||||
|
BooleanQuery bq = (BooleanQuery) orig;
|
||||||
|
|
||||||
|
BooleanClause[] clauses = bq.getClauses();
|
||||||
|
SpanQuery[] sqs = new SpanQuery[clauses.length];
|
||||||
|
for (int i = 0; i < clauses.length; i++) {
|
||||||
|
BooleanClause clause = clauses[i];
|
||||||
|
|
||||||
|
// Clauses from RegexQuery.rewrite are always TermQuery's
|
||||||
|
TermQuery tq = (TermQuery) clause.getQuery();
|
||||||
|
|
||||||
|
sqs[i] = new SpanTermQuery(tq.getTerm());
|
||||||
|
sqs[i].setBoost(tq.getBoost());
|
||||||
|
}
|
||||||
|
|
||||||
|
SpanOrQuery query = new SpanOrQuery(sqs);
|
||||||
|
query.setBoost(orig.getBoost());
|
||||||
|
|
||||||
|
return query;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Spans getSpans(IndexReader reader) throws IOException {
|
||||||
|
throw new UnsupportedOperationException("Query should have been rewritten");
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getField() {
|
||||||
|
return term.field();
|
||||||
|
}
|
||||||
|
|
||||||
|
public Collection getTerms() {
|
||||||
|
Collection terms = new ArrayList();
|
||||||
|
terms.add(term);
|
||||||
|
return terms;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean equals(Object o) {
|
||||||
|
if (this == o) return true;
|
||||||
|
if (o == null || getClass() != o.getClass()) return false;
|
||||||
|
|
||||||
|
final SpanRegexQuery that = (SpanRegexQuery) o;
|
||||||
|
|
||||||
|
return term.equals(that.term) && getBoost() == that.getBoost();
|
||||||
|
}
|
||||||
|
|
||||||
|
public int hashCode() {
|
||||||
|
return term.hashCode();
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString(String field) {
|
||||||
|
StringBuffer buffer = new StringBuffer();
|
||||||
|
buffer.append("spanRegexQuery(");
|
||||||
|
buffer.append(term);
|
||||||
|
buffer.append(")");
|
||||||
|
buffer.append(ToStringUtils.boost(getBoost()));
|
||||||
|
return buffer.toString();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,30 @@
|
||||||
|
package org.apache.lucene.search.regex;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
import org.apache.lucene.store.RAMDirectory;
|
||||||
|
import org.apache.lucene.index.IndexWriter;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.analysis.SimpleAnalyzer;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
|
import org.apache.lucene.search.Hits;
|
||||||
|
import org.apache.lucene.search.Query;
|
||||||
|
|
||||||
|
public class TestRegexQuery extends TestCase {
|
||||||
|
public void testRegex() throws Exception {
|
||||||
|
RAMDirectory directory = new RAMDirectory();
|
||||||
|
IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true);
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(new Field("field", "the quick brown fox jumps over the lazy dog", Field.Store.NO, Field.Index.TOKENIZED));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
writer.optimize();
|
||||||
|
writer.close();
|
||||||
|
|
||||||
|
IndexSearcher searcher = new IndexSearcher(directory);
|
||||||
|
Query query = new SpanRegexQuery(new Term("field", "q.[aeiou]c.*"));
|
||||||
|
Hits hits = searcher.search(query);
|
||||||
|
assertEquals(1, hits.length());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,33 @@
|
||||||
|
package org.apache.lucene.search.regex;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
import org.apache.lucene.store.RAMDirectory;
|
||||||
|
import org.apache.lucene.index.IndexWriter;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.analysis.SimpleAnalyzer;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
|
import org.apache.lucene.search.Hits;
|
||||||
|
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||||
|
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||||
|
import org.apache.lucene.search.spans.SpanQuery;
|
||||||
|
|
||||||
|
public class TestSpanRegexQuery extends TestCase {
|
||||||
|
public void testSpanRegex() throws Exception {
|
||||||
|
RAMDirectory directory = new RAMDirectory();
|
||||||
|
IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true);
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(new Field("field", "the quick brown fox jumps over the lazy dog", Field.Store.NO, Field.Index.TOKENIZED));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
writer.optimize();
|
||||||
|
writer.close();
|
||||||
|
|
||||||
|
IndexSearcher searcher = new IndexSearcher(directory);
|
||||||
|
SpanRegexQuery srq = new SpanRegexQuery(new Term("field", "q.[aeiou]c.*"));
|
||||||
|
SpanTermQuery stq = new SpanTermQuery(new Term("field","dog"));
|
||||||
|
SpanNearQuery query = new SpanNearQuery(new SpanQuery[] {srq, stq}, 6, true);
|
||||||
|
Hits hits = searcher.search(query);
|
||||||
|
assertEquals(1, hits.length());
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue