mirror of https://github.com/apache/lucene.git
Add RegexQuery and SpanRegexQuery
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@332747 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a176028a7e
commit
6befc45334
|
@ -163,6 +163,11 @@ New features
|
|||
highlighting entire documents or fields.
|
||||
(Erik Hatcher)
|
||||
|
||||
23. Added regular expression queries, RegexQuery and SpanRegexQuery.
|
||||
Note that the same term enumeration caveats apply to these queries as
|
||||
apply to WildcardQuery and other term-expanding queries.
|
||||
(Erik Hatcher)
|
||||
|
||||
API Changes
|
||||
|
||||
1. Several methods and fields have been deprecated. The API documentation
|
||||
|
|
|
@ -0,0 +1,26 @@
|
|||
package org.apache.lucene.search.regex;
|
||||
|
||||
import org.apache.lucene.search.MultiTermQuery;
|
||||
import org.apache.lucene.search.FilteredTermEnum;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class RegexQuery extends MultiTermQuery {
|
||||
public RegexQuery(Term term) {
|
||||
super(term);
|
||||
}
|
||||
|
||||
protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
|
||||
Term term = new Term(getTerm().field(), getTerm().text());
|
||||
return new RegexTermEnum(reader, term);
|
||||
}
|
||||
|
||||
public boolean equals(Object o) {
|
||||
if (o instanceof RegexQuery)
|
||||
return super.equals(o);
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,65 @@
|
|||
package org.apache.lucene.search.regex;
|
||||
|
||||
import org.apache.lucene.search.FilteredTermEnum;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
import java.io.IOException;
|
||||
|
||||
public class RegexTermEnum extends FilteredTermEnum {
|
||||
private String field = "";
|
||||
private String pre = "";
|
||||
boolean endEnum = false;
|
||||
private Pattern pattern;
|
||||
|
||||
public RegexTermEnum(IndexReader reader, Term term) throws IOException {
|
||||
super();
|
||||
field = term.field();
|
||||
String text = term.text();
|
||||
|
||||
pattern = Pattern.compile(text);
|
||||
|
||||
// Find the first regex character position, to find the
|
||||
// maximum prefix to use for term enumeration
|
||||
int index = 0;
|
||||
while (index < text.length()) {
|
||||
char c = text.charAt(index);
|
||||
|
||||
// TODO: improve the logic here. There are other types of patterns
|
||||
// that could break this, such as "\d*" and "\*abc"
|
||||
if (c == '*' || c == '[' || c == '?' || c == '.') break;
|
||||
|
||||
index++;
|
||||
}
|
||||
|
||||
pre = text.substring(0, index);
|
||||
|
||||
setEnum(reader.terms(new Term(term.field(), pre)));
|
||||
}
|
||||
|
||||
protected final boolean termCompare(Term term) {
|
||||
if (field == term.field()) {
|
||||
String searchText = term.text();
|
||||
if (searchText.startsWith(pre)) {
|
||||
return pattern.matcher(searchText).matches();
|
||||
}
|
||||
}
|
||||
endEnum = true;
|
||||
return false;
|
||||
}
|
||||
|
||||
public final float difference() {
|
||||
// TODO: adjust difference based on distance of searchTerm.text() and term().text()
|
||||
return 1.0f;
|
||||
}
|
||||
|
||||
public final boolean endEnum() {
|
||||
return endEnum;
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
super.close();
|
||||
field = null;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,85 @@
|
|||
package org.apache.lucene.search.regex;
|
||||
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.spans.SpanOrQuery;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
import org.apache.lucene.search.spans.Spans;
|
||||
import org.apache.lucene.util.ToStringUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.ArrayList;
|
||||
|
||||
public class SpanRegexQuery extends SpanQuery {
|
||||
private Term term;
|
||||
|
||||
public SpanRegexQuery(Term term) {
|
||||
this.term = term;
|
||||
}
|
||||
|
||||
public Query rewrite(IndexReader reader) throws IOException {
|
||||
Query orig = new RegexQuery(term).rewrite(reader);
|
||||
|
||||
// RegexQuery (via MultiTermQuery).rewrite always returns a BooleanQuery
|
||||
BooleanQuery bq = (BooleanQuery) orig;
|
||||
|
||||
BooleanClause[] clauses = bq.getClauses();
|
||||
SpanQuery[] sqs = new SpanQuery[clauses.length];
|
||||
for (int i = 0; i < clauses.length; i++) {
|
||||
BooleanClause clause = clauses[i];
|
||||
|
||||
// Clauses from RegexQuery.rewrite are always TermQuery's
|
||||
TermQuery tq = (TermQuery) clause.getQuery();
|
||||
|
||||
sqs[i] = new SpanTermQuery(tq.getTerm());
|
||||
sqs[i].setBoost(tq.getBoost());
|
||||
}
|
||||
|
||||
SpanOrQuery query = new SpanOrQuery(sqs);
|
||||
query.setBoost(orig.getBoost());
|
||||
|
||||
return query;
|
||||
}
|
||||
|
||||
public Spans getSpans(IndexReader reader) throws IOException {
|
||||
throw new UnsupportedOperationException("Query should have been rewritten");
|
||||
}
|
||||
|
||||
public String getField() {
|
||||
return term.field();
|
||||
}
|
||||
|
||||
public Collection getTerms() {
|
||||
Collection terms = new ArrayList();
|
||||
terms.add(term);
|
||||
return terms;
|
||||
}
|
||||
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
|
||||
final SpanRegexQuery that = (SpanRegexQuery) o;
|
||||
|
||||
return term.equals(that.term) && getBoost() == that.getBoost();
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return term.hashCode();
|
||||
}
|
||||
|
||||
public String toString(String field) {
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
buffer.append("spanRegexQuery(");
|
||||
buffer.append(term);
|
||||
buffer.append(")");
|
||||
buffer.append(ToStringUtils.boost(getBoost()));
|
||||
return buffer.toString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
package org.apache.lucene.search.regex;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.analysis.SimpleAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Hits;
|
||||
import org.apache.lucene.search.Query;
|
||||
|
||||
public class TestRegexQuery extends TestCase {
|
||||
public void testRegex() throws Exception {
|
||||
RAMDirectory directory = new RAMDirectory();
|
||||
IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true);
|
||||
Document doc = new Document();
|
||||
doc.add(new Field("field", "the quick brown fox jumps over the lazy dog", Field.Store.NO, Field.Index.TOKENIZED));
|
||||
writer.addDocument(doc);
|
||||
writer.optimize();
|
||||
writer.close();
|
||||
|
||||
IndexSearcher searcher = new IndexSearcher(directory);
|
||||
Query query = new SpanRegexQuery(new Term("field", "q.[aeiou]c.*"));
|
||||
Hits hits = searcher.search(query);
|
||||
assertEquals(1, hits.length());
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
package org.apache.lucene.search.regex;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.analysis.SimpleAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Hits;
|
||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
|
||||
public class TestSpanRegexQuery extends TestCase {
|
||||
public void testSpanRegex() throws Exception {
|
||||
RAMDirectory directory = new RAMDirectory();
|
||||
IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true);
|
||||
Document doc = new Document();
|
||||
doc.add(new Field("field", "the quick brown fox jumps over the lazy dog", Field.Store.NO, Field.Index.TOKENIZED));
|
||||
writer.addDocument(doc);
|
||||
writer.optimize();
|
||||
writer.close();
|
||||
|
||||
IndexSearcher searcher = new IndexSearcher(directory);
|
||||
SpanRegexQuery srq = new SpanRegexQuery(new Term("field", "q.[aeiou]c.*"));
|
||||
SpanTermQuery stq = new SpanTermQuery(new Term("field","dog"));
|
||||
SpanNearQuery query = new SpanNearQuery(new SpanQuery[] {srq, stq}, 6, true);
|
||||
Hits hits = searcher.search(query);
|
||||
assertEquals(1, hits.length());
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue