Add RegexQuery and SpanRegexQuery

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@332747 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Erik Hatcher 2005-11-12 09:03:26 +00:00
parent a176028a7e
commit 6befc45334
6 changed files with 244 additions and 0 deletions

View File

@ -163,6 +163,11 @@ New features
highlighting entire documents or fields.
(Erik Hatcher)
23. Added regular expression queries, RegexQuery and SpanRegexQuery.
Note the same term enumeration caveats apply with these queries as
apply to WildcardQuery and other term expanding queries.
(Erik Hatcher)
API Changes
1. Several methods and fields have been deprecated. The API documentation

View File

@ -0,0 +1,26 @@
package org.apache.lucene.search.regex;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.FilteredTermEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexReader;
import java.io.IOException;
/**
 * A multi-term query whose term text is a regular expression. The terms that
 * are actually searched come from a {@link RegexTermEnum} built over the
 * query term's field and text.
 */
public class RegexQuery extends MultiTermQuery {

  /**
   * @param term the field to search, with the regular expression as its text
   */
  public RegexQuery(Term term) {
    super(term);
  }

  /**
   * Creates the enumeration of terms that match the regular expression.
   *
   * @param reader the index whose terms are enumerated
   * @return a {@link RegexTermEnum} over a copy of this query's term
   * @throws IOException propagated from the {@link RegexTermEnum} constructor
   */
  protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
    final Term source = getTerm();
    final Term copy = new Term(source.field(), source.text());
    return new RegexTermEnum(reader, copy);
  }

  /**
   * Equal only to another RegexQuery that the superclass also considers equal.
   */
  public boolean equals(Object o) {
    return (o instanceof RegexQuery) && super.equals(o);
  }
}

View File

@ -0,0 +1,65 @@
package org.apache.lucene.search.regex;
import org.apache.lucene.search.FilteredTermEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import java.util.regex.Pattern;
import java.io.IOException;
/**
 * Enumerates the terms of a single field whose complete text matches a
 * {@link java.util.regex.Pattern}.
 *
 * <p>To avoid testing every term of the field, the enumeration is positioned
 * at the longest literal prefix that every match of the expression must start
 * with, and it ends at the first term that is in another field or does not
 * start with that prefix. This relies on the reader returning terms in sorted
 * order. The prefix is derived conservatively: it may be shorter than
 * strictly possible (which only costs time), but it is never longer.
 */
public class RegexTermEnum extends FilteredTermEnum {
  /** Characters with special meaning in java.util.regex syntax; the literal prefix ends at the first one. */
  private static final String META_CHARACTERS = "\\^$.|?*+()[]{}";

  private String field = "";
  private String pre = "";
  boolean endEnum = false;
  private Pattern pattern;

  /**
   * @param reader the index whose terms are enumerated
   * @param term   the field to enumerate, with the regular expression as its text
   * @throws IOException if the reader fails to provide the term enumeration
   * @throws java.util.regex.PatternSyntaxException if the term text is not a valid expression
   */
  public RegexTermEnum(IndexReader reader, Term term) throws IOException {
    super();
    field = term.field();
    String text = term.text();
    pattern = Pattern.compile(text);
    pre = literalPrefix(text);
    setEnum(reader.terms(new Term(term.field(), pre)));
  }

  /**
   * Returns a literal string that every term matching <code>regex</code> is
   * guaranteed to start with; the empty string when no such guarantee can be
   * given cheaply.
   */
  private static String literalPrefix(String regex) {
    // With alternation ("abc|def") the branches need not share a prefix.
    // Telling top-level alternation from nested alternation would take a real
    // parser, so any '|' simply disables the optimization.
    if (regex.indexOf('|') >= 0) {
      return "";
    }

    int end = 0;
    while (end < regex.length() && META_CHARACTERS.indexOf(regex.charAt(end)) < 0) {
      end++;
    }

    if (end > 0 && end < regex.length()) {
      char stop = regex.charAt(end);
      // These quantifiers allow zero occurrences of the preceding character
      // ("ab*c" matches "ac"), so that character is not part of the prefix.
      if (stop == '*' || stop == '?' || stop == '{') {
        end--;
        // Never leave half of a surrogate pair behind.
        if (end > 0) {
          char low = regex.charAt(end);
          char high = regex.charAt(end - 1);
          if (low >= 0xDC00 && low <= 0xDFFF && high >= 0xD800 && high <= 0xDBFF) {
            end--;
          }
        }
      }
    }
    return regex.substring(0, end);
  }

  /**
   * Accepts a term when it belongs to the enumerated field, starts with the
   * literal prefix and matches the pattern in full. A term outside the field
   * or the prefix range marks the end of the enumeration.
   */
  protected final boolean termCompare(Term term) {
    // Identity comparison kept from the original code; it is only correct if
    // field names are interned. NOTE(review): confirm Term.field() guarantees this.
    if (field == term.field()) {
      String searchText = term.text();
      if (searchText.startsWith(pre)) {
        return pattern.matcher(searchText).matches();
      }
    }
    endEnum = true;
    return false;
  }

  /** Every matching term is currently reported with the same difference of 1.0. */
  public final float difference() {
    // TODO: adjust difference based on distance of searchTerm.text() and term().text()
    return 1.0f;
  }

  /** @return true once a term outside the field or prefix range has been seen */
  public final boolean endEnum() {
    return endEnum;
  }

  public void close() throws IOException {
    super.close();
    field = null;
  }
}

View File

@ -0,0 +1,85 @@
package org.apache.lucene.search.regex;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.ToStringUtils;
import java.io.IOException;
import java.util.Collection;
import java.util.ArrayList;
/**
 * A span query over all terms of a field that match a regular expression.
 *
 * <p>The query cannot produce spans itself. It must first be rewritten, which
 * expands it into a {@link SpanOrQuery} with one {@link SpanTermQuery} per
 * matching term.
 */
public class SpanRegexQuery extends SpanQuery {
  private Term term;

  /**
   * @param term the field to search, with the regular expression as its text
   */
  public SpanRegexQuery(Term term) {
    this.term = term;
  }

  /**
   * Expands the regular expression against the index.
   *
   * @param reader the index used to find the matching terms
   * @return a {@link SpanOrQuery} over the matching terms, carrying this
   *         query's boost
   * @throws IOException propagated from the underlying {@link RegexQuery} rewrite
   */
  public Query rewrite(IndexReader reader) throws IOException {
    Query orig = new RegexQuery(term).rewrite(reader);

    // RegexQuery (via MultiTermQuery).rewrite always returns a BooleanQuery
    BooleanQuery bq = (BooleanQuery) orig;

    BooleanClause[] clauses = bq.getClauses();
    SpanQuery[] sqs = new SpanQuery[clauses.length];
    for (int i = 0; i < clauses.length; i++) {
      BooleanClause clause = clauses[i];

      // Clauses from RegexQuery.rewrite are always TermQuery's
      TermQuery tq = (TermQuery) clause.getQuery();

      sqs[i] = new SpanTermQuery(tq.getTerm());
      sqs[i].setBoost(tq.getBoost());
    }

    SpanOrQuery query = new SpanOrQuery(sqs);
    // Use this query's own boost. The boost of "orig" comes from a RegexQuery
    // created just above, so it never reflects a boost set on this instance.
    query.setBoost(getBoost());
    return query;
  }

  /**
   * Not supported: spans are only available from the rewritten query.
   *
   * @throws UnsupportedOperationException always
   */
  public Spans getSpans(IndexReader reader) throws IOException {
    throw new UnsupportedOperationException("Query should have been rewritten");
  }

  /** @return the field of the regular expression term */
  public String getField() {
    return term.field();
  }

  /** @return a new collection holding only the (unexpanded) regular expression term */
  public Collection getTerms() {
    Collection terms = new ArrayList();
    terms.add(term);
    return terms;
  }

  /** Two instances are equal when they have the same class, term and boost. */
  public boolean equals(Object o) {
    if (this == o) return true;
    if (o == null || getClass() != o.getClass()) return false;

    final SpanRegexQuery that = (SpanRegexQuery) o;

    return term.equals(that.term) && getBoost() == that.getBoost();
  }

  /** Based on the term only, which is consistent with {@link #equals(Object)}. */
  public int hashCode() {
    return term.hashCode();
  }

  /**
   * @param field ignored; the term is always printed in full
   * @return <code>spanRegexQuery(term)</code> followed by the boost suffix
   */
  public String toString(String field) {
    StringBuffer buffer = new StringBuffer();
    buffer.append("spanRegexQuery(");
    buffer.append(term);
    buffer.append(")");
    buffer.append(ToStringUtils.boost(getBoost()));
    return buffer.toString();
  }
}

View File

@ -0,0 +1,30 @@
package org.apache.lucene.search.regex;
import junit.framework.TestCase;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.Query;
/**
 * Checks that a {@link RegexQuery} finds a document containing a term that
 * matches the regular expression.
 */
public class TestRegexQuery extends TestCase {
  public void testRegex() throws Exception {
    RAMDirectory directory = new RAMDirectory();
    IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true);
    Document doc = new Document();
    doc.add(new Field("field", "the quick brown fox jumps over the lazy dog", Field.Store.NO, Field.Index.TOKENIZED));
    writer.addDocument(doc);
    writer.optimize();
    writer.close();

    IndexSearcher searcher = new IndexSearcher(directory);
    try {
      // Exercise RegexQuery itself; the span variant has its own test class.
      Query query = new RegexQuery(new Term("field", "q.[aeiou]c.*"));
      Hits hits = searcher.search(query);
      // Only "quick" matches, and the index holds a single document.
      assertEquals(1, hits.length());
    } finally {
      searcher.close();
    }
  }
}

View File

@ -0,0 +1,33 @@
package org.apache.lucene.search.regex;
import junit.framework.TestCase;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
/**
 * Checks that a {@link SpanRegexQuery} can be combined with other span
 * queries inside a {@link SpanNearQuery}.
 */
public class TestSpanRegexQuery extends TestCase {
  public void testSpanRegex() throws Exception {
    RAMDirectory directory = new RAMDirectory();
    IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true);
    Document doc = new Document();
    doc.add(new Field("field", "the quick brown fox jumps over the lazy dog", Field.Store.NO, Field.Index.TOKENIZED));
    writer.addDocument(doc);
    writer.optimize();
    writer.close();

    IndexSearcher searcher = new IndexSearcher(directory);
    try {
      SpanRegexQuery srq = new SpanRegexQuery(new Term("field", "q.[aeiou]c.*"));
      SpanTermQuery stq = new SpanTermQuery(new Term("field","dog"));
      // In order, with a slop of 6: six words separate "quick" from "dog".
      SpanNearQuery query = new SpanNearQuery(new SpanQuery[] {srq, stq}, 6, true);
      Hits hits = searcher.search(query);
      assertEquals(1, hits.length());
    } finally {
      searcher.close();
    }
  }
}