Add RegexQuery and SpanRegexQuery

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@332747 13f79535-47bb-0310-9956-ffa450edef68
2005-11-12 09:03:26 +00:00 · 2005-11-12 09:03:26 +00:00 · 6befc45334
parent a176028a7e
commit 6befc45334
6 changed files with 244 additions and 0 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -163,6 +163,11 @@ New features
    highlighting entire documents or fields.
    (Erik Hatcher)

+23. Added regular expression queries, RegexQuery and SpanRegexQuery.
+    Note the same term enumeration caveats apply with these queries as
+    apply to WildcardQuery and other term expanding queries.
+    (Erik Hatcher)
+
 API Changes

 1. Several methods and fields have been deprecated. The API documentation
--- a/src/java/org/apache/lucene/search/regex/RegexQuery.java
+++ b/src/java/org/apache/lucene/search/regex/RegexQuery.java
@ -0,0 +1,26 @@
+package org.apache.lucene.search.regex;
+
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.search.FilteredTermEnum;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.IndexReader;
+
+import java.io.IOException;
+
+public class RegexQuery extends MultiTermQuery {
+  public RegexQuery(Term term) {
+    super(term);
+  }
+
+  protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
+    Term term = new Term(getTerm().field(), getTerm().text());
+    return new RegexTermEnum(reader, term);
+  }
+
+  public boolean equals(Object o) {
+    if (o instanceof RegexQuery)
+      return super.equals(o);
+
+    return false;
+  }
+}
--- a/src/java/org/apache/lucene/search/regex/RegexTermEnum.java
+++ b/src/java/org/apache/lucene/search/regex/RegexTermEnum.java
@ -0,0 +1,65 @@
+package org.apache.lucene.search.regex;
+
+import org.apache.lucene.search.FilteredTermEnum;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+
+import java.util.regex.Pattern;
+import java.io.IOException;
+
+public class RegexTermEnum extends FilteredTermEnum {
+  private String field = "";
+  private String pre = "";
+  boolean endEnum = false;
+  private Pattern pattern;
+
+  public RegexTermEnum(IndexReader reader, Term term) throws IOException {
+    super();
+    field = term.field();
+    String text = term.text();
+
+    pattern = Pattern.compile(text);
+
+    // Find the first regex character position, to find the
+    // maximum prefix to use for term enumeration
+    int index = 0;
+    while (index < text.length()) {
+      char c = text.charAt(index);
+
+      // TODO: improve the logic here.  There are other types of patterns
+      // that could break this, such as "\d*" and "\*abc"
+      if (c == '*' || c == '[' || c == '?' || c == '.') break;
+
+      index++;
+    }
+
+    pre = text.substring(0, index);
+
+    setEnum(reader.terms(new Term(term.field(), pre)));
+  }
+
+  protected final boolean termCompare(Term term) {
+    if (field == term.field()) {
+      String searchText = term.text();
+      if (searchText.startsWith(pre)) {
+        return pattern.matcher(searchText).matches();
+      }
+    }
+    endEnum = true;
+    return false;
+  }
+
+  public final float difference() {
+// TODO: adjust difference based on distance of searchTerm.text() and term().text()
+    return 1.0f;
+  }
+
+  public final boolean endEnum() {
+    return endEnum;
+  }
+
+  public void close() throws IOException {
+    super.close();
+    field = null;
+  }
+}
--- a/src/java/org/apache/lucene/search/regex/SpanRegexQuery.java
+++ b/src/java/org/apache/lucene/search/regex/SpanRegexQuery.java
@ -0,0 +1,85 @@
+package org.apache.lucene.search.regex;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.spans.SpanOrQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.search.spans.Spans;
+import org.apache.lucene.util.ToStringUtils;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.ArrayList;
+
+public class SpanRegexQuery extends SpanQuery {
+  private Term term;
+
+  public SpanRegexQuery(Term term) {
+    this.term = term;
+  }
+
+  public Query rewrite(IndexReader reader) throws IOException {
+    Query orig = new RegexQuery(term).rewrite(reader);
+
+    // RegexQuery (via MultiTermQuery).rewrite always returns a BooleanQuery
+    BooleanQuery bq = (BooleanQuery) orig;
+
+    BooleanClause[] clauses = bq.getClauses();
+    SpanQuery[] sqs = new SpanQuery[clauses.length];
+    for (int i = 0; i < clauses.length; i++) {
+      BooleanClause clause = clauses[i];
+
+      // Clauses from RegexQuery.rewrite are always TermQuery's
+      TermQuery tq = (TermQuery) clause.getQuery();
+
+      sqs[i] = new SpanTermQuery(tq.getTerm());
+      sqs[i].setBoost(tq.getBoost());
+    }
+
+    SpanOrQuery query = new SpanOrQuery(sqs);
+    query.setBoost(orig.getBoost());
+
+    return query;
+  }
+
+  public Spans getSpans(IndexReader reader) throws IOException {
+    throw new UnsupportedOperationException("Query should have been rewritten");
+  }
+
+  public String getField() {
+    return term.field();
+  }
+
+  public Collection getTerms() {
+    Collection terms = new ArrayList();
+    terms.add(term);
+    return terms;
+  }
+
+  public boolean equals(Object o) {
+    if (this == o) return true;
+    if (o == null || getClass() != o.getClass()) return false;
+
+    final SpanRegexQuery that = (SpanRegexQuery) o;
+
+    return term.equals(that.term) && getBoost() == that.getBoost();
+  }
+
+  public int hashCode() {
+    return term.hashCode();
+  }
+
+  public String toString(String field) {
+    StringBuffer buffer = new StringBuffer();
+    buffer.append("spanRegexQuery(");
+    buffer.append(term);
+    buffer.append(")");
+    buffer.append(ToStringUtils.boost(getBoost()));
+    return buffer.toString();
+  }
+}
--- a/src/test/org/apache/lucene/search/regex/TestRegexQuery.java
+++ b/src/test/org/apache/lucene/search/regex/TestRegexQuery.java
@ -0,0 +1,30 @@
+package org.apache.lucene.search.regex;
+
+import junit.framework.TestCase;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.Query;
+
+public class TestRegexQuery extends TestCase {
+  public void testRegex() throws Exception {
+    RAMDirectory directory = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true);
+    Document doc = new Document();
+    doc.add(new Field("field", "the quick brown fox jumps over the lazy dog", Field.Store.NO, Field.Index.TOKENIZED));
+    writer.addDocument(doc);
+    writer.optimize();
+    writer.close();
+
+    IndexSearcher searcher = new IndexSearcher(directory);
+    Query query = new SpanRegexQuery(new Term("field", "q.[aeiou]c.*"));
+    Hits hits = searcher.search(query);
+    assertEquals(1, hits.length());
+  }
+}
+
--- a/src/test/org/apache/lucene/search/regex/TestSpanRegexQuery.java
+++ b/src/test/org/apache/lucene/search/regex/TestSpanRegexQuery.java
@ -0,0 +1,33 @@
+package org.apache.lucene.search.regex;
+
+import junit.framework.TestCase;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+
+public class TestSpanRegexQuery extends TestCase {
+  public void testSpanRegex() throws Exception {
+    RAMDirectory directory = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true);
+    Document doc = new Document();
+    doc.add(new Field("field", "the quick brown fox jumps over the lazy dog", Field.Store.NO, Field.Index.TOKENIZED));
+    writer.addDocument(doc);
+    writer.optimize();
+    writer.close();
+
+    IndexSearcher searcher = new IndexSearcher(directory);
+    SpanRegexQuery srq = new SpanRegexQuery(new Term("field", "q.[aeiou]c.*"));
+    SpanTermQuery stq = new SpanTermQuery(new Term("field","dog"));
+    SpanNearQuery query = new SpanNearQuery(new SpanQuery[] {srq, stq}, 6, true);
+    Hits hits = searcher.search(query);
+    assertEquals(1, hits.length());
+  }
+}