From 6befc453349621cc322efd6a5981baa9e7dfd004 Mon Sep 17 00:00:00 2001 From: Erik Hatcher Date: Sat, 12 Nov 2005 09:03:26 +0000 Subject: [PATCH] Add RegexQuery and SpanRegexQuery git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@332747 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 5 ++ .../lucene/search/regex/RegexQuery.java | 26 ++++++ .../lucene/search/regex/RegexTermEnum.java | 65 ++++++++++++++ .../lucene/search/regex/SpanRegexQuery.java | 85 +++++++++++++++++++ .../lucene/search/regex/TestRegexQuery.java | 30 +++++++ .../search/regex/TestSpanRegexQuery.java | 33 +++++++ 6 files changed, 244 insertions(+) create mode 100644 src/java/org/apache/lucene/search/regex/RegexQuery.java create mode 100644 src/java/org/apache/lucene/search/regex/RegexTermEnum.java create mode 100644 src/java/org/apache/lucene/search/regex/SpanRegexQuery.java create mode 100644 src/test/org/apache/lucene/search/regex/TestRegexQuery.java create mode 100644 src/test/org/apache/lucene/search/regex/TestSpanRegexQuery.java diff --git a/CHANGES.txt b/CHANGES.txt index 84d43968a67..a718a7f4697 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -163,6 +163,11 @@ New features highlighting entire documents or fields. (Erik Hatcher) +23. Added regular expression queries, RegexQuery and SpanRegexQuery. + Note the same term enumeration caveats apply with these queries as + apply to WildcardQuery and other term expanding queries. + (Erik Hatcher) + API Changes 1. Several methods and fields have been deprecated. The API documentation diff --git a/src/java/org/apache/lucene/search/regex/RegexQuery.java b/src/java/org/apache/lucene/search/regex/RegexQuery.java new file mode 100644 index 00000000000..38ce8a5d2e8 --- /dev/null +++ b/src/java/org/apache/lucene/search/regex/RegexQuery.java @@ -0,0 +1,26 @@ +package org.apache.lucene.search.regex; + +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.FilteredTermEnum; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.IndexReader; + +import java.io.IOException; + +public class RegexQuery extends MultiTermQuery { + public RegexQuery(Term term) { + super(term); + } + + protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { + Term term = new Term(getTerm().field(), getTerm().text()); + return new RegexTermEnum(reader, term); + } + + public boolean equals(Object o) { + if (o instanceof RegexQuery) + return super.equals(o); + + return false; + } +} diff --git a/src/java/org/apache/lucene/search/regex/RegexTermEnum.java b/src/java/org/apache/lucene/search/regex/RegexTermEnum.java new file mode 100644 index 00000000000..bfef444d75f --- /dev/null +++ b/src/java/org/apache/lucene/search/regex/RegexTermEnum.java @@ -0,0 +1,65 @@ +package org.apache.lucene.search.regex; + +import org.apache.lucene.search.FilteredTermEnum; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; + +import java.util.regex.Pattern; +import java.io.IOException; + +public class RegexTermEnum extends FilteredTermEnum { + private String field = ""; + private String pre = ""; + boolean endEnum = false; + private Pattern pattern; + + public RegexTermEnum(IndexReader reader, Term term) throws IOException { + super(); + field = term.field(); + String text = term.text(); + + pattern = Pattern.compile(text); + + // Find the first regex character position, to find the + // maximum prefix to use for term enumeration + int index = 0; + while (index < text.length()) { + char c = text.charAt(index); + + // TODO: improve the logic here. There are other types of patterns + // that could break this, such as "\d*" and "\*abc" + if (c == '*' || c == '[' || c == '?' || c == '.') break; + + index++; + } + + pre = text.substring(0, index); + + setEnum(reader.terms(new Term(term.field(), pre))); + } + + protected final boolean termCompare(Term term) { + if (field == term.field()) { + String searchText = term.text(); + if (searchText.startsWith(pre)) { + return pattern.matcher(searchText).matches(); + } + } + endEnum = true; + return false; + } + + public final float difference() { +// TODO: adjust difference based on distance of searchTerm.text() and term().text() + return 1.0f; + } + + public final boolean endEnum() { + return endEnum; + } + + public void close() throws IOException { + super.close(); + field = null; + } +} diff --git a/src/java/org/apache/lucene/search/regex/SpanRegexQuery.java b/src/java/org/apache/lucene/search/regex/SpanRegexQuery.java new file mode 100644 index 00000000000..7ba61b92976 --- /dev/null +++ b/src/java/org/apache/lucene/search/regex/SpanRegexQuery.java @@ -0,0 +1,85 @@ +package org.apache.lucene.search.regex; + +import org.apache.lucene.index.Term; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.spans.SpanOrQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.search.spans.Spans; +import org.apache.lucene.util.ToStringUtils; + +import java.io.IOException; +import java.util.Collection; +import java.util.ArrayList; + +public class SpanRegexQuery extends SpanQuery { + private Term term; + + public SpanRegexQuery(Term term) { + this.term = term; + } + + public Query rewrite(IndexReader reader) throws IOException { + Query orig = new RegexQuery(term).rewrite(reader); + + // RegexQuery (via MultiTermQuery).rewrite always returns a BooleanQuery + BooleanQuery bq = (BooleanQuery) orig; + + BooleanClause[] clauses = bq.getClauses(); + SpanQuery[] sqs = new SpanQuery[clauses.length]; + for (int i = 0; i < clauses.length; i++) { + BooleanClause clause = clauses[i]; + + // Clauses from RegexQuery.rewrite are always TermQuery's + TermQuery tq = (TermQuery) clause.getQuery(); + + sqs[i] = new SpanTermQuery(tq.getTerm()); + sqs[i].setBoost(tq.getBoost()); + } + + SpanOrQuery query = new SpanOrQuery(sqs); + query.setBoost(orig.getBoost()); + + return query; + } + + public Spans getSpans(IndexReader reader) throws IOException { + throw new UnsupportedOperationException("Query should have been rewritten"); + } + + public String getField() { + return term.field(); + } + + public Collection getTerms() { + Collection terms = new ArrayList(); + terms.add(term); + return terms; + } + + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + final SpanRegexQuery that = (SpanRegexQuery) o; + + return term.equals(that.term) && getBoost() == that.getBoost(); + } + + public int hashCode() { + return term.hashCode(); + } + + public String toString(String field) { + StringBuffer buffer = new StringBuffer(); + buffer.append("spanRegexQuery("); + buffer.append(term); + buffer.append(")"); + buffer.append(ToStringUtils.boost(getBoost())); + return buffer.toString(); + } +} diff --git a/src/test/org/apache/lucene/search/regex/TestRegexQuery.java b/src/test/org/apache/lucene/search/regex/TestRegexQuery.java new file mode 100644 index 00000000000..26b41c66170 --- /dev/null +++ b/src/test/org/apache/lucene/search/regex/TestRegexQuery.java @@ -0,0 +1,30 @@ +package org.apache.lucene.search.regex; + +import junit.framework.TestCase; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.analysis.SimpleAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.Query; + +public class TestRegexQuery extends TestCase { + public void testRegex() throws Exception { + RAMDirectory directory = new RAMDirectory(); + IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true); + Document doc = new Document(); + doc.add(new Field("field", "the quick brown fox jumps over the lazy dog", Field.Store.NO, Field.Index.TOKENIZED)); + writer.addDocument(doc); + writer.optimize(); + writer.close(); + + IndexSearcher searcher = new IndexSearcher(directory); + Query query = new SpanRegexQuery(new Term("field", "q.[aeiou]c.*")); + Hits hits = searcher.search(query); + assertEquals(1, hits.length()); + } +} + diff --git a/src/test/org/apache/lucene/search/regex/TestSpanRegexQuery.java b/src/test/org/apache/lucene/search/regex/TestSpanRegexQuery.java new file mode 100644 index 00000000000..6b662bcdcd4 --- /dev/null +++ b/src/test/org/apache/lucene/search/regex/TestSpanRegexQuery.java @@ -0,0 +1,33 @@ +package org.apache.lucene.search.regex; + +import junit.framework.TestCase; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.analysis.SimpleAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanQuery; + +public class TestSpanRegexQuery extends TestCase { + public void testSpanRegex() throws Exception { + RAMDirectory directory = new RAMDirectory(); + IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true); + Document doc = new Document(); + doc.add(new Field("field", "the quick brown fox jumps over the lazy dog", Field.Store.NO, Field.Index.TOKENIZED)); + writer.addDocument(doc); + writer.optimize(); + writer.close(); + + IndexSearcher searcher = new IndexSearcher(directory); + SpanRegexQuery srq = new SpanRegexQuery(new Term("field", "q.[aeiou]c.*")); + SpanTermQuery stq = new SpanTermQuery(new Term("field","dog")); + SpanNearQuery query = new SpanNearQuery(new SpanQuery[] {srq, stq}, 6, true); + Hits hits = searcher.search(query); + assertEquals(1, hits.length()); + } +}