LUCENE-2606: optimize contrib/regex for flex

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@987129 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-08-19 11:20:42 +00:00
parent f50e111501
commit 4ec28930c9
7 changed files with 181 additions and 106 deletions

View File

@ -23,6 +23,12 @@ New Features
* LUCENE-2479: Added ability to provide a sort comparator for spelling suggestions along
with two implementations. The existing comparator (score, then frequency) is the default (Grant Ingersoll)
API Changes
* LUCENE-2606: Changed RegexCapabilities interface to fix thread
safety, serialization, and performance problems. If you have
written a custom RegexCapabilities it will need to be updated
to the new API. (Robert Muir, Uwe Schindler)
======================= Lucene 3.x (not yet released) =======================
Changes in backwards compatibility policy

View File

@ -17,6 +17,9 @@ package org.apache.lucene.search.regex;
* limitations under the License.
*/
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.regexp.CharacterIterator;
import org.apache.regexp.RE;
import org.apache.regexp.REProgram;
import java.lang.reflect.Field;
@ -30,8 +33,6 @@ import java.lang.reflect.Method;
* it doesn't always provide a prefix even if one would exist.
*/
public class JakartaRegexpCapabilities implements RegexCapabilities {
private RE regexp;
private static Field prefixField;
private static Method getPrefixMethod;
static {
@ -79,45 +80,75 @@ public class JakartaRegexpCapabilities implements RegexCapabilities {
this.flags = flags;
}
public void compile(String pattern) {
regexp = new RE(pattern, this.flags);
}
public boolean match(String string) {
return regexp.match(string);
}
public String prefix() {
try {
final char[] prefix;
if (getPrefixMethod != null) {
prefix = (char[]) getPrefixMethod.invoke(regexp.getProgram());
} else if (prefixField != null) {
prefix = (char[]) prefixField.get(regexp.getProgram());
} else {
return null;
}
return prefix == null ? null : new String(prefix);
} catch (Exception e) {
// if we cannot get the prefix, return none
return null;
}
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
final JakartaRegexpCapabilities that = (JakartaRegexpCapabilities) o;
if (regexp != null ? !regexp.equals(that.regexp) : that.regexp != null) return false;
return true;
public RegexCapabilities.RegexMatcher compile(String regex) {
return new JakartaRegexMatcher(regex, flags);
}
@Override
public int hashCode() {
return (regexp != null ? regexp.hashCode() : 0);
final int prime = 31;
int result = 1;
result = prime * result + flags;
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (obj == null) return false;
if (getClass() != obj.getClass()) return false;
JakartaRegexpCapabilities other = (JakartaRegexpCapabilities) obj;
if (flags != other.flags) return false;
return true;
}
class JakartaRegexMatcher implements RegexCapabilities.RegexMatcher {
private RE regexp;
private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
private final CharacterIterator utf16wrapper = new CharacterIterator() {
public char charAt(int pos) {
return utf16.result[pos];
}
public boolean isEnd(int pos) {
return pos >= utf16.length;
}
public String substring(int beginIndex) {
return substring(beginIndex, utf16.length);
}
public String substring(int beginIndex, int endIndex) {
return new String(utf16.result, beginIndex, endIndex - beginIndex);
}
};
public JakartaRegexMatcher(String regex, int flags) {
regexp = new RE(regex, flags);
}
public boolean match(BytesRef term) {
UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
return regexp.match(utf16wrapper, 0);
}
public String prefix() {
try {
final char[] prefix;
if (getPrefixMethod != null) {
prefix = (char[]) getPrefixMethod.invoke(regexp.getProgram());
} else if (prefixField != null) {
prefix = (char[]) prefixField.get(regexp.getProgram());
} else {
return null;
}
return prefix == null ? null : new String(prefix);
} catch (Exception e) {
// if we cannot get the prefix, return none
return null;
}
}
}
}

View File

@ -17,8 +17,12 @@ package org.apache.lucene.search.regex;
* limitations under the License.
*/
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
/**
* An implementation tying Java's built-in java.util.regex to RegexQuery.
*
@ -27,7 +31,6 @@ import java.util.regex.Pattern;
* attempt to {@link #match} each term for the specified field in the index.
*/
public class JavaUtilRegexCapabilities implements RegexCapabilities {
private Pattern pattern;
private int flags = 0;
// Define the optional flags from Pattern that can be used.
@ -66,32 +69,59 @@ public class JavaUtilRegexCapabilities implements RegexCapabilities {
this.flags = flags;
}
public void compile(String pattern) {
this.pattern = Pattern.compile(pattern, this.flags);
}
public boolean match(String string) {
return pattern.matcher(string).matches();
}
public String prefix() {
return null;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
final JavaUtilRegexCapabilities that = (JavaUtilRegexCapabilities) o;
if (pattern != null ? !pattern.equals(that.pattern) : that.pattern != null) return false;
return true;
public RegexCapabilities.RegexMatcher compile(String regex) {
return new JavaUtilRegexMatcher(regex, flags);
}
@Override
public int hashCode() {
return (pattern != null ? pattern.hashCode() : 0);
final int prime = 31;
int result = 1;
result = prime * result + flags;
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (obj == null) return false;
if (getClass() != obj.getClass()) return false;
JavaUtilRegexCapabilities other = (JavaUtilRegexCapabilities) obj;
if (flags != other.flags) return false;
return true;
}
class JavaUtilRegexMatcher implements RegexCapabilities.RegexMatcher {
private final Pattern pattern;
private final Matcher matcher;
private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
private final CharSequence utf16wrapper = new CharSequence() {
public int length() {
return utf16.length;
}
public char charAt(int index) {
return utf16.result[index];
}
public CharSequence subSequence(int start, int end) {
return new String(utf16.result, start, end - start);
}
};
public JavaUtilRegexMatcher(String regex, int flags) {
this.pattern = Pattern.compile(regex, flags);
this.matcher = this.pattern.matcher(utf16wrapper);
}
public boolean match(BytesRef term) {
UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
return matcher.reset().matches();
}
public String prefix() {
return null;
}
}
}

View File

@ -1,5 +1,9 @@
package org.apache.lucene.search.regex;
import java.io.Serializable;
import org.apache.lucene.util.BytesRef;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -21,7 +25,7 @@ package org.apache.lucene.search.regex;
* Defines basic operations needed by {@link RegexQuery} for a regular
* expression implementation.
*/
public interface RegexCapabilities {
public interface RegexCapabilities extends Serializable {
/**
* Called by the constructor of {@link RegexTermEnum} allowing
* implementations to cache a compiled version of the regular
@ -29,20 +33,22 @@ public interface RegexCapabilities {
*
* @param pattern regular expression pattern
*/
void compile(String pattern);
public RegexMatcher compile(String pattern);
/**
*
* @param string
* @return true if string matches the pattern last passed to {@link #compile}.
*/
boolean match(String string);
public interface RegexMatcher {
/**
*
* @param string
* @return true if string matches the pattern last passed to {@link #compile}.
*/
public boolean match(BytesRef term);
/**
* A wise prefix implementation can reduce the term enumeration (and thus increase performance)
* of RegexQuery dramatically!
*
* @return static non-regex prefix of the pattern last passed to {@link #compile}. May return null.
*/
String prefix();
/**
* A wise prefix implementation can reduce the term enumeration (and thus increase performance)
* of RegexQuery dramatically!
*
* @return static non-regex prefix of the pattern last passed to {@link #compile}. May return null.
*/
public String prefix();
}
}

View File

@ -76,23 +76,27 @@ public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {
return buffer.toString();
}
/* generated by IntelliJ IDEA */
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
if (!super.equals(o)) return false;
final RegexQuery that = (RegexQuery) o;
return regexImpl.equals(that.regexImpl);
}
/* generated by IntelliJ IDEA */
@Override
public int hashCode() {
final int prime = 31;
int result = super.hashCode();
result = 29 * result + regexImpl.hashCode();
result = prime * result + ((regexImpl == null) ? 0 : regexImpl.hashCode());
result = prime * result + ((term == null) ? 0 : term.hashCode());
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (!super.equals(obj)) return false;
if (getClass() != obj.getClass()) return false;
RegexQuery other = (RegexQuery) obj;
if (regexImpl == null) {
if (other.regexImpl != null) return false;
} else if (!regexImpl.equals(other.regexImpl)) return false;
if (term == null) {
if (other.term != null) return false;
} else if (!term.equals(other.term)) return false;
return true;
}
}

View File

@ -34,15 +34,13 @@ import java.io.IOException;
*/
public class RegexTermsEnum extends FilteredTermsEnum {
private RegexCapabilities regexImpl;
private RegexCapabilities.RegexMatcher regexImpl;
private final BytesRef prefixRef;
public RegexTermsEnum(IndexReader reader, Term term, RegexCapabilities regexImpl) throws IOException {
public RegexTermsEnum(IndexReader reader, Term term, RegexCapabilities regexCap) throws IOException {
super(reader, term.field());
String text = term.text();
this.regexImpl = regexImpl;
regexImpl.compile(text);
this.regexImpl = regexCap.compile(text);
String pre = regexImpl.prefix();
if (pre == null) pre = "";
@ -55,8 +53,7 @@ public class RegexTermsEnum extends FilteredTermsEnum {
if (term.startsWith(prefixRef)) {
// TODO: set BoostAttr based on distance of
// searchTerm.text() and term().text()
String text = term.utf8ToString();
return regexImpl.match(text) ? AcceptStatus.YES : AcceptStatus.NO;
return regexImpl.match(term) ? AcceptStatus.YES : AcceptStatus.NO;
} else {
return AcceptStatus.NO;
}

View File

@ -17,6 +17,7 @@ package org.apache.lucene.search.regex;
* limitations under the License.
*/
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
/**
@ -26,21 +27,21 @@ public class TestJakartaRegexpCapabilities extends LuceneTestCase {
public void testGetPrefix(){
JakartaRegexpCapabilities cap = new JakartaRegexpCapabilities();
cap.compile("luc[e]?");
assertTrue(cap.match("luce"));
assertEquals("luc", cap.prefix());
RegexCapabilities.RegexMatcher matcher = cap.compile("luc[e]?");
assertTrue(matcher.match(new BytesRef("luce")));
assertEquals("luc", matcher.prefix());
cap.compile("lucene");
assertTrue(cap.match("lucene"));
assertEquals("lucene", cap.prefix());
matcher = cap.compile("lucene");
assertTrue(matcher.match(new BytesRef("lucene")));
assertEquals("lucene", matcher.prefix());
}
public void testShakyPrefix(){
JakartaRegexpCapabilities cap = new JakartaRegexpCapabilities();
cap.compile("(ab|ac)");
assertTrue(cap.match("ab"));
assertTrue(cap.match("ac"));
RegexCapabilities.RegexMatcher matcher = cap.compile("(ab|ac)");
assertTrue(matcher.match(new BytesRef("ab")));
assertTrue(matcher.match(new BytesRef("ac")));
// why is it not a???
assertNull(cap.prefix());
assertNull(matcher.prefix());
}
}