LUCENE-2606: optimize contrib/regex for flex

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@987129 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-08-19 11:20:42 +00:00
parent f50e111501
commit 4ec28930c9
7 changed files with 181 additions and 106 deletions

View File

@ -23,6 +23,12 @@ New Features
* LUCENE-2479: Added ability to provide a sort comparator for spelling suggestions along * LUCENE-2479: Added ability to provide a sort comparator for spelling suggestions along
with two implementations. The existing comparator (score, then frequency) is the default (Grant Ingersoll) with two implementations. The existing comparator (score, then frequency) is the default (Grant Ingersoll)
API Changes
* LUCENE-2606: Changed RegexCapabilities interface to fix thread
safety, serialization, and performance problems. If you have
written a custom RegexCapabilities it will need to be updated
to the new API. (Robert Muir, Uwe Schindler)
======================= Lucene 3.x (not yet released) ======================= ======================= Lucene 3.x (not yet released) =======================
Changes in backwards compatibility policy Changes in backwards compatibility policy

View File

@ -17,6 +17,9 @@ package org.apache.lucene.search.regex;
* limitations under the License. * limitations under the License.
*/ */
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.regexp.CharacterIterator;
import org.apache.regexp.RE; import org.apache.regexp.RE;
import org.apache.regexp.REProgram; import org.apache.regexp.REProgram;
import java.lang.reflect.Field; import java.lang.reflect.Field;
@ -30,8 +33,6 @@ import java.lang.reflect.Method;
* it doesn't always provide a prefix even if one would exist. * it doesn't always provide a prefix even if one would exist.
*/ */
public class JakartaRegexpCapabilities implements RegexCapabilities { public class JakartaRegexpCapabilities implements RegexCapabilities {
private RE regexp;
private static Field prefixField; private static Field prefixField;
private static Method getPrefixMethod; private static Method getPrefixMethod;
static { static {
@ -79,45 +80,75 @@ public class JakartaRegexpCapabilities implements RegexCapabilities {
this.flags = flags; this.flags = flags;
} }
public void compile(String pattern) { public RegexCapabilities.RegexMatcher compile(String regex) {
regexp = new RE(pattern, this.flags); return new JakartaRegexMatcher(regex, flags);
}
public boolean match(String string) {
return regexp.match(string);
}
public String prefix() {
try {
final char[] prefix;
if (getPrefixMethod != null) {
prefix = (char[]) getPrefixMethod.invoke(regexp.getProgram());
} else if (prefixField != null) {
prefix = (char[]) prefixField.get(regexp.getProgram());
} else {
return null;
}
return prefix == null ? null : new String(prefix);
} catch (Exception e) {
// if we cannot get the prefix, return none
return null;
}
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
final JakartaRegexpCapabilities that = (JakartaRegexpCapabilities) o;
if (regexp != null ? !regexp.equals(that.regexp) : that.regexp != null) return false;
return true;
} }
@Override @Override
public int hashCode() { public int hashCode() {
return (regexp != null ? regexp.hashCode() : 0); final int prime = 31;
int result = 1;
result = prime * result + flags;
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (obj == null) return false;
if (getClass() != obj.getClass()) return false;
JakartaRegexpCapabilities other = (JakartaRegexpCapabilities) obj;
if (flags != other.flags) return false;
return true;
}
class JakartaRegexMatcher implements RegexCapabilities.RegexMatcher {
private RE regexp;
private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
private final CharacterIterator utf16wrapper = new CharacterIterator() {
public char charAt(int pos) {
return utf16.result[pos];
}
public boolean isEnd(int pos) {
return pos >= utf16.length;
}
public String substring(int beginIndex) {
return substring(beginIndex, utf16.length);
}
public String substring(int beginIndex, int endIndex) {
return new String(utf16.result, beginIndex, endIndex - beginIndex);
}
};
public JakartaRegexMatcher(String regex, int flags) {
regexp = new RE(regex, flags);
}
public boolean match(BytesRef term) {
UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
return regexp.match(utf16wrapper, 0);
}
public String prefix() {
try {
final char[] prefix;
if (getPrefixMethod != null) {
prefix = (char[]) getPrefixMethod.invoke(regexp.getProgram());
} else if (prefixField != null) {
prefix = (char[]) prefixField.get(regexp.getProgram());
} else {
return null;
}
return prefix == null ? null : new String(prefix);
} catch (Exception e) {
// if we cannot get the prefix, return none
return null;
}
}
} }
} }

View File

@ -17,8 +17,12 @@ package org.apache.lucene.search.regex;
* limitations under the License. * limitations under the License.
*/ */
import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
/** /**
* An implementation tying Java's built-in java.util.regex to RegexQuery. * An implementation tying Java's built-in java.util.regex to RegexQuery.
* *
@ -27,9 +31,8 @@ import java.util.regex.Pattern;
* attempt to {@link #match} each term for the specified field in the index. * attempt to {@link #match} each term for the specified field in the index.
*/ */
public class JavaUtilRegexCapabilities implements RegexCapabilities { public class JavaUtilRegexCapabilities implements RegexCapabilities {
private Pattern pattern;
private int flags = 0; private int flags = 0;
// Define the optional flags from Pattern that can be used. // Define the optional flags from Pattern that can be used.
// Do this here to keep Pattern contained within this class. // Do this here to keep Pattern contained within this class.
@ -66,32 +69,59 @@ public class JavaUtilRegexCapabilities implements RegexCapabilities {
this.flags = flags; this.flags = flags;
} }
public void compile(String pattern) { public RegexCapabilities.RegexMatcher compile(String regex) {
this.pattern = Pattern.compile(pattern, this.flags); return new JavaUtilRegexMatcher(regex, flags);
} }
public boolean match(String string) { @Override
return pattern.matcher(string).matches(); public int hashCode() {
} final int prime = 31;
int result = 1;
public String prefix() { result = prime * result + flags;
return null; return result;
} }
@Override @Override
public boolean equals(Object o) { public boolean equals(Object obj) {
if (this == o) return true; if (this == obj) return true;
if (o == null || getClass() != o.getClass()) return false; if (obj == null) return false;
if (getClass() != obj.getClass()) return false;
final JavaUtilRegexCapabilities that = (JavaUtilRegexCapabilities) o; JavaUtilRegexCapabilities other = (JavaUtilRegexCapabilities) obj;
if (flags != other.flags) return false;
if (pattern != null ? !pattern.equals(that.pattern) : that.pattern != null) return false;
return true; return true;
} }
@Override class JavaUtilRegexMatcher implements RegexCapabilities.RegexMatcher {
public int hashCode() { private final Pattern pattern;
return (pattern != null ? pattern.hashCode() : 0); private final Matcher matcher;
private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
private final CharSequence utf16wrapper = new CharSequence() {
public int length() {
return utf16.length;
}
public char charAt(int index) {
return utf16.result[index];
}
public CharSequence subSequence(int start, int end) {
return new String(utf16.result, start, end - start);
}
};
public JavaUtilRegexMatcher(String regex, int flags) {
this.pattern = Pattern.compile(regex, flags);
this.matcher = this.pattern.matcher(utf16wrapper);
}
public boolean match(BytesRef term) {
UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
return matcher.reset().matches();
}
public String prefix() {
return null;
}
} }
} }

View File

@ -1,5 +1,9 @@
package org.apache.lucene.search.regex; package org.apache.lucene.search.regex;
import java.io.Serializable;
import org.apache.lucene.util.BytesRef;
/** /**
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -21,7 +25,7 @@ package org.apache.lucene.search.regex;
* Defines basic operations needed by {@link RegexQuery} for a regular * Defines basic operations needed by {@link RegexQuery} for a regular
* expression implementation. * expression implementation.
*/ */
public interface RegexCapabilities { public interface RegexCapabilities extends Serializable {
/** /**
* Called by the constructor of {@link RegexTermEnum} allowing * Called by the constructor of {@link RegexTermEnum} allowing
* implementations to cache a compiled version of the regular * implementations to cache a compiled version of the regular
@ -29,20 +33,22 @@ public interface RegexCapabilities {
* *
* @param pattern regular expression pattern * @param pattern regular expression pattern
*/ */
void compile(String pattern); public RegexMatcher compile(String pattern);
/** public interface RegexMatcher {
* /**
* @param string *
* @return true if string matches the pattern last passed to {@link #compile}. * @param string
*/ * @return true if string matches the pattern last passed to {@link #compile}.
boolean match(String string); */
public boolean match(BytesRef term);
/** /**
* A wise prefix implementation can reduce the term enumeration (and thus increase performance) * A wise prefix implementation can reduce the term enumeration (and thus increase performance)
* of RegexQuery dramatically! * of RegexQuery dramatically!
* *
* @return static non-regex prefix of the pattern last passed to {@link #compile}. May return null. * @return static non-regex prefix of the pattern last passed to {@link #compile}. May return null.
*/ */
String prefix(); public String prefix();
}
} }

View File

@ -76,23 +76,27 @@ public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {
return buffer.toString(); return buffer.toString();
} }
/* generated by IntelliJ IDEA */
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
if (!super.equals(o)) return false;
final RegexQuery that = (RegexQuery) o;
return regexImpl.equals(that.regexImpl);
}
/* generated by IntelliJ IDEA */
@Override @Override
public int hashCode() { public int hashCode() {
final int prime = 31;
int result = super.hashCode(); int result = super.hashCode();
result = 29 * result + regexImpl.hashCode(); result = prime * result + ((regexImpl == null) ? 0 : regexImpl.hashCode());
result = prime * result + ((term == null) ? 0 : term.hashCode());
return result; return result;
} }
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (!super.equals(obj)) return false;
if (getClass() != obj.getClass()) return false;
RegexQuery other = (RegexQuery) obj;
if (regexImpl == null) {
if (other.regexImpl != null) return false;
} else if (!regexImpl.equals(other.regexImpl)) return false;
if (term == null) {
if (other.term != null) return false;
} else if (!term.equals(other.term)) return false;
return true;
}
} }

View File

@ -34,15 +34,13 @@ import java.io.IOException;
*/ */
public class RegexTermsEnum extends FilteredTermsEnum { public class RegexTermsEnum extends FilteredTermsEnum {
private RegexCapabilities regexImpl; private RegexCapabilities.RegexMatcher regexImpl;
private final BytesRef prefixRef; private final BytesRef prefixRef;
public RegexTermsEnum(IndexReader reader, Term term, RegexCapabilities regexImpl) throws IOException { public RegexTermsEnum(IndexReader reader, Term term, RegexCapabilities regexCap) throws IOException {
super(reader, term.field()); super(reader, term.field());
String text = term.text(); String text = term.text();
this.regexImpl = regexImpl; this.regexImpl = regexCap.compile(text);
regexImpl.compile(text);
String pre = regexImpl.prefix(); String pre = regexImpl.prefix();
if (pre == null) pre = ""; if (pre == null) pre = "";
@ -55,8 +53,7 @@ public class RegexTermsEnum extends FilteredTermsEnum {
if (term.startsWith(prefixRef)) { if (term.startsWith(prefixRef)) {
// TODO: set BoostAttr based on distance of // TODO: set BoostAttr based on distance of
// searchTerm.text() and term().text() // searchTerm.text() and term().text()
String text = term.utf8ToString(); return regexImpl.match(term) ? AcceptStatus.YES : AcceptStatus.NO;
return regexImpl.match(text) ? AcceptStatus.YES : AcceptStatus.NO;
} else { } else {
return AcceptStatus.NO; return AcceptStatus.NO;
} }

View File

@ -17,6 +17,7 @@ package org.apache.lucene.search.regex;
* limitations under the License. * limitations under the License.
*/ */
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
/** /**
@ -26,21 +27,21 @@ public class TestJakartaRegexpCapabilities extends LuceneTestCase {
public void testGetPrefix(){ public void testGetPrefix(){
JakartaRegexpCapabilities cap = new JakartaRegexpCapabilities(); JakartaRegexpCapabilities cap = new JakartaRegexpCapabilities();
cap.compile("luc[e]?"); RegexCapabilities.RegexMatcher matcher = cap.compile("luc[e]?");
assertTrue(cap.match("luce")); assertTrue(matcher.match(new BytesRef("luce")));
assertEquals("luc", cap.prefix()); assertEquals("luc", matcher.prefix());
cap.compile("lucene"); matcher = cap.compile("lucene");
assertTrue(cap.match("lucene")); assertTrue(matcher.match(new BytesRef("lucene")));
assertEquals("lucene", cap.prefix()); assertEquals("lucene", matcher.prefix());
} }
public void testShakyPrefix(){ public void testShakyPrefix(){
JakartaRegexpCapabilities cap = new JakartaRegexpCapabilities(); JakartaRegexpCapabilities cap = new JakartaRegexpCapabilities();
cap.compile("(ab|ac)"); RegexCapabilities.RegexMatcher matcher = cap.compile("(ab|ac)");
assertTrue(cap.match("ab")); assertTrue(matcher.match(new BytesRef("ab")));
assertTrue(cap.match("ac")); assertTrue(matcher.match(new BytesRef("ac")));
// why is it not a??? // why is it not a???
assertNull(cap.prefix()); assertNull(matcher.prefix());
} }
} }