mirror of https://github.com/apache/lucene.git
LUCENE-2606: optimize contrib/regex for flex
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@987129 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f50e111501
commit
4ec28930c9
|
@ -23,6 +23,12 @@ New Features
|
|||
* LUCENE-2479: Added ability to provide a sort comparator for spelling suggestions along
|
||||
with two implementations. The existing comparator (score, then frequency) is the default (Grant Ingersoll)
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-2606: Changed RegexCapabilities interface to fix thread
|
||||
safety, serialization, and performance problems. If you have
|
||||
written a custom RegexCapabilities it will need to be updated
|
||||
to the new API. (Robert Muir, Uwe Schindler)
|
||||
======================= Lucene 3.x (not yet released) =======================
|
||||
|
||||
Changes in backwards compatibility policy
|
||||
|
|
|
@ -17,6 +17,9 @@ package org.apache.lucene.search.regex;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.regexp.CharacterIterator;
|
||||
import org.apache.regexp.RE;
|
||||
import org.apache.regexp.REProgram;
|
||||
import java.lang.reflect.Field;
|
||||
|
@ -30,8 +33,6 @@ import java.lang.reflect.Method;
|
|||
* it doesn't always provide a prefix even if one would exist.
|
||||
*/
|
||||
public class JakartaRegexpCapabilities implements RegexCapabilities {
|
||||
private RE regexp;
|
||||
|
||||
private static Field prefixField;
|
||||
private static Method getPrefixMethod;
|
||||
static {
|
||||
|
@ -79,45 +80,75 @@ public class JakartaRegexpCapabilities implements RegexCapabilities {
|
|||
this.flags = flags;
|
||||
}
|
||||
|
||||
public void compile(String pattern) {
|
||||
regexp = new RE(pattern, this.flags);
|
||||
}
|
||||
|
||||
public boolean match(String string) {
|
||||
return regexp.match(string);
|
||||
}
|
||||
|
||||
public String prefix() {
|
||||
try {
|
||||
final char[] prefix;
|
||||
if (getPrefixMethod != null) {
|
||||
prefix = (char[]) getPrefixMethod.invoke(regexp.getProgram());
|
||||
} else if (prefixField != null) {
|
||||
prefix = (char[]) prefixField.get(regexp.getProgram());
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
return prefix == null ? null : new String(prefix);
|
||||
} catch (Exception e) {
|
||||
// if we cannot get the prefix, return none
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
|
||||
final JakartaRegexpCapabilities that = (JakartaRegexpCapabilities) o;
|
||||
|
||||
if (regexp != null ? !regexp.equals(that.regexp) : that.regexp != null) return false;
|
||||
|
||||
return true;
|
||||
public RegexCapabilities.RegexMatcher compile(String regex) {
|
||||
return new JakartaRegexMatcher(regex, flags);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return (regexp != null ? regexp.hashCode() : 0);
|
||||
final int prime = 31;
|
||||
int result = 1;
|
||||
result = prime * result + flags;
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj) return true;
|
||||
if (obj == null) return false;
|
||||
if (getClass() != obj.getClass()) return false;
|
||||
JakartaRegexpCapabilities other = (JakartaRegexpCapabilities) obj;
|
||||
if (flags != other.flags) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
class JakartaRegexMatcher implements RegexCapabilities.RegexMatcher {
|
||||
private RE regexp;
|
||||
private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
|
||||
private final CharacterIterator utf16wrapper = new CharacterIterator() {
|
||||
|
||||
public char charAt(int pos) {
|
||||
return utf16.result[pos];
|
||||
}
|
||||
|
||||
public boolean isEnd(int pos) {
|
||||
return pos >= utf16.length;
|
||||
}
|
||||
|
||||
public String substring(int beginIndex) {
|
||||
return substring(beginIndex, utf16.length);
|
||||
}
|
||||
|
||||
public String substring(int beginIndex, int endIndex) {
|
||||
return new String(utf16.result, beginIndex, endIndex - beginIndex);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
public JakartaRegexMatcher(String regex, int flags) {
|
||||
regexp = new RE(regex, flags);
|
||||
}
|
||||
|
||||
public boolean match(BytesRef term) {
|
||||
UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
|
||||
return regexp.match(utf16wrapper, 0);
|
||||
}
|
||||
|
||||
public String prefix() {
|
||||
try {
|
||||
final char[] prefix;
|
||||
if (getPrefixMethod != null) {
|
||||
prefix = (char[]) getPrefixMethod.invoke(regexp.getProgram());
|
||||
} else if (prefixField != null) {
|
||||
prefix = (char[]) prefixField.get(regexp.getProgram());
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
return prefix == null ? null : new String(prefix);
|
||||
} catch (Exception e) {
|
||||
// if we cannot get the prefix, return none
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,8 +17,12 @@ package org.apache.lucene.search.regex;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
|
||||
/**
|
||||
* An implementation tying Java's built-in java.util.regex to RegexQuery.
|
||||
*
|
||||
|
@ -27,9 +31,8 @@ import java.util.regex.Pattern;
|
|||
* attempt to {@link #match} each term for the specified field in the index.
|
||||
*/
|
||||
public class JavaUtilRegexCapabilities implements RegexCapabilities {
|
||||
private Pattern pattern;
|
||||
private int flags = 0;
|
||||
|
||||
|
||||
// Define the optional flags from Pattern that can be used.
|
||||
// Do this here to keep Pattern contained within this class.
|
||||
|
||||
|
@ -66,32 +69,59 @@ public class JavaUtilRegexCapabilities implements RegexCapabilities {
|
|||
this.flags = flags;
|
||||
}
|
||||
|
||||
public void compile(String pattern) {
|
||||
this.pattern = Pattern.compile(pattern, this.flags);
|
||||
public RegexCapabilities.RegexMatcher compile(String regex) {
|
||||
return new JavaUtilRegexMatcher(regex, flags);
|
||||
}
|
||||
|
||||
public boolean match(String string) {
|
||||
return pattern.matcher(string).matches();
|
||||
}
|
||||
|
||||
public String prefix() {
|
||||
return null;
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 31;
|
||||
int result = 1;
|
||||
result = prime * result + flags;
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
|
||||
final JavaUtilRegexCapabilities that = (JavaUtilRegexCapabilities) o;
|
||||
|
||||
if (pattern != null ? !pattern.equals(that.pattern) : that.pattern != null) return false;
|
||||
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj) return true;
|
||||
if (obj == null) return false;
|
||||
if (getClass() != obj.getClass()) return false;
|
||||
JavaUtilRegexCapabilities other = (JavaUtilRegexCapabilities) obj;
|
||||
if (flags != other.flags) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return (pattern != null ? pattern.hashCode() : 0);
|
||||
class JavaUtilRegexMatcher implements RegexCapabilities.RegexMatcher {
|
||||
private final Pattern pattern;
|
||||
private final Matcher matcher;
|
||||
private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
|
||||
private final CharSequence utf16wrapper = new CharSequence() {
|
||||
|
||||
public int length() {
|
||||
return utf16.length;
|
||||
}
|
||||
|
||||
public char charAt(int index) {
|
||||
return utf16.result[index];
|
||||
}
|
||||
|
||||
public CharSequence subSequence(int start, int end) {
|
||||
return new String(utf16.result, start, end - start);
|
||||
}
|
||||
};
|
||||
|
||||
public JavaUtilRegexMatcher(String regex, int flags) {
|
||||
this.pattern = Pattern.compile(regex, flags);
|
||||
this.matcher = this.pattern.matcher(utf16wrapper);
|
||||
}
|
||||
|
||||
public boolean match(BytesRef term) {
|
||||
UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
|
||||
return matcher.reset().matches();
|
||||
}
|
||||
|
||||
public String prefix() {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,5 +1,9 @@
|
|||
package org.apache.lucene.search.regex;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
|
@ -21,7 +25,7 @@ package org.apache.lucene.search.regex;
|
|||
* Defines basic operations needed by {@link RegexQuery} for a regular
|
||||
* expression implementation.
|
||||
*/
|
||||
public interface RegexCapabilities {
|
||||
public interface RegexCapabilities extends Serializable {
|
||||
/**
|
||||
* Called by the constructor of {@link RegexTermEnum} allowing
|
||||
* implementations to cache a compiled version of the regular
|
||||
|
@ -29,20 +33,22 @@ public interface RegexCapabilities {
|
|||
*
|
||||
* @param pattern regular expression pattern
|
||||
*/
|
||||
void compile(String pattern);
|
||||
public RegexMatcher compile(String pattern);
|
||||
|
||||
/**
|
||||
*
|
||||
* @param string
|
||||
* @return true if string matches the pattern last passed to {@link #compile}.
|
||||
*/
|
||||
boolean match(String string);
|
||||
public interface RegexMatcher {
|
||||
/**
|
||||
*
|
||||
* @param string
|
||||
* @return true if string matches the pattern last passed to {@link #compile}.
|
||||
*/
|
||||
public boolean match(BytesRef term);
|
||||
|
||||
/**
|
||||
* A wise prefix implementation can reduce the term enumeration (and thus increase performance)
|
||||
* of RegexQuery dramatically!
|
||||
*
|
||||
* @return static non-regex prefix of the pattern last passed to {@link #compile}. May return null.
|
||||
*/
|
||||
String prefix();
|
||||
/**
|
||||
* A wise prefix implementation can reduce the term enumeration (and thus increase performance)
|
||||
* of RegexQuery dramatically!
|
||||
*
|
||||
* @return static non-regex prefix of the pattern last passed to {@link #compile}. May return null.
|
||||
*/
|
||||
public String prefix();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -76,23 +76,27 @@ public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {
|
|||
return buffer.toString();
|
||||
}
|
||||
|
||||
/* generated by IntelliJ IDEA */
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
if (!super.equals(o)) return false;
|
||||
|
||||
final RegexQuery that = (RegexQuery) o;
|
||||
|
||||
return regexImpl.equals(that.regexImpl);
|
||||
}
|
||||
|
||||
/* generated by IntelliJ IDEA */
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 31;
|
||||
int result = super.hashCode();
|
||||
result = 29 * result + regexImpl.hashCode();
|
||||
result = prime * result + ((regexImpl == null) ? 0 : regexImpl.hashCode());
|
||||
result = prime * result + ((term == null) ? 0 : term.hashCode());
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj) return true;
|
||||
if (!super.equals(obj)) return false;
|
||||
if (getClass() != obj.getClass()) return false;
|
||||
RegexQuery other = (RegexQuery) obj;
|
||||
if (regexImpl == null) {
|
||||
if (other.regexImpl != null) return false;
|
||||
} else if (!regexImpl.equals(other.regexImpl)) return false;
|
||||
if (term == null) {
|
||||
if (other.term != null) return false;
|
||||
} else if (!term.equals(other.term)) return false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -34,15 +34,13 @@ import java.io.IOException;
|
|||
*/
|
||||
|
||||
public class RegexTermsEnum extends FilteredTermsEnum {
|
||||
private RegexCapabilities regexImpl;
|
||||
private RegexCapabilities.RegexMatcher regexImpl;
|
||||
private final BytesRef prefixRef;
|
||||
|
||||
public RegexTermsEnum(IndexReader reader, Term term, RegexCapabilities regexImpl) throws IOException {
|
||||
public RegexTermsEnum(IndexReader reader, Term term, RegexCapabilities regexCap) throws IOException {
|
||||
super(reader, term.field());
|
||||
String text = term.text();
|
||||
this.regexImpl = regexImpl;
|
||||
|
||||
regexImpl.compile(text);
|
||||
this.regexImpl = regexCap.compile(text);
|
||||
|
||||
String pre = regexImpl.prefix();
|
||||
if (pre == null) pre = "";
|
||||
|
@ -55,8 +53,7 @@ public class RegexTermsEnum extends FilteredTermsEnum {
|
|||
if (term.startsWith(prefixRef)) {
|
||||
// TODO: set BoostAttr based on distance of
|
||||
// searchTerm.text() and term().text()
|
||||
String text = term.utf8ToString();
|
||||
return regexImpl.match(text) ? AcceptStatus.YES : AcceptStatus.NO;
|
||||
return regexImpl.match(term) ? AcceptStatus.YES : AcceptStatus.NO;
|
||||
} else {
|
||||
return AcceptStatus.NO;
|
||||
}
|
||||
|
|
|
@ -17,6 +17,7 @@ package org.apache.lucene.search.regex;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
/**
|
||||
|
@ -26,21 +27,21 @@ public class TestJakartaRegexpCapabilities extends LuceneTestCase {
|
|||
|
||||
public void testGetPrefix(){
|
||||
JakartaRegexpCapabilities cap = new JakartaRegexpCapabilities();
|
||||
cap.compile("luc[e]?");
|
||||
assertTrue(cap.match("luce"));
|
||||
assertEquals("luc", cap.prefix());
|
||||
RegexCapabilities.RegexMatcher matcher = cap.compile("luc[e]?");
|
||||
assertTrue(matcher.match(new BytesRef("luce")));
|
||||
assertEquals("luc", matcher.prefix());
|
||||
|
||||
cap.compile("lucene");
|
||||
assertTrue(cap.match("lucene"));
|
||||
assertEquals("lucene", cap.prefix());
|
||||
matcher = cap.compile("lucene");
|
||||
assertTrue(matcher.match(new BytesRef("lucene")));
|
||||
assertEquals("lucene", matcher.prefix());
|
||||
}
|
||||
|
||||
public void testShakyPrefix(){
|
||||
JakartaRegexpCapabilities cap = new JakartaRegexpCapabilities();
|
||||
cap.compile("(ab|ac)");
|
||||
assertTrue(cap.match("ab"));
|
||||
assertTrue(cap.match("ac"));
|
||||
RegexCapabilities.RegexMatcher matcher = cap.compile("(ab|ac)");
|
||||
assertTrue(matcher.match(new BytesRef("ab")));
|
||||
assertTrue(matcher.match(new BytesRef("ac")));
|
||||
// why is it not a???
|
||||
assertNull(cap.prefix());
|
||||
assertNull(matcher.prefix());
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue