mirror of https://github.com/apache/lucene.git
LUCENE-2606: optimize contrib/regex for flex
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@987129 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f50e111501
commit
4ec28930c9
|
@ -23,6 +23,12 @@ New Features
|
||||||
* LUCENE-2479: Added ability to provide a sort comparator for spelling suggestions along
|
* LUCENE-2479: Added ability to provide a sort comparator for spelling suggestions along
|
||||||
with two implementations. The existing comparator (score, then frequency) is the default (Grant Ingersoll)
|
with two implementations. The existing comparator (score, then frequency) is the default (Grant Ingersoll)
|
||||||
|
|
||||||
|
API Changes
|
||||||
|
|
||||||
|
* LUCENE-2606: Changed RegexCapabilities interface to fix thread
|
||||||
|
safety, serialization, and performance problems. If you have
|
||||||
|
written a custom RegexCapabilities it will need to be updated
|
||||||
|
to the new API. (Robert Muir, Uwe Schindler)
|
||||||
======================= Lucene 3.x (not yet released) =======================
|
======================= Lucene 3.x (not yet released) =======================
|
||||||
|
|
||||||
Changes in backwards compatibility policy
|
Changes in backwards compatibility policy
|
||||||
|
|
|
@ -17,6 +17,9 @@ package org.apache.lucene.search.regex;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.UnicodeUtil;
|
||||||
|
import org.apache.regexp.CharacterIterator;
|
||||||
import org.apache.regexp.RE;
|
import org.apache.regexp.RE;
|
||||||
import org.apache.regexp.REProgram;
|
import org.apache.regexp.REProgram;
|
||||||
import java.lang.reflect.Field;
|
import java.lang.reflect.Field;
|
||||||
|
@ -30,8 +33,6 @@ import java.lang.reflect.Method;
|
||||||
* it doesn't always provide a prefix even if one would exist.
|
* it doesn't always provide a prefix even if one would exist.
|
||||||
*/
|
*/
|
||||||
public class JakartaRegexpCapabilities implements RegexCapabilities {
|
public class JakartaRegexpCapabilities implements RegexCapabilities {
|
||||||
private RE regexp;
|
|
||||||
|
|
||||||
private static Field prefixField;
|
private static Field prefixField;
|
||||||
private static Method getPrefixMethod;
|
private static Method getPrefixMethod;
|
||||||
static {
|
static {
|
||||||
|
@ -79,12 +80,58 @@ public class JakartaRegexpCapabilities implements RegexCapabilities {
|
||||||
this.flags = flags;
|
this.flags = flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void compile(String pattern) {
|
public RegexCapabilities.RegexMatcher compile(String regex) {
|
||||||
regexp = new RE(pattern, this.flags);
|
return new JakartaRegexMatcher(regex, flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean match(String string) {
|
@Override
|
||||||
return regexp.match(string);
|
public int hashCode() {
|
||||||
|
final int prime = 31;
|
||||||
|
int result = 1;
|
||||||
|
result = prime * result + flags;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object obj) {
|
||||||
|
if (this == obj) return true;
|
||||||
|
if (obj == null) return false;
|
||||||
|
if (getClass() != obj.getClass()) return false;
|
||||||
|
JakartaRegexpCapabilities other = (JakartaRegexpCapabilities) obj;
|
||||||
|
if (flags != other.flags) return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
class JakartaRegexMatcher implements RegexCapabilities.RegexMatcher {
|
||||||
|
private RE regexp;
|
||||||
|
private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
|
||||||
|
private final CharacterIterator utf16wrapper = new CharacterIterator() {
|
||||||
|
|
||||||
|
public char charAt(int pos) {
|
||||||
|
return utf16.result[pos];
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isEnd(int pos) {
|
||||||
|
return pos >= utf16.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String substring(int beginIndex) {
|
||||||
|
return substring(beginIndex, utf16.length);
|
||||||
|
}
|
||||||
|
|
||||||
|
public String substring(int beginIndex, int endIndex) {
|
||||||
|
return new String(utf16.result, beginIndex, endIndex - beginIndex);
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
public JakartaRegexMatcher(String regex, int flags) {
|
||||||
|
regexp = new RE(regex, flags);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean match(BytesRef term) {
|
||||||
|
UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
|
||||||
|
return regexp.match(utf16wrapper, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
public String prefix() {
|
public String prefix() {
|
||||||
|
@ -103,21 +150,5 @@ public class JakartaRegexpCapabilities implements RegexCapabilities {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean equals(Object o) {
|
|
||||||
if (this == o) return true;
|
|
||||||
if (o == null || getClass() != o.getClass()) return false;
|
|
||||||
|
|
||||||
final JakartaRegexpCapabilities that = (JakartaRegexpCapabilities) o;
|
|
||||||
|
|
||||||
if (regexp != null ? !regexp.equals(that.regexp) : that.regexp != null) return false;
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int hashCode() {
|
|
||||||
return (regexp != null ? regexp.hashCode() : 0);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,8 +17,12 @@ package org.apache.lucene.search.regex;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.UnicodeUtil;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* An implementation tying Java's built-in java.util.regex to RegexQuery.
|
* An implementation tying Java's built-in java.util.regex to RegexQuery.
|
||||||
*
|
*
|
||||||
|
@ -27,7 +31,6 @@ import java.util.regex.Pattern;
|
||||||
* attempt to {@link #match} each term for the specified field in the index.
|
* attempt to {@link #match} each term for the specified field in the index.
|
||||||
*/
|
*/
|
||||||
public class JavaUtilRegexCapabilities implements RegexCapabilities {
|
public class JavaUtilRegexCapabilities implements RegexCapabilities {
|
||||||
private Pattern pattern;
|
|
||||||
private int flags = 0;
|
private int flags = 0;
|
||||||
|
|
||||||
// Define the optional flags from Pattern that can be used.
|
// Define the optional flags from Pattern that can be used.
|
||||||
|
@ -66,32 +69,59 @@ public class JavaUtilRegexCapabilities implements RegexCapabilities {
|
||||||
this.flags = flags;
|
this.flags = flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void compile(String pattern) {
|
public RegexCapabilities.RegexMatcher compile(String regex) {
|
||||||
this.pattern = Pattern.compile(pattern, this.flags);
|
return new JavaUtilRegexMatcher(regex, flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean match(String string) {
|
@Override
|
||||||
return pattern.matcher(string).matches();
|
public int hashCode() {
|
||||||
|
final int prime = 31;
|
||||||
|
int result = 1;
|
||||||
|
result = prime * result + flags;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object obj) {
|
||||||
|
if (this == obj) return true;
|
||||||
|
if (obj == null) return false;
|
||||||
|
if (getClass() != obj.getClass()) return false;
|
||||||
|
JavaUtilRegexCapabilities other = (JavaUtilRegexCapabilities) obj;
|
||||||
|
if (flags != other.flags) return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
class JavaUtilRegexMatcher implements RegexCapabilities.RegexMatcher {
|
||||||
|
private final Pattern pattern;
|
||||||
|
private final Matcher matcher;
|
||||||
|
private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
|
||||||
|
private final CharSequence utf16wrapper = new CharSequence() {
|
||||||
|
|
||||||
|
public int length() {
|
||||||
|
return utf16.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
public char charAt(int index) {
|
||||||
|
return utf16.result[index];
|
||||||
|
}
|
||||||
|
|
||||||
|
public CharSequence subSequence(int start, int end) {
|
||||||
|
return new String(utf16.result, start, end - start);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
public JavaUtilRegexMatcher(String regex, int flags) {
|
||||||
|
this.pattern = Pattern.compile(regex, flags);
|
||||||
|
this.matcher = this.pattern.matcher(utf16wrapper);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean match(BytesRef term) {
|
||||||
|
UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
|
||||||
|
return matcher.reset().matches();
|
||||||
}
|
}
|
||||||
|
|
||||||
public String prefix() {
|
public String prefix() {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean equals(Object o) {
|
|
||||||
if (this == o) return true;
|
|
||||||
if (o == null || getClass() != o.getClass()) return false;
|
|
||||||
|
|
||||||
final JavaUtilRegexCapabilities that = (JavaUtilRegexCapabilities) o;
|
|
||||||
|
|
||||||
if (pattern != null ? !pattern.equals(that.pattern) : that.pattern != null) return false;
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int hashCode() {
|
|
||||||
return (pattern != null ? pattern.hashCode() : 0);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,9 @@
|
||||||
package org.apache.lucene.search.regex;
|
package org.apache.lucene.search.regex;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
@ -21,7 +25,7 @@ package org.apache.lucene.search.regex;
|
||||||
* Defines basic operations needed by {@link RegexQuery} for a regular
|
* Defines basic operations needed by {@link RegexQuery} for a regular
|
||||||
* expression implementation.
|
* expression implementation.
|
||||||
*/
|
*/
|
||||||
public interface RegexCapabilities {
|
public interface RegexCapabilities extends Serializable {
|
||||||
/**
|
/**
|
||||||
* Called by the constructor of {@link RegexTermsEnum} allowing
|
* Called by the constructor of {@link RegexTermsEnum} allowing
|
||||||
* implementations to cache a compiled version of the regular
|
* implementations to cache a compiled version of the regular
|
||||||
|
@ -29,14 +33,15 @@ public interface RegexCapabilities {
|
||||||
*
|
*
|
||||||
* @param pattern regular expression pattern
|
* @param pattern regular expression pattern
|
||||||
*/
|
*/
|
||||||
void compile(String pattern);
|
public RegexMatcher compile(String pattern);
|
||||||
|
|
||||||
|
public interface RegexMatcher {
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
* @param string
|
* @param term the term to test against the compiled pattern
|
||||||
* @return true if string matches the pattern last passed to {@link #compile}.
|
* @return true if term matches the pattern that was passed to {@link RegexCapabilities#compile}.
|
||||||
*/
|
*/
|
||||||
boolean match(String string);
|
public boolean match(BytesRef term);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A wise prefix implementation can reduce the term enumeration (and thus increase performance)
|
* A wise prefix implementation can reduce the term enumeration (and thus increase performance)
|
||||||
|
@ -44,5 +49,6 @@ public interface RegexCapabilities {
|
||||||
*
|
*
|
||||||
* @return static non-regex prefix of the pattern last passed to {@link #compile}. May return null.
|
* @return static non-regex prefix of the pattern last passed to {@link #compile}. May return null.
|
||||||
*/
|
*/
|
||||||
String prefix();
|
public String prefix();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -76,23 +76,27 @@ public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {
|
||||||
return buffer.toString();
|
return buffer.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
/* generated by IntelliJ IDEA */
|
|
||||||
@Override
|
|
||||||
public boolean equals(Object o) {
|
|
||||||
if (this == o) return true;
|
|
||||||
if (o == null || getClass() != o.getClass()) return false;
|
|
||||||
if (!super.equals(o)) return false;
|
|
||||||
|
|
||||||
final RegexQuery that = (RegexQuery) o;
|
|
||||||
|
|
||||||
return regexImpl.equals(that.regexImpl);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* generated by IntelliJ IDEA */
|
|
||||||
@Override
|
@Override
|
||||||
public int hashCode() {
|
public int hashCode() {
|
||||||
|
final int prime = 31;
|
||||||
int result = super.hashCode();
|
int result = super.hashCode();
|
||||||
result = 29 * result + regexImpl.hashCode();
|
result = prime * result + ((regexImpl == null) ? 0 : regexImpl.hashCode());
|
||||||
|
result = prime * result + ((term == null) ? 0 : term.hashCode());
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object obj) {
|
||||||
|
if (this == obj) return true;
|
||||||
|
if (!super.equals(obj)) return false;
|
||||||
|
if (getClass() != obj.getClass()) return false;
|
||||||
|
RegexQuery other = (RegexQuery) obj;
|
||||||
|
if (regexImpl == null) {
|
||||||
|
if (other.regexImpl != null) return false;
|
||||||
|
} else if (!regexImpl.equals(other.regexImpl)) return false;
|
||||||
|
if (term == null) {
|
||||||
|
if (other.term != null) return false;
|
||||||
|
} else if (!term.equals(other.term)) return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -34,15 +34,13 @@ import java.io.IOException;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class RegexTermsEnum extends FilteredTermsEnum {
|
public class RegexTermsEnum extends FilteredTermsEnum {
|
||||||
private RegexCapabilities regexImpl;
|
private RegexCapabilities.RegexMatcher regexImpl;
|
||||||
private final BytesRef prefixRef;
|
private final BytesRef prefixRef;
|
||||||
|
|
||||||
public RegexTermsEnum(IndexReader reader, Term term, RegexCapabilities regexImpl) throws IOException {
|
public RegexTermsEnum(IndexReader reader, Term term, RegexCapabilities regexCap) throws IOException {
|
||||||
super(reader, term.field());
|
super(reader, term.field());
|
||||||
String text = term.text();
|
String text = term.text();
|
||||||
this.regexImpl = regexImpl;
|
this.regexImpl = regexCap.compile(text);
|
||||||
|
|
||||||
regexImpl.compile(text);
|
|
||||||
|
|
||||||
String pre = regexImpl.prefix();
|
String pre = regexImpl.prefix();
|
||||||
if (pre == null) pre = "";
|
if (pre == null) pre = "";
|
||||||
|
@ -55,8 +53,7 @@ public class RegexTermsEnum extends FilteredTermsEnum {
|
||||||
if (term.startsWith(prefixRef)) {
|
if (term.startsWith(prefixRef)) {
|
||||||
// TODO: set BoostAttr based on distance of
|
// TODO: set BoostAttr based on distance of
|
||||||
// searchTerm.text() and term().text()
|
// searchTerm.text() and term().text()
|
||||||
String text = term.utf8ToString();
|
return regexImpl.match(term) ? AcceptStatus.YES : AcceptStatus.NO;
|
||||||
return regexImpl.match(text) ? AcceptStatus.YES : AcceptStatus.NO;
|
|
||||||
} else {
|
} else {
|
||||||
return AcceptStatus.NO;
|
return AcceptStatus.NO;
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,6 +17,7 @@ package org.apache.lucene.search.regex;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -26,21 +27,21 @@ public class TestJakartaRegexpCapabilities extends LuceneTestCase {
|
||||||
|
|
||||||
public void testGetPrefix(){
|
public void testGetPrefix(){
|
||||||
JakartaRegexpCapabilities cap = new JakartaRegexpCapabilities();
|
JakartaRegexpCapabilities cap = new JakartaRegexpCapabilities();
|
||||||
cap.compile("luc[e]?");
|
RegexCapabilities.RegexMatcher matcher = cap.compile("luc[e]?");
|
||||||
assertTrue(cap.match("luce"));
|
assertTrue(matcher.match(new BytesRef("luce")));
|
||||||
assertEquals("luc", cap.prefix());
|
assertEquals("luc", matcher.prefix());
|
||||||
|
|
||||||
cap.compile("lucene");
|
matcher = cap.compile("lucene");
|
||||||
assertTrue(cap.match("lucene"));
|
assertTrue(matcher.match(new BytesRef("lucene")));
|
||||||
assertEquals("lucene", cap.prefix());
|
assertEquals("lucene", matcher.prefix());
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testShakyPrefix(){
|
public void testShakyPrefix(){
|
||||||
JakartaRegexpCapabilities cap = new JakartaRegexpCapabilities();
|
JakartaRegexpCapabilities cap = new JakartaRegexpCapabilities();
|
||||||
cap.compile("(ab|ac)");
|
RegexCapabilities.RegexMatcher matcher = cap.compile("(ab|ac)");
|
||||||
assertTrue(cap.match("ab"));
|
assertTrue(matcher.match(new BytesRef("ab")));
|
||||||
assertTrue(cap.match("ac"));
|
assertTrue(matcher.match(new BytesRef("ac")));
|
||||||
// why is it not a???
|
// why is it not a???
|
||||||
assertNull(cap.prefix());
|
assertNull(matcher.prefix());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue