mirror of https://github.com/apache/lucene.git
LUCENE-2907: fix automaton thread hazard: change query to compute immutable compiled form in ctor
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1067720 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
70cbc8acab
commit
32b914572d
|
@ -18,15 +18,15 @@ package org.apache.lucene.search;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.AutomatonTermsEnum.CompiledAutomaton;
|
||||
import org.apache.lucene.util.ToStringUtils;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.ByteRunAutomaton;
|
||||
import org.apache.lucene.util.automaton.BasicAutomata;
|
||||
import org.apache.lucene.util.automaton.BasicOperations;
|
||||
import org.apache.lucene.util.automaton.MinimizationOperations;
|
||||
|
@ -56,9 +56,16 @@ public class AutomatonQuery extends MultiTermQuery {
|
|||
/** term containing the field, and possibly some pattern structure */
|
||||
protected final Term term;
|
||||
|
||||
transient ByteRunAutomaton runAutomaton;
|
||||
transient boolean isFinite;
|
||||
transient BytesRef commonSuffixRef;
|
||||
/**
|
||||
* abstraction for returning a termsenum:
|
||||
* in the ctor the query computes one of these, the actual
|
||||
* implementation depends upon the automaton's structure.
|
||||
*/
|
||||
private abstract class TermsEnumFactory implements Serializable {
|
||||
protected abstract TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException;
|
||||
}
|
||||
|
||||
private final TermsEnumFactory factory;
|
||||
|
||||
/**
|
||||
* Create a new AutomatonQuery from an {@link Automaton}.
|
||||
|
@ -68,60 +75,77 @@ public class AutomatonQuery extends MultiTermQuery {
|
|||
* @param automaton Automaton to run, terms that are accepted are considered a
|
||||
* match.
|
||||
*/
|
||||
public AutomatonQuery(Term term, Automaton automaton) {
|
||||
public AutomatonQuery(final Term term, Automaton automaton) {
|
||||
super(term.field());
|
||||
this.term = term;
|
||||
this.automaton = automaton;
|
||||
MinimizationOperations.minimize(automaton);
|
||||
|
||||
if (BasicOperations.isEmpty(automaton)) {
|
||||
// matches nothing
|
||||
factory = new TermsEnumFactory() {
|
||||
@Override
|
||||
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
|
||||
return TermsEnum.EMPTY;
|
||||
}
|
||||
};
|
||||
} else if (BasicOperations.isTotal(automaton)) {
|
||||
// matches all possible strings
|
||||
factory = new TermsEnumFactory() {
|
||||
@Override
|
||||
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
|
||||
return terms.iterator();
|
||||
}
|
||||
};
|
||||
} else {
|
||||
final String singleton;
|
||||
final String commonPrefix;
|
||||
|
||||
if (automaton.getSingleton() == null) {
|
||||
commonPrefix = SpecialOperations.getCommonPrefix(automaton);
|
||||
if (commonPrefix.length() > 0 && BasicOperations.sameLanguage(automaton, BasicAutomata.makeString(commonPrefix))) {
|
||||
singleton = commonPrefix;
|
||||
} else {
|
||||
singleton = null;
|
||||
}
|
||||
} else {
|
||||
commonPrefix = null;
|
||||
singleton = automaton.getSingleton();
|
||||
}
|
||||
|
||||
private synchronized void compileAutomaton() {
|
||||
// this method must be synchronized, as setting the three transient fields is not atomic:
|
||||
if (runAutomaton == null) {
|
||||
runAutomaton = new ByteRunAutomaton(automaton);
|
||||
isFinite = SpecialOperations.isFinite(automaton);
|
||||
commonSuffixRef = isFinite ? null : SpecialOperations.getCommonSuffixBytesRef(runAutomaton.getAutomaton());
|
||||
if (singleton != null) {
|
||||
// matches a fixed string in singleton or expanded representation
|
||||
factory = new TermsEnumFactory() {
|
||||
@Override
|
||||
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
|
||||
return new SingleTermsEnum(terms.iterator(), term.createTerm(singleton));
|
||||
}
|
||||
};
|
||||
} else if (BasicOperations.sameLanguage(automaton, BasicOperations.concatenate(
|
||||
BasicAutomata.makeString(commonPrefix), BasicAutomata.makeAnyString()))) {
|
||||
// matches a constant prefix
|
||||
factory = new TermsEnumFactory() {
|
||||
@Override
|
||||
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
|
||||
return new PrefixTermsEnum(terms.iterator(), term.createTerm(commonPrefix));
|
||||
}
|
||||
};
|
||||
} else {
|
||||
final AutomatonTermsEnum.CompiledAutomaton compiled =
|
||||
new CompiledAutomaton(automaton, SpecialOperations.isFinite(automaton));
|
||||
factory = new TermsEnumFactory() {
|
||||
@Override
|
||||
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
|
||||
return new AutomatonTermsEnum(terms.iterator(), compiled);
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
|
||||
// matches nothing
|
||||
if (BasicOperations.isEmpty(automaton)) {
|
||||
return TermsEnum.EMPTY;
|
||||
}
|
||||
|
||||
TermsEnum tenum = terms.iterator();
|
||||
|
||||
// matches all possible strings
|
||||
if (BasicOperations.isTotal(automaton)) {
|
||||
return tenum;
|
||||
}
|
||||
|
||||
// matches a fixed string in singleton representation
|
||||
String singleton = automaton.getSingleton();
|
||||
if (singleton != null)
|
||||
return new SingleTermsEnum(tenum, term.createTerm(singleton));
|
||||
|
||||
// matches a fixed string in expanded representation
|
||||
final String commonPrefix = SpecialOperations.getCommonPrefix(automaton);
|
||||
|
||||
if (commonPrefix.length() > 0) {
|
||||
if (BasicOperations.sameLanguage(automaton, BasicAutomata.makeString(commonPrefix))) {
|
||||
return new SingleTermsEnum(tenum, term.createTerm(commonPrefix));
|
||||
}
|
||||
|
||||
// matches a constant prefix
|
||||
Automaton prefixAutomaton = BasicOperations.concatenate(BasicAutomata
|
||||
.makeString(commonPrefix), BasicAutomata.makeAnyString());
|
||||
if (BasicOperations.sameLanguage(automaton, prefixAutomaton)) {
|
||||
return new PrefixTermsEnum(tenum, term.createTerm(commonPrefix));
|
||||
}
|
||||
}
|
||||
|
||||
compileAutomaton();
|
||||
|
||||
return new AutomatonTermsEnum(runAutomaton, tenum, isFinite, commonSuffixRef);
|
||||
return factory.getTermsEnum(terms, atts);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -27,6 +27,7 @@ import org.apache.lucene.util.automaton.Automaton;
|
|||
import org.apache.lucene.util.automaton.ByteRunAutomaton;
|
||||
import org.apache.lucene.util.automaton.SpecialOperations;
|
||||
import org.apache.lucene.util.automaton.Transition;
|
||||
import org.apache.lucene.util.automaton.UTF32ToUTF8;
|
||||
|
||||
/**
|
||||
* A FilteredTermsEnum that enumerates terms based upon what is accepted by a
|
||||
|
@ -46,8 +47,6 @@ import org.apache.lucene.util.automaton.Transition;
|
|||
* @lucene.experimental
|
||||
*/
|
||||
public class AutomatonTermsEnum extends FilteredTermsEnum {
|
||||
// the object-oriented form of the DFA
|
||||
private final Automaton automaton;
|
||||
// a tableized array-based form of the DFA
|
||||
private final ByteRunAutomaton runAutomaton;
|
||||
// common suffix of the automaton
|
||||
|
@ -71,54 +70,26 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
|
|||
private final Comparator<BytesRef> termComp;
|
||||
|
||||
/**
|
||||
* Expert ctor:
|
||||
* Construct an enumerator based upon an automaton, enumerating the specified
|
||||
* field, working on a supplied TermsEnum
|
||||
* <p>
|
||||
* @lucene.experimental
|
||||
* <p>
|
||||
* @param runAutomaton pre-compiled ByteRunAutomaton
|
||||
* @param finite true if the automaton accepts a finite language
|
||||
* @param compiled CompiledAutomaton
|
||||
*/
|
||||
public AutomatonTermsEnum(ByteRunAutomaton runAutomaton,
|
||||
TermsEnum tenum,
|
||||
boolean finite, BytesRef commonSuffixRef)
|
||||
throws IOException {
|
||||
public AutomatonTermsEnum(TermsEnum tenum, CompiledAutomaton compiled) throws IOException {
|
||||
super(tenum);
|
||||
this.automaton = runAutomaton.getAutomaton();
|
||||
this.finite = finite;
|
||||
this.finite = compiled.finite;
|
||||
this.runAutomaton = compiled.runAutomaton;
|
||||
this.commonSuffixRef = compiled.commonSuffixRef;
|
||||
this.allTransitions = compiled.sortedTransitions;
|
||||
|
||||
this.runAutomaton = runAutomaton;
|
||||
if (finite) {
|
||||
// don't use suffix w/ finite DFAs
|
||||
this.commonSuffixRef = null;
|
||||
} else if (commonSuffixRef == null) {
|
||||
// compute now
|
||||
this.commonSuffixRef = SpecialOperations.getCommonSuffixBytesRef(automaton);
|
||||
} else {
|
||||
// precomputed
|
||||
this.commonSuffixRef = commonSuffixRef;
|
||||
}
|
||||
|
||||
// build a cache of sorted transitions for every state
|
||||
allTransitions = this.automaton.getSortedTransitions();
|
||||
// used for path tracking, where each bit is a numbered state.
|
||||
visited = new long[runAutomaton.getSize()];
|
||||
|
||||
termComp = getComparator();
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct an enumerator based upon an automaton, enumerating the specified
|
||||
* field, working on a supplied TermsEnum
|
||||
* <p>
|
||||
* It will automatically calculate whether or not the automaton is finite
|
||||
*/
|
||||
public AutomatonTermsEnum(Automaton automaton, TermsEnum tenum)
|
||||
throws IOException {
|
||||
this(new ByteRunAutomaton(automaton), tenum, SpecialOperations.isFinite(automaton), null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the term matches the automaton. Also stashes away the term
|
||||
* to assist with smart enumeration.
|
||||
|
@ -350,4 +321,26 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
|
|||
}
|
||||
return -1; /* all solutions exhausted */
|
||||
}
|
||||
|
||||
/**
|
||||
* immutable class with everything this enum needs.
|
||||
*/
|
||||
public static class CompiledAutomaton {
|
||||
public final ByteRunAutomaton runAutomaton;
|
||||
public final Transition[][] sortedTransitions;
|
||||
public final BytesRef commonSuffixRef;
|
||||
public final boolean finite;
|
||||
|
||||
public CompiledAutomaton(Automaton automaton, boolean finite) {
|
||||
Automaton utf8 = new UTF32ToUTF8().convert(automaton);
|
||||
runAutomaton = new ByteRunAutomaton(utf8, true);
|
||||
sortedTransitions = utf8.getSortedTransitions();
|
||||
this.finite = finite;
|
||||
if (finite) {
|
||||
commonSuffixRef = null;
|
||||
} else {
|
||||
commonSuffixRef = SpecialOperations.getCommonSuffixBytesRef(utf8);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,6 +22,7 @@ import org.apache.lucene.index.DocsEnum;
|
|||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.AutomatonTermsEnum.CompiledAutomaton;
|
||||
import org.apache.lucene.util.Attribute;
|
||||
import org.apache.lucene.util.AttributeImpl;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
@ -140,18 +141,18 @@ public final class FuzzyTermsEnum extends TermsEnum {
|
|||
*/
|
||||
private TermsEnum getAutomatonEnum(int editDistance, BytesRef lastTerm)
|
||||
throws IOException {
|
||||
final List<ByteRunAutomaton> runAutomata = initAutomata(editDistance);
|
||||
final List<CompiledAutomaton> runAutomata = initAutomata(editDistance);
|
||||
if (editDistance < runAutomata.size()) {
|
||||
return new AutomatonFuzzyTermsEnum(runAutomata.subList(0, editDistance + 1)
|
||||
.toArray(new ByteRunAutomaton[editDistance + 1]), lastTerm);
|
||||
.toArray(new CompiledAutomaton[editDistance + 1]), lastTerm);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/** initialize levenshtein DFAs up to maxDistance, if possible */
|
||||
private List<ByteRunAutomaton> initAutomata(int maxDistance) {
|
||||
final List<ByteRunAutomaton> runAutomata = dfaAtt.automata();
|
||||
private List<CompiledAutomaton> initAutomata(int maxDistance) {
|
||||
final List<CompiledAutomaton> runAutomata = dfaAtt.automata();
|
||||
if (runAutomata.size() <= maxDistance &&
|
||||
maxDistance <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
|
||||
LevenshteinAutomata builder =
|
||||
|
@ -165,7 +166,7 @@ public final class FuzzyTermsEnum extends TermsEnum {
|
|||
UnicodeUtil.newString(termText, 0, realPrefixLength));
|
||||
a = BasicOperations.concatenate(prefix, a);
|
||||
}
|
||||
runAutomata.add(new ByteRunAutomaton(a));
|
||||
runAutomata.add(new CompiledAutomaton(a, true));
|
||||
}
|
||||
}
|
||||
return runAutomata;
|
||||
|
@ -312,10 +313,12 @@ public final class FuzzyTermsEnum extends TermsEnum {
|
|||
private final BoostAttribute boostAtt =
|
||||
attributes().addAttribute(BoostAttribute.class);
|
||||
|
||||
public AutomatonFuzzyTermsEnum(ByteRunAutomaton matchers[],
|
||||
public AutomatonFuzzyTermsEnum(CompiledAutomaton compiled[],
|
||||
BytesRef lastTerm) throws IOException {
|
||||
super(matchers[matchers.length - 1], tenum, true, null);
|
||||
this.matchers = matchers;
|
||||
super(tenum, compiled[compiled.length - 1]);
|
||||
this.matchers = new ByteRunAutomaton[compiled.length];
|
||||
for (int i = 0; i < compiled.length; i++)
|
||||
this.matchers[i] = compiled[i].runAutomaton;
|
||||
this.lastTerm = lastTerm;
|
||||
termRef = new BytesRef(term.text());
|
||||
}
|
||||
|
@ -563,14 +566,14 @@ public final class FuzzyTermsEnum extends TermsEnum {
|
|||
|
||||
/** @lucene.internal */
|
||||
public static interface LevenshteinAutomataAttribute extends Attribute {
|
||||
public List<ByteRunAutomaton> automata();
|
||||
public List<CompiledAutomaton> automata();
|
||||
}
|
||||
|
||||
/** @lucene.internal */
|
||||
public static final class LevenshteinAutomataAttributeImpl extends AttributeImpl implements LevenshteinAutomataAttribute {
|
||||
private final List<ByteRunAutomaton> automata = new ArrayList<ByteRunAutomaton>();
|
||||
private final List<CompiledAutomaton> automata = new ArrayList<CompiledAutomaton>();
|
||||
|
||||
public List<ByteRunAutomaton> automata() {
|
||||
public List<CompiledAutomaton> automata() {
|
||||
return automata;
|
||||
}
|
||||
|
||||
|
@ -595,7 +598,7 @@ public final class FuzzyTermsEnum extends TermsEnum {
|
|||
|
||||
@Override
|
||||
public void copyTo(AttributeImpl target) {
|
||||
final List<ByteRunAutomaton> targetAutomata =
|
||||
final List<CompiledAutomaton> targetAutomata =
|
||||
((LevenshteinAutomataAttribute) target).automata();
|
||||
targetAutomata.clear();
|
||||
targetAutomata.addAll(automata);
|
||||
|
|
|
@ -66,6 +66,13 @@ import org.apache.lucene.util.RamUsageEstimator;
|
|||
* assumed by the built-in automata operations.
|
||||
*
|
||||
* <p>
|
||||
* <p>
|
||||
* Note: This class has internal mutable state and is not thread safe. It is
|
||||
* the caller's responsibility to ensure any necessary synchronization if you
|
||||
* wish to use the same Automaton from multiple threads. In general it is instead
|
||||
* recommended to use a {@link RunAutomaton} for multithreaded matching: it is immutable,
|
||||
* thread safe, and much faster.
|
||||
* </p>
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class Automaton implements Serializable, Cloneable {
|
||||
|
|
|
@ -20,7 +20,12 @@ package org.apache.lucene.util.automaton;
|
|||
public class ByteRunAutomaton extends RunAutomaton {
|
||||
|
||||
public ByteRunAutomaton(Automaton a) {
|
||||
super(new UTF32ToUTF8().convert(a), 256, true);
|
||||
this(a, false);
|
||||
}
|
||||
|
||||
/** expert: if utf8 is true, the input is already byte-based */
|
||||
public ByteRunAutomaton(Automaton a, boolean utf8) {
|
||||
super(utf8 ? a : new UTF32ToUTF8().convert(a), 256, true);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -45,7 +45,6 @@ public abstract class RunAutomaton implements Serializable {
|
|||
// getCharClass(c)]
|
||||
final int[] points; // char interval start points
|
||||
final int[] classmap; // map from char number to class class
|
||||
final Automaton automaton;
|
||||
|
||||
/**
|
||||
* Returns a string representation of this automaton.
|
||||
|
@ -114,13 +113,6 @@ public abstract class RunAutomaton implements Serializable {
|
|||
return SpecialOperations.findIndex(c, points);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the automaton
|
||||
*/
|
||||
public Automaton getAutomaton() {
|
||||
return automaton;
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a new <code>RunAutomaton</code> from a deterministic
|
||||
* <code>Automaton</code>.
|
||||
|
@ -160,7 +152,6 @@ public abstract class RunAutomaton implements Serializable {
|
|||
} else {
|
||||
classmap = null;
|
||||
}
|
||||
this.automaton = a;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -26,7 +26,8 @@ import java.util.ArrayList;
|
|||
// TODO
|
||||
// - do we really need the .bits...? if not we can make util in UnicodeUtil to convert 1 char into a BytesRef
|
||||
|
||||
final class UTF32ToUTF8 {
|
||||
/** @lucene.internal */
|
||||
public final class UTF32ToUTF8 {
|
||||
|
||||
// Unicode boundaries for UTF8 bytes 1,2,3,4
|
||||
private static final int[] startCodes = new int[] {0, 128, 2048, 65536};
|
||||
|
|
Loading…
Reference in New Issue