LUCENE-2907: fix automaton thread hazard: change query to compute immutable compiled form in ctor

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1067720 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-02-06 18:17:00 +00:00
parent 70cbc8acab
commit 32b914572d
7 changed files with 133 additions and 109 deletions

View File

@ -18,15 +18,15 @@ package org.apache.lucene.search;
*/ */
import java.io.IOException; import java.io.IOException;
import java.io.Serializable;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms; import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.AutomatonTermsEnum.CompiledAutomaton;
import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.BasicAutomata; import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.BasicOperations; import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.MinimizationOperations; import org.apache.lucene.util.automaton.MinimizationOperations;
@ -56,9 +56,16 @@ public class AutomatonQuery extends MultiTermQuery {
/** term containing the field, and possibly some pattern structure */ /** term containing the field, and possibly some pattern structure */
protected final Term term; protected final Term term;
transient ByteRunAutomaton runAutomaton; /**
transient boolean isFinite; * abstraction for returning a termsenum:
transient BytesRef commonSuffixRef; * in the ctor the query computes one of these, the actual
* implementation depends upon the automaton's structure.
*/
private abstract class TermsEnumFactory implements Serializable {
protected abstract TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException;
}
private final TermsEnumFactory factory;
/** /**
* Create a new AutomatonQuery from an {@link Automaton}. * Create a new AutomatonQuery from an {@link Automaton}.
@ -68,60 +75,77 @@ public class AutomatonQuery extends MultiTermQuery {
* @param automaton Automaton to run, terms that are accepted are considered a * @param automaton Automaton to run, terms that are accepted are considered a
* match. * match.
*/ */
public AutomatonQuery(Term term, Automaton automaton) { public AutomatonQuery(final Term term, Automaton automaton) {
super(term.field()); super(term.field());
this.term = term; this.term = term;
this.automaton = automaton; this.automaton = automaton;
MinimizationOperations.minimize(automaton); MinimizationOperations.minimize(automaton);
}
private synchronized void compileAutomaton() { if (BasicOperations.isEmpty(automaton)) {
// this method must be synchronized, as setting the three transient fields is not atomic: // matches nothing
if (runAutomaton == null) { factory = new TermsEnumFactory() {
runAutomaton = new ByteRunAutomaton(automaton); @Override
isFinite = SpecialOperations.isFinite(automaton); protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
commonSuffixRef = isFinite ? null : SpecialOperations.getCommonSuffixBytesRef(runAutomaton.getAutomaton()); return TermsEnum.EMPTY;
}
};
} else if (BasicOperations.isTotal(automaton)) {
// matches all possible strings
factory = new TermsEnumFactory() {
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
return terms.iterator();
}
};
} else {
final String singleton;
final String commonPrefix;
if (automaton.getSingleton() == null) {
commonPrefix = SpecialOperations.getCommonPrefix(automaton);
if (commonPrefix.length() > 0 && BasicOperations.sameLanguage(automaton, BasicAutomata.makeString(commonPrefix))) {
singleton = commonPrefix;
} else {
singleton = null;
}
} else {
commonPrefix = null;
singleton = automaton.getSingleton();
}
if (singleton != null) {
// matches a fixed string in singleton or expanded representation
factory = new TermsEnumFactory() {
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
return new SingleTermsEnum(terms.iterator(), term.createTerm(singleton));
}
};
} else if (BasicOperations.sameLanguage(automaton, BasicOperations.concatenate(
BasicAutomata.makeString(commonPrefix), BasicAutomata.makeAnyString()))) {
// matches a constant prefix
factory = new TermsEnumFactory() {
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
return new PrefixTermsEnum(terms.iterator(), term.createTerm(commonPrefix));
}
};
} else {
final AutomatonTermsEnum.CompiledAutomaton compiled =
new CompiledAutomaton(automaton, SpecialOperations.isFinite(automaton));
factory = new TermsEnumFactory() {
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
return new AutomatonTermsEnum(terms.iterator(), compiled);
}
};
}
} }
} }
@Override @Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException { protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
// matches nothing return factory.getTermsEnum(terms, atts);
if (BasicOperations.isEmpty(automaton)) {
return TermsEnum.EMPTY;
}
TermsEnum tenum = terms.iterator();
// matches all possible strings
if (BasicOperations.isTotal(automaton)) {
return tenum;
}
// matches a fixed string in singleton representation
String singleton = automaton.getSingleton();
if (singleton != null)
return new SingleTermsEnum(tenum, term.createTerm(singleton));
// matches a fixed string in expanded representation
final String commonPrefix = SpecialOperations.getCommonPrefix(automaton);
if (commonPrefix.length() > 0) {
if (BasicOperations.sameLanguage(automaton, BasicAutomata.makeString(commonPrefix))) {
return new SingleTermsEnum(tenum, term.createTerm(commonPrefix));
}
// matches a constant prefix
Automaton prefixAutomaton = BasicOperations.concatenate(BasicAutomata
.makeString(commonPrefix), BasicAutomata.makeAnyString());
if (BasicOperations.sameLanguage(automaton, prefixAutomaton)) {
return new PrefixTermsEnum(tenum, term.createTerm(commonPrefix));
}
}
compileAutomaton();
return new AutomatonTermsEnum(runAutomaton, tenum, isFinite, commonSuffixRef);
} }
@Override @Override

View File

@ -27,6 +27,7 @@ import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton; import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.SpecialOperations; import org.apache.lucene.util.automaton.SpecialOperations;
import org.apache.lucene.util.automaton.Transition; import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.automaton.UTF32ToUTF8;
/** /**
* A FilteredTermsEnum that enumerates terms based upon what is accepted by a * A FilteredTermsEnum that enumerates terms based upon what is accepted by a
@ -46,8 +47,6 @@ import org.apache.lucene.util.automaton.Transition;
* @lucene.experimental * @lucene.experimental
*/ */
public class AutomatonTermsEnum extends FilteredTermsEnum { public class AutomatonTermsEnum extends FilteredTermsEnum {
// the object-oriented form of the DFA
private final Automaton automaton;
// a tableized array-based form of the DFA // a tableized array-based form of the DFA
private final ByteRunAutomaton runAutomaton; private final ByteRunAutomaton runAutomaton;
// common suffix of the automaton // common suffix of the automaton
@ -71,54 +70,26 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
private final Comparator<BytesRef> termComp; private final Comparator<BytesRef> termComp;
/** /**
* Expert ctor:
* Construct an enumerator based upon an automaton, enumerating the specified * Construct an enumerator based upon an automaton, enumerating the specified
* field, working on a supplied TermsEnum * field, working on a supplied TermsEnum
* <p> * <p>
* @lucene.experimental * @lucene.experimental
* <p> * <p>
* @param runAutomaton pre-compiled ByteRunAutomaton * @param compiled CompiledAutomaton
* @param finite true if the automaton accepts a finite language
*/ */
public AutomatonTermsEnum(ByteRunAutomaton runAutomaton, public AutomatonTermsEnum(TermsEnum tenum, CompiledAutomaton compiled) throws IOException {
TermsEnum tenum,
boolean finite, BytesRef commonSuffixRef)
throws IOException {
super(tenum); super(tenum);
this.automaton = runAutomaton.getAutomaton(); this.finite = compiled.finite;
this.finite = finite; this.runAutomaton = compiled.runAutomaton;
this.commonSuffixRef = compiled.commonSuffixRef;
this.allTransitions = compiled.sortedTransitions;
this.runAutomaton = runAutomaton;
if (finite) {
// don't use suffix w/ finite DFAs
this.commonSuffixRef = null;
} else if (commonSuffixRef == null) {
// compute now
this.commonSuffixRef = SpecialOperations.getCommonSuffixBytesRef(automaton);
} else {
// precomputed
this.commonSuffixRef = commonSuffixRef;
}
// build a cache of sorted transitions for every state
allTransitions = this.automaton.getSortedTransitions();
// used for path tracking, where each bit is a numbered state. // used for path tracking, where each bit is a numbered state.
visited = new long[runAutomaton.getSize()]; visited = new long[runAutomaton.getSize()];
termComp = getComparator(); termComp = getComparator();
} }
/**
* Construct an enumerator based upon an automaton, enumerating the specified
* field, working on a supplied TermsEnum
* <p>
* It will automatically calculate whether or not the automaton is finite
*/
public AutomatonTermsEnum(Automaton automaton, TermsEnum tenum)
throws IOException {
this(new ByteRunAutomaton(automaton), tenum, SpecialOperations.isFinite(automaton), null);
}
/** /**
* Returns true if the term matches the automaton. Also stashes away the term * Returns true if the term matches the automaton. Also stashes away the term
* to assist with smart enumeration. * to assist with smart enumeration.
@ -350,4 +321,26 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
} }
return -1; /* all solutions exhausted */ return -1; /* all solutions exhausted */
} }
/**
* immutable class with everything this enum needs.
*/
public static class CompiledAutomaton {
public final ByteRunAutomaton runAutomaton;
public final Transition[][] sortedTransitions;
public final BytesRef commonSuffixRef;
public final boolean finite;
public CompiledAutomaton(Automaton automaton, boolean finite) {
Automaton utf8 = new UTF32ToUTF8().convert(automaton);
runAutomaton = new ByteRunAutomaton(utf8, true);
sortedTransitions = utf8.getSortedTransitions();
this.finite = finite;
if (finite) {
commonSuffixRef = null;
} else {
commonSuffixRef = SpecialOperations.getCommonSuffixBytesRef(utf8);
}
}
}
} }

View File

@ -22,6 +22,7 @@ import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState; import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.AutomatonTermsEnum.CompiledAutomaton;
import org.apache.lucene.util.Attribute; import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl; import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.AttributeSource;
@ -140,18 +141,18 @@ public final class FuzzyTermsEnum extends TermsEnum {
*/ */
private TermsEnum getAutomatonEnum(int editDistance, BytesRef lastTerm) private TermsEnum getAutomatonEnum(int editDistance, BytesRef lastTerm)
throws IOException { throws IOException {
final List<ByteRunAutomaton> runAutomata = initAutomata(editDistance); final List<CompiledAutomaton> runAutomata = initAutomata(editDistance);
if (editDistance < runAutomata.size()) { if (editDistance < runAutomata.size()) {
return new AutomatonFuzzyTermsEnum(runAutomata.subList(0, editDistance + 1) return new AutomatonFuzzyTermsEnum(runAutomata.subList(0, editDistance + 1)
.toArray(new ByteRunAutomaton[editDistance + 1]), lastTerm); .toArray(new CompiledAutomaton[editDistance + 1]), lastTerm);
} else { } else {
return null; return null;
} }
} }
/** initialize levenshtein DFAs up to maxDistance, if possible */ /** initialize levenshtein DFAs up to maxDistance, if possible */
private List<ByteRunAutomaton> initAutomata(int maxDistance) { private List<CompiledAutomaton> initAutomata(int maxDistance) {
final List<ByteRunAutomaton> runAutomata = dfaAtt.automata(); final List<CompiledAutomaton> runAutomata = dfaAtt.automata();
if (runAutomata.size() <= maxDistance && if (runAutomata.size() <= maxDistance &&
maxDistance <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) { maxDistance <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
LevenshteinAutomata builder = LevenshteinAutomata builder =
@ -165,7 +166,7 @@ public final class FuzzyTermsEnum extends TermsEnum {
UnicodeUtil.newString(termText, 0, realPrefixLength)); UnicodeUtil.newString(termText, 0, realPrefixLength));
a = BasicOperations.concatenate(prefix, a); a = BasicOperations.concatenate(prefix, a);
} }
runAutomata.add(new ByteRunAutomaton(a)); runAutomata.add(new CompiledAutomaton(a, true));
} }
} }
return runAutomata; return runAutomata;
@ -312,10 +313,12 @@ public final class FuzzyTermsEnum extends TermsEnum {
private final BoostAttribute boostAtt = private final BoostAttribute boostAtt =
attributes().addAttribute(BoostAttribute.class); attributes().addAttribute(BoostAttribute.class);
public AutomatonFuzzyTermsEnum(ByteRunAutomaton matchers[], public AutomatonFuzzyTermsEnum(CompiledAutomaton compiled[],
BytesRef lastTerm) throws IOException { BytesRef lastTerm) throws IOException {
super(matchers[matchers.length - 1], tenum, true, null); super(tenum, compiled[compiled.length - 1]);
this.matchers = matchers; this.matchers = new ByteRunAutomaton[compiled.length];
for (int i = 0; i < compiled.length; i++)
this.matchers[i] = compiled[i].runAutomaton;
this.lastTerm = lastTerm; this.lastTerm = lastTerm;
termRef = new BytesRef(term.text()); termRef = new BytesRef(term.text());
} }
@ -563,14 +566,14 @@ public final class FuzzyTermsEnum extends TermsEnum {
/** @lucene.internal */ /** @lucene.internal */
public static interface LevenshteinAutomataAttribute extends Attribute { public static interface LevenshteinAutomataAttribute extends Attribute {
public List<ByteRunAutomaton> automata(); public List<CompiledAutomaton> automata();
} }
/** @lucene.internal */ /** @lucene.internal */
public static final class LevenshteinAutomataAttributeImpl extends AttributeImpl implements LevenshteinAutomataAttribute { public static final class LevenshteinAutomataAttributeImpl extends AttributeImpl implements LevenshteinAutomataAttribute {
private final List<ByteRunAutomaton> automata = new ArrayList<ByteRunAutomaton>(); private final List<CompiledAutomaton> automata = new ArrayList<CompiledAutomaton>();
public List<ByteRunAutomaton> automata() { public List<CompiledAutomaton> automata() {
return automata; return automata;
} }
@ -595,7 +598,7 @@ public final class FuzzyTermsEnum extends TermsEnum {
@Override @Override
public void copyTo(AttributeImpl target) { public void copyTo(AttributeImpl target) {
final List<ByteRunAutomaton> targetAutomata = final List<CompiledAutomaton> targetAutomata =
((LevenshteinAutomataAttribute) target).automata(); ((LevenshteinAutomataAttribute) target).automata();
targetAutomata.clear(); targetAutomata.clear();
targetAutomata.addAll(automata); targetAutomata.addAll(automata);

View File

@ -66,6 +66,13 @@ import org.apache.lucene.util.RamUsageEstimator;
* assumed by the built-in automata operations. * assumed by the built-in automata operations.
* *
* <p> * <p>
* <p>
* Note: This class has internal mutable state and is not thread safe. It is
* the caller's responsibility to ensure any necessary synchronization if you
* wish to use the same Automaton from multiple threads. In general it is instead
* recommended to use a {@link RunAutomaton} for multithreaded matching: it is immutable,
* thread safe, and much faster.
* </p>
* @lucene.experimental * @lucene.experimental
*/ */
public class Automaton implements Serializable, Cloneable { public class Automaton implements Serializable, Cloneable {

View File

@ -20,7 +20,12 @@ package org.apache.lucene.util.automaton;
public class ByteRunAutomaton extends RunAutomaton { public class ByteRunAutomaton extends RunAutomaton {
public ByteRunAutomaton(Automaton a) { public ByteRunAutomaton(Automaton a) {
super(new UTF32ToUTF8().convert(a), 256, true); this(a, false);
}
/** expert: if utf8 is true, the input is already byte-based */
public ByteRunAutomaton(Automaton a, boolean utf8) {
super(utf8 ? a : new UTF32ToUTF8().convert(a), 256, true);
} }
/** /**

View File

@ -45,7 +45,6 @@ public abstract class RunAutomaton implements Serializable {
// getCharClass(c)] // getCharClass(c)]
final int[] points; // char interval start points final int[] points; // char interval start points
final int[] classmap; // map from char number to class class final int[] classmap; // map from char number to class class
final Automaton automaton;
/** /**
* Returns a string representation of this automaton. * Returns a string representation of this automaton.
@ -114,13 +113,6 @@ public abstract class RunAutomaton implements Serializable {
return SpecialOperations.findIndex(c, points); return SpecialOperations.findIndex(c, points);
} }
/**
* @return the automaton
*/
public Automaton getAutomaton() {
return automaton;
}
/** /**
* Constructs a new <code>RunAutomaton</code> from a deterministic * Constructs a new <code>RunAutomaton</code> from a deterministic
* <code>Automaton</code>. * <code>Automaton</code>.
@ -160,7 +152,6 @@ public abstract class RunAutomaton implements Serializable {
} else { } else {
classmap = null; classmap = null;
} }
this.automaton = a;
} }
/** /**

View File

@ -26,7 +26,8 @@ import java.util.ArrayList;
// TODO // TODO
// - do we really need the .bits...? if not we can make util in UnicodeUtil to convert 1 char into a BytesRef // - do we really need the .bits...? if not we can make util in UnicodeUtil to convert 1 char into a BytesRef
final class UTF32ToUTF8 { /** @lucene.internal */
public final class UTF32ToUTF8 {
// Unicode boundaries for UTF8 bytes 1,2,3,4 // Unicode boundaries for UTF8 bytes 1,2,3,4
private static final int[] startCodes = new int[] {0, 128, 2048, 65536}; private static final int[] startCodes = new int[] {0, 128, 2048, 65536};