mirror of https://github.com/apache/lucene.git
LUCENE-6046: add maxDeterminizedStates to determinize to prevent exhausting CPU/RAM when the automaton is too difficult to determinize
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1636716 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2380e375a0
commit
ba56e75704
|
@ -207,6 +207,11 @@ Bug Fixes
|
|||
* LUCENE-6042: CustomScoreQuery explain was incorrect in some cases,
|
||||
such as when nested inside a boolean query. (Denis Lantsman via Robert Muir)
|
||||
|
||||
* LUCENE-6046: Add maxDeterminizedStates safety to determinize (which has
|
||||
an exponential worst case) so that if it would create too many states, it
|
||||
now throws an exception instead of exhausting CPU/RAM. (Nik
|
||||
Everett via Mike McCandless)
|
||||
|
||||
Documentation
|
||||
|
||||
* LUCENE-5392: Add/improve analysis package documentation to reflect
|
||||
|
|
|
@ -1,11 +1,5 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.util.ToStringUtils;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.AutomatonProvider;
|
||||
import org.apache.lucene.util.automaton.RegExp;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
|
@ -23,6 +17,13 @@ import org.apache.lucene.util.automaton.RegExp;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.util.ToStringUtils;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.AutomatonProvider;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
import org.apache.lucene.util.automaton.RegExp;
|
||||
|
||||
/**
|
||||
* A fast regular expression query based on the
|
||||
* {@link org.apache.lucene.util.automaton} package.
|
||||
|
@ -75,7 +76,21 @@ public class RegexpQuery extends AutomatonQuery {
|
|||
* @param flags optional RegExp features from {@link RegExp}
|
||||
*/
|
||||
public RegexpQuery(Term term, int flags) {
|
||||
this(term, flags, defaultProvider);
|
||||
this(term, flags, defaultProvider,
|
||||
Operations.DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a query for terms matching <code>term</code>.
|
||||
*
|
||||
* @param term regular expression.
|
||||
* @param flags optional RegExp features from {@link RegExp}
|
||||
* @param maxDeterminizedStates maximum number of states that compiling the
|
||||
* automaton for the regexp can result in. Set higher to allow more complex
|
||||
* queries and lower to prevent memory exhaustion.
|
||||
*/
|
||||
public RegexpQuery(Term term, int flags, int maxDeterminizedStates) {
|
||||
this(term, flags, defaultProvider, maxDeterminizedStates);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -84,9 +99,14 @@ public class RegexpQuery extends AutomatonQuery {
|
|||
* @param term regular expression.
|
||||
* @param flags optional RegExp features from {@link RegExp}
|
||||
* @param provider custom AutomatonProvider for named automata
|
||||
* @param maxDeterminizedStates maximum number of states that compiling the
|
||||
* automaton for the regexp can result in. Set higher to allow more complex
|
||||
* queries and lower to prevent memory exhaustion.
|
||||
*/
|
||||
public RegexpQuery(Term term, int flags, AutomatonProvider provider) {
|
||||
super(term, new RegExp(term.text(), flags).toAutomaton(provider));
|
||||
public RegexpQuery(Term term, int flags, AutomatonProvider provider,
|
||||
int maxDeterminizedStates) {
|
||||
super(term, new RegExp(term.text(), flags).toAutomaton(
|
||||
provider, maxDeterminizedStates));
|
||||
}
|
||||
|
||||
/** Prints a user-readable version of this query. */
|
||||
|
|
|
@ -33,7 +33,7 @@ import java.io.IOException;
|
|||
|
||||
/**
|
||||
* Automaton provider for <code>RegExp.</code>
|
||||
* {@link RegExp#toAutomaton(AutomatonProvider)}
|
||||
* {@link RegExp#toAutomaton(AutomatonProvider,int)}
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
|
|
|
@ -24,12 +24,12 @@ public class ByteRunAutomaton extends RunAutomaton {
|
|||
|
||||
/** Converts incoming automaton to byte-based (UTF32ToUTF8) first */
|
||||
public ByteRunAutomaton(Automaton a) {
|
||||
this(a, false);
|
||||
this(a, false, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
}
|
||||
|
||||
/** expert: if utf8 is true, the input is already byte-based */
|
||||
public ByteRunAutomaton(Automaton a, boolean utf8) {
|
||||
super(utf8 ? a : new UTF32ToUTF8().convert(a), 256, true);
|
||||
public ByteRunAutomaton(Automaton a, boolean utf8, int maxDeterminizedStates) {
|
||||
super(utf8 ? a : new UTF32ToUTF8().convert(a), 256, true, maxDeterminizedStates);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -21,10 +21,22 @@ package org.apache.lucene.util.automaton;
|
|||
* Automaton representation for matching char[].
|
||||
*/
|
||||
public class CharacterRunAutomaton extends RunAutomaton {
|
||||
|
||||
/** Sole constructor. */
|
||||
/**
|
||||
* Construct with a default number of maxDeterminizedStates.
|
||||
*/
|
||||
public CharacterRunAutomaton(Automaton a) {
|
||||
super(a, Character.MAX_CODE_POINT, false);
|
||||
this(a, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct specifying maxDeterminizedStates.
|
||||
* @param a Automaton to match
|
||||
* @param maxDeterminizedStates maximum number of states that the automaton
|
||||
* can have once determinized. If more states are required to determinize
|
||||
* it then a TooComplexToDeterminizeException is thrown.
|
||||
*/
|
||||
public CharacterRunAutomaton(Automaton a, int maxDeterminizedStates) {
|
||||
super(a, Character.MAX_CODE_POINT, false, maxDeterminizedStates);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -21,10 +21,10 @@ import java.io.IOException;
|
|||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.index.SingleTermsEnum;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.PrefixTermsEnum;
|
||||
import org.apache.lucene.index.SingleTermsEnum;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefBuilder;
|
||||
|
||||
|
@ -101,7 +101,20 @@ public class CompiledAutomaton {
|
|||
* possibly expensive operations to determine if the automaton is one
|
||||
* the cases in {@link CompiledAutomaton.AUTOMATON_TYPE}. */
|
||||
public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify) {
|
||||
this(automaton, finite, simplify, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
}
|
||||
|
||||
|
||||
/** Create this. If finite is null, we use {@link Operations#isFinite}
|
||||
* to determine whether it is finite. If simplify is true, we run
|
||||
* possibly expensive operations to determine if the automaton is one
|
||||
* of the cases in {@link CompiledAutomaton.AUTOMATON_TYPE}. If simplify
|
||||
* requires determinizing the automaton then only maxDeterminizedStates
|
||||
* will be created. Any more than that will cause a
|
||||
* TooComplexToDeterminizeException.
|
||||
*/
|
||||
public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify,
|
||||
int maxDeterminizedStates) {
|
||||
if (automaton.getNumStates() == 0) {
|
||||
automaton = new Automaton();
|
||||
automaton.createState();
|
||||
|
@ -134,7 +147,7 @@ public class CompiledAutomaton {
|
|||
return;
|
||||
} else {
|
||||
|
||||
automaton = Operations.determinize(automaton);
|
||||
automaton = Operations.determinize(automaton, maxDeterminizedStates);
|
||||
|
||||
final String commonPrefix = Operations.getCommonPrefix(automaton);
|
||||
final String singleton;
|
||||
|
@ -156,7 +169,7 @@ public class CompiledAutomaton {
|
|||
return;
|
||||
} else if (commonPrefix.length() > 0) {
|
||||
Automaton other = Operations.concatenate(Automata.makeString(commonPrefix), Automata.makeAnyString());
|
||||
other = Operations.determinize(other);
|
||||
other = Operations.determinize(other, maxDeterminizedStates);
|
||||
assert Operations.hasDeadStates(other) == false;
|
||||
if (Operations.sameLanguage(automaton, other)) {
|
||||
// matches a constant prefix
|
||||
|
@ -185,9 +198,9 @@ public class CompiledAutomaton {
|
|||
if (this.finite) {
|
||||
commonSuffixRef = null;
|
||||
} else {
|
||||
commonSuffixRef = Operations.getCommonSuffixBytesRef(utf8);
|
||||
commonSuffixRef = Operations.getCommonSuffixBytesRef(utf8, maxDeterminizedStates);
|
||||
}
|
||||
runAutomaton = new ByteRunAutomaton(utf8, true);
|
||||
runAutomaton = new ByteRunAutomaton(utf8, true, maxDeterminizedStates);
|
||||
|
||||
this.automaton = runAutomaton.automaton;
|
||||
}
|
||||
|
|
|
@ -29,8 +29,8 @@
|
|||
|
||||
package org.apache.lucene.util.automaton;
|
||||
|
||||
import java.util.BitSet;
|
||||
import java.util.ArrayList;
|
||||
import java.util.BitSet;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
|
||||
|
@ -45,21 +45,17 @@ final public class MinimizationOperations {
|
|||
|
||||
/**
|
||||
* Minimizes (and determinizes if not already deterministic) the given
|
||||
* automaton.
|
||||
* automaton using Hopcroft's algorithm.
|
||||
* @param maxDeterminizedStates maximum number of states determinizing the
|
||||
* automaton can result in. Set higher to allow more complex queries and
|
||||
* lower to prevent memory exhaustion.
|
||||
*/
|
||||
public static Automaton minimize(Automaton a) {
|
||||
return minimizeHopcroft(a);
|
||||
}
|
||||
|
||||
/**
|
||||
* Minimizes the given automaton using Hopcroft's algorithm.
|
||||
*/
|
||||
public static Automaton minimizeHopcroft(Automaton a) {
|
||||
public static Automaton minimize(Automaton a, int maxDeterminizedStates) {
|
||||
if (a.getNumStates() == 0 || (a.isAccept(0) == false && a.getNumTransitions(0) == 0)) {
|
||||
// Fastmatch for common case
|
||||
return new Automaton();
|
||||
}
|
||||
a = Operations.determinize(a);
|
||||
a = Operations.determinize(a, maxDeterminizedStates);
|
||||
//a.writeDot("adet");
|
||||
if (a.getNumTransitions(0) == 1) {
|
||||
Transition t = new Transition();
|
||||
|
|
|
@ -53,6 +53,10 @@ import org.apache.lucene.util.RamUsageEstimator;
|
|||
* @lucene.experimental
|
||||
*/
|
||||
final public class Operations {
|
||||
/**
|
||||
* Default maximum number of states that {@link Operations#determinize} should create.
|
||||
*/
|
||||
public static final int DEFAULT_MAX_DETERMINIZED_STATES = 10000;
|
||||
|
||||
private Operations() {}
|
||||
|
||||
|
@ -202,12 +206,12 @@ final public class Operations {
|
|||
* <p>
|
||||
* Complexity: linear in number of states and in <code>min</code>.
|
||||
*/
|
||||
static public Automaton repeat(Automaton a, int min) {
|
||||
if (min == 0) {
|
||||
static public Automaton repeat(Automaton a, int count) {
|
||||
if (count == 0) {
|
||||
return repeat(a);
|
||||
}
|
||||
List<Automaton> as = new ArrayList<>();
|
||||
while (min-- > 0) {
|
||||
while (count-- > 0) {
|
||||
as.add(a);
|
||||
}
|
||||
as.add(repeat(a));
|
||||
|
@ -242,19 +246,18 @@ final public class Operations {
|
|||
}
|
||||
|
||||
Set<Integer> prevAcceptStates = toSet(b, 0);
|
||||
|
||||
Automaton.Builder builder = new Automaton.Builder();
|
||||
builder.copy(b);
|
||||
for(int i=min;i<max;i++) {
|
||||
int numStates = b.getNumStates();
|
||||
b.copy(a);
|
||||
int numStates = builder.getNumStates();
|
||||
builder.copy(a);
|
||||
for(int s : prevAcceptStates) {
|
||||
b.addEpsilon(s, numStates);
|
||||
builder.addEpsilon(s, numStates);
|
||||
}
|
||||
prevAcceptStates = toSet(a, numStates);
|
||||
}
|
||||
|
||||
b.finishState();
|
||||
|
||||
return b;
|
||||
return builder.finish();
|
||||
}
|
||||
|
||||
private static Set<Integer> toSet(Automaton a, int offset) {
|
||||
|
@ -274,10 +277,14 @@ final public class Operations {
|
|||
* Returns a (deterministic) automaton that accepts the complement of the
|
||||
* language of the given automaton.
|
||||
* <p>
|
||||
* Complexity: linear in number of states (if already deterministic).
|
||||
* Complexity: linear in number of states if already deterministic and
|
||||
* exponential otherwise.
|
||||
* @param maxDeterminizedStates maximum number of states determinizing the
|
||||
* automaton can result in. Set higher to allow more complex queries and
|
||||
* lower to prevent memory exhaustion.
|
||||
*/
|
||||
static public Automaton complement(Automaton a) {
|
||||
a = totalize(determinize(a));
|
||||
static public Automaton complement(Automaton a, int maxDeterminizedStates) {
|
||||
a = totalize(determinize(a, maxDeterminizedStates));
|
||||
int numStates = a.getNumStates();
|
||||
for (int p=0;p<numStates;p++) {
|
||||
a.setAccept(p, !a.isAccept(p));
|
||||
|
@ -291,16 +298,17 @@ final public class Operations {
|
|||
* <code>a2</code>. As a side-effect, the automata may be determinized, if not
|
||||
* already deterministic.
|
||||
* <p>
|
||||
* Complexity: quadratic in number of states (if already deterministic).
|
||||
* Complexity: quadratic in number of states if a2 already deterministic and
|
||||
* exponential in number of a2's states otherwise.
|
||||
*/
|
||||
static public Automaton minus(Automaton a1, Automaton a2) {
|
||||
static public Automaton minus(Automaton a1, Automaton a2, int maxDeterminizedStates) {
|
||||
if (Operations.isEmpty(a1) || a1 == a2) {
|
||||
return Automata.makeEmpty();
|
||||
}
|
||||
if (Operations.isEmpty(a2)) {
|
||||
return a1;
|
||||
}
|
||||
return intersection(a1, complement(a2));
|
||||
return intersection(a1, complement(a2, maxDeterminizedStates));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -490,7 +498,6 @@ final public class Operations {
|
|||
result.createState();
|
||||
|
||||
// Copy over all automata
|
||||
Transition t = new Transition();
|
||||
for(Automaton a : l) {
|
||||
result.copy(a);
|
||||
}
|
||||
|
@ -644,8 +651,15 @@ final public class Operations {
|
|||
* Determinizes the given automaton.
|
||||
* <p>
|
||||
* Worst case complexity: exponential in number of states.
|
||||
* @param maxDeterminizedStates Maximum number of states created when
|
||||
* determinizing. Higher numbers allow this operation to consume more
|
||||
* memory but allow more complex automatons. Use
|
||||
* DEFAULT_MAX_DETERMINIZED_STATES as a decent default if you don't know
|
||||
* how many to allow.
|
||||
* @throws TooComplexToDeterminizeException if determinizing a creates an
|
||||
* automaton with more than maxDeterminizedStates
|
||||
*/
|
||||
public static Automaton determinize(Automaton a) {
|
||||
public static Automaton determinize(Automaton a, int maxDeterminizedStates) {
|
||||
if (a.isDeterministic()) {
|
||||
// Already determinized
|
||||
return a;
|
||||
|
@ -674,11 +688,6 @@ final public class Operations {
|
|||
b.setAccept(0, a.isAccept(0));
|
||||
newstate.put(initialset, 0);
|
||||
|
||||
int newStateUpto = 0;
|
||||
int[] newStatesArray = new int[5];
|
||||
newStatesArray[newStateUpto] = 0;
|
||||
newStateUpto++;
|
||||
|
||||
// like Set<Integer,PointTransitions>
|
||||
final PointTransitionSet points = new PointTransitionSet();
|
||||
|
||||
|
@ -726,6 +735,9 @@ final public class Operations {
|
|||
Integer q = newstate.get(statesSet);
|
||||
if (q == null) {
|
||||
q = b.createState();
|
||||
if (q >= maxDeterminizedStates) {
|
||||
throw new TooComplexToDeterminizeException(a, maxDeterminizedStates);
|
||||
}
|
||||
final SortedIntSet.FrozenIntSet p = statesSet.freeze(q);
|
||||
//System.out.println(" make new state=" + q + " -> " + p + " accCount=" + accCount);
|
||||
worklist.add(p);
|
||||
|
@ -1100,12 +1112,14 @@ final public class Operations {
|
|||
* Returns the longest BytesRef that is a suffix of all accepted strings.
|
||||
* Worst case complexity: exponential in number of states (this calls
|
||||
* determinize).
|
||||
*
|
||||
* @param maxDeterminizedStates maximum number of states determinizing the
|
||||
* automaton can result in. Set higher to allow more complex queries and
|
||||
* lower to prevent memory exhaustion.
|
||||
* @return common suffix
|
||||
*/
|
||||
public static BytesRef getCommonSuffixBytesRef(Automaton a) {
|
||||
public static BytesRef getCommonSuffixBytesRef(Automaton a, int maxDeterminizedStates) {
|
||||
// reverse the language of the automaton, then reverse its common prefix.
|
||||
Automaton r = Operations.determinize(reverse(a));
|
||||
Automaton r = Operations.determinize(reverse(a), maxDeterminizedStates);
|
||||
BytesRef ref = getCommonPrefixBytesRef(r);
|
||||
reverseBytes(ref);
|
||||
return ref;
|
||||
|
|
|
@ -361,6 +361,7 @@ public class RegExp {
|
|||
*/
|
||||
public static final int NONE = 0x0000;
|
||||
|
||||
private final String originalString;
|
||||
Kind kind;
|
||||
RegExp exp1, exp2;
|
||||
String s;
|
||||
|
@ -368,11 +369,12 @@ public class RegExp {
|
|||
int min, max, digits;
|
||||
int from, to;
|
||||
|
||||
String b;
|
||||
int flags;
|
||||
int pos;
|
||||
|
||||
RegExp() {}
|
||||
RegExp() {
|
||||
this.originalString = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs new <code>RegExp</code> from a string. Same as
|
||||
|
@ -396,13 +398,13 @@ public class RegExp {
|
|||
* regular expression
|
||||
*/
|
||||
public RegExp(String s, int syntax_flags) throws IllegalArgumentException {
|
||||
b = s;
|
||||
originalString = s;
|
||||
flags = syntax_flags;
|
||||
RegExp e;
|
||||
if (s.length() == 0) e = makeString("");
|
||||
else {
|
||||
e = parseUnionExp();
|
||||
if (pos < b.length()) throw new IllegalArgumentException(
|
||||
if (pos < originalString.length()) throw new IllegalArgumentException(
|
||||
"end-of-string expected at position " + pos);
|
||||
}
|
||||
kind = e.kind;
|
||||
|
@ -415,7 +417,6 @@ public class RegExp {
|
|||
digits = e.digits;
|
||||
from = e.from;
|
||||
to = e.to;
|
||||
b = null;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -423,7 +424,26 @@ public class RegExp {
|
|||
* as <code>toAutomaton(null)</code> (empty automaton map).
|
||||
*/
|
||||
public Automaton toAutomaton() {
|
||||
return toAutomaton(null, null);
|
||||
return toAutomaton(null, null, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs new <code>Automaton</code> from this <code>RegExp</code>. The
|
||||
* constructed automaton is minimal and deterministic and has no transitions
|
||||
* to dead states.
|
||||
*
|
||||
* @param maxDeterminizedStates maximum number of states in the resulting
|
||||
* automata. If the automata would need more than this many states
|
||||
* TooComplexToDeterminizeException is thrown. Higher numbers require more
|
||||
* space but can process more complex regexes.
|
||||
* @exception IllegalArgumentException if this regular expression uses a named
|
||||
* identifier that is not available from the automaton provider
|
||||
* @exception TooComplexToDeterminizeException if determinizing this regexp
|
||||
* requires more than maxDeterminizedStates states
|
||||
*/
|
||||
public Automaton toAutomaton(int maxDeterminizedStates)
|
||||
throws IllegalArgumentException, TooComplexToDeterminizeException {
|
||||
return toAutomaton(null, null, maxDeterminizedStates);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -432,12 +452,19 @@ public class RegExp {
|
|||
* to dead states.
|
||||
*
|
||||
* @param automaton_provider provider of automata for named identifiers
|
||||
* @param maxDeterminizedStates maximum number of states in the resulting
|
||||
* automata. If the automata would need more than this many states
|
||||
* TooComplexToDeterminizeException is thrown. Higher numbers require more
|
||||
* space but can process more complex regexes.
|
||||
* @exception IllegalArgumentException if this regular expression uses a named
|
||||
* identifier that is not available from the automaton provider
|
||||
* identifier that is not available from the automaton provider
|
||||
* @exception TooComplexToDeterminizeException if determinizing this regexp
|
||||
* requires more than maxDeterminizedStates states
|
||||
*/
|
||||
public Automaton toAutomaton(AutomatonProvider automaton_provider)
|
||||
throws IllegalArgumentException {
|
||||
return toAutomaton(null, automaton_provider);
|
||||
public Automaton toAutomaton(AutomatonProvider automaton_provider,
|
||||
int maxDeterminizedStates) throws IllegalArgumentException,
|
||||
TooComplexToDeterminizeException {
|
||||
return toAutomaton(null, automaton_provider, maxDeterminizedStates);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -447,60 +474,95 @@ public class RegExp {
|
|||
*
|
||||
* @param automata a map from automaton identifiers to automata (of type
|
||||
* <code>Automaton</code>).
|
||||
* @param maxDeterminizedStates maximum number of states in the resulting
|
||||
* automata. If the automata would need more than this many states
|
||||
* TooComplexToDeterminizeException is thrown. Higher numbers require more
|
||||
* space but can process more complex regexes.
|
||||
* @exception IllegalArgumentException if this regular expression uses a named
|
||||
* identifier that does not occur in the automaton map
|
||||
* identifier that does not occur in the automaton map
|
||||
* @exception TooComplexToDeterminizeException if determinizing this regexp
|
||||
* requires more than maxDeterminizedStates states
|
||||
*/
|
||||
public Automaton toAutomaton(Map<String,Automaton> automata)
|
||||
throws IllegalArgumentException {
|
||||
return toAutomaton(automata, null);
|
||||
public Automaton toAutomaton(Map<String,Automaton> automata,
|
||||
int maxDeterminizedStates) throws IllegalArgumentException,
|
||||
TooComplexToDeterminizeException {
|
||||
return toAutomaton(automata, null, maxDeterminizedStates);
|
||||
}
|
||||
|
||||
private Automaton toAutomaton(Map<String,Automaton> automata,
|
||||
AutomatonProvider automaton_provider) throws IllegalArgumentException {
|
||||
AutomatonProvider automaton_provider, int maxDeterminizedStates)
|
||||
throws IllegalArgumentException, TooComplexToDeterminizeException {
|
||||
try {
|
||||
return toAutomatonInternal(automata, automaton_provider,
|
||||
maxDeterminizedStates);
|
||||
} catch (TooComplexToDeterminizeException e) {
|
||||
throw new TooComplexToDeterminizeException(this, e);
|
||||
}
|
||||
}
|
||||
|
||||
private Automaton toAutomatonInternal(Map<String,Automaton> automata,
|
||||
AutomatonProvider automaton_provider, int maxDeterminizedStates)
|
||||
throws IllegalArgumentException {
|
||||
List<Automaton> list;
|
||||
Automaton a = null;
|
||||
switch (kind) {
|
||||
case REGEXP_UNION:
|
||||
list = new ArrayList<>();
|
||||
findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider);
|
||||
findLeaves(exp2, Kind.REGEXP_UNION, list, automata, automaton_provider);
|
||||
findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider,
|
||||
maxDeterminizedStates);
|
||||
findLeaves(exp2, Kind.REGEXP_UNION, list, automata, automaton_provider,
|
||||
maxDeterminizedStates);
|
||||
a = Operations.union(list);
|
||||
a = MinimizationOperations.minimize(a);
|
||||
a = MinimizationOperations.minimize(a, maxDeterminizedStates);
|
||||
break;
|
||||
case REGEXP_CONCATENATION:
|
||||
list = new ArrayList<>();
|
||||
findLeaves(exp1, Kind.REGEXP_CONCATENATION, list, automata,
|
||||
automaton_provider);
|
||||
automaton_provider, maxDeterminizedStates);
|
||||
findLeaves(exp2, Kind.REGEXP_CONCATENATION, list, automata,
|
||||
automaton_provider);
|
||||
automaton_provider, maxDeterminizedStates);
|
||||
a = Operations.concatenate(list);
|
||||
a = MinimizationOperations.minimize(a);
|
||||
a = MinimizationOperations.minimize(a, maxDeterminizedStates);
|
||||
break;
|
||||
case REGEXP_INTERSECTION:
|
||||
a = Operations.intersection(
|
||||
exp1.toAutomaton(automata, automaton_provider),
|
||||
exp2.toAutomaton(automata, automaton_provider));
|
||||
a = MinimizationOperations.minimize(a);
|
||||
exp1.toAutomatonInternal(
|
||||
automata, automaton_provider, maxDeterminizedStates),
|
||||
exp2.toAutomatonInternal(
|
||||
automata, automaton_provider, maxDeterminizedStates));
|
||||
a = MinimizationOperations.minimize(a, maxDeterminizedStates);
|
||||
break;
|
||||
case REGEXP_OPTIONAL:
|
||||
a = Operations.optional(exp1.toAutomaton(automata, automaton_provider));
|
||||
a = MinimizationOperations.minimize(a);
|
||||
a = Operations.optional(exp1.toAutomatonInternal(automata,
|
||||
automaton_provider, maxDeterminizedStates));
|
||||
a = MinimizationOperations.minimize(a, maxDeterminizedStates);
|
||||
break;
|
||||
case REGEXP_REPEAT:
|
||||
a = Operations.repeat(exp1.toAutomaton(automata, automaton_provider));
|
||||
a = MinimizationOperations.minimize(a);
|
||||
a = Operations.repeat(exp1.toAutomatonInternal(
|
||||
automata, automaton_provider, maxDeterminizedStates));
|
||||
a = MinimizationOperations.minimize(a, maxDeterminizedStates);
|
||||
break;
|
||||
case REGEXP_REPEAT_MIN:
|
||||
a = Operations.repeat(exp1.toAutomaton(automata, automaton_provider), min);
|
||||
a = MinimizationOperations.minimize(a);
|
||||
a = Operations.repeat(
|
||||
exp1.toAutomatonInternal(automata, automaton_provider,
|
||||
maxDeterminizedStates),
|
||||
min);
|
||||
a = MinimizationOperations.minimize(a, maxDeterminizedStates);
|
||||
break;
|
||||
case REGEXP_REPEAT_MINMAX:
|
||||
a = Operations.repeat(exp1.toAutomaton(automata, automaton_provider), min, max);
|
||||
a = MinimizationOperations.minimize(a);
|
||||
a = Operations.repeat(
|
||||
exp1.toAutomatonInternal(automata, automaton_provider,
|
||||
maxDeterminizedStates),
|
||||
min,
|
||||
max);
|
||||
a = MinimizationOperations.minimize(a, maxDeterminizedStates);
|
||||
break;
|
||||
case REGEXP_COMPLEMENT:
|
||||
a = Operations.complement(exp1.toAutomaton(automata, automaton_provider));
|
||||
a = MinimizationOperations.minimize(a);
|
||||
a = Operations.complement(
|
||||
exp1.toAutomatonInternal(automata, automaton_provider,
|
||||
maxDeterminizedStates),
|
||||
maxDeterminizedStates);
|
||||
a = MinimizationOperations.minimize(a, maxDeterminizedStates);
|
||||
break;
|
||||
case REGEXP_CHAR:
|
||||
a = Automata.makeChar(c);
|
||||
|
@ -545,24 +607,37 @@ public class RegExp {
|
|||
}
|
||||
|
||||
private void findLeaves(RegExp exp, Kind kind, List<Automaton> list,
|
||||
Map<String,Automaton> automata, AutomatonProvider automaton_provider) {
|
||||
Map<String,Automaton> automata, AutomatonProvider automaton_provider,
|
||||
int maxDeterminizedStates) {
|
||||
if (exp.kind == kind) {
|
||||
findLeaves(exp.exp1, kind, list, automata, automaton_provider);
|
||||
findLeaves(exp.exp2, kind, list, automata, automaton_provider);
|
||||
findLeaves(exp.exp1, kind, list, automata, automaton_provider,
|
||||
maxDeterminizedStates);
|
||||
findLeaves(exp.exp2, kind, list, automata, automaton_provider,
|
||||
maxDeterminizedStates);
|
||||
} else {
|
||||
list.add(exp.toAutomaton(automata, automaton_provider));
|
||||
list.add(exp.toAutomatonInternal(automata, automaton_provider,
|
||||
maxDeterminizedStates));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The string that was used to construct the regex. Compare to toString.
|
||||
*/
|
||||
public String getOriginalString() {
|
||||
return originalString;
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs string from parsed regular expression.
|
||||
*/
|
||||
@Override
|
||||
public String toString() {
|
||||
return toStringBuilder(new StringBuilder()).toString();
|
||||
StringBuilder b = new StringBuilder();
|
||||
toStringBuilder(b);
|
||||
return b.toString();
|
||||
}
|
||||
|
||||
StringBuilder toStringBuilder(StringBuilder b) {
|
||||
void toStringBuilder(StringBuilder b) {
|
||||
switch (kind) {
|
||||
case REGEXP_UNION:
|
||||
b.append("(");
|
||||
|
@ -640,7 +715,110 @@ public class RegExp {
|
|||
b.append(s2).append(">");
|
||||
break;
|
||||
}
|
||||
return b;
|
||||
}
|
||||
|
||||
/**
|
||||
* Like toString, but more verbose (shows the hierarchy more clearly).
|
||||
*/
|
||||
public String toStringTree() {
|
||||
StringBuilder b = new StringBuilder();
|
||||
toStringTree(b, "");
|
||||
return b.toString();
|
||||
}
|
||||
|
||||
void toStringTree(StringBuilder b, String indent) {
|
||||
switch (kind) {
|
||||
// binary
|
||||
case REGEXP_UNION:
|
||||
case REGEXP_CONCATENATION:
|
||||
case REGEXP_INTERSECTION:
|
||||
b.append(indent);
|
||||
b.append(kind);
|
||||
b.append('\n');
|
||||
exp1.toStringTree(b, indent + " ");
|
||||
exp2.toStringTree(b, indent + " ");
|
||||
break;
|
||||
// unary
|
||||
case REGEXP_OPTIONAL:
|
||||
case REGEXP_REPEAT:
|
||||
case REGEXP_COMPLEMENT:
|
||||
b.append(indent);
|
||||
b.append(kind);
|
||||
b.append('\n');
|
||||
exp1.toStringTree(b, indent + " ");
|
||||
break;
|
||||
case REGEXP_REPEAT_MIN:
|
||||
b.append(indent);
|
||||
b.append(kind);
|
||||
b.append(" min=");
|
||||
b.append(min);
|
||||
b.append('\n');
|
||||
exp1.toStringTree(b, indent + " ");
|
||||
break;
|
||||
case REGEXP_REPEAT_MINMAX:
|
||||
b.append(indent);
|
||||
b.append(kind);
|
||||
b.append(" min=");
|
||||
b.append(min);
|
||||
b.append(" max=");
|
||||
b.append(max);
|
||||
b.append('\n');
|
||||
exp1.toStringTree(b, indent + " ");
|
||||
break;
|
||||
case REGEXP_CHAR:
|
||||
b.append(indent);
|
||||
b.append(kind);
|
||||
b.append(" char=");
|
||||
b.appendCodePoint(c);
|
||||
b.append('\n');
|
||||
break;
|
||||
case REGEXP_CHAR_RANGE:
|
||||
b.append(indent);
|
||||
b.append(kind);
|
||||
b.append(" from=");
|
||||
b.appendCodePoint(from);
|
||||
b.append(" to=");
|
||||
b.appendCodePoint(to);
|
||||
b.append('\n');
|
||||
break;
|
||||
case REGEXP_ANYCHAR:
|
||||
case REGEXP_EMPTY:
|
||||
b.append(indent);
|
||||
b.append(kind);
|
||||
b.append('\n');
|
||||
break;
|
||||
case REGEXP_STRING:
|
||||
b.append(indent);
|
||||
b.append(kind);
|
||||
b.append(" string=");
|
||||
b.append(s);
|
||||
b.append('\n');
|
||||
break;
|
||||
case REGEXP_ANYSTRING:
|
||||
b.append(indent);
|
||||
b.append(kind);
|
||||
b.append('\n');
|
||||
break;
|
||||
case REGEXP_AUTOMATON:
|
||||
b.append(indent);
|
||||
b.append(kind);
|
||||
b.append('\n');
|
||||
break;
|
||||
case REGEXP_INTERVAL:
|
||||
b.append(indent);
|
||||
b.append(kind);
|
||||
String s1 = Integer.toString(min);
|
||||
String s2 = Integer.toString(max);
|
||||
b.append("<");
|
||||
if (digits > 0) for (int i = s1.length(); i < digits; i++)
|
||||
b.append('0');
|
||||
b.append(s1).append("-");
|
||||
if (digits > 0) for (int i = s2.length(); i < digits; i++)
|
||||
b.append('0');
|
||||
b.append(s2).append(">");
|
||||
b.append('\n');
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -819,12 +997,12 @@ public class RegExp {
|
|||
}
|
||||
|
||||
private boolean peek(String s) {
|
||||
return more() && s.indexOf(b.codePointAt(pos)) != -1;
|
||||
return more() && s.indexOf(originalString.codePointAt(pos)) != -1;
|
||||
}
|
||||
|
||||
private boolean match(int c) {
|
||||
if (pos >= b.length()) return false;
|
||||
if (b.codePointAt(pos) == c) {
|
||||
if (pos >= originalString.length()) return false;
|
||||
if (originalString.codePointAt(pos) == c) {
|
||||
pos += Character.charCount(c);
|
||||
return true;
|
||||
}
|
||||
|
@ -832,12 +1010,12 @@ public class RegExp {
|
|||
}
|
||||
|
||||
private boolean more() {
|
||||
return pos < b.length();
|
||||
return pos < originalString.length();
|
||||
}
|
||||
|
||||
private int next() throws IllegalArgumentException {
|
||||
if (!more()) throw new IllegalArgumentException("unexpected end-of-string");
|
||||
int ch = b.codePointAt(pos);
|
||||
int ch = originalString.codePointAt(pos);
|
||||
pos += Character.charCount(ch);
|
||||
return ch;
|
||||
}
|
||||
|
@ -878,13 +1056,14 @@ public class RegExp {
|
|||
next();
|
||||
if (start == pos) throw new IllegalArgumentException(
|
||||
"integer expected at position " + pos);
|
||||
int n = Integer.parseInt(b.substring(start, pos));
|
||||
int n = Integer.parseInt(originalString.substring(start, pos));
|
||||
int m = -1;
|
||||
if (match(',')) {
|
||||
start = pos;
|
||||
while (peek("0123456789"))
|
||||
next();
|
||||
if (start != pos) m = Integer.parseInt(b.substring(start, pos));
|
||||
if (start != pos) m = Integer.parseInt(
|
||||
originalString.substring(start, pos));
|
||||
} else m = n;
|
||||
if (!match('}')) throw new IllegalArgumentException(
|
||||
"expected '}' at position " + pos);
|
||||
|
@ -935,7 +1114,7 @@ public class RegExp {
|
|||
next();
|
||||
if (!match('"')) throw new IllegalArgumentException(
|
||||
"expected '\"' at position " + pos);
|
||||
return makeString(b.substring(start, pos - 1));
|
||||
return makeString(originalString.substring(start, pos - 1));
|
||||
} else if (match('(')) {
|
||||
if (match(')')) return makeString("");
|
||||
RegExp e = parseUnionExp();
|
||||
|
@ -948,7 +1127,7 @@ public class RegExp {
|
|||
next();
|
||||
if (!match('>')) throw new IllegalArgumentException(
|
||||
"expected '>' at position " + pos);
|
||||
String s = b.substring(start, pos - 1);
|
||||
String s = originalString.substring(start, pos - 1);
|
||||
int i = s.indexOf('-');
|
||||
if (i == -1) {
|
||||
if (!check(AUTOMATON)) throw new IllegalArgumentException(
|
||||
|
|
|
@ -121,8 +121,21 @@ public abstract class RunAutomaton {
|
|||
* @param a an automaton
|
||||
*/
|
||||
public RunAutomaton(Automaton a, int maxInterval, boolean tableize) {
|
||||
this(a, maxInterval, tableize, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a new <code>RunAutomaton</code> from a deterministic
|
||||
* <code>Automaton</code>.
|
||||
*
|
||||
* @param a an automaton
|
||||
* @param maxDeterminizedStates maximum number of states that can be created
|
||||
* while determinizing a
|
||||
*/
|
||||
public RunAutomaton(Automaton a, int maxInterval, boolean tableize,
|
||||
int maxDeterminizedStates) {
|
||||
this.maxInterval = maxInterval;
|
||||
a = Operations.determinize(a);
|
||||
a = Operations.determinize(a, maxDeterminizedStates);
|
||||
this.automaton = a;
|
||||
points = a.getStartPoints();
|
||||
initial = 0;
|
||||
|
|
|
@ -0,0 +1,62 @@
|
|||
package org.apache.lucene.util.automaton;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* This exception is thrown when determinizing an automaton would result in one
|
||||
* that has too many states.
|
||||
*/
|
||||
public class TooComplexToDeterminizeException extends RuntimeException {
|
||||
private final Automaton automaton;
|
||||
private final RegExp regExp;
|
||||
private final int maxDeterminizedStates;
|
||||
|
||||
/** Use this constructor when the RegExp failed to convert to an automaton. */
|
||||
public TooComplexToDeterminizeException(RegExp regExp, TooComplexToDeterminizeException cause) {
|
||||
super("Determinizing " + regExp.getOriginalString() + " would result in more than " +
|
||||
cause.maxDeterminizedStates + " states.", cause);
|
||||
this.regExp = regExp;
|
||||
this.automaton = cause.automaton;
|
||||
this.maxDeterminizedStates = cause.maxDeterminizedStates;
|
||||
}
|
||||
|
||||
/** Use this constructor when the automaton failed to determinize. */
|
||||
public TooComplexToDeterminizeException(Automaton automaton, int maxDeterminizedStates) {
|
||||
super("Determinizing automaton would result in more than " + maxDeterminizedStates + " states.");
|
||||
this.automaton = automaton;
|
||||
this.regExp = null;
|
||||
this.maxDeterminizedStates = maxDeterminizedStates;
|
||||
}
|
||||
|
||||
/** Returns the automaton that caused this exception, if any. */
|
||||
public Automaton getAutomaton() {
|
||||
return automaton;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the RegExp that caused this exception if any.
|
||||
*/
|
||||
public RegExp getRegExp() {
|
||||
return regExp;
|
||||
}
|
||||
|
||||
/** Get the maximum number of allowed determinized states. */
|
||||
public int getMaxDeterminizedStates() {
|
||||
return maxDeterminizedStates;
|
||||
}
|
||||
}
|
|
@ -30,8 +30,10 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
|||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
||||
import org.apache.lucene.util.automaton.Automata;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
|
||||
import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
|
||||
|
||||
public class TestGraphTokenizers extends BaseTokenStreamTestCase {
|
||||
|
||||
|
@ -404,15 +406,11 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testSingleToken() throws Exception {
|
||||
|
||||
final TokenStream ts = new CannedTokenStream(
|
||||
new Token[] {
|
||||
token("abc", 1, 1),
|
||||
});
|
||||
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
|
||||
final Automaton expected = s2a("abc");
|
||||
assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
|
||||
Operations.determinize(Operations.removeDeadStates(actual))));
|
||||
assertSameLanguage(s2a("abc"), ts);
|
||||
}
|
||||
|
||||
public void testMultipleHoles() throws Exception {
|
||||
|
@ -421,10 +419,7 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
|
|||
token("a", 1, 1),
|
||||
token("b", 3, 1),
|
||||
});
|
||||
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
|
||||
final Automaton expected = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
|
||||
assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
|
||||
Operations.determinize(Operations.removeDeadStates(actual))));
|
||||
assertSameLanguage(join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b")), ts);
|
||||
}
|
||||
|
||||
public void testSynOverMultipleHoles() throws Exception {
|
||||
|
@ -434,12 +429,9 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
|
|||
token("x", 0, 3),
|
||||
token("b", 3, 1),
|
||||
});
|
||||
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
|
||||
final Automaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
|
||||
final Automaton a2 = join(s2a("x"), SEP_A, s2a("b"));
|
||||
final Automaton expected = Operations.union(a1, a2);
|
||||
assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
|
||||
Operations.determinize(Operations.removeDeadStates(actual))));
|
||||
assertSameLanguage(Operations.union(a1, a2), ts);
|
||||
}
|
||||
|
||||
// for debugging!
|
||||
|
@ -475,18 +467,12 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testTwoTokens() throws Exception {
|
||||
|
||||
final TokenStream ts = new CannedTokenStream(
|
||||
new Token[] {
|
||||
token("abc", 1, 1),
|
||||
token("def", 1, 1),
|
||||
});
|
||||
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
|
||||
final Automaton expected = join("abc", "def");
|
||||
|
||||
//toDot(actual);
|
||||
assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
|
||||
Operations.determinize(Operations.removeDeadStates(actual))));
|
||||
assertSameLanguage(join("abc", "def"), ts);
|
||||
}
|
||||
|
||||
public void testHole() throws Exception {
|
||||
|
@ -496,13 +482,7 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
|
|||
token("abc", 1, 1),
|
||||
token("def", 2, 1),
|
||||
});
|
||||
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
|
||||
|
||||
final Automaton expected = join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def"));
|
||||
|
||||
//toDot(actual);
|
||||
assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
|
||||
Operations.determinize(Operations.removeDeadStates(actual))));
|
||||
assertSameLanguage(join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def")), ts);
|
||||
}
|
||||
|
||||
public void testOverlappedTokensSausage() throws Exception {
|
||||
|
@ -513,12 +493,9 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
|
|||
token("abc", 1, 1),
|
||||
token("xyz", 0, 1)
|
||||
});
|
||||
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
|
||||
final Automaton a1 = s2a("abc");
|
||||
final Automaton a2 = s2a("xyz");
|
||||
final Automaton expected = Operations.union(a1, a2);
|
||||
assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
|
||||
Operations.determinize(Operations.removeDeadStates(actual))));
|
||||
assertSameLanguage(Operations.union(a1, a2), ts);
|
||||
}
|
||||
|
||||
public void testOverlappedTokensLattice() throws Exception {
|
||||
|
@ -529,14 +506,9 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
|
|||
token("xyz", 0, 2),
|
||||
token("def", 1, 1),
|
||||
});
|
||||
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
|
||||
final Automaton a1 = s2a("xyz");
|
||||
final Automaton a2 = join("abc", "def");
|
||||
|
||||
final Automaton expected = Operations.union(a1, a2);
|
||||
//toDot(actual);
|
||||
assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
|
||||
Operations.determinize(Operations.removeDeadStates(actual))));
|
||||
assertSameLanguage(Operations.union(a1, a2), ts);
|
||||
}
|
||||
|
||||
public void testSynOverHole() throws Exception {
|
||||
|
@ -547,15 +519,9 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
|
|||
token("X", 0, 2),
|
||||
token("b", 2, 1),
|
||||
});
|
||||
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
|
||||
final Automaton a1 = Operations.union(
|
||||
join(s2a("a"), SEP_A, HOLE_A),
|
||||
s2a("X"));
|
||||
final Automaton expected = Operations.concatenate(a1,
|
||||
join(SEP_A, s2a("b")));
|
||||
//toDot(actual);
|
||||
assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
|
||||
Operations.determinize(Operations.removeDeadStates(actual))));
|
||||
final Automaton a1 = Operations.union(join(s2a("a"), SEP_A, HOLE_A), s2a("X"));
|
||||
final Automaton expected = Operations.concatenate(a1, join(SEP_A, s2a("b")));
|
||||
assertSameLanguage(expected, ts);
|
||||
}
|
||||
|
||||
public void testSynOverHole2() throws Exception {
|
||||
|
@ -566,12 +532,9 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
|
|||
token("abc", 0, 3),
|
||||
token("def", 2, 1),
|
||||
});
|
||||
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
|
||||
final Automaton expected = Operations.union(
|
||||
join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")),
|
||||
s2a("abc"));
|
||||
assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
|
||||
Operations.determinize(Operations.removeDeadStates(actual))));
|
||||
join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")), s2a("abc"));
|
||||
assertSameLanguage(expected, ts);
|
||||
}
|
||||
|
||||
public void testOverlappedTokensLattice2() throws Exception {
|
||||
|
@ -583,13 +546,9 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
|
|||
token("def", 1, 1),
|
||||
token("ghi", 1, 1),
|
||||
});
|
||||
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
|
||||
final Automaton a1 = s2a("xyz");
|
||||
final Automaton a2 = join("abc", "def", "ghi");
|
||||
final Automaton expected = Operations.union(a1, a2);
|
||||
//toDot(actual);
|
||||
assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
|
||||
Operations.determinize(Operations.removeDeadStates(actual))));
|
||||
assertSameLanguage(Operations.union(a1, a2), ts);
|
||||
}
|
||||
|
||||
public void testToDot() throws Exception {
|
||||
|
@ -604,11 +563,7 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
|
|||
new Token[] {
|
||||
token("abc", 2, 1),
|
||||
});
|
||||
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
|
||||
final Automaton expected = join(HOLE_A, SEP_A, s2a("abc"));
|
||||
//toDot(actual);
|
||||
assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
|
||||
Operations.determinize(Operations.removeDeadStates(actual))));
|
||||
assertSameLanguage(join(HOLE_A, SEP_A, s2a("abc")), ts);
|
||||
}
|
||||
|
||||
// TODO: testEndsWithHole... but we need posInc to set in TS.end()
|
||||
|
@ -619,10 +574,16 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
|
|||
token("a", 1, 1),
|
||||
token("X", 0, 10),
|
||||
});
|
||||
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
|
||||
final Automaton expected = Operations.union(s2a("a"),
|
||||
s2a("X"));
|
||||
assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
|
||||
Operations.determinize(Operations.removeDeadStates(actual))));
|
||||
assertSameLanguage(Operations.union(s2a("a"), s2a("X")), ts);
|
||||
}
|
||||
|
||||
private void assertSameLanguage(Automaton expected, TokenStream ts) throws IOException {
|
||||
assertSameLanguage(expected, new TokenStreamToAutomaton().toAutomaton(ts));
|
||||
}
|
||||
|
||||
private void assertSameLanguage(Automaton expected, Automaton actual) {
|
||||
assertTrue(Operations.sameLanguage(
|
||||
Operations.determinize(Operations.removeDeadStates(expected), DEFAULT_MAX_DETERMINIZED_STATES),
|
||||
Operations.determinize(Operations.removeDeadStates(actual), DEFAULT_MAX_DETERMINIZED_STATES)));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -40,6 +40,8 @@ import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
|||
import org.apache.lucene.util.automaton.Operations;
|
||||
import org.apache.lucene.util.automaton.RegExp;
|
||||
|
||||
import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
|
||||
|
||||
public class TestMockAnalyzer extends BaseTokenStreamTestCase {
|
||||
|
||||
/** Test a configuration that behaves a lot like WhitespaceAnalyzer */
|
||||
|
@ -166,7 +168,8 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
|
|||
new CharacterRunAutomaton(
|
||||
Operations.complement(
|
||||
Operations.union(
|
||||
Arrays.asList(Automata.makeString("foo"), Automata.makeString("bar")))));
|
||||
Arrays.asList(Automata.makeString("foo"), Automata.makeString("bar"))),
|
||||
DEFAULT_MAX_DETERMINIZED_STATES));
|
||||
Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, keepWords);
|
||||
assertAnalyzesTo(a, "quick foo brown bar bar fox foo",
|
||||
new String[] { "foo", "bar", "bar", "foo" },
|
||||
|
|
|
@ -29,12 +29,12 @@ import org.apache.lucene.search.DocIdSetIterator;
|
|||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LineFileDocs;
|
||||
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.apache.lucene.util.automaton.Automata;
|
||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||
import org.apache.lucene.util.automaton.RegExp;
|
||||
|
||||
@SuppressCodecs({ "SimpleText", "Memory", "Direct" })
|
||||
|
@ -182,7 +182,6 @@ public class TestTermsEnum extends LuceneTestCase {
|
|||
|
||||
// Tests Terms.intersect
|
||||
public void testIntersectRandom() throws IOException {
|
||||
|
||||
final Directory dir = newDirectory();
|
||||
final RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||
|
||||
|
@ -261,7 +260,7 @@ public class TestTermsEnum extends LuceneTestCase {
|
|||
a = Automata.makeStringUnion(sortedAcceptTerms);
|
||||
}
|
||||
|
||||
final CompiledAutomaton c = new CompiledAutomaton(a, true, false);
|
||||
final CompiledAutomaton c = new CompiledAutomaton(a, true, false, 1000000);
|
||||
|
||||
final BytesRef[] acceptTermsArray = new BytesRef[acceptTerms.size()];
|
||||
final Set<BytesRef> acceptTermsSet = new HashSet<>();
|
||||
|
|
|
@ -38,6 +38,8 @@ import org.apache.lucene.util.LuceneTestCase;
|
|||
import org.apache.lucene.util.TestUtil;
|
||||
import org.apache.lucene.util.automaton.*;
|
||||
|
||||
import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
|
||||
|
||||
public class TestTermsEnum2 extends LuceneTestCase {
|
||||
private Directory dir;
|
||||
private IndexReader reader;
|
||||
|
@ -86,7 +88,8 @@ public class TestTermsEnum2 extends LuceneTestCase {
|
|||
|
||||
for (int i = 0; i < numIterations; i++) {
|
||||
String reg = AutomatonTestUtil.randomRegexp(random());
|
||||
Automaton automaton = Operations.determinize(new RegExp(reg, RegExp.NONE).toAutomaton());
|
||||
Automaton automaton = Operations.determinize(new RegExp(reg, RegExp.NONE).toAutomaton(),
|
||||
DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
final List<BytesRef> matchedTerms = new ArrayList<>();
|
||||
for(BytesRef t : terms) {
|
||||
if (Operations.run(automaton, t.utf8ToString())) {
|
||||
|
@ -111,7 +114,8 @@ public class TestTermsEnum2 extends LuceneTestCase {
|
|||
public void testSeeking() throws Exception {
|
||||
for (int i = 0; i < numIterations; i++) {
|
||||
String reg = AutomatonTestUtil.randomRegexp(random());
|
||||
Automaton automaton = Operations.determinize(new RegExp(reg, RegExp.NONE).toAutomaton());
|
||||
Automaton automaton = Operations.determinize(new RegExp(reg, RegExp.NONE).toAutomaton(),
|
||||
DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
TermsEnum te = MultiFields.getTerms(reader, "field").iterator(null);
|
||||
ArrayList<BytesRef> unsortedTerms = new ArrayList<>(terms);
|
||||
Collections.shuffle(unsortedTerms, random());
|
||||
|
@ -158,13 +162,15 @@ public class TestTermsEnum2 extends LuceneTestCase {
|
|||
Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton();
|
||||
CompiledAutomaton ca = new CompiledAutomaton(automaton, Operations.isFinite(automaton), false);
|
||||
TermsEnum te = MultiFields.getTerms(reader, "field").intersect(ca, null);
|
||||
Automaton expected = Operations.determinize(Operations.intersection(termsAutomaton, automaton));
|
||||
Automaton expected = Operations.determinize(Operations.intersection(termsAutomaton, automaton),
|
||||
DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
TreeSet<BytesRef> found = new TreeSet<>();
|
||||
while (te.next() != null) {
|
||||
found.add(BytesRef.deepCopyOf(te.term()));
|
||||
}
|
||||
|
||||
Automaton actual = Operations.determinize(Automata.makeStringUnion(found));
|
||||
Automaton actual = Operations.determinize(Automata.makeStringUnion(found),
|
||||
DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
assertTrue(Operations.sameLanguage(expected, actual));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -33,10 +33,12 @@ import org.apache.lucene.store.Directory;
|
|||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.Rethrow;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.apache.lucene.util.automaton.AutomatonTestUtil;
|
||||
import org.apache.lucene.util.automaton.Automata;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.AutomatonTestUtil;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
|
||||
import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
|
||||
|
||||
public class TestAutomatonQuery extends LuceneTestCase {
|
||||
private Directory directory;
|
||||
|
@ -118,7 +120,7 @@ public class TestAutomatonQuery extends LuceneTestCase {
|
|||
assertAutomatonHits(0, Operations.intersection(Automata
|
||||
.makeChar('a'), Automata.makeChar('b')));
|
||||
assertAutomatonHits(1, Operations.minus(Automata.makeCharRange('a', 'b'),
|
||||
Automata.makeChar('a')));
|
||||
Automata.makeChar('a'), DEFAULT_MAX_DETERMINIZED_STATES));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -28,11 +28,13 @@ import org.apache.lucene.index.Term;
|
|||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.automaton.Automata;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.AutomatonProvider;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
import org.apache.lucene.util.automaton.RegExp;
|
||||
|
||||
import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
|
||||
|
||||
/**
|
||||
* Some simple regex tests, mostly converted from contrib's TestRegexQuery.
|
||||
*/
|
||||
|
@ -108,7 +110,8 @@ public class TestRegexpQuery extends LuceneTestCase {
|
|||
else return null;
|
||||
}
|
||||
};
|
||||
RegexpQuery query = new RegexpQuery(newTerm("<quickBrown>"), RegExp.ALL, myProvider);
|
||||
RegexpQuery query = new RegexpQuery(newTerm("<quickBrown>"), RegExp.ALL,
|
||||
myProvider, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
assertEquals(1, searcher.search(query, 5).totalHits);
|
||||
}
|
||||
|
||||
|
|
|
@ -36,6 +36,8 @@ import org.apache.lucene.util.UnicodeUtil;
|
|||
import org.apache.lucene.util.automaton.AutomatonTestUtil.RandomAcceptedStrings;
|
||||
import org.apache.lucene.util.fst.Util;
|
||||
|
||||
import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
|
||||
|
||||
public class TestAutomaton extends LuceneTestCase {
|
||||
|
||||
public void testBasic() throws Exception {
|
||||
|
@ -111,7 +113,7 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
Automata.makeAnyString(),
|
||||
Automata.makeString("n"),
|
||||
Automata.makeAnyString()));
|
||||
a = Operations.determinize(a);
|
||||
a = Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
assertTrue(Operations.run(a, "mn"));
|
||||
assertTrue(Operations.run(a, "mone"));
|
||||
assertFalse(Operations.run(a, "m"));
|
||||
|
@ -122,7 +124,7 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
Automaton a = Operations.union(Arrays.asList(
|
||||
Automata.makeString("foobar"),
|
||||
Automata.makeString("barbaz")));
|
||||
a = Operations.determinize(a);
|
||||
a = Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
assertTrue(Operations.run(a, "foobar"));
|
||||
assertTrue(Operations.run(a, "barbaz"));
|
||||
|
||||
|
@ -134,7 +136,7 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
Automata.makeString("foobar"),
|
||||
Automata.makeString(""),
|
||||
Automata.makeString("barbaz")));
|
||||
a = Operations.determinize(a);
|
||||
a = Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
assertTrue(Operations.run(a, "foobar"));
|
||||
assertTrue(Operations.run(a, "barbaz"));
|
||||
assertTrue(Operations.run(a, ""));
|
||||
|
@ -144,7 +146,7 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
|
||||
public void testMinimizeSimple() throws Exception {
|
||||
Automaton a = Automata.makeString("foobar");
|
||||
Automaton aMin = MinimizationOperations.minimize(a);
|
||||
Automaton aMin = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
|
||||
assertTrue(Operations.sameLanguage(a, aMin));
|
||||
}
|
||||
|
@ -152,14 +154,16 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
public void testMinimize2() throws Exception {
|
||||
Automaton a = Operations.union(Arrays.asList(Automata.makeString("foobar"),
|
||||
Automata.makeString("boobar")));
|
||||
Automaton aMin = MinimizationOperations.minimize(a);
|
||||
assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(a)), aMin));
|
||||
Automaton aMin = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
assertTrue(Operations.sameLanguage(Operations.determinize(
|
||||
Operations.removeDeadStates(a), DEFAULT_MAX_DETERMINIZED_STATES), aMin));
|
||||
}
|
||||
|
||||
public void testReverse() throws Exception {
|
||||
Automaton a = Automata.makeString("foobar");
|
||||
Automaton ra = Operations.reverse(a);
|
||||
Automaton a2 = Operations.determinize(Operations.reverse(ra));
|
||||
Automaton a2 = Operations.determinize(Operations.reverse(ra),
|
||||
DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
|
||||
assertTrue(Operations.sameLanguage(a, a2));
|
||||
}
|
||||
|
@ -167,7 +171,7 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
public void testOptional() throws Exception {
|
||||
Automaton a = Automata.makeString("foobar");
|
||||
Automaton a2 = Operations.optional(a);
|
||||
a2 = Operations.determinize(a2);
|
||||
a2 = Operations.determinize(a2, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
|
||||
assertTrue(Operations.run(a, "foobar"));
|
||||
assertFalse(Operations.run(a, ""));
|
||||
|
@ -177,7 +181,8 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
|
||||
public void testRepeatAny() throws Exception {
|
||||
Automaton a = Automata.makeString("zee");
|
||||
Automaton a2 = Operations.determinize(Operations.repeat(a));
|
||||
Automaton a2 = Operations.determinize(Operations.repeat(a),
|
||||
DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
assertTrue(Operations.run(a2, ""));
|
||||
assertTrue(Operations.run(a2, "zee"));
|
||||
assertTrue(Operations.run(a2, "zeezee"));
|
||||
|
@ -186,7 +191,8 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
|
||||
public void testRepeatMin() throws Exception {
|
||||
Automaton a = Automata.makeString("zee");
|
||||
Automaton a2 = Operations.determinize(Operations.repeat(a, 2));
|
||||
Automaton a2 = Operations.determinize(Operations.repeat(a, 2),
|
||||
DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
assertFalse(Operations.run(a2, ""));
|
||||
assertFalse(Operations.run(a2, "zee"));
|
||||
assertTrue(Operations.run(a2, "zeezee"));
|
||||
|
@ -195,7 +201,8 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
|
||||
public void testRepeatMinMax1() throws Exception {
|
||||
Automaton a = Automata.makeString("zee");
|
||||
Automaton a2 = Operations.determinize(Operations.repeat(a, 0, 2));
|
||||
Automaton a2 = Operations.determinize(Operations.repeat(a, 0, 2),
|
||||
DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
assertTrue(Operations.run(a2, ""));
|
||||
assertTrue(Operations.run(a2, "zee"));
|
||||
assertTrue(Operations.run(a2, "zeezee"));
|
||||
|
@ -204,7 +211,8 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
|
||||
public void testRepeatMinMax2() throws Exception {
|
||||
Automaton a = Automata.makeString("zee");
|
||||
Automaton a2 = Operations.determinize(Operations.repeat(a, 2, 4));
|
||||
Automaton a2 = Operations.determinize(Operations.repeat(a, 2, 4),
|
||||
DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
assertFalse(Operations.run(a2, ""));
|
||||
assertFalse(Operations.run(a2, "zee"));
|
||||
assertTrue(Operations.run(a2, "zeezee"));
|
||||
|
@ -215,7 +223,8 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
|
||||
public void testComplement() throws Exception {
|
||||
Automaton a = Automata.makeString("zee");
|
||||
Automaton a2 = Operations.determinize(Operations.complement(a));
|
||||
Automaton a2 = Operations.determinize(Operations.complement(a,
|
||||
DEFAULT_MAX_DETERMINIZED_STATES), DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
assertTrue(Operations.run(a2, ""));
|
||||
assertFalse(Operations.run(a2, "zee"));
|
||||
assertTrue(Operations.run(a2, "zeezee"));
|
||||
|
@ -223,7 +232,8 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
}
|
||||
|
||||
public void testInterval() throws Exception {
|
||||
Automaton a = Operations.determinize(Automata.makeInterval(17, 100, 3));
|
||||
Automaton a = Operations.determinize(Automata.makeInterval(17, 100, 3),
|
||||
DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
assertFalse(Operations.run(a, ""));
|
||||
assertTrue(Operations.run(a, "017"));
|
||||
assertTrue(Operations.run(a, "100"));
|
||||
|
@ -239,7 +249,8 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
a.addTransition(init, fini, 'm');
|
||||
a.addTransition(fini, fini, 'm');
|
||||
a.finishState();
|
||||
assertEquals(0, Operations.getCommonSuffixBytesRef(a).length);
|
||||
assertEquals(0, Operations.getCommonSuffixBytesRef(a,
|
||||
DEFAULT_MAX_DETERMINIZED_STATES).length);
|
||||
}
|
||||
|
||||
public void testReverseRandom1() throws Exception {
|
||||
|
@ -248,8 +259,9 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
Automaton a = AutomatonTestUtil.randomAutomaton(random());
|
||||
Automaton ra = Operations.reverse(a);
|
||||
Automaton rra = Operations.reverse(ra);
|
||||
assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(a)),
|
||||
Operations.determinize(Operations.removeDeadStates(rra))));
|
||||
assertTrue(Operations.sameLanguage(
|
||||
Operations.determinize(Operations.removeDeadStates(a), DEFAULT_MAX_DETERMINIZED_STATES),
|
||||
Operations.determinize(Operations.removeDeadStates(rra), DEFAULT_MAX_DETERMINIZED_STATES)));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -262,7 +274,7 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
a = Operations.removeDeadStates(a);
|
||||
}
|
||||
Automaton ra = Operations.reverse(a);
|
||||
Automaton rda = Operations.determinize(ra);
|
||||
Automaton rda = Operations.determinize(ra, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
|
||||
if (Operations.isEmpty(a)) {
|
||||
assertTrue(Operations.isEmpty(rda));
|
||||
|
@ -290,7 +302,8 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
}
|
||||
|
||||
public void testAnyStringEmptyString() throws Exception {
|
||||
Automaton a = Operations.determinize(Automata.makeAnyString());
|
||||
Automaton a = Operations.determinize(Automata.makeAnyString(),
|
||||
DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
assertTrue(Operations.run(a, ""));
|
||||
}
|
||||
|
||||
|
@ -349,9 +362,9 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
}
|
||||
|
||||
assertTrue(Operations.sameLanguage(
|
||||
Operations.determinize(Operations.removeDeadStates(a)),
|
||||
Operations.determinize(Operations.removeDeadStates(builder.finish()))));
|
||||
|
||||
Operations.determinize(Operations.removeDeadStates(a), DEFAULT_MAX_DETERMINIZED_STATES),
|
||||
Operations.determinize(Operations.removeDeadStates(builder.finish()),
|
||||
DEFAULT_MAX_DETERMINIZED_STATES)));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -368,7 +381,8 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
a.finishState();
|
||||
assertFalse(Operations.isTotal(a));
|
||||
a.setAccept(init, true);
|
||||
assertTrue(Operations.isTotal(MinimizationOperations.minimize(a)));
|
||||
assertTrue(Operations.isTotal(MinimizationOperations.minimize(a,
|
||||
DEFAULT_MAX_DETERMINIZED_STATES)));
|
||||
}
|
||||
|
||||
public void testMinimizeEmpty() throws Exception {
|
||||
|
@ -377,7 +391,7 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
int fini = a.createState();
|
||||
a.addTransition(init, fini, 'a');
|
||||
a.finishState();
|
||||
a = MinimizationOperations.minimize(a);
|
||||
a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
assertEquals(0, a.getNumStates());
|
||||
}
|
||||
|
||||
|
@ -387,26 +401,29 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
Automaton a3 = Automata.makeString("beebar");
|
||||
Automaton a = Operations.union(Arrays.asList(a1, a2, a3));
|
||||
if (random().nextBoolean()) {
|
||||
a = Operations.determinize(a);
|
||||
a = Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
} else if (random().nextBoolean()) {
|
||||
a = MinimizationOperations.minimize(a);
|
||||
a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
}
|
||||
assertMatches(a, "foobar", "beebar", "boobar");
|
||||
|
||||
Automaton a4 = Operations.determinize(Operations.minus(a, a2));
|
||||
Automaton a4 = Operations.determinize(Operations.minus(a, a2,
|
||||
DEFAULT_MAX_DETERMINIZED_STATES), DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
|
||||
assertTrue(Operations.run(a4, "foobar"));
|
||||
assertFalse(Operations.run(a4, "boobar"));
|
||||
assertTrue(Operations.run(a4, "beebar"));
|
||||
assertMatches(a4, "foobar", "beebar");
|
||||
|
||||
a4 = Operations.determinize(Operations.minus(a4, a1));
|
||||
a4 = Operations.determinize(Operations.minus(a4, a1,
|
||||
DEFAULT_MAX_DETERMINIZED_STATES), DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
assertFalse(Operations.run(a4, "foobar"));
|
||||
assertFalse(Operations.run(a4, "boobar"));
|
||||
assertTrue(Operations.run(a4, "beebar"));
|
||||
assertMatches(a4, "beebar");
|
||||
|
||||
a4 = Operations.determinize(Operations.minus(a4, a3));
|
||||
a4 = Operations.determinize(Operations.minus(a4, a3,
|
||||
DEFAULT_MAX_DETERMINIZED_STATES), DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
assertFalse(Operations.run(a4, "foobar"));
|
||||
assertFalse(Operations.run(a4, "boobar"));
|
||||
assertFalse(Operations.run(a4, "beebar"));
|
||||
|
@ -415,7 +432,7 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
|
||||
public void testOneInterval() throws Exception {
|
||||
Automaton a = Automata.makeInterval(999, 1032, 0);
|
||||
a = Operations.determinize(a);
|
||||
a = Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
assertTrue(Operations.run(a, "0999"));
|
||||
assertTrue(Operations.run(a, "00999"));
|
||||
assertTrue(Operations.run(a, "000999"));
|
||||
|
@ -423,7 +440,7 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
|
||||
public void testAnotherInterval() throws Exception {
|
||||
Automaton a = Automata.makeInterval(1, 2, 0);
|
||||
a = Operations.determinize(a);
|
||||
a = Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
assertTrue(Operations.run(a, "01"));
|
||||
}
|
||||
|
||||
|
@ -445,9 +462,10 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
}
|
||||
String prefix = b.toString();
|
||||
|
||||
Automaton a = Operations.determinize(Automata.makeInterval(min, max, digits));
|
||||
Automaton a = Operations.determinize(Automata.makeInterval(min, max, digits),
|
||||
DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
if (random().nextBoolean()) {
|
||||
a = MinimizationOperations.minimize(a);
|
||||
a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
}
|
||||
String mins = Integer.toString(min);
|
||||
String maxs = Integer.toString(max);
|
||||
|
@ -487,7 +505,8 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
expected.add(Util.toUTF32(s, ints));
|
||||
}
|
||||
|
||||
assertEquals(expected, Operations.getFiniteStrings(Operations.determinize(a), -1));
|
||||
assertEquals(expected, Operations.getFiniteStrings(Operations.determinize(a,
|
||||
DEFAULT_MAX_DETERMINIZED_STATES), -1));
|
||||
}
|
||||
|
||||
public void testConcatenatePreservesDet() throws Exception {
|
||||
|
@ -578,13 +597,13 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
if (VERBOSE) {
|
||||
System.out.println(" randomNoOp: determinize");
|
||||
}
|
||||
return Operations.determinize(a);
|
||||
return Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
case 1:
|
||||
if (a.getNumStates() < 100) {
|
||||
if (VERBOSE) {
|
||||
System.out.println(" randomNoOp: minimize");
|
||||
}
|
||||
return MinimizationOperations.minimize(a);
|
||||
return MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
} else {
|
||||
if (VERBOSE) {
|
||||
System.out.println(" randomNoOp: skip op=minimize: too many states (" + a.getNumStates() + ")");
|
||||
|
@ -725,7 +744,7 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
if (VERBOSE) {
|
||||
System.out.println(" op=determinize");
|
||||
}
|
||||
a = Operations.determinize(a);
|
||||
a = Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
assertTrue(a.isDeterministic());
|
||||
break;
|
||||
|
||||
|
@ -735,7 +754,7 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
System.out.println(" op=minimize");
|
||||
}
|
||||
// minimize
|
||||
a = MinimizationOperations.minimize(a);
|
||||
a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
} else if (VERBOSE) {
|
||||
System.out.println(" skip op=minimize: too many states (" + a.getNumStates() + ")");
|
||||
}
|
||||
|
@ -791,7 +810,7 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
assertTrue(removed);
|
||||
}
|
||||
Automaton a2 = unionTerms(toRemove);
|
||||
a = Operations.minus(a, a2);
|
||||
a = Operations.minus(a, a2, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
@ -831,7 +850,7 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
Automaton a2 = randomNoOp(Operations.union(as));
|
||||
a = Operations.minus(a, a2);
|
||||
a = Operations.minus(a, a2, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
}
|
||||
break;
|
||||
|
||||
|
@ -868,9 +887,9 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
|
||||
Automaton a2 = Operations.union(as);
|
||||
if (random().nextBoolean()) {
|
||||
a2 = Operations.determinize(a2);
|
||||
a2 = Operations.determinize(a2, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
} else if (random().nextBoolean()) {
|
||||
a2 = MinimizationOperations.minimize(a2);
|
||||
a2 = MinimizationOperations.minimize(a2, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
}
|
||||
a = Operations.intersection(a, a2);
|
||||
|
||||
|
@ -944,7 +963,7 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
if (VERBOSE) {
|
||||
System.out.println(" op=remove the empty string");
|
||||
}
|
||||
a = Operations.minus(a, Automata.makeEmptyString());
|
||||
a = Operations.minus(a, Automata.makeEmptyString(), DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
terms.remove(new BytesRef());
|
||||
break;
|
||||
|
||||
|
@ -1024,7 +1043,7 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
assertTrue(Operations.isFinite(a));
|
||||
assertFalse(Operations.isTotal(a));
|
||||
|
||||
Automaton detA = Operations.determinize(a);
|
||||
Automaton detA = Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
|
||||
// Make sure all terms are accepted:
|
||||
IntsRefBuilder scratch = new IntsRefBuilder();
|
||||
|
@ -1058,8 +1077,10 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
}
|
||||
|
||||
// Use sameLanguage:
|
||||
Automaton a2 = Operations.removeDeadStates(Operations.determinize(unionTerms(terms)));
|
||||
assertTrue(Operations.sameLanguage(a2, Operations.removeDeadStates(Operations.determinize(a))));
|
||||
Automaton a2 = Operations.removeDeadStates(Operations.determinize(unionTerms(terms),
|
||||
DEFAULT_MAX_DETERMINIZED_STATES));
|
||||
assertTrue(Operations.sameLanguage(a2, Operations.removeDeadStates(Operations.determinize(a,
|
||||
DEFAULT_MAX_DETERMINIZED_STATES))));
|
||||
|
||||
// Do same check, in UTF8 space
|
||||
Automaton utf8 = randomNoOp(new UTF32ToUTF8().convert(a));
|
||||
|
|
|
@ -31,14 +31,14 @@ import org.apache.lucene.util.TestUtil;
|
|||
|
||||
public class TestCompiledAutomaton extends LuceneTestCase {
|
||||
|
||||
private CompiledAutomaton build(String... strings) {
|
||||
private CompiledAutomaton build(int maxDeterminizedStates, String... strings) {
|
||||
final List<BytesRef> terms = new ArrayList<>();
|
||||
for(String s : strings) {
|
||||
terms.add(new BytesRef(s));
|
||||
}
|
||||
Collections.sort(terms);
|
||||
final Automaton a = DaciukMihovAutomatonBuilder.build(terms);
|
||||
return new CompiledAutomaton(a, true, false);
|
||||
return new CompiledAutomaton(a, true, false, maxDeterminizedStates);
|
||||
}
|
||||
|
||||
private void testFloor(CompiledAutomaton c, String input, String expected) {
|
||||
|
@ -53,8 +53,8 @@ public class TestCompiledAutomaton extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
private void testTerms(String[] terms) throws Exception {
|
||||
final CompiledAutomaton c = build(terms);
|
||||
private void testTerms(int maxDeterminizedStates, String[] terms) throws Exception {
|
||||
final CompiledAutomaton c = build(maxDeterminizedStates, terms);
|
||||
final BytesRef[] termBytes = new BytesRef[terms.length];
|
||||
for(int idx=0;idx<terms.length;idx++) {
|
||||
termBytes[idx] = new BytesRef(terms[idx]);
|
||||
|
@ -100,7 +100,7 @@ public class TestCompiledAutomaton extends LuceneTestCase {
|
|||
while(terms.size() != numTerms) {
|
||||
terms.add(randomString());
|
||||
}
|
||||
testTerms(terms.toArray(new String[terms.size()]));
|
||||
testTerms(numTerms * 100, terms.toArray(new String[terms.size()]));
|
||||
}
|
||||
|
||||
private String randomString() {
|
||||
|
@ -109,7 +109,8 @@ public class TestCompiledAutomaton extends LuceneTestCase {
|
|||
}
|
||||
|
||||
public void testBasic() throws Exception {
|
||||
CompiledAutomaton c = build("fob", "foo", "goo");
|
||||
CompiledAutomaton c = build(Operations.DEFAULT_MAX_DETERMINIZED_STATES,
|
||||
"fob", "foo", "goo");
|
||||
testFloor(c, "goo", "goo");
|
||||
testFloor(c, "ga", "foo");
|
||||
testFloor(c, "g", "foo");
|
||||
|
|
|
@ -19,6 +19,8 @@ package org.apache.lucene.util.automaton;
|
|||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
|
||||
|
||||
/**
|
||||
* Not completely thorough, but tries to test determinism correctness
|
||||
* somewhat randomly.
|
||||
|
@ -39,29 +41,32 @@ public class TestDeterminism extends LuceneTestCase {
|
|||
for (int i = 0; i < num; i++) {
|
||||
Automaton a = AutomatonTestUtil.randomAutomaton(random());
|
||||
a = AutomatonTestUtil.determinizeSimple(a);
|
||||
Automaton b = Operations.determinize(a);
|
||||
Automaton b = Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
// TODO: more verifications possible?
|
||||
assertTrue(Operations.sameLanguage(a, b));
|
||||
}
|
||||
}
|
||||
|
||||
private static void assertAutomaton(Automaton a) {
|
||||
a = Operations.determinize(Operations.removeDeadStates(a));
|
||||
a = Operations.determinize(Operations.removeDeadStates(a), DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
|
||||
// complement(complement(a)) = a
|
||||
Automaton equivalent = Operations.complement(Operations.complement(a));
|
||||
Automaton equivalent = Operations.complement(Operations.complement(a,
|
||||
DEFAULT_MAX_DETERMINIZED_STATES), DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
assertTrue(Operations.sameLanguage(a, equivalent));
|
||||
|
||||
// a union a = a
|
||||
equivalent = Operations.determinize(Operations.removeDeadStates(Operations.union(a, a)));
|
||||
equivalent = Operations.determinize(Operations.removeDeadStates(Operations.union(a, a)),
|
||||
DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
assertTrue(Operations.sameLanguage(a, equivalent));
|
||||
|
||||
// a intersect a = a
|
||||
equivalent = Operations.determinize(Operations.removeDeadStates(Operations.intersection(a, a)));
|
||||
equivalent = Operations.determinize(Operations.removeDeadStates(Operations.intersection(a, a)),
|
||||
DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
assertTrue(Operations.sameLanguage(a, equivalent));
|
||||
|
||||
// a minus a = empty
|
||||
Automaton empty = Operations.minus(a, a);
|
||||
Automaton empty = Operations.minus(a, a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
assertTrue(Operations.isEmpty(empty));
|
||||
|
||||
// as long as don't accept the empty string
|
||||
|
@ -70,7 +75,8 @@ public class TestDeterminism extends LuceneTestCase {
|
|||
//System.out.println("test " + a);
|
||||
Automaton optional = Operations.optional(a);
|
||||
//System.out.println("optional " + optional);
|
||||
equivalent = Operations.minus(optional, Automata.makeEmptyString());
|
||||
equivalent = Operations.minus(optional, Automata.makeEmptyString(),
|
||||
DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
//System.out.println("equiv " + equivalent);
|
||||
assertTrue(Operations.sameLanguage(a, equivalent));
|
||||
}
|
||||
|
|
|
@ -50,12 +50,12 @@ public class TestDeterminizeLexicon extends LuceneTestCase {
|
|||
public void assertLexicon() throws Exception {
|
||||
Collections.shuffle(automata, random());
|
||||
Automaton lex = Operations.union(automata);
|
||||
lex = Operations.determinize(lex);
|
||||
lex = Operations.determinize(lex, 1000000);
|
||||
assertTrue(Operations.isFinite(lex));
|
||||
for (String s : terms) {
|
||||
assertTrue(Operations.run(lex, s));
|
||||
}
|
||||
final ByteRunAutomaton lexByte = new ByteRunAutomaton(lex);
|
||||
final ByteRunAutomaton lexByte = new ByteRunAutomaton(lex, false, 1000000);
|
||||
for (String s : terms) {
|
||||
byte bytes[] = s.getBytes(StandardCharsets.UTF_8);
|
||||
assertTrue(lexByte.run(bytes, 0, bytes.length));
|
||||
|
|
|
@ -22,6 +22,8 @@ import java.util.List;
|
|||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
|
||||
|
||||
public class TestLevenshteinAutomata extends LuceneTestCase {
|
||||
|
||||
public void testLev0() throws Exception {
|
||||
|
@ -121,11 +123,11 @@ public class TestLevenshteinAutomata extends LuceneTestCase {
|
|||
private Automaton naiveLev1(String s) {
|
||||
Automaton a = Automata.makeString(s);
|
||||
a = Operations.union(a, insertionsOf(s));
|
||||
a = MinimizationOperations.minimize(a);
|
||||
a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
a = Operations.union(a, deletionsOf(s));
|
||||
a = MinimizationOperations.minimize(a);
|
||||
a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
a = Operations.union(a, substitutionsOf(s));
|
||||
a = MinimizationOperations.minimize(a);
|
||||
a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
|
||||
return a;
|
||||
}
|
||||
|
@ -137,7 +139,7 @@ public class TestLevenshteinAutomata extends LuceneTestCase {
|
|||
private Automaton naiveLev1T(String s) {
|
||||
Automaton a = naiveLev1(s);
|
||||
a = Operations.union(a, transpositionsOf(s));
|
||||
a = MinimizationOperations.minimize(a);
|
||||
a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
return a;
|
||||
}
|
||||
|
||||
|
@ -156,7 +158,7 @@ public class TestLevenshteinAutomata extends LuceneTestCase {
|
|||
}
|
||||
|
||||
Automaton a = Operations.union(list);
|
||||
a = MinimizationOperations.minimize(a);
|
||||
a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
return a;
|
||||
}
|
||||
|
||||
|
@ -174,7 +176,7 @@ public class TestLevenshteinAutomata extends LuceneTestCase {
|
|||
}
|
||||
|
||||
Automaton a = Operations.union(list);
|
||||
a = MinimizationOperations.minimize(a);
|
||||
a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
return a;
|
||||
}
|
||||
|
||||
|
@ -193,7 +195,7 @@ public class TestLevenshteinAutomata extends LuceneTestCase {
|
|||
}
|
||||
|
||||
Automaton a = Operations.union(list);
|
||||
a = MinimizationOperations.minimize(a);
|
||||
a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
return a;
|
||||
}
|
||||
|
||||
|
@ -218,7 +220,7 @@ public class TestLevenshteinAutomata extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
Automaton a = Operations.union(list);
|
||||
a = MinimizationOperations.minimize(a);
|
||||
a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
return a;
|
||||
}
|
||||
|
||||
|
|
|
@ -19,6 +19,8 @@ package org.apache.lucene.util.automaton;
|
|||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
|
||||
|
||||
/**
|
||||
* This test builds some randomish NFA/DFA and minimizes them.
|
||||
*/
|
||||
|
@ -28,8 +30,10 @@ public class TestMinimize extends LuceneTestCase {
|
|||
int num = atLeast(200);
|
||||
for (int i = 0; i < num; i++) {
|
||||
Automaton a = AutomatonTestUtil.randomAutomaton(random());
|
||||
Automaton la = Operations.determinize(Operations.removeDeadStates(a));
|
||||
Automaton lb = MinimizationOperations.minimize(a);
|
||||
Automaton la = Operations.determinize(Operations.removeDeadStates(a),
|
||||
DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
Automaton lb = MinimizationOperations.minimize(a,
|
||||
DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
assertTrue(Operations.sameLanguage(la, lb));
|
||||
}
|
||||
}
|
||||
|
@ -42,7 +46,8 @@ public class TestMinimize extends LuceneTestCase {
|
|||
for (int i = 0; i < num; i++) {
|
||||
Automaton a = AutomatonTestUtil.randomAutomaton(random());
|
||||
a = AutomatonTestUtil.minimizeSimple(a);
|
||||
Automaton b = MinimizationOperations.minimize(a);
|
||||
Automaton b = MinimizationOperations.minimize(a,
|
||||
DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
assertTrue(Operations.sameLanguage(a, b));
|
||||
assertEquals(a.getNumStates(), b.getNumStates());
|
||||
int numStates = a.getNumStates();
|
||||
|
@ -62,6 +67,6 @@ public class TestMinimize extends LuceneTestCase {
|
|||
|
||||
/** n^2 space usage in Hopcroft minimization? */
|
||||
public void testMinimizeHuge() {
|
||||
new RegExp("+-*(A|.....|BC)*]", RegExp.NONE).toAutomaton();
|
||||
new RegExp("+-*(A|.....|BC)*]", RegExp.NONE).toAutomaton(1000000);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -24,6 +24,8 @@ import org.apache.lucene.util.fst.Util;
|
|||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomInts;
|
||||
|
||||
import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
|
||||
|
||||
public class TestOperations extends LuceneTestCase {
|
||||
/** Test string union. */
|
||||
public void testStringUnion() {
|
||||
|
@ -51,7 +53,8 @@ public class TestOperations extends LuceneTestCase {
|
|||
for (BytesRef bref : strings) {
|
||||
eachIndividual[i++] = Automata.makeString(bref.utf8ToString());
|
||||
}
|
||||
return Operations.determinize(Operations.union(Arrays.asList(eachIndividual)));
|
||||
return Operations.determinize(Operations.union(Arrays.asList(eachIndividual)),
|
||||
DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
}
|
||||
|
||||
/** Test concatenation with empty language returns empty */
|
||||
|
@ -71,12 +74,12 @@ public class TestOperations extends LuceneTestCase {
|
|||
Automaton concat1 = Operations.concatenate(expandedSingleton, nfa);
|
||||
Automaton concat2 = Operations.concatenate(singleton, nfa);
|
||||
assertFalse(concat2.isDeterministic());
|
||||
assertTrue(Operations.sameLanguage(Operations.determinize(concat1),
|
||||
Operations.determinize(concat2)));
|
||||
assertTrue(Operations.sameLanguage(Operations.determinize(nfa),
|
||||
Operations.determinize(concat1)));
|
||||
assertTrue(Operations.sameLanguage(Operations.determinize(nfa),
|
||||
Operations.determinize(concat2)));
|
||||
assertTrue(Operations.sameLanguage(Operations.determinize(concat1, 100),
|
||||
Operations.determinize(concat2, 100)));
|
||||
assertTrue(Operations.sameLanguage(Operations.determinize(nfa, 100),
|
||||
Operations.determinize(concat1, 100)));
|
||||
assertTrue(Operations.sameLanguage(Operations.determinize(nfa, 100),
|
||||
Operations.determinize(concat2, 100)));
|
||||
}
|
||||
|
||||
public void testGetRandomAcceptedString() throws Throwable {
|
||||
|
@ -86,7 +89,7 @@ public class TestOperations extends LuceneTestCase {
|
|||
|
||||
final RegExp re = new RegExp(AutomatonTestUtil.randomRegexp(random()), RegExp.NONE);
|
||||
//System.out.println("TEST i=" + i + " re=" + re);
|
||||
final Automaton a = Operations.determinize(re.toAutomaton());
|
||||
final Automaton a = Operations.determinize(re.toAutomaton(), DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
assertFalse(Operations.isEmpty(a));
|
||||
|
||||
final AutomatonTestUtil.RandomAcceptedStrings rx = new AutomatonTestUtil.RandomAcceptedStrings(a);
|
||||
|
@ -137,7 +140,7 @@ public class TestOperations extends LuceneTestCase {
|
|||
*/
|
||||
public void testFiniteStringsBasic() {
|
||||
Automaton a = Operations.union(Automata.makeString("dog"), Automata.makeString("duck"));
|
||||
a = MinimizationOperations.minimize(a);
|
||||
a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
Set<IntsRef> strings = getFiniteStrings(a, -1, true);
|
||||
assertEquals(2, strings.size());
|
||||
IntsRefBuilder dog = new IntsRefBuilder();
|
||||
|
@ -190,7 +193,7 @@ public class TestOperations extends LuceneTestCase {
|
|||
// TODO: what other random things can we do here...
|
||||
Automaton a = Operations.union(automata);
|
||||
if (random().nextBoolean()) {
|
||||
a = MinimizationOperations.minimize(a);
|
||||
a = MinimizationOperations.minimize(a, 1000000);
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: a.minimize numStates=" + a.getNumStates());
|
||||
}
|
||||
|
@ -198,7 +201,7 @@ public class TestOperations extends LuceneTestCase {
|
|||
if (VERBOSE) {
|
||||
System.out.println("TEST: a.determinize");
|
||||
}
|
||||
a = Operations.determinize(a);
|
||||
a = Operations.determinize(a, 1000000);
|
||||
} else if (random().nextBoolean()) {
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: a.removeDeadStates");
|
||||
|
|
|
@ -0,0 +1,58 @@
|
|||
package org.apache.lucene.util.automaton;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
public class TestRegExp extends LuceneTestCase {
|
||||
|
||||
/**
|
||||
* Simple smoke test for regular expression.
|
||||
*/
|
||||
public void testSmoke() {
|
||||
RegExp r = new RegExp("a(b+|c+)d");
|
||||
Automaton a = r.toAutomaton();
|
||||
assertTrue(a.isDeterministic());
|
||||
CharacterRunAutomaton run = new CharacterRunAutomaton(a);
|
||||
assertTrue(run.run("abbbbbd"));
|
||||
assertTrue(run.run("acd"));
|
||||
assertFalse(run.run("ad"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Compiles a regular expression that is prohibitively expensive to
|
||||
* determinize and expexts to catch an exception for it.
|
||||
*/
|
||||
public void testDeterminizeTooManyStates() {
|
||||
// LUCENE-6046
|
||||
String source = "[ac]*a[ac]{50,200}";
|
||||
try {
|
||||
new RegExp(source).toAutomaton();
|
||||
fail();
|
||||
} catch (TooComplexToDeterminizeException e) {
|
||||
assert(e.getMessage().contains(source));
|
||||
}
|
||||
}
|
||||
|
||||
// LUCENE-6046
|
||||
public void testRepeatWithEmptyString() throws Exception {
|
||||
Automaton a = new RegExp("[^y]*{1,2}").toAutomaton(1000);
|
||||
// paranoia:
|
||||
assertTrue(a.toString().length() > 0);
|
||||
}
|
||||
}
|
|
@ -25,6 +25,7 @@ import java.util.Set;
|
|||
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.ConstantScoreQuery;
|
||||
import org.apache.lucene.search.DocIdSet;
|
||||
|
@ -32,7 +33,6 @@ import org.apache.lucene.search.Filter;
|
|||
import org.apache.lucene.search.FilteredQuery;
|
||||
import org.apache.lucene.search.PrefixQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.search.RegexpQuery;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.TermRangeQuery;
|
||||
|
@ -903,7 +903,7 @@ public class FieldQueryTest extends AbstractTestCase {
|
|||
public void testRegexpQuery() throws Exception {
|
||||
makeIndexStrMV();
|
||||
Term term = new Term(F, "d[a-z].g");
|
||||
defgMultiTermQueryTest(new RegexpQuery (term));
|
||||
defgMultiTermQueryTest(new RegexpQuery(term));
|
||||
}
|
||||
|
||||
public void testRangeQuery() throws Exception {
|
||||
|
|
|
@ -33,6 +33,9 @@ import org.apache.lucene.search.*;
|
|||
import org.apache.lucene.search.BooleanQuery.TooManyClauses;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.QueryBuilder;
|
||||
import org.apache.lucene.util.automaton.RegExp;
|
||||
|
||||
import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
|
||||
|
||||
/** This class is overridden by QueryParser in QueryParser.jj
|
||||
* and acts to separate the majority of the Java code from the .jj grammar file.
|
||||
|
@ -81,6 +84,7 @@ public abstract class QueryParserBase extends QueryBuilder implements CommonQuer
|
|||
boolean analyzeRangeTerms = false;
|
||||
|
||||
boolean autoGeneratePhraseQueries;
|
||||
int maxDeterminizedStates = DEFAULT_MAX_DETERMINIZED_STATES;
|
||||
|
||||
// So the generated QueryParser(CharStream) won't error out
|
||||
protected QueryParserBase() {
|
||||
|
@ -398,6 +402,24 @@ public abstract class QueryParserBase extends QueryBuilder implements CommonQuer
|
|||
return analyzeRangeTerms;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param maxDeterminizedStates the maximum number of states that
|
||||
* determinizing a regexp query can result in. If the query results in any
|
||||
* more states a TooComplexToDeterminizeException is thrown.
|
||||
*/
|
||||
public void setMaxDeterminizedStates(int maxDeterminizedStates) {
|
||||
this.maxDeterminizedStates = maxDeterminizedStates;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the maximum number of states that determinizing a regexp query
|
||||
* can result in. If the query results in any more states a
|
||||
* TooComplexToDeterminizeException is thrown.
|
||||
*/
|
||||
public int getMaxDeterminizedStates() {
|
||||
return maxDeterminizedStates;
|
||||
}
|
||||
|
||||
protected void addClause(List<BooleanClause> clauses, int conj, int mods, Query q) {
|
||||
boolean required, prohibited;
|
||||
|
||||
|
@ -553,7 +575,8 @@ public abstract class QueryParserBase extends QueryBuilder implements CommonQuer
|
|||
* @return new RegexpQuery instance
|
||||
*/
|
||||
protected Query newRegexpQuery(Term regexp) {
|
||||
RegexpQuery query = new RegexpQuery(regexp);
|
||||
RegexpQuery query = new RegexpQuery(regexp, RegExp.ALL,
|
||||
maxDeterminizedStates);
|
||||
query.setRewriteMethod(multiTermRewriteMethod);
|
||||
return query;
|
||||
}
|
||||
|
|
|
@ -38,6 +38,7 @@ public class RegexpQueryNodeBuilder implements StandardQueryBuilder {
|
|||
public RegexpQuery build(QueryNode queryNode) throws QueryNodeException {
|
||||
RegexpQueryNode regexpNode = (RegexpQueryNode) queryNode;
|
||||
|
||||
// TODO: make the maxStates configurable w/ a reasonable default (QueryParserBase uses 10000)
|
||||
RegexpQuery q = new RegexpQuery(new Term(regexpNode.getFieldAsString(),
|
||||
regexpNode.textToBytesRef()));
|
||||
|
||||
|
|
|
@ -24,9 +24,9 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
import org.apache.lucene.index.IndexReaderContext;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.ReaderUtil;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
|
@ -40,6 +40,8 @@ import org.apache.lucene.util.automaton.Automaton;
|
|||
import org.apache.lucene.util.automaton.Operations;
|
||||
import org.apache.lucene.util.automaton.Transition;
|
||||
|
||||
import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
|
||||
|
||||
// TODO
|
||||
// - compare perf to PhraseQuery exact and sloppy
|
||||
// - optimize: find terms that are in fact MUST (because all paths
|
||||
|
@ -108,6 +110,16 @@ public class TermAutomatonQuery extends Query {
|
|||
|
||||
/** Call this once you are done adding states/transitions. */
|
||||
public void finish() {
|
||||
finish(DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
}
|
||||
|
||||
/**
|
||||
* Call this once you are done adding states/transitions.
|
||||
* @param maxDeterminizedStates Maximum number of states created when
|
||||
* determinizing the automaton. Higher numbers allow this operation to
|
||||
* consume more memory but allow more complex automatons.
|
||||
*/
|
||||
public void finish(int maxDeterminizedStates) {
|
||||
Automaton automaton = builder.finish();
|
||||
|
||||
// System.out.println("before det:\n" + automaton.toDot());
|
||||
|
@ -171,7 +183,8 @@ public class TermAutomatonQuery extends Query {
|
|||
automaton = newAutomaton;
|
||||
}
|
||||
|
||||
det = Operations.removeDeadStates(Operations.determinize(automaton));
|
||||
det = Operations.removeDeadStates(Operations.determinize(automaton,
|
||||
maxDeterminizedStates));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -62,6 +62,8 @@ import org.apache.lucene.util.fst.Util;
|
|||
import org.apache.lucene.util.fst.Util.Result;
|
||||
import org.apache.lucene.util.fst.Util.TopResults;
|
||||
|
||||
import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
|
||||
|
||||
/**
|
||||
* Suggester that first analyzes the surface form, adds the
|
||||
* analyzed form to a weighted FST, and then does the same
|
||||
|
@ -898,7 +900,7 @@ public class AnalyzingSuggester extends Lookup {
|
|||
|
||||
// TODO: we can optimize this somewhat by determinizing
|
||||
// while we convert
|
||||
automaton = Operations.determinize(automaton);
|
||||
automaton = Operations.determinize(automaton, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
return automaton;
|
||||
}
|
||||
|
||||
|
|
|
@ -17,7 +17,6 @@ package org.apache.lucene.search.suggest.analyzing;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
@ -30,13 +29,15 @@ import org.apache.lucene.util.BytesRef;
|
|||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util.automaton.Automata;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
import org.apache.lucene.util.automaton.LevenshteinAutomata;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.LevenshteinAutomata;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
import org.apache.lucene.util.automaton.UTF32ToUTF8;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.fst.PairOutputs.Pair;
|
||||
|
||||
import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
|
||||
|
||||
/**
|
||||
* Implements a fuzzy {@link AnalyzingSuggester}. The similarity measurement is
|
||||
* based on the Damerau-Levenshtein (optimal string alignment) algorithm, though
|
||||
|
@ -205,7 +206,7 @@ public final class FuzzySuggester extends AnalyzingSuggester {
|
|||
protected Automaton convertAutomaton(Automaton a) {
|
||||
if (unicodeAware) {
|
||||
Automaton utf8automaton = new UTF32ToUTF8().convert(a);
|
||||
utf8automaton = Operations.determinize(utf8automaton);
|
||||
utf8automaton = Operations.determinize(utf8automaton, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
return utf8automaton;
|
||||
} else {
|
||||
return a;
|
||||
|
@ -253,7 +254,7 @@ public final class FuzzySuggester extends AnalyzingSuggester {
|
|||
Automaton a = Operations.union(Arrays.asList(subs));
|
||||
// TODO: we could call toLevenshteinAutomata() before det?
|
||||
// this only happens if you have multiple paths anyway (e.g. synonyms)
|
||||
return Operations.determinize(a);
|
||||
return Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -40,6 +40,11 @@ import org.apache.lucene.util.UnicodeUtil;
|
|||
* basic unoptimized implementations (*slow) for testing.
|
||||
*/
|
||||
public class AutomatonTestUtil {
|
||||
/**
|
||||
* Default maximum number of states that {@link Operations#determinize} should create.
|
||||
*/
|
||||
public static final int DEFAULT_MAX_DETERMINIZED_STATES = 1000000;
|
||||
|
||||
/** Returns random string, including full unicode range. */
|
||||
public static String randomRegexp(Random r) {
|
||||
while (true) {
|
||||
|
@ -257,12 +262,12 @@ public class AutomatonTestUtil {
|
|||
// get two random Automata from regexps
|
||||
Automaton a1 = new RegExp(AutomatonTestUtil.randomRegexp(random), RegExp.NONE).toAutomaton();
|
||||
if (random.nextBoolean()) {
|
||||
a1 = Operations.complement(a1);
|
||||
a1 = Operations.complement(a1, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
}
|
||||
|
||||
Automaton a2 = new RegExp(AutomatonTestUtil.randomRegexp(random), RegExp.NONE).toAutomaton();
|
||||
if (random.nextBoolean()) {
|
||||
a2 = Operations.complement(a2);
|
||||
a2 = Operations.complement(a2, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
}
|
||||
|
||||
// combine them in random ways
|
||||
|
@ -270,7 +275,7 @@ public class AutomatonTestUtil {
|
|||
case 0: return Operations.concatenate(a1, a2);
|
||||
case 1: return Operations.union(a1, a2);
|
||||
case 2: return Operations.intersection(a1, a2);
|
||||
default: return Operations.minus(a1, a2);
|
||||
default: return Operations.minus(a1, a2, DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -41,8 +41,8 @@ import org.apache.lucene.util.QueryBuilder;
|
|||
import org.apache.lucene.util.ToStringUtils;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.apache.lucene.util.automaton.Automata;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
import org.apache.solr.analysis.ReversedWildcardFilterFactory;
|
||||
import org.apache.solr.analysis.TokenizerChain;
|
||||
import org.apache.solr.common.SolrException;
|
||||
|
@ -788,7 +788,7 @@ public abstract class SolrQueryParserBase extends QueryBuilder {
|
|||
Automata.makeChar(factory.getMarkerChar()),
|
||||
Automata.makeAnyString());
|
||||
// subtract these away
|
||||
automaton = Operations.minus(automaton, falsePositives);
|
||||
automaton = Operations.minus(automaton, falsePositives, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
}
|
||||
return new AutomatonQuery(term, automaton) {
|
||||
// override toString so it's completely transparent
|
||||
|
|
|
@ -16,9 +16,7 @@ package org.apache.solr.analysis;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.lang.reflect.Field;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
|
@ -26,8 +24,8 @@ import org.apache.lucene.analysis.Analyzer;
|
|||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.search.AutomatonQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.apache.solr.request.SolrQueryRequest;
|
||||
import org.apache.solr.schema.IndexSchema;
|
||||
|
@ -161,7 +159,8 @@ public class TestReversedWildcardFilterFactory extends SolrTestCaseJ4 {
|
|||
return false;
|
||||
}
|
||||
Automaton automaton = ((AutomatonQuery) q).getAutomaton();
|
||||
String prefix = Operations.getCommonPrefix(Operations.determinize(automaton));
|
||||
String prefix = Operations.getCommonPrefix(Operations.determinize(automaton,
|
||||
Operations.DEFAULT_MAX_DETERMINIZED_STATES));
|
||||
return prefix.length() > 0 && prefix.charAt(0) == '\u0001';
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue