mirror of https://github.com/apache/lucene.git
LUCENE-5752: improve tests; move isEmpty out of LA into BasicOps; BasicOps.sameLanguage requires no dead states; rename LA.finish -> finishState
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5752@1602966 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5927445bb6
commit
ce2f7e9c8c
|
@ -35,8 +35,6 @@ import org.apache.lucene.util.AttributeSource;
|
|||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util.automaton.BasicAutomata;
|
||||
import org.apache.lucene.util.automaton.BasicOperations;
|
||||
import org.apache.lucene.util.automaton.ByteRunAutomaton;
|
||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||
import org.apache.lucene.util.automaton.LevenshteinAutomata;
|
||||
|
@ -154,8 +152,6 @@ public class FuzzyTermsEnum extends TermsEnum {
|
|||
if (editDistance < runAutomata.size()) {
|
||||
//System.out.println("FuzzyTE.getAEnum: ed=" + editDistance + " lastTerm=" + (lastTerm==null ? "null" : lastTerm.utf8ToString()));
|
||||
final CompiledAutomaton compiled = runAutomata.get(editDistance);
|
||||
compiled.lightAutomaton.writeDot("compiled");
|
||||
|
||||
return new AutomatonFuzzyTermsEnum(terms.intersect(compiled, lastTerm == null ? null : compiled.floor(lastTerm, new BytesRef())),
|
||||
runAutomata.subList(0, editDistance + 1).toArray(new CompiledAutomaton[editDistance + 1]));
|
||||
} else {
|
||||
|
|
|
@ -47,7 +47,7 @@ final public class BasicAutomata {
|
|||
*/
|
||||
public static LightAutomaton makeEmptyLight() {
|
||||
LightAutomaton a = new LightAutomaton();
|
||||
a.finish();
|
||||
a.finishState();
|
||||
return a;
|
||||
}
|
||||
|
||||
|
@ -69,17 +69,10 @@ final public class BasicAutomata {
|
|||
int s = a.createState();
|
||||
a.setAccept(s, true);
|
||||
a.addTransition(s, s, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
|
||||
a.finish();
|
||||
a.finishState();
|
||||
return a;
|
||||
}
|
||||
|
||||
public static int appendAnyString(LightAutomaton a, int state) {
|
||||
int newState = a.createState();
|
||||
a.addTransition(state, newState, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
|
||||
a.addTransition(newState, newState, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
|
||||
return newState;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a new (deterministic) automaton that accepts any single codepoint.
|
||||
*/
|
||||
|
@ -120,7 +113,7 @@ final public class BasicAutomata {
|
|||
int s2 = a.createState();
|
||||
a.setAccept(s2, true);
|
||||
a.addTransition(s1, s2, min, max);
|
||||
a.finish();
|
||||
a.finishState();
|
||||
return a;
|
||||
}
|
||||
|
||||
|
@ -260,10 +253,11 @@ final public class BasicAutomata {
|
|||
LightAutomaton a1 = builder.finish();
|
||||
|
||||
if (digits <= 0) {
|
||||
a1.addTransition(0, 0, '0');
|
||||
for (int p : initials) {
|
||||
a1.addEpsilon(0, p);
|
||||
}
|
||||
a1.finish();
|
||||
a1.finishState();
|
||||
}
|
||||
|
||||
return a1;
|
||||
|
@ -284,7 +278,10 @@ final public class BasicAutomata {
|
|||
}
|
||||
|
||||
a.setAccept(lastState, true);
|
||||
a.finish();
|
||||
a.finishState();
|
||||
|
||||
assert a.isDeterministic();
|
||||
assert BasicOperations.hasDeadStates(a) == false;
|
||||
|
||||
return a;
|
||||
}
|
||||
|
@ -303,7 +300,7 @@ final public class BasicAutomata {
|
|||
s = s2;
|
||||
}
|
||||
a.setAccept(s, true);
|
||||
a.finish();
|
||||
a.finishState();
|
||||
|
||||
return a;
|
||||
}
|
||||
|
|
|
@ -60,7 +60,6 @@ final public class BasicOperations {
|
|||
* Complexity: linear in total number of states.
|
||||
*/
|
||||
static public LightAutomaton concatenateLight(LightAutomaton a1, LightAutomaton a2) {
|
||||
// nocommit we lost the two-arg optimization here (prepend tiny automaton in front of huge one)
|
||||
return concatenateLight(Arrays.asList(a1, a2));
|
||||
}
|
||||
|
||||
|
@ -75,6 +74,10 @@ final public class BasicOperations {
|
|||
|
||||
// First pass: create all states
|
||||
for(LightAutomaton a : l) {
|
||||
if (a.getNumStates() == 0) {
|
||||
result.finishState();
|
||||
return result;
|
||||
}
|
||||
int numStates = a.getNumStates();
|
||||
for(int s=0;s<numStates;s++) {
|
||||
result.createState();
|
||||
|
@ -133,7 +136,7 @@ final public class BasicOperations {
|
|||
result.createState();
|
||||
}
|
||||
|
||||
result.finish();
|
||||
result.finishState();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
@ -148,28 +151,11 @@ final public class BasicOperations {
|
|||
LightAutomaton result = new LightAutomaton();
|
||||
result.createState();
|
||||
result.setAccept(0, true);
|
||||
int numStates = a.getNumStates();
|
||||
for(int i=0;i<numStates;i++) {
|
||||
result.createState();
|
||||
result.setAccept(i+1, a.isAccept(i));
|
||||
if (a.getNumStates() > 0) {
|
||||
result.copy(a);
|
||||
result.addEpsilon(0, 1);
|
||||
}
|
||||
|
||||
Transition t = new Transition();
|
||||
int count = a.initTransition(0, t);
|
||||
for(int i=0;i<count;i++) {
|
||||
a.getNextTransition(t);
|
||||
result.addTransition(0, 1+t.dest, t.min, t.max);
|
||||
}
|
||||
|
||||
for(int i=0;i<numStates;i++) {
|
||||
count = a.initTransition(i, t);
|
||||
for(int j=0;j<count;j++) {
|
||||
a.getNextTransition(t);
|
||||
result.addTransition(1+t.source, 1+t.dest, t.min, t.max);
|
||||
}
|
||||
}
|
||||
|
||||
result.finish();
|
||||
result.finishState();
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -207,8 +193,6 @@ final public class BasicOperations {
|
|||
return builder.finish();
|
||||
}
|
||||
|
||||
// nocommit move to AutomatonTestUtil
|
||||
|
||||
/**
|
||||
* Returns an automaton that accepts <code>min</code> or more concatenated
|
||||
* repetitions of the language of the given automaton.
|
||||
|
@ -268,7 +252,7 @@ final public class BasicOperations {
|
|||
}
|
||||
}
|
||||
|
||||
b.finish();
|
||||
b.finishState();
|
||||
|
||||
return b;
|
||||
}
|
||||
|
@ -355,99 +339,57 @@ final public class BasicOperations {
|
|||
}
|
||||
}
|
||||
}
|
||||
c.finish();
|
||||
c.finishState();
|
||||
|
||||
return removeDeadStates(c);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an automaton that accepts the intersection of the languages of the
|
||||
* given automata. Never modifies the input automata languages.
|
||||
* <p>
|
||||
* Complexity: quadratic in number of states.
|
||||
*/
|
||||
/*
|
||||
// nocommit broken
|
||||
static public LightAutomaton intersectionLight(LightAutomaton a1, LightAutomaton a2) {
|
||||
if (a1 == a2) {
|
||||
return a1;
|
||||
}
|
||||
LightAutomaton result = new LightAutomaton();
|
||||
result.createState();
|
||||
//Transition[][] transitions1 = a1.getSortedTransitions();
|
||||
//Transition[][] transitions2 = a2.getSortedTransitions();
|
||||
LinkedList<LightStatePair> worklist = new LinkedList<>();
|
||||
HashMap<LightStatePair,LightStatePair> newstates = new HashMap<>();
|
||||
LightStatePair p = new LightStatePair(0, 0, 0);
|
||||
worklist.add(p);
|
||||
newstates.put(p, p);
|
||||
LightAutomaton.Transition t1 = new LightAutomaton.Transition();
|
||||
LightAutomaton.Transition t2 = new LightAutomaton.Transition();
|
||||
while (worklist.size() > 0) {
|
||||
p = worklist.removeFirst();
|
||||
result.setAccept(p.s, a1.isAccept(p.s1) && a2.isAccept(p.s2));
|
||||
int numT1 = a1.initTransition(p.s1, t1);
|
||||
if (numT1 > 0) {
|
||||
a1.getNextTransition(t1);
|
||||
}
|
||||
int numT2 = a2.initTransition(p.s2, t2);
|
||||
if (numT2 > 0) {
|
||||
a2.getNextTransition(t2);
|
||||
}
|
||||
//Transition[] t1 = transitions1[p.s1.number];
|
||||
//Transition[] t2 = transitions2[p.s2.number];
|
||||
for (int n1 = 0, b2 = 0; n1 < numT1; n1++) {
|
||||
while (b2 < numT2 && t2.max < t1.min) {
|
||||
b2++;
|
||||
if (b2 < numT2) {
|
||||
a2.getNextTransition(t2);
|
||||
}
|
||||
}
|
||||
for (int n2 = b2; n2 < numT2 && t1.max >= t2.min; n2++) {
|
||||
if (t2.max >= t1.min) {
|
||||
LightStatePair q = new LightStatePair(t1.dest, t2.dest);
|
||||
LightStatePair r = newstates.get(q);
|
||||
if (r == null) {
|
||||
q.s = result.createState();
|
||||
worklist.add(q);
|
||||
newstates.put(q, q);
|
||||
r = q;
|
||||
}
|
||||
int min = t1.min > t2.min ? t1.min : t2.min;
|
||||
int max = t1.max < t2.max ? t1.max : t2.max;
|
||||
result.addTransition(p.s, r.s, min, max);
|
||||
}
|
||||
if (n2 < numT2-1) {
|
||||
a2.getNextTransition(t2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
result.finish();
|
||||
|
||||
return result.removeDeadTransitions();
|
||||
}
|
||||
*/
|
||||
|
||||
/** Returns true if these two automata accept exactly the
|
||||
* same language. This is a costly computation! Note
|
||||
* also that a1 and a2 will be determinized as a side
|
||||
* effect. Both automata must be determinized first! */
|
||||
* effect. Both automata must be determinized and have
|
||||
* no dead states! */
|
||||
public static boolean sameLanguage(LightAutomaton a1, LightAutomaton a2) {
|
||||
if (a1 == a2) {
|
||||
return true;
|
||||
}
|
||||
if (a1.isEmpty() && a2.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return subsetOf(a2, a1) && subsetOf(a1, a2);
|
||||
}
|
||||
|
||||
// TODO: move to test-framework?
|
||||
/** Returns true if this automaton has any states that cannot
|
||||
* be reached from the initial state or cannot reach an accept state.
|
||||
* Cost is O(numTransitions+numStates). */
|
||||
public static boolean hasDeadStates(LightAutomaton a) {
|
||||
BitSet liveStates = getLiveStates(a);
|
||||
int numLive = liveStates.cardinality();
|
||||
int numStates = a.getNumStates();
|
||||
assert numLive <= numStates: "numLive=" + numLive + " numStates=" + numStates + " " + liveStates;
|
||||
return numLive < numStates;
|
||||
}
|
||||
|
||||
// TODO: move to test-framework?
|
||||
/** Returns true if there are dead states reachable from the initial state, i.e. reachable states that cannot reach any accept state. */
|
||||
public static boolean hasDeadStatesFromInitial(LightAutomaton a) {
|
||||
BitSet reachableFromInitial = getLiveStatesFromInitial(a);
|
||||
BitSet reachableFromAccept = getLiveStatesToAccept(a);
|
||||
reachableFromInitial.andNot(reachableFromAccept);
|
||||
return reachableFromInitial.isEmpty() == false;
|
||||
}
|
||||
|
||||
// TODO: move to test-framework?
|
||||
/** Returns true if there are dead states that can reach an accept state, i.e. such states that are not reachable from the initial state. */
|
||||
public static boolean hasDeadStatesToAccept(LightAutomaton a) {
|
||||
BitSet reachableFromInitial = getLiveStatesFromInitial(a);
|
||||
BitSet reachableFromAccept = getLiveStatesToAccept(a);
|
||||
reachableFromAccept.andNot(reachableFromInitial);
|
||||
return reachableFromAccept.isEmpty() == false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the language of <code>a1</code> is a subset of the language
|
||||
* of <code>a2</code>. Both automata must be determinized.
|
||||
* of <code>a2</code>. Both automata must be determinized and must have no dead
|
||||
* states.
|
||||
* <p>
|
||||
* Complexity: quadratic in number of states.
|
||||
*/
|
||||
|
@ -458,6 +400,15 @@ final public class BasicOperations {
|
|||
if (a2.isDeterministic() == false) {
|
||||
throw new IllegalArgumentException("a2 must be deterministic");
|
||||
}
|
||||
assert hasDeadStatesFromInitial(a1) == false;
|
||||
assert hasDeadStatesFromInitial(a2) == false;
|
||||
if (a1.getNumStates() == 0) {
|
||||
// Empty language is always a subset of any other language
|
||||
return true;
|
||||
} else if (a2.getNumStates() == 0) {
|
||||
return isEmpty(a1);
|
||||
}
|
||||
|
||||
// TODO: cutover to iterators instead
|
||||
Transition[][] transitions1 = a1.getSortedTransitions();
|
||||
Transition[][] transitions2 = a2.getSortedTransitions();
|
||||
|
@ -503,75 +454,6 @@ final public class BasicOperations {
|
|||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the language of <code>a1</code> is a subset of the language
|
||||
* of <code>a2</code>. Both automata must be determinized.
|
||||
* <p>
|
||||
* Complexity: quadratic in number of states.
|
||||
*/
|
||||
/*
|
||||
// nocommit low GC but broken!
|
||||
public static boolean subsetOf(LightAutomaton a1, LightAutomaton a2) {
|
||||
if (a1 == a2) return true;
|
||||
LinkedList<LightStatePair> worklist = new LinkedList<>();
|
||||
HashSet<LightStatePair> visited = new HashSet<>();
|
||||
LightStatePair p = new LightStatePair(0, 0);
|
||||
worklist.add(p);
|
||||
visited.add(p);
|
||||
LightAutomaton.Transition t1 = new LightAutomaton.Transition();
|
||||
LightAutomaton.Transition t2 = new LightAutomaton.Transition();
|
||||
while (worklist.size() > 0) {
|
||||
p = worklist.removeFirst();
|
||||
System.out.println("pop s1=" + p.s1 + " s2=" + p.s2);
|
||||
if (a1.isAccept(p.s1) && a2.isAccept(p.s2) == false) {
|
||||
return false;
|
||||
}
|
||||
|
||||
int numT1 = a1.initTransition(p.s1, t1);
|
||||
for (int n1 = 0, b2 = 0; n1 < numT1; n1++) {
|
||||
int numT2 = a2.initTransition(p.s2, t2);
|
||||
if (numT2 > 0) {
|
||||
a2.getNextTransition(t2);
|
||||
}
|
||||
|
||||
a1.getNextTransition(t1);
|
||||
while (b2 < numT2 && t2.max < t1.min) {
|
||||
b2++;
|
||||
if (b2 < numT2) {
|
||||
a2.getNextTransition(t2);
|
||||
}
|
||||
}
|
||||
|
||||
int min1 = t1.min, max1 = t1.max;
|
||||
|
||||
for (int n2 = b2; n2 < numT2 && t1.max >= t2.min; n2++) {
|
||||
if (t2.min > min1) {
|
||||
return false;
|
||||
}
|
||||
if (t2.max < Character.MAX_CODE_POINT) {
|
||||
min1 = t2.max + 1;
|
||||
} else {
|
||||
min1 = Character.MAX_CODE_POINT;
|
||||
max1 = Character.MIN_CODE_POINT;
|
||||
}
|
||||
LightStatePair q = new LightStatePair(t1.dest, t2.dest);
|
||||
if (!visited.contains(q)) {
|
||||
worklist.add(q);
|
||||
visited.add(q);
|
||||
}
|
||||
if (n2 < numT2-1) {
|
||||
a2.getNextTransition(t2);
|
||||
}
|
||||
}
|
||||
if (min1 <= max1) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
*/
|
||||
|
||||
/**
|
||||
* Returns an automaton that accepts the union of the languages of the given
|
||||
* automata.
|
||||
|
@ -582,64 +464,6 @@ final public class BasicOperations {
|
|||
return unionLight(Arrays.asList(a1, a2));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an automaton that accepts the union of the languages of the given
|
||||
* automata.
|
||||
* <p>
|
||||
* Complexity: linear in number of states.
|
||||
*/
|
||||
/*
|
||||
public static LightAutomaton unionLight(Collection<LightAutomaton> l) {
|
||||
LightAutomaton result = new LightAutomaton();
|
||||
// Create initial node:
|
||||
result.createState();
|
||||
int stateOffset = 1;
|
||||
|
||||
// First pass, adding all states epsilon transitions:
|
||||
LightAutomaton.Transition t = new LightAutomaton.Transition();
|
||||
for(LightAutomaton a : l) {
|
||||
int numStates = a.getNumStates();
|
||||
if (a.isAccept(0)) {
|
||||
// If any automaton accepts empty string, we do too:
|
||||
result.setAccept(0, true);
|
||||
}
|
||||
|
||||
for(int s=0;s<numStates;s++) {
|
||||
int state = result.createState();
|
||||
result.setAccept(state, a.isAccept(s));
|
||||
}
|
||||
|
||||
// Add epsilon transition from new initial state to this automaton's initial state:
|
||||
int numTransitions = a.initTransition(0, t);
|
||||
for(int i=0;i<numTransitions;i++) {
|
||||
a.getNextTransition(t);
|
||||
result.addTransition(0, stateOffset + t.dest, t.min, t.max);
|
||||
}
|
||||
|
||||
stateOffset += numStates;
|
||||
}
|
||||
|
||||
// Second pass, copying over all other transitions:
|
||||
stateOffset = 1;
|
||||
for(LightAutomaton a : l) {
|
||||
int numStates = a.getNumStates();
|
||||
for(int s=0;s<numStates;s++) {
|
||||
int numTransitions = a.initTransition(s, t);
|
||||
for(int i=0;i<numTransitions;i++) {
|
||||
a.getNextTransition(t);
|
||||
result.addTransition(stateOffset + s, stateOffset + t.dest, t.min, t.max);
|
||||
}
|
||||
}
|
||||
|
||||
stateOffset += numStates;
|
||||
}
|
||||
|
||||
result.finish();
|
||||
|
||||
return result;
|
||||
}
|
||||
*/
|
||||
|
||||
public static LightAutomaton unionLight(Collection<LightAutomaton> l) {
|
||||
LightAutomaton result = new LightAutomaton();
|
||||
|
||||
|
@ -662,7 +486,7 @@ final public class BasicOperations {
|
|||
stateOffset += a.getNumStates();
|
||||
}
|
||||
|
||||
result.finish();
|
||||
result.finishState();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
@ -804,9 +628,11 @@ final public class BasicOperations {
|
|||
*/
|
||||
public static LightAutomaton determinize(LightAutomaton a) {
|
||||
if (a.isDeterministic()) {
|
||||
// Already determinized
|
||||
return a;
|
||||
}
|
||||
if (a.getNumStates() == 0) {
|
||||
if (a.getNumStates() <= 1) {
|
||||
// Already determinized
|
||||
return a;
|
||||
}
|
||||
|
||||
|
@ -932,11 +758,45 @@ final public class BasicOperations {
|
|||
* Returns true if the given automaton accepts no strings.
|
||||
*/
|
||||
public static boolean isEmpty(LightAutomaton a) {
|
||||
return a.isAccept(0) == false && a.getNumTransitions(0) == 0;
|
||||
if (a.getNumStates() == 0) {
|
||||
// Common case: no states
|
||||
return true;
|
||||
}
|
||||
if (a.isAccept(0) == false && a.getNumTransitions(0) == 0) {
|
||||
// Common case: just one initial state
|
||||
return true;
|
||||
}
|
||||
if (a.isAccept(0) == true) {
|
||||
// Apparently common case: it accepts the damned empty string
|
||||
return false;
|
||||
}
|
||||
|
||||
LinkedList<Integer> workList = new LinkedList<>();
|
||||
BitSet seen = new BitSet(a.getNumStates());
|
||||
workList.add(0);
|
||||
seen.set(0);
|
||||
|
||||
Transition t = new Transition();
|
||||
while (workList.isEmpty() == false) {
|
||||
int state = workList.removeFirst();
|
||||
if (a.isAccept(state)) {
|
||||
return false;
|
||||
}
|
||||
int count = a.initTransition(state, t);
|
||||
for(int i=0;i<count;i++) {
|
||||
a.getNextTransition(t);
|
||||
if (seen.get(t.dest) == false) {
|
||||
workList.add(t.dest);
|
||||
seen.set(t.dest);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the given automaton accepts all strings.
|
||||
* Returns true if the given automaton accepts all strings. The automaton must be minimized.
|
||||
*/
|
||||
public static boolean isTotal(LightAutomaton a) {
|
||||
if (a.isAccept(0) && a.getNumTransitions(0) == 1) {
|
||||
|
@ -993,25 +853,18 @@ final public class BasicOperations {
|
|||
* reachable from it and if it is reachable from the initial state.
|
||||
*/
|
||||
private static BitSet getLiveStates(LightAutomaton a) {
|
||||
int numStates = a.getNumStates();
|
||||
BitSet reachableFromInitial = getLiveStatesFromInitial(a);
|
||||
BitSet reachableFromAccept = getLiveStatesFromInitial(SpecialOperations.reverse(a));
|
||||
for(int acceptState : a.getAcceptStates()) {
|
||||
reachableFromAccept.set(1+acceptState);
|
||||
}
|
||||
|
||||
for(int i=0;i<numStates;i++) {
|
||||
if (reachableFromAccept.get(i+1) == false) {
|
||||
reachableFromInitial.clear(i);
|
||||
}
|
||||
}
|
||||
return reachableFromInitial;
|
||||
BitSet live = getLiveStatesFromInitial(a);
|
||||
live.and(getLiveStatesToAccept(a));
|
||||
return live;
|
||||
}
|
||||
|
||||
/** Returns bitset marking states reachable from the initial node. */
|
||||
/** Returns bitset marking states reachable from the initial state. */
|
||||
private static BitSet getLiveStatesFromInitial(LightAutomaton a) {
|
||||
int numStates = a.getNumStates();
|
||||
BitSet live = new BitSet(numStates);
|
||||
if (numStates == 0) {
|
||||
return live;
|
||||
}
|
||||
LinkedList<Integer> workList = new LinkedList<>();
|
||||
live.set(0);
|
||||
workList.add(0);
|
||||
|
@ -1032,6 +885,47 @@ final public class BasicOperations {
|
|||
return live;
|
||||
}
|
||||
|
||||
/** Returns bitset marking states that can reach an accept state. */
|
||||
private static BitSet getLiveStatesToAccept(LightAutomaton a) {
|
||||
LightAutomaton.Builder builder = new LightAutomaton.Builder();
|
||||
|
||||
// NOTE: not quite the same thing as what SpecialOperations.reverse does:
|
||||
Transition t = new Transition();
|
||||
int numStates = a.getNumStates();
|
||||
for(int s=0;s<numStates;s++) {
|
||||
builder.createState();
|
||||
}
|
||||
for(int s=0;s<numStates;s++) {
|
||||
int count = a.initTransition(s, t);
|
||||
for(int i=0;i<count;i++) {
|
||||
a.getNextTransition(t);
|
||||
builder.addTransition(t.dest, s, t.min, t.max);
|
||||
}
|
||||
}
|
||||
LightAutomaton a2 = builder.finish();
|
||||
|
||||
LinkedList<Integer> workList = new LinkedList<>();
|
||||
BitSet live = new BitSet(numStates);
|
||||
for (int s : a.getAcceptStates()) {
|
||||
live.set(s);
|
||||
workList.add(s);
|
||||
}
|
||||
|
||||
while (workList.isEmpty() == false) {
|
||||
int s = workList.removeFirst();
|
||||
int count = a2.initTransition(s, t);
|
||||
for(int i=0;i<count;i++) {
|
||||
a2.getNextTransition(t);
|
||||
if (live.get(t.dest) == false) {
|
||||
live.set(t.dest);
|
||||
workList.add(t.dest);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return live;
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes transitions to dead states (a state is "dead" if it is not
|
||||
* reachable from the initial state or no accept state is reachable from it.)
|
||||
|
@ -1066,10 +960,8 @@ final public class BasicOperations {
|
|||
}
|
||||
}
|
||||
|
||||
// nocommit need test case for "accepts no strings"
|
||||
|
||||
result.finish();
|
||||
result.finishState();
|
||||
assert hasDeadStates(result) == false;
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.util.automaton;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.index.Terms;
|
||||
|
@ -91,10 +90,13 @@ public class CompiledAutomaton {
|
|||
}
|
||||
|
||||
public CompiledAutomaton(LightAutomaton automaton, Boolean finite, boolean simplify) {
|
||||
|
||||
if (simplify) {
|
||||
|
||||
// Test whether the automaton is a "simple" form and
|
||||
// if so, don't create a runAutomaton. Note that on a
|
||||
// large automaton these tests could be costly:
|
||||
|
||||
if (BasicOperations.isEmpty(automaton)) {
|
||||
// matches nothing
|
||||
type = AUTOMATON_TYPE.NONE;
|
||||
|
@ -104,6 +106,7 @@ public class CompiledAutomaton {
|
|||
lightAutomaton = null;
|
||||
this.finite = null;
|
||||
return;
|
||||
// NOTE: only approximate, because automaton may not be minimal:
|
||||
} else if (BasicOperations.isTotal(automaton)) {
|
||||
// matches all possible strings
|
||||
type = AUTOMATON_TYPE.ALL;
|
||||
|
@ -138,6 +141,7 @@ public class CompiledAutomaton {
|
|||
} else if (commonPrefix.length() > 0) {
|
||||
LightAutomaton other = BasicOperations.concatenateLight(BasicAutomata.makeStringLight(commonPrefix), BasicAutomata.makeAnyStringLight());
|
||||
other = BasicOperations.determinize(other);
|
||||
assert BasicOperations.hasDeadStates(other) == false;
|
||||
if (BasicOperations.sameLanguage(automaton, other)) {
|
||||
// matches a constant prefix
|
||||
type = AUTOMATON_TYPE.PREFIX;
|
||||
|
@ -169,10 +173,10 @@ public class CompiledAutomaton {
|
|||
}
|
||||
runAutomaton = new ByteRunAutomaton(utf8, true);
|
||||
|
||||
lightAutomaton = runAutomaton.a;
|
||||
lightAutomaton = runAutomaton.automaton;
|
||||
}
|
||||
|
||||
private Transition scratch = new Transition();
|
||||
private Transition transition = new Transition();
|
||||
|
||||
//private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG;
|
||||
|
||||
|
@ -181,31 +185,29 @@ public class CompiledAutomaton {
|
|||
//System.out.println(lightAutomaton.toDot());
|
||||
// Find biggest transition that's < label
|
||||
// TODO: use binary search here
|
||||
lightAutomaton.initTransition(state, scratch);
|
||||
int numTransitions = lightAutomaton.getNumTransitions(state);
|
||||
int maxIndex = -1;
|
||||
int lastMin = 0;
|
||||
int numTransitions = lightAutomaton.initTransition(state, transition);
|
||||
for(int i=0;i<numTransitions;i++) {
|
||||
lightAutomaton.getNextTransition(scratch);
|
||||
if (scratch.min < leadLabel) {
|
||||
lightAutomaton.getNextTransition(transition);
|
||||
if (transition.min < leadLabel) {
|
||||
maxIndex = i;
|
||||
} else {
|
||||
// Transitions are always sorted
|
||||
break;
|
||||
}
|
||||
assert scratch.min >= lastMin;
|
||||
lastMin = scratch.min;
|
||||
// nocommit else break?
|
||||
}
|
||||
|
||||
//System.out.println(" maxIndex=" + maxIndex);
|
||||
|
||||
assert maxIndex != -1;
|
||||
lightAutomaton.getTransition(state, maxIndex, scratch);
|
||||
lightAutomaton.getTransition(state, maxIndex, transition);
|
||||
|
||||
// Append floorLabel
|
||||
final int floorLabel;
|
||||
if (scratch.max > leadLabel-1) {
|
||||
if (transition.max > leadLabel-1) {
|
||||
floorLabel = leadLabel-1;
|
||||
} else {
|
||||
floorLabel = scratch.max;
|
||||
floorLabel = transition.max;
|
||||
}
|
||||
//System.out.println(" floorLabel=" + (char) floorLabel);
|
||||
if (idx >= term.bytes.length) {
|
||||
|
@ -214,7 +216,7 @@ public class CompiledAutomaton {
|
|||
//if (DEBUG) System.out.println(" add floorLabel=" + (char) floorLabel + " idx=" + idx);
|
||||
term.bytes[idx] = (byte) floorLabel;
|
||||
|
||||
state = scratch.dest;
|
||||
state = transition.dest;
|
||||
//System.out.println(" dest: " + state);
|
||||
idx++;
|
||||
|
||||
|
@ -231,14 +233,14 @@ public class CompiledAutomaton {
|
|||
// We are pushing "top" -- so get last label of
|
||||
// last transition:
|
||||
//System.out.println("get state=" + state + " numTrans=" + numTransitions);
|
||||
lightAutomaton.getTransition(state, numTransitions-1, scratch);
|
||||
lightAutomaton.getTransition(state, numTransitions-1, transition);
|
||||
if (idx >= term.bytes.length) {
|
||||
term.grow(1+idx);
|
||||
}
|
||||
//if (DEBUG) System.out.println(" push maxLabel=" + (char) lastTransition.max + " idx=" + idx);
|
||||
//System.out.println(" add trans dest=" + scratch.dest + " label=" + (char) scratch.max);
|
||||
term.bytes[idx] = (byte) scratch.max;
|
||||
state = scratch.dest;
|
||||
term.bytes[idx] = (byte) transition.max;
|
||||
state = transition.dest;
|
||||
idx++;
|
||||
}
|
||||
}
|
||||
|
@ -326,9 +328,9 @@ public class CompiledAutomaton {
|
|||
//if (DEBUG) System.out.println(" return " + output.utf8ToString());
|
||||
return output;
|
||||
} else {
|
||||
lightAutomaton.getTransition(state, 0, scratch);
|
||||
lightAutomaton.getTransition(state, 0, transition);
|
||||
|
||||
if (label-1 < scratch.min) {
|
||||
if (label-1 < transition.min) {
|
||||
|
||||
if (runAutomaton.isAccept(state)) {
|
||||
output.length = idx;
|
||||
|
@ -380,18 +382,17 @@ public class CompiledAutomaton {
|
|||
b.append(" initial [shape=plaintext,label=\"\"];\n");
|
||||
b.append(" initial -> ").append(i).append("\n");
|
||||
}
|
||||
lightAutomaton.initTransition(i, scratch);
|
||||
int numTransitions = lightAutomaton.getNumTransitions(i);
|
||||
int numTransitions = lightAutomaton.initTransition(i, transition);
|
||||
for (int j = 0; j < numTransitions; j++) {
|
||||
b.append(" ").append(i);
|
||||
b.append(" -> ");
|
||||
b.append(scratch.dest);
|
||||
b.append(scratch.min);
|
||||
if (scratch.min != scratch.max) {
|
||||
b.append(transition.dest);
|
||||
b.append(transition.min);
|
||||
if (transition.min != transition.max) {
|
||||
b.append("-");
|
||||
b.append(scratch.max);
|
||||
b.append(transition.max);
|
||||
}
|
||||
lightAutomaton.getNextTransition(scratch);
|
||||
lightAutomaton.getNextTransition(transition);
|
||||
}
|
||||
}
|
||||
return b.append("}\n").toString();
|
||||
|
|
|
@ -159,8 +159,6 @@ public class LevenshteinAutomata {
|
|||
lastState = a.createState();
|
||||
}
|
||||
|
||||
// nocommit why are so many dead states created here?
|
||||
|
||||
int stateOffset = lastState;
|
||||
a.setAccept(lastState, description.isAccept(0));
|
||||
|
||||
|
@ -170,6 +168,8 @@ public class LevenshteinAutomata {
|
|||
a.setAccept(state, description.isAccept(i));
|
||||
}
|
||||
|
||||
// TODO: this creates bogus states/transitions (states are final, have self loops, and can't be reached from an init state)
|
||||
|
||||
// create transitions from state to state
|
||||
for (int k = 0; k < numStates; k++) {
|
||||
final int xpos = description.getPosition(k);
|
||||
|
@ -183,10 +183,7 @@ public class LevenshteinAutomata {
|
|||
final int cvec = getVector(ch, xpos, end);
|
||||
int dest = description.transition(k, xpos, cvec);
|
||||
if (dest >= 0) {
|
||||
// nocommit why do we create cycles in dead states?
|
||||
if (k != dest) {
|
||||
a.addTransition(stateOffset+k, stateOffset+dest, ch);
|
||||
}
|
||||
a.addTransition(stateOffset+k, stateOffset+dest, ch);
|
||||
}
|
||||
}
|
||||
// add transitions for all other chars in unicode
|
||||
|
@ -195,15 +192,12 @@ public class LevenshteinAutomata {
|
|||
int dest = description.transition(k, xpos, 0); // by definition
|
||||
if (dest >= 0) {
|
||||
for (int r = 0; r < numRanges; r++) {
|
||||
// nocommit why do we create cycles in dead states?
|
||||
if (k != dest) {
|
||||
a.addTransition(stateOffset+k, stateOffset+dest, rangeLower[r], rangeUpper[r]);
|
||||
}
|
||||
a.addTransition(stateOffset+k, stateOffset+dest, rangeLower[r], rangeUpper[r]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
a.finish();
|
||||
a.finishState();
|
||||
assert a.isDeterministic();
|
||||
return a;
|
||||
}
|
||||
|
|
|
@ -17,12 +17,10 @@ package org.apache.lucene.util.automaton;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
//import java.io.IOException;
|
||||
//import java.io.PrintWriter;
|
||||
import java.util.Arrays;
|
||||
import java.util.BitSet;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
|
@ -64,7 +62,6 @@ public class LightAutomaton {
|
|||
private int[] states = new int[4];
|
||||
|
||||
/** Holds toState, min, max for each transition: */
|
||||
// nocommit inefficient when labels are really bytes (max 256)
|
||||
private int[] transitions = new int[6];
|
||||
|
||||
private final Set<Integer> finalStates = new HashSet<Integer>();
|
||||
|
@ -82,6 +79,9 @@ public class LightAutomaton {
|
|||
|
||||
/** Mark this state as an accept state. */
|
||||
public void setAccept(int state, boolean isAccept) {
|
||||
if (state >= getNumStates()) {
|
||||
throw new IllegalArgumentException("state=" + state + " is out of bounds (numStates=" + getNumStates() + ")");
|
||||
}
|
||||
if (isAccept) {
|
||||
finalStates.add(state);
|
||||
} else {
|
||||
|
@ -89,10 +89,6 @@ public class LightAutomaton {
|
|||
}
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return finalStates.isEmpty();
|
||||
}
|
||||
|
||||
/** Sugar, but object-heavy; it's better to iterate instead. */
|
||||
public Transition[][] getSortedTransitions() {
|
||||
int numStates = getNumStates();
|
||||
|
@ -200,12 +196,13 @@ public class LightAutomaton {
|
|||
states[nextState+i] += nextTransition;
|
||||
}
|
||||
int state = i/2;
|
||||
if (other.isAccept(state)) {
|
||||
setAccept(stateOffset+state, true);
|
||||
}
|
||||
}
|
||||
nextState += other.nextState;
|
||||
|
||||
for(int s : other.getAcceptStates()) {
|
||||
setAccept(stateOffset+s, true);
|
||||
}
|
||||
|
||||
// Bulk copy and then fixup dest for each transition:
|
||||
transitions = ArrayUtil.grow(transitions, nextTransition + other.nextTransition);
|
||||
System.arraycopy(other.transitions, 0, transitions, nextTransition, other.nextTransition);
|
||||
|
@ -312,7 +309,8 @@ public class LightAutomaton {
|
|||
return deterministic;
|
||||
}
|
||||
|
||||
public void finish() {
|
||||
/** Finishes the current state; call this once you are done adding transitions for a state. */
|
||||
public void finishState() {
|
||||
if (curState != -1) {
|
||||
//System.out.println("finish: finish current state " + curState);
|
||||
finishCurrentState();
|
||||
|
@ -327,7 +325,6 @@ public class LightAutomaton {
|
|||
}
|
||||
|
||||
public int getNumTransitions(int state) {
|
||||
//assert curState == -1: "not finished";
|
||||
int count = states[2*state+1];
|
||||
if (count == -1) {
|
||||
return 0;
|
||||
|
@ -468,27 +465,20 @@ public class LightAutomaton {
|
|||
}
|
||||
};
|
||||
|
||||
// nocommit createStates(int count)?
|
||||
|
||||
// nocommit kinda awkward iterator api...
|
||||
/** Initialize the provided Transition for iteration; you
|
||||
* must call {@link #getNextTransition} to get the first
|
||||
* transition for the state. Returns the number of transitions
|
||||
/** Initialize the provided Transition to iterate through all transitions
|
||||
* leaving the specified state. You must call {@link #getNextTransition} to
|
||||
* get each transition. Returns the number of transitions
|
||||
* leaving this state. */
|
||||
public int initTransition(int state, Transition t) {
|
||||
// assert curState == -1: "not finished";
|
||||
assert state < nextState/2: "state=" + state + " nextState=" + nextState;
|
||||
t.source = state;
|
||||
//System.out.println("initTrans source=" + state + " numTrans=" + getNumTransitions(state));
|
||||
t.transitionUpto = states[2*state];
|
||||
return getNumTransitions(state);
|
||||
}
|
||||
|
||||
/** Iterate to the next transition after the provided one */
|
||||
public void getNextTransition(Transition t) {
|
||||
//assert curState == -1: "not finished";
|
||||
// Make sure there is still a transition left:
|
||||
//System.out.println("getNextTrans transUpto=" + t.transitionUpto);
|
||||
//System.out.println(" states[2*t.source]=" + states[2*t.source] + " numTrans=" + states[2*t.source+1] + " transitionUpto+3=" + (t.transitionUpto+3) + " t=" + t);
|
||||
assert (t.transitionUpto+3 - states[2*t.source]) <= 3*states[2*t.source+1];
|
||||
t.dest = transitions[t.transitionUpto++];
|
||||
t.min = transitions[t.transitionUpto++];
|
||||
|
@ -498,7 +488,6 @@ public class LightAutomaton {
|
|||
/** Fill the provided {@link Transition} with the index'th
|
||||
* transition leaving the specified state. */
|
||||
public void getTransition(int state, int index, Transition t) {
|
||||
assert curState == -1: "not finished";
|
||||
int i = states[2*state] + 3*index;
|
||||
t.source = state;
|
||||
t.dest = transitions[i++];
|
||||
|
@ -552,11 +541,12 @@ public class LightAutomaton {
|
|||
result.addTransition(i, deadState, maxi, Character.MAX_CODE_POINT);
|
||||
}
|
||||
}
|
||||
result.finish();
|
||||
result.finishState();
|
||||
return result;
|
||||
}
|
||||
|
||||
// nocommit
|
||||
/*
|
||||
public void writeDot(String fileName) {
|
||||
if (fileName.indexOf('/') == -1) {
|
||||
fileName = "/l/la/lucene/core/" + fileName + ".dot";
|
||||
|
@ -569,6 +559,7 @@ public class LightAutomaton {
|
|||
throw new RuntimeException(ioe);
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
public String toDot() {
|
||||
// TODO: breadth first search so we can get layered output...
|
||||
|
@ -660,14 +651,12 @@ public class LightAutomaton {
|
|||
assert label >= 0;
|
||||
int trans = states[2*state];
|
||||
int limit = trans + 3*states[2*state+1];
|
||||
// nocommit we could do bin search; transitions are sorted
|
||||
// System.out.println("la.step state=" + state + " label=" + label + " trans=" + trans + " limit=" + limit);
|
||||
// TODO: we could do bin search; transitions are sorted
|
||||
while (trans < limit) {
|
||||
int dest = transitions[trans];
|
||||
int min = transitions[trans+1];
|
||||
int max = transitions[trans+2];
|
||||
if (min <= label && label <= max) {
|
||||
//System.out.println(" ret dest=" + dest);
|
||||
return dest;
|
||||
}
|
||||
trans += 3;
|
||||
|
@ -780,7 +769,7 @@ public class LightAutomaton {
|
|||
upto += 4;
|
||||
}
|
||||
|
||||
a.finish();
|
||||
a.finishState();
|
||||
return a;
|
||||
}
|
||||
|
||||
|
|
|
@ -57,7 +57,8 @@ final public class MinimizationOperationsLight {
|
|||
* Minimizes the given automaton using Hopcroft's algorithm.
|
||||
*/
|
||||
public static LightAutomaton minimizeHopcroft(LightAutomaton a) {
|
||||
if (a.isEmpty()) {
|
||||
if (a.getNumStates() == 0 || (a.isAccept(0) == false && a.getNumTransitions(0) == 0)) {
|
||||
// Fastmatch for common case
|
||||
return new LightAutomaton();
|
||||
}
|
||||
a = BasicOperations.determinize(a);
|
||||
|
@ -209,7 +210,6 @@ final public class MinimizationOperationsLight {
|
|||
int[] stateMap = new int[statesLen];
|
||||
int[] stateRep = new int[k];
|
||||
|
||||
// nocommit maybe LA should be born already with the initial state?
|
||||
result.createState();
|
||||
|
||||
//System.out.println("min: k=" + k);
|
||||
|
@ -251,7 +251,7 @@ final public class MinimizationOperationsLight {
|
|||
result.addTransition(n, stateMap[t.dest], t.min, t.max);
|
||||
}
|
||||
}
|
||||
result.finish();
|
||||
result.finishState();
|
||||
//System.out.println(result.getNumStates() + " states");
|
||||
|
||||
return BasicOperations.removeDeadStates(result);
|
||||
|
|
|
@ -361,8 +361,6 @@ public class RegExp {
|
|||
*/
|
||||
public static final int NONE = 0x0000;
|
||||
|
||||
private static boolean allow_mutation = false;
|
||||
|
||||
Kind kind;
|
||||
RegExp exp1, exp2;
|
||||
String s;
|
||||
|
@ -467,7 +465,7 @@ public class RegExp {
|
|||
findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider);
|
||||
findLeaves(exp2, Kind.REGEXP_UNION, list, automata, automaton_provider);
|
||||
a = BasicOperations.unionLight(list);
|
||||
MinimizationOperationsLight.minimize(a);
|
||||
a = MinimizationOperationsLight.minimize(a);
|
||||
break;
|
||||
case REGEXP_CONCATENATION:
|
||||
list = new ArrayList<>();
|
||||
|
@ -476,7 +474,7 @@ public class RegExp {
|
|||
findLeaves(exp2, Kind.REGEXP_CONCATENATION, list, automata,
|
||||
automaton_provider);
|
||||
a = BasicOperations.concatenateLight(list);
|
||||
MinimizationOperationsLight.minimize(a);
|
||||
a = MinimizationOperationsLight.minimize(a);
|
||||
break;
|
||||
case REGEXP_INTERSECTION:
|
||||
a = BasicOperations.intersectionLight(
|
||||
|
|
|
@ -37,8 +37,7 @@ import java.util.Arrays;
|
|||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class RunAutomaton {
|
||||
// nocommit
|
||||
final LightAutomaton a;
|
||||
final LightAutomaton automaton;
|
||||
final int maxInterval;
|
||||
final int size;
|
||||
final boolean[] accept;
|
||||
|
@ -125,7 +124,7 @@ public abstract class RunAutomaton {
|
|||
this.maxInterval = maxInterval;
|
||||
//System.out.println("before det a=" + a.getNumStates());
|
||||
a = BasicOperations.determinize(a);
|
||||
this.a = a;
|
||||
this.automaton = a;
|
||||
//System.out.println("AFTER DET tableize= " + tableize + ": ");
|
||||
//System.out.println(a.toDot());
|
||||
points = a.getStartPoints();
|
||||
|
|
|
@ -30,17 +30,13 @@
|
|||
package org.apache.lucene.util.automaton;
|
||||
|
||||
import java.util.BitSet;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.IdentityHashMap;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import org.apache.lucene.util.fst.Util;
|
||||
|
||||
/**
|
||||
* Special automata operations.
|
||||
|
@ -71,6 +67,9 @@ final public class SpecialOperations {
|
|||
* Returns true if the language of this automaton is finite.
|
||||
*/
|
||||
public static boolean isFinite(LightAutomaton a) {
|
||||
if (a.getNumStates() == 0) {
|
||||
return true;
|
||||
}
|
||||
return isFinite(new Transition(), a, 0, new BitSet(a.getNumStates()), new BitSet(a.getNumStates()));
|
||||
}
|
||||
|
||||
|
@ -174,8 +173,8 @@ final public class SpecialOperations {
|
|||
|
||||
public static LightAutomaton reverse(LightAutomaton a, Set<Integer> initialStates) {
|
||||
|
||||
if (a.isEmpty()) {
|
||||
return a;
|
||||
if (BasicOperations.isEmpty(a)) {
|
||||
return new LightAutomaton();
|
||||
}
|
||||
|
||||
int numStates = a.getNumStates();
|
||||
|
@ -204,15 +203,16 @@ final public class SpecialOperations {
|
|||
}
|
||||
|
||||
LightAutomaton result = builder.finish();
|
||||
|
||||
|
||||
for(int s : a.getAcceptStates()) {
|
||||
assert s < numStates;
|
||||
result.addEpsilon(0, s+1);
|
||||
if (initialStates != null) {
|
||||
initialStates.add(s+1);
|
||||
}
|
||||
}
|
||||
|
||||
result.finish();
|
||||
result.finishState();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
package org.apache.lucene.util.automaton;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/** Used temporarily when iterating through transitions from a {@link LightAutomaton}
|
||||
* using {@link LightAutomaton#getTransition} and {@link LightAutomaton#getNextTransition}. */
|
||||
public class Transition {
|
||||
|
||||
public int source;
|
||||
public int dest;
|
||||
public int min;
|
||||
public int max;
|
||||
|
||||
/** Remembers where we are in the iteration; init to -1 to provoke
|
||||
* exception if getNextTransition is called without first calling initTransition. */
|
||||
int transitionUpto = -1;
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return source + " --> " + dest + " " + (char) min + "-" + (char) max;
|
||||
}
|
||||
}
|
||||
|
|
@ -17,11 +17,6 @@ package org.apache.lucene.util.automaton;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.InPlaceMergeSorter;
|
||||
import org.apache.lucene.util.Sorter;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
@ -268,12 +263,6 @@ public final class UTF32ToUTF8Light {
|
|||
* not in general be deterministic, so you must
|
||||
* determinize it if that's needed. */
|
||||
public LightAutomaton convert(LightAutomaton utf32) {
|
||||
//System.out.println("\nCONVERT");
|
||||
|
||||
// nocommit make sure singleton cases work:
|
||||
//if (utf32.isSingleton()) {
|
||||
//utf32 = utf32.cloneExpanded();
|
||||
//}
|
||||
if (utf32.getNumStates() == 0) {
|
||||
return utf32;
|
||||
}
|
||||
|
@ -286,9 +275,6 @@ public final class UTF32ToUTF8Light {
|
|||
pending.add(utf32State);
|
||||
utf8 = new LightAutomaton.Builder();
|
||||
|
||||
// nocommit we don't track this
|
||||
// utf8.setDeterministic(false);
|
||||
|
||||
int utf8State = utf8.createState();
|
||||
|
||||
utf8.setAccept(utf8State, utf32.isAccept(utf32State));
|
||||
|
@ -325,19 +311,4 @@ public final class UTF32ToUTF8Light {
|
|||
|
||||
return utf8.finish();
|
||||
}
|
||||
|
||||
/*
|
||||
private State newUTF8State() {
|
||||
State s = new State();
|
||||
if (utf8StateCount == utf8States.length) {
|
||||
final State[] newArray = new State[ArrayUtil.oversize(1+utf8StateCount, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
|
||||
System.arraycopy(utf8States, 0, newArray, 0, utf8StateCount);
|
||||
utf8States = newArray;
|
||||
}
|
||||
utf8States[utf8StateCount] = s;
|
||||
s.number = utf8StateCount;
|
||||
utf8StateCount++;
|
||||
return s;
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
|
|
@ -411,8 +411,8 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
|
|||
});
|
||||
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
|
||||
final LightAutomaton expected = s2a("abc");
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
|
||||
BasicOperations.determinize(actual)));
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(expected)),
|
||||
BasicOperations.determinize(BasicOperations.removeDeadStates(actual))));
|
||||
}
|
||||
|
||||
public void testMultipleHoles() throws Exception {
|
||||
|
@ -423,8 +423,8 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
|
|||
});
|
||||
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
|
||||
final LightAutomaton expected = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
|
||||
BasicOperations.determinize(actual)));
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(expected)),
|
||||
BasicOperations.determinize(BasicOperations.removeDeadStates(actual))));
|
||||
}
|
||||
|
||||
public void testSynOverMultipleHoles() throws Exception {
|
||||
|
@ -438,8 +438,8 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
|
|||
final LightAutomaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
|
||||
final LightAutomaton a2 = join(s2a("x"), SEP_A, s2a("b"));
|
||||
final LightAutomaton expected = BasicOperations.unionLight(a1, a2);
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
|
||||
BasicOperations.determinize(actual)));
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(expected)),
|
||||
BasicOperations.determinize(BasicOperations.removeDeadStates(actual))));
|
||||
}
|
||||
|
||||
// for debugging!
|
||||
|
@ -485,8 +485,8 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
|
|||
final LightAutomaton expected = join("abc", "def");
|
||||
|
||||
//toDot(actual);
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
|
||||
BasicOperations.determinize(actual)));
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(expected)),
|
||||
BasicOperations.determinize(BasicOperations.removeDeadStates(actual))));
|
||||
}
|
||||
|
||||
public void testHole() throws Exception {
|
||||
|
@ -501,8 +501,8 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
|
|||
final LightAutomaton expected = join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def"));
|
||||
|
||||
//toDot(actual);
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
|
||||
BasicOperations.determinize(actual)));
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(expected)),
|
||||
BasicOperations.determinize(BasicOperations.removeDeadStates(actual))));
|
||||
}
|
||||
|
||||
public void testOverlappedTokensSausage() throws Exception {
|
||||
|
@ -517,8 +517,8 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
|
|||
final LightAutomaton a1 = s2a("abc");
|
||||
final LightAutomaton a2 = s2a("xyz");
|
||||
final LightAutomaton expected = BasicOperations.unionLight(a1, a2);
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
|
||||
BasicOperations.determinize(actual)));
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(expected)),
|
||||
BasicOperations.determinize(BasicOperations.removeDeadStates(actual))));
|
||||
}
|
||||
|
||||
public void testOverlappedTokensLattice() throws Exception {
|
||||
|
@ -535,8 +535,8 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
|
|||
|
||||
final LightAutomaton expected = BasicOperations.unionLight(a1, a2);
|
||||
//toDot(actual);
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
|
||||
BasicOperations.determinize(actual)));
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(expected)),
|
||||
BasicOperations.determinize(BasicOperations.removeDeadStates(actual))));
|
||||
}
|
||||
|
||||
public void testSynOverHole() throws Exception {
|
||||
|
@ -554,8 +554,8 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
|
|||
final LightAutomaton expected = BasicOperations.concatenateLight(a1,
|
||||
join(SEP_A, s2a("b")));
|
||||
//toDot(actual);
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
|
||||
BasicOperations.determinize(actual)));
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(expected)),
|
||||
BasicOperations.determinize(BasicOperations.removeDeadStates(actual))));
|
||||
}
|
||||
|
||||
public void testSynOverHole2() throws Exception {
|
||||
|
@ -570,8 +570,8 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
|
|||
final LightAutomaton expected = BasicOperations.unionLight(
|
||||
join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")),
|
||||
s2a("abc"));
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
|
||||
BasicOperations.determinize(actual)));
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(expected)),
|
||||
BasicOperations.determinize(BasicOperations.removeDeadStates(actual))));
|
||||
}
|
||||
|
||||
public void testOverlappedTokensLattice2() throws Exception {
|
||||
|
@ -588,8 +588,8 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
|
|||
final LightAutomaton a2 = join("abc", "def", "ghi");
|
||||
final LightAutomaton expected = BasicOperations.unionLight(a1, a2);
|
||||
//toDot(actual);
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
|
||||
BasicOperations.determinize(actual)));
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(expected)),
|
||||
BasicOperations.determinize(BasicOperations.removeDeadStates(actual))));
|
||||
}
|
||||
|
||||
public void testToDot() throws Exception {
|
||||
|
@ -607,8 +607,8 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
|
|||
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
|
||||
final LightAutomaton expected = join(HOLE_A, SEP_A, s2a("abc"));
|
||||
//toDot(actual);
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
|
||||
BasicOperations.determinize(actual)));
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(expected)),
|
||||
BasicOperations.determinize(BasicOperations.removeDeadStates(actual))));
|
||||
}
|
||||
|
||||
// TODO: testEndsWithHole... but we need posInc to set in TS.end()
|
||||
|
@ -622,7 +622,7 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
|
|||
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
|
||||
final LightAutomaton expected = BasicOperations.unionLight(s2a("a"),
|
||||
s2a("X"));
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
|
||||
BasicOperations.determinize(actual)));
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(expected)),
|
||||
BasicOperations.determinize(BasicOperations.removeDeadStates(actual))));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -85,6 +85,7 @@ public class TestTermsEnum2 extends LuceneTestCase {
|
|||
|
||||
/** tests a pre-intersected automaton against the original */
|
||||
public void testFiniteVersusInfinite() throws Exception {
|
||||
|
||||
for (int i = 0; i < numIterations; i++) {
|
||||
String reg = AutomatonTestUtil.randomRegexp(random());
|
||||
LightAutomaton automaton = BasicOperations.determinize(new RegExp(reg, RegExp.NONE).toLightAutomaton());
|
||||
|
|
|
@ -107,10 +107,6 @@ public class TestAutomatonQuery extends LuceneTestCase {
|
|||
* Test some very simple automata.
|
||||
*/
|
||||
public void testBasicAutomata() throws IOException {
|
||||
|
||||
// nocommit
|
||||
assertAutomatonHits(2, BasicAutomata.makeIntervalLight(1233, 2346, 0));
|
||||
|
||||
assertAutomatonHits(0, BasicAutomata.makeEmptyLight());
|
||||
assertAutomatonHits(0, BasicAutomata.makeEmptyStringLight());
|
||||
assertAutomatonHits(2, BasicAutomata.makeAnyCharLight());
|
||||
|
|
|
@ -50,7 +50,7 @@ public class TestBasicOperations extends LuceneTestCase {
|
|||
public void testEmptyLanguageConcatenate() {
|
||||
LightAutomaton a = BasicAutomata.makeStringLight("a");
|
||||
LightAutomaton concat = BasicOperations.concatenateLight(a, BasicAutomata.makeEmptyLight());
|
||||
assertTrue(concat.isEmpty());
|
||||
assertTrue(BasicOperations.isEmpty(concat));
|
||||
}
|
||||
|
||||
/** Test optimization to concatenate() with empty String to an NFA */
|
||||
|
@ -81,7 +81,7 @@ public class TestBasicOperations extends LuceneTestCase {
|
|||
final LightAutomaton a = BasicOperations.determinize(re.toLightAutomaton());
|
||||
assertFalse(BasicOperations.isEmpty(a));
|
||||
|
||||
final AutomatonTestUtil.RandomAcceptedStringsLight rx = new AutomatonTestUtil.RandomAcceptedStringsLight(a);
|
||||
final AutomatonTestUtil.RandomAcceptedStrings rx = new AutomatonTestUtil.RandomAcceptedStrings(a);
|
||||
for(int j=0;j<ITER2;j++) {
|
||||
//System.out.println("TEST: j=" + j);
|
||||
int[] acc = null;
|
||||
|
|
|
@ -46,18 +46,18 @@ public class TestDeterminism extends LuceneTestCase {
|
|||
}
|
||||
|
||||
private static void assertAutomaton(LightAutomaton a) {
|
||||
a = BasicOperations.determinize(a);
|
||||
a = BasicOperations.determinize(BasicOperations.removeDeadStates(a));
|
||||
|
||||
// complement(complement(a)) = a
|
||||
LightAutomaton equivalent = BasicOperations.complementLight(BasicOperations.complementLight(a));
|
||||
assertTrue(BasicOperations.sameLanguage(a, equivalent));
|
||||
|
||||
// a union a = a
|
||||
equivalent = BasicOperations.determinize(BasicOperations.unionLight(a, a));
|
||||
equivalent = BasicOperations.determinize(BasicOperations.removeDeadStates(BasicOperations.unionLight(a, a)));
|
||||
assertTrue(BasicOperations.sameLanguage(a, equivalent));
|
||||
|
||||
// a intersect a = a
|
||||
equivalent = BasicOperations.determinize(BasicOperations.intersectionLight(a, a));
|
||||
equivalent = BasicOperations.determinize(BasicOperations.removeDeadStates(BasicOperations.intersectionLight(a, a)));
|
||||
assertTrue(BasicOperations.sameLanguage(a, equivalent));
|
||||
|
||||
// a minus a = empty
|
||||
|
|
|
@ -41,8 +41,7 @@ public class TestLevenshteinAutomata extends LuceneTestCase {
|
|||
|
||||
// LUCENE-3094
|
||||
public void testNoWastedStates() throws Exception {
|
||||
// nocommit this fails ... pre-existing issue i think!!
|
||||
// AutomatonTestUtil.assertNoDetachedStates(new LevenshteinAutomata("abc", false).toAutomaton(1));
|
||||
assertFalse(BasicOperations.hasDeadStatesFromInitial(new LevenshteinAutomata("abc", false).toAutomaton(1)));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -78,30 +77,34 @@ public class TestLevenshteinAutomata extends LuceneTestCase {
|
|||
assertTrue(tautomata[n].isDeterministic());
|
||||
assertTrue(SpecialOperations.isFinite(automata[n]));
|
||||
assertTrue(SpecialOperations.isFinite(tautomata[n]));
|
||||
// nocommit LEV creates detached states
|
||||
//AutomatonTestUtil.assertNoDetachedStates(automata[n]);
|
||||
//AutomatonTestUtil.assertNoDetachedStates(tautomata[n]);
|
||||
assertFalse(BasicOperations.hasDeadStatesFromInitial(automata[n]));
|
||||
assertFalse(BasicOperations.hasDeadStatesFromInitial(tautomata[n]));
|
||||
// check that the dfa for n-1 accepts a subset of the dfa for n
|
||||
if (n > 0) {
|
||||
assertTrue(BasicOperations.subsetOf(automata[n-1], automata[n]));
|
||||
assertTrue(BasicOperations.subsetOf(automata[n-1], tautomata[n]));
|
||||
assertTrue(BasicOperations.subsetOf(tautomata[n-1], automata[n]));
|
||||
assertTrue(BasicOperations.subsetOf(tautomata[n-1], tautomata[n]));
|
||||
assertTrue(BasicOperations.subsetOf(BasicOperations.removeDeadStates(automata[n-1]),
|
||||
BasicOperations.removeDeadStates(automata[n])));
|
||||
assertTrue(BasicOperations.subsetOf(BasicOperations.removeDeadStates(automata[n-1]),
|
||||
BasicOperations.removeDeadStates(tautomata[n])));
|
||||
assertTrue(BasicOperations.subsetOf(BasicOperations.removeDeadStates(tautomata[n-1]),
|
||||
BasicOperations.removeDeadStates(automata[n])));
|
||||
assertTrue(BasicOperations.subsetOf(BasicOperations.removeDeadStates(tautomata[n-1]),
|
||||
BasicOperations.removeDeadStates(tautomata[n])));
|
||||
assertNotSame(automata[n-1], automata[n]);
|
||||
}
|
||||
// check that Lev(N) is a subset of LevT(N)
|
||||
assertTrue(BasicOperations.subsetOf(automata[n], tautomata[n]));
|
||||
assertTrue(BasicOperations.subsetOf(BasicOperations.removeDeadStates(automata[n]),
|
||||
BasicOperations.removeDeadStates(tautomata[n])));
|
||||
// special checks for specific n
|
||||
switch(n) {
|
||||
case 0:
|
||||
// easy, matches the string itself
|
||||
assertTrue(BasicOperations.sameLanguage(BasicAutomata.makeStringLight(s), automata[0]));
|
||||
assertTrue(BasicOperations.sameLanguage(BasicAutomata.makeStringLight(s), tautomata[0]));
|
||||
assertTrue(BasicOperations.sameLanguage(BasicAutomata.makeStringLight(s), BasicOperations.removeDeadStates(automata[0])));
|
||||
assertTrue(BasicOperations.sameLanguage(BasicAutomata.makeStringLight(s), BasicOperations.removeDeadStates(tautomata[0])));
|
||||
break;
|
||||
case 1:
|
||||
// generate a lev1 naively, and check the accepted lang is the same.
|
||||
assertTrue(BasicOperations.sameLanguage(naiveLev1(s), automata[1]));
|
||||
assertTrue(BasicOperations.sameLanguage(naiveLev1T(s), tautomata[1]));
|
||||
assertTrue(BasicOperations.sameLanguage(naiveLev1(s), BasicOperations.removeDeadStates(automata[1])));
|
||||
assertTrue(BasicOperations.sameLanguage(naiveLev1T(s), BasicOperations.removeDeadStates(tautomata[1])));
|
||||
break;
|
||||
default:
|
||||
assertBruteForce(s, automata[n], n);
|
||||
|
|
|
@ -19,17 +19,19 @@ package org.apache.lucene.util.automaton;
|
|||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.WildcardQuery;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.apache.lucene.util.automaton.AutomatonTestUtil.RandomAcceptedStringsLight;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util.automaton.AutomatonTestUtil.RandomAcceptedStrings;
|
||||
import org.apache.lucene.util.fst.Util;
|
||||
|
||||
public class TestLightAutomaton extends LuceneTestCase {
|
||||
|
@ -46,7 +48,7 @@ public class TestLightAutomaton extends LuceneTestCase {
|
|||
a.addTransition(start, end, 'd', 'd');
|
||||
a.addTransition(x, y, 'b', 'b');
|
||||
a.addTransition(y, end, 'c', 'c');
|
||||
a.finish();
|
||||
a.finishState();
|
||||
}
|
||||
|
||||
public void testReduceBasic() throws Exception {
|
||||
|
@ -62,7 +64,7 @@ public class TestLightAutomaton extends LuceneTestCase {
|
|||
a.addTransition(start, end, 'x', 'x');
|
||||
a.addTransition(start, end, 'y', 'y');
|
||||
|
||||
a.finish();
|
||||
a.finishState();
|
||||
assertEquals(3, a.getNumTransitions(start));
|
||||
Transition scratch = new Transition();
|
||||
a.initTransition(start, scratch);
|
||||
|
@ -79,9 +81,9 @@ public class TestLightAutomaton extends LuceneTestCase {
|
|||
|
||||
public void testSameLanguage() throws Exception {
|
||||
LightAutomaton a1 = BasicAutomata.makeStringLight("foobar");
|
||||
LightAutomaton a2 = BasicOperations.concatenateLight(
|
||||
LightAutomaton a2 = BasicOperations.removeDeadStates(BasicOperations.concatenateLight(
|
||||
BasicAutomata.makeStringLight("foo"),
|
||||
BasicAutomata.makeStringLight("bar"));
|
||||
BasicAutomata.makeStringLight("bar")));
|
||||
assertTrue(BasicOperations.sameLanguage(a1, a2));
|
||||
}
|
||||
|
||||
|
@ -149,7 +151,7 @@ public class TestLightAutomaton extends LuceneTestCase {
|
|||
LightAutomaton a = BasicOperations.unionLight(Arrays.asList(BasicAutomata.makeStringLight("foobar"),
|
||||
BasicAutomata.makeStringLight("boobar")));
|
||||
LightAutomaton aMin = MinimizationOperationsLight.minimize(a);
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(a), aMin));
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(a)), aMin));
|
||||
}
|
||||
|
||||
public void testReverse() throws Exception {
|
||||
|
@ -234,7 +236,7 @@ public class TestLightAutomaton extends LuceneTestCase {
|
|||
a.setAccept(fini, true);
|
||||
a.addTransition(init, fini, 'm');
|
||||
a.addTransition(fini, fini, 'm');
|
||||
a.finish();
|
||||
a.finishState();
|
||||
assertEquals(0, SpecialOperations.getCommonSuffixBytesRef(a).length);
|
||||
}
|
||||
|
||||
|
@ -244,8 +246,8 @@ public class TestLightAutomaton extends LuceneTestCase {
|
|||
LightAutomaton a = AutomatonTestUtil.randomAutomaton(random());
|
||||
LightAutomaton ra = SpecialOperations.reverse(a);
|
||||
LightAutomaton rra = SpecialOperations.reverse(ra);
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(a),
|
||||
BasicOperations.determinize(rra)));
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(a)),
|
||||
BasicOperations.determinize(BasicOperations.removeDeadStates(rra))));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -260,16 +262,16 @@ public class TestLightAutomaton extends LuceneTestCase {
|
|||
LightAutomaton ra = SpecialOperations.reverse(a);
|
||||
LightAutomaton rda = BasicOperations.determinize(ra);
|
||||
|
||||
if (a.isEmpty()) {
|
||||
assertTrue(rda.isEmpty());
|
||||
if (BasicOperations.isEmpty(a)) {
|
||||
assertTrue(BasicOperations.isEmpty(rda));
|
||||
continue;
|
||||
}
|
||||
|
||||
RandomAcceptedStringsLight rasl = new RandomAcceptedStringsLight(a);
|
||||
RandomAcceptedStrings ras = new RandomAcceptedStrings(a);
|
||||
|
||||
for(int iter2=0;iter2<20;iter2++) {
|
||||
// Find string accepted by original automaton
|
||||
int[] s = rasl.getRandomAcceptedString(random());
|
||||
int[] s = ras.getRandomAcceptedString(random());
|
||||
|
||||
// Reverse it
|
||||
for(int j=0;j<s.length/2;j++) {
|
||||
|
@ -290,11 +292,16 @@ public class TestLightAutomaton extends LuceneTestCase {
|
|||
assertTrue(BasicOperations.run(a, ""));
|
||||
}
|
||||
|
||||
public void testBasicIsEmpty() throws Exception {
|
||||
LightAutomaton a = new LightAutomaton();
|
||||
a.createState();
|
||||
assertTrue(BasicOperations.isEmpty(a));
|
||||
}
|
||||
|
||||
public void testRemoveDeadTransitionsEmpty() throws Exception {
|
||||
LightAutomaton a = BasicAutomata.makeEmptyLight();
|
||||
LightAutomaton a2 = BasicOperations.removeDeadStates(a);
|
||||
assertTrue(a2.isEmpty());
|
||||
assertTrue(BasicOperations.isEmpty(a2));
|
||||
}
|
||||
|
||||
public void testInvalidAddTransition() throws Exception {
|
||||
|
@ -340,13 +347,38 @@ public class TestLightAutomaton extends LuceneTestCase {
|
|||
}
|
||||
|
||||
assertTrue(BasicOperations.sameLanguage(
|
||||
BasicOperations.determinize(a),
|
||||
BasicOperations.determinize(builder.finish())));
|
||||
BasicOperations.determinize(BasicOperations.removeDeadStates(a)),
|
||||
BasicOperations.determinize(BasicOperations.removeDeadStates(builder.finish()))));
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// nocommit testMinus
|
||||
public void testIsTotal() throws Exception {
|
||||
assertFalse(BasicOperations.isTotal(new LightAutomaton()));
|
||||
LightAutomaton a = new LightAutomaton();
|
||||
int init = a.createState();
|
||||
int fini = a.createState();
|
||||
a.setAccept(fini, true);
|
||||
a.addTransition(init, fini, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
|
||||
a.finishState();
|
||||
assertFalse(BasicOperations.isTotal(a));
|
||||
a.addTransition(fini, fini, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
|
||||
a.finishState();
|
||||
assertFalse(BasicOperations.isTotal(a));
|
||||
a.setAccept(init, true);
|
||||
assertTrue(BasicOperations.isTotal(MinimizationOperationsLight.minimize(a)));
|
||||
}
|
||||
|
||||
public void testMinimizeEmpty() throws Exception {
|
||||
LightAutomaton a = new LightAutomaton();
|
||||
int init = a.createState();
|
||||
int fini = a.createState();
|
||||
a.addTransition(init, fini, 'a');
|
||||
a.finishState();
|
||||
a = MinimizationOperationsLight.minimize(a);
|
||||
assertEquals(0, a.getNumStates());
|
||||
}
|
||||
|
||||
public void testMinus() throws Exception {
|
||||
LightAutomaton a1 = BasicAutomata.makeStringLight("foobar");
|
||||
LightAutomaton a2 = BasicAutomata.makeStringLight("boobar");
|
||||
|
@ -379,6 +411,20 @@ public class TestLightAutomaton extends LuceneTestCase {
|
|||
assertMatches(a4);
|
||||
}
|
||||
|
||||
public void testOneInterval() throws Exception {
|
||||
LightAutomaton a = BasicAutomata.makeIntervalLight(999, 1032, 0);
|
||||
a = BasicOperations.determinize(a);
|
||||
assertTrue(BasicOperations.run(a, "0999"));
|
||||
assertTrue(BasicOperations.run(a, "00999"));
|
||||
assertTrue(BasicOperations.run(a, "000999"));
|
||||
}
|
||||
|
||||
public void testAnotherInterval() throws Exception {
|
||||
LightAutomaton a = BasicAutomata.makeIntervalLight(1, 2, 0);
|
||||
a = BasicOperations.determinize(a);
|
||||
assertTrue(BasicOperations.run(a, "01"));
|
||||
}
|
||||
|
||||
public void testIntervalRandom() throws Exception {
|
||||
int ITERS = atLeast(100);
|
||||
for(int iter=0;iter<ITERS;iter++) {
|
||||
|
@ -397,7 +443,7 @@ public class TestLightAutomaton extends LuceneTestCase {
|
|||
}
|
||||
String prefix = b.toString();
|
||||
|
||||
LightAutomaton a = BasicOperations.determinize(BasicAutomata.makeIntervalLight(min, max, digits ));
|
||||
LightAutomaton a = BasicOperations.determinize(BasicAutomata.makeIntervalLight(min, max, digits));
|
||||
if (random().nextBoolean()) {
|
||||
a = MinimizationOperationsLight.minimize(a);
|
||||
}
|
||||
|
@ -414,27 +460,24 @@ public class TestLightAutomaton extends LuceneTestCase {
|
|||
int x = random().nextInt(2*max);
|
||||
boolean expected = x >= min && x <= max;
|
||||
String sx = Integer.toString(x);
|
||||
if (digits > 0 && sx.length() < digits) {
|
||||
if (sx.length() < digits) {
|
||||
// Left-fill with 0s
|
||||
sx = b.substring(sx.length()) + sx;
|
||||
} else if (digits == 0) {
|
||||
// Left-fill with random number of 0s:
|
||||
int numZeros = random().nextInt(10);
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for(int i=0;i<numZeros;i++) {
|
||||
sb.append('0');
|
||||
}
|
||||
sb.append(sx);
|
||||
sx = sb.toString();
|
||||
}
|
||||
assertEquals(expected, BasicOperations.run(a, sx));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// nocommit testRemoveDead of an A accepting nothing should go to empty A (0 states)
|
||||
|
||||
public void testRemoveDead() throws Exception {
|
||||
LightAutomaton a = BasicOperations.concatenateLight(Arrays.asList(BasicAutomata.makeStringLight("x"),
|
||||
BasicAutomata.makeStringLight("y")));
|
||||
assertEquals(4, a.getNumStates());
|
||||
a = BasicOperations.removeDeadStates(a);
|
||||
assertEquals(3, a.getNumStates());
|
||||
}
|
||||
|
||||
// nocommit more tests ... it's an algebra
|
||||
|
||||
private void assertMatches(LightAutomaton a, String... strings) {
|
||||
Set<IntsRef> expected = new HashSet<>();
|
||||
for(String s : strings) {
|
||||
|
@ -444,4 +487,527 @@ public class TestLightAutomaton extends LuceneTestCase {
|
|||
|
||||
assertEquals(expected, SpecialOperations.getFiniteStrings(BasicOperations.determinize(a), -1));
|
||||
}
|
||||
|
||||
public void testConcatenatePreservesDet() throws Exception {
|
||||
LightAutomaton a1 = BasicAutomata.makeStringLight("foobar");
|
||||
assertTrue(a1.isDeterministic());
|
||||
LightAutomaton a2 = BasicAutomata.makeStringLight("baz");
|
||||
assertTrue(a2.isDeterministic());
|
||||
assertTrue((BasicOperations.concatenateLight(Arrays.asList(a1, a2)).isDeterministic()));
|
||||
}
|
||||
|
||||
public void testRemoveDeadStates() throws Exception {
|
||||
LightAutomaton a = BasicOperations.concatenateLight(Arrays.asList(BasicAutomata.makeStringLight("x"),
|
||||
BasicAutomata.makeStringLight("y")));
|
||||
assertEquals(4, a.getNumStates());
|
||||
a = BasicOperations.removeDeadStates(a);
|
||||
assertEquals(3, a.getNumStates());
|
||||
}
|
||||
|
||||
public void testRemoveDeadStatesEmpty1() throws Exception {
|
||||
LightAutomaton a = new LightAutomaton();
|
||||
a.finishState();
|
||||
assertTrue(BasicOperations.isEmpty(a));
|
||||
assertTrue(BasicOperations.isEmpty(BasicOperations.removeDeadStates(a)));
|
||||
}
|
||||
|
||||
public void testRemoveDeadStatesEmpty2() throws Exception {
|
||||
LightAutomaton a = new LightAutomaton();
|
||||
a.finishState();
|
||||
assertTrue(BasicOperations.isEmpty(a));
|
||||
assertTrue(BasicOperations.isEmpty(BasicOperations.removeDeadStates(a)));
|
||||
}
|
||||
|
||||
public void testRemoveDeadStatesEmpty3() throws Exception {
|
||||
LightAutomaton a = new LightAutomaton();
|
||||
int init = a.createState();
|
||||
int fini = a.createState();
|
||||
a.addTransition(init, fini, 'a');
|
||||
LightAutomaton a2 = BasicOperations.removeDeadStates(a);
|
||||
assertEquals(0, a2.getNumStates());
|
||||
}
|
||||
|
||||
public void testConcatEmpty() throws Exception {
|
||||
// If you concat empty automaton to anything the result should still be empty:
|
||||
LightAutomaton a = BasicOperations.concatenateLight(BasicAutomata.makeEmptyLight(),
|
||||
BasicAutomata.makeStringLight("foo"));
|
||||
assertEquals(new HashSet<IntsRef>(), SpecialOperations.getFiniteStrings(a, -1));
|
||||
|
||||
a = BasicOperations.concatenateLight(BasicAutomata.makeStringLight("foo"),
|
||||
BasicAutomata.makeEmptyLight());
|
||||
assertEquals(new HashSet<IntsRef>(), SpecialOperations.getFiniteStrings(a, -1));
|
||||
}
|
||||
|
||||
public void testSeemsNonEmptyButIsNot1() throws Exception {
|
||||
LightAutomaton a = new LightAutomaton();
|
||||
// Init state has a transition but doesn't lead to accept
|
||||
int init = a.createState();
|
||||
int s = a.createState();
|
||||
a.addTransition(init, s, 'a');
|
||||
a.finishState();
|
||||
assertTrue(BasicOperations.isEmpty(a));
|
||||
}
|
||||
|
||||
public void testSeemsNonEmptyButIsNot2() throws Exception {
|
||||
LightAutomaton a = new LightAutomaton();
|
||||
int init = a.createState();
|
||||
int s = a.createState();
|
||||
a.addTransition(init, s, 'a');
|
||||
// An orphan'd accept state
|
||||
s = a.createState();
|
||||
a.setAccept(s, true);
|
||||
a.finishState();
|
||||
assertTrue(BasicOperations.isEmpty(a));
|
||||
}
|
||||
|
||||
public void testSameLanguage1() throws Exception {
|
||||
LightAutomaton a = BasicAutomata.makeEmptyStringLight();
|
||||
LightAutomaton a2 = BasicAutomata.makeEmptyStringLight();
|
||||
int state = a2.createState();
|
||||
a2.addTransition(0, state, 'a');
|
||||
a2.finishState();
|
||||
assertTrue(BasicOperations.sameLanguage(BasicOperations.removeDeadStates(a),
|
||||
BasicOperations.removeDeadStates(a2)));
|
||||
}
|
||||
|
||||
private LightAutomaton randomNoOp(LightAutomaton a) {
|
||||
switch (random().nextInt(5)) {
|
||||
case 0:
|
||||
if (VERBOSE) {
|
||||
System.out.println(" randomNoOp: determinize");
|
||||
}
|
||||
return BasicOperations.determinize(a);
|
||||
case 1:
|
||||
if (VERBOSE) {
|
||||
System.out.println(" randomNoOp: minimize");
|
||||
}
|
||||
return MinimizationOperationsLight.minimize(a);
|
||||
case 2:
|
||||
if (VERBOSE) {
|
||||
System.out.println(" randomNoOp: removeDeadStates");
|
||||
}
|
||||
return BasicOperations.removeDeadStates(a);
|
||||
case 3:
|
||||
if (VERBOSE) {
|
||||
System.out.println(" randomNoOp: reverse reverse");
|
||||
}
|
||||
a = SpecialOperations.reverse(a);
|
||||
a = randomNoOp(a);
|
||||
return SpecialOperations.reverse(a);
|
||||
case 4:
|
||||
if (VERBOSE) {
|
||||
System.out.println(" randomNoOp: concat empty string");
|
||||
}
|
||||
return BasicOperations.concatenateLight(a, BasicAutomata.makeEmptyStringLight());
|
||||
case 5:
|
||||
if (VERBOSE) {
|
||||
System.out.println(" randomNoOp: union empty automaton");
|
||||
}
|
||||
return BasicOperations.unionLight(a, BasicAutomata.makeEmptyLight());
|
||||
}
|
||||
assert false;
|
||||
return null;
|
||||
}
|
||||
|
||||
private LightAutomaton unionTerms(Collection<BytesRef> terms) {
|
||||
LightAutomaton a;
|
||||
if (random().nextBoolean()) {
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: unionTerms: use union");
|
||||
}
|
||||
List<LightAutomaton> as = new ArrayList<>();
|
||||
for(BytesRef term : terms) {
|
||||
as.add(BasicAutomata.makeStringLight(term.utf8ToString()));
|
||||
}
|
||||
a = BasicOperations.unionLight(as);
|
||||
} else {
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: unionTerms: use makeStringUnion");
|
||||
}
|
||||
List<BytesRef> termsList = new ArrayList<>(terms);
|
||||
Collections.sort(termsList);
|
||||
a = BasicAutomata.makeStringUnionLight(termsList);
|
||||
}
|
||||
|
||||
return randomNoOp(a);
|
||||
}
|
||||
|
||||
private String getRandomString(boolean isAscii) {
|
||||
if (isAscii) {
|
||||
return TestUtil.randomSimpleString(random());
|
||||
} else {
|
||||
return TestUtil.randomRealisticUnicodeString(random());
|
||||
}
|
||||
}
|
||||
|
||||
public void testRandomFinite() throws Exception {
|
||||
|
||||
int numTerms = atLeast(10);
|
||||
int iters = atLeast(100);
|
||||
|
||||
// Some of the ops we do (stripping random byte, reverse) turn valid UTF8 into invalid if we allow non-ascii:
|
||||
boolean isAscii = random().nextBoolean();
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: isAscii=" + isAscii + " numTerms" + numTerms + " iters=" + iters);
|
||||
}
|
||||
|
||||
Set<BytesRef> terms = new HashSet<>();
|
||||
while (terms.size() < numTerms) {
|
||||
terms.add(new BytesRef(getRandomString(isAscii)));
|
||||
}
|
||||
|
||||
LightAutomaton a = unionTerms(terms);
|
||||
assertSame(terms, a);
|
||||
|
||||
for(int iter=0;iter<iters;iter++) {
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: iter=" + iter + " numTerms=" + terms.size());
|
||||
System.out.println(" terms:");
|
||||
for(BytesRef term : terms) {
|
||||
System.out.println(" " + term);
|
||||
}
|
||||
}
|
||||
switch(random().nextInt(14)) {
|
||||
|
||||
case 0:
|
||||
// concatenate prefix
|
||||
{
|
||||
if (VERBOSE) {
|
||||
System.out.println(" op=concat prefix");
|
||||
}
|
||||
Set<BytesRef> newTerms = new HashSet<>();
|
||||
BytesRef prefix = new BytesRef(getRandomString(isAscii));
|
||||
for(BytesRef term : terms) {
|
||||
BytesRef newTerm = BytesRef.deepCopyOf(prefix);
|
||||
newTerm.append(term);
|
||||
newTerms.add(newTerm);
|
||||
}
|
||||
terms = newTerms;
|
||||
boolean wasDeterministic1 = a.isDeterministic();
|
||||
a = BasicOperations.concatenateLight(BasicAutomata.makeStringLight(prefix.utf8ToString()), a);
|
||||
assertEquals(wasDeterministic1, a.isDeterministic());
|
||||
}
|
||||
break;
|
||||
|
||||
case 1:
|
||||
// concatenate suffix
|
||||
{
|
||||
BytesRef suffix = new BytesRef(getRandomString(isAscii));
|
||||
if (VERBOSE) {
|
||||
System.out.println(" op=concat suffix " + suffix);
|
||||
}
|
||||
Set<BytesRef> newTerms = new HashSet<>();
|
||||
for(BytesRef term : terms) {
|
||||
BytesRef newTerm = BytesRef.deepCopyOf(term);
|
||||
newTerm.append(suffix);
|
||||
newTerms.add(newTerm);
|
||||
}
|
||||
terms = newTerms;
|
||||
a = BasicOperations.concatenateLight(a, BasicAutomata.makeStringLight(suffix.utf8ToString()));
|
||||
}
|
||||
break;
|
||||
|
||||
// nocommit sometimes concat a suffix accepting more than 1 term, and sometimes non-det
|
||||
|
||||
case 2:
|
||||
// determinize
|
||||
if (VERBOSE) {
|
||||
System.out.println(" op=determinize");
|
||||
}
|
||||
a = BasicOperations.determinize(a);
|
||||
assertTrue(a.isDeterministic());
|
||||
break;
|
||||
|
||||
case 3:
|
||||
if (VERBOSE) {
|
||||
System.out.println(" op=minimize");
|
||||
}
|
||||
// minimize
|
||||
a = MinimizationOperationsLight.minimize(a);
|
||||
break;
|
||||
|
||||
case 4:
|
||||
// union
|
||||
{
|
||||
if (VERBOSE) {
|
||||
System.out.println(" op=union");
|
||||
}
|
||||
Set<BytesRef> newTerms = new HashSet<>();
|
||||
int numNewTerms = random().nextInt(5);
|
||||
while (newTerms.size() < numNewTerms) {
|
||||
newTerms.add(new BytesRef(getRandomString(isAscii)));
|
||||
}
|
||||
terms.addAll(newTerms);
|
||||
LightAutomaton newA = unionTerms(newTerms);
|
||||
a = BasicOperations.unionLight(a, newA);
|
||||
}
|
||||
break;
|
||||
|
||||
case 5:
|
||||
// optional
|
||||
{
|
||||
if (VERBOSE) {
|
||||
System.out.println(" op=optional");
|
||||
}
|
||||
a = BasicOperations.optionalLight(a);
|
||||
terms.add(new BytesRef());
|
||||
}
|
||||
break;
|
||||
|
||||
case 6:
|
||||
// minus finite
|
||||
{
|
||||
if (VERBOSE) {
|
||||
System.out.println(" op=minus finite");
|
||||
}
|
||||
if (terms.size() > 0) {
|
||||
RandomAcceptedStrings rasl = new RandomAcceptedStrings(BasicOperations.removeDeadStates(a));
|
||||
Set<BytesRef> toRemove = new HashSet<>();
|
||||
int numToRemove = TestUtil.nextInt(random(), 1, (terms.size()+1)/2);
|
||||
while (toRemove.size() < numToRemove) {
|
||||
int[] ints = rasl.getRandomAcceptedString(random());
|
||||
BytesRef term = new BytesRef(UnicodeUtil.newString(ints, 0, ints.length));
|
||||
if (toRemove.contains(term) == false) {
|
||||
toRemove.add(term);
|
||||
}
|
||||
}
|
||||
for(BytesRef term : toRemove) {
|
||||
boolean removed = terms.remove(term);
|
||||
assertTrue(removed);
|
||||
}
|
||||
LightAutomaton a2 = unionTerms(toRemove);
|
||||
a = BasicOperations.minusLight(a, a2);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case 7:
|
||||
{
|
||||
// minus infinite
|
||||
List<LightAutomaton> as = new ArrayList<>();
|
||||
int count = TestUtil.nextInt(random(), 1, 5);
|
||||
Set<Integer> prefixes = new HashSet<>();
|
||||
while(prefixes.size() < count) {
|
||||
// prefix is a leading ascii byte; we remove <prefix>* from a
|
||||
int prefix = random().nextInt(128);
|
||||
prefixes.add(prefix);
|
||||
}
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println(" op=minus infinite prefixes=" + prefixes);
|
||||
}
|
||||
|
||||
for(int prefix : prefixes) {
|
||||
// prefix is a leading ascii byte; we remove <prefix>* from a
|
||||
LightAutomaton a2 = new LightAutomaton();
|
||||
int init = a2.createState();
|
||||
int state = a2.createState();
|
||||
a2.addTransition(init, state, prefix);
|
||||
a2.setAccept(state, true);
|
||||
a2.addTransition(state, state, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
|
||||
a2.finishState();
|
||||
as.add(a2);
|
||||
Iterator<BytesRef> it = terms.iterator();
|
||||
while (it.hasNext()) {
|
||||
BytesRef term = it.next();
|
||||
if (term.length > 0 && (term.bytes[term.offset] & 0xFF) == prefix) {
|
||||
it.remove();
|
||||
}
|
||||
}
|
||||
}
|
||||
LightAutomaton a2 = randomNoOp(BasicOperations.unionLight(as));
|
||||
a = BasicOperations.minusLight(a, a2);
|
||||
}
|
||||
break;
|
||||
|
||||
case 8:
|
||||
{
|
||||
int count = TestUtil.nextInt(random(), 10, 20);
|
||||
if (VERBOSE) {
|
||||
System.out.println(" op=intersect infinite count=" + count);
|
||||
}
|
||||
// intersect infinite
|
||||
List<LightAutomaton> as = new ArrayList<>();
|
||||
|
||||
Set<Integer> prefixes = new HashSet<>();
|
||||
while(prefixes.size() < count) {
|
||||
int prefix = random().nextInt(128);
|
||||
prefixes.add(prefix);
|
||||
}
|
||||
if (VERBOSE) {
|
||||
System.out.println(" prefixes=" + prefixes);
|
||||
}
|
||||
|
||||
for(int prefix : prefixes) {
|
||||
// prefix is a leading ascii byte; we retain <prefix>* in a
|
||||
LightAutomaton a2 = new LightAutomaton();
|
||||
int init = a2.createState();
|
||||
int state = a2.createState();
|
||||
a2.addTransition(init, state, prefix);
|
||||
a2.setAccept(state, true);
|
||||
a2.addTransition(state, state, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
|
||||
a2.finishState();
|
||||
as.add(a2);
|
||||
prefixes.add(prefix);
|
||||
}
|
||||
|
||||
LightAutomaton a2 = BasicOperations.unionLight(as);
|
||||
if (random().nextBoolean()) {
|
||||
a2 = BasicOperations.determinize(a2);
|
||||
} else if (random().nextBoolean()) {
|
||||
a2 = MinimizationOperationsLight.minimize(a2);
|
||||
}
|
||||
a = BasicOperations.intersectionLight(a, a2);
|
||||
|
||||
Iterator<BytesRef> it = terms.iterator();
|
||||
while (it.hasNext()) {
|
||||
BytesRef term = it.next();
|
||||
if (term.length == 0 || prefixes.contains(term.bytes[term.offset]&0xff) == false) {
|
||||
if (VERBOSE) {
|
||||
System.out.println(" drop term=" + term);
|
||||
}
|
||||
it.remove();
|
||||
} else {
|
||||
if (VERBOSE) {
|
||||
System.out.println(" keep term=" + term);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case 9:
|
||||
// reverse
|
||||
if (VERBOSE) {
|
||||
System.out.println(" op=reverse");
|
||||
}
|
||||
a = SpecialOperations.reverse(a);
|
||||
Set<BytesRef> newTerms = new HashSet<>();
|
||||
for(BytesRef term : terms) {
|
||||
newTerms.add(new BytesRef(new StringBuilder(term.utf8ToString()).reverse().toString()));
|
||||
}
|
||||
terms = newTerms;
|
||||
break;
|
||||
|
||||
case 10:
|
||||
if (VERBOSE) {
|
||||
System.out.println(" op=randomNoOp");
|
||||
}
|
||||
a = randomNoOp(a);
|
||||
break;
|
||||
|
||||
case 11:
|
||||
// interval
|
||||
int min = random().nextInt(1000);
|
||||
int max = min + random().nextInt(50);
|
||||
// digits must be non-zero else we make cycle
|
||||
int digits = Integer.toString(max).length();
|
||||
if (VERBOSE) {
|
||||
System.out.println(" op=union interval min=" + min + " max=" + max + " digits=" + digits);
|
||||
}
|
||||
a = BasicOperations.unionLight(a, BasicAutomata.makeIntervalLight(min, max, digits));
|
||||
StringBuilder b = new StringBuilder();
|
||||
for(int i=0;i<digits;i++) {
|
||||
b.append('0');
|
||||
}
|
||||
String prefix = b.toString();
|
||||
for(int i=min;i<=max;i++) {
|
||||
String s = Integer.toString(i);
|
||||
if (s.length() < digits) {
|
||||
// Left-fill with 0s
|
||||
s = prefix.substring(s.length()) + s;
|
||||
}
|
||||
terms.add(new BytesRef(s));
|
||||
}
|
||||
break;
|
||||
|
||||
case 12:
|
||||
if (VERBOSE) {
|
||||
System.out.println(" op=remove the empty string");
|
||||
}
|
||||
a = BasicOperations.minusLight(a, BasicAutomata.makeEmptyStringLight());
|
||||
terms.remove(new BytesRef());
|
||||
break;
|
||||
|
||||
case 13:
|
||||
if (VERBOSE) {
|
||||
System.out.println(" op=add the empty string");
|
||||
}
|
||||
a = BasicOperations.unionLight(a, BasicAutomata.makeEmptyStringLight());
|
||||
terms.add(new BytesRef());
|
||||
break;
|
||||
}
|
||||
|
||||
assertSame(terms, a);
|
||||
}
|
||||
|
||||
assertSame(terms, a);
|
||||
}
|
||||
|
||||
private void assertSame(Collection<BytesRef> terms, LightAutomaton a) {
|
||||
|
||||
try {
|
||||
assertTrue(SpecialOperations.isFinite(a));
|
||||
assertFalse(BasicOperations.isTotal(a));
|
||||
|
||||
LightAutomaton detA = BasicOperations.determinize(a);
|
||||
|
||||
// Make sure all terms are accepted:
|
||||
IntsRef scratch = new IntsRef();
|
||||
for(BytesRef term : terms) {
|
||||
Util.toIntsRef(term, scratch);
|
||||
assertTrue("failed to accept term=" + term.utf8ToString(), BasicOperations.run(detA, term.utf8ToString()));
|
||||
}
|
||||
|
||||
// Use getFiniteStrings:
|
||||
Set<IntsRef> expected = new HashSet<>();
|
||||
for(BytesRef term : terms) {
|
||||
IntsRef intsRef = new IntsRef();
|
||||
Util.toUTF32(term.utf8ToString(), intsRef);
|
||||
expected.add(intsRef);
|
||||
}
|
||||
Set<IntsRef> actual = SpecialOperations.getFiniteStrings(a, -1);
|
||||
|
||||
if (expected.equals(actual) == false) {
|
||||
System.out.println("FAILED:");
|
||||
for(IntsRef term : expected) {
|
||||
if (actual.contains(term) == false) {
|
||||
System.out.println(" term=" + term + " should be accepted but isn't");
|
||||
}
|
||||
}
|
||||
for(IntsRef term : actual) {
|
||||
if (expected.contains(term) == false) {
|
||||
System.out.println(" term=" + term + " is accepted but should not be");
|
||||
}
|
||||
}
|
||||
throw new AssertionError("mismatch");
|
||||
}
|
||||
|
||||
// Use sameLanguage:
|
||||
LightAutomaton a2 = BasicOperations.removeDeadStates(BasicOperations.determinize(unionTerms(terms)));
|
||||
assertTrue(BasicOperations.sameLanguage(a2, BasicOperations.removeDeadStates(BasicOperations.determinize(a))));
|
||||
|
||||
// Do same check, in UTF8 space
|
||||
LightAutomaton utf8 = randomNoOp(new UTF32ToUTF8Light().convert(a));
|
||||
|
||||
Set<IntsRef> expected2 = new HashSet<>();
|
||||
for(BytesRef term : terms) {
|
||||
IntsRef intsRef = new IntsRef();
|
||||
Util.toIntsRef(term, intsRef);
|
||||
expected2.add(intsRef);
|
||||
}
|
||||
assertEquals(expected2, SpecialOperations.getFiniteStrings(utf8, -1));
|
||||
} catch (AssertionError ae) {
|
||||
System.out.println("TEST: FAILED: not same");
|
||||
System.out.println(" terms (count=" + terms.size() + "):");
|
||||
for(BytesRef term : terms) {
|
||||
System.out.println(" " + term);
|
||||
}
|
||||
System.out.println(" automaton:");
|
||||
System.out.println(a.toDot());
|
||||
//a.writeDot("fail");
|
||||
throw ae;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -28,8 +28,8 @@ public class TestMinimize extends LuceneTestCase {
|
|||
int num = atLeast(200);
|
||||
for (int i = 0; i < num; i++) {
|
||||
LightAutomaton a = AutomatonTestUtil.randomAutomaton(random());
|
||||
LightAutomaton la = BasicOperations.determinize(a);
|
||||
LightAutomaton lb = BasicOperations.determinize(MinimizationOperationsLight.minimize(a));
|
||||
LightAutomaton la = BasicOperations.determinize(BasicOperations.removeDeadStates(a));
|
||||
LightAutomaton lb = MinimizationOperationsLight.minimize(a);
|
||||
assertTrue(BasicOperations.sameLanguage(la, lb));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,13 +17,17 @@ package org.apache.lucene.util.automaton;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.HashSet;
|
||||
import java.util.Random;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Random;
|
||||
import org.apache.lucene.util.fst.Util;
|
||||
|
||||
public class TestUTF32ToUTF8 extends LuceneTestCase {
|
||||
|
||||
|
@ -203,11 +207,25 @@ public class TestUTF32ToUTF8 extends LuceneTestCase {
|
|||
assertAutomaton(new RegExp(AutomatonTestUtil.randomRegexp(random()), RegExp.NONE).toLightAutomaton());
|
||||
}
|
||||
}
|
||||
|
||||
public void testSingleton() throws Exception {
|
||||
int iters = atLeast(100);
|
||||
for(int iter=0;iter<iters;iter++) {
|
||||
String s = TestUtil.randomRealisticUnicodeString(random());
|
||||
LightAutomaton a = BasicAutomata.makeStringLight(s);
|
||||
LightAutomaton utf8 = new UTF32ToUTF8Light().convert(a);
|
||||
IntsRef ints = new IntsRef();
|
||||
Util.toIntsRef(new BytesRef(s), ints);
|
||||
Set<IntsRef> set = new HashSet<>();
|
||||
set.add(ints);
|
||||
assertEquals(set, SpecialOperations.getFiniteStrings(utf8, -1));
|
||||
}
|
||||
}
|
||||
|
||||
private void assertAutomaton(LightAutomaton automaton) throws Exception {
|
||||
CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton);
|
||||
ByteRunAutomaton bra = new ByteRunAutomaton(automaton);
|
||||
final AutomatonTestUtil.RandomAcceptedStringsLight ras = new AutomatonTestUtil.RandomAcceptedStringsLight(automaton);
|
||||
final AutomatonTestUtil.RandomAcceptedStrings ras = new AutomatonTestUtil.RandomAcceptedStrings(automaton);
|
||||
|
||||
int num = atLeast(1000);
|
||||
for (int i = 0; i < num; i++) {
|
||||
|
|
|
@ -328,7 +328,7 @@ public class AnalyzingSuggester extends Lookup {
|
|||
}
|
||||
}
|
||||
|
||||
result.finish();
|
||||
result.finishState();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
|
|
@ -22,7 +22,6 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.automaton.BasicOperations;
|
||||
import org.apache.lucene.util.automaton.LightAutomaton;
|
||||
import org.apache.lucene.util.automaton.Transition;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
|
@ -72,6 +71,10 @@ public class FSTUtil {
|
|||
assert a.isDeterministic();
|
||||
final List<Path<T>> queue = new ArrayList<>();
|
||||
final List<Path<T>> endNodes = new ArrayList<>();
|
||||
if (a.getNumStates() == 0) {
|
||||
return endNodes;
|
||||
}
|
||||
|
||||
queue.add(new Path<>(0, fst
|
||||
.getFirstArc(new FST.Arc<T>()), fst.outputs.getNoOutput(),
|
||||
new IntsRef()));
|
||||
|
|
|
@ -20,7 +20,6 @@ package org.apache.lucene.util.automaton;
|
|||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.IdentityHashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
@ -31,7 +30,6 @@ import org.apache.lucene.util.ArrayUtil;
|
|||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util.fst.Util;
|
||||
|
||||
/**
|
||||
* Utilities for testing automata.
|
||||
|
@ -136,7 +134,7 @@ public class AutomatonTestUtil {
|
|||
* Once created, call {@link #getRandomAcceptedString(Random)}
|
||||
* to get a new string (in UTF-32 codepoints).
|
||||
*/
|
||||
public static class RandomAcceptedStringsLight {
|
||||
public static class RandomAcceptedStrings {
|
||||
|
||||
private final Map<Transition,Boolean> leadsToAccept;
|
||||
private final LightAutomaton a;
|
||||
|
@ -152,7 +150,7 @@ public class AutomatonTestUtil {
|
|||
}
|
||||
}
|
||||
|
||||
public RandomAcceptedStringsLight(LightAutomaton a) {
|
||||
public RandomAcceptedStrings(LightAutomaton a) {
|
||||
this.a = a;
|
||||
if (a.getNumStates() == 0) {
|
||||
throw new IllegalArgumentException("this automaton accepts nothing");
|
||||
|
@ -334,6 +332,9 @@ public class AutomatonTestUtil {
|
|||
* Determinizes the given automaton using the given set of initial states.
|
||||
*/
|
||||
public static LightAutomaton determinizeSimpleLight(LightAutomaton a, Set<Integer> initialset) {
|
||||
if (a.getNumStates() == 0) {
|
||||
return a;
|
||||
}
|
||||
int[] points = a.getStartPoints();
|
||||
// subset construction
|
||||
Map<Set<Integer>, Set<Integer>> sets = new HashMap<>();
|
||||
|
@ -448,6 +449,9 @@ public class AutomatonTestUtil {
|
|||
* this is only used to test the correctness of our faster implementation.
|
||||
*/
|
||||
public static boolean isFiniteSlow(LightAutomaton a) {
|
||||
if (a.getNumStates() == 0) {
|
||||
return true;
|
||||
}
|
||||
return isFiniteSlow(a, 0, new HashSet<Integer>());
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue