LUCENE-5752: improve tests; move isEmpty out of LA into BasicOps; BasicOps.sameLanguage requires no dead states; rename LA.finish -> finishState

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5752@1602966 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2014-06-16 19:22:02 +00:00
parent 5927445bb6
commit ce2f7e9c8c
24 changed files with 944 additions and 478 deletions

View File

@ -35,8 +35,6 @@ import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
@ -154,8 +152,6 @@ public class FuzzyTermsEnum extends TermsEnum {
if (editDistance < runAutomata.size()) {
//System.out.println("FuzzyTE.getAEnum: ed=" + editDistance + " lastTerm=" + (lastTerm==null ? "null" : lastTerm.utf8ToString()));
final CompiledAutomaton compiled = runAutomata.get(editDistance);
compiled.lightAutomaton.writeDot("compiled");
return new AutomatonFuzzyTermsEnum(terms.intersect(compiled, lastTerm == null ? null : compiled.floor(lastTerm, new BytesRef())),
runAutomata.subList(0, editDistance + 1).toArray(new CompiledAutomaton[editDistance + 1]));
} else {

View File

@ -47,7 +47,7 @@ final public class BasicAutomata {
*/
public static LightAutomaton makeEmptyLight() {
LightAutomaton a = new LightAutomaton();
a.finish();
a.finishState();
return a;
}
@ -69,17 +69,10 @@ final public class BasicAutomata {
int s = a.createState();
a.setAccept(s, true);
a.addTransition(s, s, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
a.finish();
a.finishState();
return a;
}
public static int appendAnyString(LightAutomaton a, int state) {
int newState = a.createState();
a.addTransition(state, newState, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
a.addTransition(newState, newState, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
return newState;
}
/**
* Returns a new (deterministic) automaton that accepts any single codepoint.
*/
@ -120,7 +113,7 @@ final public class BasicAutomata {
int s2 = a.createState();
a.setAccept(s2, true);
a.addTransition(s1, s2, min, max);
a.finish();
a.finishState();
return a;
}
@ -260,10 +253,11 @@ final public class BasicAutomata {
LightAutomaton a1 = builder.finish();
if (digits <= 0) {
a1.addTransition(0, 0, '0');
for (int p : initials) {
a1.addEpsilon(0, p);
}
a1.finish();
a1.finishState();
}
return a1;
@ -284,7 +278,10 @@ final public class BasicAutomata {
}
a.setAccept(lastState, true);
a.finish();
a.finishState();
assert a.isDeterministic();
assert BasicOperations.hasDeadStates(a) == false;
return a;
}
@ -303,7 +300,7 @@ final public class BasicAutomata {
s = s2;
}
a.setAccept(s, true);
a.finish();
a.finishState();
return a;
}

View File

@ -60,7 +60,6 @@ final public class BasicOperations {
* Complexity: linear in total number of states.
*/
static public LightAutomaton concatenateLight(LightAutomaton a1, LightAutomaton a2) {
// nocommit we lost the two-arg optimization here (prepend tiny automaton in front of huge one)
// Two-argument convenience form: wraps a1 and a2, in that order, into a list
// and delegates to the list-based concatenateLight overload.
return concatenateLight(Arrays.asList(a1, a2));
}
@ -75,6 +74,10 @@ final public class BasicOperations {
// First pass: create all states
for(LightAutomaton a : l) {
if (a.getNumStates() == 0) {
result.finishState();
return result;
}
int numStates = a.getNumStates();
for(int s=0;s<numStates;s++) {
result.createState();
@ -133,7 +136,7 @@ final public class BasicOperations {
result.createState();
}
result.finish();
result.finishState();
return result;
}
@ -148,28 +151,11 @@ final public class BasicOperations {
LightAutomaton result = new LightAutomaton();
result.createState();
result.setAccept(0, true);
int numStates = a.getNumStates();
for(int i=0;i<numStates;i++) {
result.createState();
result.setAccept(i+1, a.isAccept(i));
if (a.getNumStates() > 0) {
result.copy(a);
result.addEpsilon(0, 1);
}
Transition t = new Transition();
int count = a.initTransition(0, t);
for(int i=0;i<count;i++) {
a.getNextTransition(t);
result.addTransition(0, 1+t.dest, t.min, t.max);
}
for(int i=0;i<numStates;i++) {
count = a.initTransition(i, t);
for(int j=0;j<count;j++) {
a.getNextTransition(t);
result.addTransition(1+t.source, 1+t.dest, t.min, t.max);
}
}
result.finish();
result.finishState();
return result;
}
@ -207,8 +193,6 @@ final public class BasicOperations {
return builder.finish();
}
// nocommit move to AutomatonTestUtil
/**
* Returns an automaton that accepts <code>min</code> or more concatenated
* repetitions of the language of the given automaton.
@ -268,7 +252,7 @@ final public class BasicOperations {
}
}
b.finish();
b.finishState();
return b;
}
@ -355,99 +339,57 @@ final public class BasicOperations {
}
}
}
c.finish();
c.finishState();
return removeDeadStates(c);
}
/**
* Returns an automaton that accepts the intersection of the languages of the
* given automata. Never modifies the input automata languages.
* <p>
* Complexity: quadratic in number of states.
*/
/*
// nocommit broken
static public LightAutomaton intersectionLight(LightAutomaton a1, LightAutomaton a2) {
if (a1 == a2) {
return a1;
}
LightAutomaton result = new LightAutomaton();
result.createState();
//Transition[][] transitions1 = a1.getSortedTransitions();
//Transition[][] transitions2 = a2.getSortedTransitions();
LinkedList<LightStatePair> worklist = new LinkedList<>();
HashMap<LightStatePair,LightStatePair> newstates = new HashMap<>();
LightStatePair p = new LightStatePair(0, 0, 0);
worklist.add(p);
newstates.put(p, p);
LightAutomaton.Transition t1 = new LightAutomaton.Transition();
LightAutomaton.Transition t2 = new LightAutomaton.Transition();
while (worklist.size() > 0) {
p = worklist.removeFirst();
result.setAccept(p.s, a1.isAccept(p.s1) && a2.isAccept(p.s2));
int numT1 = a1.initTransition(p.s1, t1);
if (numT1 > 0) {
a1.getNextTransition(t1);
}
int numT2 = a2.initTransition(p.s2, t2);
if (numT2 > 0) {
a2.getNextTransition(t2);
}
//Transition[] t1 = transitions1[p.s1.number];
//Transition[] t2 = transitions2[p.s2.number];
for (int n1 = 0, b2 = 0; n1 < numT1; n1++) {
while (b2 < numT2 && t2.max < t1.min) {
b2++;
if (b2 < numT2) {
a2.getNextTransition(t2);
}
}
for (int n2 = b2; n2 < numT2 && t1.max >= t2.min; n2++) {
if (t2.max >= t1.min) {
LightStatePair q = new LightStatePair(t1.dest, t2.dest);
LightStatePair r = newstates.get(q);
if (r == null) {
q.s = result.createState();
worklist.add(q);
newstates.put(q, q);
r = q;
}
int min = t1.min > t2.min ? t1.min : t2.min;
int max = t1.max < t2.max ? t1.max : t2.max;
result.addTransition(p.s, r.s, min, max);
}
if (n2 < numT2-1) {
a2.getNextTransition(t2);
}
}
}
}
result.finish();
return result.removeDeadTransitions();
}
*/
/** Returns true if these two automata accept exactly the
* same language. This is a costly computation! Note
* also that a1 and a2 will be determinized as a side
* effect. Both automata must be determinized first! */
* effect. Both automata must be determinized and have
* no dead states! */
public static boolean sameLanguage(LightAutomaton a1, LightAutomaton a2) {
if (a1 == a2) {
return true;
}
if (a1.isEmpty() && a2.isEmpty()) {
return true;
}
return subsetOf(a2, a1) && subsetOf(a1, a2);
}
// TODO: move to test-framework?
/**
 * Reports whether the automaton contains at least one dead state, i.e. a state
 * that is either unreachable from the initial state or unable to reach any
 * accept state.
 * Cost is O(numTransitions+numStates).
 *
 * @param a the automaton to inspect
 * @return true if fewer states are live than exist in total
 */
public static boolean hasDeadStates(LightAutomaton a) {
  final BitSet live = getLiveStates(a);
  final int liveCount = live.cardinality();
  final int totalCount = a.getNumStates();
  // The live set is a subset of all states, so it can never be the larger one:
  assert liveCount <= totalCount: "numLive=" + liveCount + " numStates=" + totalCount + " " + live;
  return liveCount < totalCount;
}
// TODO: move to test-framework?
/**
 * Reports whether some state is reachable from the initial state yet cannot
 * reach any accept state.
 *
 * @param a the automaton to inspect
 * @return true if at least one such dead state exists
 */
public static boolean hasDeadStatesFromInitial(LightAutomaton a) {
  final BitSet deadButReachable = getLiveStatesFromInitial(a);
  // Strike every state that can still get to an accept state; what is left is dead:
  deadButReachable.andNot(getLiveStatesToAccept(a));
  return !deadButReachable.isEmpty();
}
// TODO: move to test-framework?
/**
 * Reports whether some state can reach an accept state yet is itself not
 * reachable from the initial state.
 *
 * @param a the automaton to inspect
 * @return true if at least one such dead state exists
 */
public static boolean hasDeadStatesToAccept(LightAutomaton a) {
  final BitSet fromInitial = getLiveStatesFromInitial(a);
  final BitSet deadButAccepting = getLiveStatesToAccept(a);
  // Strike every state the initial state can get to; what is left is dead:
  deadButAccepting.andNot(fromInitial);
  return !deadButAccepting.isEmpty();
}
/**
* Returns true if the language of <code>a1</code> is a subset of the language
* of <code>a2</code>. Both automata must be determinized.
* of <code>a2</code>. Both automata must be determinized and must have no dead
* states.
* <p>
* Complexity: quadratic in number of states.
*/
@ -458,6 +400,15 @@ final public class BasicOperations {
if (a2.isDeterministic() == false) {
throw new IllegalArgumentException("a2 must be deterministic");
}
assert hasDeadStatesFromInitial(a1) == false;
assert hasDeadStatesFromInitial(a2) == false;
if (a1.getNumStates() == 0) {
// Empty language is always a subset of any other language
return true;
} else if (a2.getNumStates() == 0) {
return isEmpty(a1);
}
// TODO: cutover to iterators instead
Transition[][] transitions1 = a1.getSortedTransitions();
Transition[][] transitions2 = a2.getSortedTransitions();
@ -503,75 +454,6 @@ final public class BasicOperations {
return true;
}
/**
* Returns true if the language of <code>a1</code> is a subset of the language
* of <code>a2</code>. Both automata must be determinized.
* <p>
* Complexity: quadratic in number of states.
*/
/*
// nocommit low GC but broken!
public static boolean subsetOf(LightAutomaton a1, LightAutomaton a2) {
if (a1 == a2) return true;
LinkedList<LightStatePair> worklist = new LinkedList<>();
HashSet<LightStatePair> visited = new HashSet<>();
LightStatePair p = new LightStatePair(0, 0);
worklist.add(p);
visited.add(p);
LightAutomaton.Transition t1 = new LightAutomaton.Transition();
LightAutomaton.Transition t2 = new LightAutomaton.Transition();
while (worklist.size() > 0) {
p = worklist.removeFirst();
System.out.println("pop s1=" + p.s1 + " s2=" + p.s2);
if (a1.isAccept(p.s1) && a2.isAccept(p.s2) == false) {
return false;
}
int numT1 = a1.initTransition(p.s1, t1);
for (int n1 = 0, b2 = 0; n1 < numT1; n1++) {
int numT2 = a2.initTransition(p.s2, t2);
if (numT2 > 0) {
a2.getNextTransition(t2);
}
a1.getNextTransition(t1);
while (b2 < numT2 && t2.max < t1.min) {
b2++;
if (b2 < numT2) {
a2.getNextTransition(t2);
}
}
int min1 = t1.min, max1 = t1.max;
for (int n2 = b2; n2 < numT2 && t1.max >= t2.min; n2++) {
if (t2.min > min1) {
return false;
}
if (t2.max < Character.MAX_CODE_POINT) {
min1 = t2.max + 1;
} else {
min1 = Character.MAX_CODE_POINT;
max1 = Character.MIN_CODE_POINT;
}
LightStatePair q = new LightStatePair(t1.dest, t2.dest);
if (!visited.contains(q)) {
worklist.add(q);
visited.add(q);
}
if (n2 < numT2-1) {
a2.getNextTransition(t2);
}
}
if (min1 <= max1) {
return false;
}
}
}
return true;
}
*/
/**
* Returns an automaton that accepts the union of the languages of the given
* automata.
@ -582,64 +464,6 @@ final public class BasicOperations {
return unionLight(Arrays.asList(a1, a2));
}
/**
* Returns an automaton that accepts the union of the languages of the given
* automata.
* <p>
* Complexity: linear in number of states.
*/
/*
public static LightAutomaton unionLight(Collection<LightAutomaton> l) {
LightAutomaton result = new LightAutomaton();
// Create initial node:
result.createState();
int stateOffset = 1;
// First pass, adding all states epsilon transitions:
LightAutomaton.Transition t = new LightAutomaton.Transition();
for(LightAutomaton a : l) {
int numStates = a.getNumStates();
if (a.isAccept(0)) {
// If any automaton accepts empty string, we do too:
result.setAccept(0, true);
}
for(int s=0;s<numStates;s++) {
int state = result.createState();
result.setAccept(state, a.isAccept(s));
}
// Add epsilon transition from new initial state to this automaton's initial state:
int numTransitions = a.initTransition(0, t);
for(int i=0;i<numTransitions;i++) {
a.getNextTransition(t);
result.addTransition(0, stateOffset + t.dest, t.min, t.max);
}
stateOffset += numStates;
}
// Second pass, copying over all other transitions:
stateOffset = 1;
for(LightAutomaton a : l) {
int numStates = a.getNumStates();
for(int s=0;s<numStates;s++) {
int numTransitions = a.initTransition(s, t);
for(int i=0;i<numTransitions;i++) {
a.getNextTransition(t);
result.addTransition(stateOffset + s, stateOffset + t.dest, t.min, t.max);
}
}
stateOffset += numStates;
}
result.finish();
return result;
}
*/
public static LightAutomaton unionLight(Collection<LightAutomaton> l) {
LightAutomaton result = new LightAutomaton();
@ -662,7 +486,7 @@ final public class BasicOperations {
stateOffset += a.getNumStates();
}
result.finish();
result.finishState();
return result;
}
@ -804,9 +628,11 @@ final public class BasicOperations {
*/
public static LightAutomaton determinize(LightAutomaton a) {
if (a.isDeterministic()) {
// Already determinized
return a;
}
if (a.getNumStates() == 0) {
if (a.getNumStates() <= 1) {
// Already determinized
return a;
}
@ -932,11 +758,45 @@ final public class BasicOperations {
* Returns true if the given automaton accepts no strings.
*/
public static boolean isEmpty(LightAutomaton a) {
return a.isAccept(0) == false && a.getNumTransitions(0) == 0;
if (a.getNumStates() == 0) {
// Common case: no states
return true;
}
if (a.isAccept(0) == false && a.getNumTransitions(0) == 0) {
// Common case: just one initial state
return true;
}
if (a.isAccept(0) == true) {
// Apparently common case: it accepts the damned empty string
return false;
}
LinkedList<Integer> workList = new LinkedList<>();
BitSet seen = new BitSet(a.getNumStates());
workList.add(0);
seen.set(0);
Transition t = new Transition();
while (workList.isEmpty() == false) {
int state = workList.removeFirst();
if (a.isAccept(state)) {
return false;
}
int count = a.initTransition(state, t);
for(int i=0;i<count;i++) {
a.getNextTransition(t);
if (seen.get(t.dest) == false) {
workList.add(t.dest);
seen.set(t.dest);
}
}
}
return true;
}
/**
* Returns true if the given automaton accepts all strings.
* Returns true if the given automaton accepts all strings. The automaton must be minimized.
*/
public static boolean isTotal(LightAutomaton a) {
if (a.isAccept(0) && a.getNumTransitions(0) == 1) {
@ -993,25 +853,18 @@ final public class BasicOperations {
* reachable from it and if it is reachable from the initial state.
*/
private static BitSet getLiveStates(LightAutomaton a) {
int numStates = a.getNumStates();
BitSet reachableFromInitial = getLiveStatesFromInitial(a);
BitSet reachableFromAccept = getLiveStatesFromInitial(SpecialOperations.reverse(a));
for(int acceptState : a.getAcceptStates()) {
reachableFromAccept.set(1+acceptState);
}
for(int i=0;i<numStates;i++) {
if (reachableFromAccept.get(i+1) == false) {
reachableFromInitial.clear(i);
}
}
return reachableFromInitial;
BitSet live = getLiveStatesFromInitial(a);
live.and(getLiveStatesToAccept(a));
return live;
}
/** Returns bitset marking states reachable from the initial node. */
/** Returns bitset marking states reachable from the initial state. */
private static BitSet getLiveStatesFromInitial(LightAutomaton a) {
int numStates = a.getNumStates();
BitSet live = new BitSet(numStates);
if (numStates == 0) {
return live;
}
LinkedList<Integer> workList = new LinkedList<>();
live.set(0);
workList.add(0);
@ -1032,6 +885,47 @@ final public class BasicOperations {
return live;
}
/**
 * Computes the set of states from which some accept state can be reached.
 * Every transition of {@code a} is copied with its direction flipped, and the
 * flipped graph is then walked breadth-first starting from the accept states.
 *
 * @param a the automaton to inspect
 * @return bitset with one bit set per state that can reach an accept state
 */
private static BitSet getLiveStatesToAccept(LightAutomaton a) {
  final int stateCount = a.getNumStates();
  final Transition scratch = new Transition();

  // NOTE: not quite the same thing as what SpecialOperations.reverse does:
  final LightAutomaton.Builder flipped = new LightAutomaton.Builder();
  for (int i = 0; i < stateCount; i++) {
    flipped.createState();
  }
  for (int source = 0; source < stateCount; source++) {
    int remaining = a.initTransition(source, scratch);
    while (remaining-- > 0) {
      a.getNextTransition(scratch);
      // Same label range, opposite direction:
      flipped.addTransition(scratch.dest, source, scratch.min, scratch.max);
    }
  }
  final LightAutomaton reversed = flipped.finish();

  // Seed the search with every accept state:
  final LinkedList<Integer> pending = new LinkedList<>();
  final BitSet canReachAccept = new BitSet(stateCount);
  for (int acceptState : a.getAcceptStates()) {
    canReachAccept.set(acceptState);
    pending.add(acceptState);
  }

  // Following a flipped transition leads to a predecessor in the original automaton:
  while (!pending.isEmpty()) {
    final int current = pending.removeFirst();
    int remaining = reversed.initTransition(current, scratch);
    while (remaining-- > 0) {
      reversed.getNextTransition(scratch);
      final int predecessor = scratch.dest;
      if (!canReachAccept.get(predecessor)) {
        canReachAccept.set(predecessor);
        pending.add(predecessor);
      }
    }
  }

  return canReachAccept;
}
/**
* Removes transitions to dead states (a state is "dead" if it is not
* reachable from the initial state or no accept state is reachable from it.)
@ -1066,10 +960,8 @@ final public class BasicOperations {
}
}
// nocommit need test case for "accepts no strings"
result.finish();
result.finishState();
assert hasDeadStates(result) == false;
return result;
}
}

View File

@ -19,7 +19,6 @@ package org.apache.lucene.util.automaton;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.index.Terms;
@ -91,10 +90,13 @@ public class CompiledAutomaton {
}
public CompiledAutomaton(LightAutomaton automaton, Boolean finite, boolean simplify) {
if (simplify) {
// Test whether the automaton is a "simple" form and
// if so, don't create a runAutomaton. Note that on a
// large automaton these tests could be costly:
if (BasicOperations.isEmpty(automaton)) {
// matches nothing
type = AUTOMATON_TYPE.NONE;
@ -104,6 +106,7 @@ public class CompiledAutomaton {
lightAutomaton = null;
this.finite = null;
return;
// NOTE: only approximate, because automaton may not be minimal:
} else if (BasicOperations.isTotal(automaton)) {
// matches all possible strings
type = AUTOMATON_TYPE.ALL;
@ -138,6 +141,7 @@ public class CompiledAutomaton {
} else if (commonPrefix.length() > 0) {
LightAutomaton other = BasicOperations.concatenateLight(BasicAutomata.makeStringLight(commonPrefix), BasicAutomata.makeAnyStringLight());
other = BasicOperations.determinize(other);
assert BasicOperations.hasDeadStates(other) == false;
if (BasicOperations.sameLanguage(automaton, other)) {
// matches a constant prefix
type = AUTOMATON_TYPE.PREFIX;
@ -169,10 +173,10 @@ public class CompiledAutomaton {
}
runAutomaton = new ByteRunAutomaton(utf8, true);
lightAutomaton = runAutomaton.a;
lightAutomaton = runAutomaton.automaton;
}
private Transition scratch = new Transition();
private Transition transition = new Transition();
//private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG;
@ -181,31 +185,29 @@ public class CompiledAutomaton {
//System.out.println(lightAutomaton.toDot());
// Find biggest transition that's < label
// TODO: use binary search here
lightAutomaton.initTransition(state, scratch);
int numTransitions = lightAutomaton.getNumTransitions(state);
int maxIndex = -1;
int lastMin = 0;
int numTransitions = lightAutomaton.initTransition(state, transition);
for(int i=0;i<numTransitions;i++) {
lightAutomaton.getNextTransition(scratch);
if (scratch.min < leadLabel) {
lightAutomaton.getNextTransition(transition);
if (transition.min < leadLabel) {
maxIndex = i;
} else {
// Transitions are always sorted
break;
}
assert scratch.min >= lastMin;
lastMin = scratch.min;
// nocommit else break?
}
//System.out.println(" maxIndex=" + maxIndex);
assert maxIndex != -1;
lightAutomaton.getTransition(state, maxIndex, scratch);
lightAutomaton.getTransition(state, maxIndex, transition);
// Append floorLabel
final int floorLabel;
if (scratch.max > leadLabel-1) {
if (transition.max > leadLabel-1) {
floorLabel = leadLabel-1;
} else {
floorLabel = scratch.max;
floorLabel = transition.max;
}
//System.out.println(" floorLabel=" + (char) floorLabel);
if (idx >= term.bytes.length) {
@ -214,7 +216,7 @@ public class CompiledAutomaton {
//if (DEBUG) System.out.println(" add floorLabel=" + (char) floorLabel + " idx=" + idx);
term.bytes[idx] = (byte) floorLabel;
state = scratch.dest;
state = transition.dest;
//System.out.println(" dest: " + state);
idx++;
@ -231,14 +233,14 @@ public class CompiledAutomaton {
// We are pushing "top" -- so get last label of
// last transition:
//System.out.println("get state=" + state + " numTrans=" + numTransitions);
lightAutomaton.getTransition(state, numTransitions-1, scratch);
lightAutomaton.getTransition(state, numTransitions-1, transition);
if (idx >= term.bytes.length) {
term.grow(1+idx);
}
//if (DEBUG) System.out.println(" push maxLabel=" + (char) lastTransition.max + " idx=" + idx);
//System.out.println(" add trans dest=" + scratch.dest + " label=" + (char) scratch.max);
term.bytes[idx] = (byte) scratch.max;
state = scratch.dest;
term.bytes[idx] = (byte) transition.max;
state = transition.dest;
idx++;
}
}
@ -326,9 +328,9 @@ public class CompiledAutomaton {
//if (DEBUG) System.out.println(" return " + output.utf8ToString());
return output;
} else {
lightAutomaton.getTransition(state, 0, scratch);
lightAutomaton.getTransition(state, 0, transition);
if (label-1 < scratch.min) {
if (label-1 < transition.min) {
if (runAutomaton.isAccept(state)) {
output.length = idx;
@ -380,18 +382,17 @@ public class CompiledAutomaton {
b.append(" initial [shape=plaintext,label=\"\"];\n");
b.append(" initial -> ").append(i).append("\n");
}
lightAutomaton.initTransition(i, scratch);
int numTransitions = lightAutomaton.getNumTransitions(i);
int numTransitions = lightAutomaton.initTransition(i, transition);
for (int j = 0; j < numTransitions; j++) {
b.append(" ").append(i);
b.append(" -> ");
b.append(scratch.dest);
b.append(scratch.min);
if (scratch.min != scratch.max) {
b.append(transition.dest);
b.append(transition.min);
if (transition.min != transition.max) {
b.append("-");
b.append(scratch.max);
b.append(transition.max);
}
lightAutomaton.getNextTransition(scratch);
lightAutomaton.getNextTransition(transition);
}
}
return b.append("}\n").toString();

View File

@ -159,8 +159,6 @@ public class LevenshteinAutomata {
lastState = a.createState();
}
// nocommit why are so many dead states created here?
int stateOffset = lastState;
a.setAccept(lastState, description.isAccept(0));
@ -170,6 +168,8 @@ public class LevenshteinAutomata {
a.setAccept(state, description.isAccept(i));
}
// TODO: this creates bogus states/transitions (states are final, have self loops, and can't be reached from an init state)
// create transitions from state to state
for (int k = 0; k < numStates; k++) {
final int xpos = description.getPosition(k);
@ -183,10 +183,7 @@ public class LevenshteinAutomata {
final int cvec = getVector(ch, xpos, end);
int dest = description.transition(k, xpos, cvec);
if (dest >= 0) {
// nocommit why do we create cycles in dead states?
if (k != dest) {
a.addTransition(stateOffset+k, stateOffset+dest, ch);
}
a.addTransition(stateOffset+k, stateOffset+dest, ch);
}
}
// add transitions for all other chars in unicode
@ -195,15 +192,12 @@ public class LevenshteinAutomata {
int dest = description.transition(k, xpos, 0); // by definition
if (dest >= 0) {
for (int r = 0; r < numRanges; r++) {
// nocommit why do we create cycles in dead states?
if (k != dest) {
a.addTransition(stateOffset+k, stateOffset+dest, rangeLower[r], rangeUpper[r]);
}
a.addTransition(stateOffset+k, stateOffset+dest, rangeLower[r], rangeUpper[r]);
}
}
}
a.finish();
a.finishState();
assert a.isDeterministic();
return a;
}

View File

@ -17,12 +17,10 @@ package org.apache.lucene.util.automaton;
* limitations under the License.
*/
import java.io.IOException;
import java.io.PrintWriter;
//import java.io.IOException;
//import java.io.PrintWriter;
import java.util.Arrays;
import java.util.BitSet;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Set;
import org.apache.lucene.util.ArrayUtil;
@ -64,7 +62,6 @@ public class LightAutomaton {
private int[] states = new int[4];
/** Holds toState, min, max for each transition: */
// nocommit inefficient when labels are really bytes (max 256)
private int[] transitions = new int[6];
private final Set<Integer> finalStates = new HashSet<Integer>();
@ -82,6 +79,9 @@ public class LightAutomaton {
/** Mark this state as an accept state. */
public void setAccept(int state, boolean isAccept) {
if (state >= getNumStates()) {
throw new IllegalArgumentException("state=" + state + " is out of bounds (numStates=" + getNumStates() + ")");
}
if (isAccept) {
finalStates.add(state);
} else {
@ -89,10 +89,6 @@ public class LightAutomaton {
}
}
public boolean isEmpty() {
return finalStates.isEmpty();
}
/** Sugar, but object-heavy; it's better to iterate instead. */
public Transition[][] getSortedTransitions() {
int numStates = getNumStates();
@ -200,12 +196,13 @@ public class LightAutomaton {
states[nextState+i] += nextTransition;
}
int state = i/2;
if (other.isAccept(state)) {
setAccept(stateOffset+state, true);
}
}
nextState += other.nextState;
for(int s : other.getAcceptStates()) {
setAccept(stateOffset+s, true);
}
// Bulk copy and then fixup dest for each transition:
transitions = ArrayUtil.grow(transitions, nextTransition + other.nextTransition);
System.arraycopy(other.transitions, 0, transitions, nextTransition, other.nextTransition);
@ -312,7 +309,8 @@ public class LightAutomaton {
return deterministic;
}
public void finish() {
/** Finishes the current state; call this once you are done adding transitions for a state. */
public void finishState() {
if (curState != -1) {
//System.out.println("finish: finish current state " + curState);
finishCurrentState();
@ -327,7 +325,6 @@ public class LightAutomaton {
}
public int getNumTransitions(int state) {
//assert curState == -1: "not finished";
int count = states[2*state+1];
if (count == -1) {
return 0;
@ -468,27 +465,20 @@ public class LightAutomaton {
}
};
// nocommit createStates(int count)?
// nocommit kinda awkward iterator api...
/** Initialize the provided Transition for iteration; you
* must call {@link #getNextTransition} to get the first
* transition for the state. Returns the number of transitions
/** Initialize the provided Transition to iterate through all transitions
* leaving the specified state. You must call {@link #getNextTransition} to
* get each transition. Returns the number of transitions
* leaving this state. */
public int initTransition(int state, Transition t) {
// assert curState == -1: "not finished";
// Reject state numbers beyond what has been allocated; states[] is indexed as
// 2*state and 2*state+1, so nextState/2 is presumably the number of states
// created so far (NOTE(review): confirm against createState).
assert state < nextState/2: "state=" + state + " nextState=" + nextState;
// Remember which state is being iterated:
t.source = state;
//System.out.println("initTrans source=" + state + " numTrans=" + getNumTransitions(state));
// Position the cursor at this state's first entry in transitions[]; each
// getNextTransition call reads dest/min/max from there and advances it.
t.transitionUpto = states[2*state];
// The caller uses this count as the loop bound for getNextTransition calls.
return getNumTransitions(state);
}
/** Iterate to the next transition after the provided one */
public void getNextTransition(Transition t) {
//assert curState == -1: "not finished";
// Make sure there is still a transition left:
//System.out.println("getNextTrans transUpto=" + t.transitionUpto);
//System.out.println(" states[2*t.source]=" + states[2*t.source] + " numTrans=" + states[2*t.source+1] + " transitionUpto+3=" + (t.transitionUpto+3) + " t=" + t);
assert (t.transitionUpto+3 - states[2*t.source]) <= 3*states[2*t.source+1];
t.dest = transitions[t.transitionUpto++];
t.min = transitions[t.transitionUpto++];
@ -498,7 +488,6 @@ public class LightAutomaton {
/** Fill the provided {@link Transition} with the index'th
* transition leaving the specified state. */
public void getTransition(int state, int index, Transition t) {
assert curState == -1: "not finished";
int i = states[2*state] + 3*index;
t.source = state;
t.dest = transitions[i++];
@ -552,11 +541,12 @@ public class LightAutomaton {
result.addTransition(i, deadState, maxi, Character.MAX_CODE_POINT);
}
}
result.finish();
result.finishState();
return result;
}
// nocommit
/*
public void writeDot(String fileName) {
if (fileName.indexOf('/') == -1) {
fileName = "/l/la/lucene/core/" + fileName + ".dot";
@ -569,6 +559,7 @@ public class LightAutomaton {
throw new RuntimeException(ioe);
}
}
*/
public String toDot() {
// TODO: breadth first search so we can see get layered output...
@ -660,14 +651,12 @@ public class LightAutomaton {
assert label >= 0;
int trans = states[2*state];
int limit = trans + 3*states[2*state+1];
// nocommit we could do bin search; transitions are sorted
// System.out.println("la.step state=" + state + " label=" + label + " trans=" + trans + " limit=" + limit);
// TODO: we could do bin search; transitions are sorted
while (trans < limit) {
int dest = transitions[trans];
int min = transitions[trans+1];
int max = transitions[trans+2];
if (min <= label && label <= max) {
//System.out.println(" ret dest=" + dest);
return dest;
}
trans += 3;
@ -780,7 +769,7 @@ public class LightAutomaton {
upto += 4;
}
a.finish();
a.finishState();
return a;
}

View File

@ -57,7 +57,8 @@ final public class MinimizationOperationsLight {
* Minimizes the given automaton using Hopcroft's algorithm.
*/
public static LightAutomaton minimizeHopcroft(LightAutomaton a) {
if (a.isEmpty()) {
if (a.getNumStates() == 0 || (a.isAccept(0) == false && a.getNumTransitions(0) == 0)) {
// Fastmatch for common case
return new LightAutomaton();
}
a = BasicOperations.determinize(a);
@ -209,7 +210,6 @@ final public class MinimizationOperationsLight {
int[] stateMap = new int[statesLen];
int[] stateRep = new int[k];
// nocommit maybe LA should be born already with the initial state?
result.createState();
//System.out.println("min: k=" + k);
@ -251,7 +251,7 @@ final public class MinimizationOperationsLight {
result.addTransition(n, stateMap[t.dest], t.min, t.max);
}
}
result.finish();
result.finishState();
//System.out.println(result.getNumStates() + " states");
return BasicOperations.removeDeadStates(result);

View File

@ -361,8 +361,6 @@ public class RegExp {
*/
public static final int NONE = 0x0000;
private static boolean allow_mutation = false;
Kind kind;
RegExp exp1, exp2;
String s;
@ -467,7 +465,7 @@ public class RegExp {
findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider);
findLeaves(exp2, Kind.REGEXP_UNION, list, automata, automaton_provider);
a = BasicOperations.unionLight(list);
MinimizationOperationsLight.minimize(a);
a = MinimizationOperationsLight.minimize(a);
break;
case REGEXP_CONCATENATION:
list = new ArrayList<>();
@ -476,7 +474,7 @@ public class RegExp {
findLeaves(exp2, Kind.REGEXP_CONCATENATION, list, automata,
automaton_provider);
a = BasicOperations.concatenateLight(list);
MinimizationOperationsLight.minimize(a);
a = MinimizationOperationsLight.minimize(a);
break;
case REGEXP_INTERSECTION:
a = BasicOperations.intersectionLight(

View File

@ -37,8 +37,7 @@ import java.util.Arrays;
* @lucene.experimental
*/
public abstract class RunAutomaton {
// nocommit
final LightAutomaton a;
final LightAutomaton automaton;
final int maxInterval;
final int size;
final boolean[] accept;
@ -125,7 +124,7 @@ public abstract class RunAutomaton {
this.maxInterval = maxInterval;
//System.out.println("before det a=" + a.getNumStates());
a = BasicOperations.determinize(a);
this.a = a;
this.automaton = a;
//System.out.println("AFTER DET tableize= " + tableize + ": ");
//System.out.println(a.toDot());
points = a.getStartPoints();

View File

@ -30,17 +30,13 @@
package org.apache.lucene.util.automaton;
import java.util.BitSet;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.Set;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.Util;
/**
* Special automata operations.
@ -71,6 +67,9 @@ final public class SpecialOperations {
* Returns true if the language of this automaton is finite.
*/
public static boolean isFinite(LightAutomaton a) {
  final int stateCount = a.getNumStates();
  if (stateCount != 0) {
    // Delegate to the recursive overload starting from state 0, with two
    // scratch bit sets sized to the number of states.
    final BitSet first = new BitSet(stateCount);
    final BitSet second = new BitSet(stateCount);
    return isFinite(new Transition(), a, 0, first, second);
  }
  // An automaton without any states is reported as finite.
  return true;
}
@ -174,8 +173,8 @@ final public class SpecialOperations {
public static LightAutomaton reverse(LightAutomaton a, Set<Integer> initialStates) {
if (a.isEmpty()) {
return a;
if (BasicOperations.isEmpty(a)) {
return new LightAutomaton();
}
int numStates = a.getNumStates();
@ -204,15 +203,16 @@ final public class SpecialOperations {
}
LightAutomaton result = builder.finish();
for(int s : a.getAcceptStates()) {
assert s < numStates;
result.addEpsilon(0, s+1);
if (initialStates != null) {
initialStates.add(s+1);
}
}
result.finish();
result.finishState();
return result;
}

View File

@ -0,0 +1,38 @@
package org.apache.lucene.util.automaton;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Holds one transition (source state, destination state and the inclusive
 * {@code min}-{@code max} label range).  Used temporarily when iterating
 * through transitions from a {@link LightAutomaton} with
 * {@link LightAutomaton#getTransition} and
 * {@link LightAutomaton#getNextTransition}.
 */
public class Transition {

  /** Source state. */
  public int source;

  /** Destination state. */
  public int dest;

  /** Minimum accepted label (inclusive). */
  public int min;

  /** Maximum accepted label (inclusive). */
  public int max;

  /** Remembers where we are in the iteration; init to -1 to provoke
   *  exception if nextTransition is called without first initTransition. */
  int transitionUpto = -1;

  /**
   * Renders the transition as {@code source --> dest min-max}.  Labels that
   * are valid Unicode code points are printed as the character they denote
   * (including supplementary code points, which a plain {@code (char)} cast
   * would truncate); any other value is printed as its decimal number.
   */
  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append(source).append(" --> ").append(dest).append(' ');
    appendLabel(sb, min);
    sb.append('-');
    appendLabel(sb, max);
    return sb.toString();
  }

  /** Appends one label, as a character when it is a valid code point, otherwise as a number. */
  private static void appendLabel(StringBuilder sb, int label) {
    if (Character.isValidCodePoint(label)) {
      sb.appendCodePoint(label);
    } else {
      sb.append(label);
    }
  }
}

View File

@ -17,11 +17,6 @@ package org.apache.lucene.util.automaton;
* limitations under the License.
*/
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.Sorter;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.List;
@ -268,12 +263,6 @@ public final class UTF32ToUTF8Light {
* not in general be deterministic, so you must
* determinize it if that's needed. */
public LightAutomaton convert(LightAutomaton utf32) {
//System.out.println("\nCONVERT");
// nocommit make sure singleton cases work:
//if (utf32.isSingleton()) {
//utf32 = utf32.cloneExpanded();
//}
if (utf32.getNumStates() == 0) {
return utf32;
}
@ -286,9 +275,6 @@ public final class UTF32ToUTF8Light {
pending.add(utf32State);
utf8 = new LightAutomaton.Builder();
// nocommit we don't track this
// utf8.setDeterministic(false);
int utf8State = utf8.createState();
utf8.setAccept(utf8State, utf32.isAccept(utf32State));
@ -325,19 +311,4 @@ public final class UTF32ToUTF8Light {
return utf8.finish();
}
/*
private State newUTF8State() {
State s = new State();
if (utf8StateCount == utf8States.length) {
final State[] newArray = new State[ArrayUtil.oversize(1+utf8StateCount, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(utf8States, 0, newArray, 0, utf8StateCount);
utf8States = newArray;
}
utf8States[utf8StateCount] = s;
s.number = utf8StateCount;
utf8StateCount++;
return s;
}
*/
}

View File

@ -411,8 +411,8 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
});
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton expected = s2a("abc");
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(expected)),
BasicOperations.determinize(BasicOperations.removeDeadStates(actual))));
}
public void testMultipleHoles() throws Exception {
@ -423,8 +423,8 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
});
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton expected = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(expected)),
BasicOperations.determinize(BasicOperations.removeDeadStates(actual))));
}
public void testSynOverMultipleHoles() throws Exception {
@ -438,8 +438,8 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
final LightAutomaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
final LightAutomaton a2 = join(s2a("x"), SEP_A, s2a("b"));
final LightAutomaton expected = BasicOperations.unionLight(a1, a2);
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(expected)),
BasicOperations.determinize(BasicOperations.removeDeadStates(actual))));
}
// for debugging!
@ -485,8 +485,8 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
final LightAutomaton expected = join("abc", "def");
//toDot(actual);
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(expected)),
BasicOperations.determinize(BasicOperations.removeDeadStates(actual))));
}
public void testHole() throws Exception {
@ -501,8 +501,8 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
final LightAutomaton expected = join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def"));
//toDot(actual);
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(expected)),
BasicOperations.determinize(BasicOperations.removeDeadStates(actual))));
}
public void testOverlappedTokensSausage() throws Exception {
@ -517,8 +517,8 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
final LightAutomaton a1 = s2a("abc");
final LightAutomaton a2 = s2a("xyz");
final LightAutomaton expected = BasicOperations.unionLight(a1, a2);
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(expected)),
BasicOperations.determinize(BasicOperations.removeDeadStates(actual))));
}
public void testOverlappedTokensLattice() throws Exception {
@ -535,8 +535,8 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
final LightAutomaton expected = BasicOperations.unionLight(a1, a2);
//toDot(actual);
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(expected)),
BasicOperations.determinize(BasicOperations.removeDeadStates(actual))));
}
public void testSynOverHole() throws Exception {
@ -554,8 +554,8 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
final LightAutomaton expected = BasicOperations.concatenateLight(a1,
join(SEP_A, s2a("b")));
//toDot(actual);
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(expected)),
BasicOperations.determinize(BasicOperations.removeDeadStates(actual))));
}
public void testSynOverHole2() throws Exception {
@ -570,8 +570,8 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
final LightAutomaton expected = BasicOperations.unionLight(
join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")),
s2a("abc"));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(expected)),
BasicOperations.determinize(BasicOperations.removeDeadStates(actual))));
}
public void testOverlappedTokensLattice2() throws Exception {
@ -588,8 +588,8 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
final LightAutomaton a2 = join("abc", "def", "ghi");
final LightAutomaton expected = BasicOperations.unionLight(a1, a2);
//toDot(actual);
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(expected)),
BasicOperations.determinize(BasicOperations.removeDeadStates(actual))));
}
public void testToDot() throws Exception {
@ -607,8 +607,8 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton expected = join(HOLE_A, SEP_A, s2a("abc"));
//toDot(actual);
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(expected)),
BasicOperations.determinize(BasicOperations.removeDeadStates(actual))));
}
// TODO: testEndsWithHole... but we need posInc to set in TS.end()
@ -622,7 +622,7 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton expected = BasicOperations.unionLight(s2a("a"),
s2a("X"));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(expected)),
BasicOperations.determinize(BasicOperations.removeDeadStates(actual))));
}
}

View File

@ -85,6 +85,7 @@ public class TestTermsEnum2 extends LuceneTestCase {
/** tests a pre-intersected automaton against the original */
public void testFiniteVersusInfinite() throws Exception {
for (int i = 0; i < numIterations; i++) {
String reg = AutomatonTestUtil.randomRegexp(random());
LightAutomaton automaton = BasicOperations.determinize(new RegExp(reg, RegExp.NONE).toLightAutomaton());

View File

@ -107,10 +107,6 @@ public class TestAutomatonQuery extends LuceneTestCase {
* Test some very simple automata.
*/
public void testBasicAutomata() throws IOException {
// nocommit
assertAutomatonHits(2, BasicAutomata.makeIntervalLight(1233, 2346, 0));
assertAutomatonHits(0, BasicAutomata.makeEmptyLight());
assertAutomatonHits(0, BasicAutomata.makeEmptyStringLight());
assertAutomatonHits(2, BasicAutomata.makeAnyCharLight());

View File

@ -50,7 +50,7 @@ public class TestBasicOperations extends LuceneTestCase {
public void testEmptyLanguageConcatenate() {
LightAutomaton a = BasicAutomata.makeStringLight("a");
LightAutomaton concat = BasicOperations.concatenateLight(a, BasicAutomata.makeEmptyLight());
assertTrue(concat.isEmpty());
assertTrue(BasicOperations.isEmpty(concat));
}
/** Test optimization to concatenate() with empty String to an NFA */
@ -81,7 +81,7 @@ public class TestBasicOperations extends LuceneTestCase {
final LightAutomaton a = BasicOperations.determinize(re.toLightAutomaton());
assertFalse(BasicOperations.isEmpty(a));
final AutomatonTestUtil.RandomAcceptedStringsLight rx = new AutomatonTestUtil.RandomAcceptedStringsLight(a);
final AutomatonTestUtil.RandomAcceptedStrings rx = new AutomatonTestUtil.RandomAcceptedStrings(a);
for(int j=0;j<ITER2;j++) {
//System.out.println("TEST: j=" + j);
int[] acc = null;

View File

@ -46,18 +46,18 @@ public class TestDeterminism extends LuceneTestCase {
}
private static void assertAutomaton(LightAutomaton a) {
a = BasicOperations.determinize(a);
a = BasicOperations.determinize(BasicOperations.removeDeadStates(a));
// complement(complement(a)) = a
LightAutomaton equivalent = BasicOperations.complementLight(BasicOperations.complementLight(a));
assertTrue(BasicOperations.sameLanguage(a, equivalent));
// a union a = a
equivalent = BasicOperations.determinize(BasicOperations.unionLight(a, a));
equivalent = BasicOperations.determinize(BasicOperations.removeDeadStates(BasicOperations.unionLight(a, a)));
assertTrue(BasicOperations.sameLanguage(a, equivalent));
// a intersect a = a
equivalent = BasicOperations.determinize(BasicOperations.intersectionLight(a, a));
equivalent = BasicOperations.determinize(BasicOperations.removeDeadStates(BasicOperations.intersectionLight(a, a)));
assertTrue(BasicOperations.sameLanguage(a, equivalent));
// a minus a = empty

View File

@ -41,8 +41,7 @@ public class TestLevenshteinAutomata extends LuceneTestCase {
// LUCENE-3094
public void testNoWastedStates() throws Exception {
// nocommit this fails ... pre-existing issue i think!!
// AutomatonTestUtil.assertNoDetachedStates(new LevenshteinAutomata("abc", false).toAutomaton(1));
assertFalse(BasicOperations.hasDeadStatesFromInitial(new LevenshteinAutomata("abc", false).toAutomaton(1)));
}
/**
@ -78,30 +77,34 @@ public class TestLevenshteinAutomata extends LuceneTestCase {
assertTrue(tautomata[n].isDeterministic());
assertTrue(SpecialOperations.isFinite(automata[n]));
assertTrue(SpecialOperations.isFinite(tautomata[n]));
// nocommit LEV creates detached states
//AutomatonTestUtil.assertNoDetachedStates(automata[n]);
//AutomatonTestUtil.assertNoDetachedStates(tautomata[n]);
assertFalse(BasicOperations.hasDeadStatesFromInitial(automata[n]));
assertFalse(BasicOperations.hasDeadStatesFromInitial(tautomata[n]));
// check that the dfa for n-1 accepts a subset of the dfa for n
if (n > 0) {
assertTrue(BasicOperations.subsetOf(automata[n-1], automata[n]));
assertTrue(BasicOperations.subsetOf(automata[n-1], tautomata[n]));
assertTrue(BasicOperations.subsetOf(tautomata[n-1], automata[n]));
assertTrue(BasicOperations.subsetOf(tautomata[n-1], tautomata[n]));
assertTrue(BasicOperations.subsetOf(BasicOperations.removeDeadStates(automata[n-1]),
BasicOperations.removeDeadStates(automata[n])));
assertTrue(BasicOperations.subsetOf(BasicOperations.removeDeadStates(automata[n-1]),
BasicOperations.removeDeadStates(tautomata[n])));
assertTrue(BasicOperations.subsetOf(BasicOperations.removeDeadStates(tautomata[n-1]),
BasicOperations.removeDeadStates(automata[n])));
assertTrue(BasicOperations.subsetOf(BasicOperations.removeDeadStates(tautomata[n-1]),
BasicOperations.removeDeadStates(tautomata[n])));
assertNotSame(automata[n-1], automata[n]);
}
// check that Lev(N) is a subset of LevT(N)
assertTrue(BasicOperations.subsetOf(automata[n], tautomata[n]));
assertTrue(BasicOperations.subsetOf(BasicOperations.removeDeadStates(automata[n]),
BasicOperations.removeDeadStates(tautomata[n])));
// special checks for specific n
switch(n) {
case 0:
// easy, matches the string itself
assertTrue(BasicOperations.sameLanguage(BasicAutomata.makeStringLight(s), automata[0]));
assertTrue(BasicOperations.sameLanguage(BasicAutomata.makeStringLight(s), tautomata[0]));
assertTrue(BasicOperations.sameLanguage(BasicAutomata.makeStringLight(s), BasicOperations.removeDeadStates(automata[0])));
assertTrue(BasicOperations.sameLanguage(BasicAutomata.makeStringLight(s), BasicOperations.removeDeadStates(tautomata[0])));
break;
case 1:
// generate a lev1 naively, and check the accepted lang is the same.
assertTrue(BasicOperations.sameLanguage(naiveLev1(s), automata[1]));
assertTrue(BasicOperations.sameLanguage(naiveLev1T(s), tautomata[1]));
assertTrue(BasicOperations.sameLanguage(naiveLev1(s), BasicOperations.removeDeadStates(automata[1])));
assertTrue(BasicOperations.sameLanguage(naiveLev1T(s), BasicOperations.removeDeadStates(tautomata[1])));
break;
default:
assertBruteForce(s, automata[n], n);

View File

@ -19,17 +19,19 @@ package org.apache.lucene.util.automaton;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.AutomatonTestUtil.RandomAcceptedStringsLight;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.AutomatonTestUtil.RandomAcceptedStrings;
import org.apache.lucene.util.fst.Util;
public class TestLightAutomaton extends LuceneTestCase {
@ -46,7 +48,7 @@ public class TestLightAutomaton extends LuceneTestCase {
a.addTransition(start, end, 'd', 'd');
a.addTransition(x, y, 'b', 'b');
a.addTransition(y, end, 'c', 'c');
a.finish();
a.finishState();
}
public void testReduceBasic() throws Exception {
@ -62,7 +64,7 @@ public class TestLightAutomaton extends LuceneTestCase {
a.addTransition(start, end, 'x', 'x');
a.addTransition(start, end, 'y', 'y');
a.finish();
a.finishState();
assertEquals(3, a.getNumTransitions(start));
Transition scratch = new Transition();
a.initTransition(start, scratch);
@ -79,9 +81,9 @@ public class TestLightAutomaton extends LuceneTestCase {
public void testSameLanguage() throws Exception {
LightAutomaton a1 = BasicAutomata.makeStringLight("foobar");
LightAutomaton a2 = BasicOperations.concatenateLight(
LightAutomaton a2 = BasicOperations.removeDeadStates(BasicOperations.concatenateLight(
BasicAutomata.makeStringLight("foo"),
BasicAutomata.makeStringLight("bar"));
BasicAutomata.makeStringLight("bar")));
assertTrue(BasicOperations.sameLanguage(a1, a2));
}
@ -149,7 +151,7 @@ public class TestLightAutomaton extends LuceneTestCase {
LightAutomaton a = BasicOperations.unionLight(Arrays.asList(BasicAutomata.makeStringLight("foobar"),
BasicAutomata.makeStringLight("boobar")));
LightAutomaton aMin = MinimizationOperationsLight.minimize(a);
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(a), aMin));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(a)), aMin));
}
public void testReverse() throws Exception {
@ -234,7 +236,7 @@ public class TestLightAutomaton extends LuceneTestCase {
a.setAccept(fini, true);
a.addTransition(init, fini, 'm');
a.addTransition(fini, fini, 'm');
a.finish();
a.finishState();
assertEquals(0, SpecialOperations.getCommonSuffixBytesRef(a).length);
}
@ -244,8 +246,8 @@ public class TestLightAutomaton extends LuceneTestCase {
LightAutomaton a = AutomatonTestUtil.randomAutomaton(random());
LightAutomaton ra = SpecialOperations.reverse(a);
LightAutomaton rra = SpecialOperations.reverse(ra);
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(a),
BasicOperations.determinize(rra)));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(BasicOperations.removeDeadStates(a)),
BasicOperations.determinize(BasicOperations.removeDeadStates(rra))));
}
}
@ -260,16 +262,16 @@ public class TestLightAutomaton extends LuceneTestCase {
LightAutomaton ra = SpecialOperations.reverse(a);
LightAutomaton rda = BasicOperations.determinize(ra);
if (a.isEmpty()) {
assertTrue(rda.isEmpty());
if (BasicOperations.isEmpty(a)) {
assertTrue(BasicOperations.isEmpty(rda));
continue;
}
RandomAcceptedStringsLight rasl = new RandomAcceptedStringsLight(a);
RandomAcceptedStrings ras = new RandomAcceptedStrings(a);
for(int iter2=0;iter2<20;iter2++) {
// Find string accepted by original automaton
int[] s = rasl.getRandomAcceptedString(random());
int[] s = ras.getRandomAcceptedString(random());
// Reverse it
for(int j=0;j<s.length/2;j++) {
@ -290,11 +292,16 @@ public class TestLightAutomaton extends LuceneTestCase {
assertTrue(BasicOperations.run(a, ""));
}
/** An automaton holding just one freshly created state must be reported as empty. */
public void testBasicIsEmpty() throws Exception {
  final LightAutomaton oneState = new LightAutomaton();
  oneState.createState();
  final boolean empty = BasicOperations.isEmpty(oneState);
  assertTrue(empty);
}
public void testRemoveDeadTransitionsEmpty() throws Exception {
LightAutomaton a = BasicAutomata.makeEmptyLight();
LightAutomaton a2 = BasicOperations.removeDeadStates(a);
assertTrue(a2.isEmpty());
assertTrue(BasicOperations.isEmpty(a2));
}
public void testInvalidAddTransition() throws Exception {
@ -340,13 +347,38 @@ public class TestLightAutomaton extends LuceneTestCase {
}
assertTrue(BasicOperations.sameLanguage(
BasicOperations.determinize(a),
BasicOperations.determinize(builder.finish())));
BasicOperations.determinize(BasicOperations.removeDeadStates(a)),
BasicOperations.determinize(BasicOperations.removeDeadStates(builder.finish()))));
}
}
// nocommit testMinus
/** Walks an automaton through several shapes and checks isTotal at each step. */
public void testIsTotal() throws Exception {
  final int lo = Character.MIN_CODE_POINT;
  final int hi = Character.MAX_CODE_POINT;

  // No states at all: not total.
  assertFalse(BasicOperations.isTotal(new LightAutomaton()));

  // Two states, any single character leads from the first to the accepting second.
  final LightAutomaton automaton = new LightAutomaton();
  final int first = automaton.createState();
  final int last = automaton.createState();
  automaton.setAccept(last, true);
  automaton.addTransition(first, last, lo, hi);
  automaton.finishState();
  assertFalse(BasicOperations.isTotal(automaton));

  // Add a self loop over every character on the accepting state.
  automaton.addTransition(last, last, lo, hi);
  automaton.finishState();
  assertFalse(BasicOperations.isTotal(automaton));

  // Once the first state accepts as well, the minimized form must be total.
  automaton.setAccept(first, true);
  final LightAutomaton minimized = MinimizationOperationsLight.minimize(automaton);
  assertTrue(BasicOperations.isTotal(minimized));
}
/** Minimizing an automaton without any accept state must yield zero states. */
public void testMinimizeEmpty() throws Exception {
  final LightAutomaton noAccept = new LightAutomaton();
  final int from = noAccept.createState();
  final int to = noAccept.createState();
  noAccept.addTransition(from, to, 'a');
  noAccept.finishState();

  final LightAutomaton minimized = MinimizationOperationsLight.minimize(noAccept);
  assertEquals(0, minimized.getNumStates());
}
public void testMinus() throws Exception {
LightAutomaton a1 = BasicAutomata.makeStringLight("foobar");
LightAutomaton a2 = BasicAutomata.makeStringLight("boobar");
@ -379,6 +411,20 @@ public class TestLightAutomaton extends LuceneTestCase {
assertMatches(a4);
}
/** The 999..1032 interval built with a third argument of 0 must accept 999 with one, two or three leading zeros. */
public void testOneInterval() throws Exception {
  final LightAutomaton interval =
      BasicOperations.determinize(BasicAutomata.makeIntervalLight(999, 1032, 0));
  final String[] zeroPadded = {"0999", "00999", "000999"};
  for (String candidate : zeroPadded) {
    assertTrue(BasicOperations.run(interval, candidate));
  }
}
/** The 1..2 interval built with a third argument of 0 must accept "01". */
public void testAnotherInterval() throws Exception {
  final LightAutomaton interval =
      BasicOperations.determinize(BasicAutomata.makeIntervalLight(1, 2, 0));
  final boolean accepted = BasicOperations.run(interval, "01");
  assertTrue(accepted);
}
public void testIntervalRandom() throws Exception {
int ITERS = atLeast(100);
for(int iter=0;iter<ITERS;iter++) {
@ -397,7 +443,7 @@ public class TestLightAutomaton extends LuceneTestCase {
}
String prefix = b.toString();
LightAutomaton a = BasicOperations.determinize(BasicAutomata.makeIntervalLight(min, max, digits ));
LightAutomaton a = BasicOperations.determinize(BasicAutomata.makeIntervalLight(min, max, digits));
if (random().nextBoolean()) {
a = MinimizationOperationsLight.minimize(a);
}
@ -414,27 +460,24 @@ public class TestLightAutomaton extends LuceneTestCase {
int x = random().nextInt(2*max);
boolean expected = x >= min && x <= max;
String sx = Integer.toString(x);
if (digits > 0 && sx.length() < digits) {
if (sx.length() < digits) {
// Left-fill with 0s
sx = b.substring(sx.length()) + sx;
} else if (digits == 0) {
// Left-fill with random number of 0s:
int numZeros = random().nextInt(10);
StringBuilder sb = new StringBuilder();
for(int i=0;i<numZeros;i++) {
sb.append('0');
}
sb.append(sx);
sx = sb.toString();
}
assertEquals(expected, BasicOperations.run(a, sx));
}
}
}
// nocommit testRemoveDead of an A accepting nothing should go to empty A (0 states)
/** Concatenating "x" and "y" gives 4 states; removing dead states must leave 3. */
public void testRemoveDead() throws Exception {
  final LightAutomaton x = BasicAutomata.makeStringLight("x");
  final LightAutomaton y = BasicAutomata.makeStringLight("y");
  final LightAutomaton joined = BasicOperations.concatenateLight(Arrays.asList(x, y));
  assertEquals(4, joined.getNumStates());

  final LightAutomaton pruned = BasicOperations.removeDeadStates(joined);
  assertEquals(3, pruned.getNumStates());
}
// nocommit more tests ... it's an algebra
private void assertMatches(LightAutomaton a, String... strings) {
Set<IntsRef> expected = new HashSet<>();
for(String s : strings) {
@ -444,4 +487,527 @@ public class TestLightAutomaton extends LuceneTestCase {
assertEquals(expected, SpecialOperations.getFiniteStrings(BasicOperations.determinize(a), -1));
}
/** Concatenating two deterministic string automata must give a deterministic automaton. */
public void testConcatenatePreservesDet() throws Exception {
  final LightAutomaton head = BasicAutomata.makeStringLight("foobar");
  assertTrue(head.isDeterministic());

  final LightAutomaton tail = BasicAutomata.makeStringLight("baz");
  assertTrue(tail.isDeterministic());

  final LightAutomaton joined = BasicOperations.concatenateLight(Arrays.asList(head, tail));
  assertTrue(joined.isDeterministic());
}
/** The "x"+"y" concatenation has 4 states before dead-state removal and 3 afterwards. */
public void testRemoveDeadStates() throws Exception {
  final LightAutomaton left = BasicAutomata.makeStringLight("x");
  final LightAutomaton right = BasicAutomata.makeStringLight("y");
  final LightAutomaton concatenated = BasicOperations.concatenateLight(Arrays.asList(left, right));
  assertEquals(4, concatenated.getNumStates());

  final LightAutomaton withoutDead = BasicOperations.removeDeadStates(concatenated);
  assertEquals(3, withoutDead.getNumStates());
}
/** A state-less automaton is empty, and stays empty after dead-state removal. */
public void testRemoveDeadStatesEmpty1() throws Exception {
  final LightAutomaton noStates = new LightAutomaton();
  noStates.finishState();
  assertTrue(BasicOperations.isEmpty(noStates));

  final LightAutomaton pruned = BasicOperations.removeDeadStates(noStates);
  assertTrue(BasicOperations.isEmpty(pruned));
}
/**
 * A single non-accepting state with no transitions: the automaton is empty
 * and must remain empty after dead-state removal.  (Previously this test was
 * an exact copy of testRemoveDeadStatesEmpty1 and added no coverage.)
 */
public void testRemoveDeadStatesEmpty2() throws Exception {
  LightAutomaton a = new LightAutomaton();
  // One state, never marked as accepting, so no string can be accepted.
  a.createState();
  a.finishState();
  assertTrue(BasicOperations.isEmpty(a));
  assertTrue(BasicOperations.isEmpty(BasicOperations.removeDeadStates(a)));
}
/** Two connected states but no accept state: dead-state removal must leave zero states. */
public void testRemoveDeadStatesEmpty3() throws Exception {
  final LightAutomaton original = new LightAutomaton();
  final int from = original.createState();
  final int to = original.createState();
  original.addTransition(from, to, 'a');

  final LightAutomaton pruned = BasicOperations.removeDeadStates(original);
  final int remaining = pruned.getNumStates();
  assertEquals(0, remaining);
}
/** Concatenating with the empty automaton, on either side, must accept no strings. */
public void testConcatEmpty() throws Exception {
  // If you concat empty automaton to anything the result should still be empty:
  final Set<IntsRef> noStrings = new HashSet<IntsRef>();

  // Empty automaton on the left.
  LightAutomaton joined = BasicOperations.concatenateLight(BasicAutomata.makeEmptyLight(),
                                                           BasicAutomata.makeStringLight("foo"));
  assertEquals(noStrings, SpecialOperations.getFiniteStrings(joined, -1));

  // Empty automaton on the right.
  joined = BasicOperations.concatenateLight(BasicAutomata.makeStringLight("foo"),
                                            BasicAutomata.makeEmptyLight());
  assertEquals(noStrings, SpecialOperations.getFiniteStrings(joined, -1));
}
/** A transition out of the initial state that never reaches an accept state still means "empty". */
public void testSeemsNonEmptyButIsNot1() throws Exception {
  final LightAutomaton automaton = new LightAutomaton();
  // Init state has a transition but doesn't lead to accept
  final int start = automaton.createState();
  final int target = automaton.createState();
  automaton.addTransition(start, target, 'a');
  automaton.finishState();

  final boolean empty = BasicOperations.isEmpty(automaton);
  assertTrue(empty);
}
/** An accept state with no transition leading to it must not make the automaton non-empty. */
public void testSeemsNonEmptyButIsNot2() throws Exception {
  final LightAutomaton automaton = new LightAutomaton();
  final int start = automaton.createState();
  final int nonAccepting = automaton.createState();
  automaton.addTransition(start, nonAccepting, 'a');

  // An orphan'd accept state
  final int orphan = automaton.createState();
  automaton.setAccept(orphan, true);
  automaton.finishState();

  final boolean empty = BasicOperations.isEmpty(automaton);
  assertTrue(empty);
}
/**
 * The empty-string automaton, and a copy with an extra transition from state 0 to a
 * newly created state, must be the same language once dead states are removed from both.
 */
public void testSameLanguage1() throws Exception {
  final LightAutomaton plain = BasicAutomata.makeEmptyStringLight();

  final LightAutomaton withExtraState = BasicAutomata.makeEmptyStringLight();
  final int extra = withExtraState.createState();
  withExtraState.addTransition(0, extra, 'a');
  withExtraState.finishState();

  final LightAutomaton plainPruned = BasicOperations.removeDeadStates(plain);
  final LightAutomaton extraPruned = BasicOperations.removeDeadStates(withExtraState);
  assertTrue(BasicOperations.sameLanguage(plainPruned, extraPruned));
}
/**
 * Applies one randomly chosen operation that is expected to leave the accepted
 * language of {@code a} unchanged: determinize, minimize, removeDeadStates,
 * reverse/reverse (with a nested random no-op in between), concatenation with the
 * empty string, or union with the empty automaton.
 *
 * @param a the automaton to transform
 * @return the automaton returned by the chosen operation
 */
private LightAutomaton randomNoOp(LightAutomaton a) {
  // The bound is exclusive, so it must be 6 for every case 0..5 below to be
  // reachable; with a bound of 5 the "union empty automaton" case never ran.
  switch (random().nextInt(6)) {
    case 0:
      if (VERBOSE) {
        System.out.println(" randomNoOp: determinize");
      }
      return BasicOperations.determinize(a);
    case 1:
      if (VERBOSE) {
        System.out.println(" randomNoOp: minimize");
      }
      return MinimizationOperationsLight.minimize(a);
    case 2:
      if (VERBOSE) {
        System.out.println(" randomNoOp: removeDeadStates");
      }
      return BasicOperations.removeDeadStates(a);
    case 3:
      if (VERBOSE) {
        System.out.println(" randomNoOp: reverse reverse");
      }
      // Reverse, apply another random no-op to the reversed form, then reverse back.
      a = SpecialOperations.reverse(a);
      a = randomNoOp(a);
      return SpecialOperations.reverse(a);
    case 4:
      if (VERBOSE) {
        System.out.println(" randomNoOp: concat empty string");
      }
      return BasicOperations.concatenateLight(a, BasicAutomata.makeEmptyStringLight());
    case 5:
      if (VERBOSE) {
        System.out.println(" randomNoOp: union empty automaton");
      }
      return BasicOperations.unionLight(a, BasicAutomata.makeEmptyLight());
    default:
      // Fail loudly even when assertions are disabled instead of returning null.
      throw new AssertionError("unexpected random case");
  }
}
/**
 * Builds an automaton from the given terms, randomly choosing between a union of
 * one string automaton per term and makeStringUnionLight over the sorted terms,
 * then passes the result through randomNoOp.
 */
private LightAutomaton unionTerms(Collection<BytesRef> terms) {
  final boolean viaUnion = random().nextBoolean();
  final LightAutomaton built;
  if (!viaUnion) {
    if (VERBOSE) {
      System.out.println("TEST: unionTerms: use makeStringUnion");
    }
    // makeStringUnionLight is handed the terms in sorted order.
    final List<BytesRef> sorted = new ArrayList<>(terms);
    Collections.sort(sorted);
    built = BasicAutomata.makeStringUnionLight(sorted);
  } else {
    if (VERBOSE) {
      System.out.println("TEST: unionTerms: use union");
    }
    final List<LightAutomaton> perTerm = new ArrayList<>();
    for (BytesRef current : terms) {
      final String text = current.utf8ToString();
      perTerm.add(BasicAutomata.makeStringLight(text));
    }
    built = BasicOperations.unionLight(perTerm);
  }
  return randomNoOp(built);
}
/** Returns a random simple string when {@code isAscii} is set, otherwise a random realistic unicode string. */
private String getRandomString(boolean isAscii) {
  return isAscii
      ? TestUtil.randomSimpleString(random())
      : TestUtil.randomRealisticUnicodeString(random());
}
public void testRandomFinite() throws Exception {
int numTerms = atLeast(10);
int iters = atLeast(100);
// Some of the ops we do (stripping random byte, reverse) turn valid UTF8 into invalid if we allow non-ascii:
boolean isAscii = random().nextBoolean();
if (VERBOSE) {
System.out.println("TEST: isAscii=" + isAscii + " numTerms" + numTerms + " iters=" + iters);
}
Set<BytesRef> terms = new HashSet<>();
while (terms.size() < numTerms) {
terms.add(new BytesRef(getRandomString(isAscii)));
}
LightAutomaton a = unionTerms(terms);
assertSame(terms, a);
for(int iter=0;iter<iters;iter++) {
if (VERBOSE) {
System.out.println("TEST: iter=" + iter + " numTerms=" + terms.size());
System.out.println(" terms:");
for(BytesRef term : terms) {
System.out.println(" " + term);
}
}
switch(random().nextInt(14)) {
case 0:
// concatenate prefix
{
if (VERBOSE) {
System.out.println(" op=concat prefix");
}
Set<BytesRef> newTerms = new HashSet<>();
BytesRef prefix = new BytesRef(getRandomString(isAscii));
for(BytesRef term : terms) {
BytesRef newTerm = BytesRef.deepCopyOf(prefix);
newTerm.append(term);
newTerms.add(newTerm);
}
terms = newTerms;
boolean wasDeterministic1 = a.isDeterministic();
a = BasicOperations.concatenateLight(BasicAutomata.makeStringLight(prefix.utf8ToString()), a);
assertEquals(wasDeterministic1, a.isDeterministic());
}
break;
case 1:
// concatenate suffix
{
BytesRef suffix = new BytesRef(getRandomString(isAscii));
if (VERBOSE) {
System.out.println(" op=concat suffix " + suffix);
}
Set<BytesRef> newTerms = new HashSet<>();
for(BytesRef term : terms) {
BytesRef newTerm = BytesRef.deepCopyOf(term);
newTerm.append(suffix);
newTerms.add(newTerm);
}
terms = newTerms;
a = BasicOperations.concatenateLight(a, BasicAutomata.makeStringLight(suffix.utf8ToString()));
}
break;
// nocommit sometimes concat a suffix accepting more than 1 term, and sometimes non-det
case 2:
// determinize
if (VERBOSE) {
System.out.println(" op=determinize");
}
a = BasicOperations.determinize(a);
assertTrue(a.isDeterministic());
break;
case 3:
if (VERBOSE) {
System.out.println(" op=minimize");
}
// minimize
a = MinimizationOperationsLight.minimize(a);
break;
case 4:
// union
{
if (VERBOSE) {
System.out.println(" op=union");
}
Set<BytesRef> newTerms = new HashSet<>();
int numNewTerms = random().nextInt(5);
while (newTerms.size() < numNewTerms) {
newTerms.add(new BytesRef(getRandomString(isAscii)));
}
terms.addAll(newTerms);
LightAutomaton newA = unionTerms(newTerms);
a = BasicOperations.unionLight(a, newA);
}
break;
case 5:
// optional
{
if (VERBOSE) {
System.out.println(" op=optional");
}
a = BasicOperations.optionalLight(a);
terms.add(new BytesRef());
}
break;
case 6:
// minus finite
{
if (VERBOSE) {
System.out.println(" op=minus finite");
}
if (terms.size() > 0) {
RandomAcceptedStrings rasl = new RandomAcceptedStrings(BasicOperations.removeDeadStates(a));
Set<BytesRef> toRemove = new HashSet<>();
int numToRemove = TestUtil.nextInt(random(), 1, (terms.size()+1)/2);
while (toRemove.size() < numToRemove) {
int[] ints = rasl.getRandomAcceptedString(random());
BytesRef term = new BytesRef(UnicodeUtil.newString(ints, 0, ints.length));
if (toRemove.contains(term) == false) {
toRemove.add(term);
}
}
for(BytesRef term : toRemove) {
boolean removed = terms.remove(term);
assertTrue(removed);
}
LightAutomaton a2 = unionTerms(toRemove);
a = BasicOperations.minusLight(a, a2);
}
}
break;
case 7:
{
// minus infinite
List<LightAutomaton> as = new ArrayList<>();
int count = TestUtil.nextInt(random(), 1, 5);
Set<Integer> prefixes = new HashSet<>();
while(prefixes.size() < count) {
// prefix is a leading ascii byte; we remove <prefix>* from a
int prefix = random().nextInt(128);
prefixes.add(prefix);
}
if (VERBOSE) {
System.out.println(" op=minus infinite prefixes=" + prefixes);
}
for(int prefix : prefixes) {
// prefix is a leading ascii byte; we remove <prefix>* from a
LightAutomaton a2 = new LightAutomaton();
int init = a2.createState();
int state = a2.createState();
a2.addTransition(init, state, prefix);
a2.setAccept(state, true);
a2.addTransition(state, state, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
a2.finishState();
as.add(a2);
Iterator<BytesRef> it = terms.iterator();
while (it.hasNext()) {
BytesRef term = it.next();
if (term.length > 0 && (term.bytes[term.offset] & 0xFF) == prefix) {
it.remove();
}
}
}
LightAutomaton a2 = randomNoOp(BasicOperations.unionLight(as));
a = BasicOperations.minusLight(a, a2);
}
break;
case 8:
{
int count = TestUtil.nextInt(random(), 10, 20);
if (VERBOSE) {
System.out.println(" op=intersect infinite count=" + count);
}
// intersect infinite
List<LightAutomaton> as = new ArrayList<>();
Set<Integer> prefixes = new HashSet<>();
while(prefixes.size() < count) {
int prefix = random().nextInt(128);
prefixes.add(prefix);
}
if (VERBOSE) {
System.out.println(" prefixes=" + prefixes);
}
for(int prefix : prefixes) {
// prefix is a leading ascii byte; we retain <prefix>* in a
LightAutomaton a2 = new LightAutomaton();
int init = a2.createState();
int state = a2.createState();
a2.addTransition(init, state, prefix);
a2.setAccept(state, true);
a2.addTransition(state, state, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
a2.finishState();
as.add(a2);
prefixes.add(prefix);
}
LightAutomaton a2 = BasicOperations.unionLight(as);
if (random().nextBoolean()) {
a2 = BasicOperations.determinize(a2);
} else if (random().nextBoolean()) {
a2 = MinimizationOperationsLight.minimize(a2);
}
a = BasicOperations.intersectionLight(a, a2);
Iterator<BytesRef> it = terms.iterator();
while (it.hasNext()) {
BytesRef term = it.next();
if (term.length == 0 || prefixes.contains(term.bytes[term.offset]&0xff) == false) {
if (VERBOSE) {
System.out.println(" drop term=" + term);
}
it.remove();
} else {
if (VERBOSE) {
System.out.println(" keep term=" + term);
}
}
}
}
break;
case 9:
// reverse
if (VERBOSE) {
System.out.println(" op=reverse");
}
a = SpecialOperations.reverse(a);
Set<BytesRef> newTerms = new HashSet<>();
for(BytesRef term : terms) {
newTerms.add(new BytesRef(new StringBuilder(term.utf8ToString()).reverse().toString()));
}
terms = newTerms;
break;
case 10:
if (VERBOSE) {
System.out.println(" op=randomNoOp");
}
a = randomNoOp(a);
break;
case 11:
// interval
int min = random().nextInt(1000);
int max = min + random().nextInt(50);
// digits must be non-zero else we make cycle
int digits = Integer.toString(max).length();
if (VERBOSE) {
System.out.println(" op=union interval min=" + min + " max=" + max + " digits=" + digits);
}
a = BasicOperations.unionLight(a, BasicAutomata.makeIntervalLight(min, max, digits));
StringBuilder b = new StringBuilder();
for(int i=0;i<digits;i++) {
b.append('0');
}
String prefix = b.toString();
for(int i=min;i<=max;i++) {
String s = Integer.toString(i);
if (s.length() < digits) {
// Left-fill with 0s
s = prefix.substring(s.length()) + s;
}
terms.add(new BytesRef(s));
}
break;
case 12:
if (VERBOSE) {
System.out.println(" op=remove the empty string");
}
a = BasicOperations.minusLight(a, BasicAutomata.makeEmptyStringLight());
terms.remove(new BytesRef());
break;
case 13:
if (VERBOSE) {
System.out.println(" op=add the empty string");
}
a = BasicOperations.unionLight(a, BasicAutomata.makeEmptyStringLight());
terms.add(new BytesRef());
break;
}
assertSame(terms, a);
}
assertSame(terms, a);
}
/**
 * Asserts that automaton {@code a} accepts exactly the given {@code terms}.
 *
 * <p>The check is done several independent ways: {@code a} must be finite and not total;
 * every term must be accepted when run against the determinized automaton; the set returned
 * by {@code getFiniteStrings} must equal the terms converted with {@code Util.toUTF32};
 * {@code a} must have the same language as the union of the terms; and the automaton
 * produced by {@code UTF32ToUTF8Light} must enumerate the terms converted with
 * {@code Util.toIntsRef}.
 *
 * <p>On any {@link AssertionError} the expected terms and the automaton (in dot format) are
 * printed to stdout before the error is rethrown.
 *
 * @param terms the strings (as UTF-8 {@code BytesRef}s) the automaton is expected to accept
 * @param a the automaton under test
 */
private void assertSame(Collection<BytesRef> terms, LightAutomaton a) {
  try {
    assertTrue(SpecialOperations.isFinite(a));
    assertFalse(BasicOperations.isTotal(a));

    // Determinize once; the result is used both for run() and for sameLanguage() below.
    LightAutomaton detA = BasicOperations.determinize(a);

    // Make sure all terms are accepted:
    for (BytesRef term : terms) {
      String termString = term.utf8ToString();
      assertTrue("failed to accept term=" + termString, BasicOperations.run(detA, termString));
    }

    // Use getFiniteStrings:
    Set<IntsRef> expected = new HashSet<>();
    for (BytesRef term : terms) {
      IntsRef intsRef = new IntsRef();
      Util.toUTF32(term.utf8ToString(), intsRef);
      expected.add(intsRef);
    }
    Set<IntsRef> actual = SpecialOperations.getFiniteStrings(a, -1);

    if (expected.equals(actual) == false) {
      // Report both directions of the difference before failing.
      System.out.println("FAILED:");
      for (IntsRef term : expected) {
        if (actual.contains(term) == false) {
          System.out.println(" term=" + term + " should be accepted but isn't");
        }
      }
      for (IntsRef term : actual) {
        if (expected.contains(term) == false) {
          System.out.println(" term=" + term + " is accepted but should not be");
        }
      }
      throw new AssertionError("mismatch");
    }

    // Use sameLanguage; both sides are determinized and stripped of dead states first,
    // since sameLanguage requires no dead states:
    LightAutomaton a2 = BasicOperations.removeDeadStates(BasicOperations.determinize(unionTerms(terms)));
    assertTrue(BasicOperations.sameLanguage(a2, BasicOperations.removeDeadStates(detA)));

    // Do same check, in UTF8 space
    LightAutomaton utf8 = randomNoOp(new UTF32ToUTF8Light().convert(a));
    Set<IntsRef> expected2 = new HashSet<>();
    for (BytesRef term : terms) {
      IntsRef intsRef = new IntsRef();
      Util.toIntsRef(term, intsRef);
      expected2.add(intsRef);
    }
    assertEquals(expected2, SpecialOperations.getFiniteStrings(utf8, -1));
  } catch (AssertionError ae) {
    // Dump enough state to reproduce the failure, then let the original error propagate.
    System.out.println("TEST: FAILED: not same");
    System.out.println(" terms (count=" + terms.size() + "):");
    for (BytesRef term : terms) {
      System.out.println(" " + term);
    }
    System.out.println(" automaton:");
    System.out.println(a.toDot());
    //a.writeDot("fail");
    throw ae;
  }
}
}

View File

@ -28,8 +28,8 @@ public class TestMinimize extends LuceneTestCase {
int num = atLeast(200);
for (int i = 0; i < num; i++) {
LightAutomaton a = AutomatonTestUtil.randomAutomaton(random());
LightAutomaton la = BasicOperations.determinize(a);
LightAutomaton lb = BasicOperations.determinize(MinimizationOperationsLight.minimize(a));
LightAutomaton la = BasicOperations.determinize(BasicOperations.removeDeadStates(a));
LightAutomaton lb = MinimizationOperationsLight.minimize(a);
assertTrue(BasicOperations.sameLanguage(la, lb));
}
}

View File

@ -17,13 +17,17 @@ package org.apache.lucene.util.automaton;
* limitations under the License.
*/
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
import java.nio.charset.StandardCharsets;
import java.util.Random;
import org.apache.lucene.util.fst.Util;
public class TestUTF32ToUTF8 extends LuceneTestCase {
@ -203,11 +207,25 @@ public class TestUTF32ToUTF8 extends LuceneTestCase {
assertAutomaton(new RegExp(AutomatonTestUtil.randomRegexp(random()), RegExp.NONE).toLightAutomaton());
}
}
/**
 * For random strings, builds the single-string automaton, converts it with
 * {@code UTF32ToUTF8Light}, and checks that the only finite string of the converted
 * automaton equals the string's UTF-8 bytes as produced by {@code Util.toIntsRef}.
 */
public void testSingleton() throws Exception {
  final int rounds = atLeast(100);
  int round = 0;
  while (round < rounds) {
    final String text = TestUtil.randomRealisticUnicodeString(random());

    // Expected result: exactly one entry, the BytesRef form of the string as an IntsRef.
    final IntsRef expectedInts = new IntsRef();
    Util.toIntsRef(new BytesRef(text), expectedInts);
    final Set<IntsRef> expected = new HashSet<>();
    expected.add(expectedInts);

    // Actual result: enumerate the strings of the converted automaton.
    final LightAutomaton codePointAutomaton = BasicAutomata.makeStringLight(text);
    final LightAutomaton byteAutomaton = new UTF32ToUTF8Light().convert(codePointAutomaton);
    assertEquals(expected, SpecialOperations.getFiniteStrings(byteAutomaton, -1));

    round++;
  }
}
private void assertAutomaton(LightAutomaton automaton) throws Exception {
CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton);
ByteRunAutomaton bra = new ByteRunAutomaton(automaton);
final AutomatonTestUtil.RandomAcceptedStringsLight ras = new AutomatonTestUtil.RandomAcceptedStringsLight(automaton);
final AutomatonTestUtil.RandomAcceptedStrings ras = new AutomatonTestUtil.RandomAcceptedStrings(automaton);
int num = atLeast(1000);
for (int i = 0; i < num; i++) {

View File

@ -328,7 +328,7 @@ public class AnalyzingSuggester extends Lookup {
}
}
result.finish();
result.finishState();
return result;
}

View File

@ -22,7 +22,6 @@ import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.LightAutomaton;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.FST;
@ -72,6 +71,10 @@ public class FSTUtil {
assert a.isDeterministic();
final List<Path<T>> queue = new ArrayList<>();
final List<Path<T>> endNodes = new ArrayList<>();
if (a.getNumStates() == 0) {
return endNodes;
}
queue.add(new Path<>(0, fst
.getFirstArc(new FST.Arc<T>()), fst.outputs.getNoOutput(),
new IntsRef()));

View File

@ -20,7 +20,6 @@ package org.apache.lucene.util.automaton;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
@ -31,7 +30,6 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.Util;
/**
* Utilities for testing automata.
@ -136,7 +134,7 @@ public class AutomatonTestUtil {
* Once created, call {@link #getRandomAcceptedString(Random)}
* to get a new string (in UTF-32 codepoints).
*/
public static class RandomAcceptedStringsLight {
public static class RandomAcceptedStrings {
private final Map<Transition,Boolean> leadsToAccept;
private final LightAutomaton a;
@ -152,7 +150,7 @@ public class AutomatonTestUtil {
}
}
public RandomAcceptedStringsLight(LightAutomaton a) {
public RandomAcceptedStrings(LightAutomaton a) {
this.a = a;
if (a.getNumStates() == 0) {
throw new IllegalArgumentException("this automaton accepts nothing");
@ -334,6 +332,9 @@ public class AutomatonTestUtil {
* Determinizes the given automaton using the given set of initial states.
*/
public static LightAutomaton determinizeSimpleLight(LightAutomaton a, Set<Integer> initialset) {
if (a.getNumStates() == 0) {
return a;
}
int[] points = a.getStartPoints();
// subset construction
Map<Set<Integer>, Set<Integer>> sets = new HashMap<>();
@ -448,6 +449,9 @@ public class AutomatonTestUtil {
* this is only used to test the correctness of our faster implementation.
*/
public static boolean isFiniteSlow(LightAutomaton a) {
  // An automaton with no states is reported as finite without further work;
  // otherwise delegate to the recursive overload, starting at state 0 with a fresh, empty set.
  return a.getNumStates() == 0 || isFiniteSlow(a, 0, new HashSet<Integer>());
}