LUCENE-2672: speed up automaton seeking in nextString

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1001781 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-09-27 15:32:21 +00:00
parent 35c9c34ee3
commit 87496c7a6d
1 changed files with 23 additions and 15 deletions

View File

@ -22,6 +22,7 @@ import java.util.Comparator;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.SpecialOperations;
@ -110,7 +111,6 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
// used for path tracking, where each bit is a numbered state.
visited = new long[runAutomaton.getSize()];
setUseTermsCache(finite);
termComp = getComparator();
}
@ -201,6 +201,8 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
linearUpperBound.length = length;
}
private final IntsRef savedStates = new IntsRef(10);
/**
* Increments the byte buffer to the next String in binary order after s that will not put
* the machine into a reject state. If such a string does not exist, returns
@ -214,17 +216,20 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
private boolean nextString() {
int state;
int pos = 0;
savedStates.grow(seekBytesRef.length+1);
final int[] states = savedStates.ints;
states[0] = runAutomaton.getInitialState();
while (true) {
curGen++;
linear = false;
state = runAutomaton.getInitialState();
// walk the automaton until a character is rejected.
for (pos = 0; pos < seekBytesRef.length; pos++) {
for (state = states[pos]; pos < seekBytesRef.length; pos++) {
visited[state] = curGen;
int nextState = runAutomaton.step(state, seekBytesRef.bytes[pos] & 0xff);
if (nextState == -1)
break;
states[pos+1] = nextState;
// we found a loop, record it for faster enumeration
if (!finite && !linear && visited[nextState] == curGen) {
linear = true;
@ -238,12 +243,16 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
if (nextString(state, pos)) {
return true;
} else { /* no more solutions exist from this useful portion, backtrack */
if (!backtrack(pos)) /* no more solutions at all */
if ((pos = backtrack(pos)) < 0) /* no more solutions at all */
return false;
else if (runAutomaton.run(seekBytesRef.bytes, 0, seekBytesRef.length))
final int newState = runAutomaton.step(states[pos], seekBytesRef.bytes[pos] & 0xff);
if (newState >= 0 && runAutomaton.isAccept(newState))
/* String is good to go as-is */
return true;
/* else advance further */
// TODO: paranoia? if we backtrack thru an infinite DFA, the loop detection is important!
// for now, restart from scratch for all infinite DFAs
if (!finite) pos = 0;
}
}
}
@ -332,20 +341,19 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
* can match.
*
* @param position current position in the input String
* @return true if more possible solutions exist for the DFA
* @return position >=0 if more possible solutions exist for the DFA
*/
private boolean backtrack(int position) {
while (position > 0) {
int nextChar = seekBytesRef.bytes[position - 1] & 0xff;
private int backtrack(int position) {
while (position-- > 0) {
int nextChar = seekBytesRef.bytes[position] & 0xff;
// if a character is 0xff its a dead-end too,
// because there is no higher character in binary sort order.
if (nextChar++ != 0xff) {
seekBytesRef.bytes[position - 1] = (byte) nextChar;
seekBytesRef.length = position;
return true;
seekBytesRef.bytes[position] = (byte) nextChar;
seekBytesRef.length = position+1;
return position;
}
position--;
}
return false; /* all solutions exhausted */
return -1; /* all solutions exhausted */
}
}