LUCENE-5752: cutover suggesters

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5752@1602031 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2014-06-11 21:56:21 +00:00
parent 7339f52d92
commit c98b9a8d52
10 changed files with 259 additions and 200 deletions
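
The cutover replaces the object-graph Automaton (linked State and Transition instances) with the int-based LightAutomaton: states are plain ints, state 0 is always the initial state, and -1 stands in for "no state" where the old code used null. Construction goes through LightAutomaton.Builder; a minimal sketch of the Builder calls as they appear in the hunks below (not the complete API):

    LightAutomaton.Builder builder = new LightAutomaton.Builder();
    int start = builder.createState();       // the first created state is 0, the initial state
    int end = builder.createState();
    builder.addTransition(start, end, 'a');  // transitions are (from, to, label) ints
    builder.setAccept(end, true);
    LightAutomaton a = builder.finish();     // freeze into the finished automaton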

TokenStreamToAutomaton.java

@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RollingBuffer;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.LightAutomaton;
import org.apache.lucene.util.automaton.State;
import org.apache.lucene.util.automaton.Transition;
@@ -61,15 +62,15 @@ public class TokenStreamToAutomaton {
private static class Position implements RollingBuffer.Resettable {
// Any tokens that ended at our position arrive at this state:
State arriving;
int arriving = -1;
// Any tokens that start at our position leave from this state:
State leaving;
int leaving = -1;
@Override
public void reset() {
arriving = null;
leaving = null;
arriving = -1;
leaving = -1;
}
}
@@ -98,9 +99,9 @@ public class TokenStreamToAutomaton {
* TokenStream}, and creates the corresponding
* automaton where arcs are bytes (or Unicode code points
* if unicodeArcs = true) from each term. */
public Automaton toAutomaton(TokenStream in) throws IOException {
final Automaton a = new Automaton();
boolean deterministic = true;
public LightAutomaton toAutomaton(TokenStream in) throws IOException {
final LightAutomaton.Builder builder = new LightAutomaton.Builder();
builder.createState();
final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
@@ -132,34 +133,29 @@ public class TokenStreamToAutomaton {
pos += posInc;
posData = positions.get(pos);
assert posData.leaving == null;
assert posData.leaving == -1;
if (posData.arriving == null) {
if (posData.arriving == -1) {
// No token ever arrived at this position
if (pos == 0) {
// OK: this is the first token
posData.leaving = a.getInitialState();
posData.leaving = 0;
} else {
// This means there's a hole (eg, StopFilter
// does this):
posData.leaving = new State();
addHoles(a.getInitialState(), positions, pos);
posData.leaving = builder.createState();
addHoles(builder, positions, pos);
}
} else {
posData.leaving = new State();
posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));
posData.leaving = builder.createState();
builder.addTransition(posData.arriving, posData.leaving, POS_SEP);
if (posInc > 1) {
// A token spanned over a hole; add holes
// "under" it:
addHoles(a.getInitialState(), positions, pos);
addHoles(builder, positions, pos);
}
}
positions.freeBefore(pos);
} else {
// note: this isn't necessarily true. it's just that we aren't sure it's deterministic.
// we could optimize this further (e.g. buffer and sort synonyms at a position)
// but that's probably overkill. this is cheap and dirty
deterministic = false;
}
final int endPos = pos + posLengthAtt.getPositionLength();
@@ -168,31 +164,33 @@ public class TokenStreamToAutomaton {
final BytesRef termUTF8 = changeToken(term);
int[] termUnicode = null;
final Position endPosData = positions.get(endPos);
if (endPosData.arriving == null) {
endPosData.arriving = new State();
if (endPosData.arriving == -1) {
endPosData.arriving = builder.createState();
}
State state = posData.leaving;
int termLen;
if (unicodeArcs) {
final String utf16 = termUTF8.utf8ToString();
termUnicode = new int[utf16.codePointCount(0, utf16.length())];
termLen = termUnicode.length;
for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp))
for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
termUnicode[j++] = cp = utf16.codePointAt(i);
}
} else {
termLen = termUTF8.length;
}
int state = posData.leaving;
for(int byteIDX=0;byteIDX<termLen;byteIDX++) {
final State nextState = byteIDX == termLen-1 ? endPosData.arriving : new State();
final int nextState = byteIDX == termLen-1 ? endPosData.arriving : builder.createState();
int c;
if (unicodeArcs) {
c = termUnicode[byteIDX];
} else {
c = termUTF8.bytes[termUTF8.offset + byteIDX] & 0xff;
}
state.addTransition(new Transition(c, nextState));
builder.addTransition(state, nextState, c);
state = nextState;
}
@@ -200,28 +198,26 @@ public class TokenStreamToAutomaton {
}
in.end();
State endState = null;
int endState = -1;
if (offsetAtt.endOffset() > maxOffset) {
endState = new State();
endState.setAccept(true);
endState = builder.createState();
builder.setAccept(endState, true);
}
pos++;
while (pos <= positions.getMaxPos()) {
posData = positions.get(pos);
if (posData.arriving != null) {
if (endState != null) {
posData.arriving.addTransition(new Transition(POS_SEP, endState));
if (posData.arriving != -1) {
if (endState != -1) {
builder.addTransition(posData.arriving, endState, POS_SEP);
} else {
posData.arriving.setAccept(true);
builder.setAccept(posData.arriving, true);
}
}
pos++;
}
//toDot(a);
a.setDeterministic(deterministic);
return a;
return builder.finish();
}
// for debugging!
@@ -235,26 +231,26 @@ public class TokenStreamToAutomaton {
}
*/
private static void addHoles(State startState, RollingBuffer<Position> positions, int pos) {
private static void addHoles(LightAutomaton.Builder builder, RollingBuffer<Position> positions, int pos) {
Position posData = positions.get(pos);
Position prevPosData = positions.get(pos-1);
while(posData.arriving == null || prevPosData.leaving == null) {
if (posData.arriving == null) {
posData.arriving = new State();
posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));
while(posData.arriving == -1 || prevPosData.leaving == -1) {
if (posData.arriving == -1) {
posData.arriving = builder.createState();
builder.addTransition(posData.arriving, posData.leaving, POS_SEP);
}
if (prevPosData.leaving == null) {
if (prevPosData.leaving == -1) {
if (pos == 1) {
prevPosData.leaving = startState;
prevPosData.leaving = 0;
} else {
prevPosData.leaving = new State();
prevPosData.leaving = builder.createState();
}
if (prevPosData.arriving != null) {
prevPosData.arriving.addTransition(new Transition(POS_SEP, prevPosData.leaving));
if (prevPosData.arriving != -1) {
builder.addTransition(prevPosData.arriving, prevPosData.leaving, POS_SEP);
}
}
prevPosData.leaving.addTransition(new Transition(HOLE, posData.arriving));
builder.addTransition(prevPosData.leaving, posData.arriving, HOLE);
pos--;
if (pos <= 0) {
break;

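For orientation, the calling pattern for the new signature (a sketch; the analyzer and text are placeholders):

    TokenStreamToAutomaton ts2a = new TokenStreamToAutomaton();
    try (TokenStream ts = analyzer.tokenStream("", "wi fi")) {
      LightAutomaton a = ts2a.toAutomaton(ts);  // arcs are bytes, or code points if unicodeArcs
      // the token graph arrives with POS_SEP between positions and HOLE marking gaps
    }
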
BasicAutomata.java

@@ -483,6 +483,21 @@ final public class BasicAutomata {
return a;
}
public static LightAutomaton makeStringLight(int[] word, int offset, int length) {
LightAutomaton a = new LightAutomaton();
a.createState();
int s = 0;
for (int i = offset; i < offset+length; i++) {
int s2 = a.createState();
a.addTransition(s, s2, word[i]);
s = s2;
}
a.setAccept(s, true);
a.finish();
return a;
}
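
A quick illustration of what makeStringLight builds, using the int-based step/isAccept accessors that appear later in this change (the chain automaton is deterministic by construction):

    int[] word = new int[] {'a', 'b'};
    LightAutomaton a = BasicAutomata.makeStringLight(word, 0, word.length);
    int s = a.step(0, 'a');           // 0 is the initial state
    s = a.step(s, 'b');
    assert s != -1 && a.isAccept(s);  // exactly "ab" is accepted
    assert a.step(0, 'x') == -1;      // no such transition: reject
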
/**
* Returns a new (deterministic and minimal) automaton that accepts the union
* of the given collection of {@link BytesRef}s representing UTF-8 encoded

LightAutomaton.java

@@ -35,7 +35,7 @@ import org.apache.lucene.util.Sorter;
// - could use packed int arrays instead
// - could encode dest w/ delta from to?
// nocommit should we keep determinized bit?
// nocommit should we keep determinized bit? it could be entirely privately computed now?
/** Uses only int[]s to represent the automaton, but requires that all
* transitions for each state are added at once. If this is too restrictive,

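In practice the restriction means a state's outgoing transitions are written together, in state order, when filling in a LightAutomaton directly, as makeStringLight does above; anything out of order goes through LightAutomaton.Builder instead. A sketch of the direct style (API names as used elsewhere in this diff):

    LightAutomaton a = new LightAutomaton();
    int s0 = a.createState();
    int s1 = a.createState();
    a.addTransition(s0, s1, 'x');  // all of s0's transitions before any later state's
    a.setAccept(s1, true);
    a.finish();                    // no further mutation after this
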
UTF32ToUTF8Light.java

@@ -274,6 +274,9 @@ public final class UTF32ToUTF8Light {
//if (utf32.isSingleton()) {
//utf32 = utf32.cloneExpanded();
//}
if (utf32.getNumStates() == 0) {
return utf32;
}
int[] map = new int[utf32.getNumStates()];
Arrays.fill(map, -1);

TestGraphTokenizers.java

@@ -18,8 +18,8 @@ package org.apache.lucene.analysis;
*/
import java.io.IOException;
import java.io.StringWriter;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@@ -32,6 +32,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.LightAutomaton;
public class TestGraphTokenizers extends BaseTokenStreamTestCase {
@@ -409,9 +410,10 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
new Token[] {
token("abc", 1, 1),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final Automaton expected = BasicAutomata.makeString("abc");
assertTrue(BasicOperations.sameLanguage(expected, actual));
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton expected = s2a("abc");
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
}
public void testMultipleHoles() throws Exception {
@@ -420,9 +422,10 @@
token("a", 1, 1),
token("b", 3, 1),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final Automaton expected = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
assertTrue(BasicOperations.sameLanguage(expected, actual));
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton expected = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
}
public void testSynOverMultipleHoles() throws Exception {
@@ -432,11 +435,12 @@
token("x", 0, 3),
token("b", 3, 1),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final Automaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
final Automaton a2 = join(s2a("x"), SEP_A, s2a("b"));
final Automaton expected = BasicOperations.union(a1, a2);
assertTrue(BasicOperations.sameLanguage(expected, actual));
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
final LightAutomaton a2 = join(s2a("x"), SEP_A, s2a("b"));
final LightAutomaton expected = BasicOperations.unionLight(a1, a2);
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
}
// for debugging!
@@ -450,25 +454,25 @@
}
*/
private static final Automaton SEP_A = BasicAutomata.makeChar(TokenStreamToAutomaton.POS_SEP);
private static final Automaton HOLE_A = BasicAutomata.makeChar(TokenStreamToAutomaton.HOLE);
private static final LightAutomaton SEP_A = BasicAutomata.makeCharLight(TokenStreamToAutomaton.POS_SEP);
private static final LightAutomaton HOLE_A = BasicAutomata.makeCharLight(TokenStreamToAutomaton.HOLE);
private Automaton join(String ... strings) {
List<Automaton> as = new ArrayList<>();
private LightAutomaton join(String ... strings) {
List<LightAutomaton> as = new ArrayList<>();
for(String s : strings) {
as.add(BasicAutomata.makeString(s));
as.add(s2a(s));
as.add(SEP_A);
}
as.remove(as.size()-1);
return BasicOperations.concatenate(as);
return BasicOperations.concatenateLight(as);
}
private Automaton join(Automaton ... as) {
return BasicOperations.concatenate(Arrays.asList(as));
private LightAutomaton join(LightAutomaton ... as) {
return BasicOperations.concatenateLight(Arrays.asList(as));
}
private Automaton s2a(String s) {
return BasicAutomata.makeString(s);
private LightAutomaton s2a(String s) {
return BasicAutomata.makeStringLight(s);
}
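
Spelled out, join places the POS_SEP automaton between consecutive tokens; an equivalent manual construction (redundant, but it shows the shape the assertions below compare against):

    LightAutomaton manual = BasicOperations.concatenateLight(Arrays.asList(
        s2a("abc"), SEP_A, s2a("def")));
    assertTrue(BasicOperations.sameLanguage(
        BasicOperations.determinize(manual),
        BasicOperations.determinize(join("abc", "def"))));
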
public void testTwoTokens() throws Exception {
@@ -478,11 +482,12 @@
token("abc", 1, 1),
token("def", 1, 1),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final Automaton expected = join("abc", "def");
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton expected = join("abc", "def");
//toDot(actual);
assertTrue(BasicOperations.sameLanguage(expected, actual));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
}
public void testHole() throws Exception {
@@ -492,12 +497,13 @@
token("abc", 1, 1),
token("def", 2, 1),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final Automaton expected = join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def"));
final LightAutomaton expected = join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def"));
//toDot(actual);
assertTrue(BasicOperations.sameLanguage(expected, actual));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
}
public void testOverlappedTokensSausage() throws Exception {
@@ -508,11 +514,12 @@
token("abc", 1, 1),
token("xyz", 0, 1)
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final Automaton a1 = BasicAutomata.makeString("abc");
final Automaton a2 = BasicAutomata.makeString("xyz");
final Automaton expected = BasicOperations.union(a1, a2);
assertTrue(BasicOperations.sameLanguage(expected, actual));
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton a1 = s2a("abc");
final LightAutomaton a2 = s2a("xyz");
final LightAutomaton expected = BasicOperations.unionLight(a1, a2);
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
}
public void testOverlappedTokensLattice() throws Exception {
@@ -523,13 +530,14 @@
token("xyz", 0, 2),
token("def", 1, 1),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final Automaton a1 = BasicAutomata.makeString("xyz");
final Automaton a2 = join("abc", "def");
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton a1 = s2a("xyz");
final LightAutomaton a2 = join("abc", "def");
final Automaton expected = BasicOperations.union(a1, a2);
final LightAutomaton expected = BasicOperations.unionLight(a1, a2);
//toDot(actual);
assertTrue(BasicOperations.sameLanguage(expected, actual));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
}
public void testSynOverHole() throws Exception {
@@ -540,14 +548,15 @@
token("X", 0, 2),
token("b", 2, 1),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final Automaton a1 = BasicOperations.union(
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton a1 = BasicOperations.unionLight(
join(s2a("a"), SEP_A, HOLE_A),
BasicAutomata.makeString("X"));
final Automaton expected = BasicOperations.concatenate(a1,
s2a("X"));
final LightAutomaton expected = BasicOperations.concatenateLight(a1,
join(SEP_A, s2a("b")));
//toDot(actual);
assertTrue(BasicOperations.sameLanguage(expected, actual));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
}
public void testSynOverHole2() throws Exception {
@@ -558,11 +567,12 @@
token("abc", 0, 3),
token("def", 2, 1),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final Automaton expected = BasicOperations.union(
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton expected = BasicOperations.unionLight(
join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")),
BasicAutomata.makeString("abc"));
assertTrue(BasicOperations.sameLanguage(expected, actual));
s2a("abc"));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
}
public void testOverlappedTokensLattice2() throws Exception {
@@ -574,12 +584,13 @@
token("def", 1, 1),
token("ghi", 1, 1),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final Automaton a1 = BasicAutomata.makeString("xyz");
final Automaton a2 = join("abc", "def", "ghi");
final Automaton expected = BasicOperations.union(a1, a2);
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton a1 = s2a("xyz");
final LightAutomaton a2 = join("abc", "def", "ghi");
final LightAutomaton expected = BasicOperations.unionLight(a1, a2);
//toDot(actual);
assertTrue(BasicOperations.sameLanguage(expected, actual));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
}
public void testToDot() throws Exception {
@@ -594,10 +605,11 @@
new Token[] {
token("abc", 2, 1),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final Automaton expected = join(HOLE_A, SEP_A, s2a("abc"));
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton expected = join(HOLE_A, SEP_A, s2a("abc"));
//toDot(actual);
assertTrue(BasicOperations.sameLanguage(expected, actual));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
}
// TODO: testEndsWithHole... but we need posInc to be set in TS.end()
@@ -608,9 +620,10 @@
token("a", 1, 1),
token("X", 0, 10),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final Automaton expected = BasicOperations.union(BasicAutomata.makeString("a"),
BasicAutomata.makeString("X"));
assertTrue(BasicOperations.sameLanguage(expected, actual));
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton expected = BasicOperations.unionLight(s2a("a"),
s2a("X"));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
}
}

AnalyzingSuggester.java

@@ -17,6 +17,16 @@ package org.apache.lucene.search.suggest.analyzing;
* limitations under the License.
*/
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
@@ -35,28 +45,20 @@ import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.LightAutomaton;
import org.apache.lucene.util.automaton.SpecialOperations;
import org.apache.lucene.util.automaton.State;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.fst.Util.Result;
import org.apache.lucene.util.fst.Util.TopResults;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.util.fst.Util;
/**
* Suggester that first analyzes the surface form, adds the
@@ -255,37 +257,65 @@ public class AnalyzingSuggester extends Lookup {
return fst == null ? 0 : fst.ramBytesUsed();
}
private void copyDestTransitions(State from, State to, List<Transition> transitions) {
if (to.isAccept()) {
from.setAccept(true);
}
for(Transition t : to.getTransitions()) {
transitions.add(t);
private int[] topoSortStates(LightAutomaton a) {
int[] states = new int[a.getNumStates()];
final Set<Integer> visited = new HashSet<>();
final LinkedList<Integer> worklist = new LinkedList<>();
worklist.add(0);
visited.add(0);
int upto = 0;
states[upto] = 0;
upto++;
LightAutomaton.Transition t = new LightAutomaton.Transition();
while (worklist.size() > 0) {
int s = worklist.removeFirst();
int count = a.initTransition(s, t);
for (int i=0;i<count;i++) {
a.getNextTransition(t);
if (!visited.contains(t.dest)) {
visited.add(t.dest);
worklist.add(t.dest);
states[upto++] = t.dest;
}
}
}
return states;
}
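
Despite its name, topoSortStates is a breadth-first walk from state 0 that records states in discovery order; replaceSep below iterates the result backwards, relying on these token automata being acyclic so a single pass suffices (the "reverse topo sort" of the original comment). A small hypothetical graph to show the order:

    // 0 -a-> 1, 0 -b-> 2, 1 -c-> 3
    LightAutomaton.Builder b = new LightAutomaton.Builder();
    int s0 = b.createState(), s1 = b.createState(), s2 = b.createState(), s3 = b.createState();
    b.addTransition(s0, s1, 'a');
    b.addTransition(s0, s2, 'b');
    b.addTransition(s1, s3, 'c');
    b.setAccept(s3, true);
    LightAutomaton a = b.finish();
    // topoSortStates(a) returns {0, 1, 2, 3}; replaceSep then visits 3, 2, 1, 0
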
// Replaces SEP transitions with epsilon, or remaps them if
// we were asked to preserve them:
private void replaceSep(Automaton a) {
private LightAutomaton replaceSep(LightAutomaton a) {
State[] states = a.getNumberedStates();
LightAutomaton result = new LightAutomaton();
// Copy all states over
int numStates = a.getNumStates();
for(int s=0;s<numStates;s++) {
result.createState();
result.setAccept(s, a.isAccept(s));
}
// Go in reverse topo sort so we know we only have to
// make one pass:
for(int stateNumber=states.length-1;stateNumber >=0;stateNumber--) {
final State state = states[stateNumber];
LightAutomaton.Transition t = new LightAutomaton.Transition();
int[] topoSortStates = topoSortStates(a);
for(int i=0;i<topoSortStates.length;i++) {
int state = topoSortStates[topoSortStates.length-1-i];
List<Transition> newTransitions = new ArrayList<>();
for(Transition t : state.getTransitions()) {
assert t.getMin() == t.getMax();
if (t.getMin() == TokenStreamToAutomaton.POS_SEP) {
int count = a.initTransition(state, t);
for(int j=0;j<count;j++) {
a.getNextTransition(t);
if (t.min == TokenStreamToAutomaton.POS_SEP) {
assert t.max == TokenStreamToAutomaton.POS_SEP;
if (preserveSep) {
// Remap to SEP_LABEL:
newTransitions.add(new Transition(SEP_LABEL, t.getDest()));
result.addTransition(state, t.dest, SEP_LABEL);
} else {
copyDestTransitions(state, t.getDest(), newTransitions);
a.setDeterministic(false);
result.addEpsilon(state, t.dest);
}
} else if (t.getMin() == TokenStreamToAutomaton.HOLE) {
} else if (t.min == TokenStreamToAutomaton.HOLE) {
assert t.max == TokenStreamToAutomaton.HOLE;
// Just remove the hole: there will then be two
// SEP tokens next to each other, which will only
@@ -294,19 +324,21 @@
// that's somehow a problem we can always map HOLE
// to a dedicated byte (and escape it in the
// input).
copyDestTransitions(state, t.getDest(), newTransitions);
a.setDeterministic(false);
result.addEpsilon(state, t.dest);
} else {
newTransitions.add(t);
result.addTransition(state, t.dest, t.min, t.max);
}
}
state.setTransitions(newTransitions.toArray(new Transition[newTransitions.size()]));
}
result.finish();
return result;
}
/** Used by subclass to change the lookup automaton, if
* necessary. */
protected Automaton convertAutomaton(Automaton a) {
protected LightAutomaton convertAutomaton(LightAutomaton a) {
return a;
}
@@ -665,8 +697,7 @@
}
final BytesRef utf8Key = new BytesRef(key);
try {
Automaton lookupAutomaton = toLookupAutomaton(key);
LightAutomaton lookupAutomaton = toLookupAutomaton(key);
final CharsRef spare = new CharsRef();
@@ -818,7 +849,7 @@
/** Returns all prefix paths to initialize the search. */
protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
Automaton lookupAutomaton,
LightAutomaton lookupAutomaton,
FST<Pair<Long,BytesRef>> fst)
throws IOException {
return prefixPaths;
@@ -826,7 +857,7 @@
final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
// Analyze surface form:
Automaton automaton = null;
LightAutomaton automaton = null;
try (TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString())) {
// Create corresponding automaton: labels are bytes
@ -835,7 +866,7 @@ public class AnalyzingSuggester extends Lookup {
automaton = ts2a.toAutomaton(ts);
}
replaceSep(automaton);
automaton = replaceSep(automaton);
automaton = convertAutomaton(automaton);
// TODO: LUCENE-5660 re-enable this once we disallow massive suggestion strings
@@ -848,32 +879,25 @@
// TODO: we could walk & add simultaneously, so we
// don't have to alloc [possibly biggish]
// intermediate HashSet in RAM:
return SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
}
final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
final LightAutomaton toLookupAutomaton(final CharSequence key) throws IOException {
// TODO: is there a Reader from a CharSequence?
// Turn tokenstream into automaton:
Automaton automaton = null;
LightAutomaton automaton = null;
try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
automaton = getTokenStreamToAutomaton().toAutomaton(ts);
}
// TODO: we could use the end offset to "guess"
// whether the final token was a partial token; this
// would only be a heuristic ... but maybe an OK one.
// This way we could eg differentiate "net" from "net ",
// which we can't today...
replaceSep(automaton);
automaton = replaceSep(automaton);
// TODO: we can optimize this somewhat by determinizing
// while we convert
BasicOperations.determinize(automaton);
automaton = BasicOperations.determinize(automaton);
return automaton;
}
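
Worth calling out in both methods above: the old replaceSep and determinize mutated their argument in place, so their return values (if any) were ignored; the LightAutomaton versions return new automata and must be reassigned. The before/after pattern, as a sketch:

    replaceSep(automaton);                               // old: mutates in place
    BasicOperations.determinize(automaton);              // old: mutates in place

    automaton = replaceSep(automaton);                   // new: returns a fresh automaton
    automaton = BasicOperations.determinize(automaton);  // new: likewise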
/**
* Returns the weight associated with an input string,

FSTUtil.java

@@ -17,12 +17,14 @@ package org.apache.lucene.search.suggest.analyzing;
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.io.IOException;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.LightAutomaton;
import org.apache.lucene.util.automaton.State;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.FST;
@@ -43,7 +45,7 @@ public class FSTUtil {
public static final class Path<T> {
/** Node in the automaton where path ends: */
public final State state;
public final int state;
/** Node in the FST where path ends: */
public final FST.Arc<T> fstNode;
@@ -55,7 +57,7 @@
public final IntsRef input;
/** Sole constructor. */
public Path(State state, FST.Arc<T> fstNode, T output, IntsRef input) {
public Path(int state, FST.Arc<T> fstNode, T output, IntsRef input) {
this.state = state;
this.fstNode = fstNode;
this.output = output;
@@ -67,21 +69,23 @@
* Enumerates all minimal prefix paths in the automaton that also intersect the FST,
* accumulating the FST end node and output for each path.
*/
public static <T> List<Path<T>> intersectPrefixPaths(Automaton a, FST<T> fst)
public static <T> List<Path<T>> intersectPrefixPaths(LightAutomaton a, FST<T> fst)
throws IOException {
assert a.isDeterministic();
assert BasicOperations.isDeterministic(a);
final List<Path<T>> queue = new ArrayList<>();
final List<Path<T>> endNodes = new ArrayList<>();
queue.add(new Path<>(a.getInitialState(), fst
queue.add(new Path<>(0, fst
.getFirstArc(new FST.Arc<T>()), fst.outputs.getNoOutput(),
new IntsRef()));
final FST.Arc<T> scratchArc = new FST.Arc<>();
final FST.BytesReader fstReader = fst.getBytesReader();
LightAutomaton.Transition t = new LightAutomaton.Transition();
while (queue.size() != 0) {
final Path<T> path = queue.remove(queue.size() - 1);
if (path.state.isAccept()) {
if (a.isAccept(path.state)) {
endNodes.add(path);
// we can stop here if we accept this path,
// we accept all further paths too
@@ -89,18 +93,20 @@
}
IntsRef currentInput = path.input;
for (Transition t : path.state.getTransitions()) {
final int min = t.getMin();
final int max = t.getMax();
int count = a.initTransition(path.state, t);
for (int i=0;i<count;i++) {
a.getNextTransition(t);
final int min = t.min;
final int max = t.max;
if (min == max) {
final FST.Arc<T> nextArc = fst.findTargetArc(t.getMin(),
final FST.Arc<T> nextArc = fst.findTargetArc(t.min,
path.fstNode, scratchArc, fstReader);
if (nextArc != null) {
final IntsRef newInput = new IntsRef(currentInput.length + 1);
newInput.copyInts(currentInput);
newInput.ints[currentInput.length] = t.getMin();
newInput.ints[currentInput.length] = t.min;
newInput.length = currentInput.length + 1;
queue.add(new Path<>(t.getDest(), new FST.Arc<T>()
queue.add(new Path<>(t.dest, new FST.Arc<T>()
.copyFrom(nextArc), fst.outputs
.add(path.output, nextArc.output), newInput));
}
@@ -122,7 +128,7 @@
newInput.copyInts(currentInput);
newInput.ints[currentInput.length] = nextArc.label;
newInput.length = currentInput.length + 1;
queue.add(new Path<>(t.getDest(), new FST.Arc<T>()
queue.add(new Path<>(t.dest, new FST.Arc<T>()
.copyFrom(nextArc), fst.outputs
.add(path.output, nextArc.output), newInput));
final int label = nextArc.label; // used in assert

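A sketch of intersectPrefixPaths in isolation (hypothetical data; the FST side uses the stock Builder/PositiveIntOutputs API, and the automaton must be deterministic per the assert above):

    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
    fstBuilder.add(Util.toIntsRef(new BytesRef("foobar"), new IntsRef()), 42L);
    FST<Long> fst = fstBuilder.finish();

    int[] foo = new int[] {'f', 'o', 'o'};
    LightAutomaton prefix = BasicAutomata.makeStringLight(foo, 0, foo.length);

    // One path ends where the automaton accepts: input "foo", parked at the FST
    // node reached after f-o-o, carrying the output accumulated along the way.
    List<FSTUtil.Path<Long>> paths = FSTUtil.intersectPrefixPaths(prefix, fst);
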
FuzzySuggester.java

@@ -32,8 +32,10 @@ import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.LightAutomaton;
import org.apache.lucene.util.automaton.SpecialOperations;
import org.apache.lucene.util.automaton.UTF32ToUTF8;
import org.apache.lucene.util.automaton.UTF32ToUTF8Light;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs.Pair;
@@ -177,7 +179,7 @@ public final class FuzzySuggester extends AnalyzingSuggester {
@Override
protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
Automaton lookupAutomaton,
LightAutomaton lookupAutomaton,
FST<Pair<Long,BytesRef>> fst)
throws IOException {
@@ -191,7 +193,7 @@
// "compete") ... in which case I think the wFST needs
// to be log weights or something ...
Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton));
LightAutomaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton));
/*
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), StandardCharsets.UTF_8);
w.write(levA.toDot());
@@ -202,10 +204,10 @@
}
@Override
protected Automaton convertAutomaton(Automaton a) {
protected LightAutomaton convertAutomaton(LightAutomaton a) {
if (unicodeAware) {
Automaton utf8automaton = new UTF32ToUTF8().convert(a);
BasicOperations.determinize(utf8automaton);
LightAutomaton utf8automaton = new UTF32ToUTF8Light().convert(a);
utf8automaton = BasicOperations.determinize(utf8automaton);
return utf8automaton;
} else {
return a;
@@ -219,16 +221,16 @@
return tsta;
}
Automaton toLevenshteinAutomata(Automaton automaton) {
LightAutomaton toLevenshteinAutomata(LightAutomaton automaton) {
final Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
Automaton subs[] = new Automaton[ref.size()];
LightAutomaton subs[] = new LightAutomaton[ref.size()];
int upto = 0;
for (IntsRef path : ref) {
if (path.length <= nonFuzzyPrefix || path.length < minFuzzyLength) {
subs[upto] = BasicAutomata.makeString(path.ints, path.offset, path.length);
subs[upto] = BasicAutomata.makeStringLight(path.ints, path.offset, path.length);
upto++;
} else {
Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, nonFuzzyPrefix);
LightAutomaton prefix = BasicAutomata.makeStringLight(path.ints, path.offset, nonFuzzyPrefix);
int ints[] = new int[path.length-nonFuzzyPrefix];
System.arraycopy(path.ints, path.offset+nonFuzzyPrefix, ints, 0, ints.length);
// TODO: maybe add alphaMin to LevenshteinAutomata,
@@ -237,9 +239,8 @@
// edited... but then 0 byte is "in general" allowed
// on input (but not in UTF8).
LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions);
Automaton levAutomaton = lev.toAutomaton(maxEdits);
Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already
LightAutomaton levAutomaton = lev.toLightAutomaton(maxEdits);
LightAutomaton combined = BasicOperations.concatenateLight(prefix, levAutomaton);
subs[upto] = combined;
upto++;
}
@@ -247,19 +248,17 @@
if (subs.length == 0) {
// automaton is empty, there are no accepted paths through it
return BasicAutomata.makeEmpty(); // matches nothing
return BasicAutomata.makeEmptyLight(); // matches nothing
} else if (subs.length == 1) {
// no synonyms or anything: just a single path through the tokenstream
return subs[0];
} else {
// multiple paths: this is really scary! is it slow?
// maybe we should not do this and throw UOE?
Automaton a = BasicOperations.union(Arrays.asList(subs));
LightAutomaton a = BasicOperations.unionLight(Arrays.asList(subs));
// TODO: we could call toLevenshteinAutomata() before det?
// this only happens if you have multiple paths anyway (e.g. synonyms)
BasicOperations.determinize(a);
return a;
return BasicOperations.determinize(a);
}
}
}
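
Concretely, each analyzed path keeps its first nonFuzzyPrefix code points exact and allows edits only on the remainder. A sketch with hypothetical values (maxEdits=1, nonFuzzyPrefix=1, byte labels so alphaMax is 255):

    int[] path = new int[] {'s', 'h', 'o', 'p'};
    LightAutomaton prefix = BasicAutomata.makeStringLight(path, 0, 1);   // exact "s"
    int[] rest = new int[] {'h', 'o', 'p'};
    LevenshteinAutomata lev = new LevenshteinAutomata(rest, 255, true);  // transpositions on
    LightAutomaton fuzzy = lev.toLightAutomaton(1);                      // within 1 edit of "hop"
    LightAutomaton combined = BasicOperations.concatenateLight(prefix, fuzzy);
    // combined matches e.g. "shop", "shp", "sohp", but nothing that edits the "s"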

InputArrayIterator.java

@@ -92,4 +92,4 @@ public final class InputArrayIterator implements InputIterator {
public boolean hasContexts() {
return hasContexts;
}
}
}

View File

@@ -40,14 +40,16 @@ import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.Input;
import org.apache.lucene.search.suggest.InputArrayIterator;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.LightAutomaton;
import org.apache.lucene.util.automaton.State;
import org.apache.lucene.util.fst.Util;
@@ -752,29 +754,30 @@ public class FuzzySuggesterTest extends LuceneTestCase {
// us the "answer key" (ie maybe we have a bug in
// suggester.toLevA ...) ... but testRandom2() fixes
// this:
Automaton automaton = suggester.convertAutomaton(suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey)));
assertTrue(automaton.isDeterministic());
LightAutomaton automaton = suggester.convertAutomaton(suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey)));
assertTrue(BasicOperations.isDeterministic(automaton));
// TODO: could be faster... but it's slowCompletor for a reason
BytesRef spare = new BytesRef();
for (TermFreqPayload2 e : slowCompletor) {
spare.copyChars(e.analyzedForm);
Set<IntsRef> finiteStrings = suggester.toFiniteStrings(spare, tokenStreamToAutomaton);
for (IntsRef intsRef : finiteStrings) {
State p = automaton.getInitialState();
int p = 0;
BytesRef ref = Util.toBytesRef(intsRef, spare);
boolean added = false;
for (int i = ref.offset; i < ref.length; i++) {
State q = p.step(ref.bytes[i] & 0xff);
if (q == null) {
int q = automaton.step(p, ref.bytes[i] & 0xff);
if (q == -1) {
break;
} else if (q.isAccept()) {
} else if (automaton.isAccept(q)) {
matches.add(new LookupResult(e.surfaceForm, e.weight));
added = true;
break;
}
p = q;
}
if (!added && p.isAccept()) {
if (!added && automaton.isAccept(p)) {
matches.add(new LookupResult(e.surfaceForm, e.weight));
}
}
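
The byte-at-a-time loop above is the int-based replacement for the old State.step walk; condensed into a helper it would read (hypothetical, not part of this change):

    // True if the deterministic, byte-labeled automaton accepts ref's bytes.
    static boolean accepts(LightAutomaton a, BytesRef ref) {
      int state = 0;                                  // the initial state is always 0
      for (int i = 0; i < ref.length; i++) {
        state = a.step(state, ref.bytes[ref.offset + i] & 0xff);
        if (state == -1) {
          return false;                               // dead end
        }
      }
      return a.isAccept(state);
    }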