mirror of https://github.com/apache/lucene.git

LUCENE-5752: cutover suggesters

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5752@1602031 13f79535-47bb-0310-9956-ffa450edef68

parent 7339f52d92
commit c98b9a8d52
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.RollingBuffer;
 import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.LightAutomaton;
 import org.apache.lucene.util.automaton.State;
 import org.apache.lucene.util.automaton.Transition;

@@ -61,15 +62,15 @@ public class TokenStreamToAutomaton {

   private static class Position implements RollingBuffer.Resettable {
     // Any tokens that ended at our position arrive to this state:
-    State arriving;
+    int arriving = -1;

     // Any tokens that start at our position leave from this state:
-    State leaving;
+    int leaving = -1;

     @Override
     public void reset() {
-      arriving = null;
-      leaving = null;
+      arriving = -1;
+      leaving = -1;
     }
   }

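
The hunk above is the core pattern of this cutover, repeated throughout the commit: `State` object references become plain `int` state ids, with -1 standing in for null. A self-contained sketch of that representation (the class and field names here are illustrative, not the branch's actual internals):

    // Illustrative only: int-coded states backed by a growable array,
    // in the spirit of LightAutomaton. -1 plays the role of a null State.
    import java.util.Arrays;

    class IntStatesSketch {
      private boolean[] accept = new boolean[4];
      private int numStates;

      // Returns the new state's id; state 0 is, by convention, the initial state.
      int createState() {
        if (numStates == accept.length) {
          accept = Arrays.copyOf(accept, 2 * numStates);
        }
        return numStates++;
      }

      void setAccept(int state, boolean value) {
        accept[state] = value;
      }

      boolean isAccept(int state) {
        return accept[state];
      }

      public static void main(String[] args) {
        IntStatesSketch a = new IntStatesSketch();
        int initial = a.createState();  // id 0
        int arriving = -1;              // "no state yet", as in Position.reset()
        if (arriving == -1) {
          arriving = a.createState();   // lazily allocate on first use
        }
        a.setAccept(arriving, true);
        System.out.println(initial + " -> " + arriving + ": " + a.isAccept(arriving));
      }
    }
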
@@ -98,9 +99,9 @@ public class TokenStreamToAutomaton {
    * TokenStream}, and creates the corresponding
    * automaton where arcs are bytes (or Unicode code points
    * if unicodeArcs = true) from each term. */
-  public Automaton toAutomaton(TokenStream in) throws IOException {
-    final Automaton a = new Automaton();
-    boolean deterministic = true;
+  public LightAutomaton toAutomaton(TokenStream in) throws IOException {
+    final LightAutomaton.Builder builder = new LightAutomaton.Builder();
+    builder.createState();

     final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
     final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);

@@ -132,34 +133,29 @@
       pos += posInc;

       posData = positions.get(pos);
-      assert posData.leaving == null;
+      assert posData.leaving == -1;

-      if (posData.arriving == null) {
+      if (posData.arriving == -1) {
         // No token ever arrived to this position
         if (pos == 0) {
           // OK: this is the first token
-          posData.leaving = a.getInitialState();
+          posData.leaving = 0;
         } else {
           // This means there's a hole (eg, StopFilter
           // does this):
-          posData.leaving = new State();
-          addHoles(a.getInitialState(), positions, pos);
+          posData.leaving = builder.createState();
+          addHoles(builder, positions, pos);
         }
       } else {
-        posData.leaving = new State();
-        posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));
+        posData.leaving = builder.createState();
+        builder.addTransition(posData.arriving, posData.leaving, POS_SEP);
         if (posInc > 1) {
           // A token spanned over a hole; add holes
           // "under" it:
-          addHoles(a.getInitialState(), positions, pos);
+          addHoles(builder, positions, pos);
         }
       }
       positions.freeBefore(pos);
     } else {
-      // note: this isn't necessarily true. its just that we aren't surely det.
-      // we could optimize this further (e.g. buffer and sort synonyms at a position)
-      // but thats probably overkill. this is cheap and dirty
-      deterministic = false;
     }

     final int endPos = pos + posLengthAtt.getPositionLength();

@@ -168,31 +164,33 @@
     final BytesRef termUTF8 = changeToken(term);
     int[] termUnicode = null;
     final Position endPosData = positions.get(endPos);
-    if (endPosData.arriving == null) {
-      endPosData.arriving = new State();
+    if (endPosData.arriving == -1) {
+      endPosData.arriving = builder.createState();
     }

-    State state = posData.leaving;
     int termLen;
     if (unicodeArcs) {
       final String utf16 = termUTF8.utf8ToString();
       termUnicode = new int[utf16.codePointCount(0, utf16.length())];
       termLen = termUnicode.length;
-      for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp))
+      for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
         termUnicode[j++] = cp = utf16.codePointAt(i);
+      }
     } else {
       termLen = termUTF8.length;
     }

+    int state = posData.leaving;
+
     for(int byteIDX=0;byteIDX<termLen;byteIDX++) {
-      final State nextState = byteIDX == termLen-1 ? endPosData.arriving : new State();
+      final int nextState = byteIDX == termLen-1 ? endPosData.arriving : builder.createState();
       int c;
       if (unicodeArcs) {
         c = termUnicode[byteIDX];
       } else {
         c = termUTF8.bytes[termUTF8.offset + byteIDX] & 0xff;
       }
-      state.addTransition(new Transition(c, nextState));
+      builder.addTransition(state, nextState, c);
       state = nextState;
     }

@@ -200,28 +198,26 @@
     }

     in.end();
-    State endState = null;
+    int endState = -1;
     if (offsetAtt.endOffset() > maxOffset) {
-      endState = new State();
-      endState.setAccept(true);
+      endState = builder.createState();
+      builder.setAccept(endState, true);
     }

     pos++;
     while (pos <= positions.getMaxPos()) {
       posData = positions.get(pos);
-      if (posData.arriving != null) {
-        if (endState != null) {
-          posData.arriving.addTransition(new Transition(POS_SEP, endState));
+      if (posData.arriving != -1) {
+        if (endState != -1) {
+          builder.addTransition(posData.arriving, endState, POS_SEP);
         } else {
-          posData.arriving.setAccept(true);
+          builder.setAccept(posData.arriving, true);
         }
       }
       pos++;
     }

     //toDot(a);
-    a.setDeterministic(deterministic);
-    return a;
+    return builder.finish();
   }

   // for debugging!

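
Worth noting in the hunks above: the mutable `Automaton`/`setDeterministic` dance is replaced by a write-once builder, and the initial state is simply the first state created (id 0). A minimal sketch of that call sequence, using only the builder methods that appear in this diff (the surrounding class is hypothetical):

    import org.apache.lucene.util.automaton.LightAutomaton;

    class BuilderSketch {
      // Builds an automaton accepting just "hi"; compare with toAutomaton() above.
      static LightAutomaton hi() {
        LightAutomaton.Builder builder = new LightAutomaton.Builder();
        int start = builder.createState();      // state 0: the implicit initial state
        int s1 = builder.createState();
        int s2 = builder.createState();
        builder.addTransition(start, s1, 'h');  // one arc per byte label
        builder.addTransition(s1, s2, 'i');
        builder.setAccept(s2, true);
        return builder.finish();  // freezes the automaton; no setDeterministic() needed
      }
    }
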
@@ -235,26 +231,26 @@
   }
   */

-  private static void addHoles(State startState, RollingBuffer<Position> positions, int pos) {
+  private static void addHoles(LightAutomaton.Builder builder, RollingBuffer<Position> positions, int pos) {
     Position posData = positions.get(pos);
     Position prevPosData = positions.get(pos-1);

-    while(posData.arriving == null || prevPosData.leaving == null) {
-      if (posData.arriving == null) {
-        posData.arriving = new State();
-        posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));
+    while(posData.arriving == -1 || prevPosData.leaving == -1) {
+      if (posData.arriving == -1) {
+        posData.arriving = builder.createState();
+        builder.addTransition(posData.arriving, posData.leaving, POS_SEP);
       }
-      if (prevPosData.leaving == null) {
+      if (prevPosData.leaving == -1) {
         if (pos == 1) {
-          prevPosData.leaving = startState;
+          prevPosData.leaving = 0;
         } else {
-          prevPosData.leaving = new State();
+          prevPosData.leaving = builder.createState();
         }
-        if (prevPosData.arriving != null) {
-          prevPosData.arriving.addTransition(new Transition(POS_SEP, prevPosData.leaving));
+        if (prevPosData.arriving != -1) {
+          builder.addTransition(prevPosData.arriving, prevPosData.leaving, POS_SEP);
         }
       }
-      prevPosData.leaving.addTransition(new Transition(HOLE, posData.arriving));
+      builder.addTransition(prevPosData.leaving, posData.arriving, HOLE);
       pos--;
       if (pos <= 0) {
         break;

@@ -483,6 +483,21 @@ final public class BasicAutomata {
     return a;
   }

+  public static LightAutomaton makeStringLight(int[] word, int offset, int length) {
+    LightAutomaton a = new LightAutomaton();
+    a.createState();
+    int s = 0;
+    for (int i = offset; i < offset+length; i++) {
+      int s2 = a.createState();
+      a.addTransition(s, s2, word[i]);
+      s = s2;
+    }
+    a.setAccept(s, true);
+    a.finish();
+
+    return a;
+  }
+
   /**
    * Returns a new (deterministic and minimal) automaton that accepts the union
    * of the given collection of {@link BytesRef}s representing UTF-8 encoded

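
A hypothetical usage of the new `makeStringLight` factory above, assuming the branch API as shown; it builds a single linear chain of states, one arc per code point:

    import org.apache.lucene.util.automaton.BasicAutomata;
    import org.apache.lucene.util.automaton.LightAutomaton;

    class MakeStringLightDemo {
      public static void main(String[] args) {
        int[] word = {'a', 'b', 'c'};  // code points
        // Accepts exactly "abc": states 0 -> 1 -> 2 -> 3, with 3 accepting.
        LightAutomaton abc = BasicAutomata.makeStringLight(word, 0, word.length);
      }
    }
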
@@ -35,7 +35,7 @@ import org.apache.lucene.util.Sorter;
 // - could use packed int arrays instead
 // - could encode dest w/ delta from to?

-// nocommit should we keep determinized bit?
+// nocommit should we keep determinized bit? it could be entirely privately computed now?

 /** Uses only int[]s to represent the automaton, but requires that all
  * transitions for each state are added at once. If this is too restrictive,

@@ -274,6 +274,9 @@ public final class UTF32ToUTF8Light {
     //if (utf32.isSingleton()) {
     //utf32 = utf32.cloneExpanded();
     //}
+    if (utf32.getNumStates() == 0) {
+      return utf32;
+    }

     int[] map = new int[utf32.getNumStates()];
     Arrays.fill(map, -1);

@@ -18,8 +18,8 @@ package org.apache.lucene.analysis;
  */

 import java.io.IOException;
-import java.io.StringWriter;
 import java.io.PrintWriter;
+import java.io.StringWriter;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;

@@ -32,6 +32,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.BasicAutomata;
 import org.apache.lucene.util.automaton.BasicOperations;
+import org.apache.lucene.util.automaton.LightAutomaton;

 public class TestGraphTokenizers extends BaseTokenStreamTestCase {

@@ -409,9 +410,10 @@
       new Token[] {
         token("abc", 1, 1),
       });
-    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
-    final Automaton expected = BasicAutomata.makeString("abc");
-    assertTrue(BasicOperations.sameLanguage(expected, actual));
+    final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
+    final LightAutomaton expected = s2a("abc");
+    assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
+                                            BasicOperations.determinize(actual)));
   }

   public void testMultipleHoles() throws Exception {

@@ -420,9 +422,10 @@
         token("a", 1, 1),
         token("b", 3, 1),
       });
-    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
-    final Automaton expected = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
-    assertTrue(BasicOperations.sameLanguage(expected, actual));
+    final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
+    final LightAutomaton expected = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
+    assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
+                                            BasicOperations.determinize(actual)));
   }

   public void testSynOverMultipleHoles() throws Exception {

@@ -432,11 +435,12 @@
         token("x", 0, 3),
         token("b", 3, 1),
       });
-    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
-    final Automaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
-    final Automaton a2 = join(s2a("x"), SEP_A, s2a("b"));
-    final Automaton expected = BasicOperations.union(a1, a2);
-    assertTrue(BasicOperations.sameLanguage(expected, actual));
+    final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
+    final LightAutomaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
+    final LightAutomaton a2 = join(s2a("x"), SEP_A, s2a("b"));
+    final LightAutomaton expected = BasicOperations.unionLight(a1, a2);
+    assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
+                                            BasicOperations.determinize(actual)));
   }

   // for debugging!

@@ -450,25 +454,25 @@
   }
   */

-  private static final Automaton SEP_A = BasicAutomata.makeChar(TokenStreamToAutomaton.POS_SEP);
-  private static final Automaton HOLE_A = BasicAutomata.makeChar(TokenStreamToAutomaton.HOLE);
+  private static final LightAutomaton SEP_A = BasicAutomata.makeCharLight(TokenStreamToAutomaton.POS_SEP);
+  private static final LightAutomaton HOLE_A = BasicAutomata.makeCharLight(TokenStreamToAutomaton.HOLE);

-  private Automaton join(String ... strings) {
-    List<Automaton> as = new ArrayList<>();
+  private LightAutomaton join(String ... strings) {
+    List<LightAutomaton> as = new ArrayList<>();
     for(String s : strings) {
-      as.add(BasicAutomata.makeString(s));
+      as.add(s2a(s));
       as.add(SEP_A);
     }
     as.remove(as.size()-1);
-    return BasicOperations.concatenate(as);
+    return BasicOperations.concatenateLight(as);
   }

-  private Automaton join(Automaton ... as) {
-    return BasicOperations.concatenate(Arrays.asList(as));
+  private LightAutomaton join(LightAutomaton ... as) {
+    return BasicOperations.concatenateLight(Arrays.asList(as));
   }

-  private Automaton s2a(String s) {
-    return BasicAutomata.makeString(s);
+  private LightAutomaton s2a(String s) {
+    return BasicAutomata.makeStringLight(s);
   }

   public void testTwoTokens() throws Exception {

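
For reference, what the `join(String...)` helper above denotes, spelled out for two tokens; `makeCharLight`, `makeStringLight` and `concatenateLight` are the branch calls used throughout these tests:

    import java.util.Arrays;

    import org.apache.lucene.analysis.TokenStreamToAutomaton;
    import org.apache.lucene.util.automaton.BasicAutomata;
    import org.apache.lucene.util.automaton.BasicOperations;
    import org.apache.lucene.util.automaton.LightAutomaton;

    class JoinDemo {
      // Equivalent of join("abc", "def"): tokens separated by the POS_SEP label.
      static LightAutomaton abcThenDef() {
        return BasicOperations.concatenateLight(Arrays.asList(
            BasicAutomata.makeStringLight("abc"),
            BasicAutomata.makeCharLight(TokenStreamToAutomaton.POS_SEP),
            BasicAutomata.makeStringLight("def")));
      }
    }
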
@@ -478,11 +482,12 @@
         token("abc", 1, 1),
         token("def", 1, 1),
       });
-    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
-    final Automaton expected = join("abc", "def");
+    final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
+    final LightAutomaton expected = join("abc", "def");

     //toDot(actual);
-    assertTrue(BasicOperations.sameLanguage(expected, actual));
+    assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
+                                            BasicOperations.determinize(actual)));
   }

   public void testHole() throws Exception {

@@ -492,12 +497,13 @@
         token("abc", 1, 1),
         token("def", 2, 1),
       });
-    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
+    final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);

-    final Automaton expected = join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def"));
+    final LightAutomaton expected = join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def"));

     //toDot(actual);
-    assertTrue(BasicOperations.sameLanguage(expected, actual));
+    assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
+                                            BasicOperations.determinize(actual)));
   }

   public void testOverlappedTokensSausage() throws Exception {

@@ -508,11 +514,12 @@
         token("abc", 1, 1),
         token("xyz", 0, 1)
       });
-    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
-    final Automaton a1 = BasicAutomata.makeString("abc");
-    final Automaton a2 = BasicAutomata.makeString("xyz");
-    final Automaton expected = BasicOperations.union(a1, a2);
-    assertTrue(BasicOperations.sameLanguage(expected, actual));
+    final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
+    final LightAutomaton a1 = s2a("abc");
+    final LightAutomaton a2 = s2a("xyz");
+    final LightAutomaton expected = BasicOperations.unionLight(a1, a2);
+    assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
+                                            BasicOperations.determinize(actual)));
   }

   public void testOverlappedTokensLattice() throws Exception {

@@ -523,13 +530,14 @@
         token("xyz", 0, 2),
         token("def", 1, 1),
       });
-    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
-    final Automaton a1 = BasicAutomata.makeString("xyz");
-    final Automaton a2 = join("abc", "def");
+    final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
+    final LightAutomaton a1 = s2a("xyz");
+    final LightAutomaton a2 = join("abc", "def");

-    final Automaton expected = BasicOperations.union(a1, a2);
+    final LightAutomaton expected = BasicOperations.unionLight(a1, a2);
     //toDot(actual);
-    assertTrue(BasicOperations.sameLanguage(expected, actual));
+    assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
+                                            BasicOperations.determinize(actual)));
   }

   public void testSynOverHole() throws Exception {

@@ -540,14 +548,15 @@
         token("X", 0, 2),
         token("b", 2, 1),
       });
-    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
-    final Automaton a1 = BasicOperations.union(
+    final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
+    final LightAutomaton a1 = BasicOperations.unionLight(
         join(s2a("a"), SEP_A, HOLE_A),
-        BasicAutomata.makeString("X"));
-    final Automaton expected = BasicOperations.concatenate(a1,
+        s2a("X"));
+    final LightAutomaton expected = BasicOperations.concatenateLight(a1,
         join(SEP_A, s2a("b")));
     //toDot(actual);
-    assertTrue(BasicOperations.sameLanguage(expected, actual));
+    assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
+                                            BasicOperations.determinize(actual)));
   }

   public void testSynOverHole2() throws Exception {

@@ -558,11 +567,12 @@
         token("abc", 0, 3),
         token("def", 2, 1),
       });
-    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
-    final Automaton expected = BasicOperations.union(
+    final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
+    final LightAutomaton expected = BasicOperations.unionLight(
         join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")),
-        BasicAutomata.makeString("abc"));
-    assertTrue(BasicOperations.sameLanguage(expected, actual));
+        s2a("abc"));
+    assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
+                                            BasicOperations.determinize(actual)));
   }

   public void testOverlappedTokensLattice2() throws Exception {

@@ -574,12 +584,13 @@
         token("def", 1, 1),
         token("ghi", 1, 1),
       });
-    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
-    final Automaton a1 = BasicAutomata.makeString("xyz");
-    final Automaton a2 = join("abc", "def", "ghi");
-    final Automaton expected = BasicOperations.union(a1, a2);
+    final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
+    final LightAutomaton a1 = s2a("xyz");
+    final LightAutomaton a2 = join("abc", "def", "ghi");
+    final LightAutomaton expected = BasicOperations.unionLight(a1, a2);
     //toDot(actual);
-    assertTrue(BasicOperations.sameLanguage(expected, actual));
+    assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
+                                            BasicOperations.determinize(actual)));
   }

   public void testToDot() throws Exception {

@@ -594,10 +605,11 @@
       new Token[] {
         token("abc", 2, 1),
       });
-    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
-    final Automaton expected = join(HOLE_A, SEP_A, s2a("abc"));
+    final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
+    final LightAutomaton expected = join(HOLE_A, SEP_A, s2a("abc"));
     //toDot(actual);
-    assertTrue(BasicOperations.sameLanguage(expected, actual));
+    assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
+                                            BasicOperations.determinize(actual)));
   }

   // TODO: testEndsWithHole... but we need posInc to set in TS.end()

@@ -608,9 +620,10 @@
         token("a", 1, 1),
         token("X", 0, 10),
       });
-    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
-    final Automaton expected = BasicOperations.union(BasicAutomata.makeString("a"),
-                                                     BasicAutomata.makeString("X"));
-    assertTrue(BasicOperations.sameLanguage(expected, actual));
+    final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
+    final LightAutomaton expected = BasicOperations.unionLight(s2a("a"),
+                                                               s2a("X"));
+    assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
+                                            BasicOperations.determinize(actual)));
   }
 }

@@ -17,6 +17,16 @@ package org.apache.lucene.search.suggest.analyzing;
  * limitations under the License.
  */

+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.TokenStreamToAutomaton;

@@ -35,28 +45,20 @@ import org.apache.lucene.util.OfflineSorter;
 import org.apache.lucene.util.UnicodeUtil;
-import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.BasicOperations;
+import org.apache.lucene.util.automaton.LightAutomaton;
 import org.apache.lucene.util.automaton.SpecialOperations;
-import org.apache.lucene.util.automaton.State;
-import org.apache.lucene.util.automaton.Transition;
 import org.apache.lucene.util.fst.Builder;
 import org.apache.lucene.util.fst.ByteSequenceOutputs;
-import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.FST.BytesReader;
-import org.apache.lucene.util.fst.PairOutputs;
+import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.PairOutputs.Pair;
+import org.apache.lucene.util.fst.PairOutputs;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
-import org.apache.lucene.util.fst.Util;
 import org.apache.lucene.util.fst.Util.Result;
 import org.apache.lucene.util.fst.Util.TopResults;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
+import org.apache.lucene.util.fst.Util;

 /**
  * Suggester that first analyzes the surface form, adds the

@@ -255,37 +257,65 @@ public class AnalyzingSuggester extends Lookup {
     return fst == null ? 0 : fst.ramBytesUsed();
   }

-  private void copyDestTransitions(State from, State to, List<Transition> transitions) {
-    if (to.isAccept()) {
-      from.setAccept(true);
-    }
-    for(Transition t : to.getTransitions()) {
-      transitions.add(t);
+  private int[] topoSortStates(LightAutomaton a) {
+    int[] states = new int[a.getNumStates()];
+    final Set<Integer> visited = new HashSet<>();
+    final LinkedList<Integer> worklist = new LinkedList<>();
+    worklist.add(0);
+    visited.add(0);
+    int upto = 0;
+    states[upto] = 0;
+    upto++;
+    LightAutomaton.Transition t = new LightAutomaton.Transition();
+    while (worklist.size() > 0) {
+      int s = worklist.removeFirst();
+      int count = a.initTransition(s, t);
+      for (int i=0;i<count;i++) {
+        a.getNextTransition(t);
+        if (!visited.contains(t.dest)) {
+          visited.add(t.dest);
+          worklist.add(t.dest);
+          states[upto++] = t.dest;
+        }
+      }
     }
+    return states;
   }

   // Replaces SEP with epsilon or remaps them if
   // we were asked to preserve them:
-  private void replaceSep(Automaton a) {
+  private LightAutomaton replaceSep(LightAutomaton a) {

-    State[] states = a.getNumberedStates();
+    LightAutomaton result = new LightAutomaton();

+    // Copy all states over
+    int numStates = a.getNumStates();
+    for(int s=0;s<numStates;s++) {
+      result.createState();
+      result.setAccept(s, a.isAccept(s));
+    }

     // Go in reverse topo sort so we know we only have to
     // make one pass:
-    for(int stateNumber=states.length-1;stateNumber >=0;stateNumber--) {
-      final State state = states[stateNumber];
-      List<Transition> newTransitions = new ArrayList<>();
-      for(Transition t : state.getTransitions()) {
-        assert t.getMin() == t.getMax();
-        if (t.getMin() == TokenStreamToAutomaton.POS_SEP) {
+    LightAutomaton.Transition t = new LightAutomaton.Transition();
+    int[] topoSortStates = topoSortStates(a);
+    for(int i=0;i<topoSortStates.length;i++) {
+      int state = topoSortStates[topoSortStates.length-1-i];
+      int count = a.initTransition(state, t);
+      for(int j=0;j<count;j++) {
+        a.getNextTransition(t);
+        if (t.min == TokenStreamToAutomaton.POS_SEP) {
+          assert t.max == TokenStreamToAutomaton.POS_SEP;
           if (preserveSep) {
             // Remap to SEP_LABEL:
-            newTransitions.add(new Transition(SEP_LABEL, t.getDest()));
+            result.addTransition(state, t.dest, SEP_LABEL);
           } else {
-            copyDestTransitions(state, t.getDest(), newTransitions);
-            a.setDeterministic(false);
+            result.addEpsilon(state, t.dest);
           }
-        } else if (t.getMin() == TokenStreamToAutomaton.HOLE) {
+        } else if (t.min == TokenStreamToAutomaton.HOLE) {
+          assert t.max == TokenStreamToAutomaton.HOLE;

           // Just remove the hole: there will then be two
           // SEP tokens next to each other, which will only

@@ -294,19 +324,21 @@ public class AnalyzingSuggester extends Lookup {
           // that's somehow a problem we can always map HOLE
           // to a dedicated byte (and escape it in the
           // input).
-          copyDestTransitions(state, t.getDest(), newTransitions);
-          a.setDeterministic(false);
+          result.addEpsilon(state, t.dest);
         } else {
-          newTransitions.add(t);
+          result.addTransition(state, t.dest, t.min, t.max);
         }
       }
-      state.setTransitions(newTransitions.toArray(new Transition[newTransitions.size()]));
     }

+    result.finish();
+
+    return result;
   }

   /** Used by subclass to change the lookup automaton, if
    * necessary. */
-  protected Automaton convertAutomaton(Automaton a) {
+  protected LightAutomaton convertAutomaton(LightAutomaton a) {
     return a;
   }

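
The removed copyDestTransitions hand-copied a destination state's arcs and acceptance into the source; the new code delegates that to result.addEpsilon(state, t.dest). Visiting states in the reverse of the BFS discovery order that topoSortStates produces means each destination's arcs are already final when a source copies them, so one pass suffices. A self-contained illustration of that ordering (plain Java, not branch code):

    import java.util.ArrayDeque;
    import java.util.ArrayList;
    import java.util.Deque;
    import java.util.List;

    class ReverseTopoDemo {
      public static void main(String[] args) {
        // Tiny automaton graph: 0 -> 1, 0 -> 2, 1 -> 2; state 0 is the start.
        List<List<Integer>> out = new ArrayList<>();
        for (int s = 0; s < 3; s++) out.add(new ArrayList<>());
        out.get(0).add(1);
        out.get(0).add(2);
        out.get(1).add(2);

        // Same discovery order topoSortStates computes: BFS from state 0.
        boolean[] visited = new boolean[3];
        Deque<Integer> worklist = new ArrayDeque<>();
        List<Integer> order = new ArrayList<>();
        worklist.add(0);
        visited[0] = true;
        order.add(0);
        while (!worklist.isEmpty()) {
          int s = worklist.removeFirst();
          for (int dest : out.get(s)) {
            if (!visited[dest]) {
              visited[dest] = true;
              worklist.add(dest);
              order.add(dest);
            }
          }
        }

        // replaceSep walks this order backwards: every t.dest it copies from
        // (via addEpsilon) has already had its own arcs rewritten.
        for (int i = order.size() - 1; i >= 0; i--) {
          System.out.println("rewrite state " + order.get(i));
        }
      }
    }
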
@@ -665,8 +697,7 @@
     }
     final BytesRef utf8Key = new BytesRef(key);
     try {
-
-      Automaton lookupAutomaton = toLookupAutomaton(key);
+      LightAutomaton lookupAutomaton = toLookupAutomaton(key);

       final CharsRef spare = new CharsRef();

@@ -818,7 +849,7 @@

   /** Returns all prefix paths to initialize the search. */
   protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
-                                                                       Automaton lookupAutomaton,
+                                                                       LightAutomaton lookupAutomaton,
                                                                        FST<Pair<Long,BytesRef>> fst)
     throws IOException {
     return prefixPaths;

@@ -826,7 +857,7 @@

   final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
     // Analyze surface form:
-    Automaton automaton = null;
+    LightAutomaton automaton = null;
     try (TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString())) {

       // Create corresponding automaton: labels are bytes

@@ -835,7 +866,7 @@
       automaton = ts2a.toAutomaton(ts);
     }

-    replaceSep(automaton);
+    automaton = replaceSep(automaton);
     automaton = convertAutomaton(automaton);

     // TODO: LUCENE-5660 re-enable this once we disallow massive suggestion strings

@@ -848,32 +879,25 @@
     // TODO: we could walk & add simultaneously, so we
     // don't have to alloc [possibly biggish]
     // intermediate HashSet in RAM:

     return SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
   }

-  final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
+  final LightAutomaton toLookupAutomaton(final CharSequence key) throws IOException {
     // TODO: is there a Reader from a CharSequence?
     // Turn tokenstream into automaton:
-    Automaton automaton = null;
+    LightAutomaton automaton = null;
     try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
-      automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
+      automaton = getTokenStreamToAutomaton().toAutomaton(ts);
     }

     // TODO: we could use the end offset to "guess"
     // whether the final token was a partial token; this
     // would only be a heuristic ... but maybe an OK one.
     // This way we could eg differentiate "net" from "net ",
     // which we can't today...

-    replaceSep(automaton);
+    automaton = replaceSep(automaton);

     // TODO: we can optimize this somewhat by determinizing
     // while we convert
-    BasicOperations.determinize(automaton);
+    automaton = BasicOperations.determinize(automaton);
     return automaton;
   }

-
-
   /**
    * Returns the weight associated with an input string,

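
toLookupAutomaton above shows the mechanical consequence of the cutover: operations like replaceSep and determinize no longer mutate their argument, they return a new automaton, so every call site must reassign. A reduced sketch of that before/after shape (method names are from this diff; the wrapper class is hypothetical):

    import org.apache.lucene.util.automaton.BasicOperations;
    import org.apache.lucene.util.automaton.LightAutomaton;

    abstract class FunctionalStyleSketch {
      abstract LightAutomaton replaceSep(LightAutomaton a);

      LightAutomaton normalize(LightAutomaton automaton) {
        // Before the cutover these calls mutated `automaton` in place:
        //   replaceSep(automaton); BasicOperations.determinize(automaton);
        // Now each returns a fresh automaton, so the result must be reassigned:
        automaton = replaceSep(automaton);
        automaton = BasicOperations.determinize(automaton);
        return automaton;
      }
    }
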
@@ -17,12 +17,14 @@ package org.apache.lucene.search.suggest.analyzing;
  * limitations under the License.
  */

+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
-import java.io.IOException;

 import org.apache.lucene.util.IntsRef;
-import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.BasicOperations;
+import org.apache.lucene.util.automaton.LightAutomaton;
-import org.apache.lucene.util.automaton.State;
-import org.apache.lucene.util.automaton.Transition;
 import org.apache.lucene.util.fst.FST;

@@ -43,7 +45,7 @@ public class FSTUtil {
   public static final class Path<T> {

     /** Node in the automaton where path ends: */
-    public final State state;
+    public final int state;

     /** Node in the FST where path ends: */
     public final FST.Arc<T> fstNode;

@@ -55,7 +57,7 @@
     public final IntsRef input;

     /** Sole constructor. */
-    public Path(State state, FST.Arc<T> fstNode, T output, IntsRef input) {
+    public Path(int state, FST.Arc<T> fstNode, T output, IntsRef input) {
       this.state = state;
       this.fstNode = fstNode;
       this.output = output;

@@ -67,21 +69,23 @@
    * Enumerates all minimal prefix paths in the automaton that also intersect the FST,
    * accumulating the FST end node and output for each path.
    */
-  public static <T> List<Path<T>> intersectPrefixPaths(Automaton a, FST<T> fst)
+  public static <T> List<Path<T>> intersectPrefixPaths(LightAutomaton a, FST<T> fst)
       throws IOException {
-    assert a.isDeterministic();
+    assert BasicOperations.isDeterministic(a);
     final List<Path<T>> queue = new ArrayList<>();
    final List<Path<T>> endNodes = new ArrayList<>();
-    queue.add(new Path<>(a.getInitialState(), fst
+    queue.add(new Path<>(0, fst
        .getFirstArc(new FST.Arc<T>()), fst.outputs.getNoOutput(),
        new IntsRef()));

     final FST.Arc<T> scratchArc = new FST.Arc<>();
     final FST.BytesReader fstReader = fst.getBytesReader();

+    LightAutomaton.Transition t = new LightAutomaton.Transition();
+
     while (queue.size() != 0) {
       final Path<T> path = queue.remove(queue.size() - 1);
-      if (path.state.isAccept()) {
+      if (a.isAccept(path.state)) {
         endNodes.add(path);
         // we can stop here if we accept this path,
         // we accept all further paths too

@@ -89,18 +93,20 @@
       }

       IntsRef currentInput = path.input;
-      for (Transition t : path.state.getTransitions()) {
-        final int min = t.getMin();
-        final int max = t.getMax();
+      int count = a.initTransition(path.state, t);
+      for (int i=0;i<count;i++) {
+        a.getNextTransition(t);
+        final int min = t.min;
+        final int max = t.max;
         if (min == max) {
-          final FST.Arc<T> nextArc = fst.findTargetArc(t.getMin(),
+          final FST.Arc<T> nextArc = fst.findTargetArc(t.min,
               path.fstNode, scratchArc, fstReader);
           if (nextArc != null) {
             final IntsRef newInput = new IntsRef(currentInput.length + 1);
             newInput.copyInts(currentInput);
-            newInput.ints[currentInput.length] = t.getMin();
+            newInput.ints[currentInput.length] = t.min;
             newInput.length = currentInput.length + 1;
-            queue.add(new Path<>(t.getDest(), new FST.Arc<T>()
+            queue.add(new Path<>(t.dest, new FST.Arc<T>()
                 .copyFrom(nextArc), fst.outputs
                 .add(path.output, nextArc.output), newInput));
           }

@@ -122,7 +128,7 @@
             newInput.copyInts(currentInput);
             newInput.ints[currentInput.length] = nextArc.label;
             newInput.length = currentInput.length + 1;
-            queue.add(new Path<>(t.getDest(), new FST.Arc<T>()
+            queue.add(new Path<>(t.dest, new FST.Arc<T>()
                 .copyFrom(nextArc), fst.outputs
                 .add(path.output, nextArc.output), newInput));
             final int label = nextArc.label; // used in assert

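
FSTUtil now iterates arcs through the reusable-cursor idiom seen above: initTransition(state, t) returns the transition count and getNextTransition(t) fills the scratch t with min/max/dest. A small sketch using only those calls (the helper class itself is hypothetical):

    import org.apache.lucene.util.automaton.LightAutomaton;

    class TransitionIterationSketch {
      // Counts the labels leaving `state`; note the single scratch Transition
      // instead of a per-arc object allocation.
      static int countLabels(LightAutomaton a, int state) {
        LightAutomaton.Transition t = new LightAutomaton.Transition();
        int count = a.initTransition(state, t);   // number of transitions at `state`
        int labels = 0;
        for (int i = 0; i < count; i++) {
          a.getNextTransition(t);                 // fills t.min, t.max, t.dest
          labels += t.max - t.min + 1;            // each arc covers an inclusive label range
        }
        return labels;
      }
    }
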
@@ -32,8 +32,10 @@ import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.BasicAutomata;
 import org.apache.lucene.util.automaton.BasicOperations;
 import org.apache.lucene.util.automaton.LevenshteinAutomata;
+import org.apache.lucene.util.automaton.LightAutomaton;
 import org.apache.lucene.util.automaton.SpecialOperations;
 import org.apache.lucene.util.automaton.UTF32ToUTF8;
+import org.apache.lucene.util.automaton.UTF32ToUTF8Light;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.PairOutputs.Pair;

@@ -177,7 +179,7 @@ public final class FuzzySuggester extends AnalyzingSuggester {

   @Override
   protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
-                                                                       Automaton lookupAutomaton,
+                                                                       LightAutomaton lookupAutomaton,
                                                                        FST<Pair<Long,BytesRef>> fst)
     throws IOException {

@@ -191,7 +193,7 @@
     // "compete") ... in which case I think the wFST needs
     // to be log weights or something ...

-    Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton));
+    LightAutomaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton));
     /*
       Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), StandardCharsets.UTF_8);
       w.write(levA.toDot());

@@ -202,10 +204,10 @@
   }

   @Override
-  protected Automaton convertAutomaton(Automaton a) {
+  protected LightAutomaton convertAutomaton(LightAutomaton a) {
     if (unicodeAware) {
-      Automaton utf8automaton = new UTF32ToUTF8().convert(a);
-      BasicOperations.determinize(utf8automaton);
+      LightAutomaton utf8automaton = new UTF32ToUTF8Light().convert(a);
+      utf8automaton = BasicOperations.determinize(utf8automaton);
       return utf8automaton;
     } else {
       return a;

@@ -219,16 +221,16 @@
     return tsta;
   }

-  Automaton toLevenshteinAutomata(Automaton automaton) {
+  LightAutomaton toLevenshteinAutomata(LightAutomaton automaton) {
     final Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
-    Automaton subs[] = new Automaton[ref.size()];
+    LightAutomaton subs[] = new LightAutomaton[ref.size()];
     int upto = 0;
     for (IntsRef path : ref) {
       if (path.length <= nonFuzzyPrefix || path.length < minFuzzyLength) {
-        subs[upto] = BasicAutomata.makeString(path.ints, path.offset, path.length);
+        subs[upto] = BasicAutomata.makeStringLight(path.ints, path.offset, path.length);
         upto++;
       } else {
-        Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, nonFuzzyPrefix);
+        LightAutomaton prefix = BasicAutomata.makeStringLight(path.ints, path.offset, nonFuzzyPrefix);
         int ints[] = new int[path.length-nonFuzzyPrefix];
         System.arraycopy(path.ints, path.offset+nonFuzzyPrefix, ints, 0, ints.length);
         // TODO: maybe add alphaMin to LevenshteinAutomata,

@@ -237,9 +239,8 @@
         // edited... but then 0 byte is "in general" allowed
         // on input (but not in UTF8).
         LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions);
-        Automaton levAutomaton = lev.toAutomaton(maxEdits);
-        Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
-        combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already
+        LightAutomaton levAutomaton = lev.toLightAutomaton(maxEdits);
+        LightAutomaton combined = BasicOperations.concatenateLight(prefix, levAutomaton);
         subs[upto] = combined;
         upto++;
       }

@@ -247,19 +248,17 @@

     if (subs.length == 0) {
       // automaton is empty, there is no accepted paths through it
-      return BasicAutomata.makeEmpty(); // matches nothing
+      return BasicAutomata.makeEmptyLight(); // matches nothing
     } else if (subs.length == 1) {
       // no synonyms or anything: just a single path through the tokenstream
       return subs[0];
     } else {
       // multiple paths: this is really scary! is it slow?
       // maybe we should not do this and throw UOE?
-      Automaton a = BasicOperations.union(Arrays.asList(subs));
+      LightAutomaton a = BasicOperations.unionLight(Arrays.asList(subs));
       // TODO: we could call toLevenshteinAutomata() before det?
       // this only happens if you have multiple paths anyway (e.g. synonyms)
-      BasicOperations.determinize(a);
-
-      return a;
+      return BasicOperations.determinize(a);
     }
   }
 }

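
Condensing the per-path expansion in toLevenshteinAutomata above into one hedged sketch (all constructors and calls are taken from this diff; the byte-alphabet bound and parameters are examples):

    import org.apache.lucene.util.automaton.BasicAutomata;
    import org.apache.lucene.util.automaton.BasicOperations;
    import org.apache.lucene.util.automaton.LevenshteinAutomata;
    import org.apache.lucene.util.automaton.LightAutomaton;

    class FuzzyPathSketch {
      // Exact (non-fuzzy) prefix concatenated with a Levenshtein automaton
      // over the remainder of one analyzed path.
      static LightAutomaton fuzzyPath(int[] path, int nonFuzzyPrefix, int maxEdits) {
        LightAutomaton prefix = BasicAutomata.makeStringLight(path, 0, nonFuzzyPrefix);
        int[] rest = new int[path.length - nonFuzzyPrefix];
        System.arraycopy(path, nonFuzzyPrefix, rest, 0, rest.length);
        LevenshteinAutomata lev = new LevenshteinAutomata(rest, 255, false);  // byte alphabet, no transpositions
        LightAutomaton levAutomaton = lev.toLightAutomaton(maxEdits);
        return BasicOperations.concatenateLight(prefix, levAutomaton);
      }
    }
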
@@ -92,4 +92,4 @@ public final class InputArrayIterator implements InputIterator {
   public boolean hasContexts() {
     return hasContexts;
   }
 }

@@ -40,14 +40,16 @@ import org.apache.lucene.analysis.TokenStreamToAutomaton;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.search.suggest.Lookup.LookupResult;
 import org.apache.lucene.search.suggest.Input;
 import org.apache.lucene.search.suggest.InputArrayIterator;
+import org.apache.lucene.search.suggest.Lookup.LookupResult;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.TestUtil;
 import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.BasicOperations;
+import org.apache.lucene.util.automaton.LightAutomaton;
 import org.apache.lucene.util.automaton.State;
 import org.apache.lucene.util.fst.Util;

@@ -752,29 +754,30 @@ public class FuzzySuggesterTest extends LuceneTestCase {
     // us the "answer key" (ie maybe we have a bug in
     // suggester.toLevA ...) ... but testRandom2() fixes
     // this:
-    Automaton automaton = suggester.convertAutomaton(suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey)));
-    assertTrue(automaton.isDeterministic());
+    LightAutomaton automaton = suggester.convertAutomaton(suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey)));
+    assertTrue(BasicOperations.isDeterministic(automaton));

     // TODO: could be faster... but its slowCompletor for a reason
     BytesRef spare = new BytesRef();
     for (TermFreqPayload2 e : slowCompletor) {
       spare.copyChars(e.analyzedForm);
       Set<IntsRef> finiteStrings = suggester.toFiniteStrings(spare, tokenStreamToAutomaton);
       for (IntsRef intsRef : finiteStrings) {
-        State p = automaton.getInitialState();
+        int p = 0;
         BytesRef ref = Util.toBytesRef(intsRef, spare);
         boolean added = false;
         for (int i = ref.offset; i < ref.length; i++) {
-          State q = p.step(ref.bytes[i] & 0xff);
-          if (q == null) {
+          int q = automaton.step(p, ref.bytes[i] & 0xff);
+          if (q == -1) {
             break;
-          } else if (q.isAccept()) {
+          } else if (automaton.isAccept(q)) {
             matches.add(new LookupResult(e.surfaceForm, e.weight));
             added = true;
             break;
           }
           p = q;
         }
-        if (!added && p.isAccept()) {
+        if (!added && automaton.isAccept(p)) {
           matches.add(new LookupResult(e.surfaceForm, e.weight));
         }
       }

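
The rewritten test loop runs the deterministic byte automaton with int states: step(state, label) returns the next state id, or -1 when there is no transition. The same loop in isolation, assuming the branch API shown above (this sketch also normalizes the ref.offset handling):

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.automaton.LightAutomaton;

    class StepSketch {
      static boolean accepts(LightAutomaton a, BytesRef ref) {
        int p = 0;  // the initial state is always id 0
        for (int i = 0; i < ref.length; i++) {
          int q = a.step(p, ref.bytes[ref.offset + i] & 0xff);
          if (q == -1) {
            return false;  // no transition for this byte
          }
          p = q;
        }
        return a.isAccept(p);
      }
    }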