LUCENE-8697: GraphTokenStreamFiniteStrings correctly handles side paths with gaps

This commit is contained in:
Alan Woodward 2019-02-19 13:55:57 +00:00
parent 97875af3f9
commit 55b4d2dcaa
3 changed files with 61 additions and 7 deletions

View File

@ -223,6 +223,9 @@ New Features
* LUCENE-8655: Add a getter to the FunctionScoreQuery class to provide access to the
underlying DoubleValuesSource. (Gérald Quaire via Alan Woodward)
* LUCENE-8697: GraphTokenStreamFiniteStrings correctly handles side paths
containing gaps (Alan Woodward)
Improvements
* LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.

View File

@ -211,26 +211,33 @@ public final class GraphTokenStreamFiniteStrings {
int pos = -1;
int prevIncr = 1;
int state = -1;
int gap = 0;
while (in.incrementToken()) {
int currentIncr = posIncAtt.getPositionIncrement();
if (pos == -1 && currentIncr < 1) {
throw new IllegalStateException("Malformed TokenStream, start token can't have increment less than 1");
}
// always use inc 1 while building, but save original increment
int incr = Math.min(1, currentIncr);
if (incr > 0) {
pos += incr;
if (currentIncr == 0) {
if (gap > 0) {
pos -= gap;
}
}
else {
pos++;
gap = currentIncr - 1;
}
int endPos = pos + posLengthAtt.getPositionLength();
int endPos = pos + posLengthAtt.getPositionLength() + gap;
while (state < endPos) {
state = builder.createState();
}
BytesRef term = termBytesAtt.getBytesRef();
int id = getTermID(currentIncr, prevIncr, term);
//System.out.println("Adding transition: " + term.utf8ToString() + "@" + pos + "->" + endPos);
builder.addTransition(pos, endPos, id);
pos += gap;
// only save last increment on non-zero increment in case we have multiple stacked tokens
if (currentIncr > 0) {

View File

@ -539,14 +539,16 @@ public class TestGraphTokenStreamFiniteStrings extends LuceneTestCase {
}
public void testMultipleSidePaths() throws Exception {
// 0 1 2 3 4 5 6 7 8
// the ny:4/new york wifi:5/wi fi:4 [] wifi:2/wi fi network
TokenStream ts = new CannedTokenStream(
token("the", 1, 1),
token("ny", 1, 4),
token("new", 0, 1),
token("york", 1, 1),
token("wifi", 1, 4),
token("wifi", 1, 5),
token("wi", 0, 1),
token("fi", 1, 3),
token("fi", 1, 4),
token("wifi", 2, 2),
token("wi", 0, 1),
token("fi", 1, 1),
@ -596,4 +598,46 @@ public class TestGraphTokenStreamFiniteStrings extends LuceneTestCase {
terms = graph.getTerms("field", 7);
assertArrayEquals(terms, new Term[] {new Term("field", "network")});
}
public void testSidePathWithGap() throws Exception {
  // Checks the finite strings produced when a multi-position token spans a
  // stretch of the stream that contains empty positions.
  //
  // position: 0    1               2  3  4             5
  // tokens:   king alfred:4/alfred [] [] great/awesome ruled
  //
  // "term:N" marks a token whose position length is N; "[]" marks a position
  // that no token occupies ("great" arrives with a position increment of 3).
  GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(
      new CannedTokenStream(
          token("king", 1, 1),
          token("alfred", 1, 4),
          token("alfred", 0, 1),
          token("great", 3, 1),
          token("awesome", 0, 1),
          token("ruled", 1, 1)));

  // Expected paths, in the order the iterator must return them. Row i of
  // expectedIncrements holds the position increments for row i of expectedTerms.
  String[][] expectedTerms = {
      { "king", "alfred", "ruled" },
      { "king", "alfred", "great", "ruled" },
      { "king", "alfred", "awesome", "ruled" },
  };
  int[][] expectedIncrements = {
      { 1, 1, 1 },
      { 1, 1, 3, 1 },
      { 1, 1, 3, 1 },
  };

  Iterator<TokenStream> paths = graph.getFiniteStrings();
  for (int i = 0; i < expectedTerms.length; i++) {
    assertTrue(paths.hasNext());
    assertTokenStream(paths.next(), expectedTerms[i], expectedIncrements[i]);
  }
  // No further paths may be produced beyond the three listed above.
  assertFalse(paths.hasNext());
}
public void testMultipleSidePathsWithGaps() throws Exception {
  // Checks the finite strings produced when two separate multi-position
  // tokens each span a stretch of the stream that contains empty positions.
  //
  // position: 0    1               2  3  4        5  6      7
  // tokens:   king alfred:4/alfred [] [] saxons:3 [] wessex ruled
  //
  // "term:N" marks a token whose position length is N; "[]" marks a position
  // that no token occupies.
  GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(
      new CannedTokenStream(
          token("king", 1, 1),
          token("alfred", 1, 4),
          token("alfred", 0, 1),
          token("saxons", 3, 3),
          token("wessex", 2, 1),
          token("ruled", 1, 1)));

  // Expected paths, in the order the iterator must return them. Row i of
  // expectedIncrements holds the position increments for row i of expectedTerms.
  String[][] expectedTerms = {
      { "king", "alfred", "wessex", "ruled" },
      { "king", "alfred", "saxons", "ruled" },
  };
  int[][] expectedIncrements = {
      { 1, 1, 2, 1 },
      { 1, 1, 3, 1 },
  };

  Iterator<TokenStream> paths = graph.getFiniteStrings();
  for (int i = 0; i < expectedTerms.length; i++) {
    assertTrue(paths.hasNext());
    assertTokenStream(paths.next(), expectedTerms[i], expectedIncrements[i]);
  }
  // No further paths may be produced beyond the two listed above.
  assertFalse(paths.hasNext());
}
}