mirror of https://github.com/apache/lucene.git
LUCENE-8697: GraphTokenStreamFiniteStrings correctly handles side paths with gaps
This commit is contained in:
parent
97875af3f9
commit
55b4d2dcaa
|
@ -223,6 +223,9 @@ New Features
|
|||
* LUCENE-8655: Add a getter in FunctionScoreQuery class in order to access the
|
||||
underlying DoubleValuesSource. (Gérald Quaire via Alan Woodward)
|
||||
|
||||
* LUCENE-8697: GraphTokenStreamFiniteStrings correctly handles side paths
|
||||
containing gaps (Alan Woodward)
|
||||
|
||||
Improvements
|
||||
|
||||
* LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.
|
||||
|
|
|
@ -211,26 +211,33 @@ public final class GraphTokenStreamFiniteStrings {
|
|||
int pos = -1;
|
||||
int prevIncr = 1;
|
||||
int state = -1;
|
||||
int gap = 0;
|
||||
while (in.incrementToken()) {
|
||||
int currentIncr = posIncAtt.getPositionIncrement();
|
||||
if (pos == -1 && currentIncr < 1) {
|
||||
throw new IllegalStateException("Malformed TokenStream, start token can't have increment less than 1");
|
||||
}
|
||||
|
||||
// always use inc 1 while building, but save original increment
|
||||
int incr = Math.min(1, currentIncr);
|
||||
if (incr > 0) {
|
||||
pos += incr;
|
||||
if (currentIncr == 0) {
|
||||
if (gap > 0) {
|
||||
pos -= gap;
|
||||
}
|
||||
}
|
||||
else {
|
||||
pos++;
|
||||
gap = currentIncr - 1;
|
||||
}
|
||||
|
||||
int endPos = pos + posLengthAtt.getPositionLength();
|
||||
int endPos = pos + posLengthAtt.getPositionLength() + gap;
|
||||
while (state < endPos) {
|
||||
state = builder.createState();
|
||||
}
|
||||
|
||||
BytesRef term = termBytesAtt.getBytesRef();
|
||||
int id = getTermID(currentIncr, prevIncr, term);
|
||||
//System.out.println("Adding transition: " + term.utf8ToString() + "@" + pos + "->" + endPos);
|
||||
builder.addTransition(pos, endPos, id);
|
||||
pos += gap;
|
||||
|
||||
// only save last increment on non-zero increment in case we have multiple stacked tokens
|
||||
if (currentIncr > 0) {
|
||||
|
|
|
@ -539,14 +539,16 @@ public class TestGraphTokenStreamFiniteStrings extends LuceneTestCase {
|
|||
}
|
||||
|
||||
public void testMultipleSidePaths() throws Exception {
|
||||
// 0 1 2 3 4 5 6 7 8
|
||||
// the ny:4/new york wifi:5/wi fi:4 [] wifi:2/wi fi network
|
||||
TokenStream ts = new CannedTokenStream(
|
||||
token("the", 1, 1),
|
||||
token("ny", 1, 4),
|
||||
token("new", 0, 1),
|
||||
token("york", 1, 1),
|
||||
token("wifi", 1, 4),
|
||||
token("wifi", 1, 5),
|
||||
token("wi", 0, 1),
|
||||
token("fi", 1, 3),
|
||||
token("fi", 1, 4),
|
||||
token("wifi", 2, 2),
|
||||
token("wi", 0, 1),
|
||||
token("fi", 1, 1),
|
||||
|
@ -596,4 +598,46 @@ public class TestGraphTokenStreamFiniteStrings extends LuceneTestCase {
|
|||
terms = graph.getTerms("field", 7);
|
||||
assertArrayEquals(terms, new Term[] {new Term("field", "network")});
|
||||
}
|
||||
|
||||
/**
 * Feeds a six-token canned stream containing a multi-position "alfred" token, a stacked
 * single-position "alfred", and a jump of 3 before "great"/"awesome" into
 * GraphTokenStreamFiniteStrings, then checks that getFiniteStrings() yields exactly three
 * token streams, in the order asserted below.
 *
 * NOTE(review): token(term, a, b) is defined outside this view; a and b are presumably the
 * position increment and position length - confirm against the helper.
 */
public void testSidePathWithGap() throws Exception {
|
||||
// Token layout (positions on the first line; "[]" marks a position with no token,
// "term:N" marks a token whose last token(...) argument is N, "/" separates stacked tokens):
// 0 1 2 3 4 5
|
||||
// king alfred:4/alfred [] [] great/awesome ruled
|
||||
CannedTokenStream cts = new CannedTokenStream(
|
||||
token("king", 1, 1),
|
||||
token("alfred", 1, 4),
|
||||
token("alfred", 0, 1),
|
||||
token("great", 3, 1),
|
||||
token("awesome", 0, 1),
|
||||
token("ruled", 1, 1)
|
||||
);
|
||||
GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(cts);
|
||||
Iterator<TokenStream> it = graph.getFiniteStrings();
|
||||
// First expected stream: the long "alfred" token leads straight to "ruled".
assertTrue(it.hasNext());
|
||||
// The int[] argument is presumably the expected position increments - confirm in assertTokenStream.
assertTokenStream(it.next(), new String[]{ "king", "alfred", "ruled" }, new int[]{ 1, 1, 1 });
|
||||
// Second expected stream: short "alfred" followed by "great", which keeps its value of 3.
assertTrue(it.hasNext());
|
||||
assertTokenStream(it.next(), new String[]{ "king", "alfred", "great", "ruled"}, new int[]{ 1, 1, 3, 1 });
|
||||
// Third expected stream: same shape with the stacked alternative "awesome" in place of "great".
assertTrue(it.hasNext());
|
||||
assertTokenStream(it.next(), new String[]{ "king", "alfred", "awesome", "ruled"}, new int[]{ 1, 1, 3, 1 });
|
||||
// No further streams are expected.
assertFalse(it.hasNext());
|
||||
}
|
||||
|
||||
/**
 * Feeds a six-token canned stream in which two multi-position tokens ("alfred" and "saxons")
 * are each followed by a jump greater than 1 into GraphTokenStreamFiniteStrings, then checks
 * that getFiniteStrings() yields exactly two token streams, in the order asserted below.
 *
 * NOTE(review): token(term, a, b) is defined outside this view; a and b are presumably the
 * position increment and position length - confirm against the helper.
 */
public void testMultipleSidePathsWithGaps() throws Exception {
|
||||
// Token layout ("[]" marks a position with no token, "term:N" marks a token whose last
// token(...) argument is N, "/" separates stacked tokens):
// king alfred:4/alfred [] [] saxons:3 [] wessex ruled
|
||||
CannedTokenStream cts = new CannedTokenStream(
|
||||
token("king", 1, 1),
|
||||
token("alfred", 1, 4),
|
||||
token("alfred", 0, 1),
|
||||
token("saxons", 3, 3),
|
||||
token("wessex", 2, 1),
|
||||
token("ruled", 1, 1)
|
||||
);
|
||||
GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(cts);
|
||||
Iterator<TokenStream> it = graph.getFiniteStrings();
|
||||
// First expected stream goes through "wessex", which keeps its value of 2.
assertTrue(it.hasNext());
|
||||
// The int[] argument is presumably the expected position increments - confirm in assertTokenStream.
assertTokenStream(it.next(), new String[]{ "king", "alfred", "wessex", "ruled" }, new int[]{ 1, 1, 2, 1 });
|
||||
// Second expected stream goes through "saxons", which keeps its value of 3.
assertTrue(it.hasNext());
|
||||
assertTokenStream(it.next(), new String[]{ "king", "alfred", "saxons", "ruled" }, new int[]{ 1, 1, 3, 1 });
|
||||
// No further streams are expected.
assertFalse(it.hasNext());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue