diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index f7f60091605..130d7965848 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -223,6 +223,9 @@ New Features * LUCENE-8655: Add a getter in FunctionScoreQuery class in order to access to the underlying DoubleValuesSource. (Gérald Quaire via Alan Woodward) +* LUCENE-8697: GraphTokenStreamFiniteStrings correctly handles side paths + containing gaps (Alan Woodward) + Improvements * LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities. diff --git a/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java b/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java index a7005012b73..b6a99958e6a 100644 --- a/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java +++ b/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java @@ -211,26 +211,33 @@ public final class GraphTokenStreamFiniteStrings { int pos = -1; int prevIncr = 1; int state = -1; + int gap = 0; while (in.incrementToken()) { int currentIncr = posIncAtt.getPositionIncrement(); if (pos == -1 && currentIncr < 1) { throw new IllegalStateException("Malformed TokenStream, start token can't have increment less than 1"); } - // always use inc 1 while building, but save original increment - int incr = Math.min(1, currentIncr); - if (incr > 0) { - pos += incr; + if (currentIncr == 0) { + if (gap > 0) { + pos -= gap; + } + } + else { + pos++; + gap = currentIncr - 1; } - int endPos = pos + posLengthAtt.getPositionLength(); + int endPos = pos + posLengthAtt.getPositionLength() + gap; while (state < endPos) { state = builder.createState(); } BytesRef term = termBytesAtt.getBytesRef(); int id = getTermID(currentIncr, prevIncr, term); + //System.out.println("Adding transition: " + term.utf8ToString() + "@" + pos + "->" + endPos); builder.addTransition(pos, endPos, id); + pos += gap; // only save last increment on non-zero increment 
in case we have multiple stacked tokens if (currentIncr > 0) { diff --git a/lucene/core/src/test/org/apache/lucene/util/graph/TestGraphTokenStreamFiniteStrings.java b/lucene/core/src/test/org/apache/lucene/util/graph/TestGraphTokenStreamFiniteStrings.java index 44b7b7c4dec..1739fa0c7d6 100644 --- a/lucene/core/src/test/org/apache/lucene/util/graph/TestGraphTokenStreamFiniteStrings.java +++ b/lucene/core/src/test/org/apache/lucene/util/graph/TestGraphTokenStreamFiniteStrings.java @@ -539,14 +539,16 @@ public class TestGraphTokenStreamFiniteStrings extends LuceneTestCase { } public void testMultipleSidePaths() throws Exception { + // 0 1 2 3 4 5 6 7 8 + // the ny:4/new york wifi:5/wi fi:4 [] wifi:2/wi fi network TokenStream ts = new CannedTokenStream( token("the", 1, 1), token("ny", 1, 4), token("new", 0, 1), token("york", 1, 1), - token("wifi", 1, 4), + token("wifi", 1, 5), token("wi", 0, 1), - token("fi", 1, 3), + token("fi", 1, 4), token("wifi", 2, 2), token("wi", 0, 1), token("fi", 1, 1), @@ -596,4 +598,46 @@ public class TestGraphTokenStreamFiniteStrings extends LuceneTestCase { terms = graph.getTerms("field", 7); assertArrayEquals(terms, new Term[] {new Term("field", "network")}); } + + public void testSidePathWithGap() throws Exception { + // 0 1 2 3 4 5 + // king alfred:3/alfred [] [] great/awesome ruled + CannedTokenStream cts = new CannedTokenStream( + token("king", 1, 1), + token("alfred", 1, 4), + token("alfred", 0, 1), + token("great", 3, 1), + token("awesome", 0, 1), + token("ruled", 1, 1) + ); + GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(cts); + Iterator it = graph.getFiniteStrings(); + assertTrue(it.hasNext()); + assertTokenStream(it.next(), new String[]{ "king", "alfred", "ruled" }, new int[]{ 1, 1, 1 }); + assertTrue(it.hasNext()); + assertTokenStream(it.next(), new String[]{ "king", "alfred", "great", "ruled"}, new int[]{ 1, 1, 3, 1 }); + assertTrue(it.hasNext()); + assertTokenStream(it.next(), new String[]{ "king", 
"alfred", "awesome", "ruled"}, new int[]{ 1, 1, 3, 1 }); + assertFalse(it.hasNext()); + } + + public void testMultipleSidePathsWithGaps() throws Exception { + // king alfred:4/alfred [] [] saxons:3 [] wessex ruled + CannedTokenStream cts = new CannedTokenStream( + token("king", 1, 1), + token("alfred", 1, 4), + token("alfred", 0, 1), + token("saxons", 3, 3), + token("wessex", 2, 1), + token("ruled", 1, 1) + ); + GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(cts); + Iterator it = graph.getFiniteStrings(); + assertTrue(it.hasNext()); + assertTokenStream(it.next(), new String[]{ "king", "alfred", "wessex", "ruled" }, new int[]{ 1, 1, 2, 1 }); + assertTrue(it.hasNext()); + assertTokenStream(it.next(), new String[]{ "king", "alfred", "saxons", "ruled" }, new int[]{ 1, 1, 3, 1 }); + assertFalse(it.hasNext()); + } + }