From f062a18ae71642b831af2026748e74a2e78b1e7b Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Tue, 22 Jan 2019 09:19:48 +0000 Subject: [PATCH] LUCENE-8650: Fix end() and reset() in ConcatenatingTokenStream --- lucene/CHANGES.txt | 4 +++ .../ConcatenatingTokenStream.java | 20 +++++++++++++ .../TestConcatenatingTokenStream.java | 29 +++++++++++++++++++ 3 files changed, 53 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 33cb6993895..00c03958ce4 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -283,6 +283,10 @@ Bug fixes: * LUCENE-8654: Polygon2D#relateTriangle returns the wrong answer if polygon is inside the triangle. (Ignacio Vera) +* LUCENE-8650: ConcatenatingTokenStream did not correctly clear its state in reset(), and + was not propagating final position increments from its child streams correctly. + (Dan Meehl, Alan Woodward) + New Features * LUCENE-8026: ExitableDirectoryReader may now time out queries that run on diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenatingTokenStream.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenatingTokenStream.java index 960cae1876b..e32bda45f76 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenatingTokenStream.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenatingTokenStream.java @@ -22,6 +22,7 @@ import java.util.Iterator; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.util.Attribute; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.IOUtils; @@ -39,10 +40,13 @@ public final class ConcatenatingTokenStream extends TokenStream { private final TokenStream[] sources; private final OffsetAttribute[] sourceOffsets; + private final PositionIncrementAttribute[] sourceIncrements; private final OffsetAttribute offsetAtt; + private final PositionIncrementAttribute posIncAtt; private int currentSource; private int offsetIncrement; + private int initialPositionIncrement = 1; /** * Create a new ConcatenatingTokenStream from a set of inputs @@ -52,9 +56,12 @@ public final class ConcatenatingTokenStream extends TokenStream { super(combineSources(sources)); this.sources = sources; this.offsetAtt = addAttribute(OffsetAttribute.class); + this.posIncAtt = addAttribute(PositionIncrementAttribute.class); this.sourceOffsets = new OffsetAttribute[sources.length]; + this.sourceIncrements = new PositionIncrementAttribute[sources.length]; for (int i = 0; i < sources.length; i++) { this.sourceOffsets[i] = sources[i].addAttribute(OffsetAttribute.class); + this.sourceIncrements[i] = sources[i].addAttribute(PositionIncrementAttribute.class); } } @@ -78,19 +85,26 @@ public final class ConcatenatingTokenStream extends TokenStream { @Override public boolean incrementToken() throws IOException { + boolean newSource = false; while (sources[currentSource].incrementToken() == false) { if (currentSource >= sources.length - 1) return false; sources[currentSource].end(); + initialPositionIncrement = sourceIncrements[currentSource].getPositionIncrement(); OffsetAttribute att = sourceOffsets[currentSource]; if (att != null) offsetIncrement += att.endOffset(); currentSource++; + newSource = true; } clearAttributes(); sources[currentSource].copyTo(this); offsetAtt.setOffset(offsetAtt.startOffset() + offsetIncrement, offsetAtt.endOffset() + offsetIncrement); + if (newSource) { + int posInc = posIncAtt.getPositionIncrement(); + posIncAtt.setPositionIncrement(posInc + initialPositionIncrement); + } return true; } @@ -98,7 +112,11 @@ public final class ConcatenatingTokenStream extends TokenStream { @Override public void end() throws IOException { sources[currentSource].end(); + int finalOffset = sourceOffsets[currentSource].endOffset() + offsetIncrement; + int finalPosInc = sourceIncrements[currentSource].getPositionIncrement(); super.end(); + offsetAtt.setOffset(finalOffset, finalOffset); + posIncAtt.setPositionIncrement(finalPosInc); } @Override @@ -107,6 +125,8 @@ public final class ConcatenatingTokenStream extends TokenStream { source.reset(); } super.reset(); + currentSource = 0; + offsetIncrement = 0; } @Override diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java index 258f9b8632f..19542e408fc 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java @@ -21,7 +21,9 @@ import java.io.IOException; import java.io.StringReader; import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.CannedTokenStream; import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; @@ -46,6 +48,33 @@ public class TestConcatenatingTokenStream extends BaseTokenStreamTestCase { new int[]{ 0, 6, 12, 19, 25, 31 }, new int[]{ 5, 11, 18, 24, 30, 36 }); + // test re-use + first.setReader(new StringReader("first words ")); + second.setReader(new StringReader("second words")); + third.setReader(new StringReader(" third words")); + assertTokenStreamContents(ts, + new String[] { "first", "words", "second", "words", "third", "words" }, + new int[]{ 0, 6, 12, 19, 25, 31 }, + new int[]{ 5, 11, 18, 24, 30, 36 }, + new int[]{ 1, 1, 1, 1, 1, 1 }); + + } + + public void testOffsetGaps() throws IOException { + CannedTokenStream cts1 = new CannedTokenStream(2, 10, + new Token("a", 0, 1), new Token("b", 2, 3)); + CannedTokenStream cts2 = new CannedTokenStream(2, 10, + new Token("c", 0, 1), new Token("d", 2, 3)); + + TokenStream ts = new ConcatenatingTokenStream(cts1, cts2); + assertTokenStreamContents(ts, + new String[] { "a", "b", "c", "d" }, + new int[]{ 0, 2, 10, 12 }, + new int[]{ 1, 3, 11, 13 }, + null, + new int[]{ 1, 1, 3, 1 }, + null, 20, 2, null, false, null + ); } public void testInconsistentAttributes() throws IOException {