LUCENE-8650: Fix end() and reset() in ConcatenatingTokenStream

This commit is contained in:
Alan Woodward 2019-01-22 09:19:48 +00:00
parent f543b4e1f4
commit 7713a4f245
3 changed files with 53 additions and 0 deletions

View File

@ -286,6 +286,10 @@ Bug fixes:
* LUCENE-8654: Polygon2D#relateTriangle returns the wrong answer if polygon is inside
the triangle. (Ignacio Vera)
* LUCENE-8650: ConcatenatingTokenStream did not correctly clear its state in reset(), and
was not propagating final position increments from its child streams correctly.
(Dan Meehl, Alan Woodward)
New Features
* LUCENE-8026: ExitableDirectoryReader may now time out queries that run on

View File

@ -22,6 +22,7 @@ import java.util.Iterator;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Attribute; import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
@ -39,10 +40,13 @@ public final class ConcatenatingTokenStream extends TokenStream {
private final TokenStream[] sources; private final TokenStream[] sources;
private final OffsetAttribute[] sourceOffsets; private final OffsetAttribute[] sourceOffsets;
private final PositionIncrementAttribute[] sourceIncrements;
private final OffsetAttribute offsetAtt; private final OffsetAttribute offsetAtt;
private final PositionIncrementAttribute posIncAtt;
private int currentSource; private int currentSource;
private int offsetIncrement; private int offsetIncrement;
private int initialPositionIncrement = 1;
/** /**
* Create a new ConcatenatingTokenStream from a set of inputs * Create a new ConcatenatingTokenStream from a set of inputs
@ -52,9 +56,12 @@ public final class ConcatenatingTokenStream extends TokenStream {
super(combineSources(sources)); super(combineSources(sources));
this.sources = sources; this.sources = sources;
this.offsetAtt = addAttribute(OffsetAttribute.class); this.offsetAtt = addAttribute(OffsetAttribute.class);
this.posIncAtt = addAttribute(PositionIncrementAttribute.class);
this.sourceOffsets = new OffsetAttribute[sources.length]; this.sourceOffsets = new OffsetAttribute[sources.length];
this.sourceIncrements = new PositionIncrementAttribute[sources.length];
for (int i = 0; i < sources.length; i++) { for (int i = 0; i < sources.length; i++) {
this.sourceOffsets[i] = sources[i].addAttribute(OffsetAttribute.class); this.sourceOffsets[i] = sources[i].addAttribute(OffsetAttribute.class);
this.sourceIncrements[i] = sources[i].addAttribute(PositionIncrementAttribute.class);
} }
} }
@ -78,19 +85,26 @@ public final class ConcatenatingTokenStream extends TokenStream {
@Override @Override
public boolean incrementToken() throws IOException { public boolean incrementToken() throws IOException {
boolean newSource = false;
while (sources[currentSource].incrementToken() == false) { while (sources[currentSource].incrementToken() == false) {
if (currentSource >= sources.length - 1) if (currentSource >= sources.length - 1)
return false; return false;
sources[currentSource].end(); sources[currentSource].end();
initialPositionIncrement = sourceIncrements[currentSource].getPositionIncrement();
OffsetAttribute att = sourceOffsets[currentSource]; OffsetAttribute att = sourceOffsets[currentSource];
if (att != null) if (att != null)
offsetIncrement += att.endOffset(); offsetIncrement += att.endOffset();
currentSource++; currentSource++;
newSource = true;
} }
clearAttributes(); clearAttributes();
sources[currentSource].copyTo(this); sources[currentSource].copyTo(this);
offsetAtt.setOffset(offsetAtt.startOffset() + offsetIncrement, offsetAtt.endOffset() + offsetIncrement); offsetAtt.setOffset(offsetAtt.startOffset() + offsetIncrement, offsetAtt.endOffset() + offsetIncrement);
if (newSource) {
int posInc = posIncAtt.getPositionIncrement();
posIncAtt.setPositionIncrement(posInc + initialPositionIncrement);
}
return true; return true;
} }
@ -98,7 +112,11 @@ public final class ConcatenatingTokenStream extends TokenStream {
@Override @Override
public void end() throws IOException { public void end() throws IOException {
sources[currentSource].end(); sources[currentSource].end();
int finalOffset = sourceOffsets[currentSource].endOffset() + offsetIncrement;
int finalPosInc = sourceIncrements[currentSource].getPositionIncrement();
super.end(); super.end();
offsetAtt.setOffset(finalOffset, finalOffset);
posIncAtt.setPositionIncrement(finalPosInc);
} }
@Override @Override
@ -107,6 +125,8 @@ public final class ConcatenatingTokenStream extends TokenStream {
source.reset(); source.reset();
} }
super.reset(); super.reset();
currentSource = 0;
offsetIncrement = 0;
} }
@Override @Override

View File

@ -21,7 +21,9 @@ import java.io.IOException;
import java.io.StringReader; import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
@ -46,6 +48,33 @@ public class TestConcatenatingTokenStream extends BaseTokenStreamTestCase {
new int[]{ 0, 6, 12, 19, 25, 31 }, new int[]{ 0, 6, 12, 19, 25, 31 },
new int[]{ 5, 11, 18, 24, 30, 36 }); new int[]{ 5, 11, 18, 24, 30, 36 });
// test re-use
first.setReader(new StringReader("first words "));
second.setReader(new StringReader("second words"));
third.setReader(new StringReader(" third words"));
assertTokenStreamContents(ts,
new String[] { "first", "words", "second", "words", "third", "words" },
new int[]{ 0, 6, 12, 19, 25, 31 },
new int[]{ 5, 11, 18, 24, 30, 36 },
new int[]{ 1, 1, 1, 1, 1, 1 });
}
public void testOffsetGaps() throws IOException {
CannedTokenStream cts1 = new CannedTokenStream(2, 10,
new Token("a", 0, 1), new Token("b", 2, 3));
CannedTokenStream cts2 = new CannedTokenStream(2, 10,
new Token("c", 0, 1), new Token("d", 2, 3));
TokenStream ts = new ConcatenatingTokenStream(cts1, cts2);
assertTokenStreamContents(ts,
new String[] { "a", "b", "c", "d" },
new int[]{ 0, 2, 10, 12 },
new int[]{ 1, 3, 11, 13 },
null,
new int[]{ 1, 1, 3, 1 },
null, 20, 2, null, false, null
);
} }
public void testInconsistentAttributes() throws IOException { public void testInconsistentAttributes() throws IOException {