mirror of https://github.com/apache/lucene.git
LUCENE-3742: fix token offset for hangs-off-end output in SynonymFilter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1238851 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
440b514452
commit
8e40ea5bf8
|
@ -181,7 +181,6 @@ Bug Fixes
|
|||
children (such docs will never match, but BJQ was tripping an
|
||||
assert if such a parent doc was the first doc in the segment).
|
||||
(Shay Banon, Mike McCandless)
|
||||
|
||||
* LUCENE-3609: Fix regression in BooleanFilter, introduced in Lucene 3.5,
|
||||
to correctly handle minShouldMatch behaviour of previous versions.
|
||||
(Shay Banon, Uwe Schindler)
|
||||
|
@ -194,6 +193,11 @@ Bug Fixes
|
|||
cover all tokens it had matched. (Koji Sekiguchi, Robert Muir,
|
||||
Mike McCandless)
|
||||
|
||||
* LUCENE-3742: When SynonymFilter has an output extending beyond the
|
||||
input tokens, it now sets the start and end offset to the same
|
||||
values for the last token (not 0, 0). (Robert Muir, Mike
|
||||
McCandless)
|
||||
|
||||
* LUCENE-3686: CategoryEnhancement must override Object.equals(Object).
|
||||
(Sivan Yogev via Shai Erera)
|
||||
|
||||
|
|
|
@ -290,6 +290,8 @@ public final class SynonymFilter extends TokenFilter {
|
|||
capture the state if no further tokens were checked. So
|
||||
caller must then forward state to our caller, or capture:
|
||||
*/
|
||||
private int lastStartOffset;
|
||||
private int lastEndOffset;
|
||||
|
||||
private void parse() throws IOException {
|
||||
//System.out.println("\nS: parse");
|
||||
|
@ -338,8 +340,8 @@ public final class SynonymFilter extends TokenFilter {
|
|||
buffer = termAtt.buffer();
|
||||
bufferLen = termAtt.length();
|
||||
final PendingInput input = futureInputs[nextWrite];
|
||||
input.startOffset = offsetAtt.startOffset();
|
||||
input.endOffset = offsetAtt.endOffset();
|
||||
lastStartOffset = input.startOffset = offsetAtt.startOffset();
|
||||
lastEndOffset = input.endOffset = offsetAtt.endOffset();
|
||||
inputEndOffset = input.endOffset;
|
||||
//System.out.println(" new token=" + new String(buffer, 0, bufferLen));
|
||||
if (nextRead != nextWrite) {
|
||||
|
@ -582,6 +584,8 @@ public final class SynonymFilter extends TokenFilter {
|
|||
nextWrite = nextRead = rollIncr(nextRead);
|
||||
}
|
||||
clearAttributes();
|
||||
// Keep offset from last input token:
|
||||
offsetAtt.setOffset(lastStartOffset, lastEndOffset);
|
||||
termAtt.copyBuffer(output.chars, output.offset, output.length);
|
||||
typeAtt.setType(TYPE_SYNONYM);
|
||||
//System.out.println(" set posIncr=" + outputs.posIncr + " outputs=" + outputs);
|
||||
|
|
|
@ -606,6 +606,32 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
|
|||
new String[] { "zoo", "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo", "zoo" },
|
||||
new int[] { 1, 0, 1, 0, 0, 1, 0, 1, 0, 1 });
|
||||
}
|
||||
|
||||
public void testOutputHangsOffEnd() throws Exception {
|
||||
b = new SynonymMap.Builder(true);
|
||||
final boolean keepOrig = false;
|
||||
// b hangs off the end (no input token under it):
|
||||
add("a", "a b", keepOrig);
|
||||
final SynonymMap map = b.build();
|
||||
tokensIn = new MockTokenizer(new StringReader("a"),
|
||||
MockTokenizer.WHITESPACE,
|
||||
true);
|
||||
tokensIn.reset();
|
||||
assertTrue(tokensIn.incrementToken());
|
||||
assertFalse(tokensIn.incrementToken());
|
||||
tokensIn.end();
|
||||
tokensIn.close();
|
||||
|
||||
tokensOut = new SynonymFilter(tokensIn,
|
||||
b.build(),
|
||||
true);
|
||||
termAtt = tokensOut.addAttribute(CharTermAttribute.class);
|
||||
posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
|
||||
offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
|
||||
|
||||
// Make sure endOffset inherits from previous input token:
|
||||
verify("a", "a b:1");
|
||||
}
|
||||
|
||||
public void testIncludeOrig() throws Exception {
|
||||
b = new SynonymMap.Builder(true);
|
||||
|
|
Loading…
Reference in New Issue