LUCENE-3742: fix token offset for hangs-off-end output in SynonymFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1238851 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2012-01-31 23:01:55 +00:00
parent 440b514452
commit 8e40ea5bf8
3 changed files with 37 additions and 3 deletions

View File

@ -181,7 +181,6 @@ Bug Fixes
children (such docs will never match, but BJQ was tripping an
assert if such a parent doc was the first doc in the segment).
(Shay Banon, Mike McCandless)
* LUCENE-3609: Fix regression in BooleanFilter, introduced in Lucene 3.5,
to correctly handle minShouldMatch behaviour of previous versions.
(Shay Banon, Uwe Schindler)
@ -194,6 +193,11 @@ Bug Fixes
cover all tokens it had matched. (Koji Sekiguchi, Robert Muir,
Mike McCandless)
* LUCENE-3742: When SynonymFilter has an output token extending beyond
  the input tokens, it now sets that token's start and end offsets to
  match those of the last input token (instead of 0, 0). (Robert Muir,
  Mike McCandless)
* LUCENE-3686: CategoryEnhancement must override Object.equals(Object).
(Sivan Yogev via Shai Erera)

View File

@ -290,6 +290,8 @@ public final class SynonymFilter extends TokenFilter {
capture the state if no further tokens were checked. So
caller must then forward state to our caller, or capture:
*/
private int lastStartOffset;
private int lastEndOffset;
private void parse() throws IOException {
//System.out.println("\nS: parse");
@ -338,8 +340,8 @@ public final class SynonymFilter extends TokenFilter {
buffer = termAtt.buffer();
bufferLen = termAtt.length();
final PendingInput input = futureInputs[nextWrite];
input.startOffset = offsetAtt.startOffset();
input.endOffset = offsetAtt.endOffset();
lastStartOffset = input.startOffset = offsetAtt.startOffset();
lastEndOffset = input.endOffset = offsetAtt.endOffset();
inputEndOffset = input.endOffset;
//System.out.println(" new token=" + new String(buffer, 0, bufferLen));
if (nextRead != nextWrite) {
@ -582,6 +584,8 @@ public final class SynonymFilter extends TokenFilter {
nextWrite = nextRead = rollIncr(nextRead);
}
clearAttributes();
// Keep offset from last input token:
offsetAtt.setOffset(lastStartOffset, lastEndOffset);
termAtt.copyBuffer(output.chars, output.offset, output.length);
typeAtt.setType(TYPE_SYNONYM);
//System.out.println(" set posIncr=" + outputs.posIncr + " outputs=" + outputs);

View File

@ -606,6 +606,32 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
new String[] { "zoo", "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo", "zoo" },
new int[] { 1, 0, 1, 0, 0, 1, 0, 1, 0, 1 });
}
// Verifies LUCENE-3742: an output token that "hangs off the end" (has no
// input token under it) must inherit the offsets of the last input token
// rather than getting (0, 0).
public void testOutputHangsOffEnd() throws Exception {
  b = new SynonymMap.Builder(true);
  final boolean keepOrig = false;
  // "b" hangs off the end (no input token under it):
  add("a", "a b", keepOrig);
  final SynonymMap map = b.build();
  tokensIn = new MockTokenizer(new StringReader("a"),
                               MockTokenizer.WHITESPACE,
                               true);
  tokensIn.reset();
  assertTrue(tokensIn.incrementToken());
  assertFalse(tokensIn.incrementToken());
  tokensIn.end();
  tokensIn.close();
  // Reuse the already-built map instead of calling b.build() a second time:
  tokensOut = new SynonymFilter(tokensIn,
                                map,
                                true);
  termAtt = tokensOut.addAttribute(CharTermAttribute.class);
  posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
  offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
  // Make sure the hanging output token's start/end offsets inherit from
  // the last input token:
  verify("a", "a b:1");
}
public void testIncludeOrig() throws Exception {
b = new SynonymMap.Builder(true);