From 8e40ea5bf8475d2f84c564c45a072863224249e2 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Tue, 31 Jan 2012 23:01:55 +0000 Subject: [PATCH] LUCENE-3742: fix token offset for hangs-off-end output in SynonymFilter git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1238851 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/contrib/CHANGES.txt | 6 ++++- .../analysis/synonym/SynonymFilter.java | 8 ++++-- .../synonym/TestSynonymMapFilter.java | 26 +++++++++++++++++++ 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index 3c646719681..93e1dce5d89 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -181,7 +181,6 @@ Bug Fixes children (such docs will never match, but BJQ was tripping an assert if such a parent doc was the first doc in the segment). (Shay Banon, Mike McCandless) - * LUCENE-3609: Fix regression in BooleanFilter, introduced in Lucene 3.5, to correctly handle minShouldMatch behaviour of previous versions. (Shay Banon, Uwe Schindler) @@ -194,6 +193,11 @@ Bug Fixes cover all tokens it had matched. (Koji Sekiguchi, Robert Muir, Mike McCandless) + * LUCENE-3742: When SynonymFilter has an output extending beyond the + input tokens, it now sets the start and end offset to the same + values for the last token (not 0, 0). (Robert Muir, Mike + McCandless) + * LUCENE-3686: CategoryEnhancement must override Object.equals(Object). (Sivan Yogev via Shai Erera) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java index 30bad751566..be2ce6ea9d7 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java @@ -290,6 +290,8 @@ public final class SynonymFilter extends TokenFilter { capture the state if no further tokens were checked. So caller must then forward state to our caller, or capture: */ + private int lastStartOffset; + private int lastEndOffset; private void parse() throws IOException { //System.out.println("\nS: parse"); @@ -338,8 +340,8 @@ public final class SynonymFilter extends TokenFilter { buffer = termAtt.buffer(); bufferLen = termAtt.length(); final PendingInput input = futureInputs[nextWrite]; - input.startOffset = offsetAtt.startOffset(); - input.endOffset = offsetAtt.endOffset(); + lastStartOffset = input.startOffset = offsetAtt.startOffset(); + lastEndOffset = input.endOffset = offsetAtt.endOffset(); inputEndOffset = input.endOffset; //System.out.println(" new token=" + new String(buffer, 0, bufferLen)); if (nextRead != nextWrite) { @@ -582,6 +584,8 @@ public final class SynonymFilter extends TokenFilter { nextWrite = nextRead = rollIncr(nextRead); } clearAttributes(); + // Keep offset from last input token: + offsetAtt.setOffset(lastStartOffset, lastEndOffset); termAtt.copyBuffer(output.chars, output.offset, output.length); typeAtt.setType(TYPE_SYNONYM); //System.out.println(" set posIncr=" + outputs.posIncr + " outputs=" + outputs); diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java index cbe5c2d1576..5c07a81f92a 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java @@ -606,6 +606,32 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase { new String[] { "zoo", "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo", "zoo" }, new int[] { 1, 0, 1, 0, 0, 1, 0, 1, 0, 1 }); } + + public void testOutputHangsOffEnd() throws Exception { + b = new SynonymMap.Builder(true); + final boolean keepOrig = false; + // b hangs off the end (no input token under it): + add("a", "a b", keepOrig); + final SynonymMap map = b.build(); + tokensIn = new MockTokenizer(new StringReader("a"), + MockTokenizer.WHITESPACE, + true); + tokensIn.reset(); + assertTrue(tokensIn.incrementToken()); + assertFalse(tokensIn.incrementToken()); + tokensIn.end(); + tokensIn.close(); + + tokensOut = new SynonymFilter(tokensIn, + b.build(), + true); + termAtt = tokensOut.addAttribute(CharTermAttribute.class); + posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class); + offsetAtt = tokensOut.addAttribute(OffsetAttribute.class); + + // Make sure endOffset inherits from previous input token: + verify("a", "a b:1"); + } public void testIncludeOrig() throws Exception { b = new SynonymMap.Builder(true);