LUCENE-3742: fix token offset for hangs-off-end output in SynonymFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1238851 13f79535-47bb-0310-9956-ffa450edef68
2012-01-31 23:01:55 +00:00 · 2012-01-31 23:01:55 +00:00 · 8e40ea5bf8
parent 440b514452
commit 8e40ea5bf8
3 changed files with 37 additions and 3 deletions
--- a/lucene/contrib/CHANGES.txt
+++ b/lucene/contrib/CHANGES.txt
@ -181,7 +181,6 @@ Bug Fixes
   children (such docs will never match, but BJQ was tripping an
   assert if such a parent doc was the first doc in the segment).
   (Shay Banon, Mike McCandless)
-
 * LUCENE-3609: Fix regression in BooleanFilter, introduced in Lucene 3.5,
   to correctly handle minShouldMatch behaviour of previous versions.
   (Shay Banon, Uwe Schindler)
@ -194,6 +193,11 @@ Bug Fixes
   cover all tokens it had matched.  (Koji Sekiguchi, Robert Muir,
   Mike McCandless)

+ * LUCENE-3742: When SynonymFilter has an output extending beyond the
+   input tokens, it now sets that output's start and end offsets to
+   those of the last input token (not 0, 0).  (Robert Muir, Mike
+   McCandless)
+
 * LUCENE-3686: CategoryEnhancement must override Object.equals(Object).
   (Sivan Yogev via Shai Erera)

--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java
@ -290,6 +290,8 @@ public final class SynonymFilter extends TokenFilter {
   capture the state if no further tokens were checked.  So
   caller must then forward state to our caller, or capture:
  */
+  private int lastStartOffset;
+  private int lastEndOffset;

  private void parse() throws IOException {
    //System.out.println("\nS: parse");
@ -338,8 +340,8 @@ public final class SynonymFilter extends TokenFilter {
            buffer = termAtt.buffer();
            bufferLen = termAtt.length();
            final PendingInput input = futureInputs[nextWrite];
-            input.startOffset = offsetAtt.startOffset();
-            input.endOffset = offsetAtt.endOffset();
+            lastStartOffset = input.startOffset = offsetAtt.startOffset();
+            lastEndOffset = input.endOffset = offsetAtt.endOffset();
            inputEndOffset = input.endOffset;
            //System.out.println("  new token=" + new String(buffer, 0, bufferLen));
            if (nextRead != nextWrite) {
@ -582,6 +584,8 @@ public final class SynonymFilter extends TokenFilter {
            nextWrite = nextRead = rollIncr(nextRead);
          }
          clearAttributes();
+          // Keep offset from last input token:
+          offsetAtt.setOffset(lastStartOffset, lastEndOffset);
          termAtt.copyBuffer(output.chars, output.offset, output.length);
          typeAtt.setType(TYPE_SYNONYM);
          //System.out.println("  set posIncr=" + outputs.posIncr + " outputs=" + outputs);
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java
@ -607,6 +607,32 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
        new int[] { 1, 0, 1, 0, 0, 1, 0, 1, 0, 1 });
  }

+  // Regression test for LUCENE-3742: the rule "a" -> "a b" applied to the
+  // single-token input "a" produces an output "b" with no input token
+  // beneath it.
+  public void testOutputHangsOffEnd() throws Exception {
+    b = new SynonymMap.Builder(true);
+    final boolean keepOrig = false;
+    // b hangs off the end (no input token under it):
+    add("a", "a b", keepOrig);
+    final SynonymMap map = b.build();
+    tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
+    // Sanity check: the input really is exactly one token.
+    tokensIn.reset();
+    assertTrue(tokensIn.incrementToken());
+    assertFalse(tokensIn.incrementToken());
+    tokensIn.end();
+    tokensIn.close();
+
+    tokensOut = new SynonymFilter(tokensIn, map, true);
+    termAtt = tokensOut.addAttribute(CharTermAttribute.class);
+    posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
+    offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
+
+    // Make sure endOffset inherits from previous input token:
+    verify("a", "a b:1");
+  }
+  
  public void testIncludeOrig() throws Exception {
    b = new SynonymMap.Builder(true);
    final boolean keepOrig = true;