LUCENE-3668: if there's only 1 output for a synonym rule then set start/endOffset to match the full span of the input tokens

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1228650 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2012-01-07 16:26:15 +00:00
parent 40b3b75a6e
commit ed9f0fd5ef
4 changed files with 123 additions and 11 deletions

CHANGES.txt

@@ -136,6 +136,14 @@ Bug Fixes
 * LUCENE-3609: Fix regression in BooleanFilter, introduced in Lucene 3.5,
   to correctly handle minShouldMatch behaviour of previous versions.
   (Shay Banon, Uwe Schindler)
 
+* LUCENE-3668: For a multi-token synonym mapping to a single token,
+  SynonymFilter will now set the start offset of the synonym token to
+  the start offset of the first matched token, and the end offset of
+  the synonym token to the end offset of the last matched token.
+  This way if the synonym token is used for highlighting, it will
+  cover all tokens it had matched.  (Koji Sekiguchi, Robert Muir,
+  Mike McCandless)
+
 Documentation

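Note (illustration, not part of this commit): the behavior described in the CHANGES entry can be seen with a small driver against the same SynonymMap/SynonymFilter APIs the tests below use. The class name SynonymOffsetDemo is hypothetical, and joining a multi-token rule input with SynonymMap.WORD_SEPARATOR is my assumption about the Builder API of this era; the expected offsets mirror the testMultiwordOffsets test added by this commit.

import java.io.StringReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.CharsRef;

// Hypothetical demo class, not part of the commit:
public class SynonymOffsetDemo {
  public static void main(String[] args) throws Exception {
    SynonymMap.Builder builder = new SynonymMap.Builder(true);
    // Multi-token input mapped to a single-token output, keepOrig=true;
    // words in a multi-token entry are separated by WORD_SEPARATOR:
    String sep = Character.toString(SynonymMap.WORD_SEPARATOR);
    builder.add(new CharsRef("national" + sep + "hockey" + sep + "league"),
                new CharsRef("nhl"), true);
    TokenStream ts = new SynonymFilter(
        new MockTokenizer(new StringReader("national hockey league"),
                          MockTokenizer.WHITESPACE, false),
        builder.build(), true);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // With this fix, "nhl" reports 0..22 (the full matched span)
      // instead of 0..8 (just "national"); the originals keep their
      // own offsets: national 0..8, hockey 9..15, league 16..22.
      System.out.println(termAtt.toString() + " "
          + offsetAtt.startOffset() + ".." + offsetAtt.endOffset());
    }
    ts.end();
    ts.close();
  }
}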
SynonymFilter.java

@@ -153,12 +153,15 @@ public final class SynonymFilter extends TokenFilter {
   // Holds pending output synonyms for one future position:
   private static class PendingOutputs {
     CharsRef[] outputs;
+    int[] endOffsets;
     int upto;
     int count;
     int posIncr = 1;
+    int lastEndOffset;
 
     public PendingOutputs() {
       outputs = new CharsRef[1];
+      endOffsets = new int[1];
     }
 
     public void reset() {
@@ -168,6 +171,7 @@ public final class SynonymFilter extends TokenFilter {
 
     public CharsRef pullNext() {
       assert upto < count;
+      lastEndOffset = endOffsets[upto];
       final CharsRef result = outputs[upto++];
       posIncr = 0;
       if (upto == count) {
@@ -176,16 +180,29 @@ public final class SynonymFilter extends TokenFilter {
       return result;
     }
 
-    public void add(char[] output, int offset, int len) {
+    public int getLastEndOffset() {
+      return lastEndOffset;
+    }
+
+    public void add(char[] output, int offset, int len, int endOffset) {
       if (count == outputs.length) {
         final CharsRef[] next = new CharsRef[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
         System.arraycopy(outputs, 0, next, 0, count);
         outputs = next;
       }
+      if (count == endOffsets.length) {
+        final int[] next = new int[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_INT)];
+        System.arraycopy(endOffsets, 0, next, 0, count);
+        endOffsets = next;
+      }
       if (outputs[count] == null) {
         outputs[count] = new CharsRef();
       }
       outputs[count].copyChars(output, offset, len);
+      // endOffset can be -1, in which case we should simply
+      // use the endOffset of the input token, or X >= 0, in
+      // which case we use X as the endOffset for this output
+      endOffsets[count] = endOffset;
       count++;
     }
   };
@@ -281,6 +298,7 @@ public final class SynonymFilter extends TokenFilter {
     // Holds the longest match we've seen so far:
     BytesRef matchOutput = null;
     int matchInputLength = 0;
+    int matchEndOffset = -1;
 
     BytesRef pendingOutput = fst.outputs.getNoOutput();
     fst.getFirstArc(scratchArc);
@@ -297,6 +315,8 @@ public final class SynonymFilter extends TokenFilter {
       final int bufferLen;
       //System.out.println("  cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);
 
+      int inputEndOffset = 0;
+
       if (curNextRead == nextWrite) {
         // We used up our lookahead buffer of input tokens
@@ -317,6 +337,7 @@ public final class SynonymFilter extends TokenFilter {
           final PendingInput input = futureInputs[nextWrite];
           input.startOffset = offsetAtt.startOffset();
           input.endOffset = offsetAtt.endOffset();
+          inputEndOffset = input.endOffset;
           //System.out.println("  new token=" + new String(buffer, 0, bufferLen));
           if (nextRead != nextWrite) {
             capture();
@@ -335,6 +356,7 @@ public final class SynonymFilter extends TokenFilter {
         // Still in our lookahead
         buffer = futureInputs[curNextRead].term.chars;
         bufferLen = futureInputs[curNextRead].term.length;
+        inputEndOffset = futureInputs[curNextRead].endOffset;
         //System.out.println("  old token=" + new String(buffer, 0, bufferLen));
       }
@@ -360,6 +382,7 @@ public final class SynonymFilter extends TokenFilter {
         if (scratchArc.isFinal()) {
           matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
           matchInputLength = tokenCount;
+          matchEndOffset = inputEndOffset;
           //System.out.println("  found matchLength=" + matchInputLength + " output=" + matchOutput);
         }
@@ -390,7 +413,7 @@ public final class SynonymFilter extends TokenFilter {
     if (matchOutput != null) {
       //System.out.println("  add matchLength=" + matchInputLength + " output=" + matchOutput);
       inputSkipCount = matchInputLength;
-      addOutput(matchOutput, matchInputLength);
+      addOutput(matchOutput, matchInputLength, matchEndOffset);
     } else if (nextRead != nextWrite) {
       // Even though we had no match here, we set to 1
       // because we need to skip current input token before
@@ -404,7 +427,7 @@ public final class SynonymFilter extends TokenFilter {
   }
 
   // Interleaves all output tokens onto the futureOutputs:
-  private void addOutput(BytesRef bytes, int matchInputLength) {
+  private void addOutput(BytesRef bytes, int matchInputLength, int matchEndOffset) {
     bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);
 
     final int code = bytesReader.readVInt();
@@ -425,7 +448,21 @@ public final class SynonymFilter extends TokenFilter {
           // Caller is not allowed to have empty string in
           // the output:
           assert outputLen > 0: "output contains empty string: " + scratchChars;
-          futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen);
+          final int endOffset;
+          if (chIDX == chEnd && lastStart == scratchChars.offset) {
+            // This rule had a single output token, so, we set
+            // this output's endOffset to the current
+            // endOffset (ie, endOffset of the last input
+            // token it matched):
+            endOffset = matchEndOffset;
+          } else {
+            // This rule has more than one output token; we
+            // can't pick any particular endOffset for this
+            // case, so, we inherit the endOffset for the
+            // input token which this output overlaps:
+            endOffset = -1;
+          }
+          futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen, endOffset);
           //System.out.println("    " + new String(scratchChars.chars, lastStart, outputLen) + " outputUpto=" + outputUpto);
           lastStart = 1+chIDX;
           //System.out.println("  slot=" + outputUpto + " keepOrig=" + keepOrig);
@@ -507,7 +544,11 @@ public final class SynonymFilter extends TokenFilter {
           clearAttributes();
           termAtt.copyBuffer(output.chars, output.offset, output.length);
           typeAtt.setType(TYPE_SYNONYM);
-          offsetAtt.setOffset(input.startOffset, input.endOffset);
+          int endOffset = outputs.getLastEndOffset();
+          if (endOffset == -1) {
+            endOffset = input.endOffset;
+          }
+          offsetAtt.setOffset(input.startOffset, endOffset);
           posIncrAtt.setPositionIncrement(posIncr);
           if (outputs.count == 0) {
             // Done with the buffered input and all outputs at

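Note (illustrative restatement, not part of this commit): the whole SynonymFilter change rides on one sentinel convention. Each pending output records either a concrete end offset (a rule with a single output token gets the end offset of the last input token it matched) or -1 (a rule with multiple output tokens inherits the end offset of whichever input token each output overlaps). The helper below, with a hypothetical name, states that rule in isolation; it is exactly the fallback added in the last hunk above, where getLastEndOffset() == -1 falls back to input.endOffset.

// Hypothetical helper restating the sentinel rule used by
// PendingOutputs.add()/getLastEndOffset() above:
final class EndOffsetRule {
  static int resolve(int storedEndOffset, int inputTokenEndOffset) {
    // -1 means "no explicit end offset was recorded for this output";
    // fall back to the end offset of the overlapped input token:
    return storedEndOffset == -1 ? inputTokenEndOffset : storedEndOffset;
  }
}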
TestSynonymMapFilter.java

@@ -59,7 +59,12 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
     }
   }
 
-  // todo: we should probably refactor this guy to use/take analyzer,
+  // For the output string: separate positions with a space,
+  // and separate multiple tokens at each position with a
+  // /.  If a token should have end offset != the input
+  // token's end offset then add :X to it:
+
+  // TODO: we should probably refactor this guy to use/take analyzer,
   // the tests are a little messy
   private void verify(String input, String output) throws Exception {
     if (VERBOSE) {
@@ -73,7 +78,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
 
     while(tokensOut.incrementToken()) {
       if (VERBOSE) {
-        System.out.println("  incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement());
+        System.out.println("  incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement() + " startOff=" + offsetAtt.startOffset() + " endOff=" + offsetAtt.endOffset());
       }
 
       assertTrue(expectedUpto < expected.length);
@@ -85,16 +90,26 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
         if (atPos > 0) {
           assertTrue(tokensOut.incrementToken());
           if (VERBOSE) {
-            System.out.println("  incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement());
+            System.out.println("  incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement() + " startOff=" + offsetAtt.startOffset() + " endOff=" + offsetAtt.endOffset());
           }
         }
-        assertEquals(termAtt, expectedAtPos[atPos]);
+        final int colonIndex = expectedAtPos[atPos].indexOf(':');
+        final String expectedToken;
+        final int expectedEndOffset;
+        if (colonIndex != -1) {
+          expectedToken = expectedAtPos[atPos].substring(0, colonIndex);
+          expectedEndOffset = Integer.parseInt(expectedAtPos[atPos].substring(1+colonIndex));
+        } else {
+          expectedToken = expectedAtPos[atPos];
+          expectedEndOffset = endOffset;
+        }
+        assertEquals(expectedToken, termAtt.toString());
         assertEquals(atPos == 0 ? 1 : 0,
                      posIncrAtt.getPositionIncrement());
         // start/end offset of all tokens at same pos should
         // be the same:
         assertEquals(startOffset, offsetAtt.startOffset());
-        assertEquals(endOffset, offsetAtt.endOffset());
+        assertEquals(expectedEndOffset, offsetAtt.endOffset());
       }
     }
 
     tokensOut.end();
@@ -112,6 +127,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
     add("b c", "dog collar", true);
     add("c d", "dog harness holder extras", true);
     add("m c e", "dog barks loudly", false);
+    add("i j k", "feep", true);
 
     add("e f", "foo bar", false);
     add("e f", "baz bee", false);
@@ -148,6 +164,9 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
     // two outputs for same input
     verify("e f", "foo/baz bar/bee");
 
+    // verify multi-word / single-output offsets:
+    verify("g i j k g", "g i/feep:7 j k g");
+
     // mixed keepOrig true/false:
     verify("a m c e x", "a/foo dog barks loudly x");
     verify("c d m c e x", "c/dog d/harness holder/dog extras/barks loudly x");
@@ -241,6 +260,10 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
           } else {
             outputs[matchIDX] = outputs[matchIDX] + "/" + synOutputs[synUpto++];
           }
+          if (synOutputs.length == 1) {
+            // Add endOffset
+            outputs[matchIDX] = outputs[matchIDX] + ":" + ((inputIDX*2) + syn.in.length());
+          }
         }
       }
     }
@@ -663,4 +686,24 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
       new String[] { "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" },
       new int[] { 1, 0, 1, 1, 1, 0, 1 });
   }
+
+  public void testMultiwordOffsets() throws Exception {
+    b = new SynonymMap.Builder(true);
+    final boolean keepOrig = true;
+    add("national hockey league", "nhl", keepOrig);
+    final SynonymMap map = b.build();
+
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
+      }
+    };
+
+    assertAnalyzesTo(a, "national hockey league",
+                     new String[] { "national", "nhl", "hockey", "league" },
+                     new int[] { 0, 0, 9, 16 },
+                     new int[] { 8, 22, 15, 22 },
+                     new int[] { 1, 0, 1, 1 });
+  }
 }

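Note (explanatory, not part of this commit): in verify()'s expectation mini-syntax, the new assertion verify("g i j k g", "g i/feep:7 j k g") reads as five positions; position 1 holds both the original token i and the synonym feep, and the :7 suffix pins feep's end offset to 7, the end offset of k, the last input token matched by the rule "i j k => feep". A standalone sketch of the same ":endOffset" parsing follows; the class and method names are mine, not from the commit.

// Hypothetical standalone version of the expectation parsing the
// commit adds to verify() above ("term" or "term:endOffset"):
public class ExpectedTokenSyntax {
  public static void main(String[] args) {
    for (String expected : new String[] { "feep:7", "i" }) {
      final int colonIndex = expected.indexOf(':');
      final String token = colonIndex == -1 ? expected : expected.substring(0, colonIndex);
      // No ":X" suffix means the token is expected to keep the end
      // offset of the input token at its position:
      final String endOffset = colonIndex == -1 ? "(input token's)" : expected.substring(1 + colonIndex);
      // Prints: feep -> end offset 7, then: i -> end offset (input token's)
      System.out.println(token + " -> end offset " + endOffset);
    }
  }
}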
TestSynonymFilterFactory.java

@@ -21,6 +21,7 @@ import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.StringReader;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -65,6 +66,25 @@ public class TestSynonymFilterFactory extends BaseTokenTestCase {
       new int[] { 1, 0, 0, 0 });
   }
 
+  /** test multiword offsets with the old impl
+   * @deprecated Remove this test in Lucene 5.0 */
+  @Deprecated
+  public void testMultiwordOffsetsOld() throws Exception {
+    SynonymFilterFactory factory = new SynonymFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("luceneMatchVersion", Version.LUCENE_33.toString());
+    args.put("synonyms", "synonyms.txt");
+    factory.init(args);
+    factory.inform(new StringMockSolrResourceLoader("national hockey league, nhl"));
+    TokenStream ts = factory.create(new MockTokenizer(new StringReader("national hockey league"), MockTokenizer.WHITESPACE, false));
+    // WTF?
+    assertTokenStreamContents(ts,
+      new String[] { "national", "nhl", "hockey", "league" },
+      new int[] { 0, 0, 0, 0 },
+      new int[] { 22, 22, 22, 22 },
+      new int[] { 1, 0, 1, 1 });
+  }
+
   /** if the synonyms are completely empty, test that we still analyze correctly */
   public void testEmptySynonyms() throws Exception {
     SynonymFilterFactory factory = new SynonymFilterFactory();
@@ -85,7 +105,7 @@ public class TestSynonymFilterFactory extends BaseTokenTestCase {
     }
 
     public List<String> getLines(String resource) throws IOException {
-      return null;
+      return Arrays.asList(text.split("\n"));
     }
 
     public Object newInstance(String cname, String... subpackages) {