From 7eab19aff7ea89e057252ec695b7036dc9ba631e Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Tue, 16 Aug 2011 16:01:05 +0000
Subject: [PATCH] LUCENE-3375: fix synonyms bug where keepOrig=false would
 discard unmatched inputs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1158342 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/contrib/CHANGES.txt                    |   4 +-
 .../analysis/synonym/SynonymFilter.java       |  16 +-
 .../synonym/TestSynonymMapFilter.java         | 284 +++++++++++++++++-
 3 files changed, 293 insertions(+), 11 deletions(-)

diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt
index c56159af27d..f6db582cd0d 100644
--- a/lucene/contrib/CHANGES.txt
+++ b/lucene/contrib/CHANGES.txt
@@ -78,9 +78,9 @@ New Features
    documents must be indexed as a document block, using
    IndexWriter.add/UpdateDocuments (Mark Harwood, Mike McCandless)
 
- * LUCENE-3233: Added SynonymFilter for applying multi-word synonyms
+ * LUCENE-3233, LUCENE-3375: Added SynonymFilter for applying multi-word synonyms
    during indexing or querying (with parsers for wordnet and solr formats).
-   Removed contrib/wordnet.  (Robert Muir, Mike McCandless)
+   Removed contrib/wordnet.  (Simon Rosenthal, Robert Muir, Mike McCandless)
 
 API Changes
 
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java
index 64827c821c4..984298458d1 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java
@@ -132,6 +132,7 @@ public final class SynonymFilter extends TokenFilter {
     final CharsRef term = new CharsRef();
     AttributeSource.State state;
     boolean keepOrig;
+    boolean matched;
     boolean consumed = true;
     int startOffset;
     int endOffset;
@@ -140,6 +141,7 @@ public final class SynonymFilter extends TokenFilter {
       state = null;
       consumed = true;
       keepOrig = false;
+      matched = false;
     }
   };
 
@@ -388,7 +390,7 @@ public final class SynonymFilter extends TokenFilter {
     if (matchOutput != null) {
       //System.out.println("  add matchLength=" + matchInputLength + " output=" + matchOutput);
       inputSkipCount = matchInputLength;
-      addOutput(matchOutput);
+      addOutput(matchOutput, matchInputLength);
     } else if (nextRead != nextWrite) {
       // Even though we had no match here, we set to 1
       // because we need to skip current input token before
@@ -402,7 +404,7 @@ public final class SynonymFilter extends TokenFilter {
   }
 
   // Interleaves all output tokens onto the futureOutputs:
-  private void addOutput(BytesRef bytes) {
+  private void addOutput(BytesRef bytes, int matchInputLength) {
     bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);
 
     final int code = bytesReader.readVInt();
@@ -426,13 +428,19 @@ public final class SynonymFilter extends TokenFilter {
           futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen);
           //System.out.println("      " + new String(scratchChars.chars, lastStart, outputLen) + " outputUpto=" + outputUpto);
           lastStart = 1+chIDX;
-          futureInputs[outputUpto].keepOrig |= keepOrig;
           //System.out.println("  slot=" + outputUpto + " keepOrig=" + keepOrig);
           outputUpto = rollIncr(outputUpto);
           assert futureOutputs[outputUpto].posIncr == 1: "outputUpto=" + outputUpto + " vs nextWrite=" + nextWrite;
         }
       }
     }
+
+    int upto = nextRead;
+    for(int idx=0;idx<matchInputLength;idx++) {
+      futureInputs[upto].keepOrig |= keepOrig;
+      futureInputs[upto].matched = true;
+      upto = rollIncr(upto);
+    }
   }
 
   // ++ mod rollBufferSize
@@ -471,7 +479,7 @@ public final class SynonymFilter extends TokenFilter {
         
         //System.out.println("  cycle nextRead=" + nextRead + " nextWrite=" + nextWrite + " inputSkipCount="+ inputSkipCount + " input.keepOrig=" + input.keepOrig + " input.consumed=" + input.consumed + " input.state=" + input.state);
 
-        if (!input.consumed && (input.keepOrig || outputs.count == 0)) {
+        if (!input.consumed && (input.keepOrig || !input.matched)) {
           if (input.state != null) {
             // Return a previously saved token (because we
             // had to lookahead):
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java
index ba1b23f5c6b..93f323a2df9 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java
@@ -29,6 +29,7 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.*;
@@ -59,6 +60,8 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
     }
   }
 
+  // todo: we should probably refactor this guy to use/take analyzer,
+  // the tests are a little messy
   private void verify(String input, String output) throws Exception {
     if (VERBOSE) {
       System.out.println("TEST: verify input=" + input + " expectedOutput=" + output);
@@ -148,7 +151,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
 
     // mixed keepOrig true/false:
     verify("a m c e x", "a/foo dog barks loudly x");
-    verify("c d m c e x", "c/dog d/harness m/holder/dog c/extras/barks loudly x");
+    verify("c d m c e x", "c/dog d/harness holder/dog extras/barks loudly x");
     assertTrue(tokensOut.getCaptureCount() > 0);
 
     // no captureStates when no syns matched
@@ -181,6 +184,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
     assertTrue(doc.length() % 2 == 0);
     final int numInputs = doc.length()/2;
     boolean[] keepOrigs = new boolean[numInputs];
+    boolean[] hasMatch = new boolean[numInputs];
     Arrays.fill(keepOrigs, false);
     String[] outputs = new String[numInputs + maxOutputLength];
     OneSyn[] matches = new OneSyn[numInputs];
@@ -223,6 +227,10 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
       if (syn == null) {
         continue;
       }
+      for(int idx=0;idx<(1+syn.in.length())/2;idx++) {
+        hasMatch[inputIDX+idx] = true;
+        keepOrigs[inputIDX+idx] |= syn.keepOrig;
+      }
       for(String synOut : syn.out) {
         final String[] synOutputs = synOut.split(" ");
         assertEquals(synOutputs.length, (1+synOut.length())/2);
@@ -234,9 +242,6 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
           } else {
             outputs[matchIDX] = outputs[matchIDX] + "/" + synOutputs[synUpto++];
           }
-          if (matchIDX < numInputs) {
-            keepOrigs[matchIDX] |= syn.keepOrig;
-          }
         }
       }
     }
@@ -249,7 +254,8 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
       if (inputIDX >= numInputs && outputs[inputIDX] == null) {
         break;
       }
-      if (inputIDX < numInputs && (outputs[inputIDX] == null || keepOrigs[inputIDX])) {
+      if (inputIDX < numInputs && (!hasMatch[inputIDX] || keepOrigs[inputIDX])) {
+        assertTrue(inputTokens[inputIDX].length() != 0);
         sb.append(inputTokens[inputIDX]);
         posHasOutput = true;
       }
@@ -259,6 +265,8 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
           sb.append('/');
         }
         sb.append(outputs[inputIDX]);
+      } else if (!posHasOutput) {
+        continue;
       }
       if (inputIDX < limit-1) {
         sb.append(' ');
@@ -390,4 +398,270 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
       checkRandomData(random, analyzer, 1000*RANDOM_MULTIPLIER);
     }
   }
+  
+  // LUCENE-3375
+  public void testVanishingTerms() throws Exception {
+    String testFile = 
+      "aaa => aaaa1 aaaa2 aaaa3\n" + 
+      "bbb => bbbb1 bbbb2\n";
+      
+    SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random));
+    parser.add(new StringReader(testFile));
+    final SynonymMap map = parser.build();
+      
+    Analyzer analyzer = new ReusableAnalyzerBase() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
+        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
+      }
+    };
+    
+    // where did my pot go?!
+    assertAnalyzesTo(analyzer, "xyzzy bbb pot of gold",
+                     new String[] { "xyzzy", "bbbb1", "pot", "bbbb2", "of", "gold" });
+    
+    // this one nukes 'pot' and 'of'
+    // xyzzy aaa pot of gold -> xyzzy aaaa1 aaaa2 aaaa3 gold
+    assertAnalyzesTo(analyzer, "xyzzy aaa pot of gold",
+                     new String[] { "xyzzy", "aaaa1", "pot", "aaaa2", "of", "aaaa3", "gold" });
+  }
+
+  public void testBasic2() throws Exception {
+    b = new SynonymMap.Builder(true);
+    final boolean keepOrig = false;
+    add("aaa", "aaaa1 aaaa2 aaaa3", keepOrig);
+    add("bbb", "bbbb1 bbbb2", keepOrig);
+    tokensIn = new MockTokenizer(new StringReader("a"),
+                                 MockTokenizer.WHITESPACE,
+                                 true);
+    tokensIn.reset();
+    assertTrue(tokensIn.incrementToken());
+    assertFalse(tokensIn.incrementToken());
+    tokensIn.end();
+    tokensIn.close();
+
+    tokensOut = new SynonymFilter(tokensIn,
+                                     b.build(),
+                                     true);
+    termAtt = tokensOut.addAttribute(CharTermAttribute.class);
+    posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
+    offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
+
+    if (keepOrig) {
+      verify("xyzzy bbb pot of gold", "xyzzy bbb/bbbb1 pot/bbbb2 of gold");
+      verify("xyzzy aaa pot of gold", "xyzzy aaa/aaaa1 pot/aaaa2 of/aaaa3 gold");
+    } else {
+      verify("xyzzy bbb pot of gold", "xyzzy bbbb1 pot/bbbb2 of gold");
+      verify("xyzzy aaa pot of gold", "xyzzy aaaa1 pot/aaaa2 of/aaaa3 gold");
+    }
+  }
+  
+  public void testMatching() throws Exception {
+    b = new SynonymMap.Builder(true);
+    final boolean keepOrig = false;
+    add("a b", "ab", keepOrig);
+    add("a c", "ac", keepOrig);
+    add("a", "aa", keepOrig);
+    add("b", "bb", keepOrig);
+    add("z x c v", "zxcv", keepOrig);
+    add("x c", "xc", keepOrig);
+    final SynonymMap map = b.build();
+    Analyzer a = new ReusableAnalyzerBase() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
+      }
+    };
+
+    checkOneTerm(a, "$", "$");
+    checkOneTerm(a, "a", "aa");
+    checkOneTerm(a, "b", "bb");
+    
+    assertAnalyzesTo(a, "a $",
+       new String[] { "aa", "$" },
+       new int[] { 1, 1 });
+    
+    assertAnalyzesTo(a, "$ a",
+        new String[] { "$", "aa" },
+        new int[] { 1, 1 });
+    
+    assertAnalyzesTo(a, "a a",
+        new String[] { "aa", "aa" },
+        new int[] { 1, 1 });
+    
+    assertAnalyzesTo(a, "z x c v",
+        new String[] { "zxcv" },
+        new int[] { 1 });
+    
+    assertAnalyzesTo(a, "z x c $",
+        new String[] { "z", "xc", "$" },
+        new int[] { 1, 1, 1 });
+  }
+  
+  public void testRepeatsOff() throws Exception {
+    b = new SynonymMap.Builder(true);
+    final boolean keepOrig = false;
+    add("a b", "ab", keepOrig);
+    add("a b", "ab", keepOrig);
+    add("a b", "ab", keepOrig);
+    final SynonymMap map = b.build();
+    Analyzer a = new ReusableAnalyzerBase() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
+      }
+    };
+
+    assertAnalyzesTo(a, "a b",
+        new String[] { "ab" },
+        new int[] { 1 });
+  }
+  
+  public void testRepeatsOn() throws Exception {
+    b = new SynonymMap.Builder(false);
+    final boolean keepOrig = false;
+    add("a b", "ab", keepOrig);
+    add("a b", "ab", keepOrig);
+    add("a b", "ab", keepOrig);
+    final SynonymMap map = b.build();
+    Analyzer a = new ReusableAnalyzerBase() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
+      }
+    };
+
+    assertAnalyzesTo(a, "a b",
+        new String[] { "ab", "ab", "ab" },
+        new int[] { 1, 0, 0 });
+  }
+  
+  public void testRecursion() throws Exception {
+    b = new SynonymMap.Builder(true);
+    final boolean keepOrig = false;
+    add("zoo", "zoo", keepOrig);
+    final SynonymMap map = b.build();
+    Analyzer a = new ReusableAnalyzerBase() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
+      }
+    };
+    
+    assertAnalyzesTo(a, "zoo zoo $ zoo",
+        new String[] { "zoo", "zoo", "$", "zoo" },
+        new int[] { 1, 1, 1, 1 });
+  }
+ 
+  public void testRecursion2() throws Exception {
+    b = new SynonymMap.Builder(true);
+    final boolean keepOrig = false;
+    add("zoo", "zoo", keepOrig);
+    add("zoo", "zoo zoo", keepOrig);
+    final SynonymMap map = b.build();
+    Analyzer a = new ReusableAnalyzerBase() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
+      }
+    };
+
+    // verify("zoo zoo $ zoo", "zoo/zoo zoo/zoo/zoo $/zoo zoo/zoo zoo");
+    assertAnalyzesTo(a, "zoo zoo $ zoo",
+        new String[] { "zoo", "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo", "zoo" },
+        new int[] { 1, 0, 1, 0, 0, 1, 0, 1, 0, 1 });
+  }
+  
+  public void testIncludeOrig() throws Exception {
+    b = new SynonymMap.Builder(true);
+    final boolean keepOrig = true;
+    add("a b", "ab", keepOrig);
+    add("a c", "ac", keepOrig);
+    add("a", "aa", keepOrig);
+    add("b", "bb", keepOrig);
+    add("z x c v", "zxcv", keepOrig);
+    add("x c", "xc", keepOrig);
+    final SynonymMap map = b.build();
+    Analyzer a = new ReusableAnalyzerBase() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
+      }
+    };
+    
+    assertAnalyzesTo(a, "$", 
+        new String[] { "$" },
+        new int[] { 1 });
+    assertAnalyzesTo(a, "a", 
+        new String[] { "a", "aa" },
+        new int[] { 1, 0 });
+    assertAnalyzesTo(a, "a", 
+        new String[] { "a", "aa" },
+        new int[] { 1, 0 });
+    assertAnalyzesTo(a, "$ a", 
+        new String[] { "$", "a", "aa" },
+        new int[] { 1, 1, 0 });
+    assertAnalyzesTo(a, "a $", 
+        new String[] { "a", "aa", "$" },
+        new int[] { 1, 0, 1 });
+    assertAnalyzesTo(a, "$ a !", 
+        new String[] { "$", "a", "aa", "!" },
+        new int[] { 1, 1, 0, 1 });
+    assertAnalyzesTo(a, "a a", 
+        new String[] { "a", "aa", "a", "aa" },
+        new int[] { 1, 0, 1, 0 });
+    assertAnalyzesTo(a, "b", 
+        new String[] { "b", "bb" },
+        new int[] { 1, 0 });
+    assertAnalyzesTo(a, "z x c v",
+        new String[] { "z", "zxcv", "x", "c", "v" },
+        new int[] { 1, 0, 1, 1, 1 });
+    assertAnalyzesTo(a, "z x c $",
+        new String[] { "z", "x", "xc", "c", "$" },
+        new int[] { 1, 1, 0, 1, 1 });
+  }
+  
+  public void testRecursion3() throws Exception {
+    b = new SynonymMap.Builder(true);
+    final boolean keepOrig = true;
+    add("zoo zoo", "zoo", keepOrig);
+    final SynonymMap map = b.build();
+    Analyzer a = new ReusableAnalyzerBase() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
+      }
+    };
+    
+    assertAnalyzesTo(a, "zoo zoo $ zoo",
+        new String[] { "zoo", "zoo", "zoo", "$", "zoo" },
+        new int[] { 1, 0, 1, 1, 1 });
+  }
+  
+  public void testRecursion4() throws Exception {
+    b = new SynonymMap.Builder(true);
+    final boolean keepOrig = true;
+    add("zoo zoo", "zoo", keepOrig);
+    add("zoo", "zoo zoo", keepOrig);
+    final SynonymMap map = b.build();
+    Analyzer a = new ReusableAnalyzerBase() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
+      }
+    };
+    
+    assertAnalyzesTo(a, "zoo zoo $ zoo",
+        new String[] { "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" },
+        new int[] { 1, 0, 1, 1, 1, 0, 1 });
+  }
 }