From 58cd4a04d79edf1dd673236a3f2147d4e7176f84 Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Wed, 19 Aug 2009 11:56:31 +0000
Subject: [PATCH] LUCENE-1794: Ensure analyzer options are applied immediately
 when using reusable token streams

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@805766 13f79535-47bb-0310-9956-ffa450edef68
---
 .../lucene/analysis/br/BrazilianAnalyzer.java |  3 ++
 .../lucene/analysis/cz/CzechAnalyzer.java     |  5 +-
 .../lucene/analysis/de/GermanAnalyzer.java    |  3 ++
 .../lucene/analysis/fr/FrenchAnalyzer.java    |  3 ++
 .../lucene/analysis/nl/DutchAnalyzer.java     |  6 ++-
 .../analysis/br/TestBrazilianStemmer.java     | 11 +++++
 .../lucene/analysis/cz/TestCzechAnalyzer.java | 46 ++++++++++++++++++-
 .../lucene/analysis/cz/customStopWordFile.txt |  3 ++
 .../analysis/de/TestGermanStemFilter.java     | 11 +++++
 .../analysis/fr/TestFrenchAnalyzer.java       | 10 ++++
 .../lucene/analysis/nl/TestDutchStemmer.java  | 26 ++++++++++-
 .../lucene/analysis/nl/customStemDict.txt     |  3 ++
 12 files changed, 125 insertions(+), 5 deletions(-)
 create mode 100644 contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/customStopWordFile.txt
 create mode 100644 contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/customStemDict.txt

diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
index d06f4cc7f03..f0efada4f4e 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
@@ -111,18 +111,21 @@ public final class BrazilianAnalyzer extends Analyzer {
    */
   public void setStemExclusionTable( String[] exclusionlist ) {
     excltable = StopFilter.makeStopSet( exclusionlist );
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
    * Builds an exclusionlist from a {@link Map}.
    */
   public void setStemExclusionTable( Map exclusionlist ) {
     excltable = new HashSet(exclusionlist.keySet());
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
    * Builds an exclusionlist from the words contained in the given file.
    */
   public void setStemExclusionTable( File exclusionlist ) throws IOException {
     excltable = WordlistLoader.getWordSet( exclusionlist );
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
index 603df1349ba..30b05345eed 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
@@ -100,6 +100,7 @@ public final class CzechAnalyzer extends Analyzer {
    * @param encoding Encoding used (win-1250, iso-8859-2, ...), null for default system encoding
    */
   public void loadStopWords( InputStream wordfile, String encoding ) {
+    setPreviousTokenStream(null); // force a new stopfilter to be created
     if ( wordfile == null ) {
       stoptable = new HashSet();
       return;
@@ -121,7 +122,9 @@ public final class CzechAnalyzer extends Analyzer {
       }
     } catch ( IOException e ) {
-      stoptable = null;
+      // clear any previous table (if present)
+      // TODO: throw IOException
+      stoptable = new HashSet();
     }
   }
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
index bf45a967507..d328ae8269b 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
@@ -114,6 +114,7 @@ public class GermanAnalyzer extends Analyzer {
    */
   public void setStemExclusionTable(String[] exclusionlist) {
     exclusionSet = StopFilter.makeStopSet(exclusionlist);
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
@@ -121,6 +122,7 @@ public class GermanAnalyzer extends Analyzer {
    */
   public void setStemExclusionTable(Map exclusionlist) {
     exclusionSet = new HashSet(exclusionlist.keySet());
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
@@ -128,6 +130,7 @@ public class GermanAnalyzer extends Analyzer {
    */
   public void setStemExclusionTable(File exclusionlist) throws IOException {
     exclusionSet = WordlistLoader.getWordSet(exclusionlist);
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
index 048e8e7f06b..bbcff01ed0a 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
@@ -111,6 +111,7 @@ public final class FrenchAnalyzer extends Analyzer {
    */
   public void setStemExclusionTable(String[] exclusionlist) {
     excltable = StopFilter.makeStopSet(exclusionlist);
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
@@ -118,6 +119,7 @@ public final class FrenchAnalyzer extends Analyzer {
    */
   public void setStemExclusionTable(Map exclusionlist) {
     excltable = new HashSet(exclusionlist.keySet());
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
@@ -126,6 +128,7 @@ public final class FrenchAnalyzer extends Analyzer {
    */
   public void setStemExclusionTable(File exclusionlist) throws IOException {
     excltable = new HashSet(WordlistLoader.getWordSet(exclusionlist));
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
index 081fbcf9898..ae40d4fe72b 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
@@ -131,6 +131,7 @@ public class DutchAnalyzer extends Analyzer {
    */
   public void setStemExclusionTable(String[] exclusionlist) {
     excltable = StopFilter.makeStopSet(exclusionlist);
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
@@ -138,6 +139,7 @@ public class DutchAnalyzer extends Analyzer {
    */
   public void setStemExclusionTable(HashSet exclusionlist) {
     excltable = exclusionlist;
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
@@ -146,6 +148,7 @@ public class DutchAnalyzer extends Analyzer {
   public void setStemExclusionTable(File exclusionlist) {
     try {
       excltable = org.apache.lucene.analysis.WordlistLoader.getWordSet(exclusionlist);
+      setPreviousTokenStream(null); // force a new stemmer to be created
     } catch (IOException e) {
       // TODO: throw IOException
       throw new RuntimeException(e);
@@ -160,6 +163,7 @@ public class DutchAnalyzer extends Analyzer {
   public void setStemDictionary(File stemdictFile) {
     try {
       stemdict = org.apache.lucene.analysis.WordlistLoader.getStemDict(stemdictFile);
+      setPreviousTokenStream(null); // force a new stemmer to be created
     } catch (IOException e) {
       // TODO: throw IOException
       throw new RuntimeException(e);
@@ -210,7 +214,7 @@ public class DutchAnalyzer extends Analyzer {
       streams.source = new StandardTokenizer(reader);
       streams.result = new StandardFilter(streams.source);
       streams.result = new StopFilter(streams.result, stoptable);
-      streams.result = new DutchStemFilter(streams.result, excltable);
+      streams.result = new DutchStemFilter(streams.result, excltable, stemdict);
       setPreviousTokenStream(streams);
     } else {
       streams.source.reset(reader);
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
index 0427cacc865..cb659cd0d8a 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
@@ -139,6 +139,17 @@ public class TestBrazilianStemmer extends TestCase {
     a.setStemExclusionTable(new String[] { "quintessência" });
     checkReuse(a, "quintessência", "quintessência"); // excluded words will be completely unchanged.
   }
+
+  /*
+   * Test that changes to the exclusion table are applied immediately
+   * when using reusable token streams.
+   */
+  public void testExclusionTableReuse() throws Exception {
+    BrazilianAnalyzer a = new BrazilianAnalyzer();
+    checkReuse(a, "quintessência", "quintessente");
+    a.setStemExclusionTable(new String[] { "quintessência" });
+    checkReuse(a, "quintessência", "quintessência");
+  }
 
   private void check(final String input, final String expected) throws IOException {
     Analyzer analyzer = new BrazilianAnalyzer();
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
index 6672abb6713..1516e6d5820 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
@@ -17,6 +17,10 @@ package org.apache.lucene.analysis.cz;
  * limitations under the License.
  */
 
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
 import java.io.StringReader;
 
 import junit.framework.TestCase;
@@ -32,17 +36,55 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
  *
  */
 public class TestCzechAnalyzer extends TestCase {
-  
+  File dataDir = new File(System.getProperty("dataDir", "./bin"));
+  File customStopFile = new File(dataDir, "org/apache/lucene/analysis/cz/customStopWordFile.txt");
+
   public void testStopWord() throws Exception {
     assertAnalyzesTo(new CzechAnalyzer(), "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" });
   }
-  
+
   public void testReusableTokenStream() throws Exception {
     Analyzer analyzer = new CzechAnalyzer();
     assertAnalyzesToReuse(analyzer, "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" });
     assertAnalyzesToReuse(analyzer, "Česká Republika", new String[] { "česká", "republika" });
   }
 
+  /*
+   * An input stream that always throws IOException for testing.
+   */
+  private class UnreliableInputStream extends InputStream {
+    public int read() throws IOException {
+      throw new IOException();
+    }
+  }
+
+  /*
+   * The loadStopWords method does not throw an IOException on error;
+   * previously it set the stoptable to null (instead of empty),
+   * which caused an NPE when the StopFilter was later created.
+   */
+  public void testInvalidStopWordFile() throws Exception {
+    CzechAnalyzer cz = new CzechAnalyzer();
+    cz.loadStopWords(new UnreliableInputStream(), "UTF-8");
+    assertAnalyzesTo(cz, "Pokud mluvime o volnem",
+        new String[] { "pokud", "mluvime", "o", "volnem" });
+  }
+
+  /*
+   * Test that changes to the stop table via loadStopWords are applied immediately
+   * when using reusable token streams.
+   */
+  public void testStopWordFileReuse() throws Exception {
+    CzechAnalyzer cz = new CzechAnalyzer();
+    assertAnalyzesToReuse(cz, "Česká Republika",
+        new String[] { "česká", "republika" });
+
+    InputStream stopwords = new FileInputStream(customStopFile);
+    cz.loadStopWords(stopwords, "UTF-8");
+
+    assertAnalyzesToReuse(cz, "Česká Republika", new String[] { "česká" });
+  }
+
   private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
     TokenStream ts = a.tokenStream("dummy", new StringReader(input));
     TermAttribute text = (TermAttribute) ts.getAttribute(TermAttribute.class);
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/customStopWordFile.txt b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/customStopWordFile.txt
new file mode 100644
index 00000000000..6f9fd8703b3
--- /dev/null
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/customStopWordFile.txt
@@ -0,0 +1,3 @@
+examplestopword
+anotherexamplestopword
+republika
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
index 64c64b239c6..a2cc2030f57 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
@@ -89,6 +89,17 @@ public class TestGermanStemFilter extends TestCase {
     checkReuse(new GermanSubclassAnalyzer(), "Tischen", "Tischen");
   }
 
+  /*
+   * Test that changes to the exclusion table are applied immediately
+   * when using reusable token streams.
+   */
+  public void testExclusionTableReuse() throws Exception {
+    GermanAnalyzer a = new GermanAnalyzer();
+    checkReuse(a, "tischen", "tisch");
+    a.setStemExclusionTable(new String[] { "tischen" });
+    checkReuse(a, "tischen", "tischen");
+  }
+
   private void check(final String input, final String expected) throws IOException {
     Analyzer a = new GermanAnalyzer();
     TokenStream tokenStream = a.tokenStream("dummy", new StringReader(input));
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
index 31183bece03..b3c2b1cd5c9 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
@@ -221,4 +221,14 @@ public class TestFrenchAnalyzer extends TestCase {
         "captif" });
   }
 
+  /*
+   * Test that changes to the exclusion table are applied immediately
+   * when using reusable token streams.
+   */
+  public void testExclusionTableReuse() throws Exception {
+    FrenchAnalyzer fa = new FrenchAnalyzer();
+    assertAnalyzesToReuse(fa, "habitable", new String[] { "habit" });
+    fa.setStemExclusionTable(new String[] { "habitable" });
+    assertAnalyzesToReuse(fa, "habitable", new String[] { "habitable" });
+  }
 }
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
index 3581c24dad1..fde17ae3476 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.nl;
  * limitations under the License.
  */
 
+import java.io.File;
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
@@ -35,6 +36,8 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
  *
  */
 public class TestDutchStemmer extends TestCase {
+  File dataDir = new File(System.getProperty("dataDir", "./bin"));
+  File customDictFile = new File(dataDir, "org/apache/lucene/analysis/nl/customStemDict.txt");
 
   public void testWithSnowballExamples() throws IOException {
     check("lichaamsziek", "lichaamsziek");
@@ -144,7 +147,28 @@ public class TestDutchStemmer extends TestCase {
     checkReuse(a, "lichamelijkheden", "lichamelijkheden");
   }
 
-  
+  /*
+   * Test that changes to the exclusion table are applied immediately
+   * when using reusable token streams.
+   */
+  public void testExclusionTableReuse() throws Exception {
+    DutchAnalyzer a = new DutchAnalyzer();
+    checkReuse(a, "lichamelijk", "licham");
+    a.setStemExclusionTable(new String[] { "lichamelijk" });
+    checkReuse(a, "lichamelijk", "lichamelijk");
+  }
+
+  /*
+   * Test that changes to the dictionary stemming table are applied immediately
+   * when using reusable token streams.
+   */
+  public void testStemDictionaryReuse() throws Exception {
+    DutchAnalyzer a = new DutchAnalyzer();
+    checkReuse(a, "lichamelijk", "licham");
+    a.setStemDictionary(customDictFile);
+    checkReuse(a, "lichamelijk", "somethingentirelydifferent");
+  }
+
   private void check(final String input, final String expected) throws IOException {
     Analyzer analyzer = new DutchAnalyzer();
     TokenStream stream = analyzer.tokenStream("dummy", new StringReader(input));
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/customStemDict.txt b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/customStemDict.txt
new file mode 100644
index 00000000000..34d3abcbb27
--- /dev/null
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/customStemDict.txt
@@ -0,0 +1,3 @@
+lichamelijk somethingentirelydifferent
+lichamelijke licham
+lichamelijkheden licham
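
The reason every setter in this patch calls setPreviousTokenStream(null) is the per-thread token-stream cache used by these contrib analyzers: reusableTokenStream() stores a SavedStreams object via Analyzer.getPreviousTokenStream()/setPreviousTokenStream(), so a chain built before a setter ran would otherwise keep its old stop or exclusion table, which is exactly what the new tests catch. Below is a minimal sketch of that idiom, assuming only the 2.9-era classes already used in the patch (StandardTokenizer, StopFilter, the previous-token-stream cache); the ConfigurableStopAnalyzer class and its setStopWords method are illustrative inventions, not Lucene APIs.

import java.io.IOException;
import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;

/**
 * Hypothetical analyzer (not part of Lucene) sketching the caching idiom the
 * patch fixes: setters must clear the per-thread cached stream, otherwise
 * reusableTokenStream() keeps serving a chain built with the old settings.
 */
public class ConfigurableStopAnalyzer extends Analyzer {
  private Set stoptable = StopFilter.makeStopSet(new String[0]);

  /** Replacing the stop set takes effect immediately only because the cache is cleared. */
  public void setStopWords(String[] stopwords) {
    stoptable = StopFilter.makeStopSet(stopwords);
    setPreviousTokenStream(null); // force a new StopFilter to be created
  }

  public TokenStream tokenStream(String fieldName, Reader reader) {
    return new StopFilter(new StandardTokenizer(reader), stoptable);
  }

  /** Per-thread cached tokenizer/filter chain, mirroring DutchAnalyzer's SavedStreams. */
  private static class SavedStreams {
    Tokenizer source;
    TokenStream result;
  }

  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
      // first use on this thread, or a setter cleared the cache:
      // build a fresh chain with the current stop set
      streams = new SavedStreams();
      streams.source = new StandardTokenizer(reader);
      streams.result = new StopFilter(streams.source, stoptable);
      setPreviousTokenStream(streams);
    } else {
      // reuse the cached chain; without setPreviousTokenStream(null) in the
      // setter above, this branch would keep filtering with a stale stop set
      streams.source.reset(reader);
    }
    return streams.result;
  }
}

Dropping the setPreviousTokenStream(null) call in the setter reproduces the stale-configuration behavior that the reuse tests added in this patch guard against.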