LUCENE-1794: Ensure analyzer options are applied immediately when using reusable token streams

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@805766 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2009-08-19 11:56:31 +00:00
parent 39ada16b9c
commit 58cd4a04d7
12 changed files with 125 additions and 5 deletions

View File

@ -111,18 +111,21 @@ public final class BrazilianAnalyzer extends Analyzer {
*/
public void setStemExclusionTable( String[] exclusionlist ) {
// Build the exclusion set from the given words.
excltable = StopFilter.makeStopSet( exclusionlist );
// Discard the cached (reusable) token stream so the new exclusion set
// takes effect immediately on the next analysis (LUCENE-1794).
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**
* Builds an exclusionlist from a {@link Map}.
* Only the map's keys are used as the excluded words.
*/
public void setStemExclusionTable( Map exclusionlist ) {
excltable = new HashSet(exclusionlist.keySet());
// Discard the cached (reusable) token stream so the new exclusion set
// takes effect immediately on the next analysis (LUCENE-1794).
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**
* Builds an exclusionlist from the words contained in the given file.
* @throws IOException if the exclusion list file cannot be read
*/
public void setStemExclusionTable( File exclusionlist ) throws IOException {
excltable = WordlistLoader.getWordSet( exclusionlist );
// Discard the cached (reusable) token stream so the new exclusion set
// takes effect immediately on the next analysis (LUCENE-1794).
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**

View File

@ -100,6 +100,7 @@ public final class CzechAnalyzer extends Analyzer {
* @param encoding Encoding used (win-1250, iso-8859-2, ...), null for default system encoding
*/
public void loadStopWords( InputStream wordfile, String encoding ) {
setPreviousTokenStream(null); // force a new stopfilter to be created
if ( wordfile == null ) {
stoptable = new HashSet();
return;
@ -121,7 +122,9 @@ public final class CzechAnalyzer extends Analyzer {
}
} catch ( IOException e ) {
stoptable = null;
// clear any previous table (if present)
// TODO: throw IOException
stoptable = new HashSet();
}
}

View File

@ -114,6 +114,7 @@ public class GermanAnalyzer extends Analyzer {
*/
public void setStemExclusionTable(String[] exclusionlist) {
// Build the exclusion set from the given words.
exclusionSet = StopFilter.makeStopSet(exclusionlist);
// Invalidate the cached (reusable) token stream so the new exclusion
// set is picked up immediately on reuse (LUCENE-1794).
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**
@ -121,6 +122,7 @@ public class GermanAnalyzer extends Analyzer {
*/
public void setStemExclusionTable(Map exclusionlist) {
// Only the map's keys are used as the excluded words.
exclusionSet = new HashSet(exclusionlist.keySet());
// Invalidate the cached (reusable) token stream so the new exclusion
// set is picked up immediately on reuse (LUCENE-1794).
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**
@ -128,6 +130,7 @@ public class GermanAnalyzer extends Analyzer {
*/
public void setStemExclusionTable(File exclusionlist) throws IOException {
// Load the exclusion words from the given file.
exclusionSet = WordlistLoader.getWordSet(exclusionlist);
// Invalidate the cached (reusable) token stream so the new exclusion
// set is picked up immediately on reuse (LUCENE-1794).
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**

View File

@ -111,6 +111,7 @@ public final class FrenchAnalyzer extends Analyzer {
*/
public void setStemExclusionTable(String[] exclusionlist) {
// Build the exclusion set from the given words.
excltable = StopFilter.makeStopSet(exclusionlist);
// Invalidate the cached (reusable) token stream so the new exclusion
// set is picked up immediately on reuse (LUCENE-1794).
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**
@ -118,6 +119,7 @@ public final class FrenchAnalyzer extends Analyzer {
*/
public void setStemExclusionTable(Map exclusionlist) {
// Only the map's keys are used as the excluded words.
excltable = new HashSet(exclusionlist.keySet());
// Invalidate the cached (reusable) token stream so the new exclusion
// set is picked up immediately on reuse (LUCENE-1794).
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**
@ -126,6 +128,7 @@ public final class FrenchAnalyzer extends Analyzer {
*/
public void setStemExclusionTable(File exclusionlist) throws IOException {
// Load the exclusion words from the given file into a fresh set.
excltable = new HashSet(WordlistLoader.getWordSet(exclusionlist));
// Invalidate the cached (reusable) token stream so the new exclusion
// set is picked up immediately on reuse (LUCENE-1794).
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**

View File

@ -131,6 +131,7 @@ public class DutchAnalyzer extends Analyzer {
*/
public void setStemExclusionTable(String[] exclusionlist) {
// Build the exclusion set from the given words.
excltable = StopFilter.makeStopSet(exclusionlist);
// Invalidate the cached (reusable) token stream so the new exclusion
// set is picked up immediately on reuse (LUCENE-1794).
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**
@ -138,6 +139,7 @@ public class DutchAnalyzer extends Analyzer {
*/
public void setStemExclusionTable(HashSet exclusionlist) {
// NOTE(review): stores the caller's set directly (no defensive copy);
// later external modifications to the set would affect this analyzer —
// confirm that this aliasing is intended.
excltable = exclusionlist;
// Invalidate the cached (reusable) token stream so the new exclusion
// set is picked up immediately on reuse (LUCENE-1794).
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**
@ -146,6 +148,7 @@ public class DutchAnalyzer extends Analyzer {
public void setStemExclusionTable(File exclusionlist) {
try {
excltable = org.apache.lucene.analysis.WordlistLoader.getWordSet(exclusionlist);
setPreviousTokenStream(null); // force a new stemmer to be created
} catch (IOException e) {
// TODO: throw IOException
throw new RuntimeException(e);
@ -160,6 +163,7 @@ public class DutchAnalyzer extends Analyzer {
public void setStemDictionary(File stemdictFile) {
try {
stemdict = org.apache.lucene.analysis.WordlistLoader.getStemDict(stemdictFile);
setPreviousTokenStream(null); // force a new stemmer to be created
} catch (IOException e) {
// TODO: throw IOException
throw new RuntimeException(e);
@ -210,7 +214,7 @@ public class DutchAnalyzer extends Analyzer {
streams.source = new StandardTokenizer(reader);
streams.result = new StandardFilter(streams.source);
streams.result = new StopFilter(streams.result, stoptable);
streams.result = new DutchStemFilter(streams.result, excltable);
streams.result = new DutchStemFilter(streams.result, excltable, stemdict);
setPreviousTokenStream(streams);
} else {
streams.source.reset(reader);

View File

@ -139,6 +139,17 @@ public class TestBrazilianStemmer extends TestCase {
a.setStemExclusionTable(new String[] { "quintessência" });
checkReuse(a, "quintessência", "quintessência"); // excluded words will be completely unchanged.
}
/*
 * Verifies that a newly-set exclusion table takes effect right away,
 * even when the analyzer is reusing a cached token stream.
 */
public void testExclusionTableReuse() throws Exception {
    BrazilianAnalyzer analyzer = new BrazilianAnalyzer();
    // Without an exclusion entry the word is stemmed.
    checkReuse(analyzer, "quintessência", "quintessente");
    analyzer.setStemExclusionTable(new String[] { "quintessência" });
    // Once excluded, the same word must come through unstemmed.
    checkReuse(analyzer, "quintessência", "quintessência");
}
private void check(final String input, final String expected) throws IOException {
Analyzer analyzer = new BrazilianAnalyzer();

View File

@ -17,6 +17,10 @@ package org.apache.lucene.analysis.cz;
* limitations under the License.
*/
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import junit.framework.TestCase;
@ -32,17 +36,55 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
*
*/
public class TestCzechAnalyzer extends TestCase {
File dataDir = new File(System.getProperty("dataDir", "./bin"));
File customStopFile = new File(dataDir, "org/apache/lucene/analysis/cz/customStopWordFile.txt");
public void testStopWord() throws Exception {
// "Pokud" and "o" must be removed by the default stop filter.
assertAnalyzesTo(new CzechAnalyzer(), "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" });
}
public void testReusableTokenStream() throws Exception {
    // Two successive analyses must both work through the reusable-stream path.
    Analyzer cz = new CzechAnalyzer();
    assertAnalyzesToReuse(cz, "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" });
    assertAnalyzesToReuse(cz, "Česká Republika", new String[] { "česká", "republika" });
}
/*
* An input stream that always throws IOException for testing.
*/
private class UnreliableInputStream extends InputStream {
public int read() throws IOException {
throw new IOException();
}
}
/*
 * The loadStopWords method does not throw IOException on error;
 * previously it set the stoptable to null (versus empty), which
 * would cause an NPE when it is time to create the StopFilter.
 */
public void testInvalidStopWordFile() throws Exception {
CzechAnalyzer cz = new CzechAnalyzer();
// Loading from a stream that always fails must leave the analyzer
// usable with an empty stop set: no words are filtered out below.
cz.loadStopWords(new UnreliableInputStream(), "UTF-8");
assertAnalyzesTo(cz, "Pokud mluvime o volnem",
new String[] { "pokud", "mluvime", "o", "volnem" });
}
/*
 * Test that changes to the stop table via loadStopWords are applied immediately
 * when using reusable token streams.
 */
public void testStopWordFileReuse() throws Exception {
    CzechAnalyzer cz = new CzechAnalyzer();
    // With the default stop words, neither term is filtered.
    assertAnalyzesToReuse(cz, "Česká Republika",
        new String[] { "česká", "republika" });
    // Load a custom stop word file that contains "republika".
    InputStream stopwords = new FileInputStream(customStopFile);
    try {
        cz.loadStopWords(stopwords, "UTF-8");
    } finally {
        // Don't leak the file handle (closing an already-closed
        // FileInputStream is harmless).
        stopwords.close();
    }
    // The new stop table must be applied to the reused token stream.
    assertAnalyzesToReuse(cz, "Česká Republika", new String[] { "česká" });
}
private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
TermAttribute text = (TermAttribute) ts.getAttribute(TermAttribute.class);

View File

@ -0,0 +1,3 @@
examplestopword
anotherexamplestopword
republika

View File

@ -89,6 +89,17 @@ public class TestGermanStemFilter extends TestCase {
checkReuse(new GermanSubclassAnalyzer(), "Tischen", "Tischen");
}
/*
 * Updating the exclusion table must affect even a previously-used
 * (cached, reusable) token stream right away.
 */
public void testExclusionTableReuse() throws Exception {
    GermanAnalyzer analyzer = new GermanAnalyzer();
    // Normally "tischen" is stemmed to "tisch".
    checkReuse(analyzer, "tischen", "tisch");
    analyzer.setStemExclusionTable(new String[] { "tischen" });
    // After exclusion it must pass through unchanged.
    checkReuse(analyzer, "tischen", "tischen");
}
private void check(final String input, final String expected) throws IOException {
Analyzer a = new GermanAnalyzer();
TokenStream tokenStream = a.tokenStream("dummy", new StringReader(input));

View File

@ -221,4 +221,14 @@ public class TestFrenchAnalyzer extends TestCase {
"captif" });
}
/*
 * Changing the exclusion table must be visible immediately through
 * the reusable token stream path.
 */
public void testExclusionTableReuse() throws Exception {
    FrenchAnalyzer analyzer = new FrenchAnalyzer();
    // Normally "habitable" is stemmed to "habit".
    assertAnalyzesToReuse(analyzer, "habitable", new String[] { "habit" });
    analyzer.setStemExclusionTable(new String[] { "habitable" });
    // After exclusion it must pass through unchanged.
    assertAnalyzesToReuse(analyzer, "habitable", new String[] { "habitable" });
}
}

View File

@ -17,6 +17,7 @@ package org.apache.lucene.analysis.nl;
* limitations under the License.
*/
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
@ -35,6 +36,8 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
*
*/
public class TestDutchStemmer extends TestCase {
File dataDir = new File(System.getProperty("dataDir", "./bin"));
File customDictFile = new File(dataDir, "org/apache/lucene/analysis/nl/customStemDict.txt");
public void testWithSnowballExamples() throws IOException {
check("lichaamsziek", "lichaamsziek");
@ -144,7 +147,28 @@ public class TestDutchStemmer extends TestCase {
checkReuse(a, "lichamelijkheden", "lichamelijkheden");
}
/*
 * A freshly-set exclusion table must apply at once, even with a
 * reusable token stream already cached.
 */
public void testExclusionTableReuse() throws Exception {
    DutchAnalyzer analyzer = new DutchAnalyzer();
    // Normally "lichamelijk" is stemmed to "licham".
    checkReuse(analyzer, "lichamelijk", "licham");
    analyzer.setStemExclusionTable(new String[] { "lichamelijk" });
    // After exclusion it must pass through unchanged.
    checkReuse(analyzer, "lichamelijk", "lichamelijk");
}
/*
 * A freshly-set stem dictionary must apply at once, even with a
 * reusable token stream already cached.
 */
public void testStemDictionaryReuse() throws Exception {
    DutchAnalyzer analyzer = new DutchAnalyzer();
    // Default algorithmic stemming.
    checkReuse(analyzer, "lichamelijk", "licham");
    analyzer.setStemDictionary(customDictFile);
    // The custom dictionary entry must now drive the stemming.
    checkReuse(analyzer, "lichamelijk", "somethingentirelydifferent");
}
private void check(final String input, final String expected) throws IOException {
Analyzer analyzer = new DutchAnalyzer();
TokenStream stream = analyzer.tokenStream("dummy", new StringReader(input));

View File

@ -0,0 +1,3 @@
lichamelijk somethingentirelydifferent
lichamelijke licham
lichamelijkheden licham