From 58cd4a04d79edf1dd673236a3f2147d4e7176f84 Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Wed, 19 Aug 2009 11:56:31 +0000
Subject: [PATCH] LUCENE-1794: Ensure analyzer options are applied immediately
 when using reusable token streams

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@805766 13f79535-47bb-0310-9956-ffa450edef68
---
 .../lucene/analysis/br/BrazilianAnalyzer.java |  3 ++
 .../lucene/analysis/cz/CzechAnalyzer.java     |  5 +-
 .../lucene/analysis/de/GermanAnalyzer.java    |  3 ++
 .../lucene/analysis/fr/FrenchAnalyzer.java    |  3 ++
 .../lucene/analysis/nl/DutchAnalyzer.java     |  6 ++-
 .../analysis/br/TestBrazilianStemmer.java     | 11 +++++
 .../lucene/analysis/cz/TestCzechAnalyzer.java | 46 ++++++++++++++++++-
 .../lucene/analysis/cz/customStopWordFile.txt |  3 ++
 .../analysis/de/TestGermanStemFilter.java     | 11 +++++
 .../analysis/fr/TestFrenchAnalyzer.java       | 10 ++++
 .../lucene/analysis/nl/TestDutchStemmer.java  | 26 ++++++++++-
 .../lucene/analysis/nl/customStemDict.txt     |  3 ++
 12 files changed, 125 insertions(+), 5 deletions(-)
 create mode 100644 contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/customStopWordFile.txt
 create mode 100644 contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/customStemDict.txt

diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
index d06f4cc7f03..f0efada4f4e 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
@@ -111,18 +111,21 @@ public final class BrazilianAnalyzer extends Analyzer {
    */
   public void setStemExclusionTable( String[] exclusionlist ) {
     excltable = StopFilter.makeStopSet( exclusionlist );
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
    * Builds an exclusionlist from a {@link Map}.
    */
   public void setStemExclusionTable( Map exclusionlist ) {
     excltable = new HashSet(exclusionlist.keySet());
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
    * Builds an exclusionlist from the words contained in the given file.
    */
   public void setStemExclusionTable( File exclusionlist ) throws IOException {
     excltable = WordlistLoader.getWordSet( exclusionlist );
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
index 603df1349ba..30b05345eed 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
@@ -100,6 +100,7 @@ public final class CzechAnalyzer extends Analyzer {
    * @param encoding Encoding used (win-1250, iso-8859-2, ...), null for default system encoding
    */
   public void loadStopWords( InputStream wordfile, String encoding ) {
+    setPreviousTokenStream(null); // force a new stopfilter to be created
     if ( wordfile == null ) {
       stoptable = new HashSet();
       return;
@@ -121,7 +122,9 @@ public final class CzechAnalyzer extends Analyzer {
       }
     } catch ( IOException e ) {
-      stoptable = null;
+      // clear any previous table (if present)
+      // TODO: throw IOException
+      stoptable = new HashSet();
     }
   }
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
index bf45a967507..d328ae8269b 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
@@ -114,6 +114,7 @@ public class GermanAnalyzer extends Analyzer {
    */
   public void setStemExclusionTable(String[] exclusionlist) {
     exclusionSet = StopFilter.makeStopSet(exclusionlist);
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
@@ -121,6 +122,7 @@ public class GermanAnalyzer extends Analyzer {
    */
   public void setStemExclusionTable(Map exclusionlist) {
     exclusionSet = new HashSet(exclusionlist.keySet());
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
@@ -128,6 +130,7 @@ public class GermanAnalyzer extends Analyzer {
    */
   public void setStemExclusionTable(File exclusionlist) throws IOException {
     exclusionSet = WordlistLoader.getWordSet(exclusionlist);
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
index 048e8e7f06b..bbcff01ed0a 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
@@ -111,6 +111,7 @@ public final class FrenchAnalyzer extends Analyzer {
    */
   public void setStemExclusionTable(String[] exclusionlist) {
     excltable = StopFilter.makeStopSet(exclusionlist);
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
@@ -118,6 +119,7 @@ public final class FrenchAnalyzer extends Analyzer {
    */
   public void setStemExclusionTable(Map exclusionlist) {
     excltable = new HashSet(exclusionlist.keySet());
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
@@ -126,6 +128,7 @@ public final class FrenchAnalyzer extends Analyzer {
    */
   public void setStemExclusionTable(File exclusionlist) throws IOException {
     excltable = new HashSet(WordlistLoader.getWordSet(exclusionlist));
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
index 081fbcf9898..ae40d4fe72b 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
@@ -131,6 +131,7 @@ public class DutchAnalyzer extends Analyzer {
    */
   public void setStemExclusionTable(String[] exclusionlist) {
     excltable = StopFilter.makeStopSet(exclusionlist);
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
@@ -138,6 +139,7 @@ public class DutchAnalyzer extends Analyzer {
    */
   public void setStemExclusionTable(HashSet exclusionlist) {
     excltable = exclusionlist;
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
@@ -146,6 +148,7 @@ public class DutchAnalyzer extends Analyzer {
   public void setStemExclusionTable(File exclusionlist) {
     try {
       excltable = org.apache.lucene.analysis.WordlistLoader.getWordSet(exclusionlist);
+      setPreviousTokenStream(null); // force a new stemmer to be created
     } catch (IOException e) {
       // TODO: throw IOException
       throw new RuntimeException(e);
@@ -160,6 +163,7 @@ public class DutchAnalyzer extends Analyzer {
   public void setStemDictionary(File stemdictFile) {
     try {
       stemdict = org.apache.lucene.analysis.WordlistLoader.getStemDict(stemdictFile);
+      setPreviousTokenStream(null); // force a new stemmer to be created
     } catch (IOException e) {
       // TODO: throw IOException
       throw new RuntimeException(e);
@@ -210,7 +214,7 @@ public class DutchAnalyzer extends Analyzer {
       streams.source = new StandardTokenizer(reader);
       streams.result = new StandardFilter(streams.source);
       streams.result = new StopFilter(streams.result, stoptable);
-      streams.result = new DutchStemFilter(streams.result, excltable);
+      streams.result = new DutchStemFilter(streams.result, excltable, stemdict);
       setPreviousTokenStream(streams);
     } else {
       streams.source.reset(reader);
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
index 0427cacc865..cb659cd0d8a 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
@@ -139,6 +139,17 @@ public class TestBrazilianStemmer extends TestCase {
     a.setStemExclusionTable(new String[] { "quintessência" });
     checkReuse(a, "quintessência", "quintessência"); // excluded words will be completely unchanged.
   }
+
+  /*
+   * Test that changes to the exclusion table are applied immediately
+   * when using reusable token streams.
+   */
+  public void testExclusionTableReuse() throws Exception {
+    BrazilianAnalyzer a = new BrazilianAnalyzer();
+    checkReuse(a, "quintessência", "quintessente");
+    a.setStemExclusionTable(new String[] { "quintessência" });
+    checkReuse(a, "quintessência", "quintessência");
+  }
 
   private void check(final String input, final String expected) throws IOException {
     Analyzer analyzer = new BrazilianAnalyzer();
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
index 6672abb6713..1516e6d5820 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
@@ -17,6 +17,10 @@ package org.apache.lucene.analysis.cz;
  * limitations under the License.
  */
 
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
 import java.io.StringReader;
 
 import junit.framework.TestCase;
@@ -32,17 +36,55 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
  *
  */
 public class TestCzechAnalyzer extends TestCase {
-  
+  File dataDir = new File(System.getProperty("dataDir", "./bin"));
+  File customStopFile = new File(dataDir, "org/apache/lucene/analysis/cz/customStopWordFile.txt");
+
   public void testStopWord() throws Exception {
     assertAnalyzesTo(new CzechAnalyzer(), "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" });
   }
-  
+
   public void testReusableTokenStream() throws Exception {
     Analyzer analyzer = new CzechAnalyzer();
     assertAnalyzesToReuse(analyzer, "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" });
     assertAnalyzesToReuse(analyzer, "Česká Republika", new String[] { "česká", "republika" });
   }
 
+  /*
+   * An input stream that always throws IOException for testing.
+   */
+  private class UnreliableInputStream extends InputStream {
+    public int read() throws IOException {
+      throw new IOException();
+    }
+  }
+
+  /*
+   * The loadStopWords method does not throw an IOException on error;
+   * previously it set the stoptable to null (instead of empty),
+   * which caused an NPE when the StopFilter was later created.
+   */
+  public void testInvalidStopWordFile() throws Exception {
+    CzechAnalyzer cz = new CzechAnalyzer();
+    cz.loadStopWords(new UnreliableInputStream(), "UTF-8");
+    assertAnalyzesTo(cz, "Pokud mluvime o volnem",
+        new String[] { "pokud", "mluvime", "o", "volnem" });
+  }
+
+  /*
+   * Test that changes to the stop table via loadStopWords are applied immediately
+   * when using reusable token streams.
+   */
+  public void testStopWordFileReuse() throws Exception {
+    CzechAnalyzer cz = new CzechAnalyzer();
+    assertAnalyzesToReuse(cz, "Česká Republika",
+        new String[] { "česká", "republika" });
+
+    InputStream stopwords = new FileInputStream(customStopFile);
+    cz.loadStopWords(stopwords, "UTF-8");
+
+    assertAnalyzesToReuse(cz, "Česká Republika", new String[] { "česká" });
+  }
+
   private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
     TokenStream ts = a.tokenStream("dummy", new StringReader(input));
     TermAttribute text = (TermAttribute) ts.getAttribute(TermAttribute.class);
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/customStopWordFile.txt b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/customStopWordFile.txt
new file mode 100644
index 00000000000..6f9fd8703b3
--- /dev/null
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/customStopWordFile.txt
@@ -0,0 +1,3 @@
+examplestopword
+anotherexamplestopword
+republika
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
index 64c64b239c6..a2cc2030f57 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
@@ -89,6 +89,17 @@ public class TestGermanStemFilter extends TestCase {
     checkReuse(new GermanSubclassAnalyzer(), "Tischen", "Tischen");
   }
 
+  /*
+   * Test that changes to the exclusion table are applied immediately
+   * when using reusable token streams.
+   */
+  public void testExclusionTableReuse() throws Exception {
+    GermanAnalyzer a = new GermanAnalyzer();
+    checkReuse(a, "tischen", "tisch");
+    a.setStemExclusionTable(new String[] { "tischen" });
+    checkReuse(a, "tischen", "tischen");
+  }
+
   private void check(final String input, final String expected) throws IOException {
     Analyzer a = new GermanAnalyzer();
     TokenStream tokenStream = a.tokenStream("dummy", new StringReader(input));
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
index 31183bece03..b3c2b1cd5c9 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
@@ -221,4 +221,14 @@ public class TestFrenchAnalyzer extends TestCase {
         "captif" });
   }
 
+  /*
+   * Test that changes to the exclusion table are applied immediately
+   * when using reusable token streams.
+   */
+  public void testExclusionTableReuse() throws Exception {
+    FrenchAnalyzer fa = new FrenchAnalyzer();
+    assertAnalyzesToReuse(fa, "habitable", new String[] { "habit" });
+    fa.setStemExclusionTable(new String[] { "habitable" });
+    assertAnalyzesToReuse(fa, "habitable", new String[] { "habitable" });
+  }
 }
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
index 3581c24dad1..fde17ae3476 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.nl;
  * limitations under the License.
  */
 
+import java.io.File;
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
@@ -35,6 +36,8 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
  *
  */
 public class TestDutchStemmer extends TestCase {
+  File dataDir = new File(System.getProperty("dataDir", "./bin"));
+  File customDictFile = new File(dataDir, "org/apache/lucene/analysis/nl/customStemDict.txt");
 
   public void testWithSnowballExamples() throws IOException {
     check("lichaamsziek", "lichaamsziek");
@@ -144,7 +147,28 @@ public class TestDutchStemmer extends TestCase {
     checkReuse(a, "lichamelijkheden", "lichamelijkheden");
   }
 
-  
+  /*
+   * Test that changes to the exclusion table are applied immediately
+   * when using reusable token streams.
+   */
+  public void testExclusionTableReuse() throws Exception {
+    DutchAnalyzer a = new DutchAnalyzer();
+    checkReuse(a, "lichamelijk", "licham");
+    a.setStemExclusionTable(new String[] { "lichamelijk" });
+    checkReuse(a, "lichamelijk", "lichamelijk");
+  }
+
+  /*
+   * Test that changes to the dictionary stemming table are applied immediately
+   * when using reusable token streams.
+   */
+  public void testStemDictionaryReuse() throws Exception {
+    DutchAnalyzer a = new DutchAnalyzer();
+    checkReuse(a, "lichamelijk", "licham");
+    a.setStemDictionary(customDictFile);
+    checkReuse(a, "lichamelijk", "somethingentirelydifferent");
+  }
+
   private void check(final String input, final String expected) throws IOException {
     Analyzer analyzer = new DutchAnalyzer();
     TokenStream stream = analyzer.tokenStream("dummy", new StringReader(input));
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/customStemDict.txt b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/customStemDict.txt
new file mode 100644
index 00000000000..34d3abcbb27
--- /dev/null
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/customStemDict.txt
@@ -0,0 +1,3 @@
+lichamelijk somethingentirelydifferent
+lichamelijke licham
+lichamelijkheden licham
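
The reason every setter in this patch calls setPreviousTokenStream(null) is the per-thread token-stream cache used by these contrib analyzers: reusableTokenStream() stores a SavedStreams object via Analyzer.getPreviousTokenStream()/setPreviousTokenStream(), so a chain built before a setter ran would otherwise keep its old stop or exclusion table, which is exactly what the new tests catch. Below is a minimal sketch of that idiom, assuming only the 2.9-era classes already used in the patch (StandardTokenizer, StopFilter, the previous-token-stream cache); the ConfigurableStopAnalyzer class and its setStopWords method are illustrative inventions, not Lucene APIs.

import java.io.IOException;
import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;

/**
 * Hypothetical analyzer (not part of Lucene) sketching the caching idiom the
 * patch fixes: setters must clear the per-thread cached stream, otherwise
 * reusableTokenStream() keeps serving a chain built with the old settings.
 */
public class ConfigurableStopAnalyzer extends Analyzer {
  private Set stoptable = StopFilter.makeStopSet(new String[0]);

  /** Replacing the stop set takes effect immediately only because the cache is cleared. */
  public void setStopWords(String[] stopwords) {
    stoptable = StopFilter.makeStopSet(stopwords);
    setPreviousTokenStream(null); // force a new StopFilter to be created
  }

  public TokenStream tokenStream(String fieldName, Reader reader) {
    return new StopFilter(new StandardTokenizer(reader), stoptable);
  }

  /** Per-thread cached tokenizer/filter chain, mirroring DutchAnalyzer's SavedStreams. */
  private static class SavedStreams {
    Tokenizer source;
    TokenStream result;
  }

  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
      // first use on this thread, or a setter cleared the cache:
      // build a fresh chain with the current stop set
      streams = new SavedStreams();
      streams.source = new StandardTokenizer(reader);
      streams.result = new StopFilter(streams.source, stoptable);
      setPreviousTokenStream(streams);
    } else {
      // reuse the cached chain; without setPreviousTokenStream(null) in the
      // setter above, this branch would keep filtering with a stale stop set
      streams.source.reset(reader);
    }
    return streams.result;
  }
}

Dropping the setPreviousTokenStream(null) call in the setter reproduces the stale-configuration behavior that the reuse tests added in this patch guard against.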