LUCENE-1794: Ensure analyzer options are applied immediately when using reusable token streams

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@805766 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2009-08-19 11:56:31 +00:00
parent 39ada16b9c
commit 58cd4a04d7
12 changed files with 125 additions and 5 deletions

View File

@ -111,18 +111,21 @@ public final class BrazilianAnalyzer extends Analyzer {
*/
public void setStemExclusionTable( String[] exclusionlist ) {
// Build the exclusion set from the given words.
excltable = StopFilter.makeStopSet( exclusionlist );
// Discard the cached (reusable) token stream so the new exclusion set
// takes effect immediately on the next analysis (LUCENE-1794).
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**
* Builds an exclusionlist from a {@link Map}.
* Only the map's keys are used as the excluded words.
*/
public void setStemExclusionTable( Map exclusionlist ) {
excltable = new HashSet(exclusionlist.keySet());
// Discard the cached (reusable) token stream so the new exclusion set
// takes effect immediately on the next analysis (LUCENE-1794).
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**
* Builds an exclusionlist from the words contained in the given file.
* @throws IOException if the exclusion list file cannot be read
*/
public void setStemExclusionTable( File exclusionlist ) throws IOException {
excltable = WordlistLoader.getWordSet( exclusionlist );
// Discard the cached (reusable) token stream so the new exclusion set
// takes effect immediately on the next analysis (LUCENE-1794).
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**

View File

@ -100,6 +100,7 @@ public final class CzechAnalyzer extends Analyzer {
* @param encoding Encoding used (win-1250, iso-8859-2, ...), null for default system encoding
*/
public void loadStopWords( InputStream wordfile, String encoding ) {
setPreviousTokenStream(null); // force a new stopfilter to be created
if ( wordfile == null ) {
stoptable = new HashSet();
return;
@ -121,7 +122,9 @@ public final class CzechAnalyzer extends Analyzer {
}
} catch ( IOException e ) {
stoptable = null;
// clear any previous table (if present)
// TODO: throw IOException
stoptable = new HashSet();
}
}

View File

@ -114,6 +114,7 @@ public class GermanAnalyzer extends Analyzer {
*/
public void setStemExclusionTable(String[] exclusionlist) {
// Build the exclusion set from the given words.
exclusionSet = StopFilter.makeStopSet(exclusionlist);
// Invalidate the cached (reusable) token stream so the new exclusion
// set is picked up immediately on reuse (LUCENE-1794).
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**
@ -121,6 +122,7 @@ public class GermanAnalyzer extends Analyzer {
*/
public void setStemExclusionTable(Map exclusionlist) {
// Only the map's keys are used as the excluded words.
exclusionSet = new HashSet(exclusionlist.keySet());
// Invalidate the cached (reusable) token stream so the new exclusion
// set is picked up immediately on reuse (LUCENE-1794).
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**
@ -128,6 +130,7 @@ public class GermanAnalyzer extends Analyzer {
*/
public void setStemExclusionTable(File exclusionlist) throws IOException {
// Load the exclusion words from the given file.
exclusionSet = WordlistLoader.getWordSet(exclusionlist);
// Invalidate the cached (reusable) token stream so the new exclusion
// set is picked up immediately on reuse (LUCENE-1794).
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**

View File

@ -111,6 +111,7 @@ public final class FrenchAnalyzer extends Analyzer {
*/
public void setStemExclusionTable(String[] exclusionlist) {
// Build the exclusion set from the given words.
excltable = StopFilter.makeStopSet(exclusionlist);
// Invalidate the cached (reusable) token stream so the new exclusion
// set is picked up immediately on reuse (LUCENE-1794).
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**
@ -118,6 +119,7 @@ public final class FrenchAnalyzer extends Analyzer {
*/
public void setStemExclusionTable(Map exclusionlist) {
// Only the map's keys are used as the excluded words.
excltable = new HashSet(exclusionlist.keySet());
// Invalidate the cached (reusable) token stream so the new exclusion
// set is picked up immediately on reuse (LUCENE-1794).
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**
@ -126,6 +128,7 @@ public final class FrenchAnalyzer extends Analyzer {
*/
public void setStemExclusionTable(File exclusionlist) throws IOException {
// Load the exclusion words from the given file into a fresh set.
excltable = new HashSet(WordlistLoader.getWordSet(exclusionlist));
// Invalidate the cached (reusable) token stream so the new exclusion
// set is picked up immediately on reuse (LUCENE-1794).
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**

View File

@ -131,6 +131,7 @@ public class DutchAnalyzer extends Analyzer {
*/
public void setStemExclusionTable(String[] exclusionlist) {
// Build the exclusion set from the given words.
excltable = StopFilter.makeStopSet(exclusionlist);
// Invalidate the cached (reusable) token stream so the new exclusion
// set is picked up immediately on reuse (LUCENE-1794).
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**
@ -138,6 +139,7 @@ public class DutchAnalyzer extends Analyzer {
*/
public void setStemExclusionTable(HashSet exclusionlist) {
// NOTE(review): stores the caller's set directly (no defensive copy);
// later external modifications to the set would affect this analyzer —
// confirm that this aliasing is intended.
excltable = exclusionlist;
// Invalidate the cached (reusable) token stream so the new exclusion
// set is picked up immediately on reuse (LUCENE-1794).
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**
@ -146,6 +148,7 @@ public class DutchAnalyzer extends Analyzer {
public void setStemExclusionTable(File exclusionlist) {
try {
excltable = org.apache.lucene.analysis.WordlistLoader.getWordSet(exclusionlist);
setPreviousTokenStream(null); // force a new stemmer to be created
} catch (IOException e) {
// TODO: throw IOException
throw new RuntimeException(e);
@ -160,6 +163,7 @@ public class DutchAnalyzer extends Analyzer {
public void setStemDictionary(File stemdictFile) {
try {
stemdict = org.apache.lucene.analysis.WordlistLoader.getStemDict(stemdictFile);
setPreviousTokenStream(null); // force a new stemmer to be created
} catch (IOException e) {
// TODO: throw IOException
throw new RuntimeException(e);
@ -210,7 +214,7 @@ public class DutchAnalyzer extends Analyzer {
streams.source = new StandardTokenizer(reader);
streams.result = new StandardFilter(streams.source);
streams.result = new StopFilter(streams.result, stoptable);
streams.result = new DutchStemFilter(streams.result, excltable);
streams.result = new DutchStemFilter(streams.result, excltable, stemdict);
setPreviousTokenStream(streams);
} else {
streams.source.reset(reader);

View File

@ -139,6 +139,17 @@ public class TestBrazilianStemmer extends TestCase {
a.setStemExclusionTable(new String[] { "quintessência" });
checkReuse(a, "quintessência", "quintessência"); // excluded words will be completely unchanged.
}
/*
 * Verifies that a newly-set exclusion table takes effect right away,
 * even when the analyzer is reusing a cached token stream.
 */
public void testExclusionTableReuse() throws Exception {
    BrazilianAnalyzer analyzer = new BrazilianAnalyzer();
    // Without an exclusion entry the word is stemmed.
    checkReuse(analyzer, "quintessência", "quintessente");
    analyzer.setStemExclusionTable(new String[] { "quintessência" });
    // Once excluded, the same word must come through unstemmed.
    checkReuse(analyzer, "quintessência", "quintessência");
}
private void check(final String input, final String expected) throws IOException {
Analyzer analyzer = new BrazilianAnalyzer();

View File

@ -17,6 +17,10 @@ package org.apache.lucene.analysis.cz;
* limitations under the License.
*/
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import junit.framework.TestCase;
@ -32,17 +36,55 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
*
*/
public class TestCzechAnalyzer extends TestCase {
File dataDir = new File(System.getProperty("dataDir", "./bin"));
File customStopFile = new File(dataDir, "org/apache/lucene/analysis/cz/customStopWordFile.txt");
public void testStopWord() throws Exception {
// "Pokud" and "o" must be removed by the default stop filter.
assertAnalyzesTo(new CzechAnalyzer(), "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" });
}
public void testReusableTokenStream() throws Exception {
    // Two successive analyses must both work through the reusable-stream path.
    Analyzer cz = new CzechAnalyzer();
    assertAnalyzesToReuse(cz, "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" });
    assertAnalyzesToReuse(cz, "Česká Republika", new String[] { "česká", "republika" });
}
/*
* An input stream that always throws IOException for testing.
*/
private class UnreliableInputStream extends InputStream {
public int read() throws IOException {
throw new IOException();
}
}
/*
 * The loadStopWords method does not throw IOException on error;
 * previously it set the stoptable to null (versus empty), which
 * would cause an NPE when it is time to create the StopFilter.
 */
public void testInvalidStopWordFile() throws Exception {
CzechAnalyzer cz = new CzechAnalyzer();
// Loading from a stream that always fails must leave the analyzer
// usable with an empty stop set: no words are filtered out below.
cz.loadStopWords(new UnreliableInputStream(), "UTF-8");
assertAnalyzesTo(cz, "Pokud mluvime o volnem",
new String[] { "pokud", "mluvime", "o", "volnem" });
}
/*
 * Test that changes to the stop table via loadStopWords are applied immediately
 * when using reusable token streams.
 */
public void testStopWordFileReuse() throws Exception {
    CzechAnalyzer cz = new CzechAnalyzer();
    // With the default stop words, neither term is filtered.
    assertAnalyzesToReuse(cz, "Česká Republika",
        new String[] { "česká", "republika" });
    // Load a custom stop word file that contains "republika".
    InputStream stopwords = new FileInputStream(customStopFile);
    try {
        cz.loadStopWords(stopwords, "UTF-8");
    } finally {
        // Don't leak the file handle (closing an already-closed
        // FileInputStream is harmless).
        stopwords.close();
    }
    // The new stop table must be applied to the reused token stream.
    assertAnalyzesToReuse(cz, "Česká Republika", new String[] { "česká" });
}
private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
TermAttribute text = (TermAttribute) ts.getAttribute(TermAttribute.class);

View File

@ -0,0 +1,3 @@
examplestopword
anotherexamplestopword
republika

View File

@ -89,6 +89,17 @@ public class TestGermanStemFilter extends TestCase {
checkReuse(new GermanSubclassAnalyzer(), "Tischen", "Tischen");
}
/*
 * Updating the exclusion table must affect even a previously-used
 * (cached, reusable) token stream right away.
 */
public void testExclusionTableReuse() throws Exception {
    GermanAnalyzer analyzer = new GermanAnalyzer();
    // Normally "tischen" is stemmed to "tisch".
    checkReuse(analyzer, "tischen", "tisch");
    analyzer.setStemExclusionTable(new String[] { "tischen" });
    // After exclusion it must pass through unchanged.
    checkReuse(analyzer, "tischen", "tischen");
}
private void check(final String input, final String expected) throws IOException {
Analyzer a = new GermanAnalyzer();
TokenStream tokenStream = a.tokenStream("dummy", new StringReader(input));

View File

@ -221,4 +221,14 @@ public class TestFrenchAnalyzer extends TestCase {
"captif" });
}
/*
 * Changing the exclusion table must be visible immediately through
 * the reusable token stream path.
 */
public void testExclusionTableReuse() throws Exception {
    FrenchAnalyzer analyzer = new FrenchAnalyzer();
    // Normally "habitable" is stemmed to "habit".
    assertAnalyzesToReuse(analyzer, "habitable", new String[] { "habit" });
    analyzer.setStemExclusionTable(new String[] { "habitable" });
    // After exclusion it must pass through unchanged.
    assertAnalyzesToReuse(analyzer, "habitable", new String[] { "habitable" });
}
}

View File

@ -17,6 +17,7 @@ package org.apache.lucene.analysis.nl;
* limitations under the License.
*/
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
@ -35,6 +36,8 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
*
*/
public class TestDutchStemmer extends TestCase {
File dataDir = new File(System.getProperty("dataDir", "./bin"));
File customDictFile = new File(dataDir, "org/apache/lucene/analysis/nl/customStemDict.txt");
public void testWithSnowballExamples() throws IOException {
check("lichaamsziek", "lichaamsziek");
@ -144,7 +147,28 @@ public class TestDutchStemmer extends TestCase {
checkReuse(a, "lichamelijkheden", "lichamelijkheden");
}
/*
 * A freshly-set exclusion table must apply at once, even with a
 * reusable token stream already cached.
 */
public void testExclusionTableReuse() throws Exception {
    DutchAnalyzer analyzer = new DutchAnalyzer();
    // Normally "lichamelijk" is stemmed to "licham".
    checkReuse(analyzer, "lichamelijk", "licham");
    analyzer.setStemExclusionTable(new String[] { "lichamelijk" });
    // After exclusion it must pass through unchanged.
    checkReuse(analyzer, "lichamelijk", "lichamelijk");
}
/*
 * A freshly-set stem dictionary must apply at once, even with a
 * reusable token stream already cached.
 */
public void testStemDictionaryReuse() throws Exception {
    DutchAnalyzer analyzer = new DutchAnalyzer();
    // Default algorithmic stemming.
    checkReuse(analyzer, "lichamelijk", "licham");
    analyzer.setStemDictionary(customDictFile);
    // The custom dictionary entry must now drive the stemming.
    checkReuse(analyzer, "lichamelijk", "somethingentirelydifferent");
}
private void check(final String input, final String expected) throws IOException {
Analyzer analyzer = new DutchAnalyzer();
TokenStream stream = analyzer.tokenStream("dummy", new StringReader(input));

View File

@ -0,0 +1,3 @@
lichamelijk somethingentirelydifferent
lichamelijke licham
lichamelijkheden licham