mirror of https://github.com/apache/lucene.git
LUCENE-1794: Ensure analyzer options are applied immediately when using reusable token streams
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@805766 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
39ada16b9c
commit
58cd4a04d7
|
@ -111,18 +111,21 @@ public final class BrazilianAnalyzer extends Analyzer {
|
|||
*/
|
||||
public void setStemExclusionTable( String[] exclusionlist ) {
|
||||
excltable = StopFilter.makeStopSet( exclusionlist );
|
||||
setPreviousTokenStream(null); // force a new stemmer to be created
|
||||
}
|
||||
/**
|
||||
* Builds an exclusionlist from a {@link Map}.
|
||||
*/
|
||||
public void setStemExclusionTable( Map exclusionlist ) {
|
||||
excltable = new HashSet(exclusionlist.keySet());
|
||||
setPreviousTokenStream(null); // force a new stemmer to be created
|
||||
}
|
||||
/**
|
||||
* Builds an exclusionlist from the words contained in the given file.
|
||||
*/
|
||||
public void setStemExclusionTable( File exclusionlist ) throws IOException {
|
||||
excltable = WordlistLoader.getWordSet( exclusionlist );
|
||||
setPreviousTokenStream(null); // force a new stemmer to be created
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -100,6 +100,7 @@ public final class CzechAnalyzer extends Analyzer {
|
|||
* @param encoding Encoding used (win-1250, iso-8859-2, ...), null for default system encoding
|
||||
*/
|
||||
public void loadStopWords( InputStream wordfile, String encoding ) {
|
||||
setPreviousTokenStream(null); // force a new stopfilter to be created
|
||||
if ( wordfile == null ) {
|
||||
stoptable = new HashSet();
|
||||
return;
|
||||
|
@ -121,7 +122,9 @@ public final class CzechAnalyzer extends Analyzer {
|
|||
}
|
||||
|
||||
} catch ( IOException e ) {
|
||||
stoptable = null;
|
||||
// clear any previous table (if present)
|
||||
// TODO: throw IOException
|
||||
stoptable = new HashSet();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -114,6 +114,7 @@ public class GermanAnalyzer extends Analyzer {
|
|||
*/
|
||||
public void setStemExclusionTable(String[] exclusionlist) {
|
||||
exclusionSet = StopFilter.makeStopSet(exclusionlist);
|
||||
setPreviousTokenStream(null); // force a new stemmer to be created
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -121,6 +122,7 @@ public class GermanAnalyzer extends Analyzer {
|
|||
*/
|
||||
public void setStemExclusionTable(Map exclusionlist) {
|
||||
exclusionSet = new HashSet(exclusionlist.keySet());
|
||||
setPreviousTokenStream(null); // force a new stemmer to be created
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -128,6 +130,7 @@ public class GermanAnalyzer extends Analyzer {
|
|||
*/
|
||||
public void setStemExclusionTable(File exclusionlist) throws IOException {
|
||||
exclusionSet = WordlistLoader.getWordSet(exclusionlist);
|
||||
setPreviousTokenStream(null); // force a new stemmer to be created
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -111,6 +111,7 @@ public final class FrenchAnalyzer extends Analyzer {
|
|||
*/
|
||||
public void setStemExclusionTable(String[] exclusionlist) {
|
||||
excltable = StopFilter.makeStopSet(exclusionlist);
|
||||
setPreviousTokenStream(null); // force a new stemmer to be created
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -118,6 +119,7 @@ public final class FrenchAnalyzer extends Analyzer {
|
|||
*/
|
||||
public void setStemExclusionTable(Map exclusionlist) {
|
||||
excltable = new HashSet(exclusionlist.keySet());
|
||||
setPreviousTokenStream(null); // force a new stemmer to be created
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -126,6 +128,7 @@ public final class FrenchAnalyzer extends Analyzer {
|
|||
*/
|
||||
public void setStemExclusionTable(File exclusionlist) throws IOException {
|
||||
excltable = new HashSet(WordlistLoader.getWordSet(exclusionlist));
|
||||
setPreviousTokenStream(null); // force a new stemmer to be created
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -131,6 +131,7 @@ public class DutchAnalyzer extends Analyzer {
|
|||
*/
|
||||
public void setStemExclusionTable(String[] exclusionlist) {
|
||||
excltable = StopFilter.makeStopSet(exclusionlist);
|
||||
setPreviousTokenStream(null); // force a new stemmer to be created
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -138,6 +139,7 @@ public class DutchAnalyzer extends Analyzer {
|
|||
*/
|
||||
public void setStemExclusionTable(HashSet exclusionlist) {
|
||||
excltable = exclusionlist;
|
||||
setPreviousTokenStream(null); // force a new stemmer to be created
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -146,6 +148,7 @@ public class DutchAnalyzer extends Analyzer {
|
|||
public void setStemExclusionTable(File exclusionlist) {
|
||||
try {
|
||||
excltable = org.apache.lucene.analysis.WordlistLoader.getWordSet(exclusionlist);
|
||||
setPreviousTokenStream(null); // force a new stemmer to be created
|
||||
} catch (IOException e) {
|
||||
// TODO: throw IOException
|
||||
throw new RuntimeException(e);
|
||||
|
@ -160,6 +163,7 @@ public class DutchAnalyzer extends Analyzer {
|
|||
public void setStemDictionary(File stemdictFile) {
|
||||
try {
|
||||
stemdict = org.apache.lucene.analysis.WordlistLoader.getStemDict(stemdictFile);
|
||||
setPreviousTokenStream(null); // force a new stemmer to be created
|
||||
} catch (IOException e) {
|
||||
// TODO: throw IOException
|
||||
throw new RuntimeException(e);
|
||||
|
@ -210,7 +214,7 @@ public class DutchAnalyzer extends Analyzer {
|
|||
streams.source = new StandardTokenizer(reader);
|
||||
streams.result = new StandardFilter(streams.source);
|
||||
streams.result = new StopFilter(streams.result, stoptable);
|
||||
streams.result = new DutchStemFilter(streams.result, excltable);
|
||||
streams.result = new DutchStemFilter(streams.result, excltable, stemdict);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
|
|
|
@ -140,6 +140,17 @@ public class TestBrazilianStemmer extends TestCase {
|
|||
checkReuse(a, "quintessência", "quintessência"); // excluded words will be completely unchanged.
|
||||
}
|
||||
|
||||
/*
|
||||
* Test that changes to the exclusion table are applied immediately
|
||||
* when using reusable token streams.
|
||||
*/
|
||||
public void testExclusionTableReuse() throws Exception {
|
||||
BrazilianAnalyzer a = new BrazilianAnalyzer();
|
||||
checkReuse(a, "quintessência", "quintessente");
|
||||
a.setStemExclusionTable(new String[] { "quintessência" });
|
||||
checkReuse(a, "quintessência", "quintessência");
|
||||
}
|
||||
|
||||
private void check(final String input, final String expected) throws IOException {
|
||||
Analyzer analyzer = new BrazilianAnalyzer();
|
||||
TokenStream stream = analyzer.tokenStream("dummy", new StringReader(input));
|
||||
|
|
|
@ -17,6 +17,10 @@ package org.apache.lucene.analysis.cz;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.StringReader;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
@ -32,6 +36,8 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
|||
*
|
||||
*/
|
||||
public class TestCzechAnalyzer extends TestCase {
|
||||
File dataDir = new File(System.getProperty("dataDir", "./bin"));
|
||||
File customStopFile = new File(dataDir, "org/apache/lucene/analysis/cz/customStopWordFile.txt");
|
||||
|
||||
public void testStopWord() throws Exception {
|
||||
assertAnalyzesTo(new CzechAnalyzer(), "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" });
|
||||
|
@ -43,6 +49,42 @@ public class TestCzechAnalyzer extends TestCase {
|
|||
assertAnalyzesToReuse(analyzer, "Česká Republika", new String[] { "česká", "republika" });
|
||||
}
|
||||
|
||||
/*
|
||||
* An input stream that always throws IOException for testing.
|
||||
*/
|
||||
private class UnreliableInputStream extends InputStream {
|
||||
public int read() throws IOException {
|
||||
throw new IOException();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* The loadStopWords method does not throw IOException on error,
|
||||
* instead previously it set the stoptable to null (versus empty)
|
||||
* this would cause a NPE when it is time to create the StopFilter.
|
||||
*/
|
||||
public void testInvalidStopWordFile() throws Exception {
|
||||
CzechAnalyzer cz = new CzechAnalyzer();
|
||||
cz.loadStopWords(new UnreliableInputStream(), "UTF-8");
|
||||
assertAnalyzesTo(cz, "Pokud mluvime o volnem",
|
||||
new String[] { "pokud", "mluvime", "o", "volnem" });
|
||||
}
|
||||
|
||||
/*
|
||||
* Test that changes to the stop table via loadStopWords are applied immediately
|
||||
* when using reusable token streams.
|
||||
*/
|
||||
public void testStopWordFileReuse() throws Exception {
|
||||
CzechAnalyzer cz = new CzechAnalyzer();
|
||||
assertAnalyzesToReuse(cz, "Česká Republika",
|
||||
new String[] { "česká", "republika" });
|
||||
|
||||
InputStream stopwords = new FileInputStream(customStopFile);
|
||||
cz.loadStopWords(stopwords, "UTF-8");
|
||||
|
||||
assertAnalyzesToReuse(cz, "Česká Republika", new String[] { "česká" });
|
||||
}
|
||||
|
||||
private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
|
||||
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
|
||||
TermAttribute text = (TermAttribute) ts.getAttribute(TermAttribute.class);
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
examplestopword
|
||||
anotherexamplestopword
|
||||
republika
|
|
@ -89,6 +89,17 @@ public class TestGermanStemFilter extends TestCase {
|
|||
checkReuse(new GermanSubclassAnalyzer(), "Tischen", "Tischen");
|
||||
}
|
||||
|
||||
/*
|
||||
* Test that changes to the exclusion table are applied immediately
|
||||
* when using reusable token streams.
|
||||
*/
|
||||
public void testExclusionTableReuse() throws Exception {
|
||||
GermanAnalyzer a = new GermanAnalyzer();
|
||||
checkReuse(a, "tischen", "tisch");
|
||||
a.setStemExclusionTable(new String[] { "tischen" });
|
||||
checkReuse(a, "tischen", "tischen");
|
||||
}
|
||||
|
||||
private void check(final String input, final String expected) throws IOException {
|
||||
Analyzer a = new GermanAnalyzer();
|
||||
TokenStream tokenStream = a.tokenStream("dummy", new StringReader(input));
|
||||
|
|
|
@ -221,4 +221,14 @@ public class TestFrenchAnalyzer extends TestCase {
|
|||
"captif" });
|
||||
}
|
||||
|
||||
/*
|
||||
* Test that changes to the exclusion table are applied immediately
|
||||
* when using reusable token streams.
|
||||
*/
|
||||
public void testExclusionTableReuse() throws Exception {
|
||||
FrenchAnalyzer fa = new FrenchAnalyzer();
|
||||
assertAnalyzesToReuse(fa, "habitable", new String[] { "habit" });
|
||||
fa.setStemExclusionTable(new String[] { "habitable" });
|
||||
assertAnalyzesToReuse(fa, "habitable", new String[] { "habitable" });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,6 +17,7 @@ package org.apache.lucene.analysis.nl;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
@ -35,6 +36,8 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
|||
*
|
||||
*/
|
||||
public class TestDutchStemmer extends TestCase {
|
||||
File dataDir = new File(System.getProperty("dataDir", "./bin"));
|
||||
File customDictFile = new File(dataDir, "org/apache/lucene/analysis/nl/customStemDict.txt");
|
||||
|
||||
public void testWithSnowballExamples() throws IOException {
|
||||
check("lichaamsziek", "lichaamsziek");
|
||||
|
@ -144,6 +147,27 @@ public class TestDutchStemmer extends TestCase {
|
|||
checkReuse(a, "lichamelijkheden", "lichamelijkheden");
|
||||
}
|
||||
|
||||
/*
|
||||
* Test that changes to the exclusion table are applied immediately
|
||||
* when using reusable token streams.
|
||||
*/
|
||||
public void testExclusionTableReuse() throws Exception {
|
||||
DutchAnalyzer a = new DutchAnalyzer();
|
||||
checkReuse(a, "lichamelijk", "licham");
|
||||
a.setStemExclusionTable(new String[] { "lichamelijk" });
|
||||
checkReuse(a, "lichamelijk", "lichamelijk");
|
||||
}
|
||||
|
||||
/*
|
||||
* Test that changes to the dictionary stemming table are applied immediately
|
||||
* when using reusable token streams.
|
||||
*/
|
||||
public void testStemDictionaryReuse() throws Exception {
|
||||
DutchAnalyzer a = new DutchAnalyzer();
|
||||
checkReuse(a, "lichamelijk", "licham");
|
||||
a.setStemDictionary(customDictFile);
|
||||
checkReuse(a, "lichamelijk", "somethingentirelydifferent");
|
||||
}
|
||||
|
||||
private void check(final String input, final String expected) throws IOException {
|
||||
Analyzer analyzer = new DutchAnalyzer();
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
lichamelijk somethingentirelydifferent
|
||||
lichamelijke licham
|
||||
lichamelijkheden licham
|
Loading…
Reference in New Issue