SOLR-1026: Add protected words support to SnowballPorterFilterFactory. Deprecated EnglishPorterFilterFactory and switched example to use English SnowballPorterFilterFactory instead

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@746122 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Erik Hatcher 2009-02-20 03:23:58 +00:00
parent f11377a06c
commit 51d709de3c
4 changed files with 191 additions and 6 deletions

View File

@ -178,7 +178,7 @@
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
<analyzer type="query">
@ -191,7 +191,7 @@
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
@ -206,7 +206,7 @@
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>

View File

@ -31,6 +31,8 @@ import java.util.List;
/**
* @version $Id$
*
* @deprecated Use SnowballPortFilterFactory with language="English" instead
*/
public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
public static final String PROTECTED_TOKENS = "protected";

View File

@ -17,9 +17,18 @@
package org.apache.solr.analysis;
import java.util.Map;
import java.util.List;
import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.tartarus.snowball.SnowballProgram;
/**
@ -30,10 +39,40 @@ import org.tartarus.snowball.SnowballProgram;
*
* @version $Id$
*/
public class SnowballPorterFilterFactory extends BaseTokenFilterFactory {
public class SnowballPorterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
public static final String PROTECTED_TOKENS = "protected";
private String language = "English";
private Class stemClass;
public void inform(ResourceLoader loader) {
String wordFiles = args.get(PROTECTED_TOKENS);
if (wordFiles != null) {
try {
File protectedWordFiles = new File(wordFiles);
if (protectedWordFiles.exists()) {
List<String> wlist = loader.getLines(wordFiles);
//This cast is safe in Lucene
protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
} else {
List<String> files = StrUtils.splitFileNames(wordFiles);
for (String file : files) {
List<String> wlist = loader.getLines(file.trim());
if (protectedWords == null)
protectedWords = new CharArraySet(wlist, false);
else
protectedWords.addAll(wlist);
}
}
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
private CharArraySet protectedWords = null;
@Override
public void init(Map<String, String> args) {
super.init(args);
@ -47,14 +86,61 @@ public class SnowballPorterFilterFactory extends BaseTokenFilterFactory {
}
}
public SnowballFilter create(TokenStream input) {
public SnowballPorterFilter create(TokenStream input) {
SnowballProgram program;
try {
program = (SnowballProgram)stemClass.newInstance();
} catch (Exception e) {
throw new RuntimeException("Error instantiating stemmer for language " + language + "from class " +stemClass, e);
}
return new SnowballFilter(input, program);
return new SnowballPorterFilter(input, program, protectedWords);
}
}
class SnowballPorterFilter extends TokenFilter {
private final CharArraySet protWords;
private SnowballProgram stemmer;
public SnowballPorterFilter(TokenStream source, SnowballProgram stemmer, CharArraySet protWords) {
super(source);
this.protWords = protWords;
this.stemmer = stemmer;
}
/**
* the original code from lucene sandbox
* public final Token next() throws IOException {
* Token token = input.next();
* if (token == null)
* return null;
* stemmer.setCurrent(token.termText());
* try {
* stemMethod.invoke(stemmer, EMPTY_ARGS);
* } catch (Exception e) {
* throw new RuntimeException(e.toString());
* }
* return new Token(stemmer.getCurrent(),
* token.startOffset(), token.endOffset(), token.type());
* }
*/
@Override
public Token next(Token token) throws IOException {
Token result = input.next(token);
if (result != null) {
char[] termBuffer = result.termBuffer();
int len = result.termLength();
// if protected, don't stem. use this to avoid stemming collisions.
if (protWords != null && protWords.contains(termBuffer, 0, len)) {
return result;
}
stemmer.setCurrent(new String(termBuffer, 0, len));//ugh, wish the Stemmer took a char array
stemmer.stem();
String newstr = stemmer.getCurrent();
result.setTermBuffer(newstr.toCharArray(), 0, newstr.length());
}
return result;
}
}

View File

@ -0,0 +1,97 @@
package org.apache.solr.analysis;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.solr.common.ResourceLoader;
import org.tartarus.snowball.ext.EnglishStemmer;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.ArrayList;
import java.util.Collections;
public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
public void test() throws IOException {
EnglishStemmer stemmer = new EnglishStemmer();
String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
StringBuilder gold = new StringBuilder();
for (String aTest : test) {
stemmer.setCurrent(aTest);
stemmer.stem();
gold.append(stemmer.getCurrent()).append(' ');
}
SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
Map<String, String> args = new HashMap<String, String>();
args.put("language", "English");
factory.init(args);
factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
String out = tsToString(factory.create(new IterTokenStream(test)));
assertEquals(gold.toString().trim(), out);
}
public void testProtected() throws Exception {
EnglishStemmer stemmer = new EnglishStemmer();
String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
StringBuilder gold = new StringBuilder();
for (int i = 0; i < test.length; i++) {
if (test[i].equals("fledgling") == false && test[i].equals("banks") == false) {
stemmer.setCurrent(test[i]);
stemmer.stem();
gold.append(stemmer.getCurrent()).append(' ');
} else {
gold.append(test[i]).append(' ');
}
}
EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
Map<String, String> args = new HashMap<String, String>();
args.put(SnowballPorterFilterFactory.PROTECTED_TOKENS, "who-cares.txt");
factory.init(args);
List<String> lines = new ArrayList<String>();
Collections.addAll(lines, "banks", "fledgling");
factory.inform(new LinesMockSolrResourceLoader(lines));
String out = tsToString(factory.create(new IterTokenStream(test)));
assertEquals(gold.toString().trim(), out);
}
class LinesMockSolrResourceLoader implements ResourceLoader {
List<String> lines;
LinesMockSolrResourceLoader(List<String> lines) {
this.lines = lines;
}
public List<String> getLines(String resource) throws IOException {
return lines;
}
public Object newInstance(String cname, String... subpackages) {
return null;
}
public InputStream openResource(String resource) throws IOException {
return null;
}
}
}