mirror of https://github.com/apache/lucene.git
SOLR-1026: Add protected words support to SnowballPorterFilterFactory. Deprecated EnglishPorterFilterFactory and switched example to use English SnowballPorterFilterFactory instead
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@746122 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f11377a06c
commit
51d709de3c
|
@ -178,7 +178,7 @@
|
|||
/>
|
||||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
|
||||
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
|
||||
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
|
@ -191,7 +191,7 @@
|
|||
/>
|
||||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
|
||||
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
|
||||
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
@ -206,7 +206,7 @@
|
|||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
|
||||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
|
||||
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
|
||||
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
|
|
@ -31,6 +31,8 @@ import java.util.List;
|
|||
|
||||
/**
|
||||
* @version $Id$
|
||||
*
|
||||
* @deprecated Use SnowballPortFilterFactory with language="English" instead
|
||||
*/
|
||||
public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
|
||||
public static final String PROTECTED_TOKENS = "protected";
|
||||
|
|
|
@ -17,9 +17,18 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.List;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
import org.apache.solr.common.util.StrUtils;
|
||||
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||
import org.tartarus.snowball.SnowballProgram;
|
||||
|
||||
/**
|
||||
|
@ -30,10 +39,40 @@ import org.tartarus.snowball.SnowballProgram;
|
|||
*
|
||||
* @version $Id$
|
||||
*/
|
||||
public class SnowballPorterFilterFactory extends BaseTokenFilterFactory {
|
||||
public class SnowballPorterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
|
||||
public static final String PROTECTED_TOKENS = "protected";
|
||||
|
||||
private String language = "English";
|
||||
private Class stemClass;
|
||||
|
||||
|
||||
public void inform(ResourceLoader loader) {
|
||||
String wordFiles = args.get(PROTECTED_TOKENS);
|
||||
if (wordFiles != null) {
|
||||
try {
|
||||
File protectedWordFiles = new File(wordFiles);
|
||||
if (protectedWordFiles.exists()) {
|
||||
List<String> wlist = loader.getLines(wordFiles);
|
||||
//This cast is safe in Lucene
|
||||
protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
|
||||
} else {
|
||||
List<String> files = StrUtils.splitFileNames(wordFiles);
|
||||
for (String file : files) {
|
||||
List<String> wlist = loader.getLines(file.trim());
|
||||
if (protectedWords == null)
|
||||
protectedWords = new CharArraySet(wlist, false);
|
||||
else
|
||||
protectedWords.addAll(wlist);
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private CharArraySet protectedWords = null;
|
||||
|
||||
@Override
|
||||
public void init(Map<String, String> args) {
|
||||
super.init(args);
|
||||
|
@ -47,14 +86,61 @@ public class SnowballPorterFilterFactory extends BaseTokenFilterFactory {
|
|||
}
|
||||
}
|
||||
|
||||
public SnowballFilter create(TokenStream input) {
|
||||
public SnowballPorterFilter create(TokenStream input) {
|
||||
SnowballProgram program;
|
||||
try {
|
||||
program = (SnowballProgram)stemClass.newInstance();
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Error instantiating stemmer for language " + language + "from class " +stemClass, e);
|
||||
}
|
||||
return new SnowballFilter(input, program);
|
||||
return new SnowballPorterFilter(input, program, protectedWords);
|
||||
}
|
||||
}
|
||||
|
||||
class SnowballPorterFilter extends TokenFilter {
|
||||
private final CharArraySet protWords;
|
||||
private SnowballProgram stemmer;
|
||||
|
||||
public SnowballPorterFilter(TokenStream source, SnowballProgram stemmer, CharArraySet protWords) {
|
||||
super(source);
|
||||
this.protWords = protWords;
|
||||
this.stemmer = stemmer;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* the original code from lucene sandbox
|
||||
* public final Token next() throws IOException {
|
||||
* Token token = input.next();
|
||||
* if (token == null)
|
||||
* return null;
|
||||
* stemmer.setCurrent(token.termText());
|
||||
* try {
|
||||
* stemMethod.invoke(stemmer, EMPTY_ARGS);
|
||||
* } catch (Exception e) {
|
||||
* throw new RuntimeException(e.toString());
|
||||
* }
|
||||
* return new Token(stemmer.getCurrent(),
|
||||
* token.startOffset(), token.endOffset(), token.type());
|
||||
* }
|
||||
*/
|
||||
|
||||
@Override
|
||||
public Token next(Token token) throws IOException {
|
||||
Token result = input.next(token);
|
||||
if (result != null) {
|
||||
char[] termBuffer = result.termBuffer();
|
||||
int len = result.termLength();
|
||||
// if protected, don't stem. use this to avoid stemming collisions.
|
||||
if (protWords != null && protWords.contains(termBuffer, 0, len)) {
|
||||
return result;
|
||||
}
|
||||
stemmer.setCurrent(new String(termBuffer, 0, len));//ugh, wish the Stemmer took a char array
|
||||
stemmer.stem();
|
||||
String newstr = stemmer.getCurrent();
|
||||
result.setTermBuffer(newstr.toCharArray(), 0, newstr.length());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,97 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
import org.tartarus.snowball.ext.EnglishStemmer;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
|
||||
public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
|
||||
|
||||
public void test() throws IOException {
|
||||
EnglishStemmer stemmer = new EnglishStemmer();
|
||||
String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
|
||||
StringBuilder gold = new StringBuilder();
|
||||
for (String aTest : test) {
|
||||
stemmer.setCurrent(aTest);
|
||||
stemmer.stem();
|
||||
gold.append(stemmer.getCurrent()).append(' ');
|
||||
}
|
||||
|
||||
SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
|
||||
Map<String, String> args = new HashMap<String, String>();
|
||||
args.put("language", "English");
|
||||
|
||||
factory.init(args);
|
||||
factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
|
||||
String out = tsToString(factory.create(new IterTokenStream(test)));
|
||||
assertEquals(gold.toString().trim(), out);
|
||||
}
|
||||
|
||||
public void testProtected() throws Exception {
|
||||
EnglishStemmer stemmer = new EnglishStemmer();
|
||||
String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
|
||||
StringBuilder gold = new StringBuilder();
|
||||
for (int i = 0; i < test.length; i++) {
|
||||
if (test[i].equals("fledgling") == false && test[i].equals("banks") == false) {
|
||||
stemmer.setCurrent(test[i]);
|
||||
stemmer.stem();
|
||||
gold.append(stemmer.getCurrent()).append(' ');
|
||||
} else {
|
||||
gold.append(test[i]).append(' ');
|
||||
}
|
||||
}
|
||||
|
||||
EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
|
||||
Map<String, String> args = new HashMap<String, String>();
|
||||
args.put(SnowballPorterFilterFactory.PROTECTED_TOKENS, "who-cares.txt");
|
||||
factory.init(args);
|
||||
List<String> lines = new ArrayList<String>();
|
||||
Collections.addAll(lines, "banks", "fledgling");
|
||||
factory.inform(new LinesMockSolrResourceLoader(lines));
|
||||
String out = tsToString(factory.create(new IterTokenStream(test)));
|
||||
assertEquals(gold.toString().trim(), out);
|
||||
}
|
||||
|
||||
class LinesMockSolrResourceLoader implements ResourceLoader {
|
||||
List<String> lines;
|
||||
|
||||
LinesMockSolrResourceLoader(List<String> lines) {
|
||||
this.lines = lines;
|
||||
}
|
||||
|
||||
public List<String> getLines(String resource) throws IOException {
|
||||
return lines;
|
||||
}
|
||||
|
||||
public Object newInstance(String cname, String... subpackages) {
|
||||
return null;
|
||||
}
|
||||
|
||||
public InputStream openResource(String resource) throws IOException {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue