SOLR-1026: Add protected words support to SnowballPorterFilterFactory. Deprecated EnglishPorterFilterFactory and switched example to use English SnowballPorterFilterFactory instead

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@746122 13f79535-47bb-0310-9956-ffa450edef68
2009-02-20 03:23:58 +00:00 · 2009-02-20 03:23:58 +00:00 · 51d709de3c
parent f11377a06c
commit 51d709de3c
4 changed files with 191 additions and 6 deletions
--- a/example/solr/conf/schema.xml
+++ b/example/solr/conf/schema.xml
@ -178,7 +178,7 @@
                />
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
      <analyzer type="query">
@ -191,7 +191,7 @@
                />
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldType>
@ -206,7 +206,7 @@
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
        <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldType>
--- a/src/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java
+++ b/src/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java
@ -31,6 +31,8 @@ import java.util.List;

 /**
 * @version $Id$
+ *
+ * @deprecated Use SnowballPortFilterFactory with language="English" instead
 */
 public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
  public static final String PROTECTED_TOKENS = "protected";
--- a/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java
+++ b/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java
@ -17,9 +17,18 @@
 package org.apache.solr.analysis;

 import java.util.Map;
+import java.util.List;
+import java.io.File;
+import java.io.IOException;

 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
 import org.tartarus.snowball.SnowballProgram;

 /**
@ -30,10 +39,40 @@ import org.tartarus.snowball.SnowballProgram;
 * 
 * @version $Id$
 */
-public class SnowballPorterFilterFactory extends BaseTokenFilterFactory {
+public class SnowballPorterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+  public static final String PROTECTED_TOKENS = "protected";
+
  private String language = "English";
  private Class stemClass;

+
+  public void inform(ResourceLoader loader) {
+    String wordFiles = args.get(PROTECTED_TOKENS);
+    if (wordFiles != null) {
+      try {
+        File protectedWordFiles = new File(wordFiles);
+        if (protectedWordFiles.exists()) {
+          List<String> wlist = loader.getLines(wordFiles);
+          //This cast is safe in Lucene
+          protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
+        } else  {
+          List<String> files = StrUtils.splitFileNames(wordFiles);
+          for (String file : files) {
+            List<String> wlist = loader.getLines(file.trim());
+            if (protectedWords == null)
+              protectedWords = new CharArraySet(wlist, false);
+            else
+              protectedWords.addAll(wlist);
+          }
+        }
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+    }
+  }
+
+  private CharArraySet protectedWords = null;
+
  @Override
  public void init(Map<String, String> args) {
    super.init(args);
@ -47,14 +86,61 @@ public class SnowballPorterFilterFactory extends BaseTokenFilterFactory {
    }
  }
  
-  public SnowballFilter create(TokenStream input) {
+  public SnowballPorterFilter create(TokenStream input) {
    SnowballProgram program;
    try {
      program = (SnowballProgram)stemClass.newInstance();
    } catch (Exception e) {
      throw new RuntimeException("Error instantiating stemmer for language " + language + "from class " +stemClass, e);
    }
-    return new SnowballFilter(input, program);
+    return new SnowballPorterFilter(input, program, protectedWords);
+  }
+}
+
+class SnowballPorterFilter extends TokenFilter {
+  private final CharArraySet protWords;
+  private SnowballProgram stemmer;
+
+  public SnowballPorterFilter(TokenStream source, SnowballProgram stemmer, CharArraySet protWords) {
+    super(source);
+    this.protWords = protWords;
+    this.stemmer = stemmer;
+  }
+
+
+  /**
+   * the original code from lucene sandbox
+   * public final Token next() throws IOException {
+   * Token token = input.next();
+   * if (token == null)
+   * return null;
+   * stemmer.setCurrent(token.termText());
+   * try {
+   * stemMethod.invoke(stemmer, EMPTY_ARGS);
+   * } catch (Exception e) {
+   * throw new RuntimeException(e.toString());
+   * }
+   * return new Token(stemmer.getCurrent(),
+   * token.startOffset(), token.endOffset(), token.type());
+   * }
+   */
+
+  @Override
+  public Token next(Token token) throws IOException {
+    Token result = input.next(token);
+    if (result != null) {
+      char[] termBuffer = result.termBuffer();
+      int len = result.termLength();
+      // if protected, don't stem.  use this to avoid stemming collisions.
+      if (protWords != null && protWords.contains(termBuffer, 0, len)) {
+        return result;
+      }
+      stemmer.setCurrent(new String(termBuffer, 0, len));//ugh, wish the Stemmer took a char array
+      stemmer.stem();
+      String newstr = stemmer.getCurrent();
+      result.setTermBuffer(newstr.toCharArray(), 0, newstr.length());
+    }
+    return result;
  }
 }

--- a/src/test/org/apache/solr/analysis/SnowballPorterFilterFactoryTest.java
+++ b/src/test/org/apache/solr/analysis/SnowballPorterFilterFactoryTest.java
@ -0,0 +1,97 @@
+package org.apache.solr.analysis;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.solr.common.ResourceLoader;
+import org.tartarus.snowball.ext.EnglishStemmer;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.ArrayList;
+import java.util.Collections;
+
+public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
+
+  public void test() throws IOException {
+    EnglishStemmer stemmer = new EnglishStemmer();
+    String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
+    StringBuilder gold = new StringBuilder();
+    for (String aTest : test) {
+      stemmer.setCurrent(aTest);
+      stemmer.stem();
+      gold.append(stemmer.getCurrent()).append(' ');
+    }
+
+    SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
+    Map<String, String> args = new HashMap<String, String>();
+    args.put("language", "English");
+
+    factory.init(args);
+    factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
+    String out = tsToString(factory.create(new IterTokenStream(test)));
+    assertEquals(gold.toString().trim(), out);
+  }
+
+  public void testProtected() throws Exception {
+    EnglishStemmer stemmer = new EnglishStemmer();
+    String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
+    StringBuilder gold = new StringBuilder();
+    for (int i = 0; i < test.length; i++) {
+      if (test[i].equals("fledgling") == false && test[i].equals("banks") == false) {
+        stemmer.setCurrent(test[i]);
+        stemmer.stem();
+        gold.append(stemmer.getCurrent()).append(' ');
+      } else {
+        gold.append(test[i]).append(' ');
+      }
+    }
+
+    EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
+    Map<String, String> args = new HashMap<String, String>();
+    args.put(SnowballPorterFilterFactory.PROTECTED_TOKENS, "who-cares.txt");
+    factory.init(args);
+    List<String> lines = new ArrayList<String>();
+    Collections.addAll(lines, "banks", "fledgling");
+    factory.inform(new LinesMockSolrResourceLoader(lines));
+    String out = tsToString(factory.create(new IterTokenStream(test)));
+    assertEquals(gold.toString().trim(), out);
+  }
+
+  class LinesMockSolrResourceLoader implements ResourceLoader {
+    List<String> lines;
+
+    LinesMockSolrResourceLoader(List<String> lines) {
+      this.lines = lines;
+    }
+
+    public List<String> getLines(String resource) throws IOException {
+      return lines;
+    }
+
+    public Object newInstance(String cname, String... subpackages) {
+      return null;
+    }
+
+    public InputStream openResource(String resource) throws IOException {
+      return null;
+    }
+  }
+}
+