SOLR-253 -- adding KeepWordFilter, the inverse of StopFilter.

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@544328 13f79535-47bb-0310-9956-ffa450edef68
2025-03-01 13:59:12 +00:00 · 2007-06-04 23:45:00 +00:00 · 2007-06-04 23:45:00 +00:00 · cc053495de
commit cc053495de
parent 407ca95deb
5 changed files with 201 additions and 3 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -32,7 +32,11 @@ New Features
 1. SOLR-69: Adding MoreLikeThisHandler to search for similar documents using
    lucene contrib/queries MoreLikeThis.  MoreLikeThis is also avaliable from
    the StandardRequestHandler using ?mlt=true. (bdelacretaz, ryan)
- 
+
+ 2. SOLR-253: Adding KeepWordFilter and KeepWordFilterFactory.  A TokenFilter
+    that keeps tokens with text in the registered keeplist.  This behaves like
+    the inverse of StopFilter. (ryan)
+
 Changes in runtime behavior

 Optimizations
--- a/src/java/org/apache/solr/analysis/KeepWordFilter.java
+++ b/src/java/org/apache/solr/analysis/KeepWordFilter.java
@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Token;
+
+import java.io.IOException;
+import java.util.Set;
+
+/**
+ * A TokenFilter that only keeps tokens with text contained in the
+ * required words.  This filter behaves like the inverse of StopFilter.
+ * 
+ * @author ryan
+ * @version $Id$
+ * @since solr 1.3
+ */
+public final class KeepWordFilter extends TokenFilter {
+  final Set<String> words;
+  final boolean ignoreCase;
+  
+  public KeepWordFilter(TokenStream in, Set<String> words, boolean ignoreCase ) {
+    super(in);
+    this.words=words;
+    this.ignoreCase=ignoreCase;
+  }
+
+  @Override
+  public final Token next() throws IOException {
+    for (Token token=input.next(); token!=null; token=input.next()) {
+      String txt = ignoreCase
+        ? token.termText().toLowerCase()
+        : token.termText();
+     
+      if( words.contains( txt ) ) {
+        return token;
+      }
+    }
+    return null;
+  }
+}
--- a/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java
+++ b/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java
@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.solr.core.Config;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+import java.util.Map;
+import java.util.List;
+import java.util.Set;
+import java.io.IOException;
+
+/**
+ * @author ryan
+ * @version $Id$
+ * @since solr 1.3
+ */
+public class KeepWordFilterFactory extends BaseTokenFilterFactory {
+
+  private Set<String> words;
+  private boolean ignoreCase;
+
+  @Override
+  @SuppressWarnings("unchecked")
+  public void init(Map<String, String> args) {
+    super.init(args);
+    String wordFile = args.get("words");
+    ignoreCase = getBoolean("ignoreCase",false);
+
+    if (wordFile != null) {
+      try {
+        List<String> wlist = Config.getLines(wordFile);
+        words = StopFilter.makeStopSet(
+            (String[])wlist.toArray(new String[0]), ignoreCase);
+      } 
+      catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+    }
+  }
+
+  /**
+   * Set the keep word list.
+   * NOTE: if ignoreCase==true, the words are expected to be lowercase
+   */
+  public void setWords(Set<String> words) {
+    this.words = words;
+  }
+
+  public void setIgnoreCase(boolean ignoreCase) {
+    this.ignoreCase = ignoreCase;
+  }
+  
+  public TokenStream create(TokenStream input) {
+    return new KeepWordFilter(input,words,ignoreCase);
+  }
+}
--- a/src/test/org/apache/solr/analysis/BaseTokenTestCase.java
+++ b/src/test/org/apache/solr/analysis/BaseTokenTestCase.java
@ -150,7 +150,6 @@ public abstract class BaseTokenTestCase extends TestCase
  // These may be useful beyond test cases...
  //------------------------------------------------------------------------

-  // This could probably be put in a utility class
  static List<Token> getTokens(TokenStream tstream) throws IOException {
    List<Token> tokens = new ArrayList<Token>();
    while (true) {
@ -161,12 +160,14 @@ public abstract class BaseTokenTestCase extends TestCase
    return tokens;
  }

-  // This could probably be put in a utility class
  public static class IterTokenStream extends TokenStream {
    Iterator<Token> toks;
    public IterTokenStream(Token... toks) {
      this.toks = Arrays.asList(toks).iterator();
    }
+    public IterTokenStream(Iterable<Token> toks) {
+      this.toks = toks.iterator();
+    }
    public IterTokenStream(Iterator<Token> toks) {
      this.toks = toks;
    }
--- a/src/test/org/apache/solr/analysis/TestKeepWordFilter.java
+++ b/src/test/org/apache/solr/analysis/TestKeepWordFilter.java
@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Token;
+
+
+/**
+ * @version $Id:$
+ */
+public class TestKeepWordFilter extends BaseTokenTestCase {
+  
+  public void testStopAndGo() throws Exception 
+  {  
+    Set<String> words = new HashSet<String>();
+    words.add( "aaa" );
+    words.add( "bbb" );
+    
+    List<Token> input = tokens( "aaa BBB ccc ddd EEE" );
+    Map<String,String> args = new HashMap<String, String>();
+
+    
+    // Test Stopwords
+    KeepWordFilterFactory factory = new KeepWordFilterFactory();
+    args.put( "ignoreCase", "true" );
+    factory.init( args );
+    factory.setWords( words );
+    
+    List<Token> expect = tokens( "aaa BBB" );
+    List<Token> real = getTokens(factory.create( new IterTokenStream(input) ));
+    assertTokEqual( expect, real );
+    
+    // Now force case
+    args.put( "ignoreCase", "false" );
+    factory.init( args );
+    
+    expect = tokens( "aaa" );
+    real = getTokens(factory.create( new IterTokenStream(input) ));
+    assertTokEqual( expect, real );
+  }
+}