SOLR-253 -- adding KeepWordFilter, the inverse of StopFilter.

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@544328 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Ryan McKinley 2007-06-04 23:45:00 +00:00
parent 407ca95deb
commit cc053495de
5 changed files with 201 additions and 3 deletions

View File

@ -32,7 +32,11 @@ New Features
1. SOLR-69: Adding MoreLikeThisHandler to search for similar documents using
lucene contrib/queries MoreLikeThis. MoreLikeThis is also avaliable from
the StandardRequestHandler using ?mlt=true. (bdelacretaz, ryan)
2. SOLR-253: Adding KeepWordFilter and KeepWordFilterFactory. A TokenFilter
that keeps tokens with text in the registered keeplist. This behaves like
the inverse of StopFilter. (ryan)
Changes in runtime behavior
Optimizations

View File

@ -0,0 +1,58 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import java.io.IOException;
import java.util.Set;
/**
* A TokenFilter that only keeps tokens with text contained in the
* required words. This filter behaves like the inverse of StopFilter.
*
* @author ryan
* @version $Id$
* @since solr 1.3
*/
public final class KeepWordFilter extends TokenFilter {
final Set<String> words;
final boolean ignoreCase;
public KeepWordFilter(TokenStream in, Set<String> words, boolean ignoreCase ) {
super(in);
this.words=words;
this.ignoreCase=ignoreCase;
}
@Override
public final Token next() throws IOException {
for (Token token=input.next(); token!=null; token=input.next()) {
String txt = ignoreCase
? token.termText().toLowerCase()
: token.termText();
if( words.contains( txt ) ) {
return token;
}
}
return null;
}
}

View File

@ -0,0 +1,73 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.solr.core.Config;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import java.util.Map;
import java.util.List;
import java.util.Set;
import java.io.IOException;
/**
* @author ryan
* @version $Id$
* @since solr 1.3
*/
public class KeepWordFilterFactory extends BaseTokenFilterFactory {
private Set<String> words;
private boolean ignoreCase;
@Override
@SuppressWarnings("unchecked")
public void init(Map<String, String> args) {
super.init(args);
String wordFile = args.get("words");
ignoreCase = getBoolean("ignoreCase",false);
if (wordFile != null) {
try {
List<String> wlist = Config.getLines(wordFile);
words = StopFilter.makeStopSet(
(String[])wlist.toArray(new String[0]), ignoreCase);
}
catch (IOException e) {
throw new RuntimeException(e);
}
}
}
/**
* Set the keep word list.
* NOTE: if ignoreCase==true, the words are expected to be lowercase
*/
public void setWords(Set<String> words) {
this.words = words;
}
public void setIgnoreCase(boolean ignoreCase) {
this.ignoreCase = ignoreCase;
}
public TokenStream create(TokenStream input) {
return new KeepWordFilter(input,words,ignoreCase);
}
}

View File

@ -150,7 +150,6 @@ public abstract class BaseTokenTestCase extends TestCase
// These may be useful beyond test cases...
//------------------------------------------------------------------------
// This could probably be put in a utility class
static List<Token> getTokens(TokenStream tstream) throws IOException {
List<Token> tokens = new ArrayList<Token>();
while (true) {
@ -161,12 +160,14 @@ public abstract class BaseTokenTestCase extends TestCase
return tokens;
}
// This could probably be put in a utility class
public static class IterTokenStream extends TokenStream {
Iterator<Token> toks;
public IterTokenStream(Token... toks) {
this.toks = Arrays.asList(toks).iterator();
}
public IterTokenStream(Iterable<Token> toks) {
this.toks = toks.iterator();
}
public IterTokenStream(Iterator<Token> toks) {
this.toks = toks;
}

View File

@ -0,0 +1,62 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.Token;
/**
* @version $Id:$
*/
public class TestKeepWordFilter extends BaseTokenTestCase {
public void testStopAndGo() throws Exception
{
Set<String> words = new HashSet<String>();
words.add( "aaa" );
words.add( "bbb" );
List<Token> input = tokens( "aaa BBB ccc ddd EEE" );
Map<String,String> args = new HashMap<String, String>();
// Test Stopwords
KeepWordFilterFactory factory = new KeepWordFilterFactory();
args.put( "ignoreCase", "true" );
factory.init( args );
factory.setWords( words );
List<Token> expect = tokens( "aaa BBB" );
List<Token> real = getTokens(factory.create( new IterTokenStream(input) ));
assertTokEqual( expect, real );
// Now force case
args.put( "ignoreCase", "false" );
factory.init( args );
expect = tokens( "aaa" );
real = getTokens(factory.create( new IterTokenStream(input) ));
assertTokEqual( expect, real );
}
}