SOLR-1095: fix stopword and keep filter performance issue

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@761036 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2009-04-01 20:07:44 +00:00
parent 8c1725beee
commit c7c9ef6a63
12 changed files with 305 additions and 33 deletions

View File

@ -191,6 +191,8 @@ New Features
33. SOLR-939: ValueSourceRangeFilter/Query - filter based on values in a FieldCache entry or on any arbitrary function of field values. (yonik)
34. SOLR-1095: Fixed performance problem in the StopFilterFactory and simplified code. Added tests as well. (gsingers)
Optimizations
----------------------

View File

@ -43,7 +43,7 @@ public final class KeepWordFilter extends TokenFilter {
@Override
public final Token next(Token in) throws IOException {
for (Token token=input.next(in); token!=null; token=input.next()) {
for (Token token=input.next(in); token!=null; token=input.next(token)) {
if( words.contains( token.termBuffer(), 0, token.termLength() ) ) {
return token;
}

View File

@ -22,6 +22,7 @@ import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.CharArraySet;
import java.util.HashSet;
import java.util.List;
@ -36,31 +37,25 @@ import java.io.IOException;
*/
public class KeepWordFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
private Set<String> words;
private CharArraySet words;
private boolean ignoreCase;
@SuppressWarnings("unchecked")
public void inform(ResourceLoader loader) {
String wordFiles = args.get("words");
ignoreCase = getBoolean("ignoreCase",false);
ignoreCase = getBoolean("ignoreCase", false);
if (wordFiles != null) {
if (words == null)
words = new HashSet<String>();
try {
java.io.File keepWordsFile = new File(wordFiles);
if (keepWordsFile.exists()) {
List<String> wlist = loader.getLines(wordFiles);
words = StopFilter.makeStopSet(
(String[])wlist.toArray(new String[0]), ignoreCase);
} else {
List<String> files = StrUtils.splitFileNames(wordFiles);
for (String file : files) {
List<String> wlist = loader.getLines(file.trim());
words.addAll(StopFilter.makeStopSet((String[])wlist.toArray(new String[0]), ignoreCase));
}
List<String> files = StrUtils.splitFileNames(wordFiles);
if (words == null && files.size() > 0){
words = new CharArraySet(files.size() * 10, ignoreCase);
}
}
for (String file : files) {
List<String> wlist = loader.getLines(file.trim());
//TODO: once StopFilter.makeStopSet(List) method is available, switch to using that so we can avoid a toArray() call
words.addAll(StopFilter.makeStopSet((String[]) wlist.toArray(new String[0]), ignoreCase));
}
}
catch (IOException e) {
throw new RuntimeException(e);
}
@ -72,15 +67,22 @@ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements Res
* NOTE: if ignoreCase==true, the words are expected to be lowercase
*/
public void setWords(Set<String> words) {
this.words = words;
this.words = new CharArraySet(words, ignoreCase);
}
public void setIgnoreCase(boolean ignoreCase) {
this.ignoreCase = ignoreCase;
}
public KeepWordFilter create(TokenStream input) {
return new KeepWordFilter(input,words,ignoreCase);
return new KeepWordFilter(input, words, ignoreCase);
}
public CharArraySet getWords() {
return words;
}
public boolean isIgnoreCase() {
return ignoreCase;
}
}

View File

@ -23,6 +23,7 @@ import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.CharArraySet;
import java.util.HashSet;
import java.util.List;
@ -42,29 +43,26 @@ public class StopFilterFactory extends BaseTokenFilterFactory implements Resourc
enablePositionIncrements = getBoolean("enablePositionIncrements",false);
if (stopWordFiles != null) {
if (stopWords == null)
stopWords = new HashSet<String>();
try {
java.io.File keepWordsFile = new File(stopWordFiles);
if (keepWordsFile.exists()) {
List<String> wlist = loader.getLines(stopWordFiles);
stopWords = StopFilter.makeStopSet((String[])wlist.toArray(new String[0]), ignoreCase);
} else {
List<String> files = StrUtils.splitFileNames(stopWordFiles);
List<String> files = StrUtils.splitFileNames(stopWordFiles);
if (stopWords == null && files.size() > 0){
//default stopwords list has 35 or so words, but maybe don't make it that big to start
stopWords = new CharArraySet(files.size() * 10, ignoreCase);
}
for (String file : files) {
List<String> wlist = loader.getLines(file.trim());
//TODO: once StopFilter.makeStopSet(List) method is available, switch to using that so we can avoid a toArray() call
stopWords.addAll(StopFilter.makeStopSet((String[])wlist.toArray(new String[0]), ignoreCase));
}
}
} catch (IOException e) {
throw new RuntimeException(e);
}
} else {
stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase);
stopWords = (CharArraySet) StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase);
}
}
private Set stopWords;
//Force the use of a char array set, as it is the most performant, although this may break things if Lucene ever goes away from it. See SOLR-1095
private CharArraySet stopWords;
private boolean ignoreCase;
private boolean enablePositionIncrements;

View File

@ -0,0 +1,65 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.solr.common.ResourceLoader;
import java.util.Set;
import java.util.Map;
import java.util.HashMap;
/**
*
*
**/
public class TestKeepFilterFactory extends AbstractSolrTestCase{
public String getSchemaFile() {
return "schema-stop-keep.xml";
}
public String getSolrConfigFile() {
return "solrconfig.xml";
}
public void testInform() throws Exception {
ResourceLoader loader = solrConfig.getResourceLoader();
assertTrue("loader is null and it shouldn't be", loader != null);
KeepWordFilterFactory factory = new KeepWordFilterFactory();
Map<String, String> args = new HashMap<String, String>();
args.put("words", "keep-1.txt");
args.put("ignoreCase", "true");
factory.init(args);
factory.inform(loader);
Set words = factory.getWords();
assertTrue("words is null and it shouldn't be", words != null);
assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
factory = new KeepWordFilterFactory();
args.put("words", "keep-1.txt, keep-2.txt");
factory.init(args);
factory.inform(loader);
words = factory.getWords();
assertTrue("words is null and it shouldn't be", words != null);
assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
}
}

View File

@ -0,0 +1,66 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.solr.common.ResourceLoader;
import java.util.Set;
import java.util.Map;
import java.util.HashMap;
/**
*
*
**/
public class TestStopFilterFactory extends AbstractSolrTestCase{
public String getSchemaFile() {
return "schema-stop-keep.xml";
}
public String getSolrConfigFile() {
return "solrconfig.xml";
}
public void testInform() throws Exception {
ResourceLoader loader = solrConfig.getResourceLoader();
assertTrue("loader is null and it shouldn't be", loader != null);
StopFilterFactory factory = new StopFilterFactory();
Map<String, String> args = new HashMap<String, String>();
args.put("words", "stop-1.txt");
args.put("ignoreCase", "true");
factory.init(args);
factory.inform(loader);
Set words = factory.getStopWords();
assertTrue("words is null and it shouldn't be", words != null);
assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);
factory = new StopFilterFactory();
args.put("words", "stop-1.txt, stop-2.txt");
factory.init(args);
factory.inform(loader);
words = factory.getStopWords();
assertTrue("words is null and it shouldn't be", words != null);
assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);
}
}

View File

@ -67,6 +67,10 @@ public class TestUtils extends TestCase {
assertEquals(2,arr.size());
assertEquals("/h/s",arr.get(0));
assertEquals("/h/,s",arr.get(1));
arr = StrUtils.splitFileNames("/h/s");
assertEquals(1,arr.size());
assertEquals("/h/s",arr.get(0));
}
public void testNamedLists()

View File

@ -0,0 +1,17 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
foo
bar

View File

@ -0,0 +1,17 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
junk
more

View File

@ -0,0 +1,67 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!--
For testing stopword configuration and keep word configuration
$Id: schema.xml 382610 2006-03-03 01:43:03Z yonik $
$Source: /cvs/main/searching/solr-configs/test/WEB-INF/classes/schema.xml,v $
$Name: $
-->
<schema name="test" version="1.0">
<types>
<fieldtype name="integer" class="solr.IntField" />
<fieldtype name="string" class="solr.StrField" sortMissingLast="true"/>
<fieldtype name="stop-one" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stop-1.txt"/>
</analyzer>
</fieldtype>
<fieldtype name="stop-two" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stop-1.txt,stop-2.txt"/>
</analyzer>
</fieldtype>
</types>
<fields>
<field name="id" type="integer" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="one" type="stop-one" indexed="true" stored="false"/>
<field name="two" type="stop-two" indexed="true" stored="false"/>
</fields>
<defaultSearchField>one</defaultSearchField>
<uniqueKey>id</uniqueKey>
</schema>

View File

@ -0,0 +1,17 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
foo
bar

View File

@ -0,0 +1,17 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
junk
more