mirror of https://github.com/apache/lucene.git
SOLR-1095: fix stopword and keep filter performance issue
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@761036 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
8c1725beee
commit
c7c9ef6a63
|
@ -191,6 +191,8 @@ New Features
|
|||
|
||||
33. SOLR-939: ValueSourceRangeFilter/Query - filter based on values in a FieldCache entry or on any arbitrary function of field values. (yonik)
|
||||
|
||||
34. SOLR-1095: Fixed performance problem in the StopFilterFactory and simplified code. Added tests as well. (gsingers)
|
||||
|
||||
|
||||
Optimizations
|
||||
----------------------
|
||||
|
|
|
@ -43,7 +43,7 @@ public final class KeepWordFilter extends TokenFilter {
|
|||
|
||||
@Override
|
||||
public final Token next(Token in) throws IOException {
|
||||
for (Token token=input.next(in); token!=null; token=input.next()) {
|
||||
for (Token token=input.next(in); token!=null; token=input.next(token)) {
|
||||
if( words.contains( token.termBuffer(), 0, token.termLength() ) ) {
|
||||
return token;
|
||||
}
|
||||
|
|
|
@ -22,6 +22,7 @@ import org.apache.solr.common.util.StrUtils;
|
|||
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
|
@ -36,31 +37,25 @@ import java.io.IOException;
|
|||
*/
|
||||
public class KeepWordFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
|
||||
|
||||
private Set<String> words;
|
||||
private CharArraySet words;
|
||||
private boolean ignoreCase;
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public void inform(ResourceLoader loader) {
|
||||
String wordFiles = args.get("words");
|
||||
ignoreCase = getBoolean("ignoreCase",false);
|
||||
|
||||
ignoreCase = getBoolean("ignoreCase", false);
|
||||
if (wordFiles != null) {
|
||||
if (words == null)
|
||||
words = new HashSet<String>();
|
||||
try {
|
||||
java.io.File keepWordsFile = new File(wordFiles);
|
||||
if (keepWordsFile.exists()) {
|
||||
List<String> wlist = loader.getLines(wordFiles);
|
||||
words = StopFilter.makeStopSet(
|
||||
(String[])wlist.toArray(new String[0]), ignoreCase);
|
||||
} else {
|
||||
List<String> files = StrUtils.splitFileNames(wordFiles);
|
||||
for (String file : files) {
|
||||
List<String> wlist = loader.getLines(file.trim());
|
||||
words.addAll(StopFilter.makeStopSet((String[])wlist.toArray(new String[0]), ignoreCase));
|
||||
}
|
||||
List<String> files = StrUtils.splitFileNames(wordFiles);
|
||||
if (words == null && files.size() > 0){
|
||||
words = new CharArraySet(files.size() * 10, ignoreCase);
|
||||
}
|
||||
}
|
||||
for (String file : files) {
|
||||
List<String> wlist = loader.getLines(file.trim());
|
||||
//TODO: once StopFilter.makeStopSet(List) method is available, switch to using that so we can avoid a toArray() call
|
||||
words.addAll(StopFilter.makeStopSet((String[]) wlist.toArray(new String[0]), ignoreCase));
|
||||
}
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
@ -72,15 +67,22 @@ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements Res
|
|||
* NOTE: if ignoreCase==true, the words are expected to be lowercase
|
||||
*/
|
||||
public void setWords(Set<String> words) {
|
||||
this.words = words;
|
||||
this.words = new CharArraySet(words, ignoreCase);
|
||||
}
|
||||
|
||||
public void setIgnoreCase(boolean ignoreCase) {
|
||||
this.ignoreCase = ignoreCase;
|
||||
}
|
||||
|
||||
|
||||
public KeepWordFilter create(TokenStream input) {
|
||||
return new KeepWordFilter(input,words,ignoreCase);
|
||||
return new KeepWordFilter(input, words, ignoreCase);
|
||||
}
|
||||
|
||||
public CharArraySet getWords() {
|
||||
return words;
|
||||
}
|
||||
|
||||
public boolean isIgnoreCase() {
|
||||
return ignoreCase;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -23,6 +23,7 @@ import org.apache.solr.util.plugin.ResourceLoaderAware;
|
|||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.StopAnalyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
|
@ -42,29 +43,26 @@ public class StopFilterFactory extends BaseTokenFilterFactory implements Resourc
|
|||
enablePositionIncrements = getBoolean("enablePositionIncrements",false);
|
||||
|
||||
if (stopWordFiles != null) {
|
||||
if (stopWords == null)
|
||||
stopWords = new HashSet<String>();
|
||||
try {
|
||||
java.io.File keepWordsFile = new File(stopWordFiles);
|
||||
if (keepWordsFile.exists()) {
|
||||
List<String> wlist = loader.getLines(stopWordFiles);
|
||||
stopWords = StopFilter.makeStopSet((String[])wlist.toArray(new String[0]), ignoreCase);
|
||||
} else {
|
||||
List<String> files = StrUtils.splitFileNames(stopWordFiles);
|
||||
List<String> files = StrUtils.splitFileNames(stopWordFiles);
|
||||
if (stopWords == null && files.size() > 0){
|
||||
//default stopwords list has 35 or so words, but maybe don't make it that big to start
|
||||
stopWords = new CharArraySet(files.size() * 10, ignoreCase);
|
||||
}
|
||||
for (String file : files) {
|
||||
List<String> wlist = loader.getLines(file.trim());
|
||||
//TODO: once StopFilter.makeStopSet(List) method is available, switch to using that so we can avoid a toArray() call
|
||||
stopWords.addAll(StopFilter.makeStopSet((String[])wlist.toArray(new String[0]), ignoreCase));
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
} else {
|
||||
stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase);
|
||||
stopWords = (CharArraySet) StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase);
|
||||
}
|
||||
}
|
||||
|
||||
private Set stopWords;
|
||||
//Force the use of a char array set, as it is the most performant, although this may break things if Lucene ever goes away from it. See SOLR-1095
|
||||
private CharArraySet stopWords;
|
||||
private boolean ignoreCase;
|
||||
private boolean enablePositionIncrements;
|
||||
|
||||
|
|
|
@ -0,0 +1,65 @@
|
|||
package org.apache.solr.analysis;
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.solr.util.AbstractSolrTestCase;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
|
||||
import java.util.Set;
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
*
|
||||
**/
|
||||
public class TestKeepFilterFactory extends AbstractSolrTestCase{
|
||||
public String getSchemaFile() {
|
||||
return "schema-stop-keep.xml";
|
||||
}
|
||||
|
||||
public String getSolrConfigFile() {
|
||||
return "solrconfig.xml";
|
||||
}
|
||||
|
||||
public void testInform() throws Exception {
|
||||
ResourceLoader loader = solrConfig.getResourceLoader();
|
||||
assertTrue("loader is null and it shouldn't be", loader != null);
|
||||
KeepWordFilterFactory factory = new KeepWordFilterFactory();
|
||||
Map<String, String> args = new HashMap<String, String>();
|
||||
args.put("words", "keep-1.txt");
|
||||
args.put("ignoreCase", "true");
|
||||
factory.init(args);
|
||||
factory.inform(loader);
|
||||
Set words = factory.getWords();
|
||||
assertTrue("words is null and it shouldn't be", words != null);
|
||||
assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
|
||||
|
||||
|
||||
factory = new KeepWordFilterFactory();
|
||||
args.put("words", "keep-1.txt, keep-2.txt");
|
||||
factory.init(args);
|
||||
factory.inform(loader);
|
||||
words = factory.getWords();
|
||||
assertTrue("words is null and it shouldn't be", words != null);
|
||||
assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
|
@ -0,0 +1,66 @@
|
|||
package org.apache.solr.analysis;
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
import org.apache.solr.util.AbstractSolrTestCase;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
|
||||
import java.util.Set;
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
*
|
||||
**/
|
||||
public class TestStopFilterFactory extends AbstractSolrTestCase{
|
||||
public String getSchemaFile() {
|
||||
return "schema-stop-keep.xml";
|
||||
}
|
||||
|
||||
public String getSolrConfigFile() {
|
||||
return "solrconfig.xml";
|
||||
}
|
||||
|
||||
public void testInform() throws Exception {
|
||||
ResourceLoader loader = solrConfig.getResourceLoader();
|
||||
assertTrue("loader is null and it shouldn't be", loader != null);
|
||||
StopFilterFactory factory = new StopFilterFactory();
|
||||
Map<String, String> args = new HashMap<String, String>();
|
||||
args.put("words", "stop-1.txt");
|
||||
args.put("ignoreCase", "true");
|
||||
factory.init(args);
|
||||
factory.inform(loader);
|
||||
Set words = factory.getStopWords();
|
||||
assertTrue("words is null and it shouldn't be", words != null);
|
||||
assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
|
||||
assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);
|
||||
|
||||
factory = new StopFilterFactory();
|
||||
args.put("words", "stop-1.txt, stop-2.txt");
|
||||
factory.init(args);
|
||||
factory.inform(loader);
|
||||
words = factory.getStopWords();
|
||||
assertTrue("words is null and it shouldn't be", words != null);
|
||||
assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
|
||||
assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);
|
||||
|
||||
|
||||
}
|
||||
}
|
|
@ -67,6 +67,10 @@ public class TestUtils extends TestCase {
|
|||
assertEquals(2,arr.size());
|
||||
assertEquals("/h/s",arr.get(0));
|
||||
assertEquals("/h/,s",arr.get(1));
|
||||
|
||||
arr = StrUtils.splitFileNames("/h/s");
|
||||
assertEquals(1,arr.size());
|
||||
assertEquals("/h/s",arr.get(0));
|
||||
}
|
||||
|
||||
public void testNamedLists()
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
foo
|
||||
bar
|
|
@ -0,0 +1,17 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
junk
|
||||
more
|
|
@ -0,0 +1,67 @@
|
|||
<?xml version="1.0" ?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<!--
|
||||
|
||||
For testing stopword configuration and keep word configuration
|
||||
|
||||
$Id: schema.xml 382610 2006-03-03 01:43:03Z yonik $
|
||||
$Source: /cvs/main/searching/solr-configs/test/WEB-INF/classes/schema.xml,v $
|
||||
$Name: $
|
||||
-->
|
||||
|
||||
<schema name="test" version="1.0">
|
||||
<types>
|
||||
|
||||
|
||||
<fieldtype name="integer" class="solr.IntField" />
|
||||
|
||||
<fieldtype name="string" class="solr.StrField" sortMissingLast="true"/>
|
||||
<fieldtype name="stop-one" class="solr.TextField">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true"
|
||||
words="stop-1.txt"/>
|
||||
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
<fieldtype name="stop-two" class="solr.TextField">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true"
|
||||
words="stop-1.txt,stop-2.txt"/>
|
||||
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
</types>
|
||||
|
||||
|
||||
<fields>
|
||||
<field name="id" type="integer" indexed="true" stored="true" multiValued="false" required="false"/>
|
||||
<field name="one" type="stop-one" indexed="true" stored="false"/>
|
||||
<field name="two" type="stop-two" indexed="true" stored="false"/>
|
||||
|
||||
</fields>
|
||||
|
||||
<defaultSearchField>one</defaultSearchField>
|
||||
<uniqueKey>id</uniqueKey>
|
||||
|
||||
|
||||
</schema>
|
|
@ -0,0 +1,17 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
foo
|
||||
bar
|
|
@ -0,0 +1,17 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
junk
|
||||
more
|
Loading…
Reference in New Issue