SOLR-663 -- Allow multiple files for stopwords, keepwords, protwords and synonyms

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@680935 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Shalin Shekhar Mangar 2008-07-30 08:11:56 +00:00
parent a18aea63f1
commit 4ac6ce3190
7 changed files with 96 additions and 17 deletions

View File

@ -329,6 +329,8 @@ New Features
64. SOLR-666: Expose warmup time in statistics for SolrIndexSearcher and LRUCache (shalin)
65. SOLR-663: Allow multiple files for stopwords, keepwords, protwords and synonyms (shalin)
Changes in runtime behavior
1. SOLR-559: use Lucene updateDocument, deleteDocuments methods. This

View File

@ -22,9 +22,11 @@ import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import java.io.IOException;
import java.io.File;
import java.util.List;
/**
@ -34,12 +36,24 @@ public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implement
public static final String PROTECTED_TOKENS = "protected";
public void inform(ResourceLoader loader) {
String wordFile = args.get(PROTECTED_TOKENS);
if (wordFile != null) {
String wordFiles = args.get(PROTECTED_TOKENS);
if (wordFiles != null) {
try {
List<String> wlist = loader.getLines(wordFile);
//This cast is safe in Lucene
protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
File protectedWordFiles = new File(wordFiles);
if (protectedWordFiles.exists()) {
List<String> wlist = loader.getLines(wordFiles);
//This cast is safe in Lucene
protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
} else {
List<String> files = StrUtils.splitFileNames(wordFiles);
for (String file : files) {
List<String> wlist = loader.getLines(file.trim());
if (protectedWords == null)
protectedWords = new CharArraySet(wlist, false);
else
protectedWords.addAll(wlist);
}
}
} catch (IOException e) {
throw new RuntimeException(e);
}

View File

@ -18,14 +18,16 @@
package org.apache.solr.analysis;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.core.SolrConfig;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import java.util.Map;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.io.File;
import java.io.File;
import java.io.IOException;
/**
@ -39,14 +41,25 @@ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements Res
@SuppressWarnings("unchecked")
public void inform(ResourceLoader loader) {
String wordFile = args.get("words");
String wordFiles = args.get("words");
ignoreCase = getBoolean("ignoreCase",false);
if (wordFile != null) {
if (wordFiles != null) {
if (words == null)
words = new HashSet<String>();
try {
List<String> wlist = loader.getLines(wordFile);
words = StopFilter.makeStopSet(
(String[])wlist.toArray(new String[0]), ignoreCase);
java.io.File keepWordsFile = new File(wordFiles);
if (keepWordsFile.exists()) {
List<String> wlist = loader.getLines(wordFiles);
words = StopFilter.makeStopSet(
(String[])wlist.toArray(new String[0]), ignoreCase);
} else {
List<String> files = StrUtils.splitFileNames(wordFiles);
for (String file : files) {
List<String> wlist = loader.getLines(file.trim());
words.addAll(StopFilter.makeStopSet((String[])wlist.toArray(new String[0]), ignoreCase));
}
}
}
catch (IOException e) {
throw new RuntimeException(e);

View File

@ -18,13 +18,17 @@
package org.apache.solr.analysis;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import java.util.HashSet;
import java.util.List;
import java.io.File;
import java.util.Set;
import java.io.File;
import java.io.IOException;
/**
@ -33,14 +37,25 @@ import java.io.IOException;
public class StopFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
public void inform(ResourceLoader loader) {
String stopWordFile = args.get("words");
String stopWordFiles = args.get("words");
ignoreCase = getBoolean("ignoreCase",false);
enablePositionIncrements = getBoolean("enablePositionIncrements",false);
if (stopWordFile != null) {
if (stopWordFiles != null) {
if (stopWords == null)
stopWords = new HashSet<String>();
try {
List<String> wlist = loader.getLines(stopWordFile);
stopWords = StopFilter.makeStopSet((String[])wlist.toArray(new String[0]), ignoreCase);
java.io.File keepWordsFile = new File(stopWordFiles);
if (keepWordsFile.exists()) {
List<String> wlist = loader.getLines(stopWordFiles);
stopWords = StopFilter.makeStopSet((String[])wlist.toArray(new String[0]), ignoreCase);
} else {
List<String> files = StrUtils.splitFileNames(stopWordFiles);
for (String file : files) {
List<String> wlist = loader.getLines(file.trim());
stopWords.addAll(StopFilter.makeStopSet((String[])wlist.toArray(new String[0]), ignoreCase));
}
}
} catch (IOException e) {
throw new RuntimeException(e);
}

View File

@ -24,9 +24,11 @@ import org.apache.solr.common.util.StrUtils;
import org.apache.solr.core.SolrCore;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
@ -51,7 +53,15 @@ public class SynonymFilterFactory extends BaseTokenFilterFactory implements Reso
if (synonyms != null) {
List<String> wlist=null;
try {
wlist = loader.getLines(synonyms);
File synonymFile = new java.io.File(synonyms);
if (synonymFile.exists()) {
wlist = loader.getLines(synonyms);
} else {
List<String> files = StrUtils.splitFileNames(synonyms);
for (String file : files) {
wlist = loader.getLines(file.trim());
}
}
} catch (IOException e) {
throw new RuntimeException(e);
}

View File

@ -19,6 +19,7 @@ package org.apache.solr.common.util;
import java.util.List;
import java.util.ArrayList;
import java.util.Collections;
import java.io.IOException;
/**
@ -118,6 +119,25 @@ public class StrUtils {
return lst;
}
/**
 * Splits a comma separated list of file names into individual names.
 * A comma that is part of a file name can be escaped with a backslash,
 * e.g. {@code "a\,b.txt,c.txt"} yields {@code "a,b.txt"} and {@code "c.txt"}.
 *
 * @param fileNames the comma separated file names, may be null
 * @return a list of the file names with the escaping backslashes removed;
 *         an empty list if {@code fileNames} is null
 */
public static List<String> splitFileNames(String fileNames) {
  if (fileNames == null) {
    return Collections.<String>emptyList();
  }
  // Split on commas that are NOT preceded by an escaping backslash.
  String[] pieces = fileNames.split("(?<!\\\\),");
  List<String> names = new ArrayList<String>(pieces.length);
  for (String piece : pieces) {
    // Strip the escaping backslash but keep the comma it protected.
    names.add(piece.replaceAll("\\\\(?=,)", ""));
  }
  return names;
}
/** Creates a backslash escaped string, joining all the items. */
public static String join(List<String> items, char separator) {
StringBuilder sb = new StringBuilder(items.size() << 3);

View File

@ -62,6 +62,11 @@ public class TestUtils extends TestCase {
assertEquals(2,arr.size());
assertEquals(" foo ",arr.get(0));
assertEquals(" bar ",arr.get(1));
arr = StrUtils.splitFileNames("/h/s,/h/\\,s,");
assertEquals(2,arr.size());
assertEquals("/h/s",arr.get(0));
assertEquals("/h/,s",arr.get(1));
}
public void testNamedLists()