Add Analysis.getWordList, a method that retrieves a list of words either directly from a setting or from a file whose path is specified by a setting.

This commit is contained in:
Edward Dale 2011-01-31 06:11:43 +01:00 committed by kimchy
parent c95544141b
commit f319625cb5
6 changed files with 109 additions and 71 deletions

View File

@ -19,11 +19,17 @@
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.WordlistLoader;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.collect.ImmutableSet;
import org.elasticsearch.common.collect.Iterators;
import org.elasticsearch.common.settings.Settings;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
/**
@ -52,4 +58,37 @@ public class Analysis {
return defaultStopWords;
}
}
/**
 * Fetches a list of words from the specified settings. The list should either be available
 * directly at the key {@code settingPrefix}, or in a file whose path is given by the key
 * {@code settingPrefix + "_path"}. The file-based setting takes precedence when both are set.
 *
 * @param settings      the settings to read the word list (or word-list file path) from
 * @param settingPrefix the settings key holding the explicit word list; {@code settingPrefix + "_path"}
 *                      may instead point to a file containing one word per line
 * @return the set of words
 * @throws ElasticSearchIllegalArgumentException if neither key is present, if the referenced
 *                                               file does not exist, or if it cannot be read
 */
public static Set<String> getWordList(Settings settings, String settingPrefix) {
    String wordListPath = settings.get(settingPrefix + "_path", null);

    if (wordListPath == null) {
        // No file configured: fall back to a word list given inline in the settings.
        String[] explicitWordList = settings.getAsArray(settingPrefix, null);
        if (explicitWordList == null) {
            String message = String.format("%s or %s_path must be provided.", settingPrefix, settingPrefix);
            throw new ElasticSearchIllegalArgumentException(message);
        }
        return new HashSet<String>(Arrays.asList(explicitWordList));
    }

    File wordListFile = new File(wordListPath);
    if (!wordListFile.exists()) {
        throw new ElasticSearchIllegalArgumentException(settingPrefix + "_path file must exist.");
    }

    try {
        return WordlistLoader.getWordSet(wordListFile);
    } catch (IOException ioe) {
        String message = String.format("IOException while reading %s_path: %s", settingPrefix, ioe.getMessage());
        throw new ElasticSearchIllegalArgumentException(message);
    }
}
}

View File

@ -19,18 +19,15 @@
package org.elasticsearch.index.analysis.compound;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.Analysis;
import org.elasticsearch.index.settings.IndexSettings;
import java.io.File;
import java.io.IOException;
import java.util.Set;
/**
@ -54,21 +51,6 @@ public abstract class AbstractCompoundWordTokenFilterFactory extends AbstractTok
minSubwordSize = settings.getAsInt("min_subword_size", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
maxSubwordSize = settings.getAsInt("max_subword_size", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
onlyLongestMatch = settings.getAsBoolean("only_longest_max", false);
String wordListPath = settings.get("word_list_path", null);
if (wordListPath == null) {
throw new ElasticSearchIllegalArgumentException("word_list_path is a required setting.");
}
File wordListFile = new File(wordListPath);
if (!wordListFile.exists()) {
throw new ElasticSearchIllegalArgumentException("word_list_path file must exist.");
}
try {
wordList = WordlistLoader.getWordSet(wordListFile);
} catch (IOException ioe) {
throw new ElasticSearchIllegalArgumentException("IOException while reading word_list_path: " + ioe.getMessage());
}
wordList = Analysis.getWordList(settings, "word_list");
}
}

View File

@ -31,8 +31,14 @@ import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilt
import org.elasticsearch.index.analysis.filter1.MyFilterTokenFilterFactory;
import org.elasticsearch.index.analysis.phonetic.PhoneticTokenFilterFactory;
import org.elasticsearch.index.settings.IndexSettingsModule;
import org.hamcrest.MatcherAssert;
import org.testng.annotations.Test;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.util.Set;
import static org.elasticsearch.common.settings.ImmutableSettings.*;
import static org.hamcrest.MatcherAssert.*;
import static org.hamcrest.Matchers.*;
@ -107,11 +113,46 @@ public class AnalysisModuleTests {
assertThat(czechstemmeranalyzer.tokenFilters()[3], instanceOf(CzechStemTokenFilterFactory.class));
// check dictionary decompounder
analyzer = analysisService.analyzer("custom5").analyzer();
analyzer = analysisService.analyzer("decompoundingAnalyzer").analyzer();
assertThat(analyzer, instanceOf(CustomAnalyzer.class));
CustomAnalyzer dictionaryDecompounderAnalyze = (CustomAnalyzer) analyzer;
assertThat(dictionaryDecompounderAnalyze.tokenizerFactory(), instanceOf(StandardTokenizerFactory.class));
assertThat(dictionaryDecompounderAnalyze.tokenFilters().length, equalTo(1));
assertThat(dictionaryDecompounderAnalyze.tokenFilters()[0], instanceOf(DictionaryCompoundWordTokenFilterFactory.class));
Set<String> wordList = Analysis.getWordList(settings, "index.analysis.filter.dict_dec.word_list");
MatcherAssert.assertThat(wordList.size(), equalTo(6));
MatcherAssert.assertThat(wordList, hasItems("donau", "dampf", "schiff", "spargel", "creme", "suppe"));
}
/**
 * Verifies that getWordList reads words from a file referenced by a
 * {@code *_path} setting. The expected count is derived from the fixture
 * array rather than hard-coded, so editing the word list cannot silently
 * desynchronize the assertion.
 */
@Test public void testWordListPath() throws Exception {
    String[] words = new String[] {"donau", "dampf", "schiff", "spargel", "creme", "suppe"};
    File wordListFile = generateWordList(words);
    Settings settings = settingsBuilder().loadFromSource("index: \n word_list_path: " + wordListFile.getAbsolutePath()).build();

    Set<String> wordList = Analysis.getWordList(settings, "index.word_list");

    // was: equalTo(6) — tie the expectation to the fixture instead of a magic number
    MatcherAssert.assertThat(wordList.size(), equalTo(words.length));
    MatcherAssert.assertThat(wordList, hasItems(words));
}
/**
 * Writes the given words, one per line, into a temporary file that is
 * scheduled for deletion when the JVM exits.
 *
 * @param words the words to write
 * @return the temporary word-list file
 * @throws Exception if the file cannot be created or written
 */
private File generateWordList(String[] words) throws Exception {
    File file = File.createTempFile("wordlist", ".txt");
    file.deleteOnExit();

    // Assemble the contents first, then write once.
    StringBuilder contents = new StringBuilder();
    for (int i = 0; i < words.length; i++) {
        contents.append(words[i]).append('\n');
    }

    BufferedWriter out = new BufferedWriter(new FileWriter(file));
    try {
        out.write(contents.toString());
    } finally {
        out.close();
    }
    return file;
}
}

View File

@ -36,14 +36,11 @@ import org.hamcrest.MatcherAssert;
import org.testng.annotations.Test;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
import static org.elasticsearch.common.settings.ImmutableSettings.*;
import static org.elasticsearch.common.settings.ImmutableSettings.Builder.*;
import static org.hamcrest.Matchers.*;
/**
@ -51,43 +48,9 @@ import static org.hamcrest.Matchers.*;
*/
public class CompoundAnalysisTests {
/**
 * Creates a temporary file (deleted on JVM exit) containing the six
 * test words used by the decompounder tests, one word per line.
 *
 * @return the temporary word-list file
 * @throws Exception if the file cannot be created or written
 */
private File generateWordList() throws Exception {
    File file = File.createTempFile("wordlist", ".txt");
    file.deleteOnExit();

    BufferedWriter out = new BufferedWriter(new FileWriter(file));
    try {
        out.write("donau\ndampf\nschiff\n");
        out.write("spargel\ncreme\nsuppe");
    } finally {
        // Writer is only non-null here because construction succeeded above,
        // so no null guard is needed before closing.
        out.close();
    }
    return file;
}
/**
 * Builds index settings (YAML source) declaring a custom analyzer
 * ("myAnalyzer2") that uses a dictionary_decompounder token filter whose
 * word list is loaded from the given file.
 *
 * @param wordListFile file containing the decompounder word list, one word per line
 * @return the parsed settings
 * @throws Exception if the settings source cannot be parsed
 */
private Settings generateSettings(File wordListFile) throws Exception {
    // NOTE: the whitespace inside each string literal is significant — it is
    // the YAML indentation that defines the settings hierarchy.
    StringBuilder settingsStr = new StringBuilder();
    settingsStr.append("index : \n");
    settingsStr.append("    analysis :\n");
    settingsStr.append("        analyzer :\n");
    settingsStr.append("            myAnalyzer2 :\n");
    settingsStr.append("                tokenizer : standard\n");
    settingsStr.append("                filter : [dict_dec, standard, lowercase, stop]\n");
    settingsStr.append("        filter :\n");
    settingsStr.append("            dict_dec :\n");
    settingsStr.append("                type : dictionary_decompounder\n");
    settingsStr.append("                word_list_path : ").append(wordListFile.getAbsolutePath()).append('\n');
    return settingsBuilder().loadFromSource(settingsStr.toString()).build();
}
@Test public void testDefaultsCompoundAnalysis() throws Exception {
Index index = new Index("test");
Settings settings = generateSettings(generateWordList());
Settings settings = getJsonSettings();
Injector injector = new ModulesBuilder().add(
new IndexSettingsModule(settings),
@ -101,9 +64,16 @@ public class CompoundAnalysisTests {
}
@Test public void testDictionaryDecompounder() throws Exception {
Index index = new Index("test");
Settings settings = generateSettings(generateWordList());
Settings[] settingsArr = new Settings[] {getJsonSettings(), getYamlSettings()};
for (Settings settings : settingsArr) {
List<String> terms = analyze(settings, "decompoundingAnalyzer", "donaudampfschiff spargelcremesuppe");
MatcherAssert.assertThat(terms.size(), equalTo(8));
MatcherAssert.assertThat(terms, hasItems("donau", "dampf", "schiff", "donaudampfschiff", "spargel", "creme", "suppe", "spargelcremesuppe"));
}
}
private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
Index index = new Index("test");
Injector injector = new ModulesBuilder().add(
new IndexSettingsModule(settings),
new IndexNameModule(index),
@ -111,11 +81,10 @@ public class CompoundAnalysisTests {
AnalysisService analysisService = injector.getInstance(AnalysisService.class);
Analyzer analyzer = analysisService.analyzer("myAnalyzer2").analyzer();
Analyzer analyzer = analysisService.analyzer(analyzerName).analyzer();
AllEntries allEntries = new AllEntries();
allEntries.addText("field1", "donaudampfschiff", 1.0f);
allEntries.addText("field2", "spargelcremesuppe", 1.0f);
allEntries.addText("field1", text, 1.0f);
allEntries.reset();
TokenStream stream = AllTokenStream.allTokenStream("_all", allEntries, analyzer);
@ -126,7 +95,14 @@ public class CompoundAnalysisTests {
String tokText = termAtt.term();
terms.add(tokText);
}
MatcherAssert.assertThat(terms.size(), equalTo(8));
MatcherAssert.assertThat(terms, hasItems("donau", "dampf", "schiff", "donaudampfschiff", "spargel", "creme", "suppe", "spargelcremesuppe"));
return terms;
}
/** Loads the shared analysis test settings from the JSON fixture on the classpath. */
private Settings getJsonSettings() {
    return settingsBuilder().loadFromClasspath("org/elasticsearch/index/analysis/test1.json").build();
}
/** Loads the shared analysis test settings from the YAML fixture on the classpath. */
private Settings getYamlSettings() {
    return settingsBuilder().loadFromClasspath("org/elasticsearch/index/analysis/test1.yml").build();
}
}

View File

@ -31,7 +31,7 @@
},
"dict_dec" : {
"type" : "dictionary_decompounder",
"word_list_path" : "/dev/null"
"word_list" : ["donau", "dampf", "schiff", "spargel", "creme", "suppe"]
}
},
"analyzer" : {
@ -61,7 +61,7 @@
"tokenizer" : "standard",
"filter" : ["standard", "lowercase", "stop", "czech_stem"]
},
"custom5" : {
"decompoundingAnalyzer" : {
"tokenizer" : "standard",
"filter" : ["dict_dec"]
}

View File

@ -22,7 +22,7 @@ index :
type : org.elasticsearch.index.analysis.filter1.MyFilterTokenFilterFactory
dict_dec :
type : dictionary_decompounder
word_list_path : /dev/null
word_list : [donau, dampf, schiff, spargel, creme, suppe]
analyzer :
standard :
alias: alias1,alias2
@ -44,6 +44,6 @@ index :
czechAnalyzerWithStemmer :
tokenizer : standard
filter : [standard, lowercase, stop, czech_stem]
custom5 :
decompoundingAnalyzer :
tokenizer : standard
filter : [dict_dec]