Adding Analysis.getWordList method, which retrieves a list of words either directly from a setting or from a file whose path is given by the corresponding `_path` setting.
parent c95544141b
commit f319625cb5
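Usage sketch (not part of the diff below): the new helper resolves a word list from either of two keys, checking the `_path` key first, so the file form wins if both are present. A minimal, self-contained example, assuming the ImmutableSettings builder API used by the tests in this commit; the setting keys and the file path are illustrative only:

    import org.elasticsearch.common.settings.Settings;
    import org.elasticsearch.index.analysis.Analysis;

    import java.util.Set;

    import static org.elasticsearch.common.settings.ImmutableSettings.*;

    public class WordListExample {
        public static void main(String[] args) {
            // Inline form: the words live directly under the setting key.
            Settings inline = settingsBuilder()
                    .loadFromSource("index : \n word_list : [donau, dampf, schiff]")
                    .build();
            Set<String> words = Analysis.getWordList(inline, "index.word_list");
            System.out.println(words); // donau, dampf, schiff

            // File form: the same key plus "_path" names a plain-text file,
            // one word per line, loaded via Lucene's WordlistLoader.
            Settings fromFile = settingsBuilder()
                    .loadFromSource("index : \n word_list_path : /tmp/words.txt")
                    .build();
            Set<String> fileWords = Analysis.getWordList(fromFile, "index.word_list");

            // If neither key is set, or the file does not exist, the helper
            // throws ElasticSearchIllegalArgumentException.
        }
    }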
org/elasticsearch/index/analysis/Analysis.java

@@ -19,11 +19,17 @@
 
 package org.elasticsearch.index.analysis;
 
+import org.apache.lucene.analysis.WordlistLoader;
+import org.elasticsearch.ElasticSearchIllegalArgumentException;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.collect.ImmutableSet;
 import org.elasticsearch.common.collect.Iterators;
 import org.elasticsearch.common.settings.Settings;
 
+import java.io.File;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
 import java.util.Set;
 
 /**
@@ -52,4 +58,37 @@ public class Analysis {
         return defaultStopWords;
     }
-}
+
+    /**
+     * Fetches a list of words from the specified settings file. The list should either be available at the key
+     * specified by settingsPrefix or in a file specified by settingsPrefix + _path.
+     *
+     * @throws ElasticSearchIllegalArgumentException If the word list cannot be found at either key.
+     */
+    public static Set<String> getWordList(Settings settings, String settingPrefix) {
+        String wordListPath = settings.get(settingPrefix + "_path", null);
+
+        if (wordListPath == null) {
+            String[] explicitWordList = settings.getAsArray(settingPrefix, null);
+            if (explicitWordList == null) {
+                String message = String.format("%s or %s_path must be provided.", settingPrefix, settingPrefix);
+                throw new ElasticSearchIllegalArgumentException(message);
+            } else {
+
+                return new HashSet<String>(Arrays.asList(explicitWordList));
+            }
+        }
+
+        File wordListFile = new File(wordListPath);
+        if (!wordListFile.exists()) {
+            throw new ElasticSearchIllegalArgumentException(settingPrefix + "_path file must exist.");
+        }
+
+        try {
+            return WordlistLoader.getWordSet(wordListFile);
+        } catch (IOException ioe) {
+            String message = String.format("IOException while reading %s_path: %s", settingPrefix, ioe.getMessage());
+            throw new ElasticSearchIllegalArgumentException(message);
+        }
+    }
+}
org/elasticsearch/index/analysis/compound/AbstractCompoundWordTokenFilterFactory.java

@@ -19,18 +19,15 @@
 
 package org.elasticsearch.index.analysis.compound;
 
-import org.apache.lucene.analysis.WordlistLoader;
 import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
-import org.elasticsearch.ElasticSearchIllegalArgumentException;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.inject.assistedinject.Assisted;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.Analysis;
 import org.elasticsearch.index.settings.IndexSettings;
 
-import java.io.File;
-import java.io.IOException;
 import java.util.Set;
 
 /**
@@ -54,21 +51,6 @@ public abstract class AbstractCompoundWordTokenFilterFactory extends AbstractTokenFilterFactory {
         minSubwordSize = settings.getAsInt("min_subword_size", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
         maxSubwordSize = settings.getAsInt("max_subword_size", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
         onlyLongestMatch = settings.getAsBoolean("only_longest_max", false);
-
-        String wordListPath = settings.get("word_list_path", null);
-        if (wordListPath == null) {
-            throw new ElasticSearchIllegalArgumentException("word_list_path is a required setting.");
-        }
-
-        File wordListFile = new File(wordListPath);
-        if (!wordListFile.exists()) {
-            throw new ElasticSearchIllegalArgumentException("word_list_path file must exist.");
-        }
-
-        try {
-            wordList = WordlistLoader.getWordSet(wordListFile);
-        } catch (IOException ioe) {
-            throw new ElasticSearchIllegalArgumentException("IOException while reading word_list_path: " + ioe.getMessage());
-        }
+        wordList = Analysis.getWordList(settings, "word_list");
     }
 }
org/elasticsearch/index/analysis/AnalysisModuleTests.java

@@ -31,8 +31,14 @@ import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
 import org.elasticsearch.index.analysis.filter1.MyFilterTokenFilterFactory;
 import org.elasticsearch.index.analysis.phonetic.PhoneticTokenFilterFactory;
 import org.elasticsearch.index.settings.IndexSettingsModule;
 import org.hamcrest.MatcherAssert;
 import org.testng.annotations.Test;
 
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileWriter;
+import java.util.Set;
+
 import static org.elasticsearch.common.settings.ImmutableSettings.*;
+import static org.hamcrest.MatcherAssert.*;
 import static org.hamcrest.Matchers.*;
@@ -107,11 +113,46 @@ public class AnalysisModuleTests {
         assertThat(czechstemmeranalyzer.tokenFilters()[3], instanceOf(CzechStemTokenFilterFactory.class));
 
         // check dictionary decompounder
-        analyzer = analysisService.analyzer("custom5").analyzer();
+        analyzer = analysisService.analyzer("decompoundingAnalyzer").analyzer();
         assertThat(analyzer, instanceOf(CustomAnalyzer.class));
         CustomAnalyzer dictionaryDecompounderAnalyze = (CustomAnalyzer) analyzer;
         assertThat(dictionaryDecompounderAnalyze.tokenizerFactory(), instanceOf(StandardTokenizerFactory.class));
         assertThat(dictionaryDecompounderAnalyze.tokenFilters().length, equalTo(1));
         assertThat(dictionaryDecompounderAnalyze.tokenFilters()[0], instanceOf(DictionaryCompoundWordTokenFilterFactory.class));
+
+        Set<String> wordList = Analysis.getWordList(settings, "index.analysis.filter.dict_dec.word_list");
+        MatcherAssert.assertThat(wordList.size(), equalTo(6));
+        MatcherAssert.assertThat(wordList, hasItems("donau", "dampf", "schiff", "spargel", "creme", "suppe"));
     }
+
+    @Test public void testWordListPath() throws Exception {
+        String[] words = new String[] {"donau", "dampf", "schiff", "spargel", "creme", "suppe"};
+
+        File wordListFile = generateWordList(words);
+        Settings settings = settingsBuilder().loadFromSource("index: \n word_list_path: " + wordListFile.getAbsolutePath()).build();
+
+        Set<String> wordList = Analysis.getWordList(settings, "index.word_list");
+        MatcherAssert.assertThat(wordList.size(), equalTo(6));
+        MatcherAssert.assertThat(wordList, hasItems(words));
+    }
+
+    private File generateWordList(String[] words) throws Exception {
+        File wordListFile = File.createTempFile("wordlist", ".txt");
+        wordListFile.deleteOnExit();
+
+        BufferedWriter writer = null;
+        try {
+            writer = new BufferedWriter(new FileWriter(wordListFile));
+            for (String word : words) {
+                writer.write(word);
+                writer.write('\n');
+            }
+        } finally {
+            if (writer != null) {
+                writer.close();
+            }
+        }
+        return wordListFile;
+    }
+
 }
org/elasticsearch/index/analysis/CompoundAnalysisTests.java

@@ -36,14 +36,11 @@ import org.hamcrest.MatcherAssert;
 import org.testng.annotations.Test;
 
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileWriter;
 import java.io.IOException;
 import java.util.List;
 import java.util.ArrayList;
 
 import static org.elasticsearch.common.settings.ImmutableSettings.*;
 import static org.elasticsearch.common.settings.ImmutableSettings.Builder.*;
 import static org.hamcrest.Matchers.*;
 
 /**
@@ -51,43 +48,9 @@
  */
 public class CompoundAnalysisTests {
 
-    private File generateWordList() throws Exception {
-        File wordListFile = File.createTempFile("wordlist", ".txt");
-        wordListFile.deleteOnExit();
-
-        BufferedWriter writer = null;
-        try {
-            writer = new BufferedWriter(new FileWriter(wordListFile));
-            writer.write("donau\ndampf\nschiff\n");
-            writer.write("spargel\ncreme\nsuppe");
-        } finally {
-            if (writer != null) {
-                writer.close();
-            }
-        }
-        return wordListFile;
-    }
-
-    private Settings generateSettings(File wordListFile) throws Exception {
-        StringBuilder settingsStr = new StringBuilder();
-
-        settingsStr.append("index : \n");
-        settingsStr.append("  analysis :\n");
-        settingsStr.append("    analyzer :\n");
-        settingsStr.append("      myAnalyzer2 :\n");
-        settingsStr.append("        tokenizer : standard\n");
-        settingsStr.append("        filter : [dict_dec, standard, lowercase, stop]\n");
-        settingsStr.append("    filter :\n");
-        settingsStr.append("      dict_dec :\n");
-        settingsStr.append("        type : dictionary_decompounder\n");
-        settingsStr.append("        word_list_path : ").append(wordListFile.getAbsolutePath()).append('\n');
-
-        return settingsBuilder().loadFromSource(settingsStr.toString()).build();
-    }
-
     @Test public void testDefaultsCompoundAnalysis() throws Exception {
         Index index = new Index("test");
-        Settings settings = generateSettings(generateWordList());
+        Settings settings = getJsonSettings();
 
         Injector injector = new ModulesBuilder().add(
                 new IndexSettingsModule(settings),
@@ -101,9 +64,16 @@ public class CompoundAnalysisTests {
     }
 
     @Test public void testDictionaryDecompounder() throws Exception {
-        Index index = new Index("test");
-        Settings settings = generateSettings(generateWordList());
+        Settings[] settingsArr = new Settings[] {getJsonSettings(), getYamlSettings()};
+        for (Settings settings : settingsArr) {
+            List<String> terms = analyze(settings, "decompoundingAnalyzer", "donaudampfschiff spargelcremesuppe");
+            MatcherAssert.assertThat(terms.size(), equalTo(8));
+            MatcherAssert.assertThat(terms, hasItems("donau", "dampf", "schiff", "donaudampfschiff", "spargel", "creme", "suppe", "spargelcremesuppe"));
+        }
+    }
+
+    private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
+        Index index = new Index("test");
         Injector injector = new ModulesBuilder().add(
                 new IndexSettingsModule(settings),
                 new IndexNameModule(index),
@@ -111,11 +81,10 @@ public class CompoundAnalysisTests {
 
         AnalysisService analysisService = injector.getInstance(AnalysisService.class);
 
-        Analyzer analyzer = analysisService.analyzer("myAnalyzer2").analyzer();
+        Analyzer analyzer = analysisService.analyzer(analyzerName).analyzer();
 
         AllEntries allEntries = new AllEntries();
-        allEntries.addText("field1", "donaudampfschiff", 1.0f);
-        allEntries.addText("field2", "spargelcremesuppe", 1.0f);
+        allEntries.addText("field1", text, 1.0f);
         allEntries.reset();
 
         TokenStream stream = AllTokenStream.allTokenStream("_all", allEntries, analyzer);
@@ -126,7 +95,14 @@ public class CompoundAnalysisTests {
             String tokText = termAtt.term();
             terms.add(tokText);
         }
-        MatcherAssert.assertThat(terms.size(), equalTo(8));
-        MatcherAssert.assertThat(terms, hasItems("donau", "dampf", "schiff", "donaudampfschiff", "spargel", "creme", "suppe", "spargelcremesuppe"));
+        return terms;
     }
+
+    private Settings getJsonSettings() {
+        return settingsBuilder().loadFromClasspath("org/elasticsearch/index/analysis/test1.json").build();
+    }
+
+    private Settings getYamlSettings() {
+        return settingsBuilder().loadFromClasspath("org/elasticsearch/index/analysis/test1.yml").build();
+    }
 }
org/elasticsearch/index/analysis/test1.json

@@ -31,7 +31,7 @@
             },
             "dict_dec" : {
                 "type" : "dictionary_decompounder",
-                "word_list_path" : "/dev/null"
+                "word_list" : ["donau", "dampf", "schiff", "spargel", "creme", "suppe"]
             }
         },
         "analyzer" : {
@@ -61,7 +61,7 @@
             "tokenizer" : "standard",
             "filter" : ["standard", "lowercase", "stop", "czech_stem"]
         },
-        "custom5" : {
+        "decompoundingAnalyzer" : {
             "tokenizer" : "standard",
             "filter" : ["dict_dec"]
         }
org/elasticsearch/index/analysis/test1.yml

@@ -22,7 +22,7 @@ index :
                 type : org.elasticsearch.index.analysis.filter1.MyFilterTokenFilterFactory
             dict_dec :
                 type : dictionary_decompounder
-                word_list_path : /dev/null
+                word_list : [donau, dampf, schiff, spargel, creme, suppe]
         analyzer :
             standard :
                 alias: alias1,alias2
@@ -44,6 +44,6 @@ index :
             czechAnalyzerWithStemmer :
                 tokenizer : standard
                 filter : [standard, lowercase, stop, czech_stem]
-            custom5 :
+            decompoundingAnalyzer :
                 tokenizer : standard
                 filter : [dict_dec]
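Net effect for configuration (a sketch, not part of the commit): a dictionary_decompounder filter can now take its word list inline, as the updated fixtures above do, while the previous file-based form keeps working because the factory delegates to Analysis.getWordList. The filter name dict_dec_file and the path below are illustrative placeholders; the words mirror the test fixtures.

    index :
        analysis :
            filter :
                dict_dec :
                    type : dictionary_decompounder
                    word_list : [donau, dampf, schiff, spargel, creme, suppe]
                dict_dec_file :
                    type : dictionary_decompounder
                    word_list_path : /path/to/wordlist.txt   # plain text, one word per line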