Add Analysis.getWordList, a method that retrieves a list of words either directly from a setting or from a file whose path is specified by a setting.

This commit is contained in:
Edward Dale 2011-01-31 06:11:43 +01:00 committed by kimchy
parent c95544141b
commit f319625cb5
6 changed files with 109 additions and 71 deletions

View File

@ -19,11 +19,17 @@
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.WordlistLoader;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.collect.ImmutableSet;
import org.elasticsearch.common.collect.Iterators;
import org.elasticsearch.common.settings.Settings;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
/**
@ -52,4 +58,37 @@ public class Analysis {
return defaultStopWords;
}
}
/**
 * Fetches a list of words from the specified settings. The list should either be available
 * directly at the key {@code settingPrefix}, or in a file whose path is given by the key
 * {@code settingPrefix + "_path"}. The file-based setting takes precedence when both are set.
 *
 * @param settings      the settings to read the word list (or word-list file path) from
 * @param settingPrefix the settings key holding the explicit word list; {@code settingPrefix + "_path"}
 *                      may instead point to a file containing one word per line
 * @return the set of words
 * @throws ElasticSearchIllegalArgumentException if neither key is present, if the referenced
 *                                               file does not exist, or if it cannot be read
 */
public static Set<String> getWordList(Settings settings, String settingPrefix) {
    String wordListPath = settings.get(settingPrefix + "_path", null);

    if (wordListPath == null) {
        // No file configured: fall back to a word list given inline in the settings.
        String[] explicitWordList = settings.getAsArray(settingPrefix, null);
        if (explicitWordList == null) {
            String message = String.format("%s or %s_path must be provided.", settingPrefix, settingPrefix);
            throw new ElasticSearchIllegalArgumentException(message);
        }
        return new HashSet<String>(Arrays.asList(explicitWordList));
    }

    File wordListFile = new File(wordListPath);
    if (!wordListFile.exists()) {
        throw new ElasticSearchIllegalArgumentException(settingPrefix + "_path file must exist.");
    }

    try {
        return WordlistLoader.getWordSet(wordListFile);
    } catch (IOException ioe) {
        String message = String.format("IOException while reading %s_path: %s", settingPrefix, ioe.getMessage());
        throw new ElasticSearchIllegalArgumentException(message);
    }
}
}

View File

@ -19,18 +19,15 @@
package org.elasticsearch.index.analysis.compound;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.Analysis;
import org.elasticsearch.index.settings.IndexSettings;
import java.io.File;
import java.io.IOException;
import java.util.Set;
/**
@ -54,21 +51,6 @@ public abstract class AbstractCompoundWordTokenFilterFactory extends AbstractTok
minSubwordSize = settings.getAsInt("min_subword_size", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
maxSubwordSize = settings.getAsInt("max_subword_size", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
onlyLongestMatch = settings.getAsBoolean("only_longest_max", false);
String wordListPath = settings.get("word_list_path", null);
if (wordListPath == null) {
throw new ElasticSearchIllegalArgumentException("word_list_path is a required setting.");
}
File wordListFile = new File(wordListPath);
if (!wordListFile.exists()) {
throw new ElasticSearchIllegalArgumentException("word_list_path file must exist.");
}
try {
wordList = WordlistLoader.getWordSet(wordListFile);
} catch (IOException ioe) {
throw new ElasticSearchIllegalArgumentException("IOException while reading word_list_path: " + ioe.getMessage());
}
wordList = Analysis.getWordList(settings, "word_list");
}
}

View File

@ -31,8 +31,14 @@ import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilt
import org.elasticsearch.index.analysis.filter1.MyFilterTokenFilterFactory;
import org.elasticsearch.index.analysis.phonetic.PhoneticTokenFilterFactory;
import org.elasticsearch.index.settings.IndexSettingsModule;
import org.hamcrest.MatcherAssert;
import org.testng.annotations.Test;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.util.Set;
import static org.elasticsearch.common.settings.ImmutableSettings.*;
import static org.hamcrest.MatcherAssert.*;
import static org.hamcrest.Matchers.*;
@ -107,11 +113,46 @@ public class AnalysisModuleTests {
assertThat(czechstemmeranalyzer.tokenFilters()[3], instanceOf(CzechStemTokenFilterFactory.class));
// check dictionary decompounder
analyzer = analysisService.analyzer("custom5").analyzer();
analyzer = analysisService.analyzer("decompoundingAnalyzer").analyzer();
assertThat(analyzer, instanceOf(CustomAnalyzer.class));
CustomAnalyzer dictionaryDecompounderAnalyze = (CustomAnalyzer) analyzer;
assertThat(dictionaryDecompounderAnalyze.tokenizerFactory(), instanceOf(StandardTokenizerFactory.class));
assertThat(dictionaryDecompounderAnalyze.tokenFilters().length, equalTo(1));
assertThat(dictionaryDecompounderAnalyze.tokenFilters()[0], instanceOf(DictionaryCompoundWordTokenFilterFactory.class));
Set<String> wordList = Analysis.getWordList(settings, "index.analysis.filter.dict_dec.word_list");
MatcherAssert.assertThat(wordList.size(), equalTo(6));
MatcherAssert.assertThat(wordList, hasItems("donau", "dampf", "schiff", "spargel", "creme", "suppe"));
}
/**
 * Verifies that getWordList reads words from a file referenced by a
 * {@code *_path} setting. The expected count is derived from the fixture
 * array rather than hard-coded, so editing the word list cannot silently
 * desynchronize the assertion.
 */
@Test public void testWordListPath() throws Exception {
    String[] words = new String[] {"donau", "dampf", "schiff", "spargel", "creme", "suppe"};
    File wordListFile = generateWordList(words);
    Settings settings = settingsBuilder().loadFromSource("index: \n word_list_path: " + wordListFile.getAbsolutePath()).build();

    Set<String> wordList = Analysis.getWordList(settings, "index.word_list");

    // was: equalTo(6) — tie the expectation to the fixture instead of a magic number
    MatcherAssert.assertThat(wordList.size(), equalTo(words.length));
    MatcherAssert.assertThat(wordList, hasItems(words));
}
/**
 * Writes the given words, one per line, into a temporary file that is
 * scheduled for deletion when the JVM exits.
 *
 * @param words the words to write
 * @return the temporary word-list file
 * @throws Exception if the file cannot be created or written
 */
private File generateWordList(String[] words) throws Exception {
    File file = File.createTempFile("wordlist", ".txt");
    file.deleteOnExit();

    // Assemble the contents first, then write once.
    StringBuilder contents = new StringBuilder();
    for (int i = 0; i < words.length; i++) {
        contents.append(words[i]).append('\n');
    }

    BufferedWriter out = new BufferedWriter(new FileWriter(file));
    try {
        out.write(contents.toString());
    } finally {
        out.close();
    }
    return file;
}
}

View File

@ -36,14 +36,11 @@ import org.hamcrest.MatcherAssert;
import org.testng.annotations.Test;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
import static org.elasticsearch.common.settings.ImmutableSettings.*;
import static org.elasticsearch.common.settings.ImmutableSettings.Builder.*;
import static org.hamcrest.Matchers.*;
/**
@ -51,43 +48,9 @@ import static org.hamcrest.Matchers.*;
*/
public class CompoundAnalysisTests {
/**
 * Creates a temporary file (deleted on JVM exit) containing the six
 * test words used by the decompounder tests, one word per line.
 *
 * @return the temporary word-list file
 * @throws Exception if the file cannot be created or written
 */
private File generateWordList() throws Exception {
    File file = File.createTempFile("wordlist", ".txt");
    file.deleteOnExit();

    BufferedWriter out = new BufferedWriter(new FileWriter(file));
    try {
        out.write("donau\ndampf\nschiff\n");
        out.write("spargel\ncreme\nsuppe");
    } finally {
        // Writer is only non-null here because construction succeeded above,
        // so no null guard is needed before closing.
        out.close();
    }
    return file;
}
/**
 * Builds index settings (YAML source) declaring a custom analyzer
 * ("myAnalyzer2") that uses a dictionary_decompounder token filter whose
 * word list is loaded from the given file.
 *
 * @param wordListFile file containing the decompounder word list, one word per line
 * @return the parsed settings
 * @throws Exception if the settings source cannot be parsed
 */
private Settings generateSettings(File wordListFile) throws Exception {
    // NOTE: the whitespace inside each string literal is significant — it is
    // the YAML indentation that defines the settings hierarchy.
    StringBuilder settingsStr = new StringBuilder();
    settingsStr.append("index : \n");
    settingsStr.append("    analysis :\n");
    settingsStr.append("        analyzer :\n");
    settingsStr.append("            myAnalyzer2 :\n");
    settingsStr.append("                tokenizer : standard\n");
    settingsStr.append("                filter : [dict_dec, standard, lowercase, stop]\n");
    settingsStr.append("        filter :\n");
    settingsStr.append("            dict_dec :\n");
    settingsStr.append("                type : dictionary_decompounder\n");
    settingsStr.append("                word_list_path : ").append(wordListFile.getAbsolutePath()).append('\n');
    return settingsBuilder().loadFromSource(settingsStr.toString()).build();
}
@Test public void testDefaultsCompoundAnalysis() throws Exception {
Index index = new Index("test");
Settings settings = generateSettings(generateWordList());
Settings settings = getJsonSettings();
Injector injector = new ModulesBuilder().add(
new IndexSettingsModule(settings),
@ -101,9 +64,16 @@ public class CompoundAnalysisTests {
}
@Test public void testDictionaryDecompounder() throws Exception {
Index index = new Index("test");
Settings settings = generateSettings(generateWordList());
Settings[] settingsArr = new Settings[] {getJsonSettings(), getYamlSettings()};
for (Settings settings : settingsArr) {
List<String> terms = analyze(settings, "decompoundingAnalyzer", "donaudampfschiff spargelcremesuppe");
MatcherAssert.assertThat(terms.size(), equalTo(8));
MatcherAssert.assertThat(terms, hasItems("donau", "dampf", "schiff", "donaudampfschiff", "spargel", "creme", "suppe", "spargelcremesuppe"));
}
}
private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
Index index = new Index("test");
Injector injector = new ModulesBuilder().add(
new IndexSettingsModule(settings),
new IndexNameModule(index),
@ -111,11 +81,10 @@ public class CompoundAnalysisTests {
AnalysisService analysisService = injector.getInstance(AnalysisService.class);
Analyzer analyzer = analysisService.analyzer("myAnalyzer2").analyzer();
Analyzer analyzer = analysisService.analyzer(analyzerName).analyzer();
AllEntries allEntries = new AllEntries();
allEntries.addText("field1", "donaudampfschiff", 1.0f);
allEntries.addText("field2", "spargelcremesuppe", 1.0f);
allEntries.addText("field1", text, 1.0f);
allEntries.reset();
TokenStream stream = AllTokenStream.allTokenStream("_all", allEntries, analyzer);
@ -126,7 +95,14 @@ public class CompoundAnalysisTests {
String tokText = termAtt.term();
terms.add(tokText);
}
MatcherAssert.assertThat(terms.size(), equalTo(8));
MatcherAssert.assertThat(terms, hasItems("donau", "dampf", "schiff", "donaudampfschiff", "spargel", "creme", "suppe", "spargelcremesuppe"));
return terms;
}
/** Loads the shared analysis test settings from the JSON fixture on the classpath. */
private Settings getJsonSettings() {
    return settingsBuilder().loadFromClasspath("org/elasticsearch/index/analysis/test1.json").build();
}
/** Loads the shared analysis test settings from the YAML fixture on the classpath. */
private Settings getYamlSettings() {
    return settingsBuilder().loadFromClasspath("org/elasticsearch/index/analysis/test1.yml").build();
}
}

View File

@ -31,7 +31,7 @@
},
"dict_dec" : {
"type" : "dictionary_decompounder",
"word_list_path" : "/dev/null"
"word_list" : ["donau", "dampf", "schiff", "spargel", "creme", "suppe"]
}
},
"analyzer" : {
@ -61,7 +61,7 @@
"tokenizer" : "standard",
"filter" : ["standard", "lowercase", "stop", "czech_stem"]
},
"custom5" : {
"decompoundingAnalyzer" : {
"tokenizer" : "standard",
"filter" : ["dict_dec"]
}

View File

@ -22,7 +22,7 @@ index :
type : org.elasticsearch.index.analysis.filter1.MyFilterTokenFilterFactory
dict_dec :
type : dictionary_decompounder
word_list_path : /dev/null
word_list : [donau, dampf, schiff, spargel, creme, suppe]
analyzer :
standard :
alias: alias1,alias2
@ -44,6 +44,6 @@ index :
czechAnalyzerWithStemmer :
tokenizer : standard
filter : [standard, lowercase, stop, czech_stem]
custom5 :
decompoundingAnalyzer :
tokenizer : standard
filter : [dict_dec]