Analysis: Synonym Token Filter, closes #900.

kimchy 2011-05-10 22:37:33 +03:00
parent 1b686d3c2b
commit 15d8f0b1ac
9 changed files with 735 additions and 16 deletions

SynonymFilter.java (new file)

@@ -0,0 +1,258 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.lucene.analysis.synonym;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
/**
* SynonymFilter handles multi-token synonyms with variable position increment offsets.
* <p>
* The matched tokens from the input stream may be optionally passed through (includeOrig=true)
* or discarded. If the original tokens are included, the position increments may be modified
* to retain absolute positions after merging with the synonym tokenstream.
* <p>
* Generated synonyms will start at the same position as the first matched source token.
*/
// LUCENE MONITOR: Taken from 4.0, remove once upgraded
public final class SynonymFilter extends TokenFilter {
private final SynonymMap map; // Map<String, SynonymMap>
private Iterator<AttributeSource> replacement; // iterator over generated tokens
public SynonymFilter(TokenStream in, SynonymMap map) {
super(in);
if (map == null)
throw new IllegalArgumentException("map is required");
this.map = map;
// just ensuring these attributes exist...
addAttribute(CharTermAttribute.class);
addAttribute(PositionIncrementAttribute.class);
addAttribute(OffsetAttribute.class);
addAttribute(TypeAttribute.class);
}
/*
* Need to worry about multiple scenarios:
* - need to go for the longest match
* a b => foo #shouldn't match if "a b" is followed by "c d"
* a b c d => bar
* - need to backtrack - retry matches for tokens already read
* a b c d => foo
* b c => bar
* If the input stream is "a b c x", the filter will consume "a b c x"
* while trying to match the first rule... all but "a" should be
* pushed back so a match may be made on "b c".
* - don't try and match generated tokens (thus need separate queue)
* matching is not recursive.
* - handle optional generation of original tokens in all these cases,
* merging token streams to preserve token positions.
* - preserve original positionIncrement of first matched token
*/
@Override
public boolean incrementToken() throws IOException {
while (true) {
// if there are any generated tokens, return them... don't try any
// matches against them, as we specifically don't want recursion.
if (replacement != null && replacement.hasNext()) {
copy(this, replacement.next());
return true;
}
// common case fast-path of first token not matching anything
AttributeSource firstTok = nextTok();
if (firstTok == null) return false;
CharTermAttribute termAtt = firstTok.addAttribute(CharTermAttribute.class);
SynonymMap result = map.submap != null ? map.submap.get(termAtt.buffer(), 0, termAtt.length()) : null;
if (result == null) {
copy(this, firstTok);
return true;
}
// fast-path failed, clone ourselves if needed
if (firstTok == this)
firstTok = cloneAttributes();
// OK, we matched a token, so find the longest match.
matched = new LinkedList<AttributeSource>();
result = match(result);
if (result == null) {
// no match, simply return the first token read.
copy(this, firstTok);
return true;
}
// reuse, or create new one each time?
ArrayList<AttributeSource> generated = new ArrayList<AttributeSource>(result.synonyms.length + matched.size() + 1);
//
// there was a match... let's generate the new tokens, merging
// in the matched tokens (position increments need adjusting)
//
AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast();
boolean includeOrig = result.includeOrig();
AttributeSource origTok = includeOrig ? firstTok : null;
PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class);
int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream
int repPos = 0; // curr position in replacement token stream
int pos = 0; // current position in merged token stream
for (int i = 0; i < result.synonyms.length; i++) {
Token repTok = result.synonyms[i];
AttributeSource newTok = firstTok.cloneAttributes();
CharTermAttribute newTermAtt = newTok.addAttribute(CharTermAttribute.class);
OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class);
PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(PositionIncrementAttribute.class);
OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class);
newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
repPos += repTok.getPositionIncrement();
if (i == 0) repPos = origPos; // make position of first token equal to original
// if necessary, insert original tokens and adjust position increment
while (origTok != null && origPos <= repPos) {
PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
origPosInc.setPositionIncrement(origPos - pos);
generated.add(origTok);
pos += origPosInc.getPositionIncrement();
origTok = matched.isEmpty() ? null : matched.removeFirst();
if (origTok != null) {
origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
origPos += origPosInc.getPositionIncrement();
}
}
newPosIncAtt.setPositionIncrement(repPos - pos);
generated.add(newTok);
pos += newPosIncAtt.getPositionIncrement();
}
// finish up any leftover original tokens
while (origTok != null) {
PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
origPosInc.setPositionIncrement(origPos - pos);
generated.add(origTok);
pos += origPosInc.getPositionIncrement();
origTok = matched.isEmpty() ? null : matched.removeFirst();
if (origTok != null) {
origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
origPos += origPosInc.getPositionIncrement();
}
}
// what if we replaced a longer sequence with a shorter one?
// a/0 b/5 => foo/0
// should I re-create the gap on the next buffered token?
replacement = generated.iterator();
// Now return to the top of the loop to read and return the first
// generated token. The reason this is done is that we may have generated
// nothing at all, and may need to continue with more matching logic.
}
}
//
// Defer creation of the buffer until the first time it is used to
// optimize short fields with no matches.
//
private LinkedList<AttributeSource> buffer;
private LinkedList<AttributeSource> matched;
private AttributeSource nextTok() throws IOException {
if (buffer != null && !buffer.isEmpty()) {
return buffer.removeFirst();
} else {
if (input.incrementToken()) {
return this;
} else
return null;
}
}
private void pushTok(AttributeSource t) {
if (buffer == null) buffer = new LinkedList<AttributeSource>();
buffer.addFirst(t);
}
private SynonymMap match(SynonymMap map) throws IOException {
SynonymMap result = null;
if (map.submap != null) {
AttributeSource tok = nextTok();
if (tok != null) {
// clone ourselves.
if (tok == this)
tok = cloneAttributes();
// check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level?
CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
SynonymMap subMap = map.submap.get(termAtt.buffer(), 0, termAtt.length());
if (subMap != null) {
// recurse
result = match(subMap);
}
if (result != null) {
matched.addFirst(tok);
} else {
// push back unmatched token
pushTok(tok);
}
}
}
// no longer sequence matched, so if this node has synonyms, it's the match.
if (result == null && map.synonyms != null) {
result = map;
}
return result;
}
private void copy(AttributeSource target, AttributeSource source) {
if (target != source)
source.copyTo(target);
}
@Override
public void reset() throws IOException {
input.reset();
replacement = null;
}
}
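For context, here is a minimal usage sketch (not part of the commit; the class name and the "i pod => ipod" rule are illustrative) showing how the backported filter composes with SynonymMap over a Lucene 3.x whitespace-tokenized stream, using the same attribute API as the code above.

import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;

public class SynonymFilterSketch {
    public static void main(String[] args) throws Exception {
        // map the two-token sequence "i pod" to the single token "ipod",
        // keeping the original tokens (includeOrig = true)
        SynonymMap map = new SynonymMap(true);
        map.add(Arrays.asList("i", "pod"),
                SynonymMap.makeTokens(Arrays.asList("ipod")),
                true /* includeOrig */, true /* mergeExisting */);

        TokenStream ts = new SynonymFilter(
                new WhitespaceTokenizer(Version.LUCENE_31, new StringReader("the i pod nano")), map);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // "ipod" is emitted with a position increment of 0, i.e. at the same
            // position as the first matched token "i", as the Javadoc above describes
            System.out.println(term.toString() + "/" + posInc.getPositionIncrement());
        }
    }
}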

SynonymMap.java (new file)

@@ -0,0 +1,177 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.lucene.analysis.synonym;
import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.util.Version;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
/**
* Mapping rules for use with {@link SynonymFilter}
*/
public class SynonymMap {
/**
* @lucene.internal
*/
public CharArrayMap<SynonymMap> submap; // recursive: Map<String, SynonymMap>
/**
* @lucene.internal
*/
public Token[] synonyms;
int flags;
static final int INCLUDE_ORIG = 0x01;
static final int IGNORE_CASE = 0x02;
public SynonymMap() {
}
public SynonymMap(boolean ignoreCase) {
if (ignoreCase) flags |= IGNORE_CASE;
}
public boolean includeOrig() {
return (flags & INCLUDE_ORIG) != 0;
}
public boolean ignoreCase() {
return (flags & IGNORE_CASE) != 0;
}
/**
* @param singleMatch List<String>, the sequence of strings to match
* @param replacement List<Token> the list of tokens to use on a match
* @param includeOrig sets a flag on this mapping signaling the generation of matched tokens in addition to the replacement tokens
* @param mergeExisting merge the replacement tokens with any other mappings that exist
*/
public void add(List<String> singleMatch, List<Token> replacement, boolean includeOrig, boolean mergeExisting) {
SynonymMap currMap = this;
for (String str : singleMatch) {
if (currMap.submap == null) {
// for now hardcode the version, as it's what the old code did.
// would be nice to fix, but shouldn't store a version in each submap!!!
currMap.submap = new CharArrayMap<SynonymMap>(Version.LUCENE_31, 1, ignoreCase());
}
SynonymMap map = currMap.submap.get(str);
if (map == null) {
map = new SynonymMap();
map.flags |= flags & IGNORE_CASE;
currMap.submap.put(str, map);
}
currMap = map;
}
if (currMap.synonyms != null && !mergeExisting) {
throw new RuntimeException("SynonymFilter: there is already a mapping for " + singleMatch);
}
List<Token> superset = currMap.synonyms == null ? replacement :
mergeTokens(Arrays.asList(currMap.synonyms), replacement);
currMap.synonyms = superset.toArray(new Token[superset.size()]);
if (includeOrig) currMap.flags |= INCLUDE_ORIG;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder("<");
if (synonyms != null) {
sb.append("[");
for (int i = 0; i < synonyms.length; i++) {
if (i != 0) sb.append(',');
sb.append(synonyms[i]);
}
if ((flags & INCLUDE_ORIG) != 0) {
sb.append(",ORIG");
}
sb.append("],");
}
sb.append(submap);
sb.append(">");
return sb.toString();
}
/**
* Produces a List<Token> from a List<String>
*/
public static List<Token> makeTokens(List<String> strings) {
List<Token> ret = new ArrayList<Token>(strings.size());
for (String str : strings) {
//Token newTok = new Token(str,0,0,"SYNONYM");
Token newTok = new Token(str, 0, 0, "SYNONYM");
ret.add(newTok);
}
return ret;
}
/**
* Merge two lists of tokens, producing a single list with manipulated positionIncrements so that
* the tokens end up at the same position.
*
* Example: [a b] merged with [c d] produces [a/c b/d] ('/' denotes tokens in the same position)
* Example: [a,5 b,2] merged with [c d,4 e,4] produces [c a,5/d b,2 e,2] (a,n means a has posInc=n)
*/
public static List<Token> mergeTokens(List<Token> lst1, List<Token> lst2) {
ArrayList<Token> result = new ArrayList<Token>();
if (lst1 == null || lst2 == null) {
if (lst2 != null) result.addAll(lst2);
if (lst1 != null) result.addAll(lst1);
return result;
}
int pos = 0;
Iterator<Token> iter1 = lst1.iterator();
Iterator<Token> iter2 = lst2.iterator();
Token tok1 = iter1.hasNext() ? iter1.next() : null;
Token tok2 = iter2.hasNext() ? iter2.next() : null;
int pos1 = tok1 != null ? tok1.getPositionIncrement() : 0;
int pos2 = tok2 != null ? tok2.getPositionIncrement() : 0;
while (tok1 != null || tok2 != null) {
while (tok1 != null && (pos1 <= pos2 || tok2 == null)) {
Token tok = new Token(tok1.startOffset(), tok1.endOffset(), tok1.type());
tok.copyBuffer(tok1.buffer(), 0, tok1.length());
tok.setPositionIncrement(pos1 - pos);
result.add(tok);
pos = pos1;
tok1 = iter1.hasNext() ? iter1.next() : null;
pos1 += tok1 != null ? tok1.getPositionIncrement() : 0;
}
while (tok2 != null && (pos2 <= pos1 || tok1 == null)) {
Token tok = new Token(tok2.startOffset(), tok2.endOffset(), tok2.type());
tok.copyBuffer(tok2.buffer(), 0, tok2.length());
tok.setPositionIncrement(pos2 - pos);
result.add(tok);
pos = pos2;
tok2 = iter2.hasNext() ? iter2.next() : null;
pos2 += tok2 != null ? tok2.getPositionIncrement() : 0;
}
}
return result;
}
}
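A short illustrative sketch (not part of the commit) of the two halves of this class: add() builds a nested, trie-like map keyed on each token of the match sequence, so rules that share a prefix share nodes, and mergeTokens() aligns two token lists by absolute position as its Javadoc describes.

import java.util.Arrays;
import java.util.List;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.synonym.SynonymMap;

public class SynonymMapSketch {
    public static void main(String[] args) {
        SynonymMap root = new SynonymMap(false);
        // "a b" => foo and "a b c d" => bar share the nodes for "a" and "b";
        // the longer rule simply nests two levels deeper in the same trie
        root.add(Arrays.asList("a", "b"), SynonymMap.makeTokens(Arrays.asList("foo")), false, true);
        root.add(Arrays.asList("a", "b", "c", "d"), SynonymMap.makeTokens(Arrays.asList("bar")), false, true);
        System.out.println(root); // toString() prints the nested submaps and their synonyms

        // [a b] merged with [c d] ends up as a/c b/d: "c" and "d" are given the
        // position increments needed to line up with "a" and "b"
        List<Token> merged = SynonymMap.mergeTokens(
                SynonymMap.makeTokens(Arrays.asList("a", "b")),
                SynonymMap.makeTokens(Arrays.asList("c", "d")));
        for (Token t : merged) {
            System.out.println(new String(t.buffer(), 0, t.length()) + " posInc=" + t.getPositionIncrement());
        }
    }
}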

Strings.java

@@ -42,6 +42,117 @@ public class Strings {
private static final char EXTENSION_SEPARATOR = '.';
/**
* Splits a backslash escaped string on the separator.
* <p>
* Current backslash escaping supported:
* <br> \n \t \r \b \f are escaped the same as a Java String
* <br> Other characters following a backslash are produced verbatim (\c => c)
*
* @param s the string to split
* @param separator the separator to split on
* @param decode decode backslash escaping
*/
public static List<String> splitSmart(String s, String separator, boolean decode) {
ArrayList<String> lst = new ArrayList<String>(2);
StringBuilder sb = new StringBuilder();
int pos = 0, end = s.length();
while (pos < end) {
if (s.startsWith(separator, pos)) {
if (sb.length() > 0) {
lst.add(sb.toString());
sb = new StringBuilder();
}
pos += separator.length();
continue;
}
char ch = s.charAt(pos++);
if (ch == '\\') {
if (!decode) sb.append(ch);
if (pos >= end) break; // ERROR, or let it go?
ch = s.charAt(pos++);
if (decode) {
switch (ch) {
case 'n':
ch = '\n';
break;
case 't':
ch = '\t';
break;
case 'r':
ch = '\r';
break;
case 'b':
ch = '\b';
break;
case 'f':
ch = '\f';
break;
}
}
}
sb.append(ch);
}
if (sb.length() > 0) {
lst.add(sb.toString());
}
return lst;
}
public static List<String> splitWS(String s, boolean decode) {
ArrayList<String> lst = new ArrayList<String>(2);
StringBuilder sb = new StringBuilder();
int pos = 0, end = s.length();
while (pos < end) {
char ch = s.charAt(pos++);
if (Character.isWhitespace(ch)) {
if (sb.length() > 0) {
lst.add(sb.toString());
sb = new StringBuilder();
}
continue;
}
if (ch == '\\') {
if (!decode) sb.append(ch);
if (pos >= end) break; // ERROR, or let it go?
ch = s.charAt(pos++);
if (decode) {
switch (ch) {
case 'n':
ch = '\n';
break;
case 't':
ch = '\t';
break;
case 'r':
ch = '\r';
break;
case 'b':
ch = '\b';
break;
case 'f':
ch = '\f';
break;
}
}
}
sb.append(ch);
}
if (sb.length() > 0) {
lst.add(sb.toString());
}
return lst;
}
//---------------------------------------------------------------------
// General convenience methods for working with Strings
//---------------------------------------------------------------------
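Both helpers split while honoring backslash escapes, which is exactly what the synonym rule parser needs: splitSmart() splits on an arbitrary separator string, splitWS() on whitespace. A hedged sketch of the expected behavior (class name is illustrative):

import java.util.List;

import org.elasticsearch.common.Strings;

public class SplitSmartSketch {
    public static void main(String[] args) {
        // split a rule on "=>" without decoding escapes; surrounding whitespace is kept,
        // so the caller later re-splits each side on whitespace
        List<String> mapping = Strings.splitSmart("foo bar => baz", "=>", false);
        System.out.println(mapping); // two elements: "foo bar " and " baz"

        // an escaped separator is not split on: "a\,b" stays one element
        List<String> parts = Strings.splitSmart("a\\,b,c", ",", true);
        System.out.println(parts); // two elements: "a,b" and "c"

        // whitespace split, collapsing runs of spaces
        System.out.println(Strings.splitWS("i   pod", true)); // [i, pod]
    }
}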

Analysis.java

@@ -19,7 +19,6 @@
package org.elasticsearch.index.analysis;
-import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
import org.apache.lucene.analysis.br.BrazilianAnalyzer;
@@ -56,12 +55,12 @@ import org.elasticsearch.common.collect.MapBuilder;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
+import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
+import java.io.Reader;
import java.net.URL;
-import java.util.Arrays;
+import java.util.*;
-import java.util.HashSet;
-import java.util.Set;
/**
* @author kimchy (shay.banon)
@@ -140,7 +139,7 @@ public class Analysis {
}
return setStopWords;
}
-Set<String> pathLoadedStopWords = getWordList(env, settings, "stopwords");
+Set<String> pathLoadedStopWords = getWordSet(env, settings, "stopwords");
if (pathLoadedStopWords != null) {
Set setStopWords = new HashSet<String>();
for (String stopWord : pathLoadedStopWords) {
@@ -156,6 +155,14 @@ public class Analysis {
return defaultStopWords;
}
public static Set<String> getWordSet(Environment env, Settings settings, String settingsPrefix) {
List<String> wordList = getWordList(env, settings, settingsPrefix);
if (wordList == null) {
return null;
}
return new HashSet<String>(wordList);
}
/**
* Fetches a list of words from the specified settings file. The list should either be available at the key
* specified by settingsPrefix or in a file specified by settingsPrefix + _path.
@@ -163,7 +170,7 @@ public class Analysis {
* @throws ElasticSearchIllegalArgumentException
* If the word list cannot be found at either key.
*/
-public static Set<String> getWordList(Environment env, Settings settings, String settingPrefix) {
+public static List<String> getWordList(Environment env, Settings settings, String settingPrefix) {
String wordListPath = settings.get(settingPrefix + "_path", null);
if (wordListPath == null) {
@@ -171,17 +178,42 @@ public class Analysis {
if (explicitWordList == null) {
return null;
} else {
-return new HashSet<String>(Arrays.asList(explicitWordList));
+return Arrays.asList(explicitWordList);
}
}
URL wordListFile = env.resolveConfig(wordListPath);
try {
-return WordlistLoader.getWordSet(new InputStreamReader(wordListFile.openStream(), Charsets.UTF_8), "#");
+return loadWordList(new InputStreamReader(wordListFile.openStream(), Charsets.UTF_8), "#");
} catch (IOException ioe) {
String message = String.format("IOException while reading %s_path: %s", settingPrefix, ioe.getMessage());
throw new ElasticSearchIllegalArgumentException(message);
}
}
public static List<String> loadWordList(Reader reader, String comment) throws IOException {
final List<String> result = new ArrayList<String>();
BufferedReader br = null;
try {
if (reader instanceof BufferedReader) {
br = (BufferedReader) reader;
} else {
br = new BufferedReader(reader);
}
String word = null;
while ((word = br.readLine()) != null) {
if (!Strings.hasText(word)) {
continue;
}
if (!word.startsWith(comment)) {
result.add(word.trim());
}
}
} finally {
if (br != null)
br.close();
}
return result;
}
}
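With this change getWordList() returns a List, so callers such as the synonym factory keep rule order and duplicates, while the new getWordSet() preserves the old Set behavior for stop words, protected words and compound word lists. A small illustrative sketch of loadWordList(), which skips blank lines and lines starting with the comment marker and trims the rest:

import java.io.StringReader;
import java.util.List;

import org.elasticsearch.index.analysis.Analysis;

public class WordListSketch {
    public static void main(String[] args) throws Exception {
        String source = "# synonyms used at index time\n" +
                        "i-pod, i pod => ipod\n" +
                        "\n" +
                        "universe, cosmos\n";
        List<String> rules = Analysis.loadWordList(new StringReader(source), "#");
        // two entries survive: "i-pod, i pod => ipod" and "universe, cosmos"
        System.out.println(rules);
    }
}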

AnalysisModule.java

@@ -326,9 +326,6 @@ public class AnalysisModule extends AbstractModule {
tokenFiltersBindings.processTokenFilter("edgeNGram", EdgeNGramTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("edge_ngram", EdgeNGramTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("shingle", ShingleTokenFilterFactory.class);
-tokenFiltersBindings.processTokenFilter("phonetic", PhoneticTokenFilterFactory.class);
-tokenFiltersBindings.processTokenFilter("dictionary_decompounder", DictionaryCompoundWordTokenFilterFactory.class);
-tokenFiltersBindings.processTokenFilter("hypennation_decompounder", HyphenationCompoundWordTokenFilterFactory.class);
}
@Override public void processTokenizers(TokenizersBindings tokenizersBindings) {
@@ -362,6 +359,11 @@ public class AnalysisModule extends AbstractModule {
tokenFiltersBindings.processTokenFilter("snowball", SnowballTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("stemmer", StemmerTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("word_delimiter", WordDelimiterTokenFilterFactory.class);
+tokenFiltersBindings.processTokenFilter("synonym", SynonymTokenFilterFactory.class);
+tokenFiltersBindings.processTokenFilter("phonetic", PhoneticTokenFilterFactory.class);
+tokenFiltersBindings.processTokenFilter("dictionary_decompounder", DictionaryCompoundWordTokenFilterFactory.class);
+tokenFiltersBindings.processTokenFilter("hypennation_decompounder", HyphenationCompoundWordTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("arabic_stem", ArabicStemTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("brazilian_stem", BrazilianStemTokenFilterFactory.class);

SynonymTokenFilterFactory.java (new file)

@@ -0,0 +1,139 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.io.FastStringReader;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
@AnalysisSettingsRequired
public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
private final SynonymMap synonymMap;
@Inject public SynonymTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, Map<String, TokenizerFactoryFactory> tokenizerFactories,
@Assisted String name, @Assisted Settings settings) {
super(index, indexSettings, name, settings);
List<String> rules = Analysis.getWordList(env, settings, "synonyms");
if (rules == null) {
throw new ElasticSearchIllegalArgumentException("synonym requires either `synonyms` or `synonyms_path` to be configured");
}
boolean ignoreCase = settings.getAsBoolean("ignore_case", false);
boolean expand = settings.getAsBoolean("expand", true);
TokenizerFactoryFactory tokenizerFactoryFactory = tokenizerFactories.get(settings.get("tokenizer", "whitespace"));
TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.create(settings.get("tokenizer", "whitespace"), settings);
synonymMap = new SynonymMap(ignoreCase);
parseRules(rules, synonymMap, "=>", ",", expand, tokenizerFactory);
}
@Override public TokenStream create(TokenStream tokenStream) {
return new SynonymFilter(tokenStream, synonymMap);
}
static void parseRules(List<String> rules, SynonymMap map, String mappingSep,
String synSep, boolean expansion, TokenizerFactory tokFactory) {
int count = 0;
for (String rule : rules) {
// To use regexes, we need an expression that specifies an odd number of chars.
// This can't really be done with string.split(), and since we need to
// do unescaping at some point anyway, we wouldn't be saving any effort
// by using regexes.
List<String> mapping = Strings.splitSmart(rule, mappingSep, false);
List<List<String>> source;
List<List<String>> target;
if (mapping.size() > 2) {
throw new RuntimeException("Invalid Synonym Rule:" + rule);
} else if (mapping.size() == 2) {
source = getSynList(mapping.get(0), synSep, tokFactory);
target = getSynList(mapping.get(1), synSep, tokFactory);
} else {
source = getSynList(mapping.get(0), synSep, tokFactory);
if (expansion) {
// expand to all arguments
target = source;
} else {
// reduce to first argument
target = new ArrayList<List<String>>(1);
target.add(source.get(0));
}
}
boolean includeOrig = false;
for (List<String> fromToks : source) {
count++;
for (List<String> toToks : target) {
map.add(fromToks,
SynonymMap.makeTokens(toToks),
includeOrig,
true
);
}
}
}
}
// a , b c , d e f => [[a],[b,c],[d,e,f]]
private static List<List<String>> getSynList(String str, String separator, TokenizerFactory tokFactory) {
List<String> strList = Strings.splitSmart(str, separator, false);
// now split on whitespace to get a list of token strings
List<List<String>> synList = new ArrayList<List<String>>();
for (String toks : strList) {
List<String> tokList = tokFactory == null ?
Strings.splitWS(toks, true) : splitByTokenizer(toks, tokFactory);
synList.add(tokList);
}
return synList;
}
private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory) {
TokenStream ts = tokFactory.create(new FastStringReader(source));
List<String> tokList = new ArrayList<String>();
try {
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
while (ts.incrementToken()) {
if (termAtt.length() > 0)
tokList.add(termAtt.toString());
}
} catch (IOException e) {
throw new RuntimeException(e);
}
return tokList;
}
}
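Putting the pieces together: the factory reads its rules from the synonyms setting (or from a file referenced by synonyms_path), splits each rule on "=>" and "," with Strings.splitSmart, and tokenizes each side with the configured tokenizer (whitespace by default). A rule such as "i-pod, i pod => ipod" maps every left-hand sequence to every right-hand replacement; a plain list such as "universe, cosmos" maps each entry to all of them when expand is true, or to the first entry only when expand is false. The sketch below is hypothetical (the filter name my_synonyms and the rules are made up); it only shows how such settings could be assembled, using the same settingsBuilder().loadFromSource() pattern the tests use.

import org.elasticsearch.common.settings.Settings;

import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;

public class SynonymSettingsSketch {
    public static void main(String[] args) {
        // keys mirror what the factory reads: type, synonyms / synonyms_path,
        // ignore_case, expand and tokenizer
        Settings settings = settingsBuilder().loadFromSource(
                "index:\n" +
                "  analysis:\n" +
                "    filter:\n" +
                "      my_synonyms:\n" +
                "        type: synonym\n" +
                "        ignore_case: true\n" +
                "        expand: true\n" +
                "        synonyms:\n" +
                "          - \"i-pod, i pod => ipod\"\n" +
                "          - \"universe, cosmos\"\n").build();
        System.out.println(settings.get("index.analysis.filter.my_synonyms.type"));
    }
}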

WordDelimiterTokenFilterFactory.java

@@ -58,7 +58,7 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
// . => DIGIT
// \u002C => DIGIT
// \u200D => ALPHANUM
-Set<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
+List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
if (charTypeTableValues == null) {
this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
} else {
@@ -84,7 +84,7 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
// If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
this.stemEnglishPossessive = settings.getAsBoolean("stem_english_possessive", true);
// If not null is the set of tokens to protect from being delimited
-Set<String> protectedWords = Analysis.getWordList(env, settings, "protected_words");
+Set<String> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
this.protoWords = protectedWords == null ? null : CharArraySet.copy(Lucene.VERSION, protectedWords);
}

AbstractCompoundWordTokenFilterFactory.java

@@ -53,7 +53,7 @@ public abstract class AbstractCompoundWordTokenFilterFactory extends AbstractTokenFilterFactory
minSubwordSize = settings.getAsInt("min_subword_size", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
maxSubwordSize = settings.getAsInt("max_subword_size", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
onlyLongestMatch = settings.getAsBoolean("only_longest_max", false);
-wordList = Analysis.getWordList(env, settings, "word_list");
+wordList = Analysis.getWordSet(env, settings, "word_list");
if (wordList == null) {
throw new ElasticSearchIllegalArgumentException("word_list must be provided for [" + name + "], either as a path to a file, or directly");
}

AnalysisModuleTests.java

@@ -124,7 +124,7 @@ public class AnalysisModuleTests {
assertThat(dictionaryDecompounderAnalyze.tokenFilters().length, equalTo(1));
assertThat(dictionaryDecompounderAnalyze.tokenFilters()[0], instanceOf(DictionaryCompoundWordTokenFilterFactory.class));
-Set<String> wordList = Analysis.getWordList(null, settings, "index.analysis.filter.dict_dec.word_list");
+Set<String> wordList = Analysis.getWordSet(null, settings, "index.analysis.filter.dict_dec.word_list");
MatcherAssert.assertThat(wordList.size(), equalTo(6));
MatcherAssert.assertThat(wordList, hasItems("donau", "dampf", "schiff", "spargel", "creme", "suppe"));
}
@@ -136,7 +136,7 @@ public class AnalysisModuleTests {
File wordListFile = generateWordList(words);
Settings settings = settingsBuilder().loadFromSource("index: \n word_list_path: " + wordListFile.getAbsolutePath()).build();
-Set<String> wordList = Analysis.getWordList(env, settings, "index.word_list");
+Set<String> wordList = Analysis.getWordSet(env, settings, "index.word_list");
MatcherAssert.assertThat(wordList.size(), equalTo(6));
MatcherAssert.assertThat(wordList, hasItems(words));
}