Analysis: Synonym Token Filter, closes #900.
parent 1b686d3c2b
commit 15d8f0b1ac
@@ -0,0 +1,258 @@
/*
 * Licensed to Elastic Search and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Elastic Search licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.lucene.analysis.synonym;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;

/**
 * SynonymFilter handles multi-token synonyms with variable position increment offsets.
 * <p>
 * The matched tokens from the input stream may be optionally passed through (includeOrig=true)
 * or discarded. If the original tokens are included, the position increments may be modified
 * to retain absolute positions after merging with the synonym tokenstream.
 * <p>
 * Generated synonyms will start at the same position as the first matched source token.
 */
// LUCENE MONITOR: Taken from 4.0, remove once upgraded
public final class SynonymFilter extends TokenFilter {

    private final SynonymMap map;  // Map<String, SynonymMap>
    private Iterator<AttributeSource> replacement;  // iterator over generated tokens

    public SynonymFilter(TokenStream in, SynonymMap map) {
        super(in);
        if (map == null)
            throw new IllegalArgumentException("map is required");

        this.map = map;
        // just ensuring these attributes exist...
        addAttribute(CharTermAttribute.class);
        addAttribute(PositionIncrementAttribute.class);
        addAttribute(OffsetAttribute.class);
        addAttribute(TypeAttribute.class);
    }


    /*
     * Need to worry about multiple scenarios:
     *  - need to go for the longest match
     *    a b => foo      #shouldn't match if "a b" is followed by "c d"
     *    a b c d => bar
     *  - need to backtrack - retry matches for tokens already read
     *    a b c d => foo
     *    b c => bar
     *    If the input stream is "a b c x", one will consume "a b c d"
     *    trying to match the first rule... all but "a" should be
     *    pushed back so a match may be made on "b c".
     *  - don't try and match generated tokens (thus need separate queue)
     *    matching is not recursive.
     *  - handle optional generation of original tokens in all these cases,
     *    merging token streams to preserve token positions.
     *  - preserve original positionIncrement of first matched token
     */
    @Override
    public boolean incrementToken() throws IOException {
        while (true) {
            // if there are any generated tokens, return them... don't try any
            // matches against them, as we specifically don't want recursion.
            if (replacement != null && replacement.hasNext()) {
                copy(this, replacement.next());
                return true;
            }

            // common case fast-path of first token not matching anything
            AttributeSource firstTok = nextTok();
            if (firstTok == null) return false;
            CharTermAttribute termAtt = firstTok.addAttribute(CharTermAttribute.class);
            SynonymMap result = map.submap != null ? map.submap.get(termAtt.buffer(), 0, termAtt.length()) : null;
            if (result == null) {
                copy(this, firstTok);
                return true;
            }

            // fast-path failed, clone ourselves if needed
            if (firstTok == this)
                firstTok = cloneAttributes();
            // OK, we matched a token, so find the longest match.

            matched = new LinkedList<AttributeSource>();

            result = match(result);

            if (result == null) {
                // no match, simply return the first token read.
                copy(this, firstTok);
                return true;
            }

            // reuse, or create new one each time?
            ArrayList<AttributeSource> generated = new ArrayList<AttributeSource>(result.synonyms.length + matched.size() + 1);

            //
            // there was a match... let's generate the new tokens, merging
            // in the matched tokens (position increments need adjusting)
            //
            AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast();
            boolean includeOrig = result.includeOrig();

            AttributeSource origTok = includeOrig ? firstTok : null;
            PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class);
            int origPos = firstPosIncAtt.getPositionIncrement();  // position of origTok in the original stream
            int repPos = 0; // curr position in replacement token stream
            int pos = 0;  // current position in merged token stream

            for (int i = 0; i < result.synonyms.length; i++) {
                Token repTok = result.synonyms[i];
                AttributeSource newTok = firstTok.cloneAttributes();
                CharTermAttribute newTermAtt = newTok.addAttribute(CharTermAttribute.class);
                OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class);
                PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(PositionIncrementAttribute.class);

                OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class);

                newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
                newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
                repPos += repTok.getPositionIncrement();
                if (i == 0) repPos = origPos;  // make position of first token equal to original

                // if necessary, insert original tokens and adjust position increment
                while (origTok != null && origPos <= repPos) {
                    PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
                    origPosInc.setPositionIncrement(origPos - pos);
                    generated.add(origTok);
                    pos += origPosInc.getPositionIncrement();
                    origTok = matched.isEmpty() ? null : matched.removeFirst();
                    if (origTok != null) {
                        origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
                        origPos += origPosInc.getPositionIncrement();
                    }
                }

                newPosIncAtt.setPositionIncrement(repPos - pos);
                generated.add(newTok);
                pos += newPosIncAtt.getPositionIncrement();
            }

            // finish up any leftover original tokens
            while (origTok != null) {
                PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
                origPosInc.setPositionIncrement(origPos - pos);
                generated.add(origTok);
                pos += origPosInc.getPositionIncrement();
                origTok = matched.isEmpty() ? null : matched.removeFirst();
                if (origTok != null) {
                    origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
                    origPos += origPosInc.getPositionIncrement();
                }
            }

            // what if we replaced a longer sequence with a shorter one?
            // a/0 b/5 => foo/0
            // should I re-create the gap on the next buffered token?

            replacement = generated.iterator();
            // Now return to the top of the loop to read and return the first
            // generated token.. The reason this is done is that we may have generated
            // nothing at all, and may need to continue with more matching logic.
        }
    }


    //
    // Defer creation of the buffer until the first time it is used to
    // optimize short fields with no matches.
    //
    private LinkedList<AttributeSource> buffer;
    private LinkedList<AttributeSource> matched;

    private AttributeSource nextTok() throws IOException {
        if (buffer != null && !buffer.isEmpty()) {
            return buffer.removeFirst();
        } else {
            if (input.incrementToken()) {
                return this;
            } else
                return null;
        }
    }

    private void pushTok(AttributeSource t) {
        if (buffer == null) buffer = new LinkedList<AttributeSource>();
        buffer.addFirst(t);
    }

    private SynonymMap match(SynonymMap map) throws IOException {
        SynonymMap result = null;

        if (map.submap != null) {
            AttributeSource tok = nextTok();
            if (tok != null) {
                // clone ourselves.
                if (tok == this)
                    tok = cloneAttributes();
                // check for positionIncrement!=1?  if>1, should not match, if==0, check multiple at this level?
                CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
                SynonymMap subMap = map.submap.get(termAtt.buffer(), 0, termAtt.length());

                if (subMap != null) {
                    // recurse
                    result = match(subMap);
                }

                if (result != null) {
                    matched.addFirst(tok);
                } else {
                    // push back unmatched token
                    pushTok(tok);
                }
            }
        }

        // if no longer sequence matched, so if this node has synonyms, it's the match.
        if (result == null && map.synonyms != null) {
            result = map;
        }

        return result;
    }

    private void copy(AttributeSource target, AttributeSource source) {
        if (target != source)
            source.copyTo(target);
    }

    @Override
    public void reset() throws IOException {
        input.reset();
        replacement = null;
    }
}
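For orientation, here is a minimal sketch of how the new filter is driven: a SynonymMap is populated with a single multi-token rule and the filter is run over a plain whitespace tokenizer. This is not part of the commit; the rule text, class name, and expected output are illustrative assumptions, and the tokenizer/Version choice simply mirrors the Lucene 3.1 APIs referenced elsewhere in this change.

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

import java.io.StringReader;
import java.util.Arrays;

public class SynonymFilterSketch {
    public static void main(String[] args) throws Exception {
        // one rule: "sea biscuit => seabiscuit", original tokens discarded (includeOrig=false)
        SynonymMap map = new SynonymMap(true /* ignoreCase */);
        map.add(Arrays.asList("sea", "biscuit"),
                SynonymMap.makeTokens(Arrays.asList("seabiscuit")),
                false /* includeOrig */, true /* mergeExisting */);

        TokenStream ts = new SynonymFilter(
                new WhitespaceTokenizer(Version.LUCENE_31, new StringReader("sea biscuit race")), map);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
            System.out.println(term.toString()); // expected under these assumptions: "seabiscuit", then "race"
        }
    }
}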
@@ -0,0 +1,177 @@
/*
 * Licensed to Elastic Search and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Elastic Search licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.lucene.analysis.synonym;

import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.util.Version;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

/**
 * Mapping rules for use with {@link SynonymFilter}
 */
public class SynonymMap {
    /**
     * @lucene.internal
     */
    public CharArrayMap<SynonymMap> submap; // recursive: Map<String, SynonymMap>
    /**
     * @lucene.internal
     */
    public Token[] synonyms;
    int flags;

    static final int INCLUDE_ORIG = 0x01;
    static final int IGNORE_CASE = 0x02;

    public SynonymMap() {
    }

    public SynonymMap(boolean ignoreCase) {
        if (ignoreCase) flags |= IGNORE_CASE;
    }

    public boolean includeOrig() {
        return (flags & INCLUDE_ORIG) != 0;
    }

    public boolean ignoreCase() {
        return (flags & IGNORE_CASE) != 0;
    }

    /**
     * @param singleMatch   List<String>, the sequence of strings to match
     * @param replacement   List<Token> the list of tokens to use on a match
     * @param includeOrig   sets a flag on this mapping signaling the generation of matched tokens in addition to the replacement tokens
     * @param mergeExisting merge the replacement tokens with any other mappings that exist
     */
    public void add(List<String> singleMatch, List<Token> replacement, boolean includeOrig, boolean mergeExisting) {
        SynonymMap currMap = this;
        for (String str : singleMatch) {
            if (currMap.submap == null) {
                // for now hardcode at 4.0, as its what the old code did.
                // would be nice to fix, but shouldn't store a version in each submap!!!
                currMap.submap = new CharArrayMap<SynonymMap>(Version.LUCENE_31, 1, ignoreCase());
            }

            SynonymMap map = currMap.submap.get(str);
            if (map == null) {
                map = new SynonymMap();
                map.flags |= flags & IGNORE_CASE;
                currMap.submap.put(str, map);
            }

            currMap = map;
        }

        if (currMap.synonyms != null && !mergeExisting) {
            throw new RuntimeException("SynonymFilter: there is already a mapping for " + singleMatch);
        }
        List<Token> superset = currMap.synonyms == null ? replacement :
                mergeTokens(Arrays.asList(currMap.synonyms), replacement);
        currMap.synonyms = superset.toArray(new Token[superset.size()]);
        if (includeOrig) currMap.flags |= INCLUDE_ORIG;
    }


    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder("<");
        if (synonyms != null) {
            sb.append("[");
            for (int i = 0; i < synonyms.length; i++) {
                if (i != 0) sb.append(',');
                sb.append(synonyms[i]);
            }
            if ((flags & INCLUDE_ORIG) != 0) {
                sb.append(",ORIG");
            }
            sb.append("],");
        }
        sb.append(submap);
        sb.append(">");
        return sb.toString();
    }


    /**
     * Produces a List<Token> from a List<String>
     */
    public static List<Token> makeTokens(List<String> strings) {
        List<Token> ret = new ArrayList<Token>(strings.size());
        for (String str : strings) {
            //Token newTok = new Token(str,0,0,"SYNONYM");
            Token newTok = new Token(str, 0, 0, "SYNONYM");
            ret.add(newTok);
        }
        return ret;
    }


    /**
     * Merge two lists of tokens, producing a single list with manipulated positionIncrements so that
     * the tokens end up at the same position.
     *
     * Example: [a b] merged with [c d] produces [a/b c/d]  ('/' denotes tokens in the same position)
     * Example: [a,5 b,2] merged with [c d,4 e,4] produces [c a,5/d b,2 e,2]  (a,n means a has posInc=n)
     */
    public static List<Token> mergeTokens(List<Token> lst1, List<Token> lst2) {
        ArrayList<Token> result = new ArrayList<Token>();
        if (lst1 == null || lst2 == null) {
            if (lst2 != null) result.addAll(lst2);
            if (lst1 != null) result.addAll(lst1);
            return result;
        }

        int pos = 0;
        Iterator<Token> iter1 = lst1.iterator();
        Iterator<Token> iter2 = lst2.iterator();
        Token tok1 = iter1.hasNext() ? iter1.next() : null;
        Token tok2 = iter2.hasNext() ? iter2.next() : null;
        int pos1 = tok1 != null ? tok1.getPositionIncrement() : 0;
        int pos2 = tok2 != null ? tok2.getPositionIncrement() : 0;
        while (tok1 != null || tok2 != null) {
            while (tok1 != null && (pos1 <= pos2 || tok2 == null)) {
                Token tok = new Token(tok1.startOffset(), tok1.endOffset(), tok1.type());
                tok.copyBuffer(tok1.buffer(), 0, tok1.length());
                tok.setPositionIncrement(pos1 - pos);
                result.add(tok);
                pos = pos1;
                tok1 = iter1.hasNext() ? iter1.next() : null;
                pos1 += tok1 != null ? tok1.getPositionIncrement() : 0;
            }
            while (tok2 != null && (pos2 <= pos1 || tok1 == null)) {
                Token tok = new Token(tok2.startOffset(), tok2.endOffset(), tok2.type());
                tok.copyBuffer(tok2.buffer(), 0, tok2.length());
                tok.setPositionIncrement(pos2 - pos);
                result.add(tok);
                pos = pos2;
                tok2 = iter2.hasNext() ? iter2.next() : null;
                pos2 += tok2 != null ? tok2.getPositionIncrement() : 0;
            }
        }
        return result;
    }

}
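The second example in the mergeTokens javadoc can be reproduced with a small sketch. This is hypothetical code, not part of the commit; the token texts and increments are picked to mirror that documented example, and the merged list should put a and d at the same position.

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.synonym.SynonymMap;

import java.util.Arrays;
import java.util.List;

public class MergeTokensSketch {
    public static void main(String[] args) {
        List<Token> lst1 = SynonymMap.makeTokens(Arrays.asList("a", "b"));
        lst1.get(0).setPositionIncrement(5); // a at position 5
        lst1.get(1).setPositionIncrement(2); // b at position 7

        List<Token> lst2 = SynonymMap.makeTokens(Arrays.asList("c", "d", "e"));
        lst2.get(1).setPositionIncrement(4); // c at 1, d at 5
        lst2.get(2).setPositionIncrement(4); // e at 9

        for (Token t : SynonymMap.mergeTokens(lst1, lst2)) {
            // print the term text and the merged position increment
            System.out.println(new String(t.buffer(), 0, t.length()) + " posInc=" + t.getPositionIncrement());
        }
        // expected positions under these assumptions: c at 1, a and d together at 5, b at 7, e at 9
    }
}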
@@ -42,6 +42,117 @@ public class Strings {
    private static final char EXTENSION_SEPARATOR = '.';


    /**
     * Splits a backslash escaped string on the separator.
     * <p>
     * Current backslash escaping supported:
     * <br> \n \t \r \b \f are escaped the same as a Java String
     * <br> Other characters following a backslash are produced verbatim (\c => c)
     *
     * @param s         the string to split
     * @param separator the separator to split on
     * @param decode    decode backslash escaping
     */
    public static List<String> splitSmart(String s, String separator, boolean decode) {
        ArrayList<String> lst = new ArrayList<String>(2);
        StringBuilder sb = new StringBuilder();
        int pos = 0, end = s.length();
        while (pos < end) {
            if (s.startsWith(separator, pos)) {
                if (sb.length() > 0) {
                    lst.add(sb.toString());
                    sb = new StringBuilder();
                }
                pos += separator.length();
                continue;
            }

            char ch = s.charAt(pos++);
            if (ch == '\\') {
                if (!decode) sb.append(ch);
                if (pos >= end) break;  // ERROR, or let it go?
                ch = s.charAt(pos++);
                if (decode) {
                    switch (ch) {
                        case 'n':
                            ch = '\n';
                            break;
                        case 't':
                            ch = '\t';
                            break;
                        case 'r':
                            ch = '\r';
                            break;
                        case 'b':
                            ch = '\b';
                            break;
                        case 'f':
                            ch = '\f';
                            break;
                    }
                }
            }

            sb.append(ch);
        }

        if (sb.length() > 0) {
            lst.add(sb.toString());
        }

        return lst;
    }


    public static List<String> splitWS(String s, boolean decode) {
        ArrayList<String> lst = new ArrayList<String>(2);
        StringBuilder sb = new StringBuilder();
        int pos = 0, end = s.length();
        while (pos < end) {
            char ch = s.charAt(pos++);
            if (Character.isWhitespace(ch)) {
                if (sb.length() > 0) {
                    lst.add(sb.toString());
                    sb = new StringBuilder();
                }
                continue;
            }

            if (ch == '\\') {
                if (!decode) sb.append(ch);
                if (pos >= end) break;  // ERROR, or let it go?
                ch = s.charAt(pos++);
                if (decode) {
                    switch (ch) {
                        case 'n':
                            ch = '\n';
                            break;
                        case 't':
                            ch = '\t';
                            break;
                        case 'r':
                            ch = '\r';
                            break;
                        case 'b':
                            ch = '\b';
                            break;
                        case 'f':
                            ch = '\f';
                            break;
                    }
                }
            }

            sb.append(ch);
        }

        if (sb.length() > 0) {
            lst.add(sb.toString());
        }

        return lst;
    }

    //---------------------------------------------------------------------
    // General convenience methods for working with Strings
    //---------------------------------------------------------------------
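A quick illustration of the two new helpers, with made-up inputs (not part of the commit): splitSmart consumes only the separator itself, and splitWS honours backslash escaping so a multi-word synonym term can survive whitespace splitting.

import org.elasticsearch.common.Strings;

import java.util.List;

public class SplitSketch {
    public static void main(String[] args) {
        // split a synonym rule on "=>" without decoding escapes
        List<String> sides = Strings.splitSmart("i-pod, i pod => ipod", "=>", false);
        // sides: ["i-pod, i pod ", " ipod"] -- only the separator is removed, surrounding spaces remain

        // whitespace split with escape decoding: "\ " keeps the space inside a single token
        List<String> toks = Strings.splitWS("sea\\ biscuit run", true);
        // toks: ["sea biscuit", "run"]

        System.out.println(sides + " / " + toks);
    }
}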
@@ -19,7 +19,6 @@

 package org.elasticsearch.index.analysis;

-import org.apache.lucene.analysis.WordlistLoader;
 import org.apache.lucene.analysis.ar.ArabicAnalyzer;
 import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
 import org.apache.lucene.analysis.br.BrazilianAnalyzer;

@@ -56,12 +55,12 @@ import org.elasticsearch.common.collect.MapBuilder;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;

+import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
+import java.io.Reader;
 import java.net.URL;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Set;
+import java.util.*;

 /**
  * @author kimchy (shay.banon)

@@ -140,7 +139,7 @@ public class Analysis {
             }
             return setStopWords;
         }
-        Set<String> pathLoadedStopWords = getWordList(env, settings, "stopwords");
+        Set<String> pathLoadedStopWords = getWordSet(env, settings, "stopwords");
         if (pathLoadedStopWords != null) {
             Set setStopWords = new HashSet<String>();
             for (String stopWord : pathLoadedStopWords) {

@@ -156,6 +155,14 @@
         return defaultStopWords;
     }

+    public static Set<String> getWordSet(Environment env, Settings settings, String settingsPrefix) {
+        List<String> wordList = getWordList(env, settings, settingsPrefix);
+        if (wordList == null) {
+            return null;
+        }
+        return new HashSet<String>(wordList);
+    }
+
     /**
      * Fetches a list of words from the specified settings file. The list should either be available at the key
      * specified by settingsPrefix or in a file specified by settingsPrefix + _path.

@@ -163,7 +170,7 @@
      * @throws ElasticSearchIllegalArgumentException
      *          If the word list cannot be found at either key.
      */
-    public static Set<String> getWordList(Environment env, Settings settings, String settingPrefix) {
+    public static List<String> getWordList(Environment env, Settings settings, String settingPrefix) {
         String wordListPath = settings.get(settingPrefix + "_path", null);

         if (wordListPath == null) {

@@ -171,17 +178,42 @@
             if (explicitWordList == null) {
                 return null;
             } else {
-                return new HashSet<String>(Arrays.asList(explicitWordList));
+                return Arrays.asList(explicitWordList);
             }
         }

         URL wordListFile = env.resolveConfig(wordListPath);

         try {
-            return WordlistLoader.getWordSet(new InputStreamReader(wordListFile.openStream(), Charsets.UTF_8), "#");
+            return loadWordList(new InputStreamReader(wordListFile.openStream(), Charsets.UTF_8), "#");
         } catch (IOException ioe) {
             String message = String.format("IOException while reading %s_path: %s", settingPrefix, ioe.getMessage());
             throw new ElasticSearchIllegalArgumentException(message);
         }
     }

+    public static List<String> loadWordList(Reader reader, String comment) throws IOException {
+        final List<String> result = new ArrayList<String>();
+        BufferedReader br = null;
+        try {
+            if (reader instanceof BufferedReader) {
+                br = (BufferedReader) reader;
+            } else {
+                br = new BufferedReader(reader);
+            }
+            String word = null;
+            while ((word = br.readLine()) != null) {
+                if (!Strings.hasText(word)) {
+                    continue;
+                }
+                if (!word.startsWith(comment)) {
+                    result.add(word.trim());
+                }
+            }
+        } finally {
+            if (br != null)
+                br.close();
+        }
+        return result;
+    }
 }
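A hypothetical illustration of the new loadWordList helper (not part of the commit; the sample content is made up): lines starting with the comment string are skipped, blank lines are ignored, and the remaining lines are returned trimmed, in file order.

import org.elasticsearch.index.analysis.Analysis;

import java.io.StringReader;
import java.util.List;

public class WordListSketch {
    public static void main(String[] args) throws Exception {
        String source = "# synonyms for the product index\n" +
                "i-pod, i pod => ipod\n" +
                "universe, cosmos\n";
        List<String> lines = Analysis.loadWordList(new StringReader(source), "#");
        // lines: ["i-pod, i pod => ipod", "universe, cosmos"] -- the comment line is dropped
        System.out.println(lines);
    }
}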
@@ -326,9 +326,6 @@ public class AnalysisModule extends AbstractModule {
             tokenFiltersBindings.processTokenFilter("edgeNGram", EdgeNGramTokenFilterFactory.class);
             tokenFiltersBindings.processTokenFilter("edge_ngram", EdgeNGramTokenFilterFactory.class);
             tokenFiltersBindings.processTokenFilter("shingle", ShingleTokenFilterFactory.class);
-            tokenFiltersBindings.processTokenFilter("phonetic", PhoneticTokenFilterFactory.class);
-            tokenFiltersBindings.processTokenFilter("dictionary_decompounder", DictionaryCompoundWordTokenFilterFactory.class);
-            tokenFiltersBindings.processTokenFilter("hypennation_decompounder", HyphenationCompoundWordTokenFilterFactory.class);
         }

         @Override public void processTokenizers(TokenizersBindings tokenizersBindings) {

@@ -362,6 +359,11 @@
             tokenFiltersBindings.processTokenFilter("snowball", SnowballTokenFilterFactory.class);
             tokenFiltersBindings.processTokenFilter("stemmer", StemmerTokenFilterFactory.class);
             tokenFiltersBindings.processTokenFilter("word_delimiter", WordDelimiterTokenFilterFactory.class);
+            tokenFiltersBindings.processTokenFilter("synonym", SynonymTokenFilterFactory.class);
+
+            tokenFiltersBindings.processTokenFilter("phonetic", PhoneticTokenFilterFactory.class);
+            tokenFiltersBindings.processTokenFilter("dictionary_decompounder", DictionaryCompoundWordTokenFilterFactory.class);
+            tokenFiltersBindings.processTokenFilter("hypennation_decompounder", HyphenationCompoundWordTokenFilterFactory.class);

             tokenFiltersBindings.processTokenFilter("arabic_stem", ArabicStemTokenFilterFactory.class);
             tokenFiltersBindings.processTokenFilter("brazilian_stem", BrazilianStemTokenFilterFactory.class);
@@ -0,0 +1,139 @@
/*
 * Licensed to Elastic Search and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Elastic Search licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.io.FastStringReader;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

@AnalysisSettingsRequired
public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {

    private final SynonymMap synonymMap;

    @Inject public SynonymTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, Map<String, TokenizerFactoryFactory> tokenizerFactories,
                                             @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);

        List<String> rules = Analysis.getWordList(env, settings, "synonyms");
        if (rules == null) {
            throw new ElasticSearchIllegalArgumentException("synonym requires either `synonyms` or `synonyms_path` to be configured");
        }
        boolean ignoreCase = settings.getAsBoolean("ignore_case", false);
        boolean expand = settings.getAsBoolean("expand", true);

        TokenizerFactoryFactory tokenizerFactoryFactory = tokenizerFactories.get(settings.get("tokenizer", "whitespace"));
        TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.create(settings.get("tokenizer", "whitespace"), settings);
        synonymMap = new SynonymMap(ignoreCase);
        parseRules(rules, synonymMap, "=>", ",", expand, tokenizerFactory);
    }

    @Override public TokenStream create(TokenStream tokenStream) {
        return new SynonymFilter(tokenStream, synonymMap);
    }

    static void parseRules(List<String> rules, SynonymMap map, String mappingSep,
                           String synSep, boolean expansion, TokenizerFactory tokFactory) {
        int count = 0;
        for (String rule : rules) {
            // To use regexes, we need an expression that specifies an odd number of chars.
            // This can't really be done with string.split(), and since we need to
            // do unescaping at some point anyway, we wouldn't be saving any effort
            // by using regexes.

            List<String> mapping = Strings.splitSmart(rule, mappingSep, false);

            List<List<String>> source;
            List<List<String>> target;

            if (mapping.size() > 2) {
                throw new RuntimeException("Invalid Synonym Rule:" + rule);
            } else if (mapping.size() == 2) {
                source = getSynList(mapping.get(0), synSep, tokFactory);
                target = getSynList(mapping.get(1), synSep, tokFactory);
            } else {
                source = getSynList(mapping.get(0), synSep, tokFactory);
                if (expansion) {
                    // expand to all arguments
                    target = source;
                } else {
                    // reduce to first argument
                    target = new ArrayList<List<String>>(1);
                    target.add(source.get(0));
                }
            }

            boolean includeOrig = false;
            for (List<String> fromToks : source) {
                count++;
                for (List<String> toToks : target) {
                    map.add(fromToks,
                            SynonymMap.makeTokens(toToks),
                            includeOrig,
                            true
                    );
                }
            }
        }
    }

    // a , b c , d e f => [[a],[b,c],[d,e,f]]
    private static List<List<String>> getSynList(String str, String separator, TokenizerFactory tokFactory) {
        List<String> strList = Strings.splitSmart(str, separator, false);
        // now split on whitespace to get a list of token strings
        List<List<String>> synList = new ArrayList<List<String>>();
        for (String toks : strList) {
            List<String> tokList = tokFactory == null ?
                    Strings.splitWS(toks, true) : splitByTokenizer(toks, tokFactory);
            synList.add(tokList);
        }
        return synList;
    }

    private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory) {
        TokenStream ts = tokFactory.create(new FastStringReader(source));
        List<String> tokList = new ArrayList<String>();
        try {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            while (ts.incrementToken()) {
                if (termAtt.length() > 0)
                    tokList.add(termAtt.toString());
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return tokList;
    }
}
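As a hedged sketch of how the new factory would be configured: the filter and index names below are made up, the settings keys mirror the ones read in the constructor above ("synonyms", "ignore_case", "expand", "tokenizer"), and the YAML-in-Java form follows the loadFromSource usage that appears in the tests further down. The static settingsBuilder() import is assumed to be the usual ImmutableSettings builder.

import org.elasticsearch.common.settings.Settings;

import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;

public class SynonymSettingsSketch {
    public static void main(String[] args) {
        Settings settings = settingsBuilder().loadFromSource(
                "index:\n" +
                "  analysis:\n" +
                "    filter:\n" +
                "      my_synonyms:\n" +
                "        type: synonym\n" +
                "        ignore_case: true\n" +
                "        expand: false\n" +
                "        synonyms:\n" +
                "          - \"i-pod, i pod => ipod\"\n" +
                "          - \"universe, cosmos\"\n").build();
        // the factory would read these keys via Analysis.getWordList(env, settings, "synonyms") etc.
        System.out.println(settings.getAsMap());
    }
}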
@@ -58,7 +58,7 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
         // . => DIGIT
         // \u002C => DIGIT
         // \u200D => ALPHANUM
-        Set<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
+        List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
         if (charTypeTableValues == null) {
             this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
         } else {

@@ -84,7 +84,7 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
         // If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
         this.stemEnglishPossessive = settings.getAsBoolean("stem_english_possessive", true);
         // If not null is the set of tokens to protect from being delimited
-        Set<String> protectedWords = Analysis.getWordList(env, settings, "protected_words");
+        Set<String> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
         this.protoWords = protectedWords == null ? null : CharArraySet.copy(Lucene.VERSION, protectedWords);
     }

|
@ -53,7 +53,7 @@ public abstract class AbstractCompoundWordTokenFilterFactory extends AbstractTok
|
|||
minSubwordSize = settings.getAsInt("min_subword_size", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
|
||||
maxSubwordSize = settings.getAsInt("max_subword_size", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
|
||||
onlyLongestMatch = settings.getAsBoolean("only_longest_max", false);
|
||||
wordList = Analysis.getWordList(env, settings, "word_list");
|
||||
wordList = Analysis.getWordSet(env, settings, "word_list");
|
||||
if (wordList == null) {
|
||||
throw new ElasticSearchIllegalArgumentException("word_list must be provided for [" + name + "], either as a path to a file, or directly");
|
||||
}
|
||||
|
|
|
@@ -124,7 +124,7 @@ public class AnalysisModuleTests {
         assertThat(dictionaryDecompounderAnalyze.tokenFilters().length, equalTo(1));
         assertThat(dictionaryDecompounderAnalyze.tokenFilters()[0], instanceOf(DictionaryCompoundWordTokenFilterFactory.class));

-        Set<String> wordList = Analysis.getWordList(null, settings, "index.analysis.filter.dict_dec.word_list");
+        Set<String> wordList = Analysis.getWordSet(null, settings, "index.analysis.filter.dict_dec.word_list");
         MatcherAssert.assertThat(wordList.size(), equalTo(6));
         MatcherAssert.assertThat(wordList, hasItems("donau", "dampf", "schiff", "spargel", "creme", "suppe"));
     }

@@ -136,7 +136,7 @@ public class AnalysisModuleTests {
         File wordListFile = generateWordList(words);
         Settings settings = settingsBuilder().loadFromSource("index: \n word_list_path: " + wordListFile.getAbsolutePath()).build();

-        Set<String> wordList = Analysis.getWordList(env, settings, "index.word_list");
+        Set<String> wordList = Analysis.getWordSet(env, settings, "index.word_list");
         MatcherAssert.assertThat(wordList.size(), equalTo(6));
         MatcherAssert.assertThat(wordList, hasItems(words));
     }