From 563ad625c0f69f3ff0f4c39f46421b1dc2c91b6f Mon Sep 17 00:00:00 2001 From: kimchy Date: Mon, 9 May 2011 02:48:11 +0300 Subject: [PATCH] Analysis: Word Delimiter Token Filter, closes #918. --- .idea/dictionaries/kimchy.xml | 2 + .../miscellaneous/WordDelimiterFilter.java | 574 ++++++++++++++++++ .../miscellaneous/WordDelimiterIterator.java | 341 +++++++++++ .../index/analysis/Analysis.java | 18 +- .../index/analysis/AnalysisModule.java | 1 + .../WordDelimiterTokenFilterFactory.java | 196 ++++++ ...bstractCompoundWordTokenFilterFactory.java | 9 +- ...tionaryCompoundWordTokenFilterFactory.java | 5 +- ...enationCompoundWordTokenFilterFactory.java | 15 +- .../index/analysis/AnalysisModuleTests.java | 9 +- 10 files changed, 1146 insertions(+), 24 deletions(-) create mode 100644 modules/elasticsearch/src/main/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java create mode 100644 modules/elasticsearch/src/main/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java create mode 100644 modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactory.java diff --git a/.idea/dictionaries/kimchy.xml b/.idea/dictionaries/kimchy.xml index 09db370775e..c04da9e1c0e 100644 --- a/.idea/dictionaries/kimchy.xml +++ b/.idea/dictionaries/kimchy.xml @@ -23,6 +23,7 @@ calc camelcase canonicalhost + catenate charfilter charsets checksum @@ -107,6 +108,7 @@ ngram noop nospawn + numerics param params persistency diff --git a/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java b/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java new file mode 100644 index 00000000000..ef0e143479a --- /dev/null +++ b/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java @@ -0,0 +1,574 @@ +/* + * Licensed to Elastic Search and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Elastic Search licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.lucene.analysis.miscellaneous; + +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.RamUsageEstimator; + +import java.io.IOException; + +/** + * Splits words into subwords and performs optional transformations on subword groups. 
+ * Words are split into subwords with the following rules:
+ * - split on intra-word delimiters (by default, all non-alphanumeric characters).
+ * - "Wi-Fi" -> "Wi", "Fi"
+ * - split on case transitions
+ * - "PowerShot" -> "Power", "Shot"
+ * - split on letter-number transitions
+ * - "SD500" -> "SD", "500"
+ * - leading and trailing intra-word delimiters on each subword are ignored
+ * - "//hello---there, 'dude'" -> "hello", "there", "dude"
+ * - trailing "'s" are removed for each subword
+ * - "O'Neil's" -> "O", "Neil"
+ * - Note: this step isn't performed in a separate filter because of possible subword combinations.
+ *
+ * The combinations parameter affects how subwords are combined:
+ * - combinations="0" causes no subword combinations.
+ * - "PowerShot" -> 0:"Power", 1:"Shot" (0 and 1 are the token positions)
+ * - combinations="1" means that in addition to the subwords, maximum runs of non-numeric subwords are catenated and produced at the same position as the last subword in the run.
+ * - "PowerShot" -> 0:"Power", 1:"Shot", 1:"PowerShot"
+ * - "A's+B's&C's" -> 0:"A", 1:"B", 2:"C", 2:"ABC"
+ * - "Super-Duper-XL500-42-AutoCoder!" -> 0:"Super", 1:"Duper", 2:"XL", 2:"SuperDuperXL", 3:"500", 4:"42", 5:"Auto", 6:"Coder", 6:"AutoCoder"
+ *
+ * One use for WordDelimiterFilter is to help match words with different subword delimiters.
+ * For example, if the source text contained "wi-fi" one may want "wifi" "WiFi" "wi-fi" "wi+fi" queries to all match.
+ * One way of doing so is to specify combinations="1" in the analyzer used for indexing, and combinations="0" (the default)
+ * in the analyzer used for querying. Given that the current StandardTokenizer immediately removes many intra-word
+ * delimiters, it is recommended that this filter be used after a tokenizer that does not do this (such as WhitespaceTokenizer).
+ */
+// LUCENE MONITOR: Part of Lucene 4.0, once we upgrade, remove it
+public final class WordDelimiterFilter extends TokenFilter {
+
+    public static final int LOWER = 0x01;
+    public static final int UPPER = 0x02;
+    public static final int DIGIT = 0x04;
+    public static final int SUBWORD_DELIM = 0x08;
+
+    // combinations: for testing, not for setting bits
+    public static final int ALPHA = 0x03;
+    public static final int ALPHANUM = 0x07;
+
+    /**
+     * If true, causes parts of words to be generated:
+     * <p/>
+     * "PowerShot" => "Power" "Shot"
+     */
+    final boolean generateWordParts;
+
+    /**
+     * If true, causes number subwords to be generated:
+     * <p/>
+     * "500-42" => "500" "42"
+     */
+    final boolean generateNumberParts;
+
+    /**
+     * If true, causes maximum runs of word parts to be catenated:
+     * <p/>
+     * "wi-fi" => "wifi"
+     */
+    final boolean catenateWords;
+
+    /**
+     * If true, causes maximum runs of number parts to be catenated:
+     * <p/>
+     * "500-42" => "50042"
+     */
+    final boolean catenateNumbers;
+
+    /**
+     * If true, causes all subword parts to be catenated:
+     * <p/>
+     * "wi-fi-4000" => "wifi4000"
+     */
+    final boolean catenateAll;
+
+    /**
+     * If true, original words are preserved and added to the subword list (Defaults to false)
+     * <p/>
+     * "500-42" => "500" "42" "500-42"
+     */
+    final boolean preserveOriginal;
+
+    /**
+     * If not null, the set of tokens to protect from being delimited
+     */
+    final CharArraySet protWords;
+
+    private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+    private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
+    private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
+    private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
+
+    // used for iterating word delimiter breaks
+    private final WordDelimiterIterator iterator;
+
+    // used for concatenating runs of similar typed subwords (word,number)
+    private final WordDelimiterConcatenation concat = new WordDelimiterConcatenation();
+    // number of subwords last output by concat.
+    private int lastConcatCount = 0;
+
+    // used for catenate all
+    private final WordDelimiterConcatenation concatAll = new WordDelimiterConcatenation();
+
+    // used for accumulating position increment gaps
+    private int accumPosInc = 0;
+
+    private char savedBuffer[] = new char[1024];
+    private int savedStartOffset;
+    private int savedEndOffset;
+    private String savedType;
+    private boolean hasSavedState = false;
+    // if the length computed from the start + end offsets doesn't match the term text then assume
+    // this is a synonym and don't adjust the offsets.
+    private boolean hasIllegalOffsets = false;
+
+    // for a run of the same subword type within a word, have we output anything?
+    private boolean hasOutputToken = false;
+    // when preserve original is on, have we output any token following it?
+    // this token must have posInc=0!
+    private boolean hasOutputFollowingOriginal = false;
+
+    /**
+     * @param in                    Token stream to be filtered.
+     * @param charTypeTable         table containing character types
+     * @param generateWordParts     If 1, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
+     * @param generateNumberParts   If 1, causes number subwords to be generated: "500-42" => "500" "42"
+     * @param catenateWords         If 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
+     * @param catenateNumbers       If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
+     * @param catenateAll           If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
+     * @param splitOnCaseChange     If 1, causes "PowerShot" to be two tokens ("Power-Shot" remains two parts regardless)
+     * @param preserveOriginal      If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
+     * @param splitOnNumerics       If 1, causes "j2se" to be three tokens: "j" "2" "se"
+     * @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
+     * @param protWords             If not null, the set of tokens to protect from being delimited
+     */
+    public WordDelimiterFilter(TokenStream in,
+                               byte[] charTypeTable,
+                               int generateWordParts,
+                               int generateNumberParts,
+                               int catenateWords,
+                               int catenateNumbers,
+                               int catenateAll,
+                               int splitOnCaseChange,
+                               int preserveOriginal,
+                               int splitOnNumerics,
+                               int stemEnglishPossessive,
+                               CharArraySet protWords) {
+        super(in);
+        this.generateWordParts = generateWordParts != 0;
+        this.generateNumberParts = generateNumberParts != 0;
+        this.catenateWords = catenateWords != 0;
+        this.catenateNumbers = catenateNumbers != 0;
+        this.catenateAll = catenateAll != 0;
+        this.preserveOriginal = preserveOriginal != 0;
+        this.protWords = protWords;
+        this.iterator = new WordDelimiterIterator(charTypeTable, splitOnCaseChange != 0, splitOnNumerics != 0, stemEnglishPossessive != 0);
+    }
+
+    /**
+     * @param in                    Token stream to be filtered.
+     * @param generateWordParts     If 1, causes parts of words to be generated: "PowerShot", "Power-Shot" => "Power" "Shot"
+     * @param generateNumberParts   If 1, causes number subwords to be generated: "500-42" => "500" "42"
+     * @param catenateWords         If 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
+     * @param catenateNumbers       If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
+     * @param catenateAll           If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
+     * @param splitOnCaseChange     If 1, causes "PowerShot" to be two tokens ("Power-Shot" remains two parts regardless)
+     * @param preserveOriginal      If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
+     * @param splitOnNumerics       If 1, causes "j2se" to be three tokens: "j" "2" "se"
+     * @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
+     * @param protWords             If not null, the set of tokens to protect from being delimited
+     */
+    public WordDelimiterFilter(TokenStream in,
+                               int generateWordParts,
+                               int generateNumberParts,
+                               int catenateWords,
+                               int catenateNumbers,
+                               int catenateAll,
+                               int splitOnCaseChange,
+                               int preserveOriginal,
+                               int splitOnNumerics,
+                               int stemEnglishPossessive,
+                               CharArraySet protWords) {
+        this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, stemEnglishPossessive, protWords);
+    }
+
+    public boolean incrementToken() throws IOException {
+        while (true) {
+            if (!hasSavedState) {
+                // process a new input word
+                if (!input.incrementToken()) {
+                    return false;
+                }
+
+                int termLength = termAttribute.length();
+                char[] termBuffer = termAttribute.buffer();
+
+                accumPosInc += posIncAttribute.getPositionIncrement();
+
+                iterator.setText(termBuffer, termLength);
+                iterator.next();
+
+                // word with no delimiters, or a protected word: just return it
+                if ((iterator.current == 0 && iterator.end == termLength) ||
+                        (protWords != null && protWords.contains(termBuffer, 0, termLength))) {
+                    posIncAttribute.setPositionIncrement(accumPosInc);
+                    accumPosInc = 0;
+                    return true;
+                }
+
+                // word consisting only of delimiters
+                if (iterator.end == WordDelimiterIterator.DONE && !preserveOriginal) {
+                    // if the posInc is 1, simply ignore it in the accumulation
+                    if (posIncAttribute.getPositionIncrement() == 1) {
+                        accumPosInc--;
+                    }
+                    continue;
+                }
+
+                saveState();
+
+                hasOutputToken = false;
+                hasOutputFollowingOriginal = !preserveOriginal;
+                lastConcatCount = 0;
+
+                if (preserveOriginal) {
+                    posIncAttribute.setPositionIncrement(accumPosInc);
+                    accumPosInc = 0;
+                    return true;
+                }
+            }
+
+            // at the end of the string, output any concatenations
+            if (iterator.end == WordDelimiterIterator.DONE) {
+                if (!concat.isEmpty()) {
+                    if (flushConcatenation(concat)) {
+                        return true;
+                    }
+                }
+
+                if (!concatAll.isEmpty()) {
+                    // only if we haven't output this same combo above!
+                    if (concatAll.subwordCount > lastConcatCount) {
+                        concatAll.writeAndClear();
+                        return true;
+                    }
+                    concatAll.clear();
+                }
+
+                // no saved concatenations, on to the next input word
+                hasSavedState = false;
+                continue;
+            }
+
+            // word surrounded by delimiters: always output
+            if (iterator.isSingleWord()) {
+                generatePart(true);
+                iterator.next();
+                return true;
+            }
+
+            int wordType = iterator.type();
+
+            // do we already have queued up incompatible concatenations?
+            if (!concat.isEmpty() && (concat.type & wordType) == 0) {
+                if (flushConcatenation(concat)) {
+                    hasOutputToken = false;
+                    return true;
+                }
+                hasOutputToken = false;
+            }
+
+            // add subwords depending upon options
+            if (shouldConcatenate(wordType)) {
+                if (concat.isEmpty()) {
+                    concat.type = wordType;
+                }
+                concatenate(concat);
+            }
+
+            // add all subwords (catenateAll)
+            if (catenateAll) {
+                concatenate(concatAll);
+            }
+
+            // if we should output the word or number part
+            if (shouldGenerateParts(wordType)) {
+                generatePart(false);
+                iterator.next();
+                return true;
+            }
+
+            iterator.next();
+        }
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    @Override
+    public void reset() throws IOException {
+        super.reset();
+        hasSavedState = false;
+        concat.clear();
+        concatAll.clear();
+        accumPosInc = 0;
+    }
+
+    // ================================================= Helper Methods ================================================
+
+    /**
+     * Saves the existing attribute states
+     */
+    private void saveState() {
+        // otherwise, we have delimiters, save state
+        savedStartOffset = offsetAttribute.startOffset();
+        savedEndOffset = offsetAttribute.endOffset();
+        // if the length computed from the start + end offsets doesn't match the term text then assume this is a synonym and don't adjust the offsets.
+        hasIllegalOffsets = (savedEndOffset - savedStartOffset != termAttribute.length());
+        savedType = typeAttribute.type();
+
+        if (savedBuffer.length < termAttribute.length()) {
+            savedBuffer = new char[ArrayUtil.oversize(termAttribute.length(), RamUsageEstimator.NUM_BYTES_CHAR)];
+        }
+
+        System.arraycopy(termAttribute.buffer(), 0, savedBuffer, 0, termAttribute.length());
+        iterator.text = savedBuffer;
+
+        hasSavedState = true;
+    }
+
+    /**
+     * Flushes the given WordDelimiterConcatenation by either writing its contents and then clearing it, or just clearing it.
+     *
+     * @param concatenation WordDelimiterConcatenation that will be flushed
+     * @return {@code true} if the concatenation was written before it was cleared, {@code false} otherwise
+     */
+    private boolean flushConcatenation(WordDelimiterConcatenation concatenation) {
+        lastConcatCount = concatenation.subwordCount;
+        if (concatenation.subwordCount != 1 || !shouldGenerateParts(concatenation.type)) {
+            concatenation.writeAndClear();
+            return true;
+        }
+        concatenation.clear();
+        return false;
+    }
+
+    /**
+     * Determines whether to concatenate a word or number if the current word is of the given type
+     *
+     * @param wordType Type of the current word used to determine if it should be concatenated
+     * @return {@code true} if concatenation should occur, {@code false} otherwise
+     */
+    private boolean shouldConcatenate(int wordType) {
+        return (catenateWords && isAlpha(wordType)) || (catenateNumbers && isDigit(wordType));
+    }
+
+    /**
+     * Determines whether a word/number part should be generated for a word of the given type
+     *
+     * @param wordType Type of the word used to determine if a word/number part should be generated
+     * @return {@code true} if a word/number part should be generated, {@code false} otherwise
+     */
+    private boolean shouldGenerateParts(int wordType) {
+        return (generateWordParts && isAlpha(wordType)) || (generateNumberParts && isDigit(wordType));
+    }
+
+    /**
+     * Concatenates the saved buffer to the given WordDelimiterConcatenation
+     *
+     * @param concatenation WordDelimiterConcatenation to concatenate the buffer to
+     */
+    private void concatenate(WordDelimiterConcatenation concatenation) {
+        if (concatenation.isEmpty()) {
+            concatenation.startOffset = savedStartOffset + iterator.current;
+        }
+        concatenation.append(savedBuffer, iterator.current, iterator.end - iterator.current);
+        concatenation.endOffset = savedStartOffset + iterator.end;
+    }
+
+    /**
+     * Generates a word/number part, updating the appropriate attributes
+     *
+     * @param isSingleWord {@code true} if the generation is occurring from a single word, {@code false} otherwise
+     */
+    private void generatePart(boolean isSingleWord) {
+        clearAttributes();
+        termAttribute.copyBuffer(savedBuffer, iterator.current, iterator.end - iterator.current);
+
+        int startOffset = (isSingleWord || !hasIllegalOffsets) ? savedStartOffset + iterator.current : savedStartOffset;
+        int endOffset = (hasIllegalOffsets) ? savedEndOffset : savedStartOffset + iterator.end;
+
+        offsetAttribute.setOffset(startOffset, endOffset);
+        posIncAttribute.setPositionIncrement(position(false));
+        typeAttribute.setType(savedType);
+    }
+
+    /**
+     * Get the position increment gap for a subword or concatenation
+     *
+     * @param inject true if this token wants to be injected
+     * @return position increment gap
+     */
+    private int position(boolean inject) {
+        int posInc = accumPosInc;
+
+        if (hasOutputToken) {
+            accumPosInc = 0;
+            return inject ? 0 : Math.max(1, posInc);
+        }
+
+        hasOutputToken = true;
+
+        if (!hasOutputFollowingOriginal) {
+            // the first token following the original is 0 regardless
+            hasOutputFollowingOriginal = true;
+            return 0;
+        }
+        // clear the accumulated position increment
+        accumPosInc = 0;
+        return Math.max(1, posInc);
+    }
+
+    /**
+     * Checks if the given word type includes {@link #ALPHA}
+     *
+     * @param type Word type to check
+     * @return {@code true} if the type contains ALPHA, {@code false} otherwise
+     */
+    static boolean isAlpha(int type) {
+        return (type & ALPHA) != 0;
+    }
+
+    /**
+     * Checks if the given word type includes {@link #DIGIT}
+     *
+     * @param type Word type to check
+     * @return {@code true} if the type contains DIGIT, {@code false} otherwise
+     */
+    static boolean isDigit(int type) {
+        return (type & DIGIT) != 0;
+    }
+
+    /**
+     * Checks if the given word type includes {@link #SUBWORD_DELIM}
+     *
+     * @param type Word type to check
+     * @return {@code true} if the type contains SUBWORD_DELIM, {@code false} otherwise
+     */
+    static boolean isSubwordDelim(int type) {
+        return (type & SUBWORD_DELIM) != 0;
+    }
+
+    /**
+     * Checks if the given word type includes {@link #UPPER}
+     *
+     * @param type Word type to check
+     * @return {@code true} if the type contains UPPER, {@code false} otherwise
+     */
+    static boolean isUpper(int type) {
+        return (type & UPPER) != 0;
+    }
+
+    // ================================================= Inner Classes =================================================
+
+    /**
+     * A WDF concatenated 'run'
+     */
+    final class WordDelimiterConcatenation {
+        final StringBuilder buffer = new StringBuilder();
+        int startOffset;
+        int endOffset;
+        int type;
+        int subwordCount;
+
+        /**
+         * Appends the given text of the given length, to the concatenation at the given offset
+         *
+         * @param text   Text to append
+         * @param offset Offset in the concatenation to add the text
+         * @param length Length of the text to append
+         */
+        void append(char text[], int offset, int length) {
+            buffer.append(text, offset, length);
+            subwordCount++;
+        }
+
+        /**
+         * Writes the concatenation to the attributes
+         */
+        void write() {
+            clearAttributes();
+            if (termAttribute.length() < buffer.length()) {
+                termAttribute.resizeBuffer(buffer.length());
+            }
+            char termbuffer[] = termAttribute.buffer();
+
+            buffer.getChars(0, buffer.length(), termbuffer, 0);
+            termAttribute.setLength(buffer.length());
+
+            if (hasIllegalOffsets) {
+                offsetAttribute.setOffset(savedStartOffset, savedEndOffset);
+            } else {
+                offsetAttribute.setOffset(startOffset, endOffset);
+            }
+            posIncAttribute.setPositionIncrement(position(true));
+            typeAttribute.setType(savedType);
+            accumPosInc = 0;
+        }
+
+        /**
+         * Determines if the concatenation is empty
+         *
+         * @return {@code true} if the concatenation is empty, {@code false} otherwise
+         */
+        boolean isEmpty() {
+            return buffer.length() == 0;
+        }
+
+        /**
+         * Clears the concatenation and resets its state
+         */
+        void clear() {
+            buffer.setLength(0);
+            startOffset = endOffset = type = subwordCount = 0;
+        }
+
+        /**
+         * Convenience method for the common scenario of having to write the concatenation and then clear its state
+         */
+        void writeAndClear() {
+            write();
+            clear();
+        }
+    }
+    // questions:
+    // negative numbers? -42 indexed as just 42?
+    // dollar sign? $42
+    // percent sign? 33%
+    // downsides: if source text is "powershot" then a query of "PowerShot" won't match!
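+
+    // Illustrative usage (a sketch, not part of this patch's API): per the class javadoc, this
+    // filter should sit behind a tokenizer that keeps intra-word delimiters, e.g. for indexing:
+    //
+    //     TokenStream stream = new WordDelimiterFilter(
+    //             new WhitespaceTokenizer(Version.LUCENE_31, reader),   // 'reader' is assumed
+    //             1, 1, 1, 0, 0, 1, 0, 1, 1, null);
+    //
+    // i.e. generateWordParts=1, generateNumberParts=1, catenateWords=1, catenateNumbers=0,
+    // catenateAll=0, splitOnCaseChange=1, preserveOriginal=0, splitOnNumerics=1,
+    // stemEnglishPossessive=1, no protected words; "wi-fi" then yields "wi", "fi" and "wifi".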
+} diff --git a/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java b/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java new file mode 100644 index 00000000000..943ccfb6097 --- /dev/null +++ b/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java @@ -0,0 +1,341 @@ +/* + * Licensed to Elastic Search and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Elastic Search licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.lucene.analysis.miscellaneous; + +import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*; + +/** + * A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterFilter rules. + * + * @lucene.internal + */ +public final class WordDelimiterIterator { + + /** + * Indicates the end of iteration + */ + public static final int DONE = -1; + + public static final byte[] DEFAULT_WORD_DELIM_TABLE; + + char text[]; + int length; + + /** + * start position of text, excluding leading delimiters + */ + int startBounds; + /** + * end position of text, excluding trailing delimiters + */ + int endBounds; + + /** + * Beginning of subword + */ + int current; + /** + * End of subword + */ + int end; + + /* does this string end with a possessive such as 's */ + private boolean hasFinalPossessive = false; + + /** + * If false, causes case changes to be ignored (subwords will only be generated + * given SUBWORD_DELIM tokens). (Defaults to true) + */ + final boolean splitOnCaseChange; + + /** + * If false, causes numeric changes to be ignored (subwords will only be generated + * given SUBWORD_DELIM tokens). (Defaults to true) + */ + final boolean splitOnNumerics; + + /** + * If true, causes trailing "'s" to be removed for each subword. (Defaults to true) + *
+     * <p/>
+     * "O'Neil's" => "O", "Neil"
+     */
+    final boolean stemEnglishPossessive;
+
+    private final byte[] charTypeTable;
+
+    /**
+     * if true, need to skip over a possessive found in the last call to next()
+     */
+    private boolean skipPossessive = false;
+
+    // TODO: should there be a WORD_DELIM category for chars that only separate words (no catenation of subwords will be
+    // done if separated by these chars?) "," would be an obvious candidate...
+    static {
+        byte[] tab = new byte[256];
+        for (int i = 0; i < 256; i++) {
+            byte code = 0;
+            if (Character.isLowerCase(i)) {
+                code |= LOWER;
+            } else if (Character.isUpperCase(i)) {
+                code |= UPPER;
+            } else if (Character.isDigit(i)) {
+                code |= DIGIT;
+            }
+            if (code == 0) {
+                code = SUBWORD_DELIM;
+            }
+            tab[i] = code;
+        }
+        DEFAULT_WORD_DELIM_TABLE = tab;
+    }
+
+    /**
+     * Create a new WordDelimiterIterator operating with the supplied rules.
+     *
+     * @param charTypeTable         table containing character types
+     * @param splitOnCaseChange     if true, causes "PowerShot" to be two tokens ("Power-Shot" remains two parts regardless)
+     * @param splitOnNumerics       if true, causes "j2se" to be three tokens: "j" "2" "se"
+     * @param stemEnglishPossessive if true, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
+     */
+    WordDelimiterIterator(byte[] charTypeTable, boolean splitOnCaseChange, boolean splitOnNumerics, boolean stemEnglishPossessive) {
+        this.charTypeTable = charTypeTable;
+        this.splitOnCaseChange = splitOnCaseChange;
+        this.splitOnNumerics = splitOnNumerics;
+        this.stemEnglishPossessive = stemEnglishPossessive;
+    }
+
+    /**
+     * Advance to the next subword in the string.
+     *
+     * @return index of the next subword, or {@link #DONE} if all subwords have been returned
+     */
+    int next() {
+        current = end;
+        if (current == DONE) {
+            return DONE;
+        }
+
+        if (skipPossessive) {
+            current += 2;
+            skipPossessive = false;
+        }
+
+        int lastType = 0;
+
+        while (current < endBounds && (isSubwordDelim(lastType = charType(text[current])))) {
+            current++;
+        }
+
+        if (current >= endBounds) {
+            return end = DONE;
+        }
+
+        for (end = current + 1; end < endBounds; end++) {
+            int type = charType(text[end]);
+            if (isBreak(lastType, type)) {
+                break;
+            }
+            lastType = type;
+        }
+
+        if (end < endBounds - 1 && endsWithPossessive(end + 2)) {
+            skipPossessive = true;
+        }
+
+        return end;
+    }
+
+    /**
+     * Return the type of the current subword.
+     * This currently uses the type of the first character in the subword.
+ * + * @return type of the current word + */ + int type() { + if (end == DONE) { + return 0; + } + + int type = charType(text[current]); + switch (type) { + // return ALPHA word type for both lower and upper + case LOWER: + case UPPER: + return ALPHA; + default: + return type; + } + } + + /** + * Reset the text to a new value, and reset all state + * + * @param text New text + * @param length length of the text + */ + void setText(char text[], int length) { + this.text = text; + this.length = this.endBounds = length; + current = startBounds = end = 0; + skipPossessive = hasFinalPossessive = false; + setBounds(); + } + + // ================================================= Helper Methods ================================================ + + /** + * Determines whether the transition from lastType to type indicates a break + * + * @param lastType Last subword type + * @param type Current subword type + * @return {@code true} if the transition indicates a break, {@code false} otherwise + */ + private boolean isBreak(int lastType, int type) { + if ((type & lastType) != 0) { + return false; + } + + if (!splitOnCaseChange && isAlpha(lastType) && isAlpha(type)) { + // ALPHA->ALPHA: always ignore if case isn't considered. + return false; + } else if (isUpper(lastType) && isAlpha(type)) { + // UPPER->letter: Don't split + return false; + } else if (!splitOnNumerics && ((isAlpha(lastType) && isDigit(type)) || (isDigit(lastType) && isAlpha(type)))) { + // ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split + return false; + } + + return true; + } + + /** + * Determines if the current word contains only one subword. Note, it could be potentially surrounded by delimiters + * + * @return {@code true} if the current word contains only one subword, {@code false} otherwise + */ + boolean isSingleWord() { + if (hasFinalPossessive) { + return current == startBounds && end == endBounds - 2; + } else { + return current == startBounds && end == endBounds; + } + } + + /** + * Set the internal word bounds (remove leading and trailing delimiters). Note, if a possessive is found, don't remove + * it yet, simply note it. 
+ */ + private void setBounds() { + while (startBounds < length && (isSubwordDelim(charType(text[startBounds])))) { + startBounds++; + } + + while (endBounds > startBounds && (isSubwordDelim(charType(text[endBounds - 1])))) { + endBounds--; + } + if (endsWithPossessive(endBounds)) { + hasFinalPossessive = true; + } + current = startBounds; + } + + /** + * Determines if the text at the given position indicates an English possessive which should be removed + * + * @param pos Position in the text to check if it indicates an English possessive + * @return {@code true} if the text at the position indicates an English posessive, {@code false} otherwise + */ + private boolean endsWithPossessive(int pos) { + return (stemEnglishPossessive && + pos > 2 && + text[pos - 2] == '\'' && + (text[pos - 1] == 's' || text[pos - 1] == 'S') && + isAlpha(charType(text[pos - 3])) && + (pos == endBounds || isSubwordDelim(charType(text[pos])))); + } + + /** + * Determines the type of the given character + * + * @param ch Character whose type is to be determined + * @return Type of the character + */ + private int charType(int ch) { + if (ch < charTypeTable.length) { + return charTypeTable[ch]; + } + return getType(ch); + } + + /** + * Computes the type of the given character + * + * @param ch Character whose type is to be determined + * @return Type of the character + */ + public static byte getType(int ch) { + switch (Character.getType(ch)) { + case Character.UPPERCASE_LETTER: + return UPPER; + case Character.LOWERCASE_LETTER: + return LOWER; + + case Character.TITLECASE_LETTER: + case Character.MODIFIER_LETTER: + case Character.OTHER_LETTER: + case Character.NON_SPACING_MARK: + case Character.ENCLOSING_MARK: // depends what it encloses? + case Character.COMBINING_SPACING_MARK: + return ALPHA; + + case Character.DECIMAL_DIGIT_NUMBER: + case Character.LETTER_NUMBER: + case Character.OTHER_NUMBER: + return DIGIT; + + // case Character.SPACE_SEPARATOR: + // case Character.LINE_SEPARATOR: + // case Character.PARAGRAPH_SEPARATOR: + // case Character.CONTROL: + // case Character.FORMAT: + // case Character.PRIVATE_USE: + + case Character.SURROGATE: // prevent splitting + return ALPHA | DIGIT; + + // case Character.DASH_PUNCTUATION: + // case Character.START_PUNCTUATION: + // case Character.END_PUNCTUATION: + // case Character.CONNECTOR_PUNCTUATION: + // case Character.OTHER_PUNCTUATION: + // case Character.MATH_SYMBOL: + // case Character.CURRENCY_SYMBOL: + // case Character.MODIFIER_SYMBOL: + // case Character.OTHER_SYMBOL: + // case Character.INITIAL_QUOTE_PUNCTUATION: + // case Character.FINAL_QUOTE_PUNCTUATION: + + default: + return SUBWORD_DELIM; + } + } +} \ No newline at end of file diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/Analysis.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/Analysis.java index ac006c6aa6c..08ab255d8ac 100644 --- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/Analysis.java +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/Analysis.java @@ -48,14 +48,17 @@ import org.apache.lucene.analysis.sv.SwedishAnalyzer; import org.apache.lucene.analysis.tr.TurkishAnalyzer; import org.elasticsearch.ElasticSearchIllegalArgumentException; import org.elasticsearch.common.Strings; +import org.elasticsearch.common.base.Charsets; import org.elasticsearch.common.collect.ImmutableMap; import org.elasticsearch.common.collect.ImmutableSet; import org.elasticsearch.common.collect.Iterators; import 
org.elasticsearch.common.collect.MapBuilder; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; -import java.io.File; import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URL; import java.util.Arrays; import java.util.HashSet; import java.util.Set; @@ -148,27 +151,22 @@ public class Analysis { * @throws ElasticSearchIllegalArgumentException * If the word list cannot be found at either key. */ - public static Set getWordList(Settings settings, String settingPrefix) { + public static Set getWordList(Environment env, Settings settings, String settingPrefix) { String wordListPath = settings.get(settingPrefix + "_path", null); if (wordListPath == null) { String[] explicitWordList = settings.getAsArray(settingPrefix, null); if (explicitWordList == null) { - String message = String.format("%s or %s_path must be provided.", settingPrefix, settingPrefix); - throw new ElasticSearchIllegalArgumentException(message); + return null; } else { - return new HashSet(Arrays.asList(explicitWordList)); } } - File wordListFile = new File(wordListPath); - if (!wordListFile.exists()) { - throw new ElasticSearchIllegalArgumentException(settingPrefix + "_path file must exist."); - } + URL wordListFile = env.resolveConfig(wordListPath); try { - return WordlistLoader.getWordSet(wordListFile); + return WordlistLoader.getWordSet(new InputStreamReader(wordListFile.openStream(), Charsets.UTF_8), "#"); } catch (IOException ioe) { String message = String.format("IOException while reading %s_path: %s", settingPrefix, ioe.getMessage()); throw new ElasticSearchIllegalArgumentException(message); diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java index e36af0de311..dbb5f4a9e9d 100644 --- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java @@ -361,6 +361,7 @@ public class AnalysisModule extends AbstractModule { @Override public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) { tokenFiltersBindings.processTokenFilter("snowball", SnowballTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("stemmer", StemmerTokenFilterFactory.class); + tokenFiltersBindings.processTokenFilter("word_delimiter", WordDelimiterTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("arabic_stem", ArabicStemTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("brazilian_stem", BrazilianStemTokenFilterFactory.class); diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactory.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactory.java new file mode 100644 index 00000000000..e56bade53c0 --- /dev/null +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactory.java @@ -0,0 +1,196 @@ +/* + * Licensed to Elastic Search and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Elastic Search licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; +import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.lucene.Lucene; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory { + + private final byte[] charTypeTable; + private final boolean generateWordParts; + private final boolean generateNumberParts; + private final boolean catenateWords; + private final boolean catenateNumbers; + private final boolean catenateAll; + private final boolean splitOnCaseChange; + private final boolean preserveOriginal; + private final boolean splitOnNumerics; + private final boolean stemEnglishPossessive; + private final CharArraySet protoWords; + + @Inject public WordDelimiterTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + + // Sample Format for the type table: + // $ => DIGIT + // % => DIGIT + // . 
=> DIGIT
+        // \u002C => DIGIT
+        // \u200D => ALPHANUM
+        Set<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
+        if (charTypeTableValues == null) {
+            this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
+        } else {
+            this.charTypeTable = parseTypes(charTypeTableValues);
+        }
+
+        // If true, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
+        this.generateWordParts = settings.getAsBoolean("generate_word_parts", true);
+        // If true, causes number subwords to be generated: "500-42" => "500" "42"
+        this.generateNumberParts = settings.getAsBoolean("generate_number_parts", true);
+        // If true, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
+        this.catenateWords = settings.getAsBoolean("catenate_words", false);
+        // If true, causes maximum runs of number parts to be catenated: "500-42" => "50042"
+        this.catenateNumbers = settings.getAsBoolean("catenate_numbers", false);
+        // If true, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
+        this.catenateAll = settings.getAsBoolean("catenate_all", false);
+        // If true, causes "PowerShot" to be two tokens ("Power-Shot" remains two parts regardless)
+        this.splitOnCaseChange = settings.getAsBoolean("split_on_case_change", true);
+        // If true, includes original words in subwords: "500-42" => "500" "42" "500-42"
+        this.preserveOriginal = settings.getAsBoolean("preserve_original", false);
+        // If true, causes "j2se" to be three tokens: "j" "2" "se"
+        this.splitOnNumerics = settings.getAsBoolean("split_on_numerics", true);
+        // If true, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
+        this.stemEnglishPossessive = settings.getAsBoolean("stem_english_possessive", true);
+        // If not null, the set of tokens to protect from being delimited
+        Set<String> protectedWords = Analysis.getWordList(env, settings, "protected_words");
+        this.protoWords = protectedWords == null ? null : CharArraySet.copy(Lucene.VERSION, protectedWords);
+    }
+
+    @Override public TokenStream create(TokenStream tokenStream) {
+        return new WordDelimiterFilter(tokenStream,
+                charTypeTable,
+                generateWordParts ? 1 : 0,
+                generateNumberParts ? 1 : 0,
+                catenateWords ? 1 : 0,
+                catenateNumbers ? 1 : 0,
+                catenateAll ? 1 : 0,
+                splitOnCaseChange ? 1 : 0,
+                preserveOriginal ? 1 : 0,
+                splitOnNumerics ? 1 : 0,
+                stemEnglishPossessive ? 1 : 0,
+                protoWords);
+    }
+
+    // source => type
+    private static final Pattern typePattern = Pattern.compile("(.*)\\s*=>\\s*(.*)\\s*$");
+
+    /**
+     * Parses a list of MappingCharFilter-style rules into a custom byte[] type table
+     */
+    private byte[] parseTypes(Collection<String> rules) {
+        SortedMap<Character, Byte> typeMap = new TreeMap<Character, Byte>();
+        for (String rule : rules) {
+            Matcher m = typePattern.matcher(rule);
+            if (!m.find())
+                throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]");
+            String lhs = parseString(m.group(1).trim());
+            Byte rhs = parseType(m.group(2).trim());
+            if (lhs.length() != 1)
+                throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed.");
+            if (rhs == null)
+                throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. 
Illegal type."); + typeMap.put(lhs.charAt(0), rhs); + } + + // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance + byte types[] = new byte[Math.max(typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)]; + for (int i = 0; i < types.length; i++) + types[i] = WordDelimiterIterator.getType(i); + for (Map.Entry mapping : typeMap.entrySet()) + types[mapping.getKey()] = mapping.getValue(); + return types; + } + + private Byte parseType(String s) { + if (s.equals("LOWER")) + return WordDelimiterFilter.LOWER; + else if (s.equals("UPPER")) + return WordDelimiterFilter.UPPER; + else if (s.equals("ALPHA")) + return WordDelimiterFilter.ALPHA; + else if (s.equals("DIGIT")) + return WordDelimiterFilter.DIGIT; + else if (s.equals("ALPHANUM")) + return WordDelimiterFilter.ALPHANUM; + else if (s.equals("SUBWORD_DELIM")) + return WordDelimiterFilter.SUBWORD_DELIM; + else + return null; + } + + char[] out = new char[256]; + + private String parseString(String s) { + int readPos = 0; + int len = s.length(); + int writePos = 0; + while (readPos < len) { + char c = s.charAt(readPos++); + if (c == '\\') { + if (readPos >= len) + throw new RuntimeException("Invalid escaped char in [" + s + "]"); + c = s.charAt(readPos++); + switch (c) { + case '\\': + c = '\\'; + break; + case 'n': + c = '\n'; + break; + case 't': + c = '\t'; + break; + case 'r': + c = '\r'; + break; + case 'b': + c = '\b'; + break; + case 'f': + c = '\f'; + break; + case 'u': + if (readPos + 3 >= len) + throw new RuntimeException("Invalid escaped char in [" + s + "]"); + c = (char) Integer.parseInt(s.substring(readPos, readPos + 4), 16); + readPos += 4; + break; + } + } + out[writePos++] = c; + } + return new String(out, 0, writePos); + } +} \ No newline at end of file diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/compound/AbstractCompoundWordTokenFilterFactory.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/compound/AbstractCompoundWordTokenFilterFactory.java index d70c235cac3..fb3eb253e34 100644 --- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/compound/AbstractCompoundWordTokenFilterFactory.java +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/compound/AbstractCompoundWordTokenFilterFactory.java @@ -20,9 +20,11 @@ package org.elasticsearch.index.analysis.compound; import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase; +import org.elasticsearch.ElasticSearchIllegalArgumentException; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.assistedinject.Assisted; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; import org.elasticsearch.index.Index; import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; import org.elasticsearch.index.analysis.Analysis; @@ -44,13 +46,16 @@ public abstract class AbstractCompoundWordTokenFilterFactory extends AbstractTok protected final boolean onlyLongestMatch; protected final Set wordList; - @Inject public AbstractCompoundWordTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + @Inject public AbstractCompoundWordTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettings, name, settings); minWordSize = settings.getAsInt("min_word_size", 
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE); minSubwordSize = settings.getAsInt("min_subword_size", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE); maxSubwordSize = settings.getAsInt("max_subword_size", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE); onlyLongestMatch = settings.getAsBoolean("only_longest_max", false); - wordList = Analysis.getWordList(settings, "word_list"); + wordList = Analysis.getWordList(env, settings, "word_list"); + if (wordList == null) { + throw new ElasticSearchIllegalArgumentException("word_list must be provided for [" + name + "], either as a path to a file, or directly"); + } } } \ No newline at end of file diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java index af40c6f9b39..977d50d1bcb 100644 --- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java @@ -24,6 +24,7 @@ import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.assistedinject.Assisted; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; import org.elasticsearch.index.Index; import org.elasticsearch.index.analysis.AnalysisSettingsRequired; import org.elasticsearch.index.settings.IndexSettings; @@ -39,8 +40,8 @@ import org.elasticsearch.index.settings.IndexSettings; @AnalysisSettingsRequired public class DictionaryCompoundWordTokenFilterFactory extends AbstractCompoundWordTokenFilterFactory { - @Inject public DictionaryCompoundWordTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { - super(index, indexSettings, name, settings); + @Inject public DictionaryCompoundWordTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, env, name, settings); } @Override public TokenStream create(TokenStream tokenStream) { diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java index 012cfb3b66c..9dd599265dd 100644 --- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java @@ -26,11 +26,13 @@ import org.elasticsearch.ElasticSearchIllegalArgumentException; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.assistedinject.Assisted; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; import org.elasticsearch.index.Index; import org.elasticsearch.index.analysis.AnalysisSettingsRequired; import org.elasticsearch.index.settings.IndexSettings; +import org.xml.sax.InputSource; -import java.io.File; +import java.net.URL; /** * Uses the {@link 
org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter} to decompound tokens based on hyphenation rules. @@ -44,21 +46,18 @@ public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundW private final HyphenationTree hyphenationTree; - @Inject public HyphenationCompoundWordTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { - super(index, indexSettings, name, settings); + @Inject public HyphenationCompoundWordTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, env, name, settings); String hyphenationPatternsPath = settings.get("hyphenation_patterns_path", null); if (hyphenationPatternsPath == null) { throw new ElasticSearchIllegalArgumentException("hyphenation_patterns_path is a required setting."); } - File hyphenationPatternsFile = new File(hyphenationPatternsPath); - if (!hyphenationPatternsFile.exists()) { - throw new ElasticSearchIllegalArgumentException("hyphenation_patterns_path file must exist."); - } + URL hyphenationPatternsFile = env.resolveConfig(hyphenationPatternsPath); try { - hyphenationTree = HyphenationCompoundWordTokenFilter.getHyphenationTree(hyphenationPatternsFile); + hyphenationTree = HyphenationCompoundWordTokenFilter.getHyphenationTree(new InputSource(hyphenationPatternsFile.toExternalForm())); } catch (Exception e) { throw new ElasticSearchIllegalArgumentException("Exception while reading hyphenation_patterns_path: " + e.getMessage()); } diff --git a/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/AnalysisModuleTests.java b/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/AnalysisModuleTests.java index 0b34196b5f3..73895253dbb 100644 --- a/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/AnalysisModuleTests.java +++ b/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/AnalysisModuleTests.java @@ -24,7 +24,10 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.elasticsearch.common.inject.Injector; import org.elasticsearch.common.inject.ModulesBuilder; import org.elasticsearch.common.lucene.analysis.HTMLStripCharFilter; +import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.env.EnvironmentModule; import org.elasticsearch.index.Index; import org.elasticsearch.index.IndexNameModule; import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory; @@ -61,6 +64,7 @@ public class AnalysisModuleTests { private void testSimpleConfiguration(Settings settings) { Index index = new Index("test"); Injector injector = new ModulesBuilder().add( + new EnvironmentModule(new Environment(settings)), new IndexSettingsModule(index, settings), new IndexNameModule(index), new AnalysisModule(settings)).createInjector(); @@ -120,18 +124,19 @@ public class AnalysisModuleTests { assertThat(dictionaryDecompounderAnalyze.tokenFilters().length, equalTo(1)); assertThat(dictionaryDecompounderAnalyze.tokenFilters()[0], instanceOf(DictionaryCompoundWordTokenFilterFactory.class)); - Set wordList = Analysis.getWordList(settings, "index.analysis.filter.dict_dec.word_list"); + Set wordList = Analysis.getWordList(null, settings, "index.analysis.filter.dict_dec.word_list"); MatcherAssert.assertThat(wordList.size(), equalTo(6)); 
MatcherAssert.assertThat(wordList, hasItems("donau", "dampf", "schiff", "spargel", "creme", "suppe")); } @Test public void testWordListPath() throws Exception { + Environment env = new Environment(ImmutableSettings.Builder.EMPTY_SETTINGS); String[] words = new String[]{"donau", "dampf", "schiff", "spargel", "creme", "suppe"}; File wordListFile = generateWordList(words); Settings settings = settingsBuilder().loadFromSource("index: \n word_list_path: " + wordListFile.getAbsolutePath()).build(); - Set wordList = Analysis.getWordList(settings, "index.word_list"); + Set wordList = Analysis.getWordList(env, settings, "index.word_list"); MatcherAssert.assertThat(wordList.size(), equalTo(6)); MatcherAssert.assertThat(wordList, hasItems(words)); }
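
For reference, an index-settings fragment along these lines should exercise the new filter end to end
(illustrative only -- the "my_analyzer"/"my_delimiter" names are invented; the setting names come from
WordDelimiterTokenFilterFactory above, and the type_table entries follow its documented sample format):

    index :
        analysis :
            analyzer :
                my_analyzer :
                    tokenizer : whitespace
                    filter : [my_delimiter]
            filter :
                my_delimiter :
                    type : word_delimiter
                    generate_word_parts : true
                    catenate_words : true
                    preserve_original : true
                    type_table : ["$ => DIGIT", "% => DIGIT"]
                    protected_words : [wi-fi, j2se]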