From 563ad625c0f69f3ff0f4c39f46421b1dc2c91b6f Mon Sep 17 00:00:00 2001 From: kimchy Date: Mon, 9 May 2011 02:48:11 +0300 Subject: [PATCH] Analysis: Word Delimiter Token Filter, closes #918. --- .idea/dictionaries/kimchy.xml | 2 + .../miscellaneous/WordDelimiterFilter.java | 574 ++++++++++++++++++ .../miscellaneous/WordDelimiterIterator.java | 341 +++++++++++ .../index/analysis/Analysis.java | 18 +- .../index/analysis/AnalysisModule.java | 1 + .../WordDelimiterTokenFilterFactory.java | 196 ++++++ ...bstractCompoundWordTokenFilterFactory.java | 9 +- ...tionaryCompoundWordTokenFilterFactory.java | 5 +- ...enationCompoundWordTokenFilterFactory.java | 15 +- .../index/analysis/AnalysisModuleTests.java | 9 +- 10 files changed, 1146 insertions(+), 24 deletions(-) create mode 100644 modules/elasticsearch/src/main/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java create mode 100644 modules/elasticsearch/src/main/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java create mode 100644 modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactory.java diff --git a/.idea/dictionaries/kimchy.xml b/.idea/dictionaries/kimchy.xml index 09db370775e..c04da9e1c0e 100644 --- a/.idea/dictionaries/kimchy.xml +++ b/.idea/dictionaries/kimchy.xml @@ -23,6 +23,7 @@ calc camelcase canonicalhost + catenate charfilter charsets checksum @@ -107,6 +108,7 @@ ngram noop nospawn + numerics param params persistency diff --git a/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java b/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java new file mode 100644 index 00000000000..ef0e143479a --- /dev/null +++ b/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java @@ -0,0 +1,574 @@ +/* + * Licensed to Elastic Search and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Elastic Search licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.lucene.analysis.miscellaneous; + +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.RamUsageEstimator; + +import java.io.IOException; + +/** + * Splits words into subwords and performs optional transformations on subword groups. 
+ * Words are split into subwords with the following rules:
+ * - split on intra-word delimiters (by default, all non-alphanumeric characters).
+ * - "Wi-Fi" -> "Wi", "Fi"
+ * - split on case transitions
+ * - "PowerShot" -> "Power", "Shot"
+ * - split on letter-number transitions
+ * - "SD500" -> "SD", "500"
+ * - leading and trailing intra-word delimiters on each subword are ignored
+ * - "//hello---there, 'dude'" -> "hello", "there", "dude"
+ * - trailing "'s" are removed for each subword
+ * - "O'Neil's" -> "O", "Neil"
+ * - Note: this step isn't performed in a separate filter because of possible subword combinations.
+ *
+ * The combinations parameter affects how subwords are combined:
+ * - combinations="0" causes no subword combinations.
+ * - "PowerShot" -> 0:"Power", 1:"Shot" (0 and 1 are the token positions)
+ * - combinations="1" means that in addition to the subwords, maximum runs of non-numeric subwords are catenated and produced at the same position as the last subword in the run.
+ * - "PowerShot" -> 0:"Power", 1:"Shot", 1:"PowerShot"
+ * - "A's+B's&C's" -> 0:"A", 1:"B", 2:"C", 2:"ABC"
+ * - "Super-Duper-XL500-42-AutoCoder!" -> 0:"Super", 1:"Duper", 2:"XL", 2:"SuperDuperXL", 3:"500", 4:"42", 5:"Auto", 6:"Coder", 6:"AutoCoder"
+ *
+ * One use for WordDelimiterFilter is to help match words with different subword delimiters.
+ * For example, if the source text contained "wi-fi" one may want "wifi" "WiFi" "wi-fi" "wi+fi" queries to all match.
+ * One way of doing so is to specify combinations="1" in the analyzer used for indexing, and combinations="0" (the default)
+ * in the analyzer used for querying. Given that the current StandardTokenizer immediately removes many intra-word
+ * delimiters, it is recommended that this filter be used after a tokenizer that does not do this (such as WhitespaceTokenizer).
+ */
+// LUCENE MONITOR: Part of Lucene 4.0, once we upgrade, remove it
+public final class WordDelimiterFilter extends TokenFilter {
+
+    public static final int LOWER = 0x01;
+    public static final int UPPER = 0x02;
+    public static final int DIGIT = 0x04;
+    public static final int SUBWORD_DELIM = 0x08;
+
+    // combinations: for testing, not for setting bits
+    public static final int ALPHA = 0x03;
+    public static final int ALPHANUM = 0x07;
+
+    /**
+     * If true, causes parts of words to be generated:
+     * <p/>
+     * "PowerShot" => "Power" "Shot"
+     */
+    final boolean generateWordParts;
+
+    /**
+     * If true, causes number subwords to be generated:
+     * <p/>
+     * "500-42" => "500" "42"
+     */
+    final boolean generateNumberParts;
+
+    /**
+     * If true, causes maximum runs of word parts to be catenated:
+     * <p/>
+     * "wi-fi" => "wifi"
+     */
+    final boolean catenateWords;
+
+    /**
+     * If true, causes maximum runs of number parts to be catenated:
+     * <p/>
+     * "500-42" => "50042"
+     */
+    final boolean catenateNumbers;
+
+    /**
+     * If true, causes all subword parts to be catenated:
+     * <p/>
+     * "wi-fi-4000" => "wifi4000"
+     */
+    final boolean catenateAll;
+
+    /**
+     * If true, original words are preserved and added to the subword list (Defaults to false)
+     * <p/>
+     * "500-42" => "500" "42" "500-42"
+     */
+    final boolean preserveOriginal;
+
+    /**
+     * If not null, the set of tokens to protect from being delimited
+     */
+    final CharArraySet protWords;
+
+    private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+    private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
+    private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
+    private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
+
+    // used for iterating word delimiter breaks
+    private final WordDelimiterIterator iterator;
+
+    // used for concatenating runs of similar typed subwords (word,number)
+    private final WordDelimiterConcatenation concat = new WordDelimiterConcatenation();
+    // number of subwords last output by concat.
+    private int lastConcatCount = 0;
+
+    // used for catenate all
+    private final WordDelimiterConcatenation concatAll = new WordDelimiterConcatenation();
+
+    // used for accumulating position increment gaps
+    private int accumPosInc = 0;
+
+    private char savedBuffer[] = new char[1024];
+    private int savedStartOffset;
+    private int savedEndOffset;
+    private String savedType;
+    private boolean hasSavedState = false;
+    // if the length computed from the start + end offsets doesn't match the term text then assume
+    // this is a synonym and don't adjust the offsets.
+    private boolean hasIllegalOffsets = false;
+
+    // for a run of the same subword type within a word, have we output anything?
+    private boolean hasOutputToken = false;
+    // when preserve original is on, have we output any token following it?
+    // this token must have posInc=0!
+    private boolean hasOutputFollowingOriginal = false;
+
+    /**
+     * @param in                    Token stream to be filtered.
+     * @param charTypeTable         table containing character types
+     * @param generateWordParts     If 1, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
+     * @param generateNumberParts   If 1, causes number subwords to be generated: "500-42" => "500" "42"
+     * @param catenateWords         If 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
+     * @param catenateNumbers       If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
+     * @param catenateAll           If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
+     * @param splitOnCaseChange     If 1, causes "PowerShot" to be two tokens ("Power-Shot" remains two parts regardless)
+     * @param preserveOriginal      If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
+     * @param splitOnNumerics       If 1, causes "j2se" to be three tokens: "j" "2" "se"
+     * @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
+     * @param protWords             If not null, the set of tokens to protect from being delimited
+     */
+    public WordDelimiterFilter(TokenStream in,
+                               byte[] charTypeTable,
+                               int generateWordParts,
+                               int generateNumberParts,
+                               int catenateWords,
+                               int catenateNumbers,
+                               int catenateAll,
+                               int splitOnCaseChange,
+                               int preserveOriginal,
+                               int splitOnNumerics,
+                               int stemEnglishPossessive,
+                               CharArraySet protWords) {
+        super(in);
+        this.generateWordParts = generateWordParts != 0;
+        this.generateNumberParts = generateNumberParts != 0;
+        this.catenateWords = catenateWords != 0;
+        this.catenateNumbers = catenateNumbers != 0;
+        this.catenateAll = catenateAll != 0;
+        this.preserveOriginal = preserveOriginal != 0;
+        this.protWords = protWords;
+        this.iterator = new WordDelimiterIterator(charTypeTable, splitOnCaseChange != 0, splitOnNumerics != 0, stemEnglishPossessive != 0);
+    }
+
+    /**
+     * @param in                    Token stream to be filtered.
+     * @param generateWordParts     If 1, causes parts of words to be generated: "PowerShot", "Power-Shot" => "Power" "Shot"
+     * @param generateNumberParts   If 1, causes number subwords to be generated: "500-42" => "500" "42"
+     * @param catenateWords         If 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
+     * @param catenateNumbers       If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
+     * @param catenateAll           If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
+     * @param splitOnCaseChange     If 1, causes "PowerShot" to be two tokens ("Power-Shot" remains two parts regardless)
+     * @param preserveOriginal      If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
+     * @param splitOnNumerics       If 1, causes "j2se" to be three tokens: "j" "2" "se"
+     * @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
+     * @param protWords             If not null, the set of tokens to protect from being delimited
+     */
+    public WordDelimiterFilter(TokenStream in,
+                               int generateWordParts,
+                               int generateNumberParts,
+                               int catenateWords,
+                               int catenateNumbers,
+                               int catenateAll,
+                               int splitOnCaseChange,
+                               int preserveOriginal,
+                               int splitOnNumerics,
+                               int stemEnglishPossessive,
+                               CharArraySet protWords) {
+        this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, stemEnglishPossessive, protWords);
+    }
+
+    public boolean incrementToken() throws IOException {
+        while (true) {
+            if (!hasSavedState) {
+                // process a new input word
+                if (!input.incrementToken()) {
+                    return false;
+                }
+
+                int termLength = termAttribute.length();
+                char[] termBuffer = termAttribute.buffer();
+
+                accumPosInc += posIncAttribute.getPositionIncrement();
+
+                iterator.setText(termBuffer, termLength);
+                iterator.next();
+
+                // word with no delimiters, or a protected word: just return it
+                if ((iterator.current == 0 && iterator.end == termLength) ||
+                        (protWords != null && protWords.contains(termBuffer, 0, termLength))) {
+                    posIncAttribute.setPositionIncrement(accumPosInc);
+                    accumPosInc = 0;
+                    return true;
+                }
+
+                // word consisting only of delimiters
+                if (iterator.end == WordDelimiterIterator.DONE && !preserveOriginal) {
+                    // if the posInc is 1, simply ignore it in the accumulation
+                    if (posIncAttribute.getPositionIncrement() == 1) {
+                        accumPosInc--;
+                    }
+                    continue;
+                }
+
+                saveState();
+
+                hasOutputToken = false;
+                hasOutputFollowingOriginal = !preserveOriginal;
+                lastConcatCount = 0;
+
+                if (preserveOriginal) {
+                    posIncAttribute.setPositionIncrement(accumPosInc);
+                    accumPosInc = 0;
+                    return true;
+                }
+            }
+
+            // at the end of the string, output any concatenations
+            if (iterator.end == WordDelimiterIterator.DONE) {
+                if (!concat.isEmpty()) {
+                    if (flushConcatenation(concat)) {
+                        return true;
+                    }
+                }
+
+                if (!concatAll.isEmpty()) {
+                    // only if we haven't output this same combo above!
+                    if (concatAll.subwordCount > lastConcatCount) {
+                        concatAll.writeAndClear();
+                        return true;
+                    }
+                    concatAll.clear();
+                }
+
+                // no saved concatenations, on to the next input word
+                hasSavedState = false;
+                continue;
+            }
+
+            // word surrounded by delimiters: always output
+            if (iterator.isSingleWord()) {
+                generatePart(true);
+                iterator.next();
+                return true;
+            }
+
+            int wordType = iterator.type();
+
+            // do we already have queued up incompatible concatenations?
+            if (!concat.isEmpty() && (concat.type & wordType) == 0) {
+                if (flushConcatenation(concat)) {
+                    hasOutputToken = false;
+                    return true;
+                }
+                hasOutputToken = false;
+            }
+
+            // add subwords depending upon options
+            if (shouldConcatenate(wordType)) {
+                if (concat.isEmpty()) {
+                    concat.type = wordType;
+                }
+                concatenate(concat);
+            }
+
+            // add all subwords (catenateAll)
+            if (catenateAll) {
+                concatenate(concatAll);
+            }
+
+            // if we should output the word or number part
+            if (shouldGenerateParts(wordType)) {
+                generatePart(false);
+                iterator.next();
+                return true;
+            }
+
+            iterator.next();
+        }
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    @Override
+    public void reset() throws IOException {
+        super.reset();
+        hasSavedState = false;
+        concat.clear();
+        concatAll.clear();
+        accumPosInc = 0;
+    }
+
+    // ================================================= Helper Methods ================================================
+
+    /**
+     * Saves the existing attribute states
+     */
+    private void saveState() {
+        // otherwise, we have delimiters, save state
+        savedStartOffset = offsetAttribute.startOffset();
+        savedEndOffset = offsetAttribute.endOffset();
+        // if the length computed from the start + end offsets doesn't match the term text then assume this is a synonym and don't adjust the offsets.
+        hasIllegalOffsets = (savedEndOffset - savedStartOffset != termAttribute.length());
+        savedType = typeAttribute.type();
+
+        if (savedBuffer.length < termAttribute.length()) {
+            savedBuffer = new char[ArrayUtil.oversize(termAttribute.length(), RamUsageEstimator.NUM_BYTES_CHAR)];
+        }
+
+        System.arraycopy(termAttribute.buffer(), 0, savedBuffer, 0, termAttribute.length());
+        iterator.text = savedBuffer;
+
+        hasSavedState = true;
+    }
+
+    /**
+     * Flushes the given WordDelimiterConcatenation by either writing its contents and then clearing it, or just clearing it.
+     *
+     * @param concatenation WordDelimiterConcatenation that will be flushed
+     * @return {@code true} if the concatenation was written before it was cleared, {@code false} otherwise
+     */
+    private boolean flushConcatenation(WordDelimiterConcatenation concatenation) {
+        lastConcatCount = concatenation.subwordCount;
+        if (concatenation.subwordCount != 1 || !shouldGenerateParts(concatenation.type)) {
+            concatenation.writeAndClear();
+            return true;
+        }
+        concatenation.clear();
+        return false;
+    }
+
+    /**
+     * Determines whether to concatenate a word or number if the current word is of the given type
+     *
+     * @param wordType Type of the current word used to determine if it should be concatenated
+     * @return {@code true} if concatenation should occur, {@code false} otherwise
+     */
+    private boolean shouldConcatenate(int wordType) {
+        return (catenateWords && isAlpha(wordType)) || (catenateNumbers && isDigit(wordType));
+    }
+
+    /**
+     * Determines whether a word/number part should be generated for a word of the given type
+     *
+     * @param wordType Type of the word used to determine if a word/number part should be generated
+     * @return {@code true} if a word/number part should be generated, {@code false} otherwise
+     */
+    private boolean shouldGenerateParts(int wordType) {
+        return (generateWordParts && isAlpha(wordType)) || (generateNumberParts && isDigit(wordType));
+    }
+
+    /**
+     * Concatenates the saved buffer to the given WordDelimiterConcatenation
+     *
+     * @param concatenation WordDelimiterConcatenation to concatenate the buffer to
+     */
+    private void concatenate(WordDelimiterConcatenation concatenation) {
+        if (concatenation.isEmpty()) {
+            concatenation.startOffset = savedStartOffset + iterator.current;
+        }
+        concatenation.append(savedBuffer, iterator.current, iterator.end - iterator.current);
+        concatenation.endOffset = savedStartOffset + iterator.end;
+    }
+
+    /**
+     * Generates a word/number part, updating the appropriate attributes
+     *
+     * @param isSingleWord {@code true} if the generation is occurring from a single word, {@code false} otherwise
+     */
+    private void generatePart(boolean isSingleWord) {
+        clearAttributes();
+        termAttribute.copyBuffer(savedBuffer, iterator.current, iterator.end - iterator.current);
+
+        int startOffset = (isSingleWord || !hasIllegalOffsets) ? savedStartOffset + iterator.current : savedStartOffset;
+        int endOffset = (hasIllegalOffsets) ? savedEndOffset : savedStartOffset + iterator.end;
+
+        offsetAttribute.setOffset(startOffset, endOffset);
+        posIncAttribute.setPositionIncrement(position(false));
+        typeAttribute.setType(savedType);
+    }
+
+    /**
+     * Get the position increment gap for a subword or concatenation
+     *
+     * @param inject true if this token wants to be injected
+     * @return position increment gap
+     */
+    private int position(boolean inject) {
+        int posInc = accumPosInc;
+
+        if (hasOutputToken) {
+            accumPosInc = 0;
+            return inject ? 0 : Math.max(1, posInc);
+        }
+
+        hasOutputToken = true;
+
+        if (!hasOutputFollowingOriginal) {
+            // the first token following the original is 0 regardless
+            hasOutputFollowingOriginal = true;
+            return 0;
+        }
+        // clear the accumulated position increment
+        accumPosInc = 0;
+        return Math.max(1, posInc);
+    }
+
+    /**
+     * Checks if the given word type includes {@link #ALPHA}
+     *
+     * @param type Word type to check
+     * @return {@code true} if the type contains ALPHA, {@code false} otherwise
+     */
+    static boolean isAlpha(int type) {
+        return (type & ALPHA) != 0;
+    }
+
+    /**
+     * Checks if the given word type includes {@link #DIGIT}
+     *
+     * @param type Word type to check
+     * @return {@code true} if the type contains DIGIT, {@code false} otherwise
+     */
+    static boolean isDigit(int type) {
+        return (type & DIGIT) != 0;
+    }
+
+    /**
+     * Checks if the given word type includes {@link #SUBWORD_DELIM}
+     *
+     * @param type Word type to check
+     * @return {@code true} if the type contains SUBWORD_DELIM, {@code false} otherwise
+     */
+    static boolean isSubwordDelim(int type) {
+        return (type & SUBWORD_DELIM) != 0;
+    }
+
+    /**
+     * Checks if the given word type includes {@link #UPPER}
+     *
+     * @param type Word type to check
+     * @return {@code true} if the type contains UPPER, {@code false} otherwise
+     */
+    static boolean isUpper(int type) {
+        return (type & UPPER) != 0;
+    }
+
+    // ================================================= Inner Classes =================================================
+
+    /**
+     * A WDF concatenated 'run'
+     */
+    final class WordDelimiterConcatenation {
+        final StringBuilder buffer = new StringBuilder();
+        int startOffset;
+        int endOffset;
+        int type;
+        int subwordCount;
+
+        /**
+         * Appends the given text of the given length, to the concatenation at the given offset
+         *
+         * @param text   Text to append
+         * @param offset Offset in the concatenation to add the text
+         * @param length Length of the text to append
+         */
+        void append(char text[], int offset, int length) {
+            buffer.append(text, offset, length);
+            subwordCount++;
+        }
+
+        /**
+         * Writes the concatenation to the attributes
+         */
+        void write() {
+            clearAttributes();
+            if (termAttribute.length() < buffer.length()) {
+                termAttribute.resizeBuffer(buffer.length());
+            }
+            char termbuffer[] = termAttribute.buffer();
+
+            buffer.getChars(0, buffer.length(), termbuffer, 0);
+            termAttribute.setLength(buffer.length());
+
+            if (hasIllegalOffsets) {
+                offsetAttribute.setOffset(savedStartOffset, savedEndOffset);
+            } else {
+                offsetAttribute.setOffset(startOffset, endOffset);
+            }
+            posIncAttribute.setPositionIncrement(position(true));
+            typeAttribute.setType(savedType);
+            accumPosInc = 0;
+        }
+
+        /**
+         * Determines if the concatenation is empty
+         *
+         * @return {@code true} if the concatenation is empty, {@code false} otherwise
+         */
+        boolean isEmpty() {
+            return buffer.length() == 0;
+        }
+
+        /**
+         * Clears the concatenation and resets its state
+         */
+        void clear() {
+            buffer.setLength(0);
+            startOffset = endOffset = type = subwordCount = 0;
+        }
+
+        /**
+         * Convenience method for the common scenario of having to write the concatenation and then clear its state
+         */
+        void writeAndClear() {
+            write();
+            clear();
+        }
+    }
+    // questions:
+    // negative numbers? -42 indexed as just 42?
+    // dollar sign? $42
+    // percent sign? 33%
+    // downsides: if source text is "powershot" then a query of "PowerShot" won't match!
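+
+    // Illustrative usage (a sketch, not part of this patch's API): per the class javadoc, this
+    // filter should sit behind a tokenizer that keeps intra-word delimiters, e.g. for indexing:
+    //
+    //     TokenStream stream = new WordDelimiterFilter(
+    //             new WhitespaceTokenizer(Version.LUCENE_31, reader),   // 'reader' is assumed
+    //             1, 1, 1, 0, 0, 1, 0, 1, 1, null);
+    //
+    // i.e. generateWordParts=1, generateNumberParts=1, catenateWords=1, catenateNumbers=0,
+    // catenateAll=0, splitOnCaseChange=1, preserveOriginal=0, splitOnNumerics=1,
+    // stemEnglishPossessive=1, no protected words; "wi-fi" then yields "wi", "fi" and "wifi".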
+} diff --git a/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java b/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java new file mode 100644 index 00000000000..943ccfb6097 --- /dev/null +++ b/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java @@ -0,0 +1,341 @@ +/* + * Licensed to Elastic Search and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Elastic Search licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.lucene.analysis.miscellaneous; + +import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*; + +/** + * A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterFilter rules. + * + * @lucene.internal + */ +public final class WordDelimiterIterator { + + /** + * Indicates the end of iteration + */ + public static final int DONE = -1; + + public static final byte[] DEFAULT_WORD_DELIM_TABLE; + + char text[]; + int length; + + /** + * start position of text, excluding leading delimiters + */ + int startBounds; + /** + * end position of text, excluding trailing delimiters + */ + int endBounds; + + /** + * Beginning of subword + */ + int current; + /** + * End of subword + */ + int end; + + /* does this string end with a possessive such as 's */ + private boolean hasFinalPossessive = false; + + /** + * If false, causes case changes to be ignored (subwords will only be generated + * given SUBWORD_DELIM tokens). (Defaults to true) + */ + final boolean splitOnCaseChange; + + /** + * If false, causes numeric changes to be ignored (subwords will only be generated + * given SUBWORD_DELIM tokens). (Defaults to true) + */ + final boolean splitOnNumerics; + + /** + * If true, causes trailing "'s" to be removed for each subword. (Defaults to true) + *
+     * <p/>
+     * "O'Neil's" => "O", "Neil"
+     */
+    final boolean stemEnglishPossessive;
+
+    private final byte[] charTypeTable;
+
+    /**
+     * if true, need to skip over a possessive found in the last call to next()
+     */
+    private boolean skipPossessive = false;
+
+    // TODO: should there be a WORD_DELIM category for chars that only separate words (no catenation of subwords will be
+    // done if separated by these chars?) "," would be an obvious candidate...
+    static {
+        byte[] tab = new byte[256];
+        for (int i = 0; i < 256; i++) {
+            byte code = 0;
+            if (Character.isLowerCase(i)) {
+                code |= LOWER;
+            } else if (Character.isUpperCase(i)) {
+                code |= UPPER;
+            } else if (Character.isDigit(i)) {
+                code |= DIGIT;
+            }
+            if (code == 0) {
+                code = SUBWORD_DELIM;
+            }
+            tab[i] = code;
+        }
+        DEFAULT_WORD_DELIM_TABLE = tab;
+    }
+
+    /**
+     * Create a new WordDelimiterIterator operating with the supplied rules.
+     *
+     * @param charTypeTable         table containing character types
+     * @param splitOnCaseChange     if true, causes "PowerShot" to be two tokens ("Power-Shot" remains two parts regardless)
+     * @param splitOnNumerics       if true, causes "j2se" to be three tokens: "j" "2" "se"
+     * @param stemEnglishPossessive if true, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
+     */
+    WordDelimiterIterator(byte[] charTypeTable, boolean splitOnCaseChange, boolean splitOnNumerics, boolean stemEnglishPossessive) {
+        this.charTypeTable = charTypeTable;
+        this.splitOnCaseChange = splitOnCaseChange;
+        this.splitOnNumerics = splitOnNumerics;
+        this.stemEnglishPossessive = stemEnglishPossessive;
+    }
+
+    /**
+     * Advance to the next subword in the string.
+     *
+     * @return index of the next subword, or {@link #DONE} if all subwords have been returned
+     */
+    int next() {
+        current = end;
+        if (current == DONE) {
+            return DONE;
+        }
+
+        if (skipPossessive) {
+            current += 2;
+            skipPossessive = false;
+        }
+
+        int lastType = 0;
+
+        while (current < endBounds && (isSubwordDelim(lastType = charType(text[current])))) {
+            current++;
+        }
+
+        if (current >= endBounds) {
+            return end = DONE;
+        }
+
+        for (end = current + 1; end < endBounds; end++) {
+            int type = charType(text[end]);
+            if (isBreak(lastType, type)) {
+                break;
+            }
+            lastType = type;
+        }
+
+        if (end < endBounds - 1 && endsWithPossessive(end + 2)) {
+            skipPossessive = true;
+        }
+
+        return end;
+    }
+
+    /**
+     * Return the type of the current subword.
+     * This currently uses the type of the first character in the subword.
+ * + * @return type of the current word + */ + int type() { + if (end == DONE) { + return 0; + } + + int type = charType(text[current]); + switch (type) { + // return ALPHA word type for both lower and upper + case LOWER: + case UPPER: + return ALPHA; + default: + return type; + } + } + + /** + * Reset the text to a new value, and reset all state + * + * @param text New text + * @param length length of the text + */ + void setText(char text[], int length) { + this.text = text; + this.length = this.endBounds = length; + current = startBounds = end = 0; + skipPossessive = hasFinalPossessive = false; + setBounds(); + } + + // ================================================= Helper Methods ================================================ + + /** + * Determines whether the transition from lastType to type indicates a break + * + * @param lastType Last subword type + * @param type Current subword type + * @return {@code true} if the transition indicates a break, {@code false} otherwise + */ + private boolean isBreak(int lastType, int type) { + if ((type & lastType) != 0) { + return false; + } + + if (!splitOnCaseChange && isAlpha(lastType) && isAlpha(type)) { + // ALPHA->ALPHA: always ignore if case isn't considered. + return false; + } else if (isUpper(lastType) && isAlpha(type)) { + // UPPER->letter: Don't split + return false; + } else if (!splitOnNumerics && ((isAlpha(lastType) && isDigit(type)) || (isDigit(lastType) && isAlpha(type)))) { + // ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split + return false; + } + + return true; + } + + /** + * Determines if the current word contains only one subword. Note, it could be potentially surrounded by delimiters + * + * @return {@code true} if the current word contains only one subword, {@code false} otherwise + */ + boolean isSingleWord() { + if (hasFinalPossessive) { + return current == startBounds && end == endBounds - 2; + } else { + return current == startBounds && end == endBounds; + } + } + + /** + * Set the internal word bounds (remove leading and trailing delimiters). Note, if a possessive is found, don't remove + * it yet, simply note it. 
+ */ + private void setBounds() { + while (startBounds < length && (isSubwordDelim(charType(text[startBounds])))) { + startBounds++; + } + + while (endBounds > startBounds && (isSubwordDelim(charType(text[endBounds - 1])))) { + endBounds--; + } + if (endsWithPossessive(endBounds)) { + hasFinalPossessive = true; + } + current = startBounds; + } + + /** + * Determines if the text at the given position indicates an English possessive which should be removed + * + * @param pos Position in the text to check if it indicates an English possessive + * @return {@code true} if the text at the position indicates an English posessive, {@code false} otherwise + */ + private boolean endsWithPossessive(int pos) { + return (stemEnglishPossessive && + pos > 2 && + text[pos - 2] == '\'' && + (text[pos - 1] == 's' || text[pos - 1] == 'S') && + isAlpha(charType(text[pos - 3])) && + (pos == endBounds || isSubwordDelim(charType(text[pos])))); + } + + /** + * Determines the type of the given character + * + * @param ch Character whose type is to be determined + * @return Type of the character + */ + private int charType(int ch) { + if (ch < charTypeTable.length) { + return charTypeTable[ch]; + } + return getType(ch); + } + + /** + * Computes the type of the given character + * + * @param ch Character whose type is to be determined + * @return Type of the character + */ + public static byte getType(int ch) { + switch (Character.getType(ch)) { + case Character.UPPERCASE_LETTER: + return UPPER; + case Character.LOWERCASE_LETTER: + return LOWER; + + case Character.TITLECASE_LETTER: + case Character.MODIFIER_LETTER: + case Character.OTHER_LETTER: + case Character.NON_SPACING_MARK: + case Character.ENCLOSING_MARK: // depends what it encloses? + case Character.COMBINING_SPACING_MARK: + return ALPHA; + + case Character.DECIMAL_DIGIT_NUMBER: + case Character.LETTER_NUMBER: + case Character.OTHER_NUMBER: + return DIGIT; + + // case Character.SPACE_SEPARATOR: + // case Character.LINE_SEPARATOR: + // case Character.PARAGRAPH_SEPARATOR: + // case Character.CONTROL: + // case Character.FORMAT: + // case Character.PRIVATE_USE: + + case Character.SURROGATE: // prevent splitting + return ALPHA | DIGIT; + + // case Character.DASH_PUNCTUATION: + // case Character.START_PUNCTUATION: + // case Character.END_PUNCTUATION: + // case Character.CONNECTOR_PUNCTUATION: + // case Character.OTHER_PUNCTUATION: + // case Character.MATH_SYMBOL: + // case Character.CURRENCY_SYMBOL: + // case Character.MODIFIER_SYMBOL: + // case Character.OTHER_SYMBOL: + // case Character.INITIAL_QUOTE_PUNCTUATION: + // case Character.FINAL_QUOTE_PUNCTUATION: + + default: + return SUBWORD_DELIM; + } + } +} \ No newline at end of file diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/Analysis.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/Analysis.java index ac006c6aa6c..08ab255d8ac 100644 --- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/Analysis.java +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/Analysis.java @@ -48,14 +48,17 @@ import org.apache.lucene.analysis.sv.SwedishAnalyzer; import org.apache.lucene.analysis.tr.TurkishAnalyzer; import org.elasticsearch.ElasticSearchIllegalArgumentException; import org.elasticsearch.common.Strings; +import org.elasticsearch.common.base.Charsets; import org.elasticsearch.common.collect.ImmutableMap; import org.elasticsearch.common.collect.ImmutableSet; import org.elasticsearch.common.collect.Iterators; import 
org.elasticsearch.common.collect.MapBuilder; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; -import java.io.File; import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URL; import java.util.Arrays; import java.util.HashSet; import java.util.Set; @@ -148,27 +151,22 @@ public class Analysis { * @throws ElasticSearchIllegalArgumentException * If the word list cannot be found at either key. */ - public static Set getWordList(Settings settings, String settingPrefix) { + public static Set getWordList(Environment env, Settings settings, String settingPrefix) { String wordListPath = settings.get(settingPrefix + "_path", null); if (wordListPath == null) { String[] explicitWordList = settings.getAsArray(settingPrefix, null); if (explicitWordList == null) { - String message = String.format("%s or %s_path must be provided.", settingPrefix, settingPrefix); - throw new ElasticSearchIllegalArgumentException(message); + return null; } else { - return new HashSet(Arrays.asList(explicitWordList)); } } - File wordListFile = new File(wordListPath); - if (!wordListFile.exists()) { - throw new ElasticSearchIllegalArgumentException(settingPrefix + "_path file must exist."); - } + URL wordListFile = env.resolveConfig(wordListPath); try { - return WordlistLoader.getWordSet(wordListFile); + return WordlistLoader.getWordSet(new InputStreamReader(wordListFile.openStream(), Charsets.UTF_8), "#"); } catch (IOException ioe) { String message = String.format("IOException while reading %s_path: %s", settingPrefix, ioe.getMessage()); throw new ElasticSearchIllegalArgumentException(message); diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java index e36af0de311..dbb5f4a9e9d 100644 --- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java @@ -361,6 +361,7 @@ public class AnalysisModule extends AbstractModule { @Override public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) { tokenFiltersBindings.processTokenFilter("snowball", SnowballTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("stemmer", StemmerTokenFilterFactory.class); + tokenFiltersBindings.processTokenFilter("word_delimiter", WordDelimiterTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("arabic_stem", ArabicStemTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("brazilian_stem", BrazilianStemTokenFilterFactory.class); diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactory.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactory.java new file mode 100644 index 00000000000..e56bade53c0 --- /dev/null +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactory.java @@ -0,0 +1,196 @@ +/* + * Licensed to Elastic Search and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Elastic Search licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; +import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.lucene.Lucene; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory { + + private final byte[] charTypeTable; + private final boolean generateWordParts; + private final boolean generateNumberParts; + private final boolean catenateWords; + private final boolean catenateNumbers; + private final boolean catenateAll; + private final boolean splitOnCaseChange; + private final boolean preserveOriginal; + private final boolean splitOnNumerics; + private final boolean stemEnglishPossessive; + private final CharArraySet protoWords; + + @Inject public WordDelimiterTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + + // Sample Format for the type table: + // $ => DIGIT + // % => DIGIT + // . 
=> DIGIT
+        // \u002C => DIGIT
+        // \u200D => ALPHANUM
+        Set<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
+        if (charTypeTableValues == null) {
+            this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
+        } else {
+            this.charTypeTable = parseTypes(charTypeTableValues);
+        }
+
+        // If true, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
+        this.generateWordParts = settings.getAsBoolean("generate_word_parts", true);
+        // If true, causes number subwords to be generated: "500-42" => "500" "42"
+        this.generateNumberParts = settings.getAsBoolean("generate_number_parts", true);
+        // If true, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
+        this.catenateWords = settings.getAsBoolean("catenate_words", false);
+        // If true, causes maximum runs of number parts to be catenated: "500-42" => "50042"
+        this.catenateNumbers = settings.getAsBoolean("catenate_numbers", false);
+        // If true, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
+        this.catenateAll = settings.getAsBoolean("catenate_all", false);
+        // If true, causes "PowerShot" to be two tokens ("Power-Shot" remains two parts regardless)
+        this.splitOnCaseChange = settings.getAsBoolean("split_on_case_change", true);
+        // If true, includes original words in subwords: "500-42" => "500" "42" "500-42"
+        this.preserveOriginal = settings.getAsBoolean("preserve_original", false);
+        // If true, causes "j2se" to be three tokens: "j" "2" "se"
+        this.splitOnNumerics = settings.getAsBoolean("split_on_numerics", true);
+        // If true, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
+        this.stemEnglishPossessive = settings.getAsBoolean("stem_english_possessive", true);
+        // If not null, the set of tokens to protect from being delimited
+        Set<String> protectedWords = Analysis.getWordList(env, settings, "protected_words");
+        this.protoWords = protectedWords == null ? null : CharArraySet.copy(Lucene.VERSION, protectedWords);
+    }
+
+    @Override public TokenStream create(TokenStream tokenStream) {
+        return new WordDelimiterFilter(tokenStream,
+                charTypeTable,
+                generateWordParts ? 1 : 0,
+                generateNumberParts ? 1 : 0,
+                catenateWords ? 1 : 0,
+                catenateNumbers ? 1 : 0,
+                catenateAll ? 1 : 0,
+                splitOnCaseChange ? 1 : 0,
+                preserveOriginal ? 1 : 0,
+                splitOnNumerics ? 1 : 0,
+                stemEnglishPossessive ? 1 : 0,
+                protoWords);
+    }
+
+    // source => type
+    private static final Pattern typePattern = Pattern.compile("(.*)\\s*=>\\s*(.*)\\s*$");
+
+    /**
+     * Parses a list of MappingCharFilter-style rules into a custom byte[] type table
+     */
+    private byte[] parseTypes(Collection<String> rules) {
+        SortedMap<Character, Byte> typeMap = new TreeMap<Character, Byte>();
+        for (String rule : rules) {
+            Matcher m = typePattern.matcher(rule);
+            if (!m.find())
+                throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]");
+            String lhs = parseString(m.group(1).trim());
+            Byte rhs = parseType(m.group(2).trim());
+            if (lhs.length() != 1)
+                throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed.");
+            if (rhs == null)
+                throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. 
Illegal type."); + typeMap.put(lhs.charAt(0), rhs); + } + + // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance + byte types[] = new byte[Math.max(typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)]; + for (int i = 0; i < types.length; i++) + types[i] = WordDelimiterIterator.getType(i); + for (Map.Entry mapping : typeMap.entrySet()) + types[mapping.getKey()] = mapping.getValue(); + return types; + } + + private Byte parseType(String s) { + if (s.equals("LOWER")) + return WordDelimiterFilter.LOWER; + else if (s.equals("UPPER")) + return WordDelimiterFilter.UPPER; + else if (s.equals("ALPHA")) + return WordDelimiterFilter.ALPHA; + else if (s.equals("DIGIT")) + return WordDelimiterFilter.DIGIT; + else if (s.equals("ALPHANUM")) + return WordDelimiterFilter.ALPHANUM; + else if (s.equals("SUBWORD_DELIM")) + return WordDelimiterFilter.SUBWORD_DELIM; + else + return null; + } + + char[] out = new char[256]; + + private String parseString(String s) { + int readPos = 0; + int len = s.length(); + int writePos = 0; + while (readPos < len) { + char c = s.charAt(readPos++); + if (c == '\\') { + if (readPos >= len) + throw new RuntimeException("Invalid escaped char in [" + s + "]"); + c = s.charAt(readPos++); + switch (c) { + case '\\': + c = '\\'; + break; + case 'n': + c = '\n'; + break; + case 't': + c = '\t'; + break; + case 'r': + c = '\r'; + break; + case 'b': + c = '\b'; + break; + case 'f': + c = '\f'; + break; + case 'u': + if (readPos + 3 >= len) + throw new RuntimeException("Invalid escaped char in [" + s + "]"); + c = (char) Integer.parseInt(s.substring(readPos, readPos + 4), 16); + readPos += 4; + break; + } + } + out[writePos++] = c; + } + return new String(out, 0, writePos); + } +} \ No newline at end of file diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/compound/AbstractCompoundWordTokenFilterFactory.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/compound/AbstractCompoundWordTokenFilterFactory.java index d70c235cac3..fb3eb253e34 100644 --- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/compound/AbstractCompoundWordTokenFilterFactory.java +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/compound/AbstractCompoundWordTokenFilterFactory.java @@ -20,9 +20,11 @@ package org.elasticsearch.index.analysis.compound; import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase; +import org.elasticsearch.ElasticSearchIllegalArgumentException; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.assistedinject.Assisted; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; import org.elasticsearch.index.Index; import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; import org.elasticsearch.index.analysis.Analysis; @@ -44,13 +46,16 @@ public abstract class AbstractCompoundWordTokenFilterFactory extends AbstractTok protected final boolean onlyLongestMatch; protected final Set wordList; - @Inject public AbstractCompoundWordTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + @Inject public AbstractCompoundWordTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettings, name, settings); minWordSize = settings.getAsInt("min_word_size", 
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE); minSubwordSize = settings.getAsInt("min_subword_size", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE); maxSubwordSize = settings.getAsInt("max_subword_size", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE); onlyLongestMatch = settings.getAsBoolean("only_longest_max", false); - wordList = Analysis.getWordList(settings, "word_list"); + wordList = Analysis.getWordList(env, settings, "word_list"); + if (wordList == null) { + throw new ElasticSearchIllegalArgumentException("word_list must be provided for [" + name + "], either as a path to a file, or directly"); + } } } \ No newline at end of file diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java index af40c6f9b39..977d50d1bcb 100644 --- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java @@ -24,6 +24,7 @@ import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.assistedinject.Assisted; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; import org.elasticsearch.index.Index; import org.elasticsearch.index.analysis.AnalysisSettingsRequired; import org.elasticsearch.index.settings.IndexSettings; @@ -39,8 +40,8 @@ import org.elasticsearch.index.settings.IndexSettings; @AnalysisSettingsRequired public class DictionaryCompoundWordTokenFilterFactory extends AbstractCompoundWordTokenFilterFactory { - @Inject public DictionaryCompoundWordTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { - super(index, indexSettings, name, settings); + @Inject public DictionaryCompoundWordTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, env, name, settings); } @Override public TokenStream create(TokenStream tokenStream) { diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java index 012cfb3b66c..9dd599265dd 100644 --- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java @@ -26,11 +26,13 @@ import org.elasticsearch.ElasticSearchIllegalArgumentException; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.assistedinject.Assisted; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; import org.elasticsearch.index.Index; import org.elasticsearch.index.analysis.AnalysisSettingsRequired; import org.elasticsearch.index.settings.IndexSettings; +import org.xml.sax.InputSource; -import java.io.File; +import java.net.URL; /** * Uses the {@link 
org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter} to decompound tokens based on hyphenation rules. @@ -44,21 +46,18 @@ public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundW private final HyphenationTree hyphenationTree; - @Inject public HyphenationCompoundWordTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { - super(index, indexSettings, name, settings); + @Inject public HyphenationCompoundWordTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, env, name, settings); String hyphenationPatternsPath = settings.get("hyphenation_patterns_path", null); if (hyphenationPatternsPath == null) { throw new ElasticSearchIllegalArgumentException("hyphenation_patterns_path is a required setting."); } - File hyphenationPatternsFile = new File(hyphenationPatternsPath); - if (!hyphenationPatternsFile.exists()) { - throw new ElasticSearchIllegalArgumentException("hyphenation_patterns_path file must exist."); - } + URL hyphenationPatternsFile = env.resolveConfig(hyphenationPatternsPath); try { - hyphenationTree = HyphenationCompoundWordTokenFilter.getHyphenationTree(hyphenationPatternsFile); + hyphenationTree = HyphenationCompoundWordTokenFilter.getHyphenationTree(new InputSource(hyphenationPatternsFile.toExternalForm())); } catch (Exception e) { throw new ElasticSearchIllegalArgumentException("Exception while reading hyphenation_patterns_path: " + e.getMessage()); } diff --git a/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/AnalysisModuleTests.java b/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/AnalysisModuleTests.java index 0b34196b5f3..73895253dbb 100644 --- a/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/AnalysisModuleTests.java +++ b/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/AnalysisModuleTests.java @@ -24,7 +24,10 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.elasticsearch.common.inject.Injector; import org.elasticsearch.common.inject.ModulesBuilder; import org.elasticsearch.common.lucene.analysis.HTMLStripCharFilter; +import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.env.EnvironmentModule; import org.elasticsearch.index.Index; import org.elasticsearch.index.IndexNameModule; import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory; @@ -61,6 +64,7 @@ public class AnalysisModuleTests { private void testSimpleConfiguration(Settings settings) { Index index = new Index("test"); Injector injector = new ModulesBuilder().add( + new EnvironmentModule(new Environment(settings)), new IndexSettingsModule(index, settings), new IndexNameModule(index), new AnalysisModule(settings)).createInjector(); @@ -120,18 +124,19 @@ public class AnalysisModuleTests { assertThat(dictionaryDecompounderAnalyze.tokenFilters().length, equalTo(1)); assertThat(dictionaryDecompounderAnalyze.tokenFilters()[0], instanceOf(DictionaryCompoundWordTokenFilterFactory.class)); - Set wordList = Analysis.getWordList(settings, "index.analysis.filter.dict_dec.word_list"); + Set wordList = Analysis.getWordList(null, settings, "index.analysis.filter.dict_dec.word_list"); MatcherAssert.assertThat(wordList.size(), equalTo(6)); 
MatcherAssert.assertThat(wordList, hasItems("donau", "dampf", "schiff", "spargel", "creme", "suppe")); } @Test public void testWordListPath() throws Exception { + Environment env = new Environment(ImmutableSettings.Builder.EMPTY_SETTINGS); String[] words = new String[]{"donau", "dampf", "schiff", "spargel", "creme", "suppe"}; File wordListFile = generateWordList(words); Settings settings = settingsBuilder().loadFromSource("index: \n word_list_path: " + wordListFile.getAbsolutePath()).build(); - Set wordList = Analysis.getWordList(settings, "index.word_list"); + Set wordList = Analysis.getWordList(env, settings, "index.word_list"); MatcherAssert.assertThat(wordList.size(), equalTo(6)); MatcherAssert.assertThat(wordList, hasItems(words)); }
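
For reference, an index-settings fragment along these lines should exercise the new filter end to end
(illustrative only -- the "my_analyzer"/"my_delimiter" names are invented; the setting names come from
WordDelimiterTokenFilterFactory above, and the type_table entries follow its documented sample format):

    index :
        analysis :
            analyzer :
                my_analyzer :
                    tokenizer : whitespace
                    filter : [my_delimiter]
            filter :
                my_delimiter :
                    type : word_delimiter
                    generate_word_parts : true
                    catenate_words : true
                    preserve_original : true
                    type_table : ["$ => DIGIT", "% => DIGIT"]
                    protected_words : [wi-fi, j2se]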