Analysis: Word Delimiter Token Filter, closes #918.

This commit is contained in:
kimchy 2011-05-09 02:48:11 +03:00
parent 465036655f
commit 563ad625c0
10 changed files with 1146 additions and 24 deletions

View File

@ -23,6 +23,7 @@
@ -107,6 +108,7 @@

View File

@ -0,0 +1,574 @@
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;
* Splits words into subwords and performs optional transformations on subword groups.
* Words are split into subwords with the following rules:
* - split on intra-word delimiters (by default, all non alpha-numeric characters).
* - "Wi-Fi" -> "Wi", "Fi"
* - split on case transitions
* - "PowerShot" -> "Power", "Shot"
* - split on letter-number transitions
* - "SD500" -> "SD", "500"
* - leading and trailing intra-word delimiters on each subword are ignored
* - "//hello---there, 'dude'" -> "hello", "there", "dude"
* - trailing "'s" are removed for each subword
* - "O'Neil's" -> "O", "Neil"
* - Note: this step isn't performed in a separate filter because of possible subword combinations.
* The <b>combinations</b> parameter affects how subwords are combined:
* - combinations="0" causes no subword combinations.
* - "PowerShot" -> 0:"Power", 1:"Shot" (0 and 1 are the token positions)
* - combinations="1" means that in addition to the subwords, maximum runs of non-numeric subwords are catenated and produced at the same position of the last subword in the run.
* - "PowerShot" -> 0:"Power", 1:"Shot" 1:"PowerShot"
* - "A's+B's&C's" -> 0:"A", 1:"B", 2:"C", 2:"ABC"
* - "Super-Duper-XL500-42-AutoCoder!" -> 0:"Super", 1:"Duper", 2:"XL", 2:"SuperDuperXL", 3:"500" 4:"42", 5:"Auto", 6:"Coder", 6:"AutoCoder"
* One use for WordDelimiterFilter is to help match words with different subword delimiters.
* For example, if the source text contained "wi-fi" one may want "wifi" "WiFi" "wi-fi" "wi+fi" queries to all match.
* One way of doing so is to specify combinations="1" in the analyzer used for indexing, and combinations="0" (the default)
* in the analyzer used for querying. Given that the current StandardTokenizer immediately removes many intra-word
* delimiters, it is recommended that this filter be used after a tokenizer that does not do this (such as WhitespaceTokenizer).
// LUCENE MONITOR: Part of Lucene 4.0, once we upgrade, remove it
public final class WordDelimiterFilter extends TokenFilter {
public static final int LOWER = 0x01;
public static final int UPPER = 0x02;
public static final int DIGIT = 0x04;
public static final int SUBWORD_DELIM = 0x08;
// combinations: for testing, not for setting bits
public static final int ALPHA = 0x03;
public static final int ALPHANUM = 0x07;
* If true, causes parts of words to be generated:
* <p/>
* "PowerShot" => "Power" "Shot"
final boolean generateWordParts;
* If true, causes number subwords to be generated:
* <p/>
* "500-42" => "500" "42"
final boolean generateNumberParts;
* If true, causes maximum runs of word parts to be catenated:
* <p/>
* "wi-fi" => "wifi"
final boolean catenateWords;
* If true, causes maximum runs of number parts to be catenated:
* <p/>
* "500-42" => "50042"
final boolean catenateNumbers;
* If true, causes all subword parts to be catenated:
* <p/>
* "wi-fi-4000" => "wifi4000"
final boolean catenateAll;
* If true, original words are preserved and added to the subword list (Defaults to false)
* <p/>
* "500-42" => "500" "42" "500-42"
final boolean preserveOriginal;
* If not null is the set of tokens to protect from being delimited
final CharArraySet protWords;
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
// used for iterating word delimiter breaks
private final WordDelimiterIterator iterator;
// used for concatenating runs of similar typed subwords (word,number)
private final WordDelimiterConcatenation concat = new WordDelimiterConcatenation();
// number of subwords last output by concat.
private int lastConcatCount = 0;
// used for catenate all
private final WordDelimiterConcatenation concatAll = new WordDelimiterConcatenation();
// used for accumulating position increment gaps
private int accumPosInc = 0;
private char savedBuffer[] = new char[1024];
private int savedStartOffset;
private int savedEndOffset;
private String savedType;
private boolean hasSavedState = false;
// if length by start + end offsets doesn't match the term text then assume
// this is a synonym and don't adjust the offsets.
private boolean hasIllegalOffsets = false;
// for a run of the same subword type within a word, have we output anything?
private boolean hasOutputToken = false;
// when preserve original is on, have we output any token following it?
// this token must have posInc=0!
private boolean hasOutputFollowingOriginal = false;
* @param in Token stream to be filtered.
* @param charTypeTable Table of character types, passed through to the WordDelimiterIterator
* @param generateWordParts If 1, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
* @param generateNumberParts If 1, causes number subwords to be generated: "500-42" => "500" "42"
* @param catenateWords If 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
* @param catenateNumbers If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
* @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
* @param splitOnCaseChange If 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
* @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
* @param splitOnNumerics If 1, causes "j2se" to be three tokens; "j" "2" "se"
* @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
* @param protWords If not null is the set of tokens to protect from being delimited
public WordDelimiterFilter(TokenStream in,
byte[] charTypeTable,
int generateWordParts,
int generateNumberParts,
int catenateWords,
int catenateNumbers,
int catenateAll,
int splitOnCaseChange,
int preserveOriginal,
int splitOnNumerics,
int stemEnglishPossessive,
CharArraySet protWords) {
this.generateWordParts = generateWordParts != 0;
this.generateNumberParts = generateNumberParts != 0;
this.catenateWords = catenateWords != 0;
this.catenateNumbers = catenateNumbers != 0;
this.catenateAll = catenateAll != 0;
this.preserveOriginal = preserveOriginal != 0;
this.protWords = protWords;
this.iterator = new WordDelimiterIterator(charTypeTable, splitOnCaseChange != 0, splitOnNumerics != 0, stemEnglishPossessive != 0);
/**
 * Creates a filter that uses {@link WordDelimiterIterator#DEFAULT_WORD_DELIM_TABLE} as its
 * character type table and hands every other argument to the full constructor unchanged.
 *
 * @param in                    Token stream to be filtered.
 * @param generateWordParts     If 1, causes parts of words to be generated: "PowerShot", "Power-Shot" => "Power" "Shot"
 * @param generateNumberParts   If 1, causes number subwords to be generated: "500-42" => "500" "42"
 * @param catenateWords         If 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
 * @param catenateNumbers       If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
 * @param catenateAll           If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
 * @param splitOnCaseChange     If 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
 * @param preserveOriginal      If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
 * @param splitOnNumerics       If 1, causes "j2se" to be three tokens; "j" "2" "se"
 * @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
 * @param protWords             If not null is the set of tokens to protect from being delimited
 */
public WordDelimiterFilter(TokenStream in,
                           int generateWordParts,
                           int generateNumberParts,
                           int catenateWords,
                           int catenateNumbers,
                           int catenateAll,
                           int splitOnCaseChange,
                           int preserveOriginal,
                           int splitOnNumerics,
                           int stemEnglishPossessive,
                           CharArraySet protWords) {
    this(in,
            WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
            generateWordParts,
            generateNumberParts,
            catenateWords,
            catenateNumbers,
            catenateAll,
            splitOnCaseChange,
            preserveOriginal,
            splitOnNumerics,
            stemEnglishPossessive,
            protWords);
}
public boolean incrementToken() throws IOException {
while (true) {
if (!hasSavedState) {
// process a new input word
if (!input.incrementToken()) {
return false;
int termLength = termAttribute.length();
char[] termBuffer = termAttribute.buffer();
accumPosInc += posIncAttribute.getPositionIncrement();
iterator.setText(termBuffer, termLength);;
// word of no delimiters, or protected word: just return it
if ((iterator.current == 0 && iterator.end == termLength) ||
(protWords != null && protWords.contains(termBuffer, 0, termLength))) {
accumPosInc = 0;
return true;
// word of simply delimiters
if (iterator.end == WordDelimiterIterator.DONE && !preserveOriginal) {
// if the posInc is 1, simply ignore it in the accumulation
if (posIncAttribute.getPositionIncrement() == 1) {
hasOutputToken = false;
hasOutputFollowingOriginal = !preserveOriginal;
lastConcatCount = 0;
if (preserveOriginal) {
accumPosInc = 0;
return true;
// at the end of the string, output any concatenations
if (iterator.end == WordDelimiterIterator.DONE) {
if (!concat.isEmpty()) {
if (flushConcatenation(concat)) {
return true;
if (!concatAll.isEmpty()) {
// only if we haven't output this same combo above!
if (concatAll.subwordCount > lastConcatCount) {
return true;
// no saved concatenations, on to the next input word
hasSavedState = false;
// word surrounded by delimiters: always output
if (iterator.isSingleWord()) {
return true;
int wordType = iterator.type();
// do we already have queued up incompatible concatenations?
if (!concat.isEmpty() && (concat.type & wordType) == 0) {
if (flushConcatenation(concat)) {
hasOutputToken = false;
return true;
hasOutputToken = false;
// add subwords depending upon options
if (shouldConcatenate(wordType)) {
if (concat.isEmpty()) {
concat.type = wordType;
// add all subwords (catenateAll)
if (catenateAll) {
// if we should output the word or number part
if (shouldGenerateParts(wordType)) {
return true;
* {@inheritDoc}
public void reset() throws IOException {
hasSavedState = false;
accumPosInc = 0;
// ================================================= Helper Methods ================================================
* Saves the existing attribute states
private void saveState() {
// otherwise, we have delimiters, save state
savedStartOffset = offsetAttribute.startOffset();
savedEndOffset = offsetAttribute.endOffset();
// if length by start + end offsets doesn't match the term text then assume this is a synonym and don't adjust the offsets.
hasIllegalOffsets = (savedEndOffset - savedStartOffset != termAttribute.length());
savedType = typeAttribute.type();
if (savedBuffer.length < termAttribute.length()) {
savedBuffer = new char[ArrayUtil.oversize(termAttribute.length(), RamUsageEstimator.NUM_BYTES_CHAR)];
System.arraycopy(termAttribute.buffer(), 0, savedBuffer, 0, termAttribute.length());
iterator.text = savedBuffer;
hasSavedState = true;
* Flushes the given WordDelimiterConcatenation by either writing its concat and then clearing, or just clearing.
* @param concatenation WordDelimiterConcatenation that will be flushed
* @return {@code true} if the concatenation was written before it was cleared, {@code false} otherwise
private boolean flushConcatenation(WordDelimiterConcatenation concatenation) {
lastConcatCount = concatenation.subwordCount;
if (concatenation.subwordCount != 1 || !shouldGenerateParts(concatenation.type)) {
return true;
return false;
* Determines whether to concatenate a word or number if the current word is the given type
* @param wordType Type of the current word used to determine if it should be concatenated
* @return {@code true} if concatenation should occur, {@code false} otherwise
private boolean shouldConcatenate(int wordType) {
return (catenateWords && isAlpha(wordType)) || (catenateNumbers && isDigit(wordType));
* Determines whether a word/number part should be generated for a word of the given type
* @param wordType Type of the word used to determine if a word/number part should be generated
* @return {@code true} if a word/number part should be generated, {@code false} otherwise
private boolean shouldGenerateParts(int wordType) {
return (generateWordParts && isAlpha(wordType)) || (generateNumberParts && isDigit(wordType));
* Concatenates the saved buffer to the given WordDelimiterConcatenation
* @param concatenation WordDelimiterConcatenation to concatenate the buffer to
private void concatenate(WordDelimiterConcatenation concatenation) {
if (concatenation.isEmpty()) {
concatenation.startOffset = savedStartOffset + iterator.current;
concatenation.append(savedBuffer, iterator.current, iterator.end - iterator.current);
concatenation.endOffset = savedStartOffset + iterator.end;
* Generates a word/number part, updating the appropriate attributes
* @param isSingleWord {@code true} if the generation is occurring from a single word, {@code false} otherwise
private void generatePart(boolean isSingleWord) {
termAttribute.copyBuffer(savedBuffer, iterator.current, iterator.end - iterator.current);
int startOffSet = (isSingleWord || !hasIllegalOffsets) ? savedStartOffset + iterator.current : savedStartOffset;
int endOffSet = (hasIllegalOffsets) ? savedEndOffset : savedStartOffset + iterator.end;
offsetAttribute.setOffset(startOffSet, endOffSet);
* Get the position increment gap for a subword or concatenation
* @param inject true if this token wants to be injected
* @return position increment gap
private int position(boolean inject) {
int posInc = accumPosInc;
if (hasOutputToken) {
accumPosInc = 0;
return inject ? 0 : Math.max(1, posInc);
hasOutputToken = true;
if (!hasOutputFollowingOriginal) {
// the first token following the original is 0 regardless
hasOutputFollowingOriginal = true;
return 0;
// clear the accumulated position increment
accumPosInc = 0;
return Math.max(1, posInc);
* Checks if the given word type includes {@link #ALPHA}
* @param type Word type to check
* @return {@code true} if the type contains ALPHA, {@code false} otherwise
static boolean isAlpha(int type) {
return (type & ALPHA) != 0;
* Checks if the given word type includes {@link #DIGIT}
* @param type Word type to check
* @return {@code true} if the type contains DIGIT, {@code false} otherwise
static boolean isDigit(int type) {
return (type & DIGIT) != 0;
* Checks if the given word type includes {@link #SUBWORD_DELIM}
* @param type Word type to check
* @return {@code true} if the type contains SUBWORD_DELIM, {@code false} otherwise
static boolean isSubwordDelim(int type) {
return (type & SUBWORD_DELIM) != 0;
* Checks if the given word type includes {@link #UPPER}
* @param type Word type to check
* @return {@code true} if the type contains UPPER, {@code false} otherwise
static boolean isUpper(int type) {
return (type & UPPER) != 0;
// ================================================= Inner Classes =================================================
* A WDF concatenated 'run'
final class WordDelimiterConcatenation {
final StringBuilder buffer = new StringBuilder();
int startOffset;
int endOffset;
int type;
int subwordCount;
* Appends the given text of the given length, to the concatenation at the given offset
* @param text Text to append
* @param offset Offset in the concatenation to add the text
* @param length Length of the text to append
void append(char text[], int offset, int length) {
buffer.append(text, offset, length);
* Writes the concatenation to the attributes
void write() {
if (termAttribute.length() < buffer.length()) {
char termbuffer[] = termAttribute.buffer();
buffer.getChars(0, buffer.length(), termbuffer, 0);
if (hasIllegalOffsets) {
offsetAttribute.setOffset(savedStartOffset, savedEndOffset);
} else {
offsetAttribute.setOffset(startOffset, endOffset);
accumPosInc = 0;
* Determines if the concatenation is empty
* @return {@code true} if the concatenation is empty, {@code false} otherwise
boolean isEmpty() {
return buffer.length() == 0;
* Clears the concatenation and resets its state
void clear() {
startOffset = endOffset = type = subwordCount = 0;
* Convenience method for the common scenario of having to write the concatenation and then clearing its state
void writeAndClear() {
// questions:
// negative numbers? -42 indexed as just 42?
// dollar sign? $42
// percent sign? 33%
// downsides: if source text is "powershot" then a query of "PowerShot" won't match!

View File

@ -0,0 +1,341 @@
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
package org.apache.lucene.analysis.miscellaneous;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;
* A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterFilter rules.
* @lucene.internal
public final class WordDelimiterIterator {
* Indicates the end of iteration
public static final int DONE = -1;
public static final byte[] DEFAULT_WORD_DELIM_TABLE;
char text[];
int length;
* start position of text, excluding leading delimiters
int startBounds;
* end position of text, excluding trailing delimiters
int endBounds;
* Beginning of subword
int current;
* End of subword
int end;
/* does this string end with a possessive such as 's */
private boolean hasFinalPossessive = false;
* If false, causes case changes to be ignored (subwords will only be generated
* given SUBWORD_DELIM tokens). (Defaults to true)
final boolean splitOnCaseChange;
* If false, causes numeric changes to be ignored (subwords will only be generated
* given SUBWORD_DELIM tokens). (Defaults to true)
final boolean splitOnNumerics;
* If true, causes trailing "'s" to be removed for each subword. (Defaults to true)
* <p/>
* "O'Neil's" => "O", "Neil"
final boolean stemEnglishPossessive;
private final byte[] charTypeTable;
* if true, need to skip over a possessive found in the last call to next()
private boolean skipPossessive = false;
// TODO: should there be a WORD_DELIM category for chars that only separate words (no catenation of subwords will be
// done if separated by these chars?) "," would be an obvious candidate...
static {
byte[] tab = new byte[256];
for (int i = 0; i < 256; i++) {
byte code = 0;
if (Character.isLowerCase(i)) {
code |= LOWER;
} else if (Character.isUpperCase(i)) {
code |= UPPER;
} else if (Character.isDigit(i)) {
code |= DIGIT;
if (code == 0) {
tab[i] = code;
/**
 * Builds an iterator that applies the supplied splitting rules; the arguments are simply
 * stored for use during iteration.
 *
 * @param charTypeTable         table containing character types
 * @param splitOnCaseChange     if true, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
 * @param splitOnNumerics       if true, causes "j2se" to be three tokens; "j" "2" "se"
 * @param stemEnglishPossessive if true, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
 */
WordDelimiterIterator(byte[] charTypeTable, boolean splitOnCaseChange, boolean splitOnNumerics, boolean stemEnglishPossessive) {
    // splitting rules
    this.splitOnCaseChange = splitOnCaseChange;
    this.splitOnNumerics = splitOnNumerics;
    this.stemEnglishPossessive = stemEnglishPossessive;
    // character classification
    this.charTypeTable = charTypeTable;
}
* Advance to the next subword in the string.
* @return index of the next subword, or {@link #DONE} if all subwords have been returned
int next() {
current = end;
if (current == DONE) {
return DONE;
if (skipPossessive) {
current += 2;
skipPossessive = false;
int lastType = 0;
while (current < endBounds && (isSubwordDelim(lastType = charType(text[current])))) {
if (current >= endBounds) {
return end = DONE;
for (end = current + 1; end < endBounds; end++) {
int type = charType(text[end]);
if (isBreak(lastType, type)) {
lastType = type;
if (end < endBounds - 1 && endsWithPossessive(end + 2)) {
skipPossessive = true;
return end;
* Return the type of the current subword.
* This currently uses the type of the first character in the subword.
* @return type of the current word
int type() {
if (end == DONE) {
return 0;
int type = charType(text[current]);
switch (type) {
// return ALPHA word type for both lower and upper
case LOWER:
case UPPER:
return ALPHA;
return type;
* Reset the text to a new value, and reset all state
* @param text New text
* @param length length of the text
void setText(char text[], int length) {
this.text = text;
this.length = this.endBounds = length;
current = startBounds = end = 0;
skipPossessive = hasFinalPossessive = false;
// ================================================= Helper Methods ================================================
* Determines whether the transition from lastType to type indicates a break
* @param lastType Last subword type
* @param type Current subword type
* @return {@code true} if the transition indicates a break, {@code false} otherwise
private boolean isBreak(int lastType, int type) {
if ((type & lastType) != 0) {
return false;
if (!splitOnCaseChange && isAlpha(lastType) && isAlpha(type)) {
// ALPHA->ALPHA: always ignore if case isn't considered.
return false;
} else if (isUpper(lastType) && isAlpha(type)) {
// UPPER->letter: Don't split
return false;
} else if (!splitOnNumerics && ((isAlpha(lastType) && isDigit(type)) || (isDigit(lastType) && isAlpha(type)))) {
return false;
return true;
* Determines if the current word contains only one subword. Note, it could be potentially surrounded by delimiters
* @return {@code true} if the current word contains only one subword, {@code false} otherwise
boolean isSingleWord() {
if (hasFinalPossessive) {
return current == startBounds && end == endBounds - 2;
} else {
return current == startBounds && end == endBounds;
* Set the internal word bounds (remove leading and trailing delimiters). Note, if a possessive is found, don't remove
* it yet, simply note it.
private void setBounds() {
while (startBounds < length && (isSubwordDelim(charType(text[startBounds])))) {
while (endBounds > startBounds && (isSubwordDelim(charType(text[endBounds - 1])))) {
if (endsWithPossessive(endBounds)) {
hasFinalPossessive = true;
current = startBounds;
* Determines if the text at the given position indicates an English possessive which should be removed
* @param pos Position in the text to check if it indicates an English possessive
* @return {@code true} if the text at the position indicates an English posessive, {@code false} otherwise
private boolean endsWithPossessive(int pos) {
return (stemEnglishPossessive &&
pos > 2 &&
text[pos - 2] == '\'' &&
(text[pos - 1] == 's' || text[pos - 1] == 'S') &&
isAlpha(charType(text[pos - 3])) &&
(pos == endBounds || isSubwordDelim(charType(text[pos]))));
* Determines the type of the given character
* @param ch Character whose type is to be determined
* @return Type of the character
private int charType(int ch) {
if (ch < charTypeTable.length) {
return charTypeTable[ch];
return getType(ch);
* Computes the type of the given character
* @param ch Character whose type is to be determined
* @return Type of the character
public static byte getType(int ch) {
switch (Character.getType(ch)) {
case Character.UPPERCASE_LETTER:
return UPPER;
case Character.LOWERCASE_LETTER:
return LOWER;
case Character.TITLECASE_LETTER:
case Character.MODIFIER_LETTER:
case Character.OTHER_LETTER:
case Character.NON_SPACING_MARK:
case Character.ENCLOSING_MARK: // depends what it encloses?
return ALPHA;
case Character.LETTER_NUMBER:
case Character.OTHER_NUMBER:
return DIGIT;
// case Character.SPACE_SEPARATOR:
// case Character.LINE_SEPARATOR:
// case Character.PARAGRAPH_SEPARATOR:
// case Character.CONTROL:
// case Character.FORMAT:
// case Character.PRIVATE_USE:
case Character.SURROGATE: // prevent splitting
return ALPHA | DIGIT;
// case Character.DASH_PUNCTUATION:
// case Character.START_PUNCTUATION:
// case Character.END_PUNCTUATION:
// case Character.OTHER_PUNCTUATION:
// case Character.MATH_SYMBOL:
// case Character.CURRENCY_SYMBOL:
// case Character.MODIFIER_SYMBOL:
// case Character.OTHER_SYMBOL:

View File

@ -48,14 +48,17 @@ import;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.base.Charsets;
import org.elasticsearch.common.collect.ImmutableMap;
import org.elasticsearch.common.collect.ImmutableSet;
import org.elasticsearch.common.collect.Iterators;
import org.elasticsearch.common.collect.MapBuilder;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
@ -148,27 +151,22 @@ public class Analysis {
* @throws ElasticSearchIllegalArgumentException
* If the word list cannot be found at either key.
public static Set<String> getWordList(Settings settings, String settingPrefix) {
public static Set<String> getWordList(Environment env, Settings settings, String settingPrefix) {
String wordListPath = settings.get(settingPrefix + "_path", null);
if (wordListPath == null) {
String[] explicitWordList = settings.getAsArray(settingPrefix, null);
if (explicitWordList == null) {
String message = String.format("%s or %s_path must be provided.", settingPrefix, settingPrefix);
throw new ElasticSearchIllegalArgumentException(message);
return null;
} else {
return new HashSet<String>(Arrays.asList(explicitWordList));
File wordListFile = new File(wordListPath);
if (!wordListFile.exists()) {
throw new ElasticSearchIllegalArgumentException(settingPrefix + "_path file must exist.");
URL wordListFile = env.resolveConfig(wordListPath);
try {
return WordlistLoader.getWordSet(wordListFile);
return WordlistLoader.getWordSet(new InputStreamReader(wordListFile.openStream(), Charsets.UTF_8), "#");
} catch (IOException ioe) {
String message = String.format("IOException while reading %s_path: %s", settingPrefix, ioe.getMessage());
throw new ElasticSearchIllegalArgumentException(message);

View File

@ -361,6 +361,7 @@ public class AnalysisModule extends AbstractModule {
@Override public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
tokenFiltersBindings.processTokenFilter("snowball", SnowballTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("stemmer", StemmerTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("word_delimiter", WordDelimiterTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("arabic_stem", ArabicStemTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("brazilian_stem", BrazilianStemTokenFilterFactory.class);

View File

@ -0,0 +1,196 @@
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory {
private final byte[] charTypeTable;
private final boolean generateWordParts;
private final boolean generateNumberParts;
private final boolean catenateWords;
private final boolean catenateNumbers;
private final boolean catenateAll;
private final boolean splitOnCaseChange;
private final boolean preserveOriginal;
private final boolean splitOnNumerics;
private final boolean stemEnglishPossessive;
private final CharArraySet protoWords;
/**
 * Reads the word delimiter options from the filter settings.
 *
 * The optional "type_table" word list is parsed into a custom character type table; when it is
 * absent the default table from {@link WordDelimiterIterator} is used. Each boolean option is
 * read with the default shown below, and the optional "protected_words" word list is copied into
 * a {@link CharArraySet} (left {@code null} when not configured).
 *
 * @param index         passed through to the parent factory
 * @param indexSettings passed through to the parent factory
 * @param env           environment handed to {@code Analysis.getWordList} for both word lists
 * @param name          passed through to the parent factory
 * @param settings      the settings of this filter instance
 */
@Inject public WordDelimiterTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettings, name, settings);

    // Sample format for the type table:
    // $ => DIGIT
    // % => DIGIT
    // . => DIGIT
    // \u002C => DIGIT
    // \u200D => ALPHANUM
    Set<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
    if (charTypeTableValues == null) {
        this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
    } else {
        this.charTypeTable = parseTypes(charTypeTableValues);
    }

    // If true, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
    this.generateWordParts = settings.getAsBoolean("generate_word_parts", true);
    // If true, causes number subwords to be generated: "500-42" => "500" "42"
    this.generateNumberParts = settings.getAsBoolean("generate_number_parts", true);
    // If true, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
    this.catenateWords = settings.getAsBoolean("catenate_words", false);
    // If true, causes maximum runs of number parts to be catenated: "500-42" => "50042"
    this.catenateNumbers = settings.getAsBoolean("catenate_numbers", false);
    // If true, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
    this.catenateAll = settings.getAsBoolean("catenate_all", false);
    // If true, causes "PowerShot" to be two tokens ("Power-Shot" remains two parts regardless)
    this.splitOnCaseChange = settings.getAsBoolean("split_on_case_change", true);
    // If true, includes original words in subwords: "500-42" => "500" "42" "500-42"
    this.preserveOriginal = settings.getAsBoolean("preserve_original", false);
    // If true, causes "j2se" to be three tokens: "j" "2" "se"
    this.splitOnNumerics = settings.getAsBoolean("split_on_numerics", true);
    // If true, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
    this.stemEnglishPossessive = settings.getAsBoolean("stem_english_possessive", true);

    // If not null, this is the set of tokens to protect from being delimited
    Set<String> protectedWords = Analysis.getWordList(env, settings, "protected_words");
    this.protoWords = protectedWords == null ? null : CharArraySet.copy(Lucene.VERSION, protectedWords);
}
// Wraps the given token stream in a WordDelimiterFilter, translating each boolean
// option of this factory into the 1/0 int flag form used in the constructor call below.
// NOTE(review): the argument list below looks truncated in this view (no closing
// parenthesis or trailing arguments are visible) - confirm against the full file.
@Override public TokenStream create(TokenStream tokenStream) {
return new WordDelimiterFilter(tokenStream,
generateWordParts ? 1 : 0,
generateNumberParts ? 1 : 0,
catenateWords ? 1 : 0,
catenateNumbers ? 1 : 0,
catenateAll ? 1 : 0,
splitOnCaseChange ? 1 : 0,
preserveOriginal ? 1 : 0,
splitOnNumerics ? 1 : 0,
stemEnglishPossessive ? 1 : 0,
// source => type
private static Pattern typePattern = Pattern.compile("(.*)\\s*=>\\s*(.*)\\s*$");
* parses a list of MappingCharFilter style rules into a custom byte[] type table
private byte[] parseTypes(Collection<String> rules) {
SortedMap<Character, Byte> typeMap = new TreeMap<Character, Byte>();
for (String rule : rules) {
Matcher m = typePattern.matcher(rule);
if (!m.find())
throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]");
String lhs = parseString(;
Byte rhs = parseType(;
if (lhs.length() != 1)
throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed.");
if (rhs == null)
throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal type.");
typeMap.put(lhs.charAt(0), rhs);
// ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance
byte types[] = new byte[Math.max(typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)];
for (int i = 0; i < types.length; i++)
types[i] = WordDelimiterIterator.getType(i);
for (Map.Entry<Character, Byte> mapping : typeMap.entrySet())
types[mapping.getKey()] = mapping.getValue();
return types;
private Byte parseType(String s) {
if (s.equals("LOWER"))
return WordDelimiterFilter.LOWER;
else if (s.equals("UPPER"))
return WordDelimiterFilter.UPPER;
else if (s.equals("ALPHA"))
return WordDelimiterFilter.ALPHA;
else if (s.equals("DIGIT"))
return WordDelimiterFilter.DIGIT;
else if (s.equals("ALPHANUM"))
return WordDelimiterFilter.ALPHANUM;
else if (s.equals("SUBWORD_DELIM"))
return WordDelimiterFilter.SUBWORD_DELIM;
return null;
char[] out = new char[256];
private String parseString(String s) {
int readPos = 0;
int len = s.length();
int writePos = 0;
while (readPos < len) {
char c = s.charAt(readPos++);
if (c == '\\') {
if (readPos >= len)
throw new RuntimeException("Invalid escaped char in [" + s + "]");
c = s.charAt(readPos++);
switch (c) {
case '\\':
c = '\\';
case 'n':
c = '\n';
case 't':
c = '\t';
case 'r':
c = '\r';
case 'b':
c = '\b';
case 'f':
c = '\f';
case 'u':
if (readPos + 3 >= len)
throw new RuntimeException("Invalid escaped char in [" + s + "]");
c = (char) Integer.parseInt(s.substring(readPos, readPos + 4), 16);
readPos += 4;
out[writePos++] = c;
return new String(out, 0, writePos);

View File

@ -20,9 +20,11 @@
package org.elasticsearch.index.analysis.compound;
import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.Analysis;
@ -44,13 +46,16 @@ public abstract class AbstractCompoundWordTokenFilterFactory extends AbstractTok
protected final boolean onlyLongestMatch;
protected final Set<String> wordList;
@Inject public AbstractCompoundWordTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
@Inject public AbstractCompoundWordTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings, name, settings);
minWordSize = settings.getAsInt("min_word_size", CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE);
minSubwordSize = settings.getAsInt("min_subword_size", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
maxSubwordSize = settings.getAsInt("max_subword_size", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
onlyLongestMatch = settings.getAsBoolean("only_longest_max", false);
wordList = Analysis.getWordList(settings, "word_list");
wordList = Analysis.getWordList(env, settings, "word_list");
if (wordList == null) {
throw new ElasticSearchIllegalArgumentException("word_list must be provided for [" + name + "], either as a path to a file, or directly");

View File

@ -24,6 +24,7 @@ import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.analysis.AnalysisSettingsRequired;
import org.elasticsearch.index.settings.IndexSettings;
@ -39,8 +40,8 @@ import org.elasticsearch.index.settings.IndexSettings;
public class DictionaryCompoundWordTokenFilterFactory extends AbstractCompoundWordTokenFilterFactory {
@Inject public DictionaryCompoundWordTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings, name, settings);
@Inject public DictionaryCompoundWordTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings, env, name, settings);
@Override public TokenStream create(TokenStream tokenStream) {

View File

@ -26,11 +26,13 @@ import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.analysis.AnalysisSettingsRequired;
import org.elasticsearch.index.settings.IndexSettings;
import org.xml.sax.InputSource;
* Uses the {@link org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter} to decompound tokens based on hyphenation rules.
@ -44,21 +46,18 @@ public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundW
private final HyphenationTree hyphenationTree;
@Inject public HyphenationCompoundWordTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings, name, settings);
@Inject public HyphenationCompoundWordTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings, env, name, settings);
String hyphenationPatternsPath = settings.get("hyphenation_patterns_path", null);
if (hyphenationPatternsPath == null) {
throw new ElasticSearchIllegalArgumentException("hyphenation_patterns_path is a required setting.");
File hyphenationPatternsFile = new File(hyphenationPatternsPath);
if (!hyphenationPatternsFile.exists()) {
throw new ElasticSearchIllegalArgumentException("hyphenation_patterns_path file must exist.");
URL hyphenationPatternsFile = env.resolveConfig(hyphenationPatternsPath);
try {
hyphenationTree = HyphenationCompoundWordTokenFilter.getHyphenationTree(hyphenationPatternsFile);
hyphenationTree = HyphenationCompoundWordTokenFilter.getHyphenationTree(new InputSource(hyphenationPatternsFile.toExternalForm()));
} catch (Exception e) {
throw new ElasticSearchIllegalArgumentException("Exception while reading hyphenation_patterns_path: " + e.getMessage());

View File

@ -24,7 +24,10 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.elasticsearch.common.inject.Injector;
import org.elasticsearch.common.inject.ModulesBuilder;
import org.elasticsearch.common.lucene.analysis.HTMLStripCharFilter;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.EnvironmentModule;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexNameModule;
import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
@ -61,6 +64,7 @@ public class AnalysisModuleTests {
private void testSimpleConfiguration(Settings settings) {
Index index = new Index("test");
Injector injector = new ModulesBuilder().add(
new EnvironmentModule(new Environment(settings)),
new IndexSettingsModule(index, settings),
new IndexNameModule(index),
new AnalysisModule(settings)).createInjector();
@ -120,18 +124,19 @@ public class AnalysisModuleTests {
assertThat(dictionaryDecompounderAnalyze.tokenFilters().length, equalTo(1));
assertThat(dictionaryDecompounderAnalyze.tokenFilters()[0], instanceOf(DictionaryCompoundWordTokenFilterFactory.class));
Set<String> wordList = Analysis.getWordList(settings, "index.analysis.filter.dict_dec.word_list");
Set<String> wordList = Analysis.getWordList(null, settings, "index.analysis.filter.dict_dec.word_list");
MatcherAssert.assertThat(wordList.size(), equalTo(6));
MatcherAssert.assertThat(wordList, hasItems("donau", "dampf", "schiff", "spargel", "creme", "suppe"));
@Test public void testWordListPath() throws Exception {
Environment env = new Environment(ImmutableSettings.Builder.EMPTY_SETTINGS);
String[] words = new String[]{"donau", "dampf", "schiff", "spargel", "creme", "suppe"};
File wordListFile = generateWordList(words);
Settings settings = settingsBuilder().loadFromSource("index: \n word_list_path: " + wordListFile.getAbsolutePath()).build();
Set<String> wordList = Analysis.getWordList(settings, "index.word_list");
Set<String> wordList = Analysis.getWordList(env, settings, "index.word_list");
MatcherAssert.assertThat(wordList.size(), equalTo(6));
MatcherAssert.assertThat(wordList, hasItems(words));