Analysis: Synonym Token Filter, closes #900.

kimchy 2011-05-10 22:37:33 +03:00
parent 1b686d3c2b
commit 15d8f0b1ac
9 changed files with 735 additions and 16 deletions

View File

@@ -0,0 +1,258 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.lucene.analysis.synonym;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
/**
* SynonymFilter handles multi-token synonyms with variable position increment offsets.
* <p>
* The matched tokens from the input stream may be optionally passed through (includeOrig=true)
* or discarded. If the original tokens are included, the position increments may be modified
* to retain absolute positions after merging with the synonym tokenstream.
* <p>
* Generated synonyms will start at the same position as the first matched source token.
*/
// LUCENE MONITOR: Taken from 4.0, remove once upgraded
public final class SynonymFilter extends TokenFilter {
private final SynonymMap map; // Map<String, SynonymMap>
private Iterator<AttributeSource> replacement; // iterator over generated tokens
public SynonymFilter(TokenStream in, SynonymMap map) {
super(in);
if (map == null)
throw new IllegalArgumentException("map is required");
this.map = map;
// just ensuring these attributes exist...
addAttribute(CharTermAttribute.class);
addAttribute(PositionIncrementAttribute.class);
addAttribute(OffsetAttribute.class);
addAttribute(TypeAttribute.class);
}
/*
* Need to worry about multiple scenarios:
* - need to go for the longest match
* a b => foo #shouldn't match if "a b" is followed by "c d"
* a b c d => bar
* - need to backtrack - retry matches for tokens already read
* a b c d => foo
* b c => bar
* If the input stream is "a b c x", one will consume "a b c x"
* trying to match the first rule... all but "a" should be
* pushed back so a match may be made on "b c".
* - don't try to match generated tokens (thus need separate queue)
* matching is not recursive.
* - handle optional generation of original tokens in all these cases,
* merging token streams to preserve token positions.
* - preserve original positionIncrement of first matched token
*/
@Override
public boolean incrementToken() throws IOException {
while (true) {
// if there are any generated tokens, return them... don't try any
// matches against them, as we specifically don't want recursion.
if (replacement != null && replacement.hasNext()) {
copy(this, replacement.next());
return true;
}
// common case fast-path of first token not matching anything
AttributeSource firstTok = nextTok();
if (firstTok == null) return false;
CharTermAttribute termAtt = firstTok.addAttribute(CharTermAttribute.class);
SynonymMap result = map.submap != null ? map.submap.get(termAtt.buffer(), 0, termAtt.length()) : null;
if (result == null) {
copy(this, firstTok);
return true;
}
// fast-path failed, clone ourselves if needed
if (firstTok == this)
firstTok = cloneAttributes();
// OK, we matched a token, so find the longest match.
matched = new LinkedList<AttributeSource>();
result = match(result);
if (result == null) {
// no match, simply return the first token read.
copy(this, firstTok);
return true;
}
// reuse, or create new one each time?
ArrayList<AttributeSource> generated = new ArrayList<AttributeSource>(result.synonyms.length + matched.size() + 1);
//
// there was a match... let's generate the new tokens, merging
// in the matched tokens (position increments need adjusting)
//
AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast();
boolean includeOrig = result.includeOrig();
AttributeSource origTok = includeOrig ? firstTok : null;
PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class);
int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream
int repPos = 0; // curr position in replacement token stream
int pos = 0; // current position in merged token stream
for (int i = 0; i < result.synonyms.length; i++) {
Token repTok = result.synonyms[i];
AttributeSource newTok = firstTok.cloneAttributes();
CharTermAttribute newTermAtt = newTok.addAttribute(CharTermAttribute.class);
OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class);
PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(PositionIncrementAttribute.class);
OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class);
newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
repPos += repTok.getPositionIncrement();
if (i == 0) repPos = origPos; // make position of first token equal to original
// if necessary, insert original tokens and adjust position increment
while (origTok != null && origPos <= repPos) {
PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
origPosInc.setPositionIncrement(origPos - pos);
generated.add(origTok);
pos += origPosInc.getPositionIncrement();
origTok = matched.isEmpty() ? null : matched.removeFirst();
if (origTok != null) {
origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
origPos += origPosInc.getPositionIncrement();
}
}
newPosIncAtt.setPositionIncrement(repPos - pos);
generated.add(newTok);
pos += newPosIncAtt.getPositionIncrement();
}
// finish up any leftover original tokens
while (origTok != null) {
PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
origPosInc.setPositionIncrement(origPos - pos);
generated.add(origTok);
pos += origPosInc.getPositionIncrement();
origTok = matched.isEmpty() ? null : matched.removeFirst();
if (origTok != null) {
origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
origPos += origPosInc.getPositionIncrement();
}
}
// what if we replaced a longer sequence with a shorter one?
// a/0 b/5 => foo/0
// should I re-create the gap on the next buffered token?
replacement = generated.iterator();
// Now return to the top of the loop to read and return the first
// generated token. The reason this is done is that we may have generated
// nothing at all, and may need to continue with more matching logic.
}
}
//
// Defer creation of the buffer until the first time it is used to
// optimize short fields with no matches.
//
private LinkedList<AttributeSource> buffer;
private LinkedList<AttributeSource> matched;
private AttributeSource nextTok() throws IOException {
if (buffer != null && !buffer.isEmpty()) {
return buffer.removeFirst();
} else {
if (input.incrementToken()) {
return this;
} else
return null;
}
}
private void pushTok(AttributeSource t) {
if (buffer == null) buffer = new LinkedList<AttributeSource>();
buffer.addFirst(t);
}
private SynonymMap match(SynonymMap map) throws IOException {
SynonymMap result = null;
if (map.submap != null) {
AttributeSource tok = nextTok();
if (tok != null) {
// clone ourselves.
if (tok == this)
tok = cloneAttributes();
// check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level?
CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
SynonymMap subMap = map.submap.get(termAtt.buffer(), 0, termAtt.length());
if (subMap != null) {
// recurse
result = match(subMap);
}
if (result != null) {
matched.addFirst(tok);
} else {
// push back unmatched token
pushTok(tok);
}
}
}
// no longer sequence matched, so if this node has synonyms, it's the match.
if (result == null && map.synonyms != null) {
result = map;
}
return result;
}
private void copy(AttributeSource target, AttributeSource source) {
if (target != source)
source.copyTo(target);
}
@Override
public void reset() throws IOException {
input.reset();
replacement = null;
}
}
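Not part of the diff above — a minimal sketch of how this filter and the SynonymMap (next file) are wired together, assuming a Lucene 3.1-era WhitespaceTokenizer; the class name and rules here are illustrative only:

import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class SynonymFilterSketch {
    public static void main(String[] args) throws Exception {
        // two rules mapping "wifi" and the two-token "wi fi" to the single token "wireless"
        SynonymMap map = new SynonymMap(true); // ignoreCase
        map.add(Arrays.asList("wifi"), SynonymMap.makeTokens(Arrays.asList("wireless")), false, true);
        map.add(Arrays.asList("wi", "fi"), SynonymMap.makeTokens(Arrays.asList("wireless")), false, true);

        TokenStream ts = new SynonymFilter(
                new WhitespaceTokenizer(Version.LUCENE_31, new StringReader("wi fi router")), map);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
            System.out.println(term.toString()); // prints "wireless", then "router"
        }
    }
}

With includeOrig=false the matched source tokens are dropped; passing true keeps the original "wi" and "fi" tokens as well, with "wireless" stacked at the position of the first matched token, as the class javadoc describes.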

View File

@@ -0,0 +1,177 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.lucene.analysis.synonym;
import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.util.Version;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
/**
* Mapping rules for use with {@link SynonymFilter}
*/
public class SynonymMap {
/**
* @lucene.internal
*/
public CharArrayMap<SynonymMap> submap; // recursive: Map<String, SynonymMap>
/**
* @lucene.internal
*/
public Token[] synonyms;
int flags;
static final int INCLUDE_ORIG = 0x01;
static final int IGNORE_CASE = 0x02;
public SynonymMap() {
}
public SynonymMap(boolean ignoreCase) {
if (ignoreCase) flags |= IGNORE_CASE;
}
public boolean includeOrig() {
return (flags & INCLUDE_ORIG) != 0;
}
public boolean ignoreCase() {
return (flags & IGNORE_CASE) != 0;
}
/**
* @param singleMatch List<String>, the sequence of strings to match
* @param replacement List<Token> the list of tokens to use on a match
* @param includeOrig sets a flag on this mapping signaling the generation of matched tokens in addition to the replacement tokens
* @param mergeExisting merge the replacement tokens with any other mappings that exist
*/
public void add(List<String> singleMatch, List<Token> replacement, boolean includeOrig, boolean mergeExisting) {
SynonymMap currMap = this;
for (String str : singleMatch) {
if (currMap.submap == null) {
// for now hardcode at 4.0, as it's what the old code did.
// would be nice to fix, but shouldn't store a version in each submap!!!
currMap.submap = new CharArrayMap<SynonymMap>(Version.LUCENE_31, 1, ignoreCase());
}
SynonymMap map = currMap.submap.get(str);
if (map == null) {
map = new SynonymMap();
map.flags |= flags & IGNORE_CASE;
currMap.submap.put(str, map);
}
currMap = map;
}
if (currMap.synonyms != null && !mergeExisting) {
throw new RuntimeException("SynonymFilter: there is already a mapping for " + singleMatch);
}
List<Token> superset = currMap.synonyms == null ? replacement :
mergeTokens(Arrays.asList(currMap.synonyms), replacement);
currMap.synonyms = superset.toArray(new Token[superset.size()]);
if (includeOrig) currMap.flags |= INCLUDE_ORIG;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder("<");
if (synonyms != null) {
sb.append("[");
for (int i = 0; i < synonyms.length; i++) {
if (i != 0) sb.append(',');
sb.append(synonyms[i]);
}
if ((flags & INCLUDE_ORIG) != 0) {
sb.append(",ORIG");
}
sb.append("],");
}
sb.append(submap);
sb.append(">");
return sb.toString();
}
/**
* Produces a List<Token> from a List<String>
*/
public static List<Token> makeTokens(List<String> strings) {
List<Token> ret = new ArrayList<Token>(strings.size());
for (String str : strings) {
//Token newTok = new Token(str,0,0,"SYNONYM");
Token newTok = new Token(str, 0, 0, "SYNONYM");
ret.add(newTok);
}
return ret;
}
/**
* Merge two lists of tokens, producing a single list whose positionIncrements are adjusted
* so that each token keeps its original absolute position.
*
* Example: [a b] merged with [c d] produces [a/b c/d] ('/' denotes tokens in the same position)
* Example: [a,5 b,2] merged with [c d,4 e,4] produces [c a,5/d b,2 e,2] (a,n means a has posInc=n)
*/
public static List<Token> mergeTokens(List<Token> lst1, List<Token> lst2) {
ArrayList<Token> result = new ArrayList<Token>();
if (lst1 == null || lst2 == null) {
if (lst2 != null) result.addAll(lst2);
if (lst1 != null) result.addAll(lst1);
return result;
}
int pos = 0;
Iterator<Token> iter1 = lst1.iterator();
Iterator<Token> iter2 = lst2.iterator();
Token tok1 = iter1.hasNext() ? iter1.next() : null;
Token tok2 = iter2.hasNext() ? iter2.next() : null;
int pos1 = tok1 != null ? tok1.getPositionIncrement() : 0;
int pos2 = tok2 != null ? tok2.getPositionIncrement() : 0;
while (tok1 != null || tok2 != null) {
while (tok1 != null && (pos1 <= pos2 || tok2 == null)) {
Token tok = new Token(tok1.startOffset(), tok1.endOffset(), tok1.type());
tok.copyBuffer(tok1.buffer(), 0, tok1.length());
tok.setPositionIncrement(pos1 - pos);
result.add(tok);
pos = pos1;
tok1 = iter1.hasNext() ? iter1.next() : null;
pos1 += tok1 != null ? tok1.getPositionIncrement() : 0;
}
while (tok2 != null && (pos2 <= pos1 || tok1 == null)) {
Token tok = new Token(tok2.startOffset(), tok2.endOffset(), tok2.type());
tok.copyBuffer(tok2.buffer(), 0, tok2.length());
tok.setPositionIncrement(pos2 - pos);
result.add(tok);
pos = pos2;
tok2 = iter2.hasNext() ? iter2.next() : null;
pos2 += tok2 != null ? tok2.getPositionIncrement() : 0;
}
}
return result;
}
}
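Not part of the diff — a small sketch that makes the mergeTokens javadoc examples concrete, using only the helpers defined above:

import java.util.Arrays;
import java.util.List;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.synonym.SynonymMap;

public class MergeTokensSketch {
    public static void main(String[] args) {
        // makeTokens gives every token the default positionIncrement of 1
        List<Token> ab = SynonymMap.makeTokens(Arrays.asList("a", "b"));
        List<Token> cd = SynonymMap.makeTokens(Arrays.asList("c", "d"));
        for (Token t : SynonymMap.mergeTokens(ab, cd)) {
            System.out.println(new String(t.buffer(), 0, t.length())
                    + "/" + t.getPositionIncrement());
        }
        // prints a/1, c/0, d/1, b/0 — i.e. a and c share a position and b and d share the next,
        // matching the "[a b] merged with [c d] produces [a/b c/d]" example above.
    }
}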

View File

@@ -42,6 +42,117 @@ public class Strings {
private static final char EXTENSION_SEPARATOR = '.';
/**
* Splits a backslash escaped string on the separator.
* <p>
* Current backslash escaping supported:
* <br> \n \t \r \b \f are escaped the same as in a Java String
* <br> Other characters following a backslash are produced verbatim (\c => c)
*
* @param s the string to split
* @param separator the separator to split on
* @param decode decode backslash escaping
*/
public static List<String> splitSmart(String s, String separator, boolean decode) {
ArrayList<String> lst = new ArrayList<String>(2);
StringBuilder sb = new StringBuilder();
int pos = 0, end = s.length();
while (pos < end) {
if (s.startsWith(separator, pos)) {
if (sb.length() > 0) {
lst.add(sb.toString());
sb = new StringBuilder();
}
pos += separator.length();
continue;
}
char ch = s.charAt(pos++);
if (ch == '\\') {
if (!decode) sb.append(ch);
if (pos >= end) break; // ERROR, or let it go?
ch = s.charAt(pos++);
if (decode) {
switch (ch) {
case 'n':
ch = '\n';
break;
case 't':
ch = '\t';
break;
case 'r':
ch = '\r';
break;
case 'b':
ch = '\b';
break;
case 'f':
ch = '\f';
break;
}
}
}
sb.append(ch);
}
if (sb.length() > 0) {
lst.add(sb.toString());
}
return lst;
}
public static List<String> splitWS(String s, boolean decode) {
ArrayList<String> lst = new ArrayList<String>(2);
StringBuilder sb = new StringBuilder();
int pos = 0, end = s.length();
while (pos < end) {
char ch = s.charAt(pos++);
if (Character.isWhitespace(ch)) {
if (sb.length() > 0) {
lst.add(sb.toString());
sb = new StringBuilder();
}
continue;
}
if (ch == '\\') {
if (!decode) sb.append(ch);
if (pos >= end) break; // ERROR, or let it go?
ch = s.charAt(pos++);
if (decode) {
switch (ch) {
case 'n':
ch = '\n';
break;
case 't':
ch = '\t';
break;
case 'r':
ch = '\r';
break;
case 'b':
ch = '\b';
break;
case 'f':
ch = '\f';
break;
}
}
}
sb.append(ch);
}
if (sb.length() > 0) {
lst.add(sb.toString());
}
return lst;
}
//---------------------------------------------------------------------
// General convenience methods for working with Strings
//---------------------------------------------------------------------
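Not part of the diff — an illustrative sketch of how these new helpers split a typical synonym rule (the rule text is hypothetical):

import org.elasticsearch.common.Strings;

public class SplitSmartSketch {
    public static void main(String[] args) {
        // a rule is split on "=>" first, then each side on ",", then on whitespace
        String rule = "i-pod, i pod => ipod";
        for (String side : Strings.splitSmart(rule, "=>", false)) {
            for (String entry : Strings.splitSmart(side, ",", false)) {
                System.out.println(Strings.splitWS(entry, true));
            }
        }
        // prints [i-pod], [i, pod], [ipod]
    }
}

This mirrors what SynonymTokenFilterFactory.parseRules/getSynList (added further below) do with mappingSep "=>" and synSep ",".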

View File

@@ -19,7 +19,6 @@
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
import org.apache.lucene.analysis.br.BrazilianAnalyzer;
@@ -56,12 +55,12 @@ import org.elasticsearch.common.collect.MapBuilder;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.*;
/**
* @author kimchy (shay.banon)
@@ -140,7 +139,7 @@ public class Analysis {
}
return setStopWords;
}
Set<String> pathLoadedStopWords = getWordList(env, settings, "stopwords");
Set<String> pathLoadedStopWords = getWordSet(env, settings, "stopwords");
if (pathLoadedStopWords != null) {
Set setStopWords = new HashSet<String>();
for (String stopWord : pathLoadedStopWords) {
@@ -156,6 +155,14 @@ public class Analysis {
return defaultStopWords;
}
public static Set<String> getWordSet(Environment env, Settings settings, String settingsPrefix) {
List<String> wordList = getWordList(env, settings, settingsPrefix);
if (wordList == null) {
return null;
}
return new HashSet<String>(wordList);
}
/**
* Fetches a list of words from the specified settings file. The list should either be available at the key
* specified by settingsPrefix or in a file specified by settingsPrefix + _path.
@@ -163,7 +170,7 @@
* @throws ElasticSearchIllegalArgumentException
* If the word list cannot be found at either key.
*/
public static Set<String> getWordList(Environment env, Settings settings, String settingPrefix) {
public static List<String> getWordList(Environment env, Settings settings, String settingPrefix) {
String wordListPath = settings.get(settingPrefix + "_path", null);
if (wordListPath == null) {
@@ -171,17 +178,42 @@
if (explicitWordList == null) {
return null;
} else {
return new HashSet<String>(Arrays.asList(explicitWordList));
return Arrays.asList(explicitWordList);
}
}
URL wordListFile = env.resolveConfig(wordListPath);
try {
return WordlistLoader.getWordSet(new InputStreamReader(wordListFile.openStream(), Charsets.UTF_8), "#");
return loadWordList(new InputStreamReader(wordListFile.openStream(), Charsets.UTF_8), "#");
} catch (IOException ioe) {
String message = String.format("IOException while reading %s_path: %s", settingPrefix, ioe.getMessage());
throw new ElasticSearchIllegalArgumentException(message);
}
}
public static List<String> loadWordList(Reader reader, String comment) throws IOException {
final List<String> result = new ArrayList<String>();
BufferedReader br = null;
try {
if (reader instanceof BufferedReader) {
br = (BufferedReader) reader;
} else {
br = new BufferedReader(reader);
}
String word = null;
while ((word = br.readLine()) != null) {
if (!Strings.hasText(word)) {
continue;
}
if (!word.startsWith(comment)) {
result.add(word.trim());
}
}
} finally {
if (br != null)
br.close();
}
return result;
}
}
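Not part of the diff — a quick sketch of what the new loadWordList helper accepts (the sample content is hypothetical): blank lines and lines starting with the comment prefix are skipped, everything else is trimmed and kept in order:

import java.io.StringReader;
import java.util.List;
import org.elasticsearch.index.analysis.Analysis;

public class WordListSketch {
    public static void main(String[] args) throws Exception {
        String file = "# words used for tests\nsea biscuit\n\nfoo => bar\n";
        List<String> words = Analysis.loadWordList(new StringReader(file), "#");
        System.out.println(words); // [sea biscuit, foo => bar]
    }
}

getWordList returns the inline setting value if present, otherwise reads the file behind settingPrefix + "_path" through this helper; getWordSet simply wraps the result in a HashSet.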

View File

@@ -326,9 +326,6 @@ public class AnalysisModule extends AbstractModule {
tokenFiltersBindings.processTokenFilter("edgeNGram", EdgeNGramTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("edge_ngram", EdgeNGramTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("shingle", ShingleTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("phonetic", PhoneticTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("dictionary_decompounder", DictionaryCompoundWordTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("hypennation_decompounder", HyphenationCompoundWordTokenFilterFactory.class);
}
@Override public void processTokenizers(TokenizersBindings tokenizersBindings) {
@@ -362,6 +359,11 @@
tokenFiltersBindings.processTokenFilter("snowball", SnowballTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("stemmer", StemmerTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("word_delimiter", WordDelimiterTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("synonym", SynonymTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("phonetic", PhoneticTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("dictionary_decompounder", DictionaryCompoundWordTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("hypennation_decompounder", HyphenationCompoundWordTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("arabic_stem", ArabicStemTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("brazilian_stem", BrazilianStemTokenFilterFactory.class);

View File

@@ -0,0 +1,139 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.io.FastStringReader;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
@AnalysisSettingsRequired
public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
private final SynonymMap synonymMap;
@Inject public SynonymTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, Map<String, TokenizerFactoryFactory> tokenizerFactories,
@Assisted String name, @Assisted Settings settings) {
super(index, indexSettings, name, settings);
List<String> rules = Analysis.getWordList(env, settings, "synonyms");
if (rules == null) {
throw new ElasticSearchIllegalArgumentException("synonym requires either `synonyms` or `synonyms_path` to be configured");
}
boolean ignoreCase = settings.getAsBoolean("ignore_case", false);
boolean expand = settings.getAsBoolean("expand", true);
TokenizerFactoryFactory tokenizerFactoryFactory = tokenizerFactories.get(settings.get("tokenizer", "whitespace"));
TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.create(settings.get("tokenizer", "whitespace"), settings);
synonymMap = new SynonymMap(ignoreCase);
parseRules(rules, synonymMap, "=>", ",", expand, tokenizerFactory);
}
@Override public TokenStream create(TokenStream tokenStream) {
return new SynonymFilter(tokenStream, synonymMap);
}
static void parseRules(List<String> rules, SynonymMap map, String mappingSep,
String synSep, boolean expansion, TokenizerFactory tokFactory) {
int count = 0;
for (String rule : rules) {
// To use regexes, we need an expression that specifies an odd number of chars.
// This can't really be done with string.split(), and since we need to
// do unescaping at some point anyway, we wouldn't be saving any effort
// by using regexes.
List<String> mapping = Strings.splitSmart(rule, mappingSep, false);
List<List<String>> source;
List<List<String>> target;
if (mapping.size() > 2) {
throw new RuntimeException("Invalid Synonym Rule:" + rule);
} else if (mapping.size() == 2) {
source = getSynList(mapping.get(0), synSep, tokFactory);
target = getSynList(mapping.get(1), synSep, tokFactory);
} else {
source = getSynList(mapping.get(0), synSep, tokFactory);
if (expansion) {
// expand to all arguments
target = source;
} else {
// reduce to first argument
target = new ArrayList<List<String>>(1);
target.add(source.get(0));
}
}
boolean includeOrig = false;
for (List<String> fromToks : source) {
count++;
for (List<String> toToks : target) {
map.add(fromToks,
SynonymMap.makeTokens(toToks),
includeOrig,
true
);
}
}
}
}
// a , b c , d e f => [[a],[b,c],[d,e,f]]
private static List<List<String>> getSynList(String str, String separator, TokenizerFactory tokFactory) {
List<String> strList = Strings.splitSmart(str, separator, false);
// now split on whitespace to get a list of token strings
List<List<String>> synList = new ArrayList<List<String>>();
for (String toks : strList) {
List<String> tokList = tokFactory == null ?
Strings.splitWS(toks, true) : splitByTokenizer(toks, tokFactory);
synList.add(tokList);
}
return synList;
}
private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory) {
TokenStream ts = tokFactory.create(new FastStringReader(source));
List<String> tokList = new ArrayList<String>();
try {
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
while (ts.incrementToken()) {
if (termAtt.length() > 0)
tokList.add(termAtt.toString());
}
} catch (IOException e) {
throw new RuntimeException(e);
}
return tokList;
}
}
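Not part of the diff — a hedged sketch of the index settings this factory reads, written in the same settingsBuilder().loadFromSource(...) style as the tests in this commit (the filter name and rules are hypothetical; synonyms_path may be used instead of the inline list):

import org.elasticsearch.common.settings.Settings;
import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;

public class SynonymSettingsSketch {
    public static void main(String[] args) {
        Settings settings = settingsBuilder().loadFromSource(
                "index:\n" +
                "  analysis:\n" +
                "    filter:\n" +
                "      my_synonyms:\n" +
                "        type: synonym\n" +         // registered in AnalysisModule above
                "        ignore_case: true\n" +
                "        expand: false\n" +
                "        tokenizer: whitespace\n" + // used only to tokenize the rule text
                "        synonyms:\n" +
                "          - \"i-pod, i pod => ipod\"\n" +
                "          - \"universe, cosmos\"\n").build();
        // both rules end up under the flattened synonyms key
        String[] rules = settings.getAsArray("index.analysis.filter.my_synonyms.synonyms");
        System.out.println(rules.length); // 2
    }
}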

View File

@@ -58,7 +58,7 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
// . => DIGIT
// \u002C => DIGIT
// \u200D => ALPHANUM
Set<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
if (charTypeTableValues == null) {
this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
} else {
@@ -84,7 +84,7 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
// If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
this.stemEnglishPossessive = settings.getAsBoolean("stem_english_possessive", true);
// If not null is the set of tokens to protect from being delimited
Set<String> protectedWords = Analysis.getWordList(env, settings, "protected_words");
Set<String> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
this.protoWords = protectedWords == null ? null : CharArraySet.copy(Lucene.VERSION, protectedWords);
}

View File

@@ -53,7 +53,7 @@ public abstract class AbstractCompoundWordTokenFilterFactory extends AbstractTok
minSubwordSize = settings.getAsInt("min_subword_size", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
maxSubwordSize = settings.getAsInt("max_subword_size", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
onlyLongestMatch = settings.getAsBoolean("only_longest_max", false);
wordList = Analysis.getWordList(env, settings, "word_list");
wordList = Analysis.getWordSet(env, settings, "word_list");
if (wordList == null) {
throw new ElasticSearchIllegalArgumentException("word_list must be provided for [" + name + "], either as a path to a file, or directly");
}

View File

@@ -124,7 +124,7 @@ public class AnalysisModuleTests {
assertThat(dictionaryDecompounderAnalyze.tokenFilters().length, equalTo(1));
assertThat(dictionaryDecompounderAnalyze.tokenFilters()[0], instanceOf(DictionaryCompoundWordTokenFilterFactory.class));
Set<String> wordList = Analysis.getWordList(null, settings, "index.analysis.filter.dict_dec.word_list");
Set<String> wordList = Analysis.getWordSet(null, settings, "index.analysis.filter.dict_dec.word_list");
MatcherAssert.assertThat(wordList.size(), equalTo(6));
MatcherAssert.assertThat(wordList, hasItems("donau", "dampf", "schiff", "spargel", "creme", "suppe"));
}
@@ -136,7 +136,7 @@ public class AnalysisModuleTests {
File wordListFile = generateWordList(words);
Settings settings = settingsBuilder().loadFromSource("index: \n word_list_path: " + wordListFile.getAbsolutePath()).build();
Set<String> wordList = Analysis.getWordList(env, settings, "index.word_list");
Set<String> wordList = Analysis.getWordSet(env, settings, "index.word_list");
MatcherAssert.assertThat(wordList.size(), equalTo(6));
MatcherAssert.assertThat(wordList, hasItems(words));
}