Analysis: Synonym Token Filter, closes #900.
parent 1b686d3c2b
commit 15d8f0b1ac
@@ -0,0 +1,258 @@
/*
 * Licensed to Elastic Search and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Elastic Search licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.lucene.analysis.synonym;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;

/**
 * SynonymFilter handles multi-token synonyms with variable position increment offsets.
 * <p>
 * The matched tokens from the input stream may be optionally passed through (includeOrig=true)
 * or discarded. If the original tokens are included, the position increments may be modified
 * to retain absolute positions after merging with the synonym tokenstream.
 * <p>
 * Generated synonyms will start at the same position as the first matched source token.
 */
// LUCENE MONITOR: Taken from 4.0, remove once upgraded
public final class SynonymFilter extends TokenFilter {

    private final SynonymMap map;  // Map<String, SynonymMap>
    private Iterator<AttributeSource> replacement;  // iterator over generated tokens

    public SynonymFilter(TokenStream in, SynonymMap map) {
        super(in);
        if (map == null)
            throw new IllegalArgumentException("map is required");

        this.map = map;
        // just ensuring these attributes exist...
        addAttribute(CharTermAttribute.class);
        addAttribute(PositionIncrementAttribute.class);
        addAttribute(OffsetAttribute.class);
        addAttribute(TypeAttribute.class);
    }


    /*
     * Need to worry about multiple scenarios:
     *  - need to go for the longest match
     *    a b => foo      #shouldn't match if "a b" is followed by "c d"
     *    a b c d => bar
     *  - need to backtrack - retry matches for tokens already read
     *    a b c d => foo
     *    b c => bar
     *    If the input stream is "a b c x", one will consume "a b c d"
     *    trying to match the first rule... all but "a" should be
     *    pushed back so a match may be made on "b c".
     *  - don't try and match generated tokens (thus need separate queue)
     *    matching is not recursive.
     *  - handle optional generation of original tokens in all these cases,
     *    merging token streams to preserve token positions.
     *  - preserve original positionIncrement of first matched token
     */
    @Override
    public boolean incrementToken() throws IOException {
        while (true) {
            // if there are any generated tokens, return them... don't try any
            // matches against them, as we specifically don't want recursion.
            if (replacement != null && replacement.hasNext()) {
                copy(this, replacement.next());
                return true;
            }

            // common case fast-path of first token not matching anything
            AttributeSource firstTok = nextTok();
            if (firstTok == null) return false;
            CharTermAttribute termAtt = firstTok.addAttribute(CharTermAttribute.class);
            SynonymMap result = map.submap != null ? map.submap.get(termAtt.buffer(), 0, termAtt.length()) : null;
            if (result == null) {
                copy(this, firstTok);
                return true;
            }

            // fast-path failed, clone ourselves if needed
            if (firstTok == this)
                firstTok = cloneAttributes();
            // OK, we matched a token, so find the longest match.

            matched = new LinkedList<AttributeSource>();

            result = match(result);

            if (result == null) {
                // no match, simply return the first token read.
                copy(this, firstTok);
                return true;
            }

            // reuse, or create new one each time?
            ArrayList<AttributeSource> generated = new ArrayList<AttributeSource>(result.synonyms.length + matched.size() + 1);

            //
            // there was a match... let's generate the new tokens, merging
            // in the matched tokens (position increments need adjusting)
            //
            AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast();
            boolean includeOrig = result.includeOrig();

            AttributeSource origTok = includeOrig ? firstTok : null;
            PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class);
            int origPos = firstPosIncAtt.getPositionIncrement();  // position of origTok in the original stream
            int repPos = 0; // curr position in replacement token stream
            int pos = 0;  // current position in merged token stream

            for (int i = 0; i < result.synonyms.length; i++) {
                Token repTok = result.synonyms[i];
                AttributeSource newTok = firstTok.cloneAttributes();
                CharTermAttribute newTermAtt = newTok.addAttribute(CharTermAttribute.class);
                OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class);
                PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(PositionIncrementAttribute.class);

                OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class);

                newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
                newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
                repPos += repTok.getPositionIncrement();
                if (i == 0) repPos = origPos;  // make position of first token equal to original

                // if necessary, insert original tokens and adjust position increment
                while (origTok != null && origPos <= repPos) {
                    PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
                    origPosInc.setPositionIncrement(origPos - pos);
                    generated.add(origTok);
                    pos += origPosInc.getPositionIncrement();
                    origTok = matched.isEmpty() ? null : matched.removeFirst();
                    if (origTok != null) {
                        origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
                        origPos += origPosInc.getPositionIncrement();
                    }
                }

                newPosIncAtt.setPositionIncrement(repPos - pos);
                generated.add(newTok);
                pos += newPosIncAtt.getPositionIncrement();
            }

            // finish up any leftover original tokens
            while (origTok != null) {
                PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
                origPosInc.setPositionIncrement(origPos - pos);
                generated.add(origTok);
                pos += origPosInc.getPositionIncrement();
                origTok = matched.isEmpty() ? null : matched.removeFirst();
                if (origTok != null) {
                    origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
                    origPos += origPosInc.getPositionIncrement();
                }
            }

            // what if we replaced a longer sequence with a shorter one?
            // a/0 b/5 => foo/0
            // should I re-create the gap on the next buffered token?

            replacement = generated.iterator();
            // Now return to the top of the loop to read and return the first
            // generated token.. The reason this is done is that we may have generated
            // nothing at all, and may need to continue with more matching logic.
        }
    }


    //
    // Defer creation of the buffer until the first time it is used to
    // optimize short fields with no matches.
    //
    private LinkedList<AttributeSource> buffer;
    private LinkedList<AttributeSource> matched;

    private AttributeSource nextTok() throws IOException {
        if (buffer != null && !buffer.isEmpty()) {
            return buffer.removeFirst();
        } else {
            if (input.incrementToken()) {
                return this;
            } else
                return null;
        }
    }

    private void pushTok(AttributeSource t) {
        if (buffer == null) buffer = new LinkedList<AttributeSource>();
        buffer.addFirst(t);
    }

    private SynonymMap match(SynonymMap map) throws IOException {
        SynonymMap result = null;

        if (map.submap != null) {
            AttributeSource tok = nextTok();
            if (tok != null) {
                // clone ourselves.
                if (tok == this)
                    tok = cloneAttributes();
                // check for positionIncrement!=1?  if>1, should not match, if==0, check multiple at this level?
                CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
                SynonymMap subMap = map.submap.get(termAtt.buffer(), 0, termAtt.length());

                if (subMap != null) {
                    // recurse
                    result = match(subMap);
                }

                if (result != null) {
                    matched.addFirst(tok);
                } else {
                    // push back unmatched token
                    pushTok(tok);
                }
            }
        }

        // if no longer sequence matched, so if this node has synonyms, it's the match.
        if (result == null && map.synonyms != null) {
            result = map;
        }

        return result;
    }

    private void copy(AttributeSource target, AttributeSource source) {
        if (target != source)
            source.copyTo(target);
    }

    @Override
    public void reset() throws IOException {
        input.reset();
        replacement = null;
    }
}
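For orientation, here is a minimal sketch of how the new filter is driven: a SynonymMap is populated with a single multi-token rule and the filter is run over a plain whitespace tokenizer. This is not part of the commit; the rule text, class name, and expected output are illustrative assumptions, and the tokenizer/Version choice simply mirrors the Lucene 3.1 APIs referenced elsewhere in this change.

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

import java.io.StringReader;
import java.util.Arrays;

public class SynonymFilterSketch {
    public static void main(String[] args) throws Exception {
        // one rule: "sea biscuit => seabiscuit", original tokens discarded (includeOrig=false)
        SynonymMap map = new SynonymMap(true /* ignoreCase */);
        map.add(Arrays.asList("sea", "biscuit"),
                SynonymMap.makeTokens(Arrays.asList("seabiscuit")),
                false /* includeOrig */, true /* mergeExisting */);

        TokenStream ts = new SynonymFilter(
                new WhitespaceTokenizer(Version.LUCENE_31, new StringReader("sea biscuit race")), map);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
            System.out.println(term.toString()); // expected under these assumptions: "seabiscuit", then "race"
        }
    }
}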
@@ -0,0 +1,177 @@
/*
 * Licensed to Elastic Search and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Elastic Search licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.lucene.analysis.synonym;

import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.util.Version;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

/**
 * Mapping rules for use with {@link SynonymFilter}
 */
public class SynonymMap {
    /**
     * @lucene.internal
     */
    public CharArrayMap<SynonymMap> submap; // recursive: Map<String, SynonymMap>
    /**
     * @lucene.internal
     */
    public Token[] synonyms;
    int flags;

    static final int INCLUDE_ORIG = 0x01;
    static final int IGNORE_CASE = 0x02;

    public SynonymMap() {
    }

    public SynonymMap(boolean ignoreCase) {
        if (ignoreCase) flags |= IGNORE_CASE;
    }

    public boolean includeOrig() {
        return (flags & INCLUDE_ORIG) != 0;
    }

    public boolean ignoreCase() {
        return (flags & IGNORE_CASE) != 0;
    }

    /**
     * @param singleMatch   List<String>, the sequence of strings to match
     * @param replacement   List<Token> the list of tokens to use on a match
     * @param includeOrig   sets a flag on this mapping signaling the generation of matched tokens in addition to the replacement tokens
     * @param mergeExisting merge the replacement tokens with any other mappings that exist
     */
    public void add(List<String> singleMatch, List<Token> replacement, boolean includeOrig, boolean mergeExisting) {
        SynonymMap currMap = this;
        for (String str : singleMatch) {
            if (currMap.submap == null) {
                // for now hardcode at 4.0, as its what the old code did.
                // would be nice to fix, but shouldn't store a version in each submap!!!
                currMap.submap = new CharArrayMap<SynonymMap>(Version.LUCENE_31, 1, ignoreCase());
            }

            SynonymMap map = currMap.submap.get(str);
            if (map == null) {
                map = new SynonymMap();
                map.flags |= flags & IGNORE_CASE;
                currMap.submap.put(str, map);
            }

            currMap = map;
        }

        if (currMap.synonyms != null && !mergeExisting) {
            throw new RuntimeException("SynonymFilter: there is already a mapping for " + singleMatch);
        }
        List<Token> superset = currMap.synonyms == null ? replacement :
                mergeTokens(Arrays.asList(currMap.synonyms), replacement);
        currMap.synonyms = superset.toArray(new Token[superset.size()]);
        if (includeOrig) currMap.flags |= INCLUDE_ORIG;
    }


    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder("<");
        if (synonyms != null) {
            sb.append("[");
            for (int i = 0; i < synonyms.length; i++) {
                if (i != 0) sb.append(',');
                sb.append(synonyms[i]);
            }
            if ((flags & INCLUDE_ORIG) != 0) {
                sb.append(",ORIG");
            }
            sb.append("],");
        }
        sb.append(submap);
        sb.append(">");
        return sb.toString();
    }


    /**
     * Produces a List<Token> from a List<String>
     */
    public static List<Token> makeTokens(List<String> strings) {
        List<Token> ret = new ArrayList<Token>(strings.size());
        for (String str : strings) {
            //Token newTok = new Token(str,0,0,"SYNONYM");
            Token newTok = new Token(str, 0, 0, "SYNONYM");
            ret.add(newTok);
        }
        return ret;
    }


    /**
     * Merge two lists of tokens, producing a single list with manipulated positionIncrements so that
     * the tokens end up at the same position.
     *
     * Example: [a b] merged with [c d] produces [a/b c/d]  ('/' denotes tokens in the same position)
     * Example: [a,5 b,2] merged with [c d,4 e,4] produces [c a,5/d b,2 e,2]  (a,n means a has posInc=n)
     */
    public static List<Token> mergeTokens(List<Token> lst1, List<Token> lst2) {
        ArrayList<Token> result = new ArrayList<Token>();
        if (lst1 == null || lst2 == null) {
            if (lst2 != null) result.addAll(lst2);
            if (lst1 != null) result.addAll(lst1);
            return result;
        }

        int pos = 0;
        Iterator<Token> iter1 = lst1.iterator();
        Iterator<Token> iter2 = lst2.iterator();
        Token tok1 = iter1.hasNext() ? iter1.next() : null;
        Token tok2 = iter2.hasNext() ? iter2.next() : null;
        int pos1 = tok1 != null ? tok1.getPositionIncrement() : 0;
        int pos2 = tok2 != null ? tok2.getPositionIncrement() : 0;
        while (tok1 != null || tok2 != null) {
            while (tok1 != null && (pos1 <= pos2 || tok2 == null)) {
                Token tok = new Token(tok1.startOffset(), tok1.endOffset(), tok1.type());
                tok.copyBuffer(tok1.buffer(), 0, tok1.length());
                tok.setPositionIncrement(pos1 - pos);
                result.add(tok);
                pos = pos1;
                tok1 = iter1.hasNext() ? iter1.next() : null;
                pos1 += tok1 != null ? tok1.getPositionIncrement() : 0;
            }
            while (tok2 != null && (pos2 <= pos1 || tok1 == null)) {
                Token tok = new Token(tok2.startOffset(), tok2.endOffset(), tok2.type());
                tok.copyBuffer(tok2.buffer(), 0, tok2.length());
                tok.setPositionIncrement(pos2 - pos);
                result.add(tok);
                pos = pos2;
                tok2 = iter2.hasNext() ? iter2.next() : null;
                pos2 += tok2 != null ? tok2.getPositionIncrement() : 0;
            }
        }
        return result;
    }

}
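The second example in the mergeTokens javadoc can be reproduced with a small sketch. This is hypothetical code, not part of the commit; the token texts and increments are picked to mirror that documented example, and the merged list should put a and d at the same position.

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.synonym.SynonymMap;

import java.util.Arrays;
import java.util.List;

public class MergeTokensSketch {
    public static void main(String[] args) {
        List<Token> lst1 = SynonymMap.makeTokens(Arrays.asList("a", "b"));
        lst1.get(0).setPositionIncrement(5); // a at position 5
        lst1.get(1).setPositionIncrement(2); // b at position 7

        List<Token> lst2 = SynonymMap.makeTokens(Arrays.asList("c", "d", "e"));
        lst2.get(1).setPositionIncrement(4); // c at 1, d at 5
        lst2.get(2).setPositionIncrement(4); // e at 9

        for (Token t : SynonymMap.mergeTokens(lst1, lst2)) {
            // print the term text and the merged position increment
            System.out.println(new String(t.buffer(), 0, t.length()) + " posInc=" + t.getPositionIncrement());
        }
        // expected positions under these assumptions: c at 1, a and d together at 5, b at 7, e at 9
    }
}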
@@ -42,6 +42,117 @@ public class Strings {
    private static final char EXTENSION_SEPARATOR = '.';


    /**
     * Splits a backslash escaped string on the separator.
     * <p>
     * Current backslash escaping supported:
     * <br> \n \t \r \b \f are escaped the same as a Java String
     * <br> Other characters following a backslash are produced verbatim (\c => c)
     *
     * @param s         the string to split
     * @param separator the separator to split on
     * @param decode    decode backslash escaping
     */
    public static List<String> splitSmart(String s, String separator, boolean decode) {
        ArrayList<String> lst = new ArrayList<String>(2);
        StringBuilder sb = new StringBuilder();
        int pos = 0, end = s.length();
        while (pos < end) {
            if (s.startsWith(separator, pos)) {
                if (sb.length() > 0) {
                    lst.add(sb.toString());
                    sb = new StringBuilder();
                }
                pos += separator.length();
                continue;
            }

            char ch = s.charAt(pos++);
            if (ch == '\\') {
                if (!decode) sb.append(ch);
                if (pos >= end) break;  // ERROR, or let it go?
                ch = s.charAt(pos++);
                if (decode) {
                    switch (ch) {
                        case 'n':
                            ch = '\n';
                            break;
                        case 't':
                            ch = '\t';
                            break;
                        case 'r':
                            ch = '\r';
                            break;
                        case 'b':
                            ch = '\b';
                            break;
                        case 'f':
                            ch = '\f';
                            break;
                    }
                }
            }

            sb.append(ch);
        }

        if (sb.length() > 0) {
            lst.add(sb.toString());
        }

        return lst;
    }


    public static List<String> splitWS(String s, boolean decode) {
        ArrayList<String> lst = new ArrayList<String>(2);
        StringBuilder sb = new StringBuilder();
        int pos = 0, end = s.length();
        while (pos < end) {
            char ch = s.charAt(pos++);
            if (Character.isWhitespace(ch)) {
                if (sb.length() > 0) {
                    lst.add(sb.toString());
                    sb = new StringBuilder();
                }
                continue;
            }

            if (ch == '\\') {
                if (!decode) sb.append(ch);
                if (pos >= end) break;  // ERROR, or let it go?
                ch = s.charAt(pos++);
                if (decode) {
                    switch (ch) {
                        case 'n':
                            ch = '\n';
                            break;
                        case 't':
                            ch = '\t';
                            break;
                        case 'r':
                            ch = '\r';
                            break;
                        case 'b':
                            ch = '\b';
                            break;
                        case 'f':
                            ch = '\f';
                            break;
                    }
                }
            }

            sb.append(ch);
        }

        if (sb.length() > 0) {
            lst.add(sb.toString());
        }

        return lst;
    }

    //---------------------------------------------------------------------
    // General convenience methods for working with Strings
    //---------------------------------------------------------------------
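A quick illustration of the two new helpers, with made-up inputs (not part of the commit): splitSmart consumes only the separator itself, and splitWS honours backslash escaping so a multi-word synonym term can survive whitespace splitting.

import org.elasticsearch.common.Strings;

import java.util.List;

public class SplitSketch {
    public static void main(String[] args) {
        // split a synonym rule on "=>" without decoding escapes
        List<String> sides = Strings.splitSmart("i-pod, i pod => ipod", "=>", false);
        // sides: ["i-pod, i pod ", " ipod"] -- only the separator is removed, surrounding spaces remain

        // whitespace split with escape decoding: "\ " keeps the space inside a single token
        List<String> toks = Strings.splitWS("sea\\ biscuit run", true);
        // toks: ["sea biscuit", "run"]

        System.out.println(sides + " / " + toks);
    }
}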
@@ -19,7 +19,6 @@

 package org.elasticsearch.index.analysis;

-import org.apache.lucene.analysis.WordlistLoader;
 import org.apache.lucene.analysis.ar.ArabicAnalyzer;
 import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
 import org.apache.lucene.analysis.br.BrazilianAnalyzer;

@@ -56,12 +55,12 @@ import org.elasticsearch.common.collect.MapBuilder;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;

+import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
+import java.io.Reader;
 import java.net.URL;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Set;
+import java.util.*;

 /**
  * @author kimchy (shay.banon)

@@ -140,7 +139,7 @@ public class Analysis {
             }
             return setStopWords;
         }
-        Set<String> pathLoadedStopWords = getWordList(env, settings, "stopwords");
+        Set<String> pathLoadedStopWords = getWordSet(env, settings, "stopwords");
         if (pathLoadedStopWords != null) {
             Set setStopWords = new HashSet<String>();
             for (String stopWord : pathLoadedStopWords) {

@@ -156,6 +155,14 @@
         return defaultStopWords;
     }

+    public static Set<String> getWordSet(Environment env, Settings settings, String settingsPrefix) {
+        List<String> wordList = getWordList(env, settings, settingsPrefix);
+        if (wordList == null) {
+            return null;
+        }
+        return new HashSet<String>(wordList);
+    }
+
     /**
      * Fetches a list of words from the specified settings file. The list should either be available at the key
      * specified by settingsPrefix or in a file specified by settingsPrefix + _path.

@@ -163,7 +170,7 @@
      * @throws ElasticSearchIllegalArgumentException
      *          If the word list cannot be found at either key.
      */
-    public static Set<String> getWordList(Environment env, Settings settings, String settingPrefix) {
+    public static List<String> getWordList(Environment env, Settings settings, String settingPrefix) {
         String wordListPath = settings.get(settingPrefix + "_path", null);

         if (wordListPath == null) {

@@ -171,17 +178,42 @@
             if (explicitWordList == null) {
                 return null;
             } else {
-                return new HashSet<String>(Arrays.asList(explicitWordList));
+                return Arrays.asList(explicitWordList);
             }
         }

         URL wordListFile = env.resolveConfig(wordListPath);

         try {
-            return WordlistLoader.getWordSet(new InputStreamReader(wordListFile.openStream(), Charsets.UTF_8), "#");
+            return loadWordList(new InputStreamReader(wordListFile.openStream(), Charsets.UTF_8), "#");
         } catch (IOException ioe) {
             String message = String.format("IOException while reading %s_path: %s", settingPrefix, ioe.getMessage());
             throw new ElasticSearchIllegalArgumentException(message);
         }
     }

+    public static List<String> loadWordList(Reader reader, String comment) throws IOException {
+        final List<String> result = new ArrayList<String>();
+        BufferedReader br = null;
+        try {
+            if (reader instanceof BufferedReader) {
+                br = (BufferedReader) reader;
+            } else {
+                br = new BufferedReader(reader);
+            }
+            String word = null;
+            while ((word = br.readLine()) != null) {
+                if (!Strings.hasText(word)) {
+                    continue;
+                }
+                if (!word.startsWith(comment)) {
+                    result.add(word.trim());
+                }
+            }
+        } finally {
+            if (br != null)
+                br.close();
+        }
+        return result;
+    }
 }
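A hypothetical illustration of the new loadWordList helper (not part of the commit; the sample content is made up): lines starting with the comment string are skipped, blank lines are ignored, and the remaining lines are returned trimmed, in file order.

import org.elasticsearch.index.analysis.Analysis;

import java.io.StringReader;
import java.util.List;

public class WordListSketch {
    public static void main(String[] args) throws Exception {
        String source = "# synonyms for the product index\n" +
                "i-pod, i pod => ipod\n" +
                "universe, cosmos\n";
        List<String> lines = Analysis.loadWordList(new StringReader(source), "#");
        // lines: ["i-pod, i pod => ipod", "universe, cosmos"] -- the comment line is dropped
        System.out.println(lines);
    }
}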
@@ -326,9 +326,6 @@ public class AnalysisModule extends AbstractModule {
             tokenFiltersBindings.processTokenFilter("edgeNGram", EdgeNGramTokenFilterFactory.class);
             tokenFiltersBindings.processTokenFilter("edge_ngram", EdgeNGramTokenFilterFactory.class);
             tokenFiltersBindings.processTokenFilter("shingle", ShingleTokenFilterFactory.class);
-            tokenFiltersBindings.processTokenFilter("phonetic", PhoneticTokenFilterFactory.class);
-            tokenFiltersBindings.processTokenFilter("dictionary_decompounder", DictionaryCompoundWordTokenFilterFactory.class);
-            tokenFiltersBindings.processTokenFilter("hypennation_decompounder", HyphenationCompoundWordTokenFilterFactory.class);
         }

         @Override public void processTokenizers(TokenizersBindings tokenizersBindings) {

@@ -362,6 +359,11 @@
             tokenFiltersBindings.processTokenFilter("snowball", SnowballTokenFilterFactory.class);
             tokenFiltersBindings.processTokenFilter("stemmer", StemmerTokenFilterFactory.class);
             tokenFiltersBindings.processTokenFilter("word_delimiter", WordDelimiterTokenFilterFactory.class);
+            tokenFiltersBindings.processTokenFilter("synonym", SynonymTokenFilterFactory.class);
+
+            tokenFiltersBindings.processTokenFilter("phonetic", PhoneticTokenFilterFactory.class);
+            tokenFiltersBindings.processTokenFilter("dictionary_decompounder", DictionaryCompoundWordTokenFilterFactory.class);
+            tokenFiltersBindings.processTokenFilter("hypennation_decompounder", HyphenationCompoundWordTokenFilterFactory.class);

             tokenFiltersBindings.processTokenFilter("arabic_stem", ArabicStemTokenFilterFactory.class);
             tokenFiltersBindings.processTokenFilter("brazilian_stem", BrazilianStemTokenFilterFactory.class);
@@ -0,0 +1,139 @@
/*
 * Licensed to Elastic Search and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Elastic Search licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.io.FastStringReader;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

@AnalysisSettingsRequired
public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {

    private final SynonymMap synonymMap;

    @Inject public SynonymTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, Map<String, TokenizerFactoryFactory> tokenizerFactories,
                                             @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);

        List<String> rules = Analysis.getWordList(env, settings, "synonyms");
        if (rules == null) {
            throw new ElasticSearchIllegalArgumentException("synonym requires either `synonyms` or `synonyms_path` to be configured");
        }
        boolean ignoreCase = settings.getAsBoolean("ignore_case", false);
        boolean expand = settings.getAsBoolean("expand", true);

        TokenizerFactoryFactory tokenizerFactoryFactory = tokenizerFactories.get(settings.get("tokenizer", "whitespace"));
        TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.create(settings.get("tokenizer", "whitespace"), settings);
        synonymMap = new SynonymMap(ignoreCase);
        parseRules(rules, synonymMap, "=>", ",", expand, tokenizerFactory);
    }

    @Override public TokenStream create(TokenStream tokenStream) {
        return new SynonymFilter(tokenStream, synonymMap);
    }

    static void parseRules(List<String> rules, SynonymMap map, String mappingSep,
                           String synSep, boolean expansion, TokenizerFactory tokFactory) {
        int count = 0;
        for (String rule : rules) {
            // To use regexes, we need an expression that specifies an odd number of chars.
            // This can't really be done with string.split(), and since we need to
            // do unescaping at some point anyway, we wouldn't be saving any effort
            // by using regexes.

            List<String> mapping = Strings.splitSmart(rule, mappingSep, false);

            List<List<String>> source;
            List<List<String>> target;

            if (mapping.size() > 2) {
                throw new RuntimeException("Invalid Synonym Rule:" + rule);
            } else if (mapping.size() == 2) {
                source = getSynList(mapping.get(0), synSep, tokFactory);
                target = getSynList(mapping.get(1), synSep, tokFactory);
            } else {
                source = getSynList(mapping.get(0), synSep, tokFactory);
                if (expansion) {
                    // expand to all arguments
                    target = source;
                } else {
                    // reduce to first argument
                    target = new ArrayList<List<String>>(1);
                    target.add(source.get(0));
                }
            }

            boolean includeOrig = false;
            for (List<String> fromToks : source) {
                count++;
                for (List<String> toToks : target) {
                    map.add(fromToks,
                            SynonymMap.makeTokens(toToks),
                            includeOrig,
                            true
                    );
                }
            }
        }
    }

    // a , b c , d e f => [[a],[b,c],[d,e,f]]
    private static List<List<String>> getSynList(String str, String separator, TokenizerFactory tokFactory) {
        List<String> strList = Strings.splitSmart(str, separator, false);
        // now split on whitespace to get a list of token strings
        List<List<String>> synList = new ArrayList<List<String>>();
        for (String toks : strList) {
            List<String> tokList = tokFactory == null ?
                    Strings.splitWS(toks, true) : splitByTokenizer(toks, tokFactory);
            synList.add(tokList);
        }
        return synList;
    }

    private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory) {
        TokenStream ts = tokFactory.create(new FastStringReader(source));
        List<String> tokList = new ArrayList<String>();
        try {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            while (ts.incrementToken()) {
                if (termAtt.length() > 0)
                    tokList.add(termAtt.toString());
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return tokList;
    }
}
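As a hedged sketch of how the new factory would be configured: the filter and index names below are made up, the settings keys mirror the ones read in the constructor above ("synonyms", "ignore_case", "expand", "tokenizer"), and the YAML-in-Java form follows the loadFromSource usage that appears in the tests further down. The static settingsBuilder() import is assumed to be the usual ImmutableSettings builder.

import org.elasticsearch.common.settings.Settings;

import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;

public class SynonymSettingsSketch {
    public static void main(String[] args) {
        Settings settings = settingsBuilder().loadFromSource(
                "index:\n" +
                "  analysis:\n" +
                "    filter:\n" +
                "      my_synonyms:\n" +
                "        type: synonym\n" +
                "        ignore_case: true\n" +
                "        expand: false\n" +
                "        synonyms:\n" +
                "          - \"i-pod, i pod => ipod\"\n" +
                "          - \"universe, cosmos\"\n").build();
        // the factory would read these keys via Analysis.getWordList(env, settings, "synonyms") etc.
        System.out.println(settings.getAsMap());
    }
}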
@@ -58,7 +58,7 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
         // . => DIGIT
         // \u002C => DIGIT
         // \u200D => ALPHANUM
-        Set<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
+        List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
         if (charTypeTableValues == null) {
             this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
         } else {

@@ -84,7 +84,7 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
         // If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
         this.stemEnglishPossessive = settings.getAsBoolean("stem_english_possessive", true);
         // If not null is the set of tokens to protect from being delimited
-        Set<String> protectedWords = Analysis.getWordList(env, settings, "protected_words");
+        Set<String> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
         this.protoWords = protectedWords == null ? null : CharArraySet.copy(Lucene.VERSION, protectedWords);
     }

|
@ -53,7 +53,7 @@ public abstract class AbstractCompoundWordTokenFilterFactory extends AbstractTok
|
|||
minSubwordSize = settings.getAsInt("min_subword_size", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
|
||||
maxSubwordSize = settings.getAsInt("max_subword_size", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
|
||||
onlyLongestMatch = settings.getAsBoolean("only_longest_max", false);
|
||||
wordList = Analysis.getWordList(env, settings, "word_list");
|
||||
wordList = Analysis.getWordSet(env, settings, "word_list");
|
||||
if (wordList == null) {
|
||||
throw new ElasticSearchIllegalArgumentException("word_list must be provided for [" + name + "], either as a path to a file, or directly");
|
||||
}
|
||||
|
|
|
@@ -124,7 +124,7 @@ public class AnalysisModuleTests {
         assertThat(dictionaryDecompounderAnalyze.tokenFilters().length, equalTo(1));
         assertThat(dictionaryDecompounderAnalyze.tokenFilters()[0], instanceOf(DictionaryCompoundWordTokenFilterFactory.class));

-        Set<String> wordList = Analysis.getWordList(null, settings, "index.analysis.filter.dict_dec.word_list");
+        Set<String> wordList = Analysis.getWordSet(null, settings, "index.analysis.filter.dict_dec.word_list");
         MatcherAssert.assertThat(wordList.size(), equalTo(6));
         MatcherAssert.assertThat(wordList, hasItems("donau", "dampf", "schiff", "spargel", "creme", "suppe"));
     }

@@ -136,7 +136,7 @@ public class AnalysisModuleTests {
         File wordListFile = generateWordList(words);
         Settings settings = settingsBuilder().loadFromSource("index: \n word_list_path: " + wordListFile.getAbsolutePath()).build();

-        Set<String> wordList = Analysis.getWordList(env, settings, "index.word_list");
+        Set<String> wordList = Analysis.getWordSet(env, settings, "index.word_list");
         MatcherAssert.assertThat(wordList.size(), equalTo(6));
         MatcherAssert.assertThat(wordList, hasItems(words));
     }