Analysis: Synonym Token Filter, closes #900.
parent 1b686d3c2b, commit 15d8f0b1ac

@@ -0,0 +1,258 @@
/*
 * Licensed to Elastic Search and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Elastic Search licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.lucene.analysis.synonym;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;

/**
 * SynonymFilter handles multi-token synonyms with variable position increment offsets.
 * <p>
 * The matched tokens from the input stream may be optionally passed through (includeOrig=true)
 * or discarded. If the original tokens are included, the position increments may be modified
 * to retain absolute positions after merging with the synonym tokenstream.
 * <p>
 * Generated synonyms will start at the same position as the first matched source token.
 */
// LUCENE MONITOR: Taken from 4.0, remove once upgraded
public final class SynonymFilter extends TokenFilter {

    private final SynonymMap map; // Map<String, SynonymMap>
    private Iterator<AttributeSource> replacement; // iterator over generated tokens

    public SynonymFilter(TokenStream in, SynonymMap map) {
        super(in);
        if (map == null)
            throw new IllegalArgumentException("map is required");

        this.map = map;
        // just ensuring these attributes exist...
        addAttribute(CharTermAttribute.class);
        addAttribute(PositionIncrementAttribute.class);
        addAttribute(OffsetAttribute.class);
        addAttribute(TypeAttribute.class);
    }


    /*
     * Need to worry about multiple scenarios:
     *  - need to go for the longest match
     *      a b => foo      #shouldn't match if "a b" is followed by "c d"
     *      a b c d => bar
     *  - need to backtrack - retry matches for tokens already read
     *      a b c d => foo
     *        b c => bar
     *      If the input stream is "a b c x", one will consume "a b c d"
     *      trying to match the first rule... all but "a" should be
     *      pushed back so a match may be made on "b c".
     *  - don't try and match generated tokens (thus need separate queue)
     *    matching is not recursive.
     *  - handle optional generation of original tokens in all these cases,
     *    merging token streams to preserve token positions.
     *  - preserve original positionIncrement of first matched token
     */
    @Override
    public boolean incrementToken() throws IOException {
        while (true) {
            // if there are any generated tokens, return them... don't try any
            // matches against them, as we specifically don't want recursion.
            if (replacement != null && replacement.hasNext()) {
                copy(this, replacement.next());
                return true;
            }

            // common case fast-path of first token not matching anything
            AttributeSource firstTok = nextTok();
            if (firstTok == null) return false;
            CharTermAttribute termAtt = firstTok.addAttribute(CharTermAttribute.class);
            SynonymMap result = map.submap != null ? map.submap.get(termAtt.buffer(), 0, termAtt.length()) : null;
            if (result == null) {
                copy(this, firstTok);
                return true;
            }

            // fast-path failed, clone ourselves if needed
            if (firstTok == this)
                firstTok = cloneAttributes();
            // OK, we matched a token, so find the longest match.

            matched = new LinkedList<AttributeSource>();

            result = match(result);

            if (result == null) {
                // no match, simply return the first token read.
                copy(this, firstTok);
                return true;
            }

            // reuse, or create new one each time?
            ArrayList<AttributeSource> generated = new ArrayList<AttributeSource>(result.synonyms.length + matched.size() + 1);

            //
            // there was a match... let's generate the new tokens, merging
            // in the matched tokens (position increments need adjusting)
            //
            AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast();
            boolean includeOrig = result.includeOrig();

            AttributeSource origTok = includeOrig ? firstTok : null;
            PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class);
            int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream
            int repPos = 0; // curr position in replacement token stream
            int pos = 0; // current position in merged token stream

            for (int i = 0; i < result.synonyms.length; i++) {
                Token repTok = result.synonyms[i];
                AttributeSource newTok = firstTok.cloneAttributes();
                CharTermAttribute newTermAtt = newTok.addAttribute(CharTermAttribute.class);
                OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class);
                PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(PositionIncrementAttribute.class);

                OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class);

                newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
                newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
                repPos += repTok.getPositionIncrement();
                if (i == 0) repPos = origPos; // make position of first token equal to original

                // if necessary, insert original tokens and adjust position increment
                while (origTok != null && origPos <= repPos) {
                    PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
                    origPosInc.setPositionIncrement(origPos - pos);
                    generated.add(origTok);
                    pos += origPosInc.getPositionIncrement();
                    origTok = matched.isEmpty() ? null : matched.removeFirst();
                    if (origTok != null) {
                        origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
                        origPos += origPosInc.getPositionIncrement();
                    }
                }

                newPosIncAtt.setPositionIncrement(repPos - pos);
                generated.add(newTok);
                pos += newPosIncAtt.getPositionIncrement();
            }

            // finish up any leftover original tokens
            while (origTok != null) {
                PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
                origPosInc.setPositionIncrement(origPos - pos);
                generated.add(origTok);
                pos += origPosInc.getPositionIncrement();
                origTok = matched.isEmpty() ? null : matched.removeFirst();
                if (origTok != null) {
                    origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
                    origPos += origPosInc.getPositionIncrement();
                }
            }

            // what if we replaced a longer sequence with a shorter one?
            // a/0 b/5 => foo/0
            // should I re-create the gap on the next buffered token?

            replacement = generated.iterator();
            // Now return to the top of the loop to read and return the first
            // generated token.. The reason this is done is that we may have generated
            // nothing at all, and may need to continue with more matching logic.
        }
    }


    //
    // Defer creation of the buffer until the first time it is used to
    // optimize short fields with no matches.
    //
    private LinkedList<AttributeSource> buffer;
    private LinkedList<AttributeSource> matched;

    private AttributeSource nextTok() throws IOException {
        if (buffer != null && !buffer.isEmpty()) {
            return buffer.removeFirst();
        } else {
            if (input.incrementToken()) {
                return this;
            } else
                return null;
        }
    }

    private void pushTok(AttributeSource t) {
        if (buffer == null) buffer = new LinkedList<AttributeSource>();
        buffer.addFirst(t);
    }

    private SynonymMap match(SynonymMap map) throws IOException {
        SynonymMap result = null;

        if (map.submap != null) {
            AttributeSource tok = nextTok();
            if (tok != null) {
                // clone ourselves.
                if (tok == this)
                    tok = cloneAttributes();
                // check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level?
                CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
                SynonymMap subMap = map.submap.get(termAtt.buffer(), 0, termAtt.length());

                if (subMap != null) {
                    // recurse
                    result = match(subMap);
                }

                if (result != null) {
                    matched.addFirst(tok);
                } else {
                    // push back unmatched token
                    pushTok(tok);
                }
            }
        }

        // if no longer sequence matched, so if this node has synonyms, it's the match.
        if (result == null && map.synonyms != null) {
            result = map;
        }

        return result;
    }

    private void copy(AttributeSource target, AttributeSource source) {
        if (target != source)
            source.copyTo(target);
    }

    @Override
    public void reset() throws IOException {
        input.reset();
        replacement = null;
    }
}
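(Not part of the commit.) A minimal sketch of how this filter can be driven directly, assuming Lucene 3.1-era APIs (WhitespaceTokenizer, CharTermAttribute) and a single hypothetical rule "sea biscuit => seabiscuit":

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;

import java.io.StringReader;
import java.util.Arrays;

public class SynonymFilterSketch {
    public static void main(String[] args) throws Exception {
        // One hypothetical rule: two source tokens replaced by a single synonym token.
        SynonymMap map = new SynonymMap(true);
        map.add(Arrays.asList("sea", "biscuit"),
                SynonymMap.makeTokens(Arrays.asList("seabiscuit")),
                false /* includeOrig */, true /* mergeExisting */);

        TokenStream ts = new SynonymFilter(
                new WhitespaceTokenizer(Version.LUCENE_31, new StringReader("sea biscuit wins")), map);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
        while (ts.incrementToken()) {
            // expected (with includeOrig=false): "seabiscuit" then "wins", each with positionIncrement 1
            System.out.println(term.toString() + " +" + posInc.getPositionIncrement());
        }
        ts.close();
    }
}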

@@ -0,0 +1,177 @@
/*
 * Licensed to Elastic Search and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Elastic Search licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.lucene.analysis.synonym;

import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.util.Version;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

/**
 * Mapping rules for use with {@link SynonymFilter}
 */
public class SynonymMap {
    /**
     * @lucene.internal
     */
    public CharArrayMap<SynonymMap> submap; // recursive: Map<String, SynonymMap>
    /**
     * @lucene.internal
     */
    public Token[] synonyms;
    int flags;

    static final int INCLUDE_ORIG = 0x01;
    static final int IGNORE_CASE = 0x02;

    public SynonymMap() {
    }

    public SynonymMap(boolean ignoreCase) {
        if (ignoreCase) flags |= IGNORE_CASE;
    }

    public boolean includeOrig() {
        return (flags & INCLUDE_ORIG) != 0;
    }

    public boolean ignoreCase() {
        return (flags & IGNORE_CASE) != 0;
    }

    /**
     * @param singleMatch List<String>, the sequence of strings to match
     * @param replacement List<Token> the list of tokens to use on a match
     * @param includeOrig sets a flag on this mapping signaling the generation of matched tokens in addition to the replacement tokens
     * @param mergeExisting merge the replacement tokens with any other mappings that exist
     */
    public void add(List<String> singleMatch, List<Token> replacement, boolean includeOrig, boolean mergeExisting) {
        SynonymMap currMap = this;
        for (String str : singleMatch) {
            if (currMap.submap == null) {
                // for now hardcode at 4.0, as its what the old code did.
                // would be nice to fix, but shouldn't store a version in each submap!!!
                currMap.submap = new CharArrayMap<SynonymMap>(Version.LUCENE_31, 1, ignoreCase());
            }

            SynonymMap map = currMap.submap.get(str);
            if (map == null) {
                map = new SynonymMap();
                map.flags |= flags & IGNORE_CASE;
                currMap.submap.put(str, map);
            }

            currMap = map;
        }

        if (currMap.synonyms != null && !mergeExisting) {
            throw new RuntimeException("SynonymFilter: there is already a mapping for " + singleMatch);
        }
        List<Token> superset = currMap.synonyms == null ? replacement :
                mergeTokens(Arrays.asList(currMap.synonyms), replacement);
        currMap.synonyms = superset.toArray(new Token[superset.size()]);
        if (includeOrig) currMap.flags |= INCLUDE_ORIG;
    }


    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder("<");
        if (synonyms != null) {
            sb.append("[");
            for (int i = 0; i < synonyms.length; i++) {
                if (i != 0) sb.append(',');
                sb.append(synonyms[i]);
            }
            if ((flags & INCLUDE_ORIG) != 0) {
                sb.append(",ORIG");
            }
            sb.append("],");
        }
        sb.append(submap);
        sb.append(">");
        return sb.toString();
    }


    /**
     * Produces a List<Token> from a List<String>
     */
    public static List<Token> makeTokens(List<String> strings) {
        List<Token> ret = new ArrayList<Token>(strings.size());
        for (String str : strings) {
            //Token newTok = new Token(str,0,0,"SYNONYM");
            Token newTok = new Token(str, 0, 0, "SYNONYM");
            ret.add(newTok);
        }
        return ret;
    }


    /**
     * Merge two lists of tokens, producing a single list with manipulated positionIncrements so that
     * the tokens end up at the same position.
     *
     * Example: [a b] merged with [c d] produces [a/b c/d] ('/' denotes tokens in the same position)
     * Example: [a,5 b,2] merged with [c d,4 e,4] produces [c a,5/d b,2 e,2] (a,n means a has posInc=n)
     */
    public static List<Token> mergeTokens(List<Token> lst1, List<Token> lst2) {
        ArrayList<Token> result = new ArrayList<Token>();
        if (lst1 == null || lst2 == null) {
            if (lst2 != null) result.addAll(lst2);
            if (lst1 != null) result.addAll(lst1);
            return result;
        }

        int pos = 0;
        Iterator<Token> iter1 = lst1.iterator();
        Iterator<Token> iter2 = lst2.iterator();
        Token tok1 = iter1.hasNext() ? iter1.next() : null;
        Token tok2 = iter2.hasNext() ? iter2.next() : null;
        int pos1 = tok1 != null ? tok1.getPositionIncrement() : 0;
        int pos2 = tok2 != null ? tok2.getPositionIncrement() : 0;
        while (tok1 != null || tok2 != null) {
            while (tok1 != null && (pos1 <= pos2 || tok2 == null)) {
                Token tok = new Token(tok1.startOffset(), tok1.endOffset(), tok1.type());
                tok.copyBuffer(tok1.buffer(), 0, tok1.length());
                tok.setPositionIncrement(pos1 - pos);
                result.add(tok);
                pos = pos1;
                tok1 = iter1.hasNext() ? iter1.next() : null;
                pos1 += tok1 != null ? tok1.getPositionIncrement() : 0;
            }
            while (tok2 != null && (pos2 <= pos1 || tok1 == null)) {
                Token tok = new Token(tok2.startOffset(), tok2.endOffset(), tok2.type());
                tok.copyBuffer(tok2.buffer(), 0, tok2.length());
                tok.setPositionIncrement(pos2 - pos);
                result.add(tok);
                pos = pos2;
                tok2 = iter2.hasNext() ? iter2.next() : null;
                pos2 += tok2 != null ? tok2.getPositionIncrement() : 0;
            }
        }
        return result;
    }

}
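(Illustrative fragment, not from the commit.) When two rules share the same left-hand side and mergeExisting is true, add() merges the replacement lists via mergeTokens(), stacking the alternatives at shared positions:

// Hypothetical rules "usa => united states" and "usa => america", merged onto one trie node.
SynonymMap map = new SynonymMap(false);
map.add(Arrays.asList("usa"), SynonymMap.makeTokens(Arrays.asList("united", "states")), false, true);
map.add(Arrays.asList("usa"), SynonymMap.makeTokens(Arrays.asList("america")), false, true);
// The node map.submap.get("usa") now carries the merged synonyms; following mergeTokens(),
// "united" and "america" share position 1 and "states" sits at position 2: [united/america states].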

@@ -42,6 +42,117 @@ public class Strings {
    private static final char EXTENSION_SEPARATOR = '.';


    /**
     * Splits a backslash escaped string on the separator.
     * <p>
     * Current backslash escaping supported:
     * <br> \n \t \r \b \f are escaped the same as a Java String
     * <br> Other characters following a backslash are produced verbatim (\c => c)
     *
     * @param s the string to split
     * @param separator the separator to split on
     * @param decode decode backslash escaping
     */
    public static List<String> splitSmart(String s, String separator, boolean decode) {
        ArrayList<String> lst = new ArrayList<String>(2);
        StringBuilder sb = new StringBuilder();
        int pos = 0, end = s.length();
        while (pos < end) {
            if (s.startsWith(separator, pos)) {
                if (sb.length() > 0) {
                    lst.add(sb.toString());
                    sb = new StringBuilder();
                }
                pos += separator.length();
                continue;
            }

            char ch = s.charAt(pos++);
            if (ch == '\\') {
                if (!decode) sb.append(ch);
                if (pos >= end) break; // ERROR, or let it go?
                ch = s.charAt(pos++);
                if (decode) {
                    switch (ch) {
                        case 'n':
                            ch = '\n';
                            break;
                        case 't':
                            ch = '\t';
                            break;
                        case 'r':
                            ch = '\r';
                            break;
                        case 'b':
                            ch = '\b';
                            break;
                        case 'f':
                            ch = '\f';
                            break;
                    }
                }
            }

            sb.append(ch);
        }

        if (sb.length() > 0) {
            lst.add(sb.toString());
        }

        return lst;
    }


    public static List<String> splitWS(String s, boolean decode) {
        ArrayList<String> lst = new ArrayList<String>(2);
        StringBuilder sb = new StringBuilder();
        int pos = 0, end = s.length();
        while (pos < end) {
            char ch = s.charAt(pos++);
            if (Character.isWhitespace(ch)) {
                if (sb.length() > 0) {
                    lst.add(sb.toString());
                    sb = new StringBuilder();
                }
                continue;
            }

            if (ch == '\\') {
                if (!decode) sb.append(ch);
                if (pos >= end) break; // ERROR, or let it go?
                ch = s.charAt(pos++);
                if (decode) {
                    switch (ch) {
                        case 'n':
                            ch = '\n';
                            break;
                        case 't':
                            ch = '\t';
                            break;
                        case 'r':
                            ch = '\r';
                            break;
                        case 'b':
                            ch = '\b';
                            break;
                        case 'f':
                            ch = '\f';
                            break;
                    }
                }
            }

            sb.append(ch);
        }

        if (sb.length() > 0) {
            lst.add(sb.toString());
        }

        return lst;
    }

    //---------------------------------------------------------------------
    // General convenience methods for working with Strings
    //---------------------------------------------------------------------
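(Illustrative fragment, not from the commit.) On a typical synonym rule line the two helpers behave as follows; the expected values follow directly from the code above:

// "=>" separates source from replacement, "," separates alternatives, whitespace separates tokens.
List<String> sides = Strings.splitSmart("i-pod, i pod => ipod", "=>", false);   // ["i-pod, i pod ", " ipod"]
List<String> alternatives = Strings.splitSmart(sides.get(0), ",", false);       // ["i-pod", " i pod "]
List<String> tokens = Strings.splitWS(alternatives.get(1), true);               // ["i", "pod"]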

@@ -19,7 +19,6 @@

package org.elasticsearch.index.analysis;

-import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
import org.apache.lucene.analysis.br.BrazilianAnalyzer;

@@ -56,12 +55,12 @@ import org.elasticsearch.common.collect.MapBuilder;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;

+import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
+import java.io.Reader;
import java.net.URL;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Set;
+import java.util.*;

/**
 * @author kimchy (shay.banon)

@@ -140,7 +139,7 @@ public class Analysis {
            }
            return setStopWords;
        }
-        Set<String> pathLoadedStopWords = getWordList(env, settings, "stopwords");
+        Set<String> pathLoadedStopWords = getWordSet(env, settings, "stopwords");
        if (pathLoadedStopWords != null) {
            Set setStopWords = new HashSet<String>();
            for (String stopWord : pathLoadedStopWords) {

@@ -156,6 +155,14 @@ public class Analysis {
        return defaultStopWords;
    }

+    public static Set<String> getWordSet(Environment env, Settings settings, String settingsPrefix) {
+        List<String> wordList = getWordList(env, settings, settingsPrefix);
+        if (wordList == null) {
+            return null;
+        }
+        return new HashSet<String>(wordList);
+    }
+
    /**
     * Fetches a list of words from the specified settings file. The list should either be available at the key
     * specified by settingsPrefix or in a file specified by settingsPrefix + _path.

@@ -163,7 +170,7 @@ public class Analysis {
     * @throws ElasticSearchIllegalArgumentException
     *         If the word list cannot be found at either key.
     */
-    public static Set<String> getWordList(Environment env, Settings settings, String settingPrefix) {
+    public static List<String> getWordList(Environment env, Settings settings, String settingPrefix) {
        String wordListPath = settings.get(settingPrefix + "_path", null);

        if (wordListPath == null) {

@@ -171,17 +178,42 @@ public class Analysis {
            if (explicitWordList == null) {
                return null;
            } else {
-                return new HashSet<String>(Arrays.asList(explicitWordList));
+                return Arrays.asList(explicitWordList);
            }
        }

        URL wordListFile = env.resolveConfig(wordListPath);

        try {
-            return WordlistLoader.getWordSet(new InputStreamReader(wordListFile.openStream(), Charsets.UTF_8), "#");
+            return loadWordList(new InputStreamReader(wordListFile.openStream(), Charsets.UTF_8), "#");
        } catch (IOException ioe) {
            String message = String.format("IOException while reading %s_path: %s", settingPrefix, ioe.getMessage());
            throw new ElasticSearchIllegalArgumentException(message);
        }
    }

+    public static List<String> loadWordList(Reader reader, String comment) throws IOException {
+        final List<String> result = new ArrayList<String>();
+        BufferedReader br = null;
+        try {
+            if (reader instanceof BufferedReader) {
+                br = (BufferedReader) reader;
+            } else {
+                br = new BufferedReader(reader);
+            }
+            String word = null;
+            while ((word = br.readLine()) != null) {
+                if (!Strings.hasText(word)) {
+                    continue;
+                }
+                if (!word.startsWith(comment)) {
+                    result.add(word.trim());
+                }
+            }
+        } finally {
+            if (br != null)
+                br.close();
+        }
+        return result;
+    }
}
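(Illustrative fragment, not from the commit.) The new loadWordList() reads one word per line, skipping blank lines and lines that start with the comment marker; getWordSet() simply wraps the resulting list in a HashSet:

List<String> words = Analysis.loadWordList(new StringReader("# stopwords\nfoo\n\nbar\n"), "#");
// words => ["foo", "bar"]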
|
||||||
|
|
|
@ -326,9 +326,6 @@ public class AnalysisModule extends AbstractModule {
            tokenFiltersBindings.processTokenFilter("edgeNGram", EdgeNGramTokenFilterFactory.class);
            tokenFiltersBindings.processTokenFilter("edge_ngram", EdgeNGramTokenFilterFactory.class);
            tokenFiltersBindings.processTokenFilter("shingle", ShingleTokenFilterFactory.class);
-            tokenFiltersBindings.processTokenFilter("phonetic", PhoneticTokenFilterFactory.class);
-            tokenFiltersBindings.processTokenFilter("dictionary_decompounder", DictionaryCompoundWordTokenFilterFactory.class);
-            tokenFiltersBindings.processTokenFilter("hypennation_decompounder", HyphenationCompoundWordTokenFilterFactory.class);
        }

        @Override public void processTokenizers(TokenizersBindings tokenizersBindings) {

@@ -362,6 +359,11 @@ public class AnalysisModule extends AbstractModule {
            tokenFiltersBindings.processTokenFilter("snowball", SnowballTokenFilterFactory.class);
            tokenFiltersBindings.processTokenFilter("stemmer", StemmerTokenFilterFactory.class);
            tokenFiltersBindings.processTokenFilter("word_delimiter", WordDelimiterTokenFilterFactory.class);
+            tokenFiltersBindings.processTokenFilter("synonym", SynonymTokenFilterFactory.class);
+
+            tokenFiltersBindings.processTokenFilter("phonetic", PhoneticTokenFilterFactory.class);
+            tokenFiltersBindings.processTokenFilter("dictionary_decompounder", DictionaryCompoundWordTokenFilterFactory.class);
+            tokenFiltersBindings.processTokenFilter("hypennation_decompounder", HyphenationCompoundWordTokenFilterFactory.class);

            tokenFiltersBindings.processTokenFilter("arabic_stem", ArabicStemTokenFilterFactory.class);
            tokenFiltersBindings.processTokenFilter("brazilian_stem", BrazilianStemTokenFilterFactory.class);

@@ -0,0 +1,139 @@
/*
 * Licensed to Elastic Search and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Elastic Search licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.io.FastStringReader;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

@AnalysisSettingsRequired
public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {

    private final SynonymMap synonymMap;

    @Inject public SynonymTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, Map<String, TokenizerFactoryFactory> tokenizerFactories,
                                             @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);

        List<String> rules = Analysis.getWordList(env, settings, "synonyms");
        if (rules == null) {
            throw new ElasticSearchIllegalArgumentException("synonym requires either `synonyms` or `synonyms_path` to be configured");
        }
        boolean ignoreCase = settings.getAsBoolean("ignore_case", false);
        boolean expand = settings.getAsBoolean("expand", true);

        TokenizerFactoryFactory tokenizerFactoryFactory = tokenizerFactories.get(settings.get("tokenizer", "whitespace"));
        TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.create(settings.get("tokenizer", "whitespace"), settings);
        synonymMap = new SynonymMap(ignoreCase);
        parseRules(rules, synonymMap, "=>", ",", expand, tokenizerFactory);
    }

    @Override public TokenStream create(TokenStream tokenStream) {
        return new SynonymFilter(tokenStream, synonymMap);
    }

    static void parseRules(List<String> rules, SynonymMap map, String mappingSep,
                           String synSep, boolean expansion, TokenizerFactory tokFactory) {
        int count = 0;
        for (String rule : rules) {
            // To use regexes, we need an expression that specifies an odd number of chars.
            // This can't really be done with string.split(), and since we need to
            // do unescaping at some point anyway, we wouldn't be saving any effort
            // by using regexes.

            List<String> mapping = Strings.splitSmart(rule, mappingSep, false);

            List<List<String>> source;
            List<List<String>> target;

            if (mapping.size() > 2) {
                throw new RuntimeException("Invalid Synonym Rule:" + rule);
            } else if (mapping.size() == 2) {
                source = getSynList(mapping.get(0), synSep, tokFactory);
                target = getSynList(mapping.get(1), synSep, tokFactory);
            } else {
                source = getSynList(mapping.get(0), synSep, tokFactory);
                if (expansion) {
                    // expand to all arguments
                    target = source;
                } else {
                    // reduce to first argument
                    target = new ArrayList<List<String>>(1);
                    target.add(source.get(0));
                }
            }

            boolean includeOrig = false;
            for (List<String> fromToks : source) {
                count++;
                for (List<String> toToks : target) {
                    map.add(fromToks,
                            SynonymMap.makeTokens(toToks),
                            includeOrig,
                            true
                    );
                }
            }
        }
    }

    // a , b c , d e f => [[a],[b,c],[d,e,f]]
    private static List<List<String>> getSynList(String str, String separator, TokenizerFactory tokFactory) {
        List<String> strList = Strings.splitSmart(str, separator, false);
        // now split on whitespace to get a list of token strings
        List<List<String>> synList = new ArrayList<List<String>>();
        for (String toks : strList) {
            List<String> tokList = tokFactory == null ?
                    Strings.splitWS(toks, true) : splitByTokenizer(toks, tokFactory);
            synList.add(tokList);
        }
        return synList;
    }

    private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory) {
        TokenStream ts = tokFactory.create(new FastStringReader(source));
        List<String> tokList = new ArrayList<String>();
        try {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            while (ts.incrementToken()) {
                if (termAtt.length() > 0)
                    tokList.add(termAtt.toString());
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return tokList;
    }
}
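(Illustrative fragment, not from the commit.) With the factory registered under the name "synonym" (see the AnalysisModule change above), a filter could be configured roughly like this. The setting keys come from the constructor above; the filter name "my_synonyms" and the index.analysis.filter.* prefix are assumptions following the usual analysis-settings convention:

Settings settings = ImmutableSettings.settingsBuilder()
        .put("index.analysis.filter.my_synonyms.type", "synonym")
        .put("index.analysis.filter.my_synonyms.synonyms.0", "i-pod, i pod => ipod")
        .put("index.analysis.filter.my_synonyms.synonyms.1", "universe, cosmos")
        .put("index.analysis.filter.my_synonyms.ignore_case", "true")     // default: false
        .put("index.analysis.filter.my_synonyms.expand", "true")          // default: true
        .put("index.analysis.filter.my_synonyms.tokenizer", "whitespace") // tokenizer used to split rule tokens
        .build();
// Alternatively, "synonyms_path" may point to a config file with one rule per line ("#" starts a comment).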

@@ -58,7 +58,7 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
        // . => DIGIT
        // \u002C => DIGIT
        // \u200D => ALPHANUM
-        Set<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
+        List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
        if (charTypeTableValues == null) {
            this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
        } else {

@@ -84,7 +84,7 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
        // If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
        this.stemEnglishPossessive = settings.getAsBoolean("stem_english_possessive", true);
        // If not null is the set of tokens to protect from being delimited
-        Set<String> protectedWords = Analysis.getWordList(env, settings, "protected_words");
+        Set<String> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
        this.protoWords = protectedWords == null ? null : CharArraySet.copy(Lucene.VERSION, protectedWords);
    }
|
||||||
|
|
||||||
|
|
|
@ -53,7 +53,7 @@ public abstract class AbstractCompoundWordTokenFilterFactory extends AbstractTok
        minSubwordSize = settings.getAsInt("min_subword_size", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
        maxSubwordSize = settings.getAsInt("max_subword_size", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
        onlyLongestMatch = settings.getAsBoolean("only_longest_max", false);
-        wordList = Analysis.getWordList(env, settings, "word_list");
+        wordList = Analysis.getWordSet(env, settings, "word_list");
        if (wordList == null) {
            throw new ElasticSearchIllegalArgumentException("word_list must be provided for [" + name + "], either as a path to a file, or directly");
        }
|
||||||
|
|
|
@ -124,7 +124,7 @@ public class AnalysisModuleTests {
        assertThat(dictionaryDecompounderAnalyze.tokenFilters().length, equalTo(1));
        assertThat(dictionaryDecompounderAnalyze.tokenFilters()[0], instanceOf(DictionaryCompoundWordTokenFilterFactory.class));

-        Set<String> wordList = Analysis.getWordList(null, settings, "index.analysis.filter.dict_dec.word_list");
+        Set<String> wordList = Analysis.getWordSet(null, settings, "index.analysis.filter.dict_dec.word_list");
        MatcherAssert.assertThat(wordList.size(), equalTo(6));
        MatcherAssert.assertThat(wordList, hasItems("donau", "dampf", "schiff", "spargel", "creme", "suppe"));
    }

@@ -136,7 +136,7 @@ public class AnalysisModuleTests {
        File wordListFile = generateWordList(words);
        Settings settings = settingsBuilder().loadFromSource("index: \n word_list_path: " + wordListFile.getAbsolutePath()).build();

-        Set<String> wordList = Analysis.getWordList(env, settings, "index.word_list");
+        Set<String> wordList = Analysis.getWordSet(env, settings, "index.word_list");
        MatcherAssert.assertThat(wordList.size(), equalTo(6));
        MatcherAssert.assertThat(wordList, hasItems(words));
    }
|
||||||
|
|
Loading…
Reference in New Issue