Adding Wordnet synonym format

parent 0f2b875df9
commit f7747d7ff9
@@ -241,4 +241,30 @@ public class Analysis {
         }
         return result;
     }
+
+    /**
+     * @return null if the "settingPrefix + _path" setting is not set.
+     *
+     * @throws ElasticSearchIllegalArgumentException
+     *          If the Reader cannot be instantiated.
+     */
+    public static Reader getFileReader(Environment env, Settings settings, String settingPrefix) {
+        String filePath = settings.get(settingPrefix + "_path", null);
+
+        if (filePath == null) {
+            return null;
+        }
+
+        URL fileUrl = env.resolveConfig(filePath);
+
+        Reader reader = null;
+        try {
+            reader = new InputStreamReader(fileUrl.openStream(), Charsets.UTF_8);
+        } catch (IOException ioe) {
+            String message = String.format("IOException while reading %s_path: %s", settingPrefix, ioe.getMessage());
+            throw new ElasticSearchIllegalArgumentException(message);
+        }
+
+        return reader;
+    }
 }
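For context: the helper resolves the file named by the "settingPrefix + _path" setting against the node's config directory and opens it as UTF-8. A minimal usage sketch, assuming the era's ImmutableSettings builder and an existing Environment instance (the path value is illustrative, not from the commit):

    Settings settings = ImmutableSettings.settingsBuilder()
            .put("synonyms_path", "analysis/synonyms.txt") // hypothetical file under config/
            .build();
    // Returns a UTF-8 Reader over the resolved file, null when "synonyms_path"
    // is unset, and throws ElasticSearchIllegalArgumentException when the file
    // cannot be opened.
    Reader rulesReader = Analysis.getFileReader(environment, settings, "synonyms");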
@@ -25,8 +25,10 @@ import org.apache.lucene.analysis.ReusableAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.synonym.SolrSynonymParser;
 import org.apache.lucene.analysis.synonym.SynonymFilter;
 import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
 import org.apache.lucene.util.CharsRef;
 import org.elasticsearch.ElasticSearchIllegalArgumentException;
 import org.elasticsearch.common.inject.Inject;
@@ -57,8 +59,8 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
                                              @Assisted String name, @Assisted Settings settings) {
         super(index, indexSettings, name, settings);
 
-        List<String> rules = Analysis.getWordList(env, settings, "synonyms");
-        if (rules == null) {
+        Reader rulesReader = Analysis.getFileReader(env, settings, "synonyms");
+        if (rulesReader == null) {
             throw new ElasticSearchIllegalArgumentException("synonym requires either `synonyms` or `synonyms_path` to be configured");
         }
         this.ignoreCase = settings.getAsBoolean("ignore_case", false);
@@ -71,7 +73,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
             tokenizerFactoryFactory = indicesAnalysisService.tokenizerFactoryFactory(tokenizerName);
         }
         if (tokenizerFactoryFactory == null) {
-            throw new ElasticSearchIllegalArgumentException("failed to fine tokenizer [" + tokenizerName + "] for synonym token filter");
+            throw new ElasticSearchIllegalArgumentException("failed to find tokenizer [" + tokenizerName + "] for synonym token filter");
         }
         final TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.create(tokenizerName, settings);
 
@@ -84,13 +86,18 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
             }
         };
 
-        CustomSynonymParser parser = new CustomSynonymParser(true, expand, analyzer);
+        SynonymMap.Builder parser = null;
+
         try {
-            for (String rule : rules) {
-                parser.addLine(rule);
+            if (settings.get("format","wordnet").equalsIgnoreCase("wordnet")) {
+                parser = new WordnetSynonymParser(true, expand, analyzer);
+                ((WordnetSynonymParser)parser).add(rulesReader);
+            } else {
+                parser = new SolrSynonymParser(true, expand, analyzer);
+                ((SolrSynonymParser)parser).add(rulesReader);
             }
             synonymMap = parser.build();
-        } catch (IOException e) {
+        } catch (Exception e) {
             throw new ElasticSearchIllegalArgumentException("failed to build synonyms", e);
         }
     }
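Note that the format lookup defaults to wordnet: settings.get("format","wordnet") returns "wordnet" when no format is configured, so the Solr parser is only selected by an explicit other value. For reference, sample rules in each format (illustrative entries, not from the commit).

Solr format, per the syntax documented on the parser removed below:

    i-pod, i pod => ipod
    ipod, i-pod, i pod

Wordnet format, the prolog wn_s.pl style consumed by WordnetSynonymParser, where entries sharing a synset id are synonyms:

    s(100000001,1,'abstain',v,1,0).
    s(100000001,2,'refrain',v,1,0).
    s(100000001,3,'desist',v,1,0).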
@@ -99,160 +106,4 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
         // fst is null means no synonyms
         return synonymMap.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonymMap, ignoreCase);
     }
-
-    /**
-     * Parser for the Solr synonyms format.
-     * <ol>
-     * <li> Blank lines and lines starting with '#' are comments.
-     * <li> Explicit mappings match any token sequence on the LHS of "=>"
-     * and replace with all alternatives on the RHS. These types of mappings
-     * ignore the expand parameter in the constructor.
-     * Example:
-     * <blockquote>i-pod, i pod => ipod</blockquote>
-     * <li> Equivalent synonyms may be separated with commas and give
-     * no explicit mapping. In this case the mapping behavior will
-     * be taken from the expand parameter in the constructor. This allows
-     * the same synonym file to be used in different synonym handling strategies.
-     * Example:
-     * <blockquote>ipod, i-pod, i pod</blockquote>
-     *
-     * <li> Multiple synonym mapping entries are merged.
-     * Example:
-     * <blockquote>
-     * foo => foo bar<br>
-     * foo => baz<br><br>
-     * is equivalent to<br><br>
-     * foo => foo bar, baz
-     * </blockquote>
-     * </ol>
-     *
-     * @lucene.experimental
-     */
-    public static class CustomSynonymParser extends SynonymMap.Builder {
-        private final boolean expand;
-        private final Analyzer analyzer;
-
-        public CustomSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
-            super(dedup);
-            this.expand = expand;
-            this.analyzer = analyzer;
-        }
-
-        public void add(Reader in) throws IOException, ParseException {
-            LineNumberReader br = new LineNumberReader(in);
-            try {
-                addInternal(br);
-            } catch (IllegalArgumentException e) {
-                ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
-                ex.initCause(e);
-                throw ex;
-            } finally {
-                br.close();
-            }
-        }
-
-        public void addLine(String line) throws IOException {
-            if (line.length() == 0 || line.charAt(0) == '#') {
-                return;
-            }
-
-            CharsRef inputs[];
-            CharsRef outputs[];
-
-            // TODO: we could process this more efficiently.
-            String sides[] = split(line, "=>");
-            if (sides.length > 1) { // explicit mapping
-                if (sides.length != 2) {
-                    throw new IllegalArgumentException("more than one explicit mapping specified on the same line");
-                }
-                String inputStrings[] = split(sides[0], ",");
-                inputs = new CharsRef[inputStrings.length];
-                for (int i = 0; i < inputs.length; i++) {
-                    inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
-                }
-
-                String outputStrings[] = split(sides[1], ",");
-                outputs = new CharsRef[outputStrings.length];
-                for (int i = 0; i < outputs.length; i++) {
-                    outputs[i] = analyze(analyzer, unescape(outputStrings[i]).trim(), new CharsRef());
-                }
-            } else {
-                String inputStrings[] = split(line, ",");
-                inputs = new CharsRef[inputStrings.length];
-                for (int i = 0; i < inputs.length; i++) {
-                    inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
-                }
-                if (expand) {
-                    outputs = inputs;
-                } else {
-                    outputs = new CharsRef[1];
-                    outputs[0] = inputs[0];
-                }
-            }
-
-            // currently we include the term itself in the map,
-            // and use includeOrig = false always.
-            // this is how the existing filter does it, but its actually a bug,
-            // especially if combined with ignoreCase = true
-            for (int i = 0; i < inputs.length; i++) {
-                for (int j = 0; j < outputs.length; j++) {
-                    add(inputs[i], outputs[j], false);
-                }
-            }
-        }
-
-        private void addInternal(BufferedReader in) throws IOException {
-            String line = null;
-            while ((line = in.readLine()) != null) {
-                addLine(line);
-            }
-        }
-
-        private static String[] split(String s, String separator) {
-            ArrayList<String> list = new ArrayList<String>(2);
-            StringBuilder sb = new StringBuilder();
-            int pos = 0, end = s.length();
-            while (pos < end) {
-                if (s.startsWith(separator, pos)) {
-                    if (sb.length() > 0) {
-                        list.add(sb.toString());
-                        sb = new StringBuilder();
-                    }
-                    pos += separator.length();
-                    continue;
-                }
-
-                char ch = s.charAt(pos++);
-                if (ch == '\\') {
-                    sb.append(ch);
-                    if (pos >= end) break; // ERROR, or let it go?
-                    ch = s.charAt(pos++);
-                }
-
-                sb.append(ch);
-            }
-
-            if (sb.length() > 0) {
-                list.add(sb.toString());
-            }
-
-            return list.toArray(new String[list.size()]);
-        }
-
-        private String unescape(String s) {
-            if (s.indexOf("\\") >= 0) {
-                StringBuilder sb = new StringBuilder();
-                for (int i = 0; i < s.length(); i++) {
-                    char ch = s.charAt(i);
-                    if (ch == '\\' && i < s.length() - 1) {
-                        sb.append(s.charAt(++i));
-                    } else {
-                        sb.append(ch);
-                    }
-                }
-                return sb.toString();
-            }
-            return s;
-        }
-    }
 }
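To make the new flow concrete outside the factory, a hedged sketch of the Lucene calls it now delegates to (the Lucene 3.6-era constructors and the Version constant are assumptions; exception handling is omitted):

    // Parse Solr-format rules into a SynonymMap, then wrap a stream in SynonymFilter.
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_36);
    SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer); // dedup, expand
    parser.add(new StringReader("i-pod, i pod => ipod")); // throws IOException, ParseException
    SynonymMap synonymMap = parser.build();

    TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_36, new StringReader("i pod"));
    stream = new SynonymFilter(stream, synonymMap, true); // ignoreCase = true, as in the factory's return above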