Adding Wordnet synonym format

Lukáš Vlček 2011-10-14 16:41:10 +02:00 committed by Shay Banon
parent 0f2b875df9
commit f7747d7ff9
2 changed files with 40 additions and 163 deletions
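
The WordNet format referred to here is the Prolog export of WordNet (wn_s.pl), which Lucene's WordnetSynonymParser consumes: each s(...) fact names one word of a synset, and words sharing a synset id become synonyms. An illustrative three-entry file (ids and words made up):

    s(100000001,1,'abstain',v,1,0).
    s(100000001,2,'refrain',v,1,0).
    s(100000001,3,'desist',v,1,0).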


@@ -241,4 +241,30 @@ public class Analysis {
         }
         return result;
     }
+
+    /**
+     * @return a UTF-8 <code>Reader</code> over the file configured under
+     *         <code>settingPrefix + "_path"</code>, or <code>null</code> if that
+     *         setting is not set
+     * @throws ElasticSearchIllegalArgumentException
+     *          if the Reader can not be instantiated
+     */
+    public static Reader getFileReader(Environment env, Settings settings, String settingPrefix) {
+        String filePath = settings.get(settingPrefix + "_path", null);
+        if (filePath == null) {
+            return null;
+        }
+        URL fileUrl = env.resolveConfig(filePath);
+        Reader reader = null;
+        try {
+            reader = new InputStreamReader(fileUrl.openStream(), Charsets.UTF_8);
+        } catch (IOException ioe) {
+            String message = String.format("IOException while reading %s_path: %s", settingPrefix, ioe.getMessage());
+            throw new ElasticSearchIllegalArgumentException(message);
+        }
+        return reader;
+    }
 }
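
A quick sketch of how a caller might use the new helper (the call site below mirrors the factory change in the next file; the comments are editorial, not part of the commit):

    // "synonyms" resolves against the "synonyms_path" setting; null means the
    // setting is absent, while an unreadable file surfaces as
    // ElasticSearchIllegalArgumentException.
    Reader rulesReader = Analysis.getFileReader(env, settings, "synonyms");
    if (rulesReader == null) {
        // not configured: fall back or report a configuration error
    }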


@@ -25,8 +25,10 @@ import org.apache.lucene.analysis.ReusableAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.synonym.SolrSynonymParser;
 import org.apache.lucene.analysis.synonym.SynonymFilter;
 import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
 import org.apache.lucene.util.CharsRef;
 import org.elasticsearch.ElasticSearchIllegalArgumentException;
 import org.elasticsearch.common.inject.Inject;
@@ -57,8 +59,8 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
                                     @Assisted String name, @Assisted Settings settings) {
         super(index, indexSettings, name, settings);
-        List<String> rules = Analysis.getWordList(env, settings, "synonyms");
-        if (rules == null) {
+        Reader rulesReader = Analysis.getFileReader(env, settings, "synonyms");
+        if (rulesReader == null) {
             throw new ElasticSearchIllegalArgumentException("synonym requires either `synonyms` or `synonyms_path` to be configured");
         }

         this.ignoreCase = settings.getAsBoolean("ignore_case", false);
@@ -71,7 +73,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
             tokenizerFactoryFactory = indicesAnalysisService.tokenizerFactoryFactory(tokenizerName);
         }
         if (tokenizerFactoryFactory == null) {
-            throw new ElasticSearchIllegalArgumentException("failed to fine tokenizer [" + tokenizerName + "] for synonym token filter");
+            throw new ElasticSearchIllegalArgumentException("failed to find tokenizer [" + tokenizerName + "] for synonym token filter");
         }

         final TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.create(tokenizerName, settings);
@@ -84,13 +86,18 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
             }
         };

-        CustomSynonymParser parser = new CustomSynonymParser(true, expand, analyzer);
         try {
-            for (String rule : rules) {
-                parser.addLine(rule);
+            SynonymMap.Builder parser = null;
+            if (settings.get("format","wordnet").equalsIgnoreCase("wordnet")) {
+                parser = new WordnetSynonymParser(true, expand, analyzer);
+                ((WordnetSynonymParser)parser).add(rulesReader);
+            } else {
+                parser = new SolrSynonymParser(true, expand, analyzer);
+                ((SolrSynonymParser)parser).add(rulesReader);
             }
+
             synonymMap = parser.build();
-        } catch (IOException e) {
+        } catch (Exception e) {
             throw new ElasticSearchIllegalArgumentException("failed to build synonyms", e);
         }
     }
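
The new `format` setting drives the branch above: `wordnet` is matched case-insensitively (and is the default in this code), while any other value falls through to the Solr parser. A hypothetical filter definition in index settings, assuming the standard `synonym` filter type and a config-relative rules file:

    index:
      analysis:
        filter:
          my_synonyms:
            type: synonym
            format: wordnet                  # any other value selects SolrSynonymParser
            synonyms_path: analysis/wn_s.pl  # illustrative path, resolved via env.resolveConfig
            ignore_case: false
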
@@ -99,160 +106,4 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
         // fst is null means no synonyms
         return synonymMap.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonymMap, ignoreCase);
     }
-
-    /**
-     * Parser for the Solr synonyms format.
-     * <ol>
-     * <li> Blank lines and lines starting with '#' are comments.
-     * <li> Explicit mappings match any token sequence on the LHS of "=>"
-     * and replace with all alternatives on the RHS. These types of mappings
-     * ignore the expand parameter in the constructor.
-     * Example:
-     * <blockquote>i-pod, i pod => ipod</blockquote>
-     * <li> Equivalent synonyms may be separated with commas and give
-     * no explicit mapping. In this case the mapping behavior will
-     * be taken from the expand parameter in the constructor. This allows
-     * the same synonym file to be used in different synonym handling strategies.
-     * Example:
-     * <blockquote>ipod, i-pod, i pod</blockquote>
-     *
-     * <li> Multiple synonym mapping entries are merged.
-     * Example:
-     * <blockquote>
-     * foo => foo bar<br>
-     * foo => baz<br><br>
-     * is equivalent to<br><br>
-     * foo => foo bar, baz
-     * </blockquote>
-     * </ol>
-     *
-     * @lucene.experimental
-     */
-    public static class CustomSynonymParser extends SynonymMap.Builder {
-        private final boolean expand;
-        private final Analyzer analyzer;
-
-        public CustomSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
-            super(dedup);
-            this.expand = expand;
-            this.analyzer = analyzer;
-        }
-
-        public void add(Reader in) throws IOException, ParseException {
-            LineNumberReader br = new LineNumberReader(in);
-            try {
-                addInternal(br);
-            } catch (IllegalArgumentException e) {
-                ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
-                ex.initCause(e);
-                throw ex;
-            } finally {
-                br.close();
-            }
-        }
-
-        public void addLine(String line) throws IOException {
-            if (line.length() == 0 || line.charAt(0) == '#') {
-                return;
-            }
-
-            CharsRef inputs[];
-            CharsRef outputs[];
-
-            // TODO: we could process this more efficiently.
-            String sides[] = split(line, "=>");
-            if (sides.length > 1) { // explicit mapping
-                if (sides.length != 2) {
-                    throw new IllegalArgumentException("more than one explicit mapping specified on the same line");
-                }
-                String inputStrings[] = split(sides[0], ",");
-                inputs = new CharsRef[inputStrings.length];
-                for (int i = 0; i < inputs.length; i++) {
-                    inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
-                }
-
-                String outputStrings[] = split(sides[1], ",");
-                outputs = new CharsRef[outputStrings.length];
-                for (int i = 0; i < outputs.length; i++) {
-                    outputs[i] = analyze(analyzer, unescape(outputStrings[i]).trim(), new CharsRef());
-                }
-            } else {
-                String inputStrings[] = split(line, ",");
-                inputs = new CharsRef[inputStrings.length];
-                for (int i = 0; i < inputs.length; i++) {
-                    inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
-                }
-                if (expand) {
-                    outputs = inputs;
-                } else {
-                    outputs = new CharsRef[1];
-                    outputs[0] = inputs[0];
-                }
-            }
-
-            // currently we include the term itself in the map,
-            // and use includeOrig = false always.
-            // this is how the existing filter does it, but it's actually a bug,
-            // especially if combined with ignoreCase = true
-            for (int i = 0; i < inputs.length; i++) {
-                for (int j = 0; j < outputs.length; j++) {
-                    add(inputs[i], outputs[j], false);
-                }
-            }
-        }
-
-        private void addInternal(BufferedReader in) throws IOException {
-            String line = null;
-            while ((line = in.readLine()) != null) {
-                addLine(line);
-            }
-        }
-
-        private static String[] split(String s, String separator) {
-            ArrayList<String> list = new ArrayList<String>(2);
-            StringBuilder sb = new StringBuilder();
-            int pos = 0, end = s.length();
-            while (pos < end) {
-                if (s.startsWith(separator, pos)) {
-                    if (sb.length() > 0) {
-                        list.add(sb.toString());
-                        sb = new StringBuilder();
-                    }
-                    pos += separator.length();
-                    continue;
-                }
-
-                char ch = s.charAt(pos++);
-                if (ch == '\\') {
-                    sb.append(ch);
-                    if (pos >= end) break; // ERROR, or let it go?
-                    ch = s.charAt(pos++);
-                }
-
-                sb.append(ch);
-            }
-
-            if (sb.length() > 0) {
-                list.add(sb.toString());
-            }
-
-            return list.toArray(new String[list.size()]);
-        }
-
-        private String unescape(String s) {
-            if (s.indexOf("\\") >= 0) {
-                StringBuilder sb = new StringBuilder();
-                for (int i = 0; i < s.length(); i++) {
-                    char ch = s.charAt(i);
-                    if (ch == '\\' && i < s.length() - 1) {
-                        sb.append(s.charAt(++i));
-                    } else {
-                        sb.append(ch);
-                    }
-                }
-                return sb.toString();
-            }
-            return s;
-        }
-    }
 }
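
For contrast with the WordNet example above, the Solr rules format that the removed Javadoc describes (and that the new SolrSynonymParser path still accepts) looks like this; the entries are illustrative:

    # Blank lines and lines starting with '#' are comments.
    # Explicit mapping (ignores the expand flag):
    i-pod, i pod => ipod
    # Equivalent synonyms (behavior taken from the expand flag):
    ipod, i-pod, i pod
    # Two entries for the same input merge: foo => foo bar, baz
    foo => foo bar
    foo => baz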