From f7747d7ff9825cfead2701e973a9d0ee0b4ef92f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Vl=C4=8Dek?= Date: Fri, 14 Oct 2011 16:41:10 +0200 Subject: [PATCH] Adding Wordnet synonym format --- .../index/analysis/Analysis.java | 26 +++ .../analysis/SynonymTokenFilterFactory.java | 177 ++---------------- 2 files changed, 40 insertions(+), 163 deletions(-) diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/Analysis.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/Analysis.java index 12a21b0a91b..81ed2c1e8ff 100644 --- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/Analysis.java +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/Analysis.java @@ -241,4 +241,30 @@ public class Analysis { } return result; } + + /** + * @return a UTF-8 Reader over the config file named by the setting "settingPrefix + _path", or null if that setting is not set. + * + * @throws ElasticSearchIllegalArgumentException + * If the Reader cannot be instantiated. 
+ */ + public static Reader getFileReader(Environment env, Settings settings, String settingPrefix) { + String filePath = settings.get(settingPrefix + "_path", null); + + if (filePath == null) { + return null; + } + + URL fileUrl = env.resolveConfig(filePath); + + Reader reader = null; + try { + reader = new InputStreamReader(fileUrl.openStream(), Charsets.UTF_8); + } catch (IOException ioe) { + String message = String.format("IOException while reading %s_path: %s", settingPrefix, ioe.getMessage()); + throw new ElasticSearchIllegalArgumentException(message, ioe); /* keep the IOException as the cause so the stack trace is not lost */ + } + + return reader; + } } diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java index f3dcce0ec4b..678b16e4109 100644 --- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java @@ -25,8 +25,10 @@ import org.apache.lucene.analysis.ReusableAnalyzerBase; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.WhitespaceTokenizer; +import org.apache.lucene.analysis.synonym.SolrSynonymParser; import org.apache.lucene.analysis.synonym.SynonymFilter; import org.apache.lucene.analysis.synonym.SynonymMap; +import org.apache.lucene.analysis.synonym.WordnetSynonymParser; import org.apache.lucene.util.CharsRef; import org.elasticsearch.ElasticSearchIllegalArgumentException; import org.elasticsearch.common.inject.Inject; @@ -57,8 +59,8 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory { @Assisted String name, @Assisted Settings settings) { super(index, indexSettings, name, settings); - List rules = Analysis.getWordList(env, settings, "synonyms"); - if (rules == null) { + Reader rulesReader = 
Analysis.getFileReader(env, settings, "synonyms"); + if (rulesReader == null) { throw new ElasticSearchIllegalArgumentException("synonym requires either `synonyms` or `synonyms_path` to be configured"); } this.ignoreCase = settings.getAsBoolean("ignore_case", false); @@ -71,7 +73,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory { tokenizerFactoryFactory = indicesAnalysisService.tokenizerFactoryFactory(tokenizerName); } if (tokenizerFactoryFactory == null) { - throw new ElasticSearchIllegalArgumentException("failed to fine tokenizer [" + tokenizerName + "] for synonym token filter"); + throw new ElasticSearchIllegalArgumentException("failed to find tokenizer [" + tokenizerName + "] for synonym token filter"); } final TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.create(tokenizerName, settings); @@ -84,13 +86,18 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory { } }; - CustomSynonymParser parser = new CustomSynonymParser(true, expand, analyzer); try { - for (String rule : rules) { - parser.addLine(rule); + SynonymMap.Builder parser = null; + + if ("wordnet".equalsIgnoreCase(settings.get("format", "solr"))) { /* WordNet only on explicit request; the default stays Solr, the only format parsed before this change */ + parser = new WordnetSynonymParser(true, expand, analyzer); + ((WordnetSynonymParser)parser).add(rulesReader); + } else { + parser = new SolrSynonymParser(true, expand, analyzer); + ((SolrSynonymParser)parser).add(rulesReader); } synonymMap = parser.build(); - } catch (IOException e) { + } catch (Exception e) { throw new ElasticSearchIllegalArgumentException("failed to build synonyms", e); } } @@ -99,160 +106,4 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory { // fst is null means no synonyms return synonymMap.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonymMap, ignoreCase); } - - /** - * Parser for the Solr synonyms format. - *
    - *
  1. Blank lines and lines starting with '#' are comments. - *
  2. Explicit mappings match any token sequence on the LHS of "=>" - * and replace with all alternatives on the RHS. These types of mappings - * ignore the expand parameter in the constructor. - * Example: - *
    i-pod, i pod => ipod
    - *
  3. Equivalent synonyms may be separated with commas and give - * no explicit mapping. In this case the mapping behavior will - * be taken from the expand parameter in the constructor. This allows - * the same synonym file to be used in different synonym handling strategies. - * Example: - *
    ipod, i-pod, i pod
    - * - *
  4. Multiple synonym mapping entries are merged. - * Example: - *
    - * foo => foo bar
    - * foo => baz

    - * is equivalent to

    - * foo => foo bar, baz - *
    - *
- * - * @lucene.experimental - */ - public static class CustomSynonymParser extends SynonymMap.Builder { - private final boolean expand; - private final Analyzer analyzer; - - public CustomSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) { - super(dedup); - this.expand = expand; - this.analyzer = analyzer; - } - - public void add(Reader in) throws IOException, ParseException { - LineNumberReader br = new LineNumberReader(in); - try { - addInternal(br); - } catch (IllegalArgumentException e) { - ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0); - ex.initCause(e); - throw ex; - } finally { - br.close(); - } - } - - public void addLine(String line) throws IOException { - if (line.length() == 0 || line.charAt(0) == '#') { - return; - } - - CharsRef inputs[]; - CharsRef outputs[]; - - // TODO: we could process this more efficiently. - String sides[] = split(line, "=>"); - if (sides.length > 1) { // explicit mapping - if (sides.length != 2) { - throw new IllegalArgumentException("more than one explicit mapping specified on the same line"); - } - String inputStrings[] = split(sides[0], ","); - inputs = new CharsRef[inputStrings.length]; - for (int i = 0; i < inputs.length; i++) { - inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef()); - } - - String outputStrings[] = split(sides[1], ","); - outputs = new CharsRef[outputStrings.length]; - for (int i = 0; i < outputs.length; i++) { - outputs[i] = analyze(analyzer, unescape(outputStrings[i]).trim(), new CharsRef()); - } - } else { - String inputStrings[] = split(line, ","); - inputs = new CharsRef[inputStrings.length]; - for (int i = 0; i < inputs.length; i++) { - inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef()); - } - if (expand) { - outputs = inputs; - } else { - outputs = new CharsRef[1]; - outputs[0] = inputs[0]; - } - } - - // currently we include the term itself in the map, - // and use includeOrig = 
false always. - // this is how the existing filter does it, but its actually a bug, - // especially if combined with ignoreCase = true - for (int i = 0; i < inputs.length; i++) { - for (int j = 0; j < outputs.length; j++) { - add(inputs[i], outputs[j], false); - } - } - } - - private void addInternal(BufferedReader in) throws IOException { - String line = null; - while ((line = in.readLine()) != null) { - addLine(line); - } - } - - private static String[] split(String s, String separator) { - ArrayList list = new ArrayList(2); - StringBuilder sb = new StringBuilder(); - int pos = 0, end = s.length(); - while (pos < end) { - if (s.startsWith(separator, pos)) { - if (sb.length() > 0) { - list.add(sb.toString()); - sb = new StringBuilder(); - } - pos += separator.length(); - continue; - } - - char ch = s.charAt(pos++); - if (ch == '\\') { - sb.append(ch); - if (pos >= end) break; // ERROR, or let it go? - ch = s.charAt(pos++); - } - - sb.append(ch); - } - - if (sb.length() > 0) { - list.add(sb.toString()); - } - - return list.toArray(new String[list.size()]); - } - - private String unescape(String s) { - if (s.indexOf("\\") >= 0) { - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < s.length(); i++) { - char ch = s.charAt(i); - if (ch == '\\' && i < s.length() - 1) { - sb.append(s.charAt(++i)); - } else { - sb.append(ch); - } - } - return sb.toString(); - } - return s; - } - } } \ No newline at end of file