From cd527c5b92214505650c1573c172bc7b4cf79062 Mon Sep 17 00:00:00 2001
From: xuzha
Date: Sat, 5 Sep 2015 11:07:59 -0700
Subject: [PATCH] Add support for customizing the rule file in ICU tokenizer

Lucene allows creating an ICUTokenizer with a special config argument
that enables customization of the rule-based break iterator through
custom rule files. This commit enables that feature: users can provide
a list of RBBI rule files to the ICU tokenizer.

closes #13146
---
 docs/plugins/analysis-icu.asciidoc                 |  68 +++++++++
 .../index/analysis/IcuTokenizerFactory.java        |  88 +++++++++++-
 .../analysis/icu/AnalysisICUPlugin.java            |   5 +
 .../analysis/IcuTokenizerFactoryTests.java         | 107 ++++++++++++++
 .../index/analysis/KeywordTokenizer.rbbi           |  21 +++
 .../analysis/Latin-dont-break-on-hyphens.rbbi      | 135 ++++++++++++++++++
 .../index/analysis/icu_analysis.json               |  20 +++
 7 files changed, 443 insertions(+), 1 deletion(-)
 create mode 100644 plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/IcuTokenizerFactoryTests.java
 create mode 100644 plugins/analysis-icu/src/test/resources/org/elasticsearch/index/analysis/KeywordTokenizer.rbbi
 create mode 100644 plugins/analysis-icu/src/test/resources/org/elasticsearch/index/analysis/Latin-dont-break-on-hyphens.rbbi
 create mode 100644 plugins/analysis-icu/src/test/resources/org/elasticsearch/index/analysis/icu_analysis.json

diff --git a/docs/plugins/analysis-icu.asciidoc b/docs/plugins/analysis-icu.asciidoc
index 2a6efadba81..03d0dc0296c 100644
--- a/docs/plugins/analysis-icu.asciidoc
+++ b/docs/plugins/analysis-icu.asciidoc
@@ -115,6 +115,74 @@ PUT icu_sample
 --------------------------------------------------
 // AUTOSENSE
 
+===== Rules customization
+
+experimental[]
+
+You can customize the `icu_tokenizer` behavior by specifying per-script rule files. See the
+http://userguide.icu-project.org/boundaryanalysis#TOC-RBBI-Rules[RBBI rules syntax reference]
+for a more detailed explanation.
+
+To add ICU tokenizer rules, set the `rule_files` setting, which should contain a comma-separated list of
+`code:rulefile` pairs in the following format:
+http://unicode.org/iso15924/iso15924-codes.html[four-letter ISO 15924 script code],
+followed by a colon, then a rule file name. Rule files are placed in the `ES_HOME/config` directory.
+
+As a demonstration of how the rule files can be used, save the following rule file to `$ES_HOME/config/KeywordTokenizer.rbbi`:
+
+[source,text]
+-----------------------
+.+ {200};
+-----------------------
+
+Then create an analyzer to use this rule file as follows:
+
+[source,json]
+--------------------------------------------------
+PUT icu_sample
+{
+  "settings": {
+    "index": {
+      "analysis": {
+        "tokenizer": {
+          "icu_user_file": {
+            "type": "icu_tokenizer",
+            "rule_files": "Latn:KeywordTokenizer.rbbi"
+          }
+        },
+        "analyzer": {
+          "my_analyzer": {
+            "type": "custom",
+            "tokenizer": "icu_user_file"
+          }
+        }
+      }
+    }
+  }
+}
+
+POST icu_sample/_analyze?analyzer=my_analyzer&text=Elasticsearch. Wow!
+--------------------------------------------------
+// AUTOSENSE
+
+The above `analyze` request returns the following:
+
+[source,json]
+--------------------------------------------------
+# Result
+{
+  "tokens": [
+    {
+      "token": "Elasticsearch. Wow!",
+      "start_offset": 0,
+      "end_offset": 19,
+      "type": "<ALPHANUM>",
+      "position": 0
+    }
+  ]
+}
+--------------------------------------------------
+
 [[analysis-icu-normalization]]
 ==== ICU Normalization Token Filter
diff --git a/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java
index 0d2a6cdeb22..c49ceac483d 100644
--- a/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java
+++ b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java
@@ -19,23 +19,109 @@
 
 package org.elasticsearch.index.analysis;
 
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UProperty;
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.RuleBasedBreakIterator;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig;
 import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
+import org.apache.lucene.analysis.icu.segmentation.ICUTokenizerConfig;
+import org.elasticsearch.ElasticsearchException;
+import org.elasticsearch.common.settings.Setting;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+
 /**
  */
 public class IcuTokenizerFactory extends AbstractTokenizerFactory {
 
+    private final ICUTokenizerConfig config;
+    private static final String RULE_FILES = "rule_files";
+
+    public static final Setting<List<String>> SETTING_RULE_FILES =
+        Setting.listSetting(RULE_FILES, Collections.emptyList(), Function.identity(), Setting.Property.IndexScope);
+
     public IcuTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
+        config = getIcuConfig(environment, settings);
     }
 
     @Override
     public Tokenizer create() {
-        return new ICUTokenizer();
+        if (config == null) {
+            return new ICUTokenizer();
+        } else {
+            return new ICUTokenizer(config);
+        }
     }
 
+    private ICUTokenizerConfig getIcuConfig(Environment env, Settings settings) {
+        Map<Integer, String> tailored = new HashMap<>();
+
+        try {
+            List<String> ruleFiles = SETTING_RULE_FILES.get(settings);
+
+            for (String scriptAndResourcePath : ruleFiles) {
+                int colonPos = scriptAndResourcePath.indexOf(":");
+                if (colonPos == -1 || colonPos == scriptAndResourcePath.length() - 1) {
+                    throw new IllegalArgumentException(RULE_FILES + " should contain comma-separated \"code:rulefile\" pairs");
+                }
+
+                String scriptCode = scriptAndResourcePath.substring(0, colonPos).trim();
+                String resourcePath = scriptAndResourcePath.substring(colonPos + 1).trim();
+                tailored.put(UCharacter.getPropertyValueEnum(UProperty.SCRIPT, scriptCode), resourcePath);
+            }
+
+            if (tailored.isEmpty()) {
+                return null;
+            } else {
+                final BreakIterator breakers[] = new BreakIterator[UScript.CODE_LIMIT];
+                for (Map.Entry<Integer, String> entry : tailored.entrySet()) {
+                    int code = entry.getKey();
+                    String resourcePath = entry.getValue();
+                    breakers[code] = parseRules(resourcePath, env);
+                }
+                // cjkAsWords is not configurable yet.
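+                // Tailoring works by subclassing the default config and overriding
+                // getBreakIterator: scripts with a custom rule file get a clone of
+                // their compiled break iterator, every other script falls back to
+                // the Lucene defaults.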
+                ICUTokenizerConfig config = new DefaultICUTokenizerConfig(true) {
+
+                    @Override
+                    public BreakIterator getBreakIterator(int script) {
+                        if (breakers[script] != null) {
+                            return (BreakIterator) breakers[script].clone();
+                        } else {
+                            return super.getBreakIterator(script);
+                        }
+                    }
+                };
+                return config;
+            }
+        } catch (Throwable t) {
+            throw new ElasticsearchException("failed to load ICU rule files", t);
+        }
+    }
+
+    // parse a single RBBI rule file
+    private BreakIterator parseRules(String filename, Environment env) throws IOException {
+
+        final Path path = env.configFile().resolve(filename);
+        String rules = Files.readAllLines(path)
+            .stream()
+            .filter((v) -> v.startsWith("#") == false)
+            .collect(Collectors.joining("\n"));
+
+        return new RuleBasedBreakIterator(rules);
+    }
 }
diff --git a/plugins/analysis-icu/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java b/plugins/analysis-icu/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java
index 46b8d530f5f..28d69b8523d 100644
--- a/plugins/analysis-icu/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java
+++ b/plugins/analysis-icu/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java
@@ -19,6 +19,7 @@
 
 package org.elasticsearch.plugin.analysis.icu;
 
+import org.elasticsearch.common.settings.SettingsModule;
 import org.elasticsearch.index.analysis.IcuCollationTokenFilterFactory;
 import org.elasticsearch.index.analysis.IcuFoldingTokenFilterFactory;
 import org.elasticsearch.index.analysis.IcuNormalizerCharFilterFactory;
@@ -54,4 +55,8 @@ public class AnalysisICUPlugin extends Plugin {
         module.registerTokenFilter("icu_collation", IcuCollationTokenFilterFactory::new);
         module.registerTokenFilter("icu_transform", IcuTransformTokenFilterFactory::new);
     }
+
+    public void onModule(SettingsModule settingsModule) {
+        settingsModule.registerSetting(IcuTokenizerFactory.SETTING_RULE_FILES);
+    }
 }
diff --git a/plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/IcuTokenizerFactoryTests.java b/plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/IcuTokenizerFactoryTests.java
new file mode 100644
index 00000000000..1630d514ae3
--- /dev/null
+++ b/plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/IcuTokenizerFactoryTests.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
+import org.elasticsearch.Version;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin;
+import org.elasticsearch.test.ESTestCase;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import static org.apache.lucene.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
+
+/**
+ */
+public class IcuTokenizerFactoryTests extends ESTestCase {
+
+    public void testSimpleIcuTokenizer() throws IOException {
+        AnalysisService analysisService = createAnalysisService();
+
+        TokenizerFactory tokenizerFactory = analysisService.tokenizer("icu_tokenizer");
+        ICUTokenizer tokenizer = (ICUTokenizer) tokenizerFactory.create();
+
+        Reader reader = new StringReader("向日葵, one-two");
+        tokenizer.setReader(reader);
+        assertTokenStreamContents(tokenizer, new String[]{"向日葵", "one", "two"});
+    }
+
+    public void testIcuCustomizeRuleFile() throws IOException {
+        AnalysisService analysisService = createAnalysisService();
+
+        // test the tokenizer with a single rule file
+        TokenizerFactory tokenizerFactory = analysisService.tokenizer("user_rule_tokenizer");
+        ICUTokenizer tokenizer = (ICUTokenizer) tokenizerFactory.create();
+        Reader reader = new StringReader
+            ("One-two punch. Brang-, not brung-it. This one--not that one--is the right one, -ish.");
+
+        tokenizer.setReader(reader);
+        assertTokenStreamContents(tokenizer,
+            new String[]{"One-two", "punch", "Brang", "not", "brung-it",
+                "This", "one", "not", "that", "one", "is", "the", "right", "one", "ish"});
+    }
+
+    public void testMultipleIcuCustomizeRuleFiles() throws IOException {
+        AnalysisService analysisService = createAnalysisService();
+
+        // test the tokenizer with two rule files
+        TokenizerFactory tokenizerFactory = analysisService.tokenizer("multi_rule_tokenizer");
+        ICUTokenizer tokenizer = (ICUTokenizer) tokenizerFactory.create();
+        StringReader reader = new StringReader
+            ("Some English. Немного русский. ข้อความภาษาไทยเล็ก ๆ น้อย ๆ More English.");
+
+        tokenizer.setReader(reader);
+        assertTokenStreamContents(tokenizer, new String[]{"Some", "English",
+            "Немного русский. ",
+            "ข้อความภาษาไทยเล็ก ๆ น้อย ๆ ",
+            "More", "English"});
+    }
+
+
+    private static AnalysisService createAnalysisService() throws IOException {
+        InputStream keywords = IcuTokenizerFactoryTests.class.getResourceAsStream("KeywordTokenizer.rbbi");
+        InputStream latin = IcuTokenizerFactoryTests.class.getResourceAsStream("Latin-dont-break-on-hyphens.rbbi");
+
+        Path home = createTempDir();
+        Path config = home.resolve("config");
+        Files.createDirectory(config);
+        Files.copy(keywords, config.resolve("KeywordTokenizer.rbbi"));
+        Files.copy(latin, config.resolve("Latin-dont-break-on-hyphens.rbbi"));
+
+        String json = "/org/elasticsearch/index/analysis/icu_analysis.json";
+
+        Settings settings = Settings.builder()
+            .loadFromStream(json, IcuTokenizerFactoryTests.class.getResourceAsStream(json))
+            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
+            .build();
+        Settings nodeSettings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), home).build();
+
+        return createAnalysisService(new Index("test", "_na_"), nodeSettings, settings, new AnalysisICUPlugin()::onModule);
+    }
+}
diff --git a/plugins/analysis-icu/src/test/resources/org/elasticsearch/index/analysis/KeywordTokenizer.rbbi b/plugins/analysis-icu/src/test/resources/org/elasticsearch/index/analysis/KeywordTokenizer.rbbi
new file mode 100644
index 00000000000..8e6de8aa94a
--- /dev/null
+++ b/plugins/analysis-icu/src/test/resources/org/elasticsearch/index/analysis/KeywordTokenizer.rbbi
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# RBBI Keyword tokenizer: keep everything as a single token.
+
+# Apply rule status {200}=RBBI.WORD_LETTER, which is mapped
+# to token type <ALPHANUM> by DefaultICUTokenizerConfig.
+.+ {200};
\ No newline at end of file
diff --git a/plugins/analysis-icu/src/test/resources/org/elasticsearch/index/analysis/Latin-dont-break-on-hyphens.rbbi b/plugins/analysis-icu/src/test/resources/org/elasticsearch/index/analysis/Latin-dont-break-on-hyphens.rbbi
new file mode 100644
index 00000000000..0a4f0686a4f
--- /dev/null
+++ b/plugins/analysis-icu/src/test/resources/org/elasticsearch/index/analysis/Latin-dont-break-on-hyphens.rbbi
@@ -0,0 +1,135 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Based on Default.rbbi, the default RBBI rules, based on UAX#29.
+# Added dashes to $MidLetter, so that words aren't broken on single dashes.
+#
+
+!!chain;
+
+#
+#  Character Class Definitions.
+#
+
+$CR           = [\p{Word_Break = CR}];
+$LF           = [\p{Word_Break = LF}];
+$Newline      = [\p{Word_Break = Newline}];
+$Extend       = [\p{Word_Break = Extend}];
+$Format       = [\p{Word_Break = Format}];
+$Katakana     = [\p{Word_Break = Katakana}];
+$ALetter      = [\p{Word_Break = ALetter}];
+$MidNumLet    = [\p{Word_Break = MidNumLet}];
+# Don't use [:Dash:] here - it contains lots of chars that should continue to trigger word breaks
+$Dash         = [\N{HYPHEN-MINUS}
+                 \N{HYPHEN}
+                 \N{EN DASH}
+                 \N{MINUS SIGN}
+                 \N{SMALL HYPHEN-MINUS}
+                 \N{FULLWIDTH HYPHEN-MINUS}];
+$MidLetter    = [\p{Word_Break = MidLetter}$Dash];  # Don't break on (single) hyphen
+$MidNum       = [\p{Word_Break = MidNum}];
+$Numeric      = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
+$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
+
+
+# Dictionary character set, for triggering language-based break engines. Currently
+# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
+# 5.0 or later as the definition of Complex_Context was corrected to include all
+# characters requiring dictionary break.
+
+$dictionary   = [:LineBreak = Complex_Context:];
+$Control      = [\p{Grapheme_Cluster_Break = Control}];
+$ALetterPlus  = [$ALetter [$dictionary-$Extend-$Control]];  # Note: default ALetter does not
+                                                            # include the dictionary characters.
+
+#
+#  Rules 4    Ignore Format and Extend characters,
+#             except when they appear at the beginning of a region of text.
+#
+$KatakanaEx     = $Katakana     ($Extend | $Format)*;
+$ALetterEx      = $ALetterPlus  ($Extend | $Format)*;
+$MidNumLetEx    = $MidNumLet    ($Extend | $Format)*;
+$MidLetterEx    = $MidLetter    ($Extend | $Format)*;
+$MidNumEx       = $MidNum       ($Extend | $Format)*;
+$NumericEx      = $Numeric      ($Extend | $Format)*;
+$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
+
+$Hiragana       = [\p{script=Hiragana}];
+$Ideographic    = [\p{Ideographic}];
+$HiraganaEx     = $Hiragana     ($Extend | $Format)*;
+$IdeographicEx  = $Ideographic  ($Extend | $Format)*;
+
+## -------------------------------------------------
+
+!!forward;
+
+
+# Rule 3 - CR x LF
+#
+$CR $LF;
+
+# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
+#          of a region of Text. The rule here comes into play when the start of text
+#          begins with a group of Format chars, or with a "word" consisting of a single
+#          char that is not in any of the listed word break categories followed by
+#          format char(s).
+[^$CR $LF $Newline]? ($Extend | $Format)+;
+
+$NumericEx {100};
+$ALetterEx {200};
+$KatakanaEx {300};     # note: these status values override those from rule 5
+$HiraganaEx {300};     #       by virtue of being numerically larger.
+$IdeographicEx {400};  #
+
+#
+# rule 5
+#    Do not break between most letters.
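+#    (The {NNN} annotations are rule status values; Lucene's DefaultICUTokenizerConfig
+#    maps them to token types, e.g. {200} to <ALPHANUM> and {100} to <NUM>.)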
+#
+$ALetterEx $ALetterEx {200};
+
+# rule 6 and 7
+$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
+
+# rule 8
+
+$NumericEx $NumericEx {100};
+
+# rule 9
+
+$ALetterEx $NumericEx {200};
+
+# rule 10
+
+$NumericEx $ALetterEx {200};
+
+# rule 11 and 12
+
+$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
+
+# rule 13
+
+$KatakanaEx $KatakanaEx {300};
+
+# rule 13a/b
+
+$ALetterEx      $ExtendNumLetEx {200};    #  (13a)
+$NumericEx      $ExtendNumLetEx {100};    #  (13a)
+$KatakanaEx     $ExtendNumLetEx {300};    #  (13a)
+$ExtendNumLetEx $ExtendNumLetEx {200};    #  (13a)
+
+$ExtendNumLetEx $ALetterEx      {200};    #  (13b)
+$ExtendNumLetEx $NumericEx      {100};    #  (13b)
+$ExtendNumLetEx $KatakanaEx     {300};    #  (13b)
diff --git a/plugins/analysis-icu/src/test/resources/org/elasticsearch/index/analysis/icu_analysis.json b/plugins/analysis-icu/src/test/resources/org/elasticsearch/index/analysis/icu_analysis.json
new file mode 100644
index 00000000000..f6cb9e70c3f
--- /dev/null
+++ b/plugins/analysis-icu/src/test/resources/org/elasticsearch/index/analysis/icu_analysis.json
@@ -0,0 +1,20 @@
+{
+  "index": {
+    "analysis": {
+      "tokenizer": {
+        "icu_tokenizer": {
+          "type": "icu_tokenizer"
+        },
+        "user_rule_tokenizer": {
+          "type": "icu_tokenizer",
+          "rule_files": "Latn:Latin-dont-break-on-hyphens.rbbi"
+        },
+        "multi_rule_tokenizer": {
+          "type": "icu_tokenizer",
+          "rule_files": ["Cyrl:KeywordTokenizer.rbbi", "Thai:KeywordTokenizer.rbbi"]
+        }
+      }
+    }
+  }
+}
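A rule file can be sanity-checked before it is wired into `rule_files` by compiling it with
ICU4J directly, which is what `parseRules` above does. The following standalone sketch (the
class name and sample text are illustrative, not part of this patch) compiles the one-line
keyword rule and walks the resulting boundaries:

[source,java]
--------------------------------------------------
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;

public class RbbiRuleDemo {
    public static void main(String[] args) {
        // Compile the same one-line rule as KeywordTokenizer.rbbi: any run of
        // characters becomes a single token tagged with rule status 200.
        RuleBasedBreakIterator breaker = new RuleBasedBreakIterator(".+ {200};");

        String text = "Elasticsearch. Wow!";
        breaker.setText(text);

        // Walk the boundaries the way a tokenizer would.
        int start = breaker.first();
        for (int end = breaker.next(); end != BreakIterator.DONE; start = end, end = breaker.next()) {
            System.out.println("token=[" + text.substring(start, end) + "] status=" + breaker.getRuleStatus());
        }
    }
}
--------------------------------------------------

Running it prints a single keyword-style token, `token=[Elasticsearch. Wow!] status=200`,
matching the `_analyze` output documented above.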