Add support for customizing the rule file in ICU tokenizer
Lucene allows an ICUTokenizer to be created with a special config argument that customizes the rule-based break iterator via custom rule files. This commit enables that feature: users can now provide a list of RBBI rule files to the ICU tokenizer. Closes #13146
parent 858ca23b70
commit cd527c5b92
@@ -115,6 +115,74 @@ PUT icu_sample
--------------------------------------------------
// AUTOSENSE

===== Rules customization

experimental[]

You can customize the `icu_tokenizer` behavior by specifying per-script rule files; see the
http://userguide.icu-project.org/boundaryanalysis#TOC-RBBI-Rules[RBBI rules syntax reference]
for a more detailed explanation.

To add icu tokenizer rules, set the `rule_files` setting, which should contain a comma-separated list of
`code:rulefile` pairs in the following format: a
http://unicode.org/iso15924/iso15924-codes.html[four-letter ISO 15924 script code],
followed by a colon, then a rule file name. Rule files are placed in the `ES_HOME/config` directory.

As a demonstration of how the rule files can be used, save the following user file to `$ES_HOME/config/KeywordTokenizer.rbbi`:

[source,text]
-----------------------
.+ {200};
-----------------------

Then create an analyzer to use this rule file as follows:

[source,json]
--------------------------------------------------
PUT icu_sample
{
    "settings": {
        "index":{
            "analysis":{
                "tokenizer" : {
                    "icu_user_file" : {
                        "type" : "icu_tokenizer",
                        "rule_files" : "Latn:KeywordTokenizer.rbbi"
                    }
                },
                "analyzer" : {
                    "my_analyzer" : {
                        "type" : "custom",
                        "tokenizer" : "icu_user_file"
                    }
                }
            }
        }
    }
}

POST icu_sample/_analyze?analyzer=my_analyzer&text=Elasticsearch. Wow!
--------------------------------------------------
// AUTOSENSE

The above `analyze` request returns the following:

[source,json]
--------------------------------------------------
# Result
{
    "tokens": [
        {
            "token": "Elasticsearch. Wow!",
            "start_offset": 0,
            "end_offset": 19,
            "type": "<ALPHANUM>",
            "position": 0
        }
    ]
}
--------------------------------------------------

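More than one script can be tailored by the same tokenizer by listing several `code:rulefile` pairs. As a hypothetical illustration (the tokenizer name and rule file names below are placeholders, not files shipped with the plugin):

[source,json]
--------------------------------------------------
"tokenizer" : {
    "my_tailored_tokenizer" : {
        "type" : "icu_tokenizer",
        "rule_files" : "Latn:my-latin-rules.rbbi,Cyrl:my-cyrillic-rules.rbbi"
    }
}
--------------------------------------------------
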
[[analysis-icu-normalization]]
==== ICU Normalization Token Filter

@@ -19,23 +19,109 @@

package org.elasticsearch.index.analysis;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizerConfig;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;

/**
 */
public class IcuTokenizerFactory extends AbstractTokenizerFactory {

    private final ICUTokenizerConfig config;
    private static final String RULE_FILES = "rule_files";

    public static final Setting<List<String>> SETTING_RULE_FILES =
        Setting.listSetting(RULE_FILES, Collections.emptyList(), Function.identity(), Setting.Property.IndexScope);

    public IcuTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);
        config = getIcuConfig(environment, settings);
    }

    @Override
    public Tokenizer create() {
        if (config == null) {
            return new ICUTokenizer();
        } else {
            return new ICUTokenizer(config);
        }
    }

    private ICUTokenizerConfig getIcuConfig(Environment env, Settings settings) {
        // ICU script code -> rule file name, parsed from the rule_files setting
        Map<Integer, String> tailored = new HashMap<>();

        try {
            List<String> ruleFiles = SETTING_RULE_FILES.get(settings);

            for (String scriptAndResourcePath : ruleFiles) {
                int colonPos = scriptAndResourcePath.indexOf(":");
                if (colonPos == -1 || colonPos == scriptAndResourcePath.length() - 1) {
                    throw new IllegalArgumentException(RULE_FILES + " should contain comma-separated \"code:rulefile\" pairs");
                }

                String scriptCode = scriptAndResourcePath.substring(0, colonPos).trim();
                String resourcePath = scriptAndResourcePath.substring(colonPos + 1).trim();
                tailored.put(UCharacter.getPropertyValueEnum(UProperty.SCRIPT, scriptCode), resourcePath);
            }

            if (tailored.isEmpty()) {
                return null;
            } else {
                final BreakIterator[] breakers = new BreakIterator[UScript.CODE_LIMIT];
                for (Map.Entry<Integer, String> entry : tailored.entrySet()) {
                    int code = entry.getKey();
                    String resourcePath = entry.getValue();
                    breakers[code] = parseRules(resourcePath, env);
                }
                // cjkAsWords is not configurable yet.
                ICUTokenizerConfig config = new DefaultICUTokenizerConfig(true) {
                    @Override
                    public BreakIterator getBreakIterator(int script) {
                        if (breakers[script] != null) {
                            return (BreakIterator) breakers[script].clone();
                        } else {
                            return super.getBreakIterator(script);
                        }
                    }
                };
                return config;
            }
        } catch (Throwable t) {
            throw new ElasticsearchException("failed to load ICU rule files", t);
        }
    }

    // parse a single RBBI rule file
    private BreakIterator parseRules(String filename, Environment env) throws IOException {
        final Path path = env.configFile().resolve(filename);
        String rules = Files.readAllLines(path)
            .stream()
            .filter((v) -> v.startsWith("#") == false)
            .collect(Collectors.joining("\n"));

        return new RuleBasedBreakIterator(rules.toString());
    }
}

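Not part of the commit itself, but as a quick sketch of how a single `code:rulefile` pair from the `rule_files` setting is interpreted by the factory above, the following standalone snippet (class name and printed output are illustrative only) splits the pair on the first colon and resolves the four-letter ISO 15924 code to ICU's integer script constant, mirroring the logic in `getIcuConfig`:

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;

// Hypothetical standalone sketch, not part of the commit.
public class RuleFilePairSketch {
    public static void main(String[] args) {
        String pair = "Latn:KeywordTokenizer.rbbi";
        int colonPos = pair.indexOf(":");                            // split on the first colon
        String scriptCode = pair.substring(0, colonPos).trim();      // "Latn"
        String resourcePath = pair.substring(colonPos + 1).trim();   // "KeywordTokenizer.rbbi"
        // Same lookup as getIcuConfig: ISO 15924 code -> ICU script constant.
        int script = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, scriptCode);
        System.out.println(script == UScript.LATIN);                 // expected: true
        System.out.println(resourcePath);
    }
}
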
@@ -19,6 +19,7 @@

package org.elasticsearch.plugin.analysis.icu;

import org.elasticsearch.common.settings.SettingsModule;
import org.elasticsearch.index.analysis.IcuCollationTokenFilterFactory;
import org.elasticsearch.index.analysis.IcuFoldingTokenFilterFactory;
import org.elasticsearch.index.analysis.IcuNormalizerCharFilterFactory;

@@ -54,4 +55,8 @@ public class AnalysisICUPlugin extends Plugin {
        module.registerTokenFilter("icu_collation", IcuCollationTokenFilterFactory::new);
        module.registerTokenFilter("icu_transform", IcuTransformTokenFilterFactory::new);
    }

    public void onModule(SettingsModule settingsModule) {
        settingsModule.registerSetting(IcuTokenizerFactory.SETTING_RULE_FILES);
    }
}

@@ -0,0 +1,107 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin;
import org.elasticsearch.test.ESTestCase;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.nio.file.Files;
import java.nio.file.Path;

import static org.apache.lucene.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

/**
 */
public class IcuTokenizerFactoryTests extends ESTestCase {

    public void testSimpleIcuTokenizer() throws IOException {
        AnalysisService analysisService = createAnalysisService();

        TokenizerFactory tokenizerFactory = analysisService.tokenizer("icu_tokenizer");
        ICUTokenizer tokenizer = (ICUTokenizer) tokenizerFactory.create();

        Reader reader = new StringReader("向日葵, one-two");
        tokenizer.setReader(reader);
        assertTokenStreamContents(tokenizer, new String[]{"向日葵", "one", "two"});
    }

    public void testIcuCustomizeRuleFile() throws IOException {
        AnalysisService analysisService = createAnalysisService();

        // test the tokenizer with single rule file
        TokenizerFactory tokenizerFactory = analysisService.tokenizer("user_rule_tokenizer");
        ICUTokenizer tokenizer = (ICUTokenizer) tokenizerFactory.create();
        Reader reader = new StringReader
            ("One-two punch. Brang-, not brung-it. This one--not that one--is the right one, -ish.");

        tokenizer.setReader(reader);
        assertTokenStreamContents(tokenizer,
            new String[]{"One-two", "punch", "Brang", "not", "brung-it",
                "This", "one", "not", "that", "one", "is", "the", "right", "one", "ish"});
    }

    public void testMultipleIcuCustomizeRuleFiles() throws IOException {
        AnalysisService analysisService = createAnalysisService();

        // test the tokenizer with two rule files
        TokenizerFactory tokenizerFactory = analysisService.tokenizer("multi_rule_tokenizer");
        ICUTokenizer tokenizer = (ICUTokenizer) tokenizerFactory.create();
        StringReader reader = new StringReader
            ("Some English. Немного русский. ข้อความภาษาไทยเล็ก ๆ น้อย ๆ More English.");

        tokenizer.setReader(reader);
        assertTokenStreamContents(tokenizer, new String[]{"Some", "English",
            "Немного русский. ",
            "ข้อความภาษาไทยเล็ก ๆ น้อย ๆ ",
            "More", "English"});
    }

    private static AnalysisService createAnalysisService() throws IOException {
        InputStream keywords = IcuTokenizerFactoryTests.class.getResourceAsStream("KeywordTokenizer.rbbi");
        InputStream latin = IcuTokenizerFactoryTests.class.getResourceAsStream("Latin-dont-break-on-hyphens.rbbi");

        Path home = createTempDir();
        Path config = home.resolve("config");
        Files.createDirectory(config);
        Files.copy(keywords, config.resolve("KeywordTokenizer.rbbi"));
        Files.copy(latin, config.resolve("Latin-dont-break-on-hyphens.rbbi"));

        String json = "/org/elasticsearch/index/analysis/icu_analysis.json";

        Settings settings = Settings.builder()
            .loadFromStream(json, IcuTokenizerFactoryTests.class.getResourceAsStream(json))
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .build();
        Settings nodeSettings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), home).build();

        return createAnalysisService(new Index("test", "_na_"), nodeSettings, settings, new AnalysisICUPlugin()::onModule);
    }
}

@@ -0,0 +1,21 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# RBBI Keyword tokenizer: keep everything as a single token.

# Apply rule status {200}=RBBI.WORD_LETTER, which is mapped
# to <ALPHANUM> token type by DefaultICUTokenizerConfig.
.+ {200};

@@ -0,0 +1,135 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Based on Default.rbbi, the default RBBI rules, based on UAX#29.
# Added dashes to $MidLetter, so that words aren't broken on single dashes.
#

!!chain;

#
# Character Class Definitions.
#

$CR           = [\p{Word_Break = CR}];
$LF           = [\p{Word_Break = LF}];
$Newline      = [\p{Word_Break = Newline}];
$Extend       = [\p{Word_Break = Extend}];
$Format       = [\p{Word_Break = Format}];
$Katakana     = [\p{Word_Break = Katakana}];
$ALetter      = [\p{Word_Break = ALetter}];
$MidNumLet    = [\p{Word_Break = MidNumLet}];
# Don't use [:Dash:] here - it contains lots of chars that should continue to trigger word breaks
$Dash         = [\N{HYPHEN-MINUS}
                 \N{HYPHEN}
                 \N{EN DASH}
                 \N{MINUS SIGN}
                 \N{SMALL HYPHEN-MINUS}
                 \N{FULLWIDTH HYPHEN-MINUS}];
$MidLetter    = [\p{Word_Break = MidLetter}$Dash];  # Don't break on (single) hyphen
$MidNum       = [\p{Word_Break = MidNum}];
$Numeric      = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];


# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
# 5.0 or later as the definition of Complex_Context was corrected to include all
# characters requiring dictionary break.

$dictionary   = [:LineBreak = Complex_Context:];
$Control      = [\p{Grapheme_Cluster_Break = Control}];
$ALetterPlus  = [$ALetter [$dictionary-$Extend-$Control]];  # Note: default ALetter does not
                                                            # include the dictionary characters.

#
# Rules 4    Ignore Format and Extend characters,
#            except when they appear at the beginning of a region of text.
#
$KatakanaEx     = $Katakana     ($Extend | $Format)*;
$ALetterEx      = $ALetterPlus  ($Extend | $Format)*;
$MidNumLetEx    = $MidNumLet    ($Extend | $Format)*;
$MidLetterEx    = $MidLetter    ($Extend | $Format)*;
$MidNumEx       = $MidNum       ($Extend | $Format)*;
$NumericEx      = $Numeric      ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;

$Hiragana       = [\p{script=Hiragana}];
$Ideographic    = [\p{Ideographic}];
$HiraganaEx     = $Hiragana     ($Extend | $Format)*;
$IdeographicEx  = $Ideographic  ($Extend | $Format)*;

## -------------------------------------------------

!!forward;


# Rule 3 - CR x LF
#
$CR $LF;

# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
#          of a region of Text. The rule here comes into play when the start of text
#          begins with a group of Format chars, or with a "word" consisting of a single
#          char that is not in any of the listed word break categories followed by
#          format char(s).
[^$CR $LF $Newline]? ($Extend | $Format)+;

$NumericEx {100};
$ALetterEx {200};
$KatakanaEx {300};       # note: these status values override those from rule 5
$HiraganaEx {300};       #       by virtue of being numerically larger.
$IdeographicEx {400};    #

#
# rule 5
#   Do not break between most letters.
#
$ALetterEx $ALetterEx {200};

# rule 6 and 7
$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};

# rule 8
$NumericEx $NumericEx {100};

# rule 9
$ALetterEx $NumericEx {200};

# rule 10
$NumericEx $ALetterEx {200};

# rule 11 and 12
$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};

# rule 13
$KatakanaEx $KatakanaEx {300};

# rule 13a/b
$ALetterEx      $ExtendNumLetEx {200};    # (13a)
$NumericEx      $ExtendNumLetEx {100};    # (13a)
$KatakanaEx     $ExtendNumLetEx {300};    # (13a)
$ExtendNumLetEx $ExtendNumLetEx {200};    # (13a)

$ExtendNumLetEx $ALetterEx  {200};    # (13b)
$ExtendNumLetEx $NumericEx  {100};    # (13b)
$ExtendNumLetEx $KatakanaEx {300};    # (13b)

@@ -0,0 +1,20 @@
{
    "index":{
        "analysis":{
            "tokenizer" : {
                "icu_tokenizer" : {
                    "type":"icu_tokenizer"
                },
                "user_rule_tokenizer" : {
                    "type":"icu_tokenizer",
                    "rule_files":"Latn:Latin-dont-break-on-hyphens.rbbi"
                },
                "multi_rule_tokenizer" : {
                    "type":"icu_tokenizer",
                    "rule_files":["Cyrl:KeywordTokenizer.rbbi", "thai:KeywordTokenizer.rbbi"]
                }
            }
        }
    }
}