mirror of https://github.com/apache/lucene.git
SOLR-4123: Add per-script customizability to ICUTokenizerFactory via rule files in the ICU RuleBasedBreakIterator format.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1416617 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
0c24fa2204
commit
6024e1465e
|
@ -25,6 +25,7 @@
|
|||
<classpathentry kind="src" path="lucene/analysis/icu/src/java"/>
|
||||
<classpathentry kind="src" output="eclipse-build/analysis-icu" path="lucene/analysis/icu/src/resources"/>
|
||||
<classpathentry kind="src" path="lucene/analysis/icu/src/test"/>
|
||||
<classpathentry kind="src" path="lucene/analysis/icu/src/test-files"/>
|
||||
<classpathentry kind="src" path="lucene/analysis/kuromoji/src/java"/>
|
||||
<classpathentry kind="src" output="eclipse-build/analysis-kuromoji" path="lucene/analysis/kuromoji/src/resources"/>
|
||||
<classpathentry kind="src" path="lucene/analysis/kuromoji/src/test"/>
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
<sourceFolder url="file://$MODULE_DIR$/src/tools/java" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/resources" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/test-files" isTestSource="true" />
|
||||
</content>
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
|
|
|
@ -113,6 +113,10 @@ New Features
|
|||
be notified whenever a new searcher was opened. (selckin via Shai
|
||||
Erera, Mike McCandless)
|
||||
|
||||
* SOLR-4123: Add per-script customizability to ICUTokenizerFactory via
|
||||
rule files in the ICU RuleBasedBreakIterator format.
|
||||
(Shawn Heisey, Robert Muir, Steve Rowe)
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-4399: Deprecated AppendingCodec. Lucene's term dictionaries
|
||||
|
|
|
@ -35,6 +35,11 @@
|
|||
<path refid="base.classpath"/>
|
||||
</path>
|
||||
|
||||
<path id="test.classpath">
|
||||
<path refid="test.base.classpath" />
|
||||
<pathelement path="src/test-files" />
|
||||
</path>
|
||||
|
||||
<target name="compile-core" depends="jar-analyzers-common, common.compile-core" />
|
||||
|
||||
<property name="utr30.data.dir" location="src/data/utr30"/>
|
||||
|
|
|
@ -17,22 +17,135 @@ package org.apache.lucene.analysis.icu.segmentation;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.Reader;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory; // javadocs
|
||||
import org.apache.lucene.analysis.util.ResourceLoader;
|
||||
import org.apache.lucene.analysis.util.ResourceLoaderAware;
|
||||
import org.apache.lucene.analysis.util.TokenizerFactory;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
/** Factory for {@link ICUTokenizer} */
|
||||
public class ICUTokenizerFactory extends TokenizerFactory {
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.RuleBasedBreakIterator;
|
||||
|
||||
/**
|
||||
* Factory for {@link ICUTokenizer}.
|
||||
* Words are broken across script boundaries, then segmented according to
|
||||
* the BreakIterator and typing provided by the {@link DefaultICUTokenizerConfig}.
|
||||
*
|
||||
* <p/>
|
||||
*
|
||||
* To use the default set of per-script rules:
|
||||
*
|
||||
* <pre class="prettyprint" >
|
||||
* <fieldType name="text_icu" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.ICUTokenizerFactory"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*
|
||||
* <p/>
|
||||
*
|
||||
* You can customize this tokenizer's behavior by specifying per-script rule files,
|
||||
* which are compiled by the ICU RuleBasedBreakIterator. See the
|
||||
* <a href="http://userguide.icu-project.org/boundaryanalysis#TOC-RBBI-Rules"
|
||||
* >ICU RuleBasedBreakIterator syntax reference</a>.
|
||||
*
|
||||
* To add per-script rules, add a "rulefiles" argument, which should contain a
|
||||
* comma-separated list of <tt>code:rulefile</tt> pairs in the following format:
|
||||
* <a href="http://unicode.org/iso15924/iso15924-codes.html"
|
||||
* >four-letter ISO 15924 script code</a>, followed by a colon, then a resource
|
||||
* path. E.g. to specify rules for Latin (script code "Latn") and Cyrillic
|
||||
* (script code "Cyrl"):
|
||||
*
|
||||
* <pre class="prettyprint" >
|
||||
* <fieldType name="text_icu_custom" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.ICUTokenizerFactory"
|
||||
* rulefiles="Latn:my.Latin.rules.rbbi,Cyrl:my.Cyrillic.rules.rbbi"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*
|
||||
*/
|
||||
public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware {
|
||||
static final String RULEFILES = "rulefiles";
|
||||
private Map<Integer,String> tailored;
|
||||
private ICUTokenizerConfig config;
|
||||
|
||||
/** Sole constructor. See {@link AbstractAnalysisFactory} for initialization lifecycle. */
|
||||
public ICUTokenizerFactory() {}
|
||||
|
||||
// TODO: add support for custom configs
|
||||
@Override
|
||||
public void init(Map<String,String> args) {
|
||||
super.init(args);
|
||||
tailored = new HashMap<Integer,String>();
|
||||
String rulefilesArg = args.get(RULEFILES);
|
||||
if (rulefilesArg != null) {
|
||||
List<String> scriptAndResourcePaths = splitFileNames(rulefilesArg);
|
||||
for (String scriptAndResourcePath : scriptAndResourcePaths) {
|
||||
int colonPos = scriptAndResourcePath.indexOf(":");
|
||||
String scriptCode = scriptAndResourcePath.substring(0, colonPos).trim();
|
||||
String resourcePath = scriptAndResourcePath.substring(colonPos+1).trim();
|
||||
tailored.put(UCharacter.getPropertyValueEnum(UProperty.SCRIPT, scriptCode), resourcePath);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void inform(ResourceLoader loader) throws IOException {
|
||||
assert tailored != null : "init must be called first!";
|
||||
if (tailored.isEmpty()) {
|
||||
config = new DefaultICUTokenizerConfig();
|
||||
} else {
|
||||
final BreakIterator breakers[] = new BreakIterator[UScript.CODE_LIMIT];
|
||||
for (Map.Entry<Integer,String> entry : tailored.entrySet()) {
|
||||
int code = entry.getKey();
|
||||
String resourcePath = entry.getValue();
|
||||
breakers[code] = parseRules(resourcePath, loader);
|
||||
}
|
||||
config = new DefaultICUTokenizerConfig() {
|
||||
|
||||
@Override
|
||||
public BreakIterator getBreakIterator(int script) {
|
||||
if (breakers[script] != null) {
|
||||
return (BreakIterator) breakers[script].clone();
|
||||
} else {
|
||||
return super.getBreakIterator(script);
|
||||
}
|
||||
}
|
||||
// TODO: we could also allow codes->types mapping
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
private BreakIterator parseRules(String filename, ResourceLoader loader) throws IOException {
|
||||
StringBuilder rules = new StringBuilder();
|
||||
InputStream rulesStream = loader.openResource(filename);
|
||||
BufferedReader reader = new BufferedReader
|
||||
(IOUtils.getDecodingReader(rulesStream, IOUtils.CHARSET_UTF_8));
|
||||
String line = null;
|
||||
while ((line = reader.readLine()) != null) {
|
||||
if ( ! line.startsWith("#"))
|
||||
rules.append(line);
|
||||
rules.append('\n');
|
||||
}
|
||||
reader.close();
|
||||
return new RuleBasedBreakIterator(rules.toString());
|
||||
}
|
||||
|
||||
@Override
|
||||
public Tokenizer create(Reader input) {
|
||||
return new ICUTokenizer(input);
|
||||
assert config != null : "inform must be called first!";
|
||||
return new ICUTokenizer(input, config);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,21 @@
|
|||
#
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# RBBI Keyword tokenizer: keep everything as a single token.
|
||||
|
||||
# Apply rule status {200}=RBBI.WORD_LETTER, which is mapped
|
||||
# to <ALPHANUM> token type by DefaultICUTokenizerConfig.
|
||||
.+ {200};
|
|
@ -0,0 +1,40 @@
|
|||
#
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Break only on whitespace; assign token type from set { <ALPHANUM>, <NUM>, <OTHER> }
|
||||
#
|
||||
|
||||
!!forward;
|
||||
|
||||
$Whitespace = [\p{Whitespace}];
|
||||
$NonWhitespace = [\P{Whitespace}];
|
||||
$Letter = [\p{Letter}];
|
||||
$Number = [\p{Number}];
|
||||
|
||||
# Default rule status is {0}=RBBI.WORD_NONE => not tokenized by ICUTokenizer
|
||||
$Whitespace;
|
||||
|
||||
# Assign rule status {200}=RBBI.WORD_LETTER when the token contains a letter char
|
||||
# Mapped to <ALPHANUM> token type by DefaultICUTokenizerConfig
|
||||
$NonWhitespace* $Letter $NonWhitespace* {200};
|
||||
|
||||
# Assign rule status {100}=RBBI.WORD_NUM when the token contains a numeric char
|
||||
# Mapped to <NUM> token type by DefaultICUTokenizerConfig
|
||||
$NonWhitespace* $Number $NonWhitespace* {100};
|
||||
|
||||
# Assign rule status {1} (no RBBI equivalent) when the token contains neither a letter nor a numeric char
|
||||
# Mapped to <OTHER> token type by DefaultICUTokenizerConfig
|
||||
$NonWhitespace+ {1};
|
|
@ -0,0 +1,135 @@
|
|||
#
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Based on Default.rbbi, the default RBBI rules, based on UAX#29.
|
||||
# Added dashes to $MidLetter, so that words aren't broken on single dashes.
|
||||
#
|
||||
|
||||
!!chain;
|
||||
|
||||
#
|
||||
# Character Class Definitions.
|
||||
#
|
||||
|
||||
$CR = [\p{Word_Break = CR}];
|
||||
$LF = [\p{Word_Break = LF}];
|
||||
$Newline = [\p{Word_Break = Newline}];
|
||||
$Extend = [\p{Word_Break = Extend}];
|
||||
$Format = [\p{Word_Break = Format}];
|
||||
$Katakana = [\p{Word_Break = Katakana}];
|
||||
$ALetter = [\p{Word_Break = ALetter}];
|
||||
$MidNumLet = [\p{Word_Break = MidNumLet}];
|
||||
# Don't use [:Dash:] here - it contains lots of chars that should continue to trigger word breaks
|
||||
$Dash = [\N{HYPHEN-MINUS}
|
||||
\N{HYPHEN}
|
||||
\N{EN DASH}
|
||||
\N{MINUS SIGN}
|
||||
\N{SMALL HYPHEN-MINUS}
|
||||
\N{FULLWIDTH HYPHEN-MINUS}];
|
||||
$MidLetter = [\p{Word_Break = MidLetter}$Dash]; # Don't break on (single) hyphen
|
||||
$MidNum = [\p{Word_Break = MidNum}];
|
||||
$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
|
||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
|
||||
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
|
||||
# 5.0 or later as the definition of Complex_Context was corrected to include all
|
||||
# characters requiring dictionary break.
|
||||
|
||||
$dictionary = [:LineBreak = Complex_Context:];
|
||||
$Control = [\p{Grapheme_Cluster_Break = Control}];
|
||||
$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: default ALetter does not
|
||||
# include the dictionary characters.
|
||||
|
||||
#
|
||||
# Rules 4 Ignore Format and Extend characters,
|
||||
# except when they appear at the beginning of a region of text.
|
||||
#
|
||||
$KatakanaEx = $Katakana ($Extend | $Format)*;
|
||||
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
|
||||
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
|
||||
$MidLetterEx = $MidLetter ($Extend | $Format)*;
|
||||
$MidNumEx = $MidNum ($Extend | $Format)*;
|
||||
$NumericEx = $Numeric ($Extend | $Format)*;
|
||||
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
|
||||
|
||||
$Hiragana = [\p{script=Hiragana}];
|
||||
$Ideographic = [\p{Ideographic}];
|
||||
$HiraganaEx = $Hiragana ($Extend | $Format)*;
|
||||
$IdeographicEx = $Ideographic ($Extend | $Format)*;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!forward;
|
||||
|
||||
|
||||
# Rule 3 - CR x LF
|
||||
#
|
||||
$CR $LF;
|
||||
|
||||
# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
|
||||
# of a region of Text. The rule here comes into play when the start of text
|
||||
# begins with a group of Format chars, or with a "word" consisting of a single
|
||||
# char that is not in any of the listed word break categories followed by
|
||||
# format char(s).
|
||||
[^$CR $LF $Newline]? ($Extend | $Format)+;
|
||||
|
||||
$NumericEx {100};
|
||||
$ALetterEx {200};
|
||||
$KatakanaEx {300}; # note: these status values override those from rule 5
|
||||
$HiraganaEx {300}; # by virtual of being numerically larger.
|
||||
$IdeographicEx {400}; #
|
||||
|
||||
#
|
||||
# rule 5
|
||||
# Do not break between most letters.
|
||||
#
|
||||
$ALetterEx $ALetterEx {200};
|
||||
|
||||
# rule 6 and 7
|
||||
$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
|
||||
|
||||
# rule 8
|
||||
|
||||
$NumericEx $NumericEx {100};
|
||||
|
||||
# rule 9
|
||||
|
||||
$ALetterEx $NumericEx {200};
|
||||
|
||||
# rule 10
|
||||
|
||||
$NumericEx $ALetterEx {200};
|
||||
|
||||
# rule 11 and 12
|
||||
|
||||
$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
|
||||
|
||||
# rule 13
|
||||
|
||||
$KatakanaEx $KatakanaEx {300};
|
||||
|
||||
# rule 13a/b
|
||||
|
||||
$ALetterEx $ExtendNumLetEx {200}; # (13a)
|
||||
$NumericEx $ExtendNumLetEx {100}; # (13a)
|
||||
$KatakanaEx $ExtendNumLetEx {300}; # (13a)
|
||||
$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
|
||||
|
||||
$ExtendNumLetEx $ALetterEx {200}; # (13b)
|
||||
$ExtendNumLetEx $NumericEx {100}; # (13b)
|
||||
$ExtendNumLetEx $KatakanaEx {300}; # (13b)
|
|
@ -19,18 +19,73 @@ package org.apache.lucene.analysis.icu.segmentation;
|
|||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
|
||||
|
||||
/** basic tests for {@link ICUTokenizerFactory} **/
|
||||
public class TestICUTokenizerFactory extends BaseTokenStreamTestCase {
|
||||
public void testMixedText() throws Exception {
|
||||
Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี This is a test ກວ່າດອກ");
|
||||
ICUTokenizerFactory factory = new ICUTokenizerFactory();
|
||||
factory.init(new HashMap<String,String>());
|
||||
factory.inform(new ClasspathResourceLoader(getClass()));
|
||||
TokenStream stream = factory.create(reader);
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี",
|
||||
"This", "is", "a", "test", "ກວ່າ", "ດອກ"});
|
||||
}
|
||||
|
||||
public void testTokenizeLatinOnWhitespaceOnly() throws Exception {
|
||||
// “ U+201C LEFT DOUBLE QUOTATION MARK; ” U+201D RIGHT DOUBLE QUOTATION MARK
|
||||
Reader reader = new StringReader
|
||||
(" Don't,break.at?/(punct)! \u201Cnice\u201D\r\n\r\n85_At:all; `really\" +2=3$5,&813 !@#%$^)(*@#$ ");
|
||||
ICUTokenizerFactory factory = new ICUTokenizerFactory();
|
||||
final Map<String,String> args = new HashMap<String,String>();
|
||||
args.put(ICUTokenizerFactory.RULEFILES, "Latn:Latin-break-only-on-whitespace.rbbi");
|
||||
factory.init(args);
|
||||
factory.inform(new ClasspathResourceLoader(this.getClass()));
|
||||
TokenStream stream = factory.create(reader);
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "Don't,break.at?/(punct)!", "\u201Cnice\u201D", "85_At:all;", "`really\"", "+2=3$5,&813", "!@#%$^)(*@#$" },
|
||||
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<OTHER>" });
|
||||
}
|
||||
|
||||
public void testTokenizeLatinDontBreakOnHyphens() throws Exception {
|
||||
Reader reader = new StringReader
|
||||
("One-two punch. Brang-, not brung-it. This one--not that one--is the right one, -ish.");
|
||||
ICUTokenizerFactory factory = new ICUTokenizerFactory();
|
||||
final Map<String,String> args = new HashMap<String,String>();
|
||||
args.put(ICUTokenizerFactory.RULEFILES, "Latn:Latin-dont-break-on-hyphens.rbbi");
|
||||
factory.init(args);
|
||||
factory.inform(new ClasspathResourceLoader(getClass()));
|
||||
TokenStream stream = factory.create(reader);
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "One-two", "punch",
|
||||
"Brang", "not", "brung-it",
|
||||
"This", "one", "not", "that", "one", "is", "the", "right", "one", "ish" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Specify more than one script/rule file pair.
|
||||
* Override default DefaultICUTokenizerConfig Thai script tokenization.
|
||||
* Use the same rule file for both scripts.
|
||||
*/
|
||||
public void testKeywordTokenizeCyrillicAndThai() throws Exception {
|
||||
Reader reader = new StringReader
|
||||
("Some English. Немного русский. ข้อความภาษาไทยเล็ก ๆ น้อย ๆ More English.");
|
||||
ICUTokenizerFactory factory = new ICUTokenizerFactory();
|
||||
final Map<String,String> args = new HashMap<String,String>();
|
||||
args.put(ICUTokenizerFactory.RULEFILES, "Cyrl:KeywordTokenizer.rbbi,Thai:KeywordTokenizer.rbbi");
|
||||
factory.init(args);
|
||||
factory.inform(new ClasspathResourceLoader(getClass()));
|
||||
TokenStream stream = factory.create(reader);
|
||||
assertTokenStreamContents(stream, new String[] { "Some", "English",
|
||||
"Немного русский. ",
|
||||
"ข้อความภาษาไทยเล็ก ๆ น้อย ๆ ",
|
||||
"More", "English" });
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue