From 6024e1465e02302a0c1402127a6f407194365cd8 Mon Sep 17 00:00:00 2001 From: Steven Rowe Date: Mon, 3 Dec 2012 18:21:18 +0000 Subject: [PATCH] SOLR-4123: Add per-script customizability to ICUTokenizerFactory via rule files in the ICU RuleBasedBreakIterator format. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1416617 13f79535-47bb-0310-9956-ffa450edef68 --- dev-tools/eclipse/dot.classpath | 1 + dev-tools/idea/lucene/analysis/icu/icu.iml | 1 + lucene/CHANGES.txt | 4 + lucene/analysis/icu/build.xml | 5 + .../icu/segmentation/ICUTokenizerFactory.java | 123 +++++++++++++++- .../icu/segmentation/KeywordTokenizer.rbbi | 21 +++ .../Latin-break-only-on-whitespace.rbbi | 40 ++++++ .../Latin-dont-break-on-hyphens.rbbi | 135 ++++++++++++++++++ .../segmentation/TestICUTokenizerFactory.java | 55 +++++++ 9 files changed, 380 insertions(+), 5 deletions(-) create mode 100644 lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/KeywordTokenizer.rbbi create mode 100644 lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/Latin-break-only-on-whitespace.rbbi create mode 100644 lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/Latin-dont-break-on-hyphens.rbbi diff --git a/dev-tools/eclipse/dot.classpath b/dev-tools/eclipse/dot.classpath index 21a09761e44..9c2d3ae4ed5 100644 --- a/dev-tools/eclipse/dot.classpath +++ b/dev-tools/eclipse/dot.classpath @@ -25,6 +25,7 @@ + diff --git a/dev-tools/idea/lucene/analysis/icu/icu.iml b/dev-tools/idea/lucene/analysis/icu/icu.iml index a3f7d9fd216..fc3e6a09d95 100644 --- a/dev-tools/idea/lucene/analysis/icu/icu.iml +++ b/dev-tools/idea/lucene/analysis/icu/icu.iml @@ -9,6 +9,7 @@ + diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 21839ac8650..c70c4db850f 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -113,6 +113,10 @@ New Features be notified whenever a new searcher was opened. 
(selckin via Shai Erera, Mike McCandless) +* SOLR-4123: Add per-script customizability to ICUTokenizerFactory via + rule files in the ICU RuleBasedBreakIterator format. + (Shawn Heisey, Robert Muir, Steve Rowe) + API Changes * LUCENE-4399: Deprecated AppendingCodec. Lucene's term dictionaries diff --git a/lucene/analysis/icu/build.xml b/lucene/analysis/icu/build.xml index 5e3578ccea0..61a5a36d9ee 100644 --- a/lucene/analysis/icu/build.xml +++ b/lucene/analysis/icu/build.xml @@ -35,6 +35,11 @@ + + + + + diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java index 7ed1ef7a343..a9345d5fe68 100644 --- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java +++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java @@ -17,22 +17,135 @@ package org.apache.lucene.analysis.icu.segmentation; * limitations under the License. 
*/ +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; import java.io.Reader; +import java.util.HashMap; +import java.util.List; +import java.util.Map; import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer; import org.apache.lucene.analysis.util.AbstractAnalysisFactory; // javadocs +import org.apache.lucene.analysis.util.ResourceLoader; +import org.apache.lucene.analysis.util.ResourceLoaderAware; import org.apache.lucene.analysis.util.TokenizerFactory; +import org.apache.lucene.util.IOUtils; -/** Factory for {@link ICUTokenizer} */ -public class ICUTokenizerFactory extends TokenizerFactory { +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.lang.UScript; +import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.RuleBasedBreakIterator; + +/** + * Factory for {@link ICUTokenizer}. + * Words are broken across script boundaries, then segmented according to + * the BreakIterator and typing provided by the {@link DefaultICUTokenizerConfig}. + * + *

+ * + * To use the default set of per-script rules: + * + *

+ * <fieldType name="text_icu" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.ICUTokenizerFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + *

+ * + * You can customize this tokenizer's behavior by specifying per-script rule files, + * which are compiled by the ICU RuleBasedBreakIterator. See the + * ICU RuleBasedBreakIterator syntax reference. + * + * To add per-script rules, add a "rulefiles" argument, which should contain a + * comma-separated list of code:rulefile pairs in the following format: + * four-letter ISO 15924 script code, followed by a colon, then a resource + * path. E.g. to specify rules for Latin (script code "Latn") and Cyrillic + * (script code "Cyrl"): + * + *

+ * <fieldType name="text_icu_custom" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.ICUTokenizerFactory"
+ *                rulefiles="Latn:my.Latin.rules.rbbi,Cyrl:my.Cyrillic.rules.rbbi"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware { + static final String RULEFILES = "rulefiles"; + private Map tailored; + private ICUTokenizerConfig config; /** Sole constructor. See {@link AbstractAnalysisFactory} for initialization lifecycle. */ public ICUTokenizerFactory() {} - // TODO: add support for custom configs + @Override + public void init(Map args) { + super.init(args); + tailored = new HashMap(); + String rulefilesArg = args.get(RULEFILES); + if (rulefilesArg != null) { + List scriptAndResourcePaths = splitFileNames(rulefilesArg); + for (String scriptAndResourcePath : scriptAndResourcePaths) { + int colonPos = scriptAndResourcePath.indexOf(":"); + String scriptCode = scriptAndResourcePath.substring(0, colonPos).trim(); + String resourcePath = scriptAndResourcePath.substring(colonPos+1).trim(); + tailored.put(UCharacter.getPropertyValueEnum(UProperty.SCRIPT, scriptCode), resourcePath); + } + } + } + + @Override + public void inform(ResourceLoader loader) throws IOException { + assert tailored != null : "init must be called first!"; + if (tailored.isEmpty()) { + config = new DefaultICUTokenizerConfig(); + } else { + final BreakIterator breakers[] = new BreakIterator[UScript.CODE_LIMIT]; + for (Map.Entry entry : tailored.entrySet()) { + int code = entry.getKey(); + String resourcePath = entry.getValue(); + breakers[code] = parseRules(resourcePath, loader); + } + config = new DefaultICUTokenizerConfig() { + + @Override + public BreakIterator getBreakIterator(int script) { + if (breakers[script] != null) { + return (BreakIterator) breakers[script].clone(); + } else { + return super.getBreakIterator(script); + } + } + // TODO: we could also allow codes->types mapping + }; + } + } + + private BreakIterator parseRules(String filename, ResourceLoader loader) throws IOException { + StringBuilder rules = new StringBuilder(); + InputStream rulesStream = loader.openResource(filename); + BufferedReader reader = new 
BufferedReader + (IOUtils.getDecodingReader(rulesStream, IOUtils.CHARSET_UTF_8)); + String line = null; + while ((line = reader.readLine()) != null) { + if ( ! line.startsWith("#")) + rules.append(line); + rules.append('\n'); + } + reader.close(); + return new RuleBasedBreakIterator(rules.toString()); + } + @Override public Tokenizer create(Reader input) { - return new ICUTokenizer(input); + assert config != null : "inform must be called first!"; + return new ICUTokenizer(input, config); } } diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/KeywordTokenizer.rbbi b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/KeywordTokenizer.rbbi new file mode 100644 index 00000000000..8e6de8aa94a --- /dev/null +++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/KeywordTokenizer.rbbi @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# RBBI Keyword tokenizer: keep everything as a single token. + +# Apply rule status {200}=RBBI.WORD_LETTER, which is mapped +# to token type by DefaultICUTokenizerConfig. 
+.+ {200}; \ No newline at end of file diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/Latin-break-only-on-whitespace.rbbi b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/Latin-break-only-on-whitespace.rbbi new file mode 100644 index 00000000000..2d6d9bea021 --- /dev/null +++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/Latin-break-only-on-whitespace.rbbi @@ -0,0 +1,40 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Break only on whitespace; assign token type from set { <ALPHANUM>, <NUM>, <OTHER> } +# + +!!forward; + +$Whitespace = [\p{Whitespace}]; +$NonWhitespace = [\P{Whitespace}]; +$Letter = [\p{Letter}]; +$Number = [\p{Number}]; + +# Default rule status is {0}=RBBI.WORD_NONE => not tokenized by ICUTokenizer +$Whitespace; + +# Assign rule status {200}=RBBI.WORD_LETTER when the token contains a letter char +# Mapped to token type <ALPHANUM> by DefaultICUTokenizerConfig +$NonWhitespace* $Letter $NonWhitespace* {200}; + +# Assign rule status {100}=RBBI.WORD_NUM when the token contains a numeric char +# Mapped to token type <NUM> by DefaultICUTokenizerConfig +$NonWhitespace* $Number $NonWhitespace* {100}; + +# Assign rule status {1} (no RBBI equivalent) when the token contains neither a letter nor a numeric char +# Mapped to token type <OTHER> by DefaultICUTokenizerConfig +$NonWhitespace+ {1}; diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/Latin-dont-break-on-hyphens.rbbi b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/Latin-dont-break-on-hyphens.rbbi new file mode 100644 index 00000000000..0a4f0686a4f --- /dev/null +++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/Latin-dont-break-on-hyphens.rbbi @@ -0,0 +1,135 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +# Based on Default.rbbi, the default RBBI rules, based on UAX#29. +# Added dashes to $MidLetter, so that words aren't broken on single dashes. +# + +!!chain; + +# +# Character Class Definitions. +# + +$CR = [\p{Word_Break = CR}]; +$LF = [\p{Word_Break = LF}]; +$Newline = [\p{Word_Break = Newline}]; +$Extend = [\p{Word_Break = Extend}]; +$Format = [\p{Word_Break = Format}]; +$Katakana = [\p{Word_Break = Katakana}]; +$ALetter = [\p{Word_Break = ALetter}]; +$MidNumLet = [\p{Word_Break = MidNumLet}]; +# Don't use [:Dash:] here - it contains lots of chars that should continue to trigger word breaks +$Dash = [\N{HYPHEN-MINUS} + \N{HYPHEN} + \N{EN DASH} + \N{MINUS SIGN} + \N{SMALL HYPHEN-MINUS} + \N{FULLWIDTH HYPHEN-MINUS}]; +$MidLetter = [\p{Word_Break = MidLetter}$Dash]; # Don't break on (single) hyphen +$MidNum = [\p{Word_Break = MidNum}]; +$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; + + +# Dictionary character set, for triggering language-based break engines. Currently +# limited to LineBreak=Complex_Context. Note that this set only works in Unicode +# 5.0 or later as the definition of Complex_Context was corrected to include all +# characters requiring dictionary break. + +$dictionary = [:LineBreak = Complex_Context:]; +$Control = [\p{Grapheme_Cluster_Break = Control}]; +$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: default ALetter does not + # include the dictionary characters. + +# +# Rules 4 Ignore Format and Extend characters, +# except when they appear at the beginning of a region of text. 
+# +$KatakanaEx = $Katakana ($Extend | $Format)*; +$ALetterEx = $ALetterPlus ($Extend | $Format)*; +$MidNumLetEx = $MidNumLet ($Extend | $Format)*; +$MidLetterEx = $MidLetter ($Extend | $Format)*; +$MidNumEx = $MidNum ($Extend | $Format)*; +$NumericEx = $Numeric ($Extend | $Format)*; +$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; + +$Hiragana = [\p{script=Hiragana}]; +$Ideographic = [\p{Ideographic}]; +$HiraganaEx = $Hiragana ($Extend | $Format)*; +$IdeographicEx = $Ideographic ($Extend | $Format)*; + +## ------------------------------------------------- + +!!forward; + + +# Rule 3 - CR x LF +# +$CR $LF; + +# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning +# of a region of Text. The rule here comes into play when the start of text +# begins with a group of Format chars, or with a "word" consisting of a single +# char that is not in any of the listed word break categories followed by +# format char(s). +[^$CR $LF $Newline]? ($Extend | $Format)+; + +$NumericEx {100}; +$ALetterEx {200}; +$KatakanaEx {300}; # note: these status values override those from rule 5 +$HiraganaEx {300}; # by virtue of being numerically larger. +$IdeographicEx {400}; # + +# +# rule 5 +# Do not break between most letters. 
+# +$ALetterEx $ALetterEx {200}; + +# rule 6 and 7 +$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200}; + +# rule 8 + +$NumericEx $NumericEx {100}; + +# rule 9 + +$ALetterEx $NumericEx {200}; + +# rule 10 + +$NumericEx $ALetterEx {200}; + +# rule 11 and 12 + +$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100}; + +# rule 13 + +$KatakanaEx $KatakanaEx {300}; + +# rule 13a/b + +$ALetterEx $ExtendNumLetEx {200}; # (13a) +$NumericEx $ExtendNumLetEx {100}; # (13a) +$KatakanaEx $ExtendNumLetEx {300}; # (13a) +$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a) + +$ExtendNumLetEx $ALetterEx {200}; # (13b) +$ExtendNumLetEx $NumericEx {100}; # (13b) +$ExtendNumLetEx $KatakanaEx {300}; # (13b) diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerFactory.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerFactory.java index 1314db6fbc9..fc1da2247e3 100644 --- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerFactory.java +++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerFactory.java @@ -19,18 +19,73 @@ package org.apache.lucene.analysis.icu.segmentation; import java.io.Reader; import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.ClasspathResourceLoader; /** basic tests for {@link ICUTokenizerFactory} **/ public class TestICUTokenizerFactory extends BaseTokenStreamTestCase { public void testMixedText() throws Exception { Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี This is a test ກວ່າດອກ"); ICUTokenizerFactory factory = new ICUTokenizerFactory(); + factory.init(new HashMap()); + factory.inform(new ClasspathResourceLoader(getClass())); TokenStream stream = factory.create(reader); assertTokenStreamContents(stream, 
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "This", "is", "a", "test", "ກວ່າ", "ດອກ"}); } + + public void testTokenizeLatinOnWhitespaceOnly() throws Exception { + // “ U+201C LEFT DOUBLE QUOTATION MARK; ” U+201D RIGHT DOUBLE QUOTATION MARK + Reader reader = new StringReader + (" Don't,break.at?/(punct)! \u201Cnice\u201D\r\n\r\n85_At:all; `really\" +2=3$5,&813 !@#%$^)(*@#$ "); + ICUTokenizerFactory factory = new ICUTokenizerFactory(); + final Map args = new HashMap(); + args.put(ICUTokenizerFactory.RULEFILES, "Latn:Latin-break-only-on-whitespace.rbbi"); + factory.init(args); + factory.inform(new ClasspathResourceLoader(this.getClass())); + TokenStream stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] { "Don't,break.at?/(punct)!", "\u201Cnice\u201D", "85_At:all;", "`really\"", "+2=3$5,&813", "!@#%$^)(*@#$" }, + new String[] { "", "", "", "", "", "" }); + } + + public void testTokenizeLatinDontBreakOnHyphens() throws Exception { + Reader reader = new StringReader + ("One-two punch. Brang-, not brung-it. This one--not that one--is the right one, -ish."); + ICUTokenizerFactory factory = new ICUTokenizerFactory(); + final Map args = new HashMap(); + args.put(ICUTokenizerFactory.RULEFILES, "Latn:Latin-dont-break-on-hyphens.rbbi"); + factory.init(args); + factory.inform(new ClasspathResourceLoader(getClass())); + TokenStream stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] { "One-two", "punch", + "Brang", "not", "brung-it", + "This", "one", "not", "that", "one", "is", "the", "right", "one", "ish" }); + } + + /** + * Specify more than one script/rule file pair. + * Override default DefaultICUTokenizerConfig Thai script tokenization. + * Use the same rule file for both scripts. + */ + public void testKeywordTokenizeCyrillicAndThai() throws Exception { + Reader reader = new StringReader + ("Some English. Немного русский. 
ข้อความภาษาไทยเล็ก ๆ น้อย ๆ More English."); + ICUTokenizerFactory factory = new ICUTokenizerFactory(); + final Map args = new HashMap(); + args.put(ICUTokenizerFactory.RULEFILES, "Cyrl:KeywordTokenizer.rbbi,Thai:KeywordTokenizer.rbbi"); + factory.init(args); + factory.inform(new ClasspathResourceLoader(getClass())); + TokenStream stream = factory.create(reader); + assertTokenStreamContents(stream, new String[] { "Some", "English", + "Немного русский. ", + "ข้อความภาษาไทยเล็ก ๆ น้อย ๆ ", + "More", "English" }); + } }