diff --git a/dev-tools/eclipse/dot.classpath b/dev-tools/eclipse/dot.classpath
index 21a09761e44..9c2d3ae4ed5 100644
--- a/dev-tools/eclipse/dot.classpath
+++ b/dev-tools/eclipse/dot.classpath
@@ -25,6 +25,7 @@
+
diff --git a/dev-tools/idea/lucene/analysis/icu/icu.iml b/dev-tools/idea/lucene/analysis/icu/icu.iml
index a3f7d9fd216..fc3e6a09d95 100644
--- a/dev-tools/idea/lucene/analysis/icu/icu.iml
+++ b/dev-tools/idea/lucene/analysis/icu/icu.iml
@@ -9,6 +9,7 @@
+
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 21839ac8650..c70c4db850f 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -113,6 +113,10 @@ New Features
be notified whenever a new searcher was opened. (selckin via Shai
Erera, Mike McCandless)
+* SOLR-4123: Add per-script customizability to ICUTokenizerFactory via
+ rule files in the ICU RuleBasedBreakIterator format.
+ (Shawn Heisey, Robert Muir, Steve Rowe)
+
API Changes
* LUCENE-4399: Deprecated AppendingCodec. Lucene's term dictionaries
diff --git a/lucene/analysis/icu/build.xml b/lucene/analysis/icu/build.xml
index 5e3578ccea0..61a5a36d9ee 100644
--- a/lucene/analysis/icu/build.xml
+++ b/lucene/analysis/icu/build.xml
@@ -35,6 +35,11 @@
+
+
+
+
+
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
index 7ed1ef7a343..a9345d5fe68 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
@@ -17,22 +17,135 @@ package org.apache.lucene.analysis.icu.segmentation;
* limitations under the License.
*/
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
import java.io.Reader;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory; // javadocs
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenizerFactory;
+import org.apache.lucene.util.IOUtils;
-/** Factory for {@link ICUTokenizer} */
-public class ICUTokenizerFactory extends TokenizerFactory {
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UProperty;
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.RuleBasedBreakIterator;
+
+/**
+ * Factory for {@link ICUTokenizer}.
+ * Words are broken across script boundaries, then segmented according to
+ * the BreakIterator and typing provided by the {@link DefaultICUTokenizerConfig}.
+ *
+ *
+ *
+ * To use the default set of per-script rules:
+ *
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_icu" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.ICUTokenizerFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ *
+ *
+ *
+ * You can customize this tokenizer's behavior by specifying per-script rule files,
+ * which are compiled by the ICU RuleBasedBreakIterator.  See the
+ * <a href="http://userguide.icu-project.org/boundaryanalysis#TOC-RBBI-Rules"
+ * >ICU RuleBasedBreakIterator syntax reference</a>.
+ *
+ * To add per-script rules, add a "rulefiles" argument, which should contain a
+ * comma-separated list of code:rulefile pairs in the following format:
+ * four-letter ISO 15924 script code, followed by a colon, then a resource
+ * path. E.g. to specify rules for Latin (script code "Latn") and Cyrillic
+ * (script code "Cyrl"):
+ *
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_icu_custom" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.ICUTokenizerFactory"
+ *                rulefiles="Latn:my.Latin.rules.rbbi,Cyrl:my.Cyrillic.rules.rbbi"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ *
+ */
+public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware {
+ static final String RULEFILES = "rulefiles";
+ private Map tailored;
+ private ICUTokenizerConfig config;
/** Sole constructor. See {@link AbstractAnalysisFactory} for initialization lifecycle. */
public ICUTokenizerFactory() {}
- // TODO: add support for custom configs
+ @Override
+  public void init(Map<String,String> args) {
+ super.init(args);
+    tailored = new HashMap<Integer,String>();
+ String rulefilesArg = args.get(RULEFILES);
+ if (rulefilesArg != null) {
+      List<String> scriptAndResourcePaths = splitFileNames(rulefilesArg);
+ for (String scriptAndResourcePath : scriptAndResourcePaths) {
+ int colonPos = scriptAndResourcePath.indexOf(":");
+ String scriptCode = scriptAndResourcePath.substring(0, colonPos).trim();
+ String resourcePath = scriptAndResourcePath.substring(colonPos+1).trim();
+ tailored.put(UCharacter.getPropertyValueEnum(UProperty.SCRIPT, scriptCode), resourcePath);
+ }
+ }
+ }
+
+ @Override
+ public void inform(ResourceLoader loader) throws IOException {
+ assert tailored != null : "init must be called first!";
+ if (tailored.isEmpty()) {
+ config = new DefaultICUTokenizerConfig();
+ } else {
+ final BreakIterator breakers[] = new BreakIterator[UScript.CODE_LIMIT];
+      for (Map.Entry<Integer,String> entry : tailored.entrySet()) {
+ int code = entry.getKey();
+ String resourcePath = entry.getValue();
+ breakers[code] = parseRules(resourcePath, loader);
+ }
+ config = new DefaultICUTokenizerConfig() {
+
+ @Override
+ public BreakIterator getBreakIterator(int script) {
+ if (breakers[script] != null) {
+ return (BreakIterator) breakers[script].clone();
+ } else {
+ return super.getBreakIterator(script);
+ }
+ }
+ // TODO: we could also allow codes->types mapping
+ };
+ }
+ }
+
+ private BreakIterator parseRules(String filename, ResourceLoader loader) throws IOException {
+ StringBuilder rules = new StringBuilder();
+ InputStream rulesStream = loader.openResource(filename);
+ BufferedReader reader = new BufferedReader
+ (IOUtils.getDecodingReader(rulesStream, IOUtils.CHARSET_UTF_8));
+ String line = null;
+ while ((line = reader.readLine()) != null) {
+ if ( ! line.startsWith("#"))
+ rules.append(line);
+ rules.append('\n');
+ }
+ reader.close();
+ return new RuleBasedBreakIterator(rules.toString());
+ }
+
@Override
public Tokenizer create(Reader input) {
- return new ICUTokenizer(input);
+ assert config != null : "inform must be called first!";
+ return new ICUTokenizer(input, config);
}
}
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/KeywordTokenizer.rbbi b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/KeywordTokenizer.rbbi
new file mode 100644
index 00000000000..8e6de8aa94a
--- /dev/null
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/KeywordTokenizer.rbbi
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# RBBI Keyword tokenizer: keep everything as a single token.
+
+# Apply rule status {200}=RBBI.WORD_LETTER, which is mapped
+# to token type by DefaultICUTokenizerConfig.
+.+ {200};
\ No newline at end of file
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/Latin-break-only-on-whitespace.rbbi b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/Latin-break-only-on-whitespace.rbbi
new file mode 100644
index 00000000000..2d6d9bea021
--- /dev/null
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/Latin-break-only-on-whitespace.rbbi
@@ -0,0 +1,40 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Break only on whitespace; assign token type from set { <ALPHANUM>, <NUM>, <OTHER> }
+#
+
+!!forward;
+
+$Whitespace = [\p{Whitespace}];
+$NonWhitespace = [\P{Whitespace}];
+$Letter = [\p{Letter}];
+$Number = [\p{Number}];
+
+# Default rule status is {0}=RBBI.WORD_NONE => not tokenized by ICUTokenizer
+$Whitespace;
+
+# Assign rule status {200}=RBBI.WORD_LETTER when the token contains a letter char
+# Mapped to token type by DefaultICUTokenizerConfig
+$NonWhitespace* $Letter $NonWhitespace* {200};
+
+# Assign rule status {100}=RBBI.WORD_NUM when the token contains a numeric char
+# Mapped to token type by DefaultICUTokenizerConfig
+$NonWhitespace* $Number $NonWhitespace* {100};
+
+# Assign rule status {1} (no RBBI equivalent) when the token contains neither a letter nor a numeric char
+# Mapped to token type by DefaultICUTokenizerConfig
+$NonWhitespace+ {1};
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/Latin-dont-break-on-hyphens.rbbi b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/Latin-dont-break-on-hyphens.rbbi
new file mode 100644
index 00000000000..0a4f0686a4f
--- /dev/null
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/Latin-dont-break-on-hyphens.rbbi
@@ -0,0 +1,135 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Based on Default.rbbi, the default RBBI rules, based on UAX#29.
+# Added dashes to $MidLetter, so that words aren't broken on single dashes.
+#
+
+!!chain;
+
+#
+# Character Class Definitions.
+#
+
+$CR = [\p{Word_Break = CR}];
+$LF = [\p{Word_Break = LF}];
+$Newline = [\p{Word_Break = Newline}];
+$Extend = [\p{Word_Break = Extend}];
+$Format = [\p{Word_Break = Format}];
+$Katakana = [\p{Word_Break = Katakana}];
+$ALetter = [\p{Word_Break = ALetter}];
+$MidNumLet = [\p{Word_Break = MidNumLet}];
+# Don't use [:Dash:] here - it contains lots of chars that should continue to trigger word breaks
+$Dash = [\N{HYPHEN-MINUS}
+ \N{HYPHEN}
+ \N{EN DASH}
+ \N{MINUS SIGN}
+ \N{SMALL HYPHEN-MINUS}
+ \N{FULLWIDTH HYPHEN-MINUS}];
+$MidLetter = [\p{Word_Break = MidLetter}$Dash]; # Don't break on (single) hyphen
+$MidNum = [\p{Word_Break = MidNum}];
+$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
+$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
+
+
+# Dictionary character set, for triggering language-based break engines. Currently
+# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
+# 5.0 or later as the definition of Complex_Context was corrected to include all
+# characters requiring dictionary break.
+
+$dictionary = [:LineBreak = Complex_Context:];
+$Control = [\p{Grapheme_Cluster_Break = Control}];
+$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: default ALetter does not
+ # include the dictionary characters.
+
+#
+# Rules 4 Ignore Format and Extend characters,
+# except when they appear at the beginning of a region of text.
+#
+$KatakanaEx = $Katakana ($Extend | $Format)*;
+$ALetterEx = $ALetterPlus ($Extend | $Format)*;
+$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
+$MidLetterEx = $MidLetter ($Extend | $Format)*;
+$MidNumEx = $MidNum ($Extend | $Format)*;
+$NumericEx = $Numeric ($Extend | $Format)*;
+$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
+
+$Hiragana = [\p{script=Hiragana}];
+$Ideographic = [\p{Ideographic}];
+$HiraganaEx = $Hiragana ($Extend | $Format)*;
+$IdeographicEx = $Ideographic ($Extend | $Format)*;
+
+## -------------------------------------------------
+
+!!forward;
+
+
+# Rule 3 - CR x LF
+#
+$CR $LF;
+
+# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
+# of a region of Text. The rule here comes into play when the start of text
+# begins with a group of Format chars, or with a "word" consisting of a single
+# char that is not in any of the listed word break categories followed by
+# format char(s).
+[^$CR $LF $Newline]? ($Extend | $Format)+;
+
+$NumericEx {100};
+$ALetterEx {200};
+$KatakanaEx {300}; # note: these status values override those from rule 5
+$HiraganaEx {300}; # by virtue of being numerically larger.
+$IdeographicEx {400}; #
+
+#
+# rule 5
+# Do not break between most letters.
+#
+$ALetterEx $ALetterEx {200};
+
+# rule 6 and 7
+$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
+
+# rule 8
+
+$NumericEx $NumericEx {100};
+
+# rule 9
+
+$ALetterEx $NumericEx {200};
+
+# rule 10
+
+$NumericEx $ALetterEx {200};
+
+# rule 11 and 12
+
+$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
+
+# rule 13
+
+$KatakanaEx $KatakanaEx {300};
+
+# rule 13a/b
+
+$ALetterEx $ExtendNumLetEx {200}; # (13a)
+$NumericEx $ExtendNumLetEx {100}; # (13a)
+$KatakanaEx $ExtendNumLetEx {300}; # (13a)
+$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
+
+$ExtendNumLetEx $ALetterEx {200}; # (13b)
+$ExtendNumLetEx $NumericEx {100}; # (13b)
+$ExtendNumLetEx $KatakanaEx {300}; # (13b)
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerFactory.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerFactory.java
index 1314db6fbc9..fc1da2247e3 100644
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerFactory.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerFactory.java
@@ -19,18 +19,73 @@ package org.apache.lucene.analysis.icu.segmentation;
import java.io.Reader;
import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.ClasspathResourceLoader;
/** basic tests for {@link ICUTokenizerFactory} **/
public class TestICUTokenizerFactory extends BaseTokenStreamTestCase {
public void testMixedText() throws Exception {
Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี This is a test ກວ່າດອກ");
ICUTokenizerFactory factory = new ICUTokenizerFactory();
+    factory.init(new HashMap<String,String>());
+ factory.inform(new ClasspathResourceLoader(getClass()));
TokenStream stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี",
"This", "is", "a", "test", "ກວ່າ", "ດອກ"});
}
+
+ public void testTokenizeLatinOnWhitespaceOnly() throws Exception {
+ // “ U+201C LEFT DOUBLE QUOTATION MARK; ” U+201D RIGHT DOUBLE QUOTATION MARK
+ Reader reader = new StringReader
+ (" Don't,break.at?/(punct)! \u201Cnice\u201D\r\n\r\n85_At:all; `really\" +2=3$5,&813 !@#%$^)(*@#$ ");
+ ICUTokenizerFactory factory = new ICUTokenizerFactory();
+    final Map<String,String> args = new HashMap<String,String>();
+ args.put(ICUTokenizerFactory.RULEFILES, "Latn:Latin-break-only-on-whitespace.rbbi");
+ factory.init(args);
+ factory.inform(new ClasspathResourceLoader(this.getClass()));
+ TokenStream stream = factory.create(reader);
+ assertTokenStreamContents(stream,
+ new String[] { "Don't,break.at?/(punct)!", "\u201Cnice\u201D", "85_At:all;", "`really\"", "+2=3$5,&813", "!@#%$^)(*@#$" },
+      new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<OTHER>" });
+ }
+
+ public void testTokenizeLatinDontBreakOnHyphens() throws Exception {
+ Reader reader = new StringReader
+ ("One-two punch. Brang-, not brung-it. This one--not that one--is the right one, -ish.");
+ ICUTokenizerFactory factory = new ICUTokenizerFactory();
+    final Map<String,String> args = new HashMap<String,String>();
+ args.put(ICUTokenizerFactory.RULEFILES, "Latn:Latin-dont-break-on-hyphens.rbbi");
+ factory.init(args);
+ factory.inform(new ClasspathResourceLoader(getClass()));
+ TokenStream stream = factory.create(reader);
+ assertTokenStreamContents(stream,
+ new String[] { "One-two", "punch",
+ "Brang", "not", "brung-it",
+ "This", "one", "not", "that", "one", "is", "the", "right", "one", "ish" });
+ }
+
+ /**
+ * Specify more than one script/rule file pair.
+ * Override default DefaultICUTokenizerConfig Thai script tokenization.
+ * Use the same rule file for both scripts.
+ */
+ public void testKeywordTokenizeCyrillicAndThai() throws Exception {
+ Reader reader = new StringReader
+ ("Some English. Немного русский. ข้อความภาษาไทยเล็ก ๆ น้อย ๆ More English.");
+ ICUTokenizerFactory factory = new ICUTokenizerFactory();
+    final Map<String,String> args = new HashMap<String,String>();
+ args.put(ICUTokenizerFactory.RULEFILES, "Cyrl:KeywordTokenizer.rbbi,Thai:KeywordTokenizer.rbbi");
+ factory.init(args);
+ factory.inform(new ClasspathResourceLoader(getClass()));
+ TokenStream stream = factory.create(reader);
+ assertTokenStreamContents(stream, new String[] { "Some", "English",
+ "Немного русский. ",
+ "ข้อความภาษาไทยเล็ก ๆ น้อย ๆ ",
+ "More", "English" });
+ }
}