SOLR-4123: Add per-script customizability to ICUTokenizerFactory via rule files in the ICU RuleBasedBreakIterator format.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1416617 13f79535-47bb-0310-9956-ffa450edef68
2012-12-03 18:21:18 +00:00 · 2012-12-03 18:21:18 +00:00 · 6024e1465e
parent 0c24fa2204
commit 6024e1465e
9 changed files with 380 additions and 5 deletions
--- a/dev-tools/eclipse/dot.classpath
+++ b/dev-tools/eclipse/dot.classpath
@ -25,6 +25,7 @@
  <classpathentry kind="src" path="lucene/analysis/icu/src/java"/>
  <classpathentry kind="src" output="eclipse-build/analysis-icu" path="lucene/analysis/icu/src/resources"/>
  <classpathentry kind="src" path="lucene/analysis/icu/src/test"/>
+  <classpathentry kind="src" path="lucene/analysis/icu/src/test-files"/>
  <classpathentry kind="src" path="lucene/analysis/kuromoji/src/java"/>
  <classpathentry kind="src" output="eclipse-build/analysis-kuromoji" path="lucene/analysis/kuromoji/src/resources"/>
  <classpathentry kind="src" path="lucene/analysis/kuromoji/src/test"/>
--- a/dev-tools/idea/lucene/analysis/icu/icu.iml
+++ b/dev-tools/idea/lucene/analysis/icu/icu.iml
@ -9,6 +9,7 @@
      <sourceFolder url="file://$MODULE_DIR$/src/tools/java" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />
      <sourceFolder url="file://$MODULE_DIR$/src/resources" isTestSource="false" />
+      <sourceFolder url="file://$MODULE_DIR$/src/test-files" isTestSource="true" />
    </content>
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -113,6 +113,10 @@ New Features
  be notified whenever a new searcher was opened. (selckin via Shai
  Erera, Mike McCandless)

+* SOLR-4123: Add per-script customizability to ICUTokenizerFactory via
+  rule files in the ICU RuleBasedBreakIterator format.
+  (Shawn Heisey, Robert Muir, Steve Rowe)
+
 API Changes

 * LUCENE-4399: Deprecated AppendingCodec. Lucene's term dictionaries
--- a/lucene/analysis/icu/build.xml
+++ b/lucene/analysis/icu/build.xml
@ -35,6 +35,11 @@
    <path refid="base.classpath"/>
  </path>

+  <path id="test.classpath">
+    <path refid="test.base.classpath" />
+    <pathelement path="src/test-files" />
+  </path>
+
  <target name="compile-core" depends="jar-analyzers-common, common.compile-core" />

  <property name="utr30.data.dir" location="src/data/utr30"/>
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
@ -17,22 +17,135 @@ package org.apache.lucene.analysis.icu.segmentation;
 * limitations under the License.
 */

+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
 import java.io.Reader;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;

 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
 import org.apache.lucene.analysis.util.AbstractAnalysisFactory; // javadocs
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
 import org.apache.lucene.analysis.util.TokenizerFactory;
+import org.apache.lucene.util.IOUtils;

-/** Factory for {@link ICUTokenizer} */
-public class ICUTokenizerFactory extends TokenizerFactory {
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UProperty;
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.RuleBasedBreakIterator;
+
+/**
+ * Factory for {@link ICUTokenizer}.
+ * Words are broken across script boundaries, then segmented according to
+ * the BreakIterator and typing provided by the {@link DefaultICUTokenizerConfig}.
+ *
+ * <p/>
+ *
+ * To use the default set of per-script rules:
+ *
+ * <pre class="prettyprint" >
+ * &lt;fieldType name="text_icu" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.ICUTokenizerFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ *
+ * <p/>
+ *
+ * You can customize this tokenizer's behavior by specifying per-script rule files,
+ * which are compiled by the ICU RuleBasedBreakIterator.  See the
+ * <a href="http://userguide.icu-project.org/boundaryanalysis#TOC-RBBI-Rules"
+ * >ICU RuleBasedBreakIterator syntax reference</a>.
+ *
+ * To add per-script rules, add a "rulefiles" argument, which should contain a
+ * comma-separated list of <tt>code:rulefile</tt> pairs in the following format:
+ * <a href="http://unicode.org/iso15924/iso15924-codes.html"
+ * >four-letter ISO 15924 script code</a>, followed by a colon, then a resource
+ * path.  E.g. to specify rules for Latin (script code "Latn") and Cyrillic
+ * (script code "Cyrl"):
+ *
+ * <pre class="prettyprint" >
+ * &lt;fieldType name="text_icu_custom" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.ICUTokenizerFactory"
+ *                rulefiles="Latn:my.Latin.rules.rbbi,Cyrl:my.Cyrillic.rules.rbbi"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ *
+ */
+public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware {
+  static final String RULEFILES = "rulefiles";
+  private Map<Integer,String> tailored;
+  private ICUTokenizerConfig config;
  
  /** Sole constructor. See {@link AbstractAnalysisFactory} for initialization lifecycle. */
  public ICUTokenizerFactory() {}

-  // TODO: add support for custom configs
+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
+    tailored = new HashMap<Integer,String>();
+    String rulefilesArg = args.get(RULEFILES);
+    if (rulefilesArg != null) {
+      List<String> scriptAndResourcePaths = splitFileNames(rulefilesArg);
+      for (String scriptAndResourcePath : scriptAndResourcePaths) {
+        int colonPos = scriptAndResourcePath.indexOf(":");
+        String scriptCode = scriptAndResourcePath.substring(0, colonPos).trim();
+        String resourcePath = scriptAndResourcePath.substring(colonPos+1).trim();
+        tailored.put(UCharacter.getPropertyValueEnum(UProperty.SCRIPT, scriptCode), resourcePath);
+      }
+    }
+  }
+
+  @Override
+  public void inform(ResourceLoader loader) throws IOException {
+    assert tailored != null : "init must be called first!";
+    if (tailored.isEmpty()) {
+      config = new DefaultICUTokenizerConfig();
+    } else {
+      final BreakIterator breakers[] = new BreakIterator[UScript.CODE_LIMIT];
+      for (Map.Entry<Integer,String> entry : tailored.entrySet()) {
+        int code = entry.getKey();
+        String resourcePath = entry.getValue();
+        breakers[code] = parseRules(resourcePath, loader);
+      }
+      config = new DefaultICUTokenizerConfig() {
+        
+        @Override
+        public BreakIterator getBreakIterator(int script) {
+          if (breakers[script] != null) {
+            return (BreakIterator) breakers[script].clone();
+          } else {
+            return super.getBreakIterator(script);
+          }
+        }
+        // TODO: we could also allow codes->types mapping
+      };
+    }
+  }
+  
+  private BreakIterator parseRules(String filename, ResourceLoader loader) throws IOException {
+    StringBuilder rules = new StringBuilder();
+    InputStream rulesStream = loader.openResource(filename);
+    BufferedReader reader = new BufferedReader
+        (IOUtils.getDecodingReader(rulesStream, IOUtils.CHARSET_UTF_8));
+    String line = null;
+    while ((line = reader.readLine()) != null) {
+      if ( ! line.startsWith("#"))
+        rules.append(line);
+      rules.append('\n');
+    }
+    reader.close();
+    return new RuleBasedBreakIterator(rules.toString());
+  }
+
  @Override
  public Tokenizer create(Reader input) {
-    return new ICUTokenizer(input);
+    assert config != null : "inform must be called first!";
+    return new ICUTokenizer(input, config);
  }
 }
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/KeywordTokenizer.rbbi
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/KeywordTokenizer.rbbi
@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# RBBI Keyword tokenizer: keep everything as a single token.
+
+# Apply rule status {200}=RBBI.WORD_LETTER, which is mapped
+# to <ALPHANUM> token type by DefaultICUTokenizerConfig.
+.+ {200};
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/Latin-break-only-on-whitespace.rbbi
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/Latin-break-only-on-whitespace.rbbi
@ -0,0 +1,40 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Break only on whitespace; assign token type from set { <ALPHANUM>, <NUM>, <OTHER> }
+#
+
+!!forward;
+
+$Whitespace = [\p{Whitespace}];
+$NonWhitespace = [\P{Whitespace}];
+$Letter = [\p{Letter}];
+$Number = [\p{Number}];
+
+# Default rule status is {0}=RBBI.WORD_NONE => not tokenized by ICUTokenizer
+$Whitespace;
+
+# Assign rule status {200}=RBBI.WORD_LETTER when the token contains a letter char
+# Mapped to <ALPHANUM> token type by DefaultICUTokenizerConfig
+$NonWhitespace* $Letter $NonWhitespace*   {200};
+
+# Assign rule status {100}=RBBI.WORD_NUM when the token contains a numeric char
+# Mapped to <NUM> token type by DefaultICUTokenizerConfig
+$NonWhitespace* $Number $NonWhitespace*   {100};
+
+# Assign rule status {1} (no RBBI equivalent) when the token contains neither a letter nor a numeric char
+# Mapped to <OTHER> token type by DefaultICUTokenizerConfig
+$NonWhitespace+   {1};
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/Latin-dont-break-on-hyphens.rbbi
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/Latin-dont-break-on-hyphens.rbbi
@ -0,0 +1,135 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Based on Default.rbbi, the default RBBI rules, based on UAX#29.
+# Added dashes to $MidLetter, so that words aren't broken on single dashes.
+#
+
+!!chain;
+
+#
+#  Character Class Definitions.
+#
+
+$CR           = [\p{Word_Break = CR}];
+$LF           = [\p{Word_Break = LF}];
+$Newline      = [\p{Word_Break = Newline}];
+$Extend       = [\p{Word_Break = Extend}];
+$Format       = [\p{Word_Break = Format}];
+$Katakana     = [\p{Word_Break = Katakana}];
+$ALetter      = [\p{Word_Break = ALetter}];
+$MidNumLet    = [\p{Word_Break = MidNumLet}];
+# Don't use [:Dash:] here - it contains lots of chars that should continue to trigger word breaks
+$Dash         = [\N{HYPHEN-MINUS}
+                 \N{HYPHEN}
+                 \N{EN DASH}
+                 \N{MINUS SIGN}
+                 \N{SMALL HYPHEN-MINUS}
+                 \N{FULLWIDTH HYPHEN-MINUS}];
+$MidLetter    = [\p{Word_Break = MidLetter}$Dash]; # Don't break on (single) hyphen
+$MidNum       = [\p{Word_Break = MidNum}];
+$Numeric      = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
+$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
+
+
+#   Dictionary character set, for triggering language-based break engines. Currently
+#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
+#   5.0 or later as the definition of Complex_Context was corrected to include all
+#   characters requiring dictionary break.
+
+$dictionary   = [:LineBreak = Complex_Context:];
+$Control        = [\p{Grapheme_Cluster_Break = Control}];
+$ALetterPlus  = [$ALetter [$dictionary-$Extend-$Control]];   # Note:  default ALetter does not
+                                                             #  include the dictionary characters.
+
+#
+#  Rules 4    Ignore Format and Extend characters,
+#             except when they appear at the beginning of a region of text.
+#
+$KatakanaEx     = $Katakana     ($Extend |  $Format)*;
+$ALetterEx      = $ALetterPlus  ($Extend |  $Format)*;
+$MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
+$MidLetterEx    = $MidLetter    ($Extend |  $Format)*;
+$MidNumEx       = $MidNum       ($Extend |  $Format)*;
+$NumericEx      = $Numeric      ($Extend |  $Format)*;
+$ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
+
+$Hiragana       = [\p{script=Hiragana}];
+$Ideographic    = [\p{Ideographic}];
+$HiraganaEx     = $Hiragana     ($Extend |  $Format)*;
+$IdeographicEx  = $Ideographic  ($Extend |  $Format)*;
+
+## -------------------------------------------------
+
+!!forward;
+
+
+# Rule 3 - CR x LF
+#
+$CR $LF;
+
+# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
+#          of a region of Text.   The rule here comes into play when the start of text
+#          begins with a group of Format chars, or with a "word" consisting of a single
+#          char that is not in any of the listed word break categories followed by
+#          format char(s).
+[^$CR $LF $Newline]? ($Extend |  $Format)+;
+
+$NumericEx {100};
+$ALetterEx {200};
+$KatakanaEx {300};       # note:  these status values override those from rule 5
+$HiraganaEx {300};       #        by virtual of being numerically larger.
+$IdeographicEx {400};    #
+
+#
+# rule 5
+#    Do not break between most letters.
+#
+$ALetterEx $ALetterEx {200};
+
+# rule 6 and 7
+$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
+
+# rule 8
+
+$NumericEx $NumericEx {100};
+
+# rule 9
+
+$ALetterEx $NumericEx {200};
+
+# rule 10
+
+$NumericEx $ALetterEx {200};
+
+# rule 11 and 12
+
+$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
+
+# rule 13
+
+$KatakanaEx  $KatakanaEx {300};
+
+# rule 13a/b
+
+$ALetterEx      $ExtendNumLetEx {200};    #  (13a)
+$NumericEx      $ExtendNumLetEx {100};    #  (13a)
+$KatakanaEx     $ExtendNumLetEx {300};    #  (13a)
+$ExtendNumLetEx $ExtendNumLetEx {200};    #  (13a)
+
+$ExtendNumLetEx $ALetterEx  {200};    #  (13b)
+$ExtendNumLetEx $NumericEx  {100};    #  (13b)
+$ExtendNumLetEx $KatakanaEx {300};    #  (13b)
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerFactory.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerFactory.java
@ -19,18 +19,73 @@ package org.apache.lucene.analysis.icu.segmentation;

 import java.io.Reader;
 import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;

 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.ClasspathResourceLoader;

 /** basic tests for {@link ICUTokenizerFactory} **/
 public class TestICUTokenizerFactory extends BaseTokenStreamTestCase {
  public void testMixedText() throws Exception {
    Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี  This is a test ກວ່າດອກ");
    ICUTokenizerFactory factory = new ICUTokenizerFactory();
+    factory.init(new HashMap<String,String>());
+    factory.inform(new ClasspathResourceLoader(getClass()));
    TokenStream stream = factory.create(reader);
    assertTokenStreamContents(stream,
        new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี",
        "This", "is", "a", "test", "ກວ່າ", "ດອກ"});
  }
+
+  public void testTokenizeLatinOnWhitespaceOnly() throws Exception {
+    // “ U+201C LEFT DOUBLE QUOTATION MARK; ” U+201D RIGHT DOUBLE QUOTATION MARK
+    Reader reader = new StringReader
+        ("  Don't,break.at?/(punct)!  \u201Cnice\u201D\r\n\r\n85_At:all; `really\" +2=3$5,&813 !@#%$^)(*@#$   ");
+    ICUTokenizerFactory factory = new ICUTokenizerFactory();
+    final Map<String,String> args = new HashMap<String,String>();
+    args.put(ICUTokenizerFactory.RULEFILES, "Latn:Latin-break-only-on-whitespace.rbbi");
+    factory.init(args);
+    factory.inform(new ClasspathResourceLoader(this.getClass()));
+    TokenStream stream = factory.create(reader);
+    assertTokenStreamContents(stream,
+        new String[] { "Don't,break.at?/(punct)!", "\u201Cnice\u201D", "85_At:all;", "`really\"",  "+2=3$5,&813", "!@#%$^)(*@#$" },
+        new String[] { "<ALPHANUM>",               "<ALPHANUM>",       "<ALPHANUM>", "<ALPHANUM>", "<NUM>",       "<OTHER>" });
+  }
+
+  public void testTokenizeLatinDontBreakOnHyphens() throws Exception {
+    Reader reader = new StringReader
+        ("One-two punch.  Brang-, not brung-it.  This one--not that one--is the right one, -ish.");
+    ICUTokenizerFactory factory = new ICUTokenizerFactory();
+    final Map<String,String> args = new HashMap<String,String>();
+    args.put(ICUTokenizerFactory.RULEFILES, "Latn:Latin-dont-break-on-hyphens.rbbi");
+    factory.init(args);
+    factory.inform(new ClasspathResourceLoader(getClass()));
+    TokenStream stream = factory.create(reader);
+    assertTokenStreamContents(stream,
+        new String[] { "One-two", "punch",
+            "Brang", "not", "brung-it",
+            "This", "one", "not", "that", "one", "is", "the", "right", "one", "ish" });
+  }
+
+  /**
+   * Specify more than one script/rule file pair.
+   * Override default DefaultICUTokenizerConfig Thai script tokenization.
+   * Use the same rule file for both scripts.
+   */
+  public void testKeywordTokenizeCyrillicAndThai() throws Exception {
+    Reader reader = new StringReader
+        ("Some English.  Немного русский.  ข้อความภาษาไทยเล็ก ๆ น้อย ๆ  More English.");
+    ICUTokenizerFactory factory = new ICUTokenizerFactory();
+    final Map<String,String> args = new HashMap<String,String>();
+    args.put(ICUTokenizerFactory.RULEFILES, "Cyrl:KeywordTokenizer.rbbi,Thai:KeywordTokenizer.rbbi");
+    factory.init(args);
+    factory.inform(new ClasspathResourceLoader(getClass()));
+    TokenStream stream = factory.create(reader);
+    assertTokenStreamContents(stream, new String[] { "Some", "English",
+        "Немного русский.  ",
+        "ข้อความภาษาไทยเล็ก ๆ น้อย ๆ  ",
+        "More", "English" });
+  }
 }