mirror of https://github.com/apache/lucene.git
LUCENE-2653: ThaiAnalyzer assumes things about your jre
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@998684 13f79535-47bb-0310-9956-ffa450edef68
parent 68776ee5d6
commit c84bd2f1ec
CHANGES.txt
@@ -122,6 +122,10 @@ Bug fixes
   you to customize its normalization/folding, by editing the source data files in src/data
   and regenerating a new .nrm with 'ant gennorm2'.  (David Bowen via Robert Muir)
 
+* LUCENE-2653: ThaiWordFilter depends on the JRE having a Thai dictionary, which is not
+  always the case. If the dictionary is unavailable, the filter will now throw
+  UnsupportedOperationException in the constructor.  (Robert Muir)
+
 API Changes
 
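For applications that must run on any JRE, the new ThaiWordFilter.DBBI_AVAILABLE flag can be checked up front instead of catching the constructor's exception. A minimal sketch, not part of this commit; the helper class and the StandardAnalyzer fallback are illustrative:

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.th.ThaiAnalyzer;
    import org.apache.lucene.analysis.th.ThaiWordFilter;
    import org.apache.lucene.util.Version;

    public class AnalyzerSelector {
      /** Prefer Thai analysis, but degrade gracefully on JREs without a Thai dictionary. */
      public static Analyzer forThaiText(Version matchVersion) {
        if (ThaiWordFilter.DBBI_AVAILABLE) {
          return new ThaiAnalyzer(matchVersion);
        }
        // No dictionary-based Thai BreakIterator in this JRE; ThaiWordFilter's
        // constructor would throw UnsupportedOperationException, so fall back.
        return new StandardAnalyzer(matchVersion);
      }
    }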
ThaiWordFilter.java
@@ -38,10 +38,24 @@ import org.apache.lucene.util.Version;
  * {@link ThaiAnalyzer} will insert a {@link LowerCaseFilter} before this filter
  * so the behaviour of the Analyzer does not change. With version 3.1, the filter handles
  * position increments correctly.
+ * <p>WARNING: this filter may not be supported by all JREs.
+ * It is known to work with Sun/Oracle and Harmony JREs.
+ * If your application needs to be fully portable, consider using ICUTokenizer instead,
+ * which uses an ICU Thai BreakIterator that will always be available.
  */
 public final class ThaiWordFilter extends TokenFilter {
-  private final BreakIterator breaker = BreakIterator.getWordInstance(new Locale("th"));
+  /**
+   * True if the JRE supports a working dictionary-based breakiterator for Thai.
+   * If this is false, this filter will not work at all!
+   */
+  public static final boolean DBBI_AVAILABLE;
+  private static final BreakIterator proto = BreakIterator.getWordInstance(new Locale("th"));
+  static {
+    // check that we have a working dictionary-based break iterator for thai
+    proto.setText("ภาษาไทย");
+    DBBI_AVAILABLE = proto.isBoundary(4);
+  }
+  private final BreakIterator breaker = (BreakIterator) proto.clone();
   private final Segment charIterator = new Segment();
 
   private final boolean handlePosIncr;

@@ -67,6 +81,8 @@ public final class ThaiWordFilter extends TokenFilter {
   public ThaiWordFilter(Version matchVersion, TokenStream input) {
     super(matchVersion.onOrAfter(Version.LUCENE_31) ?
       input : new LowerCaseFilter(matchVersion, input));
+    if (!DBBI_AVAILABLE)
+      throw new UnsupportedOperationException("This JRE does not have support for Thai segmentation");
     handlePosIncr = matchVersion.onOrAfter(Version.LUCENE_31);
   }
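The static block above is a self-contained probe: BreakIterator.getWordInstance(new Locale("th")) silently falls back to a non-dictionary iterator on JREs without Thai data, and only a dictionary-based iterator finds the word boundary at offset 4 inside "ภาษาไทย" (between "ภาษา" and "ไทย"). The same check can be run standalone, outside Lucene:

    import java.text.BreakIterator;
    import java.util.Locale;

    public class ThaiBreakProbe {
      public static void main(String[] args) {
        BreakIterator bi = BreakIterator.getWordInstance(new Locale("th"));
        bi.setText("ภาษาไทย");
        // A dictionary-based iterator reports a boundary at offset 4;
        // a fallback iterator treats the whole run as one chunk.
        System.out.println("Thai dictionary available: " + bi.isBoundary(4));
      }
    }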
TestThaiAnalyzer.java
@@ -18,6 +18,7 @@ package org.apache.lucene.analysis.th;
  */
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.junit.Assume;
 
 /**
  * Test case for ThaiAnalyzer, modified from TestFrenchAnalyzer

@@ -31,6 +32,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
    * testcase for offsets
    */
   public void testOffsets() throws Exception {
+    Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE);
     assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี",
       new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
       new int[] { 0, 3, 6, 9, 13, 17, 20, 23 },

@@ -49,6 +51,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
    * Instead, allow the definition of alphanum to include relevant categories like nonspacing marks!
    */
   public void testBuggyTokenType() throws Exception {
+    Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE);
     assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
       new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },
       new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",

@@ -65,6 +68,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
    */
 
   public void testAnalyzer() throws Exception {
+    Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE);
     ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
 
     assertAnalyzesTo(analyzer, "", new String[] {});

@@ -90,6 +94,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
    * Test that position increments are adjusted correctly for stopwords.
    */
   public void testPositionIncrements() throws Exception {
+    Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE);
     ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
 
     assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้อง the แสดงว่างานดี",

@@ -107,6 +112,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
   }
 
   public void testReusableTokenStream() throws Exception {
+    Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE);
     ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
     assertAnalyzesToReuse(analyzer, "", new String[] {});
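The guard added to each test uses JUnit 4's Assume: when the condition is false, the test is reported as skipped rather than failed, so the suite still passes on JREs without Thai segmentation. A minimal sketch of the pattern, with an illustrative test name:

    import org.apache.lucene.analysis.th.ThaiWordFilter;
    import org.junit.Assume;
    import org.junit.Test;

    public class ThaiAssumptionSketch {
      @Test
      public void requiresThaiSegmentation() {
        // Skipped, not failed, on JREs lacking a Thai dictionary.
        Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE);
        // ... assertions that exercise ThaiAnalyzer/ThaiWordFilter ...
      }
    }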
TestThaiWordFilterFactory.java
@@ -23,6 +23,8 @@ import java.io.StringReader;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.th.ThaiWordFilter;
+import org.junit.Assume;
 
 /**
  * Simple tests to ensure the Thai word filter factory is working.

@@ -32,6 +34,7 @@ public class TestThaiWordFilterFactory extends BaseTokenTestCase {
    * Ensure the filter actually decomposes text.
    */
   public void testWordBreak() throws Exception {
+    Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE);
     Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี");
     Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     ThaiWordFilterFactory factory = new ThaiWordFilterFactory();