diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index a2314f1065b..182e0df6762 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -122,6 +122,10 @@ Bug fixes you to customize its normalization/folding, by editing the source data files in src/data and regenerating a new .nrm with 'ant gennorm2'. (David Bowen via Robert Muir) +* LUCENE-2653: ThaiWordFilter depends on the JRE having a Thai dictionary, which is not + always the case. If the dictionary is unavailable, the filter will now throw + UnsupportedOperationException in the constructor. (Robert Muir) + API Changes diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java index 9751c1ac147..2f6caf95cd9 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java @@ -38,10 +38,24 @@ import org.apache.lucene.util.Version; * {@link ThaiAnalyzer} will insert a {@link LowerCaseFilter} before this filter * so the behaviour of the Analyzer does not change. With version 3.1, the filter handles * position increments correctly. + *
WARNING: this filter may not be supported by all JREs.
+ * It is known to work with Sun/Oracle and Harmony JREs.
+ * If your application needs to be fully portable, consider using ICUTokenizer instead,
+ * which uses an ICU Thai BreakIterator that will always be available.
*/
public final class ThaiWordFilter extends TokenFilter {
-
- private final BreakIterator breaker = BreakIterator.getWordInstance(new Locale("th"));
+ /**
+ * True if the JRE supports a working dictionary-based breakiterator for Thai.
+ * If this is false, this filter will not work at all!
+ */
+ public static final boolean DBBI_AVAILABLE;
+ private static final BreakIterator proto = BreakIterator.getWordInstance(new Locale("th"));
+ static {
+ // check that we have a working dictionary-based break iterator for thai
+ proto.setText("ภาษาไทย");
+ DBBI_AVAILABLE = proto.isBoundary(4);
+ }
+ private final BreakIterator breaker = (BreakIterator) proto.clone();
private final Segment charIterator = new Segment();
private final boolean handlePosIncr;
@@ -67,6 +81,8 @@ public final class ThaiWordFilter extends TokenFilter {
public ThaiWordFilter(Version matchVersion, TokenStream input) {
super(matchVersion.onOrAfter(Version.LUCENE_31) ?
input : new LowerCaseFilter(matchVersion, input));
+ if (!DBBI_AVAILABLE)
+ throw new UnsupportedOperationException("This JRE does not have support for Thai segmentation");
handlePosIncr = matchVersion.onOrAfter(Version.LUCENE_31);
}
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
index 6ade715833b..9674a54d57d 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
@@ -18,6 +18,7 @@ package org.apache.lucene.analysis.th;
*/
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.junit.Assume;
/**
* Test case for ThaiAnalyzer, modified from TestFrenchAnalyzer
@@ -31,6 +32,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
* testcase for offsets
*/
public void testOffsets() throws Exception {
+ Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE);
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี",
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
new int[] { 0, 3, 6, 9, 13, 17, 20, 23 },
@@ -49,6 +51,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
* Instead, allow the definition of alphanum to include relevant categories like nonspacing marks!
*/
public void testBuggyTokenType() throws Exception {
+ Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE);
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },
new String[] { "