diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index a2314f1065b..182e0df6762 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -122,6 +122,10 @@ Bug fixes you to customize its normalization/folding, by editing the source data files in src/data and regenerating a new .nrm with 'ant gennorm2'. (David Bowen via Robert Muir) +* LUCENE-2653: ThaiWordFilter depends on the JRE having a Thai dictionary, which is not + always the case. If the dictionary is unavailable, the filter will now throw + UnsupportedOperationException in the constructor. (Robert Muir) + API Changes diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java index 9751c1ac147..2f6caf95cd9 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java @@ -38,10 +38,24 @@ import org.apache.lucene.util.Version; * {@link ThaiAnalyzer} will insert a {@link LowerCaseFilter} before this filter * so the behaviour of the Analyzer does not change. With version 3.1, the filter handles * position increments correctly. + *

<p>WARNING: this filter may not be supported by all JREs. + * It is known to work with Sun/Oracle and Harmony JREs. + * If your application needs to be fully portable, consider using ICUTokenizer instead, + * which uses an ICU Thai BreakIterator that will always be available. */ public final class ThaiWordFilter extends TokenFilter { - - private final BreakIterator breaker = BreakIterator.getWordInstance(new Locale("th")); + /** + * True if the JRE supports a working dictionary-based breakiterator for Thai. + * If this is false, this filter will not work at all! + */ + public static final boolean DBBI_AVAILABLE; + private static final BreakIterator proto = BreakIterator.getWordInstance(new Locale("th")); + static { + // check that we have a working dictionary-based break iterator for thai + proto.setText("ภาษาไทย"); + DBBI_AVAILABLE = proto.isBoundary(4); + } + private final BreakIterator breaker = (BreakIterator) proto.clone(); private final Segment charIterator = new Segment(); private final boolean handlePosIncr; @@ -67,6 +81,8 @@ public final class ThaiWordFilter extends TokenFilter { public ThaiWordFilter(Version matchVersion, TokenStream input) { super(matchVersion.onOrAfter(Version.LUCENE_31) ? 
input : new LowerCaseFilter(matchVersion, input)); + if (!DBBI_AVAILABLE) + throw new UnsupportedOperationException("This JRE does not have support for Thai segmentation"); handlePosIncr = matchVersion.onOrAfter(Version.LUCENE_31); } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java index 6ade715833b..9674a54d57d 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java @@ -18,6 +18,7 @@ package org.apache.lucene.analysis.th; */ import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.junit.Assume; /** * Test case for ThaiAnalyzer, modified from TestFrenchAnalyzer @@ -31,6 +32,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase { * testcase for offsets */ public void testOffsets() throws Exception { + Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE); assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี", new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" }, new int[] { 0, 3, 6, 9, 13, 17, 20, 23 }, @@ -49,6 +51,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase { * Instead, allow the definition of alphanum to include relevant categories like nonspacing marks! 
*/ public void testBuggyTokenType() throws Exception { + Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE); assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี ๑๒๓", new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" }, new String[] { "", "", "", "", "", @@ -65,6 +68,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase { */ public void testAnalyzer() throws Exception { + Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE); ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT); assertAnalyzesTo(analyzer, "", new String[] {}); @@ -90,6 +94,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase { * Test that position increments are adjusted correctly for stopwords. */ public void testPositionIncrements() throws Exception { + Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE); ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT); assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้อง the แสดงว่างานดี", @@ -107,6 +112,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase { } public void testReusableTokenStream() throws Exception { + Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE); ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT); assertAnalyzesToReuse(analyzer, "", new String[] {}); diff --git a/solr/src/test/org/apache/solr/analysis/TestThaiWordFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestThaiWordFilterFactory.java index 3ae0e21bc03..7fb7ff7068c 100644 --- a/solr/src/test/org/apache/solr/analysis/TestThaiWordFilterFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestThaiWordFilterFactory.java @@ -23,6 +23,8 @@ import java.io.StringReader; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.th.ThaiWordFilter; +import org.junit.Assume; /** * Simple tests to ensure the 
Thai word filter factory is working. @@ -32,6 +34,7 @@ public class TestThaiWordFilterFactory extends BaseTokenTestCase { * Ensure the filter actually decomposes text. */ public void testWordBreak() throws Exception { + Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE); Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี"); Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); ThaiWordFilterFactory factory = new ThaiWordFilterFactory();