mirror of https://github.com/apache/lucene.git
LUCENE-2653: ThaiAnalyzer assumes things about your jre
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@998684 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
68776ee5d6
commit
c84bd2f1ec
|
@ -122,6 +122,10 @@ Bug fixes
|
||||||
you to customize its normalization/folding, by editing the source data files in src/data
|
you to customize its normalization/folding, by editing the source data files in src/data
|
||||||
and regenerating a new .nrm with 'ant gennorm2'. (David Bowen via Robert Muir)
|
and regenerating a new .nrm with 'ant gennorm2'. (David Bowen via Robert Muir)
|
||||||
|
|
||||||
|
* LUCENE-2653: ThaiWordFilter depends on the JRE having a Thai dictionary, which is not
|
||||||
|
always the case. If the dictionary is unavailable, the filter will now throw
|
||||||
|
UnsupportedOperationException in the constructor. (Robert Muir)
|
||||||
|
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
|
||||||
|
|
|
@ -38,10 +38,24 @@ import org.apache.lucene.util.Version;
|
||||||
* {@link ThaiAnalyzer} will insert a {@link LowerCaseFilter} before this filter
|
* {@link ThaiAnalyzer} will insert a {@link LowerCaseFilter} before this filter
|
||||||
* so the behaviour of the Analyzer does not change. With version 3.1, the filter handles
|
* so the behaviour of the Analyzer does not change. With version 3.1, the filter handles
|
||||||
* position increments correctly.
|
* position increments correctly.
|
||||||
|
* <p>WARNING: this filter may not be supported by all JREs.
|
||||||
|
* It is known to work with Sun/Oracle and Harmony JREs.
|
||||||
|
* If your application needs to be fully portable, consider using ICUTokenizer instead,
|
||||||
|
* which uses an ICU Thai BreakIterator that will always be available.
|
||||||
*/
|
*/
|
||||||
public final class ThaiWordFilter extends TokenFilter {
|
public final class ThaiWordFilter extends TokenFilter {
|
||||||
|
/**
|
||||||
private final BreakIterator breaker = BreakIterator.getWordInstance(new Locale("th"));
|
* True if the JRE supports a working dictionary-based breakiterator for Thai.
|
||||||
|
* If this is false, this filter will not work at all!
|
||||||
|
*/
|
||||||
|
public static final boolean DBBI_AVAILABLE;
|
||||||
|
private static final BreakIterator proto = BreakIterator.getWordInstance(new Locale("th"));
|
||||||
|
static {
|
||||||
|
// check that we have a working dictionary-based break iterator for thai
|
||||||
|
proto.setText("ภาษาไทย");
|
||||||
|
DBBI_AVAILABLE = proto.isBoundary(4);
|
||||||
|
}
|
||||||
|
private final BreakIterator breaker = (BreakIterator) proto.clone();
|
||||||
private final Segment charIterator = new Segment();
|
private final Segment charIterator = new Segment();
|
||||||
|
|
||||||
private final boolean handlePosIncr;
|
private final boolean handlePosIncr;
|
||||||
|
@ -67,6 +81,8 @@ public final class ThaiWordFilter extends TokenFilter {
|
||||||
public ThaiWordFilter(Version matchVersion, TokenStream input) {
|
public ThaiWordFilter(Version matchVersion, TokenStream input) {
|
||||||
super(matchVersion.onOrAfter(Version.LUCENE_31) ?
|
super(matchVersion.onOrAfter(Version.LUCENE_31) ?
|
||||||
input : new LowerCaseFilter(matchVersion, input));
|
input : new LowerCaseFilter(matchVersion, input));
|
||||||
|
if (!DBBI_AVAILABLE)
|
||||||
|
throw new UnsupportedOperationException("This JRE does not have support for Thai segmentation");
|
||||||
handlePosIncr = matchVersion.onOrAfter(Version.LUCENE_31);
|
handlePosIncr = matchVersion.onOrAfter(Version.LUCENE_31);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.analysis.th;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.junit.Assume;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test case for ThaiAnalyzer, modified from TestFrenchAnalyzer
|
* Test case for ThaiAnalyzer, modified from TestFrenchAnalyzer
|
||||||
|
@ -31,6 +32,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
|
||||||
* testcase for offsets
|
* testcase for offsets
|
||||||
*/
|
*/
|
||||||
public void testOffsets() throws Exception {
|
public void testOffsets() throws Exception {
|
||||||
|
Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE);
|
||||||
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี",
|
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี",
|
||||||
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
|
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
|
||||||
new int[] { 0, 3, 6, 9, 13, 17, 20, 23 },
|
new int[] { 0, 3, 6, 9, 13, 17, 20, 23 },
|
||||||
|
@ -49,6 +51,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
|
||||||
* Instead, allow the definition of alphanum to include relevant categories like nonspacing marks!
|
* Instead, allow the definition of alphanum to include relevant categories like nonspacing marks!
|
||||||
*/
|
*/
|
||||||
public void testBuggyTokenType() throws Exception {
|
public void testBuggyTokenType() throws Exception {
|
||||||
|
Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE);
|
||||||
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
|
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
|
||||||
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },
|
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },
|
||||||
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
|
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
|
||||||
|
@ -65,6 +68,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public void testAnalyzer() throws Exception {
|
public void testAnalyzer() throws Exception {
|
||||||
|
Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE);
|
||||||
ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
|
ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
|
||||||
|
|
||||||
assertAnalyzesTo(analyzer, "", new String[] {});
|
assertAnalyzesTo(analyzer, "", new String[] {});
|
||||||
|
@ -90,6 +94,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
|
||||||
* Test that position increments are adjusted correctly for stopwords.
|
* Test that position increments are adjusted correctly for stopwords.
|
||||||
*/
|
*/
|
||||||
public void testPositionIncrements() throws Exception {
|
public void testPositionIncrements() throws Exception {
|
||||||
|
Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE);
|
||||||
ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
|
ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
|
||||||
|
|
||||||
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้อง the แสดงว่างานดี",
|
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้อง the แสดงว่างานดี",
|
||||||
|
@ -107,6 +112,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testReusableTokenStream() throws Exception {
|
public void testReusableTokenStream() throws Exception {
|
||||||
|
Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE);
|
||||||
ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
|
ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
|
||||||
assertAnalyzesToReuse(analyzer, "", new String[] {});
|
assertAnalyzesToReuse(analyzer, "", new String[] {});
|
||||||
|
|
||||||
|
|
|
@ -23,6 +23,8 @@ import java.io.StringReader;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.analysis.th.ThaiWordFilter;
|
||||||
|
import org.junit.Assume;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Simple tests to ensure the Thai word filter factory is working.
|
* Simple tests to ensure the Thai word filter factory is working.
|
||||||
|
@ -32,6 +34,7 @@ public class TestThaiWordFilterFactory extends BaseTokenTestCase {
|
||||||
* Ensure the filter actually decomposes text.
|
* Ensure the filter actually decomposes text.
|
||||||
*/
|
*/
|
||||||
public void testWordBreak() throws Exception {
|
public void testWordBreak() throws Exception {
|
||||||
|
Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE);
|
||||||
Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี");
|
Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี");
|
||||||
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
|
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
|
||||||
ThaiWordFilterFactory factory = new ThaiWordFilterFactory();
|
ThaiWordFilterFactory factory = new ThaiWordFilterFactory();
|
||||||
|
|
Loading…
Reference in New Issue