mirror of https://github.com/apache/lucene.git
LUCENE-1692: add tests for Thai & SmartChinese analyzers; fix wrong endOffset bug in ThaiWordFilter; use stop words by default with SmartChineseAnalyzer
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@786560 13f79535-47bb-0310-9956-ffa450edef68
parent fc243f12ce
commit 2f2cd20828
@@ -58,7 +58,7 @@ public class SmartChineseAnalyzer extends Analyzer {
   private WordSegmenter wordSegment;
 
   public SmartChineseAnalyzer() {
-    this(false);
+    this(true);
   }
 
   /**
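The one-line change above flips the no-arg constructor's default: as the new tests further down confirm, the boolean argument controls whether the bundled stop word list is loaded. A minimal usage sketch of what this means for callers (illustrative only, not part of the commit; the Analyzer import and the SmartChineseAnalyzer contrib package import are omitted):

    Analyzer withStops    = new SmartChineseAnalyzer();      // now equivalent to new SmartChineseAnalyzer(true): stop words applied
    Analyzer withoutStops = new SmartChineseAnalyzer(false); // restores the previous default: no stop words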
(Two file diffs suppressed because they are too large.)
@@ -47,7 +47,7 @@ public class ThaiWordFilter extends TokenFilter {
       if (end != BreakIterator.DONE) {
         reusableToken.reinit(thaiToken, thaiToken.termBuffer(), start, end - start);
         reusableToken.setStartOffset(thaiToken.startOffset()+start);
-        reusableToken.setEndOffset(thaiToken.endOffset()+end);
+        reusableToken.setEndOffset(thaiToken.startOffset()+end);
         return reusableToken;
       }
       thaiToken = null;
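The fix above matters because start and end are BreakIterator boundaries inside the Thai token's term text (the same values passed to reinit), so the emitted sub-token's offsets must both be computed from the original token's startOffset; adding end to endOffset() pushed every end offset past the token by its full length. A short sketch of the arithmetic, with hypothetical numbers chosen only to illustrate (the asserted values live in the new TestThaiAnalyzer.testOffsets below):

    int tokenStart = 10, tokenEnd = 27;  // hypothetical offsets of the whole Thai token
    int start = 2, end = 7;              // BreakIterator boundaries within its term text
    int fixedEnd = tokenStart + end;     // 17 -- what the patched line computes
    int buggyEnd = tokenEnd + end;       // 34 -- the old computation, beyond the token's own end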
@@ -31,7 +31,27 @@ import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 
 public class TestSmartChineseAnalyzer extends TestCase {
 
+  public void testChineseStopWordsDefault() throws Exception {
+    Analyzer ca = new SmartChineseAnalyzer(); /* will load stopwords */
+    String sentence = "我购买了道具和服装。";
+    String result[] = { "我", "购买", "了", "道具", "和", "服装" };
+    assertAnalyzesTo(ca, sentence, result);
+  }
+
+  /*
+   * Punctuation is handled in a strange way if you disable stopwords.
+   * In this example the IDEOGRAPHIC FULL STOP is converted into a comma.
+   * If you don't supply (true) to the constructor, or use a different stopword list,
+   * then punctuation is indexed.
+   */
+  public void testChineseStopWordsOff() throws Exception {
+    Analyzer ca = new SmartChineseAnalyzer(false); /* does not load stopwords */
+    String sentence = "我购买了道具和服装。";
+    String result[] = { "我", "购买", "了", "道具", "和", "服装", "," };
+    assertAnalyzesTo(ca, sentence, result);
+  }
+
   public void testChineseAnalyzer() throws IOException {
     Token nt = new Token();
     Analyzer ca = new SmartChineseAnalyzer(true);
@@ -47,6 +67,54 @@ public class TestSmartChineseAnalyzer extends TestCase {
     }
     ts.close();
   }
 
+  /*
+   * English words are lowercased and porter-stemmed.
+   */
+  public void testMixedLatinChinese() throws Exception {
+    assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买 Tests 了道具和服装",
+        new String[] { "我", "购买", "test", "了", "道具", "和", "服装" });
+  }
+
+  public void testOffsets() throws Exception {
+    assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买了道具和服装",
+        new String[] { "我", "购买", "了", "道具", "和", "服装" },
+        new int[] { 0, 1, 3, 4, 6, 7 },
+        new int[] { 1, 3, 4, 6, 7, 9 });
+  }
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[])
+      throws Exception {
+    TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+    final Token reusableToken = new Token();
+    for (int i = 0; i < output.length; i++) {
+      Token nextToken = ts.next(reusableToken);
+      assertNotNull(nextToken);
+      assertEquals(nextToken.term(), output[i]);
+      if (startOffsets != null)
+        assertEquals(nextToken.startOffset(), startOffsets[i]);
+      if (endOffsets != null)
+        assertEquals(nextToken.endOffset(), endOffsets[i]);
+      if (types != null)
+        assertEquals(nextToken.type(), types[i]);
+    }
+    assertNull(ts.next(reusableToken));
+    ts.close();
+  }
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
+    assertAnalyzesTo(a, input, output, null, null, null);
+  }
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types) throws Exception {
+    assertAnalyzesTo(a, input, output, null, null, types);
+  }
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws Exception {
+    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null);
+  }
+
   /**
    * @param args
@@ -30,8 +30,43 @@ import org.apache.lucene.analysis.TokenStream;
  */
 
 public class TestThaiAnalyzer extends TestCase {
 
-  public void assertAnalyzesTo(Analyzer a, String input, String[] output)
+  /*
+   * testcase for offsets
+   */
+  public void testOffsets() throws Exception {
+    assertAnalyzesTo(new ThaiAnalyzer(), "เดอะนิวยอร์กไทมส์",
+        new String[] { "เด", "อะนิว", "ยอ", "ร์ก", "ไทมส์" },
+        new int[] { 0, 2, 7, 9, 12 },
+        new int[] { 2, 7, 9, 12, 17 });
+  }
+
+  /*
+   * Thai numeric tokens are typed as <ALPHANUM> instead of <NUM>.
+   * This is really a problem with the interaction with StandardTokenizer, which is used by ThaiAnalyzer.
+   *
+   * The issue is this: in StandardTokenizer the entire [:Thai:] block is specified as ALPHANUM (including punctuation, digits, etc.).
+   * The fix is easy: refine this spec to exclude Thai punctuation and digits.
+   *
+   * A better fix, which would also help quite a few other languages, would be to remove the Thai hack.
+   * Instead, allow the definition of ALPHANUM to include relevant categories like nonspacing marks.
+   */
+  public void testBuggyTokenType() throws Exception {
+    assertAnalyzesTo(new ThaiAnalyzer(), "เดอะนิวยอร์กไทมส์ ๑๒๓",
+        new String[] { "เด", "อะนิว", "ยอ", "ร์ก", "ไทมส์", "๑๒๓" },
+        new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" });
+  }
+
+  /* correct testcase
+  public void testTokenType() throws Exception {
+    assertAnalyzesTo(new ThaiAnalyzer(), "เดอะนิวยอร์กไทมส์ ๑๒๓",
+        new String[] { "เด", "อะนิว", "ยอ", "ร์ก", "ไทมส์", "๑๒๓" },
+        new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<NUM>" });
+  }
+  */
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[])
     throws Exception {
 
     TokenStream ts = a.tokenStream("dummy", new StringReader(input));
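The testOffsets case added above doubles as a regression check for the ThaiWordFilter fix earlier in this commit: StandardTokenizer emits the whole Thai run as a single token starting at offset 0, and ThaiWordFilter splits it at BreakIterator boundaries. A quick check against the asserted values (the pre-fix numbers are inferred from the old computation, not asserted anywhere):

    int thaiTokenStart = 0;            // startOffset of the single underlying token
    int thaiTokenEnd = 17;             // its endOffset (length of the Thai run in code units)
    int[] ends = { 2, 7, 9, 12, 17 };  // BreakIterator 'end' boundaries
    // patched: thaiTokenStart + ends[i] ->  2,  7,  9, 12, 17  (matches the test)
    // old bug: thaiTokenEnd   + ends[i] -> 19, 24, 26, 29, 34  (past the end of the input)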
@@ -40,10 +75,28 @@ public class TestThaiAnalyzer extends TestCase {
       Token nextToken = ts.next(reusableToken);
       assertNotNull(nextToken);
       assertEquals(nextToken.term(), output[i]);
+      if (startOffsets != null)
+        assertEquals(nextToken.startOffset(), startOffsets[i]);
+      if (endOffsets != null)
+        assertEquals(nextToken.endOffset(), endOffsets[i]);
+      if (types != null)
+        assertEquals(nextToken.type(), types[i]);
     }
     assertNull(ts.next(reusableToken));
     ts.close();
   }
 
+  public void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
+    assertAnalyzesTo(a, input, output, null, null, null);
+  }
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types) throws Exception {
+    assertAnalyzesTo(a, input, output, null, null, types);
+  }
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws Exception {
+    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null);
+  }
+
   public void testAnalyzer() throws Exception {
     ThaiAnalyzer analyzer = new ThaiAnalyzer();