mirror of https://github.com/apache/lucene.git
LUCENE-5601: ThaiTokenizer ignores sentenceStart
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1586614 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b72cd4b7d9
commit
d598f8b599
|
@@ -99,7 +99,7 @@ public class ThaiTokenizer extends SegmentingTokenizerBase {
     }
 
     clearAttributes();
-    termAtt.copyBuffer(buffer, start, end - start);
+    termAtt.copyBuffer(buffer, sentenceStart + start, end - start);
     offsetAtt.setOffset(correctOffset(offset + sentenceStart + start), correctOffset(offset + sentenceStart + end));
     return true;
   }
@@ -117,4 +117,11 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
     ts.addAttribute(FlagsAttribute.class);
     assertTokenStreamContents(ts, new String[] { "ภาษา", "ไทย" });
   }
+
+  public void testTwoSentences() throws Exception {
+    assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET), "This is a test. การที่ได้ต้องแสดงว่างานดี",
+        new String[] { "this", "is", "a", "test", "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
+        new int[] { 0, 5, 8, 10, 16, 19, 22, 25, 29, 33, 36, 39 },
+        new int[] { 4, 7, 9, 14, 19, 22, 25, 29, 33, 36, 39, 41 });
+  }
 }
Loading…
Reference in New Issue