From d598f8b599093fd0fff2c17a515a267c4cbb48fa Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Fri, 11 Apr 2014 11:12:38 +0000 Subject: [PATCH] LUCENE-5601: ThaiTokenizer ignores sentenceStart git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1586614 13f79535-47bb-0310-9956-ffa450edef68 --- .../java/org/apache/lucene/analysis/th/ThaiTokenizer.java | 2 +- .../org/apache/lucene/analysis/th/TestThaiAnalyzer.java | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizer.java index e61ec06e228..e2b40842657 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizer.java @@ -99,7 +99,7 @@ public class ThaiTokenizer extends SegmentingTokenizerBase { } clearAttributes(); - termAtt.copyBuffer(buffer, start, end - start); + termAtt.copyBuffer(buffer, sentenceStart + start, end - start); offsetAtt.setOffset(correctOffset(offset + sentenceStart + start), correctOffset(offset + sentenceStart + end)); return true; } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java index ef5e2ad7aa7..a7273ca4385 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java @@ -117,4 +117,11 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase { ts.addAttribute(FlagsAttribute.class); assertTokenStreamContents(ts, new String[] { "ภาษา", "ไทย" }); } + + public void testTwoSentences() throws Exception { + assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET), "This is a test. การที่ได้ต้องแสดงว่างานดี", + new String[] { "this", "is", "a", "test", "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" }, + new int[] { 0, 5, 8, 10, 16, 19, 22, 25, 29, 33, 36, 39 }, + new int[] { 4, 7, 9, 14, 19, 22, 25, 29, 33, 36, 39, 41 }); + } }