Applied patch for LUCENE-324, correcting token offsets returned by ChineseTokenizer

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@353930 13f79535-47bb-0310-9956-ffa450edef68
Erik Hatcher 2005-12-04 23:07:42 +00:00
parent ebe44ace90
commit 7a3103fac0
3 changed files with 39 additions and 4 deletions

CHANGES.txt

@@ -308,6 +308,9 @@ Bug fixes
 18. Fixed inefficient memory usage when loading an index into RAMDirectory.
     (Volodymyr Bychkoviak via Bernhard, LUCENE-475)

+19. Corrected term offsets returned by ChineseTokenizer.
+    (Ray Tsang via Erik Hatcher, LUCENE-324)
+
 Optimizations

  1. Disk usage (peak requirements during indexing and optimization)

ChineseTokenizer.java

@@ -117,6 +117,7 @@ public final class ChineseTokenizer extends Tokenizer {
             case Character.OTHER_LETTER:
                 if (length>0) {
                     bufferIndex--;
+                    offset--;
                     return flush();
                 }
                 push(c);
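
The one-line fix pairs the existing bufferIndex-- pushback with an offset--: when a CJK character (Character.OTHER_LETTER) arrives while an ASCII token is still buffered, the character is pushed back so the next call to next() re-reads it, and the count of consumed characters must be rewound to match. Without the rewind, the pushed-back character is counted twice and every subsequent token's offsets drift forward by one. A minimal driver showing the effect (the OffsetDemo class is hypothetical, not part of this commit):

import java.io.StringReader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.cn.ChineseTokenizer;

public class OffsetDemo {
    public static void main(String[] args) throws Exception {
        String s = "a天b";
        ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(s));
        Token token;
        while ((token = tokenizer.next()) != null) {
            // With the fix this prints a [0,1)  天 [1,2)  b [2,3);
            // before it, the second and third tokens came back
            // shifted forward by one, as [2,3) and [3,4).
            System.out.println(token.termText()
                    + " [" + token.startOffset() + "," + token.endOffset() + ")");
        }
    }
}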

TestChineseTokenizer.java

@@ -0,0 +1,31 @@
+package org.apache.lucene.analysis.cn;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Token;
+
+/**
+ * @author rayt
+ */
+public class TestChineseTokenizer extends TestCase
+{
+    public void testOtherLetterOffset() throws IOException
+    {
+        String s = "a天b";
+        ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(s));
+        Token token;
+
+        int correctStartOffset = 0;
+        int correctEndOffset = 1;
+        while ((token = tokenizer.next()) != null)
+        {
+            assertEquals(correctStartOffset, token.startOffset());
+            assertEquals(correctEndOffset, token.endOffset());
+            correctStartOffset++;
+            correctEndOffset++;
+        }
+    }
+}
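
The test walks the three single-character tokens of "a天b" and checks that each start/end offset advances by exactly one. It uses the JUnit 3 TestCase style found throughout the Lucene tree at the time, so it can also be run standalone with the stock text runner; a quick sketch (the RunChineseTokenizerTest wrapper is hypothetical, not part of the commit):

import junit.textui.TestRunner;

import org.apache.lucene.analysis.cn.TestChineseTokenizer;

public class RunChineseTokenizerTest {
    public static void main(String[] args) {
        // JUnit 3 text runner: reports "OK (1 test)" with the offset
        // fix in place, and an AssertionFailedError without it.
        TestRunner.run(TestChineseTokenizer.class);
    }
}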