mirror of https://github.com/apache/lucene.git
Applied patch for LUCENE-324, correcting token offsets returned by ChineseTokenizer.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@353930 13f79535-47bb-0310-9956-ffa450edef68
parent ebe44ace90
commit 7a3103fac0
CHANGES.txt
@@ -308,6 +308,9 @@ Bug fixes
 18. Fixed inefficient memory usage when loading an index into RAMDirectory.
     (Volodymyr Bychkoviak via Bernhard, LUCENE-475)
 
+19. Corrected term offsets returned by ChineseTokenizer.
+    (Ray Tsang via Erik Hatcher, LUCENE-324)
+
 Optimizations
 
  1. Disk usage (peak requirements during indexing and optimization)
ChineseTokenizer.java
@@ -117,6 +117,7 @@ public final class ChineseTokenizer extends Tokenizer {
            case Character.OTHER_LETTER:
                if (length>0) {
                    bufferIndex--;
+                   offset--;
                    return flush();
                }
                push(c);
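For context: when ChineseTokenizer meets an OTHER_LETTER character while a basic-Latin token is still buffered, it rewinds bufferIndex so the character is re-read on the next call, but before this patch it left the offset counter one position ahead, so every token emitted after such a push-back reported start/end offsets shifted right by one. Below is a minimal sketch of the corrected behaviour, assuming the pre-2.9 TokenStream API (next() returning a Token, termText() for the token text); the class name OffsetDemo is illustrative, not part of the commit:

    import java.io.StringReader;

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.cn.ChineseTokenizer;

    public class OffsetDemo {
        public static void main(String[] args) throws Exception {
            // Mixed ASCII/CJK input forces a push-back at the 'a'/'天' boundary.
            ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader("a天b"));
            Token token;
            while ((token = tokenizer.next()) != null) {
                System.out.println(token.termText()
                        + " [" + token.startOffset() + "," + token.endOffset() + ")");
            }
            // With the fix, offsets track the source string:
            //   a [0,1)  天 [1,2)  b [2,3)
            // Without offset--, tokens after the push-back drift right by one.
        }
    }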
TestChineseTokenizer.java (new file)
@@ -0,0 +1,31 @@
+package org.apache.lucene.analysis.cn;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Token;
+
+/**
+ * @author rayt
+ */
+public class TestChineseTokenizer extends TestCase
+{
+    public void testOtherLetterOffset() throws IOException
+    {
+        String s = "a天b";
+        ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(s));
+        Token token;
+
+        int correctStartOffset = 0;
+        int correctEndOffset = 1;
+        while ((token = tokenizer.next()) != null)
+        {
+            assertEquals(correctStartOffset, token.startOffset());
+            assertEquals(correctEndOffset, token.endOffset());
+            correctStartOffset++;
+            correctEndOffset++;
+        }
+    }
+}