Applied patch for LUCENE-324, correcting token offsets returned by ChineseTokenizer
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@353930 13f79535-47bb-0310-9956-ffa450edef68
parent ebe44ace90
commit 7a3103fac0

 CHANGES.txt | 11 lines changed
CHANGES.txt

@@ -187,7 +187,7 @@ New features
     It's very useful for searching across multiple fields.
     (Chuck Williams via Yonik Seeley, LUCENE-323)
 
-28. New class ISOLatin1AccentFilter that replaces accented characters in the ISO
+28. New class ISOLatin1AccentFilter that replaces accented characters in the ISO
     Latin 1 character set by their unaccented equivalent.
     (Sven Duzont via Erik Hatcher)
 
@@ -195,7 +195,7 @@ New features
     This is useful for data like zip codes, ids, and some product names.
     (Erik Hatcher)
 
-30. Copied LengthFilter from contrib area to core. Removes words that are too
+30. Copied LengthFilter from contrib area to core. Removes words that are too
     long and too short from the stream.
     (David Spencer via Otis and Daniel)
 
@@ -306,8 +306,11 @@ Bug fixes
     (Yonik Seeley, LUCENE-462)
 
 18. Fixed inefficient memory usage when loading an index into RAMDirectory.
-    (Volodymyr Bychkoviak via Bernhard, LUCENE-475)
+    (Volodymyr Bychkoviak via Bernhard, LUCENE-475)
+
+19. Corrected term offsets returned by ChineseTokenizer.
+    (Ray Tsang via Erik Hatcher, LUCENE-324)
 
 Optimizations
 
  1. Disk usage (peak requirements during indexing and optimization)
ChineseTokenizer.java

@@ -117,6 +117,7 @@ public final class ChineseTokenizer extends Tokenizer {
             case Character.OTHER_LETTER:
                 if (length>0) {
                     bufferIndex--;
+                    offset--;
                     return flush();
                 }
                 push(c);
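The one-line fix above lives on the push-back path: when the tokenizer hits an ideograph (Character.OTHER_LETTER) while a basic-Latin token is still buffered, it un-reads that character and flushes the pending token first. The real tokenizer keeps two cursors, bufferIndex into its read buffer and offset into the stream, and the diff shows that only bufferIndex was being stepped back. What follows is a minimal, self-contained sketch of that bookkeeping, not the actual ChineseTokenizer source; it reads from a String, so one cursor stands in for both of the real cursors, and all names beyond offset, length, push, and flush are illustrative.

/**
 * Minimal sketch of the push-back bookkeeping behind LUCENE-324.
 * When a character is "un-read", every cursor that advanced past it
 * must be stepped back, or the start offsets of later tokens drift.
 */
public class OffsetPushbackSketch {

    private final String input;
    private int offset = 0;   // absolute position in the input
    private int start = 0;    // start offset of the token being buffered
    private int length = 0;   // number of chars buffered so far
    private final StringBuilder buffer = new StringBuilder();

    public OffsetPushbackSketch(String input) {
        this.input = input;
    }

    /** Returns "term[start,end]", or null when the input is exhausted. */
    public String next() {
        buffer.setLength(0);
        length = 0;
        while (offset < input.length()) {
            char c = input.charAt(offset);
            offset++;
            switch (Character.getType(c)) {
                case Character.OTHER_LETTER:     // CJK ideographs and the like
                    if (length > 0) {
                        offset--;                // un-read c for the next call;
                        return flush();          // this decrement is the fix
                    }
                    push(c);
                    return flush();              // one ideograph per token
                case Character.LOWERCASE_LETTER:
                case Character.UPPERCASE_LETTER:
                case Character.DECIMAL_DIGIT_NUMBER:
                    push(c);                     // basic-Latin runs group up
                    break;
                default:                         // separator ends a pending token
                    if (length > 0) return flush();
            }
        }
        return length > 0 ? flush() : null;
    }

    private void push(char c) {
        if (length == 0) start = offset - 1;     // token starts at this char
        buffer.append(c);
        length++;
    }

    private String flush() {
        return buffer + "[" + start + "," + (start + length) + "]";
    }

    public static void main(String[] args) {
        OffsetPushbackSketch t = new OffsetPushbackSketch("a天b");
        for (String tok; (tok = t.next()) != null; ) {
            System.out.println(tok);             // a[0,1]  天[1,2]  b[2,3]
        }
    }
}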
TestChineseTokenizer.java (new file)

@@ -0,0 +1,31 @@
+package org.apache.lucene.analysis.cn;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Token;
+
+/**
+ * @author rayt
+ */
+public class TestChineseTokenizer extends TestCase
+{
+    public void testOtherLetterOffset() throws IOException
+    {
+        String s = "a天b";
+        ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(s));
+        Token token;
+
+        int correctStartOffset = 0;
+        int correctEndOffset = 1;
+        while ((token = tokenizer.next()) != null)
+        {
+            assertEquals(correctStartOffset, token.startOffset());
+            assertEquals(correctEndOffset, token.endOffset());
+            correctStartOffset++;
+            correctEndOffset++;
+        }
+    }
+}
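Read as a regression check: with the fix, each character of "a天b" comes back as its own token at offsets (0,1), (1,2), (2,3), which is exactly what the incrementing correctStartOffset/correctEndOffset pair asserts. Without the added offset--, the push-back on reaching 天 would leave offset one position too far ahead, so the offsets reported after the script change would drift and the later assertions would catch it.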