mirror of
https://github.com/apache/lucene.git
synced 2025-02-20 17:07:09 +00:00
LUCENE-1943: Improve performance of ChineseFilter
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@821322 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
891570478d
commit
1f9088b038
@ -18,9 +18,9 @@ package org.apache.lucene.analysis.cn;
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashMap;
|
import java.util.Arrays;
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
@ -56,33 +56,32 @@ public final class ChineseFilter extends TokenFilter {
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
private Map stopTable;
|
private CharArraySet stopTable;
|
||||||
|
|
||||||
private TermAttribute termAtt;
|
private TermAttribute termAtt;
|
||||||
|
|
||||||
public ChineseFilter(TokenStream in) {
|
public ChineseFilter(TokenStream in) {
|
||||||
super(in);
|
super(in);
|
||||||
|
|
||||||
stopTable = new HashMap(STOP_WORDS.length);
|
stopTable = new CharArraySet(Arrays.asList(STOP_WORDS), false);
|
||||||
for (int i = 0; i < STOP_WORDS.length; i++)
|
|
||||||
stopTable.put(STOP_WORDS[i], STOP_WORDS[i]);
|
|
||||||
termAtt = addAttribute(TermAttribute.class);
|
termAtt = addAttribute(TermAttribute.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean incrementToken() throws IOException {
|
public boolean incrementToken() throws IOException {
|
||||||
|
|
||||||
while (input.incrementToken()) {
|
while (input.incrementToken()) {
|
||||||
String text = termAtt.term();
|
char text[] = termAtt.termBuffer();
|
||||||
|
int termLength = termAtt.termLength();
|
||||||
|
|
||||||
// why not key off token type here assuming ChineseTokenizer comes first?
|
// why not key off token type here assuming ChineseTokenizer comes first?
|
||||||
if (stopTable.get(text) == null) {
|
if (!stopTable.contains(text, 0, termLength)) {
|
||||||
switch (Character.getType(text.charAt(0))) {
|
switch (Character.getType(text[0])) {
|
||||||
|
|
||||||
case Character.LOWERCASE_LETTER:
|
case Character.LOWERCASE_LETTER:
|
||||||
case Character.UPPERCASE_LETTER:
|
case Character.UPPERCASE_LETTER:
|
||||||
|
|
||||||
// English word/token should be larger than 1 character.
|
// English word/token should be larger than 1 character.
|
||||||
if (text.length()>1) {
|
if (termLength>1) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user