LUCENE-1943: Improve performance of ChineseFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@821322 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2009-10-03 13:54:12 +00:00
parent 891570478d
commit 1f9088b038

View File

@ -18,9 +18,9 @@ package org.apache.lucene.analysis.cn;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.HashMap; import java.util.Arrays;
import java.util.Map;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@ -56,33 +56,32 @@ public final class ChineseFilter extends TokenFilter {
}; };
private Map stopTable; private CharArraySet stopTable;
private TermAttribute termAtt; private TermAttribute termAtt;
public ChineseFilter(TokenStream in) { public ChineseFilter(TokenStream in) {
super(in); super(in);
stopTable = new HashMap(STOP_WORDS.length); stopTable = new CharArraySet(Arrays.asList(STOP_WORDS), false);
for (int i = 0; i < STOP_WORDS.length; i++)
stopTable.put(STOP_WORDS[i], STOP_WORDS[i]);
termAtt = addAttribute(TermAttribute.class); termAtt = addAttribute(TermAttribute.class);
} }
public boolean incrementToken() throws IOException { public boolean incrementToken() throws IOException {
while (input.incrementToken()) { while (input.incrementToken()) {
String text = termAtt.term(); char text[] = termAtt.termBuffer();
int termLength = termAtt.termLength();
// why not key off token type here assuming ChineseTokenizer comes first? // why not key off token type here assuming ChineseTokenizer comes first?
if (stopTable.get(text) == null) { if (!stopTable.contains(text, 0, termLength)) {
switch (Character.getType(text.charAt(0))) { switch (Character.getType(text[0])) {
case Character.LOWERCASE_LETTER: case Character.LOWERCASE_LETTER:
case Character.UPPERCASE_LETTER: case Character.UPPERCASE_LETTER:
// English word/token should be larger than 1 character. // English word/token should be larger than 1 character.
if (text.length()>1) { if (termLength>1) {
return true; return true;
} }
break; break;