From 1f9088b038637f0188fb3c527d4acc940bbd7b40 Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Sat, 3 Oct 2009 13:54:12 +0000
Subject: [PATCH] LUCENE-1943: Improve performance of ChineseFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@821322 13f79535-47bb-0310-9956-ffa450edef68
---
 .../lucene/analysis/cn/ChineseFilter.java | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java
index 6666eaaf93b..97b2a9a2270 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java
@@ -18,9 +18,9 @@ package org.apache.lucene.analysis.cn;
  */
 
 import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
+import java.util.Arrays;
 
+import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@@ -56,33 +56,32 @@ public final class ChineseFilter extends TokenFilter {
     };
 
-    private Map stopTable;
+    private CharArraySet stopTable;
     private TermAttribute termAtt;
 
     public ChineseFilter(TokenStream in) {
         super(in);
 
-        stopTable = new HashMap(STOP_WORDS.length);
-        for (int i = 0; i < STOP_WORDS.length; i++)
-            stopTable.put(STOP_WORDS[i], STOP_WORDS[i]);
+        stopTable = new CharArraySet(Arrays.asList(STOP_WORDS), false);
         termAtt = addAttribute(TermAttribute.class);
     }
 
     public boolean incrementToken() throws IOException {
         while (input.incrementToken()) {
-            String text = termAtt.term();
+            char text[] = termAtt.termBuffer();
+            int termLength = termAtt.termLength();
 
             // why not key off token type here assuming ChineseTokenizer comes first?
-            if (stopTable.get(text) == null) {
-                switch (Character.getType(text.charAt(0))) {
+            if (!stopTable.contains(text, 0, termLength)) {
+                switch (Character.getType(text[0])) {
 
                 case Character.LOWERCASE_LETTER:
                 case Character.UPPERCASE_LETTER:
 
                     // English word/token should larger than 1 character.
-                    if (text.length()>1) {
+                    if (termLength>1) {
                         return true;
                     }
                     break;
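
For context, a small standalone sketch (not part of the patch) contrasting the two stopword-lookup styles the diff swaps between. The STOP_WORDS subset and the hand-filled term buffer below are placeholders standing in for ChineseFilter's own STOP_WORDS array and TermAttribute's termBuffer()/termLength(); it assumes the Lucene 2.9 contrib CharArraySet used above is on the classpath.

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.CharArraySet;

public class StopLookupSketch {

    // Placeholder stopwords for illustration only; ChineseFilter defines its own STOP_WORDS array.
    private static final String[] STOP_WORDS = { "and", "are", "as" };

    public static void main(String[] args) {
        // Stand-ins for TermAttribute.termBuffer()/termLength(): a reusable char[] plus a valid length.
        char[] termBuffer = { 'a', 'r', 'e', '\0', '\0' };
        int termLength = 3;

        // Before the patch: a HashMap keyed by String. Every lookup needs a String key,
        // which is why the filter had to call termAtt.term() and copy the buffer into a
        // fresh String for each token.
        Map<String, String> oldTable = new HashMap<String, String>(STOP_WORDS.length);
        for (int i = 0; i < STOP_WORDS.length; i++)
            oldTable.put(STOP_WORDS[i], STOP_WORDS[i]);
        boolean oldHit = oldTable.get(new String(termBuffer, 0, termLength)) != null;

        // After the patch: CharArraySet hashes the char[] region directly,
        // so the stopword check allocates no String per token.
        CharArraySet newTable = new CharArraySet(Arrays.asList(STOP_WORDS), false);
        boolean newHit = newTable.contains(termBuffer, 0, termLength);

        System.out.println(oldHit + " " + newHit); // prints: true true
    }
}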