From 1ffe616bbaf3359651b9e829e121a0ba3e0d47f9 Mon Sep 17 00:00:00 2001 From: Otis Gospodnetic Date: Tue, 23 Dec 2003 18:57:27 +0000 Subject: [PATCH] - Initial commit (http://nagoya.apache.org/bugzilla/show_bug.cgi?id=23786) git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150898 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/analysis/cn/ChineseAnalyzer.java | 34 +++++++ .../lucene/analysis/cn/ChineseFilter.java | 77 +++++++++++++++ .../lucene/analysis/cn/ChineseTokenizer.java | 98 +++++++++++++++++++ 3 files changed, 209 insertions(+) create mode 100644 sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java create mode 100644 sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java create mode 100644 sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java diff --git a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java new file mode 100644 index 00000000000..e3786de1ac5 --- /dev/null +++ b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java @@ -0,0 +1,34 @@ +package org.apache.lucene.analysis.cn; + +import java.io.Reader; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; + +/** + * Title: ChineseAnalyzer + * Description: + * Subclass of org.apache.lucene.analysis.Analyzer + * build from a ChineseTokenizer, filtered with ChineseFilter. + * Copyright: Copyright (c) 2001 + * Company: + * @author Yiyi Sun + * @version 1.0 + * + */ + +public class ChineseAnalyzer extends Analyzer { + + public ChineseAnalyzer() { + } + + /** + * Creates a TokenStream which tokenizes all the text in the provided Reader. + * + * @return A TokenStream build from a ChineseTokenizer filtered with ChineseFilter. + */ + public final TokenStream tokenStream(String fieldName, Reader reader) { + TokenStream result = new ChineseTokenizer(reader); + result = new ChineseFilter(result); + return result; + } +} \ No newline at end of file diff --git a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java new file mode 100644 index 00000000000..db9a7a2ca4c --- /dev/null +++ b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java @@ -0,0 +1,77 @@ +package org.apache.lucene.analysis.cn; + +import java.util.Hashtable; +import org.apache.lucene.analysis.*; + +/** + * Title: ChineseFilter + * Description: Filter with a stop word table + * Rule: No digital is allowed. + * English word/token should larger than 1 character. + * One Chinese character as one Chinese word. + * TO DO: + * 1. Add Chinese stop words, such as \ue400 + * 2. Dictionary based Chinese word extraction + * 3. Intelligent Chinese word extraction + * + * Copyright: Copyright (c) 2001 + * Company: + * @author Yiyi Sun + * @version 1.0 + * + */ + +public final class ChineseFilter extends TokenFilter { + + + // Only English now, Chinese to be added later. + public static final String[] STOP_WORDS = { + "and", "are", "as", "at", "be", "but", "by", + "for", "if", "in", "into", "is", "it", + "no", "not", "of", "on", "or", "such", + "that", "the", "their", "then", "there", "these", + "they", "this", "to", "was", "will", "with" + }; + + + private Hashtable stopTable; + + public ChineseFilter(TokenStream in) { + input = in; + + stopTable = new Hashtable(STOP_WORDS.length); + for (int i = 0; i < STOP_WORDS.length; i++) + stopTable.put(STOP_WORDS[i], STOP_WORDS[i]); + } + + public final Token next() throws java.io.IOException { + + for (Token token = input.next(); token != null; token = input.next()) { + String text = token.termText(); + + if (stopTable.get(text) == null) { + switch (Character.getType(text.charAt(0))) { + + case Character.LOWERCASE_LETTER: + case Character.UPPERCASE_LETTER: + + // English word/token should larger than 1 character. + if (text.length()>1) { + return token; + } + break; + case Character.OTHER_LETTER: + + // One Chinese character as one Chinese word. + // Chinese word extraction to be added later here. + + return token; + } + + } + + } + return null; + } + +} \ No newline at end of file diff --git a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java new file mode 100644 index 00000000000..6546862e32a --- /dev/null +++ b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java @@ -0,0 +1,98 @@ +package org.apache.lucene.analysis.cn; + +import java.io.Reader; +import org.apache.lucene.analysis.*; + + +/** + * Title: ChineseTokenizer + * Description: Extract tokens from the Stream using Character.getType() + * Rule: A Chinese character as a single token + * Copyright: Copyright (c) 2001 + * Company: + * @author Yiyi Sun + * @version 1.0 + * + */ + +public final class ChineseTokenizer extends Tokenizer { + + + public ChineseTokenizer(Reader in) { + input = in; + } + + private int offset = 0, bufferIndex=0, dataLen=0; + private final static int MAX_WORD_LEN = 255; + private final static int IO_BUFFER_SIZE = 1024; + private final char[] buffer = new char[MAX_WORD_LEN]; + private final char[] ioBuffer = new char[IO_BUFFER_SIZE]; + + + private int length; + private int start; + + + private final void push(char c) { + + if (length == 0) start = offset-1; // start of token + buffer[length++] = Character.toLowerCase(c); // buffer it + + } + + private final Token flush() { + + if (length>0) { + //System.out.println(new String(buffer, 0, length)); + return new Token(new String(buffer, 0, length), start, start+length); + } + else + return null; + } + + public final Token next() throws java.io.IOException { + + length = 0; + start = offset; + + + while (true) { + + final char c; + offset++; + + if (bufferIndex >= dataLen) { + dataLen = input.read(ioBuffer); + bufferIndex = 0; + }; + + if (dataLen == -1) return flush(); + else + c = (char) ioBuffer[bufferIndex++]; + + + switch(Character.getType(c)) { + + case Character.DECIMAL_DIGIT_NUMBER: + case Character.LOWERCASE_LETTER: + case Character.UPPERCASE_LETTER: + push(c); + if (length == MAX_WORD_LEN) return flush(); + break; + + case Character.OTHER_LETTER: + if (length>0) { + bufferIndex--; + return flush(); + } + push(c); + return flush(); + + default: + if (length>0) return flush(); + break; + } + } + + } +} \ No newline at end of file