mirror of https://github.com/apache/lucene.git
- Initial commit (http://nagoya.apache.org/bugzilla/show_bug.cgi?id=23786)
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150898 13f79535-47bb-0310-9956-ffa450edef68
parent 950991783a
commit 1ffe616bba
ChineseAnalyzer.java
@@ -0,0 +1,34 @@
package org.apache.lucene.analysis.cn;

import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;

/**
 * Title: ChineseAnalyzer
 * Description: Subclass of org.apache.lucene.analysis.Analyzer,
 *              built from a ChineseTokenizer filtered with ChineseFilter.
 * Copyright: Copyright (c) 2001
 * @author Yiyi Sun
 * @version 1.0
 */
public class ChineseAnalyzer extends Analyzer {

    public ChineseAnalyzer() {
    }

    /**
     * Creates a TokenStream that tokenizes all the text in the provided Reader.
     *
     * @return a TokenStream built from a ChineseTokenizer filtered with ChineseFilter
     */
    public final TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream result = new ChineseTokenizer(reader);
        result = new ChineseFilter(result);
        return result;
    }
}
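For reference, a minimal usage sketch (not part of this commit; the class name ChineseAnalyzerDemo, the field name "contents", and the sample string are ours) driving the analyzer through the Lucene 1.x-era TokenStream API, where next() returns null at end of stream:

import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;

public class ChineseAnalyzerDemo {
    public static void main(String[] args) throws Exception {
        ChineseAnalyzer analyzer = new ChineseAnalyzer();
        TokenStream stream = analyzer.tokenStream("contents",
                new StringReader("Lucene是一个全文检索引擎"));
        // Expected output: "lucene" as a single lowercased token, then one
        // token per Chinese character: 是, 一, 个, 全, 文, 检, 索, 引, 擎.
        for (Token t = stream.next(); t != null; t = stream.next()) {
            System.out.println(t.termText());
        }
    }
}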
ChineseFilter.java
@@ -0,0 +1,77 @@
package org.apache.lucene.analysis.cn;

import java.util.Hashtable;
import org.apache.lucene.analysis.*;

/**
 * Title: ChineseFilter
 * Description: A TokenFilter with a stop word table.
 * Rules: No digits are allowed (a token whose first character is a digit is dropped).
 *        An English word/token must be longer than one character.
 *        One Chinese character is one Chinese word.
 * TO DO:
 *   1. Add Chinese stop words, such as \ue400
 *   2. Dictionary based Chinese word extraction
 *   3. Intelligent Chinese word extraction
 *
 * Copyright: Copyright (c) 2001
 * @author Yiyi Sun
 * @version 1.0
 */
public final class ChineseFilter extends TokenFilter {

    // English only for now; Chinese stop words to be added later.
    public static final String[] STOP_WORDS = {
        "and", "are", "as", "at", "be", "but", "by",
        "for", "if", "in", "into", "is", "it",
        "no", "not", "of", "on", "or", "such",
        "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with"
    };

    private Hashtable stopTable;

    public ChineseFilter(TokenStream in) {
        input = in;

        stopTable = new Hashtable(STOP_WORDS.length);
        for (int i = 0; i < STOP_WORDS.length; i++)
            stopTable.put(STOP_WORDS[i], STOP_WORDS[i]);
    }

    public final Token next() throws java.io.IOException {

        for (Token token = input.next(); token != null; token = input.next()) {
            String text = token.termText();

            if (stopTable.get(text) == null) {
                // Note: only the type of the first character is inspected.
                switch (Character.getType(text.charAt(0))) {

                case Character.LOWERCASE_LETTER:
                case Character.UPPERCASE_LETTER:
                    // An English word/token must be longer than one character.
                    if (text.length() > 1) {
                        return token;
                    }
                    break;

                case Character.OTHER_LETTER:
                    // One Chinese character is one Chinese word.
                    // Chinese word extraction to be added here later.
                    return token;
                }
            }
        }
        return null;
    }
}
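Both the filter above and the tokenizer below branch on Character.getType(). A small standalone sketch (ours, not part of the commit; class name is for illustration) prints the categories the code cares about:

public class GetTypeDemo {
    public static void main(String[] args) {
        char[] samples = {'a', 'A', '中', '7', '-'};
        for (int i = 0; i < samples.length; i++) {
            // Prints the numeric category for each sample character:
            // 'a' -> LOWERCASE_LETTER, 'A' -> UPPERCASE_LETTER,
            // '中' -> OTHER_LETTER (CJK), '7' -> DECIMAL_DIGIT_NUMBER;
            // '-' matches none of the cases above, so both classes skip it.
            System.out.println(samples[i] + " -> " + Character.getType(samples[i]));
        }
    }
}

Because the filter checks only charAt(0), a token such as "abc123" survives the digit rule while "123abc" is dropped.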
ChineseTokenizer.java
@@ -0,0 +1,98 @@
package org.apache.lucene.analysis.cn;

import java.io.Reader;
import org.apache.lucene.analysis.*;

/**
 * Title: ChineseTokenizer
 * Description: Extracts tokens from the Reader using Character.getType().
 * Rule: A Chinese character is a single token.
 * Copyright: Copyright (c) 2001
 * @author Yiyi Sun
 * @version 1.0
 */
public final class ChineseTokenizer extends Tokenizer {

    public ChineseTokenizer(Reader in) {
        input = in;
    }

    private int offset = 0, bufferIndex = 0, dataLen = 0;
    private final static int MAX_WORD_LEN = 255;
    private final static int IO_BUFFER_SIZE = 1024;
    private final char[] buffer = new char[MAX_WORD_LEN];
    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

    private int length;    // length of the token being built
    private int start;     // start offset of the token being built

    private final void push(char c) {
        if (length == 0) start = offset - 1;         // start of token
        buffer[length++] = Character.toLowerCase(c); // buffer it
    }

    private final Token flush() {
        if (length > 0) {
            //System.out.println(new String(buffer, 0, length));
            return new Token(new String(buffer, 0, length), start, start + length);
        }
        else
            return null;
    }

    public final Token next() throws java.io.IOException {
        length = 0;
        start = offset;

        while (true) {
            final char c;
            offset++;

            if (bufferIndex >= dataLen) {
                dataLen = input.read(ioBuffer);  // refill the I/O buffer
                bufferIndex = 0;
            }

            if (dataLen == -1) return flush();   // end of input
            else
                c = ioBuffer[bufferIndex++];

            switch (Character.getType(c)) {

            case Character.DECIMAL_DIGIT_NUMBER:
            case Character.LOWERCASE_LETTER:
            case Character.UPPERCASE_LETTER:
                push(c);
                if (length == MAX_WORD_LEN) return flush();
                break;

            case Character.OTHER_LETTER:
                if (length > 0) {
                    // Push the character back and emit the pending token first;
                    // offset is decremented too, or offsets would drift by one.
                    bufferIndex--;
                    offset--;
                    return flush();
                }
                push(c);
                return flush();  // a Chinese character is a token by itself

            default:
                if (length > 0) return flush();
                break;
            }
        }
    }
}
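To see the tokenizer's behavior in isolation, a hedged sketch (ours; class name and sample string are for illustration, and the expected offsets assume the pushback decrements offset as above). Digits and ASCII letters are grouped and lowercased into one token, while each CJK character is emitted on its own:

import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.cn.ChineseTokenizer;

public class ChineseTokenizerDemo {
    public static void main(String[] args) throws Exception {
        ChineseTokenizer tokenizer =
                new ChineseTokenizer(new StringReader("Lucene2是引擎"));
        // Expected tokens with offsets:
        //   lucene2 [0,7), 是 [7,8), 引 [8,9), 擎 [9,10)
        for (Token t = tokenizer.next(); t != null; t = tokenizer.next()) {
            System.out.println(t.termText()
                    + " [" + t.startOffset() + "," + t.endOffset() + ")");
        }
    }
}

Note that "lucene2" would survive ChineseFilter (first character is a letter), whereas a token starting with a digit would not.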