mirror of https://github.com/apache/lucene.git
- Initial commit (http://nagoya.apache.org/bugzilla/show_bug.cgi?id=23786)
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150898 13f79535-47bb-0310-9956-ffa450edef68
parent 950991783a
commit 1ffe616bba
ChineseAnalyzer.java
@@ -0,0 +1,34 @@
package org.apache.lucene.analysis.cn;

import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;

/**
 * Title: ChineseAnalyzer
 * Description: Subclass of org.apache.lucene.analysis.Analyzer,
 *              built from a ChineseTokenizer filtered with ChineseFilter.
 * Copyright: Copyright (c) 2001
 * @author Yiyi Sun
 * @version 1.0
 */
public class ChineseAnalyzer extends Analyzer {

    public ChineseAnalyzer() {
    }

    /**
     * Creates a TokenStream that tokenizes all the text in the provided Reader.
     *
     * @return a TokenStream built from a ChineseTokenizer filtered with ChineseFilter
     */
    public final TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream result = new ChineseTokenizer(reader);
        result = new ChineseFilter(result);
        return result;
    }
}
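For reference, a minimal usage sketch (not part of this commit; the class name ChineseAnalyzerDemo, the field name "contents", and the sample string are ours) driving the analyzer through the Lucene 1.x-era TokenStream API, where next() returns null at end of stream:

import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;

public class ChineseAnalyzerDemo {
    public static void main(String[] args) throws Exception {
        ChineseAnalyzer analyzer = new ChineseAnalyzer();
        TokenStream stream = analyzer.tokenStream("contents",
                new StringReader("Lucene是一个全文检索引擎"));
        // Expected output: "lucene" as a single lowercased token, then one
        // token per Chinese character: 是, 一, 个, 全, 文, 检, 索, 引, 擎.
        for (Token t = stream.next(); t != null; t = stream.next()) {
            System.out.println(t.termText());
        }
    }
}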
ChineseFilter.java
@@ -0,0 +1,77 @@
package org.apache.lucene.analysis.cn;

import java.util.Hashtable;
import org.apache.lucene.analysis.*;

/**
 * Title: ChineseFilter
 * Description: A TokenFilter with a stop word table.
 * Rules: No digits are allowed (a token whose first character is a digit is dropped).
 *        An English word/token must be longer than one character.
 *        One Chinese character is one Chinese word.
 * TO DO:
 *   1. Add Chinese stop words, such as \ue400
 *   2. Dictionary based Chinese word extraction
 *   3. Intelligent Chinese word extraction
 *
 * Copyright: Copyright (c) 2001
 * @author Yiyi Sun
 * @version 1.0
 */
public final class ChineseFilter extends TokenFilter {

    // English only for now; Chinese stop words to be added later.
    public static final String[] STOP_WORDS = {
        "and", "are", "as", "at", "be", "but", "by",
        "for", "if", "in", "into", "is", "it",
        "no", "not", "of", "on", "or", "such",
        "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with"
    };

    private Hashtable stopTable;

    public ChineseFilter(TokenStream in) {
        input = in;

        stopTable = new Hashtable(STOP_WORDS.length);
        for (int i = 0; i < STOP_WORDS.length; i++)
            stopTable.put(STOP_WORDS[i], STOP_WORDS[i]);
    }

    public final Token next() throws java.io.IOException {

        for (Token token = input.next(); token != null; token = input.next()) {
            String text = token.termText();

            if (stopTable.get(text) == null) {
                // Note: only the type of the first character is inspected.
                switch (Character.getType(text.charAt(0))) {

                case Character.LOWERCASE_LETTER:
                case Character.UPPERCASE_LETTER:
                    // An English word/token must be longer than one character.
                    if (text.length() > 1) {
                        return token;
                    }
                    break;

                case Character.OTHER_LETTER:
                    // One Chinese character is one Chinese word.
                    // Chinese word extraction to be added here later.
                    return token;
                }
            }
        }
        return null;
    }
}
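Both the filter above and the tokenizer below branch on Character.getType(). A small standalone sketch (ours, not part of the commit; class name is for illustration) prints the categories the code cares about:

public class GetTypeDemo {
    public static void main(String[] args) {
        char[] samples = {'a', 'A', '中', '7', '-'};
        for (int i = 0; i < samples.length; i++) {
            // Prints the numeric category for each sample character:
            // 'a' -> LOWERCASE_LETTER, 'A' -> UPPERCASE_LETTER,
            // '中' -> OTHER_LETTER (CJK), '7' -> DECIMAL_DIGIT_NUMBER;
            // '-' matches none of the cases above, so both classes skip it.
            System.out.println(samples[i] + " -> " + Character.getType(samples[i]));
        }
    }
}

Because the filter checks only charAt(0), a token such as "abc123" survives the digit rule while "123abc" is dropped.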
ChineseTokenizer.java
@@ -0,0 +1,98 @@
package org.apache.lucene.analysis.cn;

import java.io.Reader;
import org.apache.lucene.analysis.*;

/**
 * Title: ChineseTokenizer
 * Description: Extracts tokens from the Reader using Character.getType().
 * Rule: A Chinese character is a single token.
 * Copyright: Copyright (c) 2001
 * @author Yiyi Sun
 * @version 1.0
 */
public final class ChineseTokenizer extends Tokenizer {

    public ChineseTokenizer(Reader in) {
        input = in;
    }

    private int offset = 0, bufferIndex = 0, dataLen = 0;
    private final static int MAX_WORD_LEN = 255;
    private final static int IO_BUFFER_SIZE = 1024;
    private final char[] buffer = new char[MAX_WORD_LEN];
    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

    private int length;    // length of the token being built
    private int start;     // start offset of the token being built

    private final void push(char c) {
        if (length == 0) start = offset - 1;         // start of token
        buffer[length++] = Character.toLowerCase(c); // buffer it
    }

    private final Token flush() {
        if (length > 0) {
            //System.out.println(new String(buffer, 0, length));
            return new Token(new String(buffer, 0, length), start, start + length);
        }
        else
            return null;
    }

    public final Token next() throws java.io.IOException {
        length = 0;
        start = offset;

        while (true) {
            final char c;
            offset++;

            if (bufferIndex >= dataLen) {
                dataLen = input.read(ioBuffer);  // refill the I/O buffer
                bufferIndex = 0;
            }

            if (dataLen == -1) return flush();   // end of input
            else
                c = ioBuffer[bufferIndex++];

            switch (Character.getType(c)) {

            case Character.DECIMAL_DIGIT_NUMBER:
            case Character.LOWERCASE_LETTER:
            case Character.UPPERCASE_LETTER:
                push(c);
                if (length == MAX_WORD_LEN) return flush();
                break;

            case Character.OTHER_LETTER:
                if (length > 0) {
                    // Push the character back and emit the pending token first;
                    // offset is decremented too, or offsets would drift by one.
                    bufferIndex--;
                    offset--;
                    return flush();
                }
                push(c);
                return flush();  // a Chinese character is a token by itself

            default:
                if (length > 0) return flush();
                break;
            }
        }
    }
}
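To see the tokenizer's behavior in isolation, a hedged sketch (ours; class name and sample string are for illustration, and the expected offsets assume the pushback decrements offset as above). Digits and ASCII letters are grouped and lowercased into one token, while each CJK character is emitted on its own:

import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.cn.ChineseTokenizer;

public class ChineseTokenizerDemo {
    public static void main(String[] args) throws Exception {
        ChineseTokenizer tokenizer =
                new ChineseTokenizer(new StringReader("Lucene2是引擎"));
        // Expected tokens with offsets:
        //   lucene2 [0,7), 是 [7,8), 引 [8,9), 擎 [9,10)
        for (Token t = tokenizer.next(); t != null; t = tokenizer.next()) {
            System.out.println(t.termText()
                    + " [" + t.startOffset() + "," + t.endOffset() + ")");
        }
    }
}

Note that "lucene2" would survive ChineseFilter (first character is a letter), whereas a token starting with a digit would not.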