LUCENE-1629: adding new contrib analyzer SmartChineseAnalyzer

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@774718 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2009-05-14 10:09:22 +00:00
parent cc240447d4
commit e01aad89fe
29 changed files with 3659 additions and 4 deletions

@@ -3,14 +3,14 @@ Lucene Build Instructions
$Id$
Basic steps:
0) Install JDK 1.4 (or greater), Ant 1.6.2 (or greater)
0) Install JDK 1.4 (or greater), Ant 1.6.3 (or greater)
1) Download Lucene from Apache and unpack it
2) Connect to the top-level of your Lucene installation
3) Install JavaCC (optional)
4) Run ant
Step 0) Set up your development environment (JDK 1.4 or greater,
Ant 1.6.2 or greater)
Ant 1.6.3 or greater)
We'll assume that you know how to get and set up the JDK - if you
don't, then we suggest starting at http://java.sun.com and learning
@@ -18,7 +18,7 @@ more about Java, before returning to this README. Lucene runs with
JDK 1.4 and later.
Like many Open Source java projects, Lucene uses Apache Ant for build
control. Specifically, you MUST use Ant version 1.6.2 or greater.
control. Specifically, you MUST use Ant version 1.6.3 or greater.
Ant is "kind of like make without make's wrinkles". Ant is
implemented in java and uses XML-based configuration files. You can

@@ -308,6 +308,12 @@ Bug fixes
cross-correlate Spans from different fields.
(Paul Cowan and Chris Hostetter)
25. LUCENE-1629: Add SmartChineseAnalyzer to contrib/analyzers. It
improves on CJKAnalyzer and ChineseAnalyzer by handling Chinese
sentences properly. SmartChineseAnalyzer uses a Hidden Markov
Model to tokenize Chinese words in a more intelligent way.
(Xiaoping Gao via Mike McCandless)
Optimizations
1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing

@@ -233,6 +233,12 @@
destdir="${build.dir}/classes/java">
<classpath refid="classpath"/>
</compile>
<!-- Copy the resources folder (if present) -->
<copy todir="${build.dir}/classes/java" includeEmptyDirs="false">
<globmapper from="resources/*" to="*" handledirsep="yes"/>
<fileset dir="src" includes="resources/**"/>
</copy>
</target>
<target name="compile" depends="compile-core">

@@ -0,0 +1,129 @@
/**
* Copyright 2009 www.imdict.net
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.PorterStemFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
import org.apache.lucene.analysis.cn.smart.WordSegmenter;
import org.apache.lucene.analysis.cn.smart.WordTokenizer;
/**
*
* SmartChineseAnalyzer is an intelligent Chinese word-segmentation module. It uses
* probabilities to compute the optimal segmentation of a Chinese sentence, and it
* embeds an English tokenizer so that mixed Chinese/English text is handled correctly.
*
* It is based on the Hidden Markov Model (HMM) from natural language processing:
* word frequencies and transition probabilities are estimated from a large training
* corpus, and from these statistics the most likely segmentation of the whole
* sentence is computed.
*
* Because intelligent segmentation needs dictionaries that hold the word statistics,
* SmartChineseAnalyzer requires the dictionary location to be specified; see
* org.apache.lucene.analysis.cn.smart.AnalyzerProfile for how to specify it.
*
* The algorithm and the dictionary data of SmartChineseAnalyzer come from the
* ictclas1.0 project (http://www.ictclas.org). The dictionaries have been licensed
* by www.ictclas.org under the Apache License v2 (APLv2); users are welcome to use
* them under the terms of the APLv2. Many thanks to www.ictclas.org and the authors
* of the ictclas segmenter for their generous contribution.
*
* @see org.apache.lucene.analysis.cn.smart.AnalyzerProfile
*
*/
public class SmartChineseAnalyzer extends Analyzer {
private Set stopWords = null;
private WordSegmenter wordSegment;
public SmartChineseAnalyzer() {
this(false);
}
/**
* SmartChineseAnalyzer ships with a default stop-word list consisting mainly of
* punctuation. Set useDefaultStopWords to true to keep punctuation out of the
* results; when useDefaultStopWords is false, no stop words are used.
*
* @param useDefaultStopWords whether to use the built-in stop-word list
*/
public SmartChineseAnalyzer(boolean useDefaultStopWords) {
if (useDefaultStopWords) {
stopWords = loadStopWords(this.getClass().getResourceAsStream(
"stopwords.txt"));
}
wordSegment = new WordSegmenter();
}
/**
* Use a custom stop-word set instead of the built-in one. Stop words can be
* loaded with SmartChineseAnalyzer.loadStopWords(InputStream).
*
* @param stopWords the stop words to use
* @see SmartChineseAnalyzer.loadStopWords(InputStream)
*/
public SmartChineseAnalyzer(Set stopWords) {
this.stopWords = stopWords;
wordSegment = new WordSegmenter();
}
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new SentenceTokenizer(reader);
result = new WordTokenizer(result, wordSegment);
// result = new LowerCaseFilter(result);
// A LowerCaseFilter is no longer needed: SegTokenFilter already lower-cases
// all English characters.
// Stemming is rather aggressive here; this is not a bug, it's a feature :)
result = new PorterStemFilter(result);
if (stopWords != null) {
result = new StopFilter(result, stopWords, false);
}
return result;
}
/**
* Load stop words from a stop-word file: a plain UTF-8 text file with one stop
* word per line; comments start with //. The stop words include Chinese
* punctuation, the Chinese full-width space, and words too frequent to be useful
* for indexing.
*
* @param input the stop-word file
* @return a HashSet containing the stop words
*/
public static Set loadStopWords(InputStream input) {
String line;
Set stopWords = new HashSet();
try {
BufferedReader br = new BufferedReader(new InputStreamReader(input,
"UTF-8"));
while ((line = br.readLine()) != null) {
if (line.indexOf("//") != -1) {
line = line.substring(0, line.indexOf("//"));
}
line = line.trim();
if (line.length() != 0)
stopWords.add(line.toLowerCase());
}
br.close();
} catch (IOException e) {
System.err.println("WARNING: cannot open stop words list!");
}
return stopWords;
}
}
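A minimal usage sketch against the Lucene 2.4 TokenStream API; the field name "content" and the sample text are illustrative:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.SmartChineseAnalyzer;

public class SmartChineseAnalyzerDemo {
  public static void main(String[] args) throws IOException {
    // true: filter with the built-in stop-word list (mostly punctuation)
    Analyzer analyzer = new SmartChineseAnalyzer(true);
    TokenStream ts = analyzer.tokenStream("content",
        new StringReader("我是中国人。 I am a Chinese person."));
    // Lucene 2.4-style consumption: next() returns null when the stream is exhausted
    for (Token t = ts.next(); t != null; t = ts.next()) {
      System.out.println(t.term() + " [" + t.startOffset() + "," + t.endOffset() + ")");
    }
    ts.close();
  }
}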

@@ -1,5 +1,51 @@
<html><head></head>
<html>
<head></head>
<body>
Analyzer for Chinese.
<h2>About SmartChineseAnalyzer</h2>
<p>SmartChineseAnalyzer is an intelligent Chinese word-segmentation module. Unlike
ChineseAnalyzer (which splits on every single hanzi) and CJKAnalyzer (which combines
every two adjacent hanzi), it uses probabilities to compute the optimal segmentation
of a Chinese sentence, and it embeds an English tokenizer so that mixed
Chinese/English text is handled correctly. The SmartChineseAnalyzer dictionaries
currently support Simplified Chinese only.</p>
<p>It is based on the Hidden Markov Model (HMM) from natural language processing:
word frequencies and transition probabilities are estimated from a large training
corpus, and from these statistics the most likely segmentation of the whole sentence
is computed.</p>
<p>Comparing the output of the three segmentation modules shows that intelligent
segmentation matches the original meaning of the sentence much better, which improves
search precision.
<pre>Sentence: 我是中国人 ("I am Chinese")</pre>
<ol>
<li>SmartChineseAnalyzer: 我-是-中国-人</li>
<li>ChineseAnalyzer: 我-是-中-国-人</li>
<li>CJKAnalyzer: 我是-是中-中国-国人</li>
</ol>
</p>
<h3>Configuring the dictionaries</h3>
<p>Intelligent segmentation needs dictionaries that hold the word statistics. By
default SmartChineseAnalyzer uses its built-in dictionaries; to use custom
dictionaries, their location must be specified. See
org.apache.lucene.analysis.cn.smart.AnalyzerProfile for how to specify the
dictionary location.</p>
<p><b>The dictionary data can be downloaded from <a
href="http://code.google.com/p/imdict-chinese-analyzer/downloads/list">http://code.google.com/p/imdict-chinese-analyzer/downloads/list</a>
</b>: download analysis-data.zip, save it locally, and unpack it.</p>
<p>The simplest way to point at the dictionaries is the runtime parameter
-Danalysis.data.dir:
<pre>e.g.: java -Danalysis.data.dir=/path/to/analysis-data com.example.YourApplication</pre>
</p>
<h3>Version requirements</h3>
<p>SmartChineseAnalyzer requires Java 1.4 or later and Lucene 2.4.0 or later. Lucene
2.3.x should also work but has not been tested; users who need it can test it
themselves.</p>
<h3>Source files and text encoding</h3>
Apart from a few binary data files, all text files and Java sources of
SmartChineseAnalyzer are encoded in UTF-8. Take care to read the text files and
compile the Java sources with the correct encoding to avoid character corruption.
<h3>License of SmartChineseAnalyzer</h3>
<p>The algorithm and the dictionary data of SmartChineseAnalyzer come from the
ictclas1.0 project (<a
href="http://www.ictclas.org">http://www.ictclas.org</a>).
The copyright holder, www.ictclas.org, has permitted the dictionaries to be released
under the Apache License v2 (APLv2); users are welcome to use them under the terms
of the APLv2.
Many thanks to www.ictclas.org and the authors of the ictclas segmenter for their
hard work and generous contribution.</p>
</body>
</html>

@@ -0,0 +1,112 @@
/**
* Copyright 2009 www.imdict.net
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn.smart;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Properties;
/**
* By default SmartChineseAnalyzer ships with built-in dictionaries and a default
* stop-word list, already packaged and ready to use.
*
* In the special case where custom dictionaries and stop words are needed, delete
* coredict.mem and bigramdict.mem under org.apache.lucene.analysis.cn.smart.hhmm
* and use AnalyzerProfile to point at the dictionary directory.
*
* AnalyzerProfile locates the directory holding the segmentation dictionaries and
* the stop-word data. That directory should contain bigramdict.dct, coredict.dct
* and stopwords_utf8.txt. The lookup proceeds in the following order:
*
* <ol>
* <li>the runtime system property -Danalysis.data.dir=/path/to/analysis-data; if
* absent, continue;</li>
* <li>an analysis-data directory in the current working directory;</li>
* <li>an analysis-data directory in lib/ under the working directory;</li>
* <li>an analysis.properties file in the current working directory;</li>
* <li>an analysis.properties file in lib/ under the working directory.</li>
* </ol>
*
* In the analysis.properties file, the property analysis.data.dir gives the
* location of the analysis-data directory. Example analysis.properties content:
*
* <pre>
* analysis.data.dir=D:/path/to/analysis-data/
* </pre>
*
* When no analysis-data directory is found, ANALYSIS_DATA_DIR is set to "", so the
* data directory must then be set explicitly in code before use, for example:
*
* <pre>
* AnalyzerProfile.ANALYSIS_DATA_DIR = &quot;/path/to/analysis-data&quot;;
* </pre>
*
*/
public class AnalyzerProfile {
public static String ANALYSIS_DATA_DIR = "";
static {
init();
}
private static void init() {
String dirName = "analysis-data";
String propName = "analysis.properties";
// Read the system property; it can be set at runtime with -Danalysis.data.dir=/path/to/analysis-data
ANALYSIS_DATA_DIR = System.getProperty("analysis.data.dir", "");
if (ANALYSIS_DATA_DIR.length() != 0)
return;
File[] candidateFiles = new File[] { new File("./" + dirName),
new File("./lib/" + dirName), new File("./" + propName),
new File("./lib/" + propName) };
for (int i = 0; i < candidateFiles.length; i++) {
File file = candidateFiles[i];
if (file.exists()) {
if (file.isDirectory()) {
ANALYSIS_DATA_DIR = file.getAbsolutePath();
} else if (file.isFile() && getAnalysisDataDir(file).length() != 0) {
ANALYSIS_DATA_DIR = getAnalysisDataDir(file);
}
break;
}
}
if (ANALYSIS_DATA_DIR.length() == 0) {
// warn the user that the dictionary directory was not found
System.err
.println("WARNING: Cannot find the lexical dictionary directory!");
System.err
.println("WARNING: This will cause unpredictable exceptions in your application!");
System.err
.println("WARNING: Please refer to the manual to download the dictionaries.");
}
}
private static String getAnalysisDataDir(File propFile) {
Properties prop = new Properties();
try {
FileInputStream input = new FileInputStream(propFile);
prop.load(input);
String dir = prop.getProperty("analysis.data.dir", "");
input.close();
return dir;
} catch (IOException e) {
}
return "";
}
}
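A short sketch of the two override mechanisms described in the class comment; the path is a placeholder:

import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;

public class DictionaryLocationDemo {
  public static void main(String[] args) {
    // Option 1: set the property before AnalyzerProfile's static initializer runs
    // (equivalent to passing -Danalysis.data.dir=... on the command line).
    System.setProperty("analysis.data.dir", "/path/to/analysis-data");

    // Option 2: once AnalyzerProfile has been initialized, assign the public field
    // directly, before any dictionary is loaded.
    AnalyzerProfile.ANALYSIS_DATA_DIR = "/path/to/analysis-data";
  }
}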

@@ -0,0 +1,38 @@
/**
* Copyright 2009 www.imdict.net
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn.smart;
public class CharType {
public final static int DELIMITER = 0; // (full- or half-width) punctuation
public final static int LETTER = 1; // half-width letters
public final static int DIGIT = 2; // half-width digits
public final static int HANZI = 3; // Chinese characters
public final static int SPACE_LIKE = 4; // spaces and line breaks such as "\t\r\n"
public final static int FULLWIDTH_LETTER = 5; // full-width letters
public final static int FULLWIDTH_DIGIT = 6; // full-width digits
public final static int OTHER = 7;
}

@@ -0,0 +1,102 @@
/**
* Copyright 2009 www.imdict.net
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn.smart;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
/**
*
* Produces Tokens that each contain one complete sentence read from the input;
* the sentence is the unit passed on to the next segmentation step.
*
*/
public class SentenceTokenizer extends Tokenizer {
/**
* Punctuation used to end a sentence: full-width 。,!?; and half-width ,!?;
*/
public final static String PUNCTION = "。,!?;,!?;";
private StringBuffer buffer = new StringBuffer();
private BufferedReader bufferInput;
private int tokenStart = 0, tokenEnd = 0;
private Token t = new Token();
public SentenceTokenizer(Reader reader) {
bufferInput = new BufferedReader(reader, 2048);
}
public Token next() throws IOException {
buffer.setLength(0);
int ci;
char ch, pch;
boolean atBegin = true;
tokenStart = tokenEnd;
ci = bufferInput.read();
ch = (char) ci;
while (true) {
if (ci == -1) {
break;
} else if (PUNCTION.indexOf(ch) != -1) {
// found the end of the sentence
buffer.append(ch);
tokenEnd++;
break;
} else if (atBegin && Utility.SPACES.indexOf(ch) != -1) {
tokenStart++;
tokenEnd++;
ci = bufferInput.read();
ch = (char) ci;
} else {
buffer.append(ch);
atBegin = false;
tokenEnd++;
pch = ch;
ci = bufferInput.read();
ch = (char) ci;
// If two consecutive skippable characters appear (two carriage returns, two
// spaces, a carriage return plus a space, etc.), treat them as a sentence end
// so an overlong sentence cannot exhaust memory.
if (Utility.SPACES.indexOf(ch) != -1
&& Utility.SPACES.indexOf(pch) != -1) {
// buffer.append(ch);
tokenEnd++;
break;
}
}
}
if (buffer.length() == 0)
return null;
else {
t.clear();
t.reinit(buffer.toString(), tokenStart, tokenEnd, "sentence");
return t;
}
}
public void close() throws IOException {
bufferInput.close();
}
}
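A small sketch of the sentence splitting implemented above; each returned Token is one sentence, of type "sentence", with its trailing punctuation included (the sample text is illustrative):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;

public class SentenceSplitDemo {
  public static void main(String[] args) throws IOException {
    SentenceTokenizer st = new SentenceTokenizer(
        new StringReader("今天天气很好。我们去郊游。"));
    // prints 今天天气很好。 and 我们去郊游。 on separate lines
    for (Token t = st.next(); t != null; t = st.next()) {
      System.out.println(t.term());
    }
    st.close();
  }
}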

@@ -0,0 +1,165 @@
/**
* Copyright 2009 www.imdict.net
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn.smart;
public class Utility {
public static final char[] STRING_CHAR_ARRAY = new String("未##串")
.toCharArray();
public static final char[] NUMBER_CHAR_ARRAY = new String("未##数")
.toCharArray();
public static final char[] START_CHAR_ARRAY = new String("始##始")
.toCharArray();
public static final char[] END_CHAR_ARRAY = new String("末##末").toCharArray();
public static final char[] COMMON_DELIMITER = new char[] { ',' };
/**
* Characters to skip, e.g. full-width spaces, tabs, carriage returns and line feeds.
*/
public static final String SPACES = "  \t\r\n";
public static final int MAX_FREQUENCE = 2079997 + 80000;
/**
* Compare two char arrays starting at the given offsets, element by element. If
* they match all the way to both ends they compare equal; otherwise an array that
* has not reached its end is greater than one that has, and on the first mismatch
* the array with the larger value at that position is the greater one.
*
* @param larray
* @param lstartIndex start offset into larray
* @param rarray
* @param rstartIndex start offset into rarray
* @return 0 if equal, 1 if larray > rarray, -1 if larray < rarray
*/
public static int compareArray(char[] larray, int lstartIndex, char[] rarray,
int rstartIndex) {
if (larray == null) {
if (rarray == null || rstartIndex >= rarray.length)
return 0;
else
return -1;
} else {
// larray != null
if (rarray == null) {
if (lstartIndex >= larray.length)
return 0;
else
return 1;
}
}
int li = lstartIndex, ri = rstartIndex;
while (li < larray.length && ri < rarray.length && larray[li] == rarray[ri]) {
li++;
ri++;
}
if (li == larray.length) {
if (ri == rarray.length) {
// equal all the way to both ends, so return 0
return 0;
} else {
// ri > rarray.length is impossible here, so ri < rarray.length:
// larray has ended but rarray has not, so larray < rarray; return -1
return -1;
}
} else {
// li > larray.length is impossible here, so li < larray.length: larray has not ended
if (ri == rarray.length) {
// larray has not ended but rarray has, so larray > rarray
return 1;
} else {
// ri > rarray.length is impossible here, so ri < rarray.length:
// neither array has ended, so compare by the next element
if (larray[li] > rarray[ri])
return 1;
else
return -1;
}
}
}
/**
* Compare two char arrays by prefix: if the shorter array is a prefix of the
* longer one they compare equal; otherwise compare them like ordinary strings.
*
* @param shortArray
* @param shortIndex start offset into shortArray
* @param longArray
* @param longIndex start offset into longArray
* @return 0 if shortArray is a prefix of longArray, otherwise 1 or -1 as for
* compareArray
*/
public static int compareArrayByPrefix(char[] shortArray, int shortIndex,
char[] longArray, int longIndex) {
// a null array is a prefix of every array, regardless of offset
if (shortArray == null)
return 0;
else if (longArray == null)
return (shortIndex < shortArray.length) ? 1 : 0;
int si = shortIndex, li = longIndex;
while (si < shortArray.length && li < longArray.length
&& shortArray[si] == longArray[li]) {
si++;
li++;
}
if (si == shortArray.length) {
// shortArray is a prefix of longArray
return 0;
} else {
// si > shortArray.length is impossible here, so si < shortArray.length:
// shortArray has not ended.
// shortArray has not ended but longArray has, so shortArray > longArray
if (li == longArray.length)
return 1;
else
// li > longArray.length is impossible here, so li < longArray.length:
// neither array has ended, so compare by the next element
return (shortArray[si] > longArray[li]) ? 1 : -1;
}
}
public static int getCharType(char ch) {
// hanzi are by far the most common case
if (ch >= 0x4E00 && ch <= 0x9FA5)
return CharType.HANZI;
if ((ch >= 0x0041 && ch <= 0x005A) || (ch >= 0x0061 && ch <= 0x007A))
return CharType.LETTER;
if (ch >= 0x0030 && ch <= 0x0039)
return CharType.DIGIT;
if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n' || ch == ' ')
return CharType.SPACE_LIKE;
// everything else in the leading ranges is punctuation
if ((ch >= 0x0021 && ch <= 0x00BB) || (ch >= 0x2010 && ch <= 0x2642)
|| (ch >= 0x3001 && ch <= 0x301E))
return CharType.DELIMITER;
// full-width character ranges
if ((ch >= 0xFF21 && ch <= 0xFF3A) || (ch >= 0xFF41 && ch <= 0xFF5A))
return CharType.FULLWIDTH_LETTER;
if (ch >= 0xFF10 && ch <= 0xFF19)
return CharType.FULLWIDTH_DIGIT;
if (ch >= 0xFE30 && ch <= 0xFF63)
return CharType.DELIMITER;
return CharType.OTHER;
}
}
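A short sketch exercising the classification and comparison helpers above:

import org.apache.lucene.analysis.cn.smart.CharType;
import org.apache.lucene.analysis.cn.smart.Utility;

public class UtilityDemo {
  public static void main(String[] args) {
    System.out.println(Utility.getCharType('中') == CharType.HANZI);            // true
    System.out.println(Utility.getCharType('a') == CharType.LETTER);           // true
    System.out.println(Utility.getCharType(',') == CharType.DELIMITER);        // true (full-width comma)
    System.out.println(Utility.getCharType('A') == CharType.FULLWIDTH_LETTER); // true
    // 中国 is a prefix of 中国人, so the prefix comparison reports equality (0)
    System.out.println(Utility.compareArrayByPrefix(
        "中国".toCharArray(), 0, "中国人".toCharArray(), 0));
  }
}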

@@ -0,0 +1,87 @@
/**
* Copyright 2009 www.imdict.net
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn.smart;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter;
import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;
import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter;
public class WordSegmenter {
private HHMMSegmenter hhmmSegmenter = new HHMMSegmenter();
private SegTokenFilter tokenFilter = new SegTokenFilter();
/**
* Segment the given sentence Token with the HHMM segmenter and return the result
* as a List of Tokens.
*
* @param sentenceToken the sentence Token
* @param shortPathCount number of candidate shortest paths kept before
* optimization in the HHMM segmentation; larger values generally give more
* precise results at a higher computational cost
* @return the segmentation result as a List of Tokens
*/
public List segmentSentence(Token sentenceToken, int shortPathCount) {
String sentence = sentenceToken.term();
List segTokenList = hhmmSegmenter.process(sentence);
List result = new ArrayList();
// i runs from 1 to segTokenList.size() - 2, dropping the start and end marker tokens (始##始 and 末##末)
for (int i = 1; i < segTokenList.size() - 1; i++) {
result.add(convertSegToken((SegToken) segTokenList.get(i), sentence,
sentenceToken.startOffset(), "word"));
}
return result;
}
/**
*
* Convert a SegToken into the Token type needed for indexing. Indexing needs the
* token's text exactly as it appears in the original sentence, so the sentence
* must be supplied.
*
* @param st the SegToken to convert
* @param sentence the sentence the token came from
* @param sentenceStartOffset start offset of the sentence within the document
* @param type the token type, normally "word"
* @return the converted Token
*/
public Token convertSegToken(SegToken st, String sentence,
int sentenceStartOffset, String type) {
Token result;
switch (st.wordType) {
case WordType.STRING:
case WordType.NUMBER:
case WordType.FULLWIDTH_NUMBER:
case WordType.FULLWIDTH_STRING:
st.charArray = sentence.substring(st.startOffset, st.endOffset)
.toCharArray();
break;
default:
break;
}
st = tokenFilter.filter(st);
result = new Token(st.charArray, 0, st.charArray.length, st.startOffset
+ sentenceStartOffset, st.endOffset + sentenceStartOffset);
return result;
}
}

@@ -0,0 +1,87 @@
/**
* Copyright 2009 www.imdict.net
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn.smart;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
public class WordTokenizer extends Tokenizer {
/**
* The segmenter that does the actual work; supplied when the WordTokenizer is constructed.
*/
private WordSegmenter wordSegmenter;
private TokenStream in;
private Iterator tokenIter;
private List tokenBuffer;
private Token sentenceToken = new Token();
/**
* Designed as the processing stage after SentenceTokenizer: it reads sentence
* Tokens from the input stream, segments each sentence with the HHMM segmenter,
* and returns the resulting word Tokens.
*
* @param in the TokenStream of sentence Tokens
* @param wordSegmenter the segmenter used to split each sentence
*/
public WordTokenizer(TokenStream in, WordSegmenter wordSegmenter) {
this.in = in;
this.wordSegmenter = wordSegmenter;
}
public Token next() throws IOException {
if (tokenIter != null && tokenIter.hasNext())
return (Token) tokenIter.next();
else {
if (processNextSentence()) {
return (Token) tokenIter.next();
} else
return null;
}
}
/**
* When the current sentence has been fully segmented and consumed, the next
* sentence Token must be read: this method asks the upstream SentenceTokenizer
* for the next sentence, segments it, and stores the resulting Tokens in
* tokenBuffer.
*
* @return whether the next sentence was read and processed successfully; false
* means the input is exhausted and there are no more Tokens
* @throws IOException
*/
private boolean processNextSentence() throws IOException {
sentenceToken = in.next(sentenceToken);
if (sentenceToken == null)
return false;
tokenBuffer = wordSegmenter.segmentSentence(sentenceToken, 1);
tokenIter = tokenBuffer.iterator();
return tokenBuffer != null && tokenIter.hasNext();
}
public void close() throws IOException {
in.close();
}
}
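The chain below is the same pipeline SmartChineseAnalyzer.tokenStream assembles, built by hand; a sketch for when only sentence splitting plus HHMM segmentation is wanted, without stemming or stop words (the dictionaries must be reachable as described in AnalyzerProfile):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
import org.apache.lucene.analysis.cn.smart.WordSegmenter;
import org.apache.lucene.analysis.cn.smart.WordTokenizer;

public class ManualPipelineDemo {
  public static void main(String[] args) throws IOException {
    // sentence splitting first, then HHMM word segmentation of each sentence
    TokenStream ts = new WordTokenizer(
        new SentenceTokenizer(new StringReader("我是中国人。")),
        new WordSegmenter());
    for (Token t = ts.next(); t != null; t = ts.next()) {
      System.out.println(t.term());
    }
    ts.close();
  }
}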

@@ -0,0 +1,37 @@
/**
* Copyright 2009 www.imdict.net
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn.smart;
public class WordType {
public final static int SENTENCE_BEGIN = 0;
public final static int SENTENCE_END = 1; // sentence start and end markers
public final static int CHINESE_WORD = 2; // Chinese word
public final static int STRING = 3;
public final static int NUMBER = 4; // ASCII strings and numbers
public final static int DELIMITER = 5; // all punctuation
public final static int FULLWIDTH_STRING = 6;
public final static int FULLWIDTH_NUMBER = 7; // strings containing full-width letters; numbers containing full-width digits
}

@@ -0,0 +1,195 @@
/**
* Copyright 2009 www.imdict.net
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn.smart.hhmm;
import java.io.UnsupportedEncodingException;
public abstract class AbstractDictionary {
/**
* Position of the first hanzi: 15 blocks of 94 characters precede it, so the
* first hanzi ('啊') has id 15 * 94 = 1410.
*/
public static final int GB2312_FIRST_CHAR = 1410;
/**
* Only blocks 01~87 of GB2312 can hold valid characters: 87 * 94 = 8178 in total.
*/
public static final int GB2312_CHAR_NUM = 87 * 94;
/**
* The dictionary file contains frequency statistics for 6768 hanzi slots.
*/
public static final int CHAR_NUM_IN_FILE = 6768;
// =====================================================
// GB2312 block layout
// block count category
// 01 94 general symbols
// 02 72 sequence numbers
// 03 94 Latin letters
// 04 83 Japanese hiragana
// 05 86 Katakana
// 06 48 Greek letters
// 07 66 Cyrillic letters
// 08 63 Hanyu Pinyin symbols
// 09 76 graphical symbols
// 10-15 reserved
// 16-55 3755 level-1 hanzi, ordered by pinyin
// 56-87 3008 level-2 hanzi, ordered by stroke count
// 88-94 reserved
// ======================================================
/**
* GB2312 defines 7445 characters in total: 6763 simplified hanzi plus 682 letters
* and symbols.
*
* GB2312 arranges its characters in 94 blocks (numbered 01 to 94), each holding
* 94 positions (numbered 01 to 94). Block 01 starts at byte 0xA1 and position 94
* ends at byte 0xFE; every GB2312 character is uniquely identified by its block
* and position. For example, the hanzi 啊 is at block 16, position 01.
*
*/
/**
* @param ccid the GB2312 id
* @return the character with that id, or "" if the id is out of range
*/
public String getCCByGB2312Id(int ccid) {
if (ccid < 0 || ccid > WordDictionary.GB2312_CHAR_NUM)
return "";
int cc1 = ccid / 94 + 161;
int cc2 = ccid % 94 + 161;
byte[] buffer = new byte[2];
buffer[0] = (byte) cc1;
buffer[1] = (byte) cc2;
try {
String cchar = new String(buffer, "GB2312");
return cchar;
} catch (UnsupportedEncodingException e) {
return "";
}
}
/**
* Get the GB2312 id of a Unicode character.
*
* @param ch a Chinese character from the GB2312 set
* @return the position of ch in GB2312, or -1 if ch is not a two-byte GB2312
* character (plain ASCII included)
*/
public short getGB2312Id(char ch) {
try {
byte[] buffer = Character.toString(ch).getBytes("GB2312");
if (buffer.length != 2) {
// Normally the buffer holds two bytes; otherwise ch is not in GB2312 (the encoder produced '?'), so the character is unknown.
return -1;
}
int b0 = (int) (buffer[0] & 0x0FF) - 161; // the encoding starts at 0xA1, so subtract 0xA1 = 161
int b1 = (int) (buffer[1] & 0x0FF) - 161; // each block holds 94 positions, so id = b0 * 94 + b1
return (short) (b0 * 94 + b1);
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
return -1;
}
/**
* An FNV-style hash (using the 64-bit FNV-1 constants), used as the first hash
* function in this program. The first and second hash functions together compute
* the hash-table addresses, spreading entries evenly and avoiding the long probe
* sequences an overfull table would cause.
*
* @param c the Unicode character to hash
* @return the hash value of c
* @see Utility.hash2()
*/
public long hash1(char c) {
final long p = 1099511628211L;
long hash = 0xcbf29ce484222325L;
hash = (hash ^ (c & 0x00FF)) * p;
hash = (hash ^ (c >> 8)) * p;
hash += hash << 13;
hash ^= hash >> 7;
hash += hash << 3;
hash ^= hash >> 17;
hash += hash << 5;
return hash;
}
/**
* @see Utility.hash1(char[])
* @param carray
* @return
*/
public long hash1(char carray[]) {
final long p = 1099511628211L;
long hash = 0xcbf29ce484222325L;
for (int i = 0; i < carray.length; i++) {
char d = carray[i];
hash = (hash ^ (d & 0x00FF)) * p;
hash = (hash ^ (d >> 8)) * p;
}
// hash += hash << 13;
// hash ^= hash >> 7;
// hash += hash << 3;
// hash ^= hash >> 17;
// hash += hash << 5;
return hash;
}
/**
* The djb2 hash algorithm, used as the second hash function in this program.
*
* djb2: this algorithm (k=33) was first reported by Dan Bernstein many years ago
* in comp.lang.c. Another version of this algorithm (now favored by Bernstein)
* uses xor: hash(i) = hash(i - 1) * 33 ^ str[i]; the magic of number 33 (why it
* works better than many other constants, prime or not) has never been adequately
* explained.
*
* @param c
* @return
*/
public int hash2(char c) {
int hash = 5381;
/* hash * 33 + c; note that operator precedence makes the & 0x00FF and >> 8
apply to the whole sum, not just to c */
hash = ((hash << 5) + hash) + c & 0x00FF;
hash = ((hash << 5) + hash) + c >> 8;
return hash;
}
/**
* @see Utility.hash2(char[])
* @param carray
* @return
*/
public int hash2(char carray[]) {
int hash = 5381;
/* hash * 33 + d; as above, the & 0x00FF and >> 8 apply to the whole sum */
for (int i = 0; i < carray.length; i++) {
char d = carray[i];
hash = ((hash << 5) + hash) + d & 0x00FF;
hash = ((hash << 5) + hash) + d >> 8;
}
return hash;
}
}
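A worked check of the id arithmetic above: the first level-1 hanzi 啊 is encoded 0xB0A1 in GB2312, so b0 = 0xB0 - 0xA1 = 15 and b1 = 0xA1 - 0xA1 = 0, giving id 15 * 94 + 0 = 1410 = GB2312_FIRST_CHAR. A sketch (AbstractDictionary declares no abstract methods, so an anonymous subclass suffices):

import org.apache.lucene.analysis.cn.smart.hhmm.AbstractDictionary;

public class GB2312IdDemo {
  public static void main(String[] args) {
    AbstractDictionary d = new AbstractDictionary() {};
    short id = d.getGB2312Id('啊');
    System.out.println(id);                    // 1410
    System.out.println(d.getCCByGB2312Id(id)); // round-trips back to 啊
    System.out.println(d.getGB2312Id('a'));    // -1: single-byte in GB2312
  }
}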

@@ -0,0 +1,237 @@
/**
* Copyright 2009 www.imdict.net
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn.smart.hhmm;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.cn.smart.Utility;
public class BiSegGraph {
private Map tokenPairListTable = new HashMap();
private List segTokenList;
private static BigramDictionary bigramDict = BigramDictionary.getInstance();
public BiSegGraph(SegGraph segGraph) {
segTokenList = segGraph.makeIndex();
generateBiSegGraph(segGraph);
}
/**
* Build the graph of all adjacent token pairs and store it in a map keyed by each
* pair's target token index.
*
* @param segGraph the graph of all candidate tokens
*/
private void generateBiSegGraph(SegGraph segGraph) {
double smooth = 0.1;
int wordPairFreq = 0;
int maxStart = segGraph.getMaxStart();
double oneWordFreq, weight, tinyDouble = 1.0 / Utility.MAX_FREQUENCE;
int next;
char[] idBuffer;
// assign an index to every element in segGraph
segTokenList = segGraph.makeIndex();
// the start token 始##始 has start offset -1, so key -1 retrieves it
int key = -1;
List nextTokens = null;
while (key < maxStart) {
if (segGraph.isStartExist(key)) {
List tokenList = segGraph.getStartList(key);
// compute once for every Token starting at this key
for (Iterator iter = tokenList.iterator(); iter.hasNext();) {
SegToken t1 = (SegToken) iter.next();
oneWordFreq = t1.weight;
next = t1.endOffset;
nextTokens = null;
// Find the tokens that follow: e.g. in 阳光海岸, if the current token is 阳光,
// the next token can be 海 or 海岸. If no next token exists, we are at the end;
// continue with the next key.
while (next <= maxStart) {
// the end token's start offset is sentenceLen, so next == sentenceLen still finds it
if (segGraph.isStartExist(next)) {
nextTokens = segGraph.getStartList(next);
break;
}
next++;
}
if (nextTokens == null) {
break;
}
for (Iterator iter2 = nextTokens.iterator(); iter2.hasNext();) {
SegToken t2 = (SegToken) iter2.next();
idBuffer = new char[t1.charArray.length + t2.charArray.length + 1];
System.arraycopy(t1.charArray, 0, idBuffer, 0, t1.charArray.length);
idBuffer[t1.charArray.length] = BigramDictionary.WORD_SEGMENT_CHAR;
System.arraycopy(t2.charArray, 0, idBuffer,
t1.charArray.length + 1, t2.charArray.length);
// Two linked Words frequency
wordPairFreq = bigramDict.getFrequency(idBuffer);
// Smoothing
// -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
weight = -Math
.log(smooth
* (1.0 + oneWordFreq)
/ (Utility.MAX_FREQUENCE + 0.0)
+ (1.0 - smooth)
* ((1.0 - tinyDouble) * wordPairFreq / (1.0 + oneWordFreq) + tinyDouble));
SegTokenPair tokenPair = new SegTokenPair(idBuffer, t1.index,
t2.index, weight);
this.addSegTokenPair(tokenPair);
}
}
}
key++;
}
}
/**
* Check whether any SegTokenPair ends at position to (i.e. SegTokenPair.to ==
* to); false means no pair ends at to, or none has been added yet.
*
* @param to SegTokenPair.to
* @return whether such a pair exists
*/
public boolean isToExist(int to) {
return tokenPairListTable.get(new Integer(to)) != null;
}
/**
* Return all SegTokenPairs whose SegTokenPair.to equals to, or null if there are
* none.
*
* @param to the target position
* @return the list of SegTokenPairs sharing the same SegTokenPair.to
*/
public List getToList(int to) {
return (List) tokenPairListTable.get(new Integer(to));
}
/**
* Add a SegTokenPair to the BiSegGraph; pairs with the same SegTokenPair.to are
* kept in the same ArrayList.
*
* @param tokenPair the pair to add
*/
public void addSegTokenPair(SegTokenPair tokenPair) {
int to = tokenPair.to;
if (!isToExist(to)) {
ArrayList newlist = new ArrayList();
newlist.add(tokenPair);
tokenPairListTable.put(new Integer(to), newlist);
} else {
List tokenPairList = (List) tokenPairListTable.get(new Integer(to));
tokenPairList.add(tokenPair);
}
}
/**
* @return the number of distinct SegTokenPair.to values in the map, i.e. the
* number of pair columns
*/
public int getToCount() {
return tokenPairListTable.size();
}
/**
* Compute the shortest path from start to end with the Viterbi algorithm.
*
* @return the tokens on the shortest path
*/
public List getShortPath() {
int current;
int nodeCount = getToCount();
List path = new ArrayList();
PathNode zeroPath = new PathNode();
zeroPath.weight = 0;
zeroPath.preNode = 0;
path.add(zeroPath);
for (current = 1; current <= nodeCount; current++) {
double weight;
List edges = getToList(current);
double minWeight = Double.MAX_VALUE;
SegTokenPair minEdge = null;
for (Iterator iter1 = edges.iterator(); iter1.hasNext();) {
SegTokenPair edge = (SegTokenPair) iter1.next();
weight = edge.weight;
PathNode preNode = (PathNode) path.get(edge.from);
if (preNode.weight + weight < minWeight) {
minWeight = preNode.weight + weight;
minEdge = edge;
}
}
PathNode newNode = new PathNode();
newNode.weight = minWeight;
newNode.preNode = minEdge.from;
path.add(newNode);
}
// now recover the actual start-to-end path from the path nodes
int preNode, lastNode;
lastNode = path.size() - 1;
current = lastNode;
List rpath = new ArrayList();
List resultPath = new ArrayList();
rpath.add(new Integer(current));
while (current != 0) {
PathNode currentPathNode = (PathNode) path.get(current);
preNode = currentPathNode.preNode;
rpath.add(new Integer(preNode));
current = preNode;
}
for (int j = rpath.size() - 1; j >= 0; j--) {
Integer idInteger = (Integer) rpath.get(j);
int id = idInteger.intValue();
SegToken t = (SegToken) segTokenList.get(id);
resultPath.add(t);
}
return resultPath;
}
public String toString() {
StringBuffer sb = new StringBuffer();
Collection values = tokenPairListTable.values();
for (Iterator iter1 = values.iterator(); iter1.hasNext();) {
List segList = (List) iter1.next();
for (Iterator iter2 = segList.iterator(); iter2.hasNext();) {
SegTokenPair pair = (SegTokenPair) iter2.next();
sb.append(pair + "\n");
}
}
return sb.toString();
}
}
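For clarity, the edge weight computed in generateBiSegGraph is the smoothed negative log probability -log(a * P(t1) + (1 - a) * P(t2|t1)) with a = 0.1, where P(t1) is estimated as (1 + freq(t1)) / MAX_FREQUENCE and P(t2|t1) as freq(t1,t2) / (1 + freq(t1)), floored by tinyDouble = 1 / MAX_FREQUENCE so the logarithm never sees zero. A standalone re-computation sketch with made-up frequencies:

import org.apache.lucene.analysis.cn.smart.Utility;

public class EdgeWeightDemo {
  static double edgeWeight(double oneWordFreq, double wordPairFreq) {
    double smooth = 0.1;                       // interpolation weight a
    double tiny = 1.0 / Utility.MAX_FREQUENCE; // floor that avoids log(0)
    return -Math.log(smooth * (1.0 + oneWordFreq) / Utility.MAX_FREQUENCE
        + (1.0 - smooth)
        * ((1.0 - tiny) * wordPairFreq / (1.0 + oneWordFreq) + tiny));
  }

  public static void main(String[] args) {
    // a frequent transition yields a small weight (a cheap edge) ...
    System.out.println(edgeWeight(5000, 800));
    // ... while an unseen transition yields a much larger weight
    System.out.println(edgeWeight(5000, 0));
  }
}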

@@ -0,0 +1,321 @@
/**
* Copyright 2009 www.imdict.net
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn.smart.hhmm;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
public class BigramDictionary extends AbstractDictionary {
private BigramDictionary() {
}
public static final char WORD_SEGMENT_CHAR = '@';
private static BigramDictionary singleInstance;
public static final int PRIME_BIGRAM_LENGTH = 402137;
/**
* bigramHashTable and frequencyTable are the data structures that store the
* transition frequencies between words. To speed up lookups and save memory, a
* hash value replaces the word pair itself (fromWord + '@' + toWord) as the
* lookup key: the pair's hash is computed with the FNV1 algorithm and kept in
* bigramHashTable. Replacing the pair by its hash can in principle collide, but
* a 64-bit long hash makes that probability vanishingly small.
* bigramHashTable[i] corresponds one-to-one with frequencyTable[i].
*/
private long[] bigramHashTable;
private int[] frequencyTable;
private int max = 0;
private int repeat = 0;
// static Logger log = Logger.getLogger(BigramDictionary.class);
public synchronized static BigramDictionary getInstance() {
if (singleInstance == null) {
singleInstance = new BigramDictionary();
try {
singleInstance.load();
} catch (IOException e) {
String dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
singleInstance.load(dictRoot);
} catch (ClassNotFoundException e) {
throw new RuntimeException(e);
}
}
return singleInstance;
}
private boolean loadFromObj(File serialObj) {
try {
loadFromInputStream(new FileInputStream(serialObj));
return true;
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (ClassNotFoundException e) {
e.printStackTrace();
}
return false;
}
private void loadFromInputStream(InputStream serialObjectInputStream)
throws IOException, ClassNotFoundException {
ObjectInputStream input = new ObjectInputStream(serialObjectInputStream);
bigramHashTable = (long[]) input.readObject();
frequencyTable = (int[]) input.readObject();
// log.info("load bigram dict from serialization.");
input.close();
}
private void saveToObj(File serialObj) {
try {
ObjectOutputStream output = new ObjectOutputStream(new FileOutputStream(
serialObj));
output.writeObject(bigramHashTable);
output.writeObject(frequencyTable);
output.close();
// log.info("serialize bigram dict.");
} catch (Exception e) {
// log.warn(e.getMessage());
}
}
private void load() throws IOException, ClassNotFoundException {
InputStream input = this.getClass().getResourceAsStream("bigramdict.mem");
loadFromInputStream(input);
}
private void load(String dictRoot) {
String bigramDictPath = dictRoot + "/bigramdict.dct";
File serialObj = new File(dictRoot + "/bigramdict.mem");
if (!(serialObj.exists() && loadFromObj(serialObj))) {
try {
bigramHashTable = new long[PRIME_BIGRAM_LENGTH];
frequencyTable = new int[PRIME_BIGRAM_LENGTH];
for (int i = 0; i < PRIME_BIGRAM_LENGTH; i++) {
// Using 0 as the empty marker is slightly problematic, since some string could hash to 0, but the probability is tiny, so the impact is negligible.
bigramHashTable[i] = 0;
frequencyTable[i] = 0;
}
loadFromFile(bigramDictPath);
} catch (IOException e) {
throw new RuntimeException(e.getMessage());
}
saveToObj(serialObj);
}
}
/**
* Load the bigram dictionary file into this class's data structures. This only
* loads; no merging or modification is performed.
*
* @param dctFilePath path of the dictionary file
* @throws FileNotFoundException
* @throws IOException
* @throws UnsupportedEncodingException
*/
public void loadFromFile(String dctFilePath) throws FileNotFoundException,
IOException, UnsupportedEncodingException {
int i, cnt, length, total = 0;
// The file covers only the 6763 hanzi plus 5 empty slots (3756~3760); slot 3756 is used to store symbol information.
int[] buffer = new int[3];
byte[] intBuffer = new byte[4];
String tmpword;
RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");
// in the dictionary file the first hanzi is at position 0 and the last at 6768
for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
String currentStr = getCCByGB2312Id(i);
// if (i == 5231)
// System.out.println(i);
dctFile.read(intBuffer);// the original dictionary was produced by C code and is
// little endian; Java is big endian, so the bytes must be converted
cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
if (cnt <= 0) {
continue;
}
total += cnt;
int j = 0;
while (j < cnt) {
dctFile.read(intBuffer);
buffer[0] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
.getInt();// frequency
dctFile.read(intBuffer);
buffer[1] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
.getInt();// length
dctFile.read(intBuffer);
// buffer[2] = ByteBuffer.wrap(intBuffer).order(
// ByteOrder.LITTLE_ENDIAN).getInt();// handle
length = buffer[1];
if (length > 0) {
byte[] lchBuffer = new byte[length];
dctFile.read(lchBuffer);
tmpword = new String(lchBuffer, "GB2312");
if (i != 3755 + GB2312_FIRST_CHAR) {
tmpword = currentStr + tmpword;
}
char carray[] = tmpword.toCharArray();
long hashId = hash1(carray);
int index = getAvaliableIndex(hashId, carray);
if (index != -1) {
if (bigramHashTable[index] == 0) {
bigramHashTable[index] = hashId;
// bigramStringTable[index] = tmpword;
}
frequencyTable[index] += buffer[0];
}
}
j++;
}
}
dctFile.close();
// log.info("load dictionary done! " + dctFilePath + " total:" + total);
}
/*
* public void test(String dctFilePath) throws IOException { int i, cnt,
* length, total = 0; int corrupt = 0, notFound = 0; //
* (the file covers only 6763 hanzi plus 5 empty slots, 3756~3760; slot 3756 stores symbol info) int[] buffer = new int[3];
* byte[] intBuffer = new byte[4]; String tmpword; RandomAccessFile dctFile =
* new RandomAccessFile(dctFilePath, "r");
*
* // in the dictionary file the first hanzi is at position 0, the last at 6768. for (i = GB2312_FIRST_CHAR; i <
* GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { String currentStr =
* getCCByGB2312Id(i); // if (i == 5231) // System.out.println(i);
*
* dctFile.read(intBuffer);// the original dictionary was written by C code as little // endian; Java is big
* endian, so convert. cnt =
* ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN) .getInt(); if
* (cnt <= 0) { continue; } total += cnt; int j = 0; while (j < cnt) {
* dctFile.read(intBuffer); buffer[0] = ByteBuffer.wrap(intBuffer).order(
* ByteOrder.LITTLE_ENDIAN).getInt();// frequency dctFile.read(intBuffer);
* buffer[1] = ByteBuffer.wrap(intBuffer).order(
* ByteOrder.LITTLE_ENDIAN).getInt();// length dctFile.read(intBuffer); //
* buffer[2] = ByteBuffer.wrap(intBuffer).order( //
* ByteOrder.LITTLE_ENDIAN).getInt();// handle
*
* length = buffer[1]; if (length > 0) { byte[] lchBuffer = new byte[length];
* dctFile.read(lchBuffer); tmpword = new String(lchBuffer, "GB2312"); if (i
* != 3755 + GB2312_FIRST_CHAR) { tmpword = currentStr + tmpword; } char
* carray[] = tmpword.toCharArray(); int index = getBigramItemIndex(carray);
* if (index != -1) { // if (!bigramStringTable[index].equals(tmpword)) { //
* System.out.println("corrupt: " + tmpword + "<->" // +
* bigramStringTable[index]); // corrupt++; // } } else {
* System.out.println("not found: " + tmpword); notFound++; } } j++; } }
* dctFile.close(); System.out.println("num not found:" + notFound);
* System.out.println("num corrupt:" + corrupt);
*
* log.info("test dictionary done! " + dctFilePath + " total:" + total); cnt =
* 0; for (int j = 0; j < PRIME_BIGRAM_LENGTH; j++) { if (bigramHashTable[j]
* != 0) { cnt++; } } System.out.println("total num in bigramTable: " + cnt);
* }
*/
private int getAvaliableIndex(long hashId, char carray[]) {
int hash1 = (int) (hashId % PRIME_BIGRAM_LENGTH);
int hash2 = hash2(carray) % PRIME_BIGRAM_LENGTH;
if (hash1 < 0)
hash1 = PRIME_BIGRAM_LENGTH + hash1;
if (hash2 < 0)
hash2 = PRIME_BIGRAM_LENGTH + hash2;
int index = hash1;
int i = 1;
while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
&& i < PRIME_BIGRAM_LENGTH) {
index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH;
i++;
}
// System.out.println(i - 1);
if (i < PRIME_BIGRAM_LENGTH
&& (bigramHashTable[index] == 0 || bigramHashTable[index] == hashId)) {
return index;
} else
return -1;
}
/**
* @param c
* @return
*/
private int getBigramItemIndex(char carray[]) {
long hashId = hash1(carray);
int hash1 = (int) (hashId % PRIME_BIGRAM_LENGTH);
int hash2 = hash2(carray) % PRIME_BIGRAM_LENGTH;
if (hash1 < 0)
hash1 = PRIME_BIGRAM_LENGTH + hash1;
if (hash2 < 0)
hash2 = PRIME_BIGRAM_LENGTH + hash2;
int index = hash1;
int i = 1;
repeat++;
while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
&& i < PRIME_BIGRAM_LENGTH) {
index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH;
i++;
repeat++;
if (i > max)
max = i;
}
// System.out.println(i - 1);
if (i < PRIME_BIGRAM_LENGTH && bigramHashTable[index] == hashId) {
return index;
} else
return -1;
}
public int getFrequency(char[] carray) {
int index = getBigramItemIndex(carray);
if (index != -1)
return frequencyTable[index];
return 0;
}
public static void main(String[] args) throws FileNotFoundException,
UnsupportedEncodingException, IOException {
BigramDictionary dic = new BigramDictionary();
dic.load("D:/analysis-data");
// dic.test("D:/analysis-data/BigramDict.dct");
System.out.println("max:" + dic.max);
System.out.println("average repeat:" + (double) dic.repeat / 328856);
System.out.println("end");
}
}
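A sketch of a transition-frequency lookup: the key is the two words joined by WORD_SEGMENT_CHAR ('@'), exactly as generateBiSegGraph builds it (the word pair is illustrative, and the dictionaries must be reachable as described in AnalyzerProfile):

import org.apache.lucene.analysis.cn.smart.hhmm.BigramDictionary;

public class BigramLookupDemo {
  public static void main(String[] args) {
    BigramDictionary dict = BigramDictionary.getInstance();
    // frequency of the transition 中国 -> 人, keyed as "中国@人"
    int freq = dict.getFrequency("中国@人".toCharArray());
    System.out.println(freq); // 0 if the pair is not in the bigram table
  }
}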

@@ -0,0 +1,302 @@
/**
* Copyright 2009 www.imdict.net
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn.smart.hhmm;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
public class CopyOfBigramDictionary extends AbstractDictionary {
private CopyOfBigramDictionary() {
}
public static final char WORD_SEGMENT_CHAR = '@';
private static CopyOfBigramDictionary singleInstance;
public static final int PRIME_BIGRAM_LENGTH = 402137;
/**
* bigramHashTable and frequencyTable are the data structures that store the
* transition frequencies between words. To speed up lookups and save memory, a
* hash value replaces the word pair itself (fromWord + '@' + toWord) as the
* lookup key: the pair's hash is computed with the FNV1 algorithm and kept in
* bigramHashTable. Replacing the pair by its hash can in principle collide, but
* a 64-bit long hash makes that probability vanishingly small.
* bigramHashTable[i] corresponds one-to-one with frequencyTable[i].
*/
private long[] bigramHashTable;
private int[] frequencyTable;
private int max = 0;
private int repeat = 0;
// static Logger log = Logger.getLogger(BigramDictionary.class);
public synchronized static CopyOfBigramDictionary getInstance() {
if (singleInstance == null) {
String dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
singleInstance = new CopyOfBigramDictionary();
singleInstance.load(dictRoot);
}
return singleInstance;
}
private boolean loadFromObj(File serialObj) {
boolean loadFromObject = false;
try {
ObjectInputStream input = new ObjectInputStream(new FileInputStream(
serialObj));
bigramHashTable = (long[]) input.readObject();
frequencyTable = (int[]) input.readObject();
// log.info("load bigram dict from serialization.");
loadFromObject = true;
input.close();
} catch (Exception e) {
// log.warn(e.getMessage());
}
return loadFromObject;
}
private void saveToObj(File serialObj) {
try {
ObjectOutputStream output = new ObjectOutputStream(new FileOutputStream(
serialObj));
output.writeObject(bigramHashTable);
output.writeObject(frequencyTable);
output.close();
// log.info("serialize bigram dict.");
} catch (Exception e) {
// log.warn(e.getMessage());
}
}
private void load(String dictRoot) {
String bigramDictPath = dictRoot + "/bigramdict.dct";
File serialObj = new File(dictRoot + "/bigramdict.mem");
if (!(serialObj.exists() && loadFromObj(serialObj))) {
try {
bigramHashTable = new long[PRIME_BIGRAM_LENGTH];
frequencyTable = new int[PRIME_BIGRAM_LENGTH];
for (int i = 0; i < PRIME_BIGRAM_LENGTH; i++) {
// Using 0 as the empty marker is slightly problematic, since some string could hash to 0, but the probability is tiny, so the impact is negligible.
bigramHashTable[i] = 0;
frequencyTable[i] = 0;
}
loadFromFile(bigramDictPath);
} catch (IOException e) {
throw new RuntimeException(e.getMessage());
}
saveToObj(serialObj);
}
}
/**
* Load the bigram dictionary file into this class's data structures. This only
* loads; no merging or modification is performed.
*
* @param dctFilePath path of the dictionary file
* @throws FileNotFoundException
* @throws IOException
* @throws UnsupportedEncodingException
*/
public void loadFromFile(String dctFilePath) throws FileNotFoundException,
IOException, UnsupportedEncodingException {
int i, cnt, length, total = 0;
// The file covers only the 6763 hanzi plus 5 empty slots (3756~3760); slot 3756 is used to store symbol information.
int[] buffer = new int[3];
byte[] intBuffer = new byte[4];
String tmpword;
RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");
// in the dictionary file the first hanzi is at position 0 and the last at 6768
for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
String currentStr = getCCByGB2312Id(i);
// if (i == 5231)
// System.out.println(i);
dctFile.read(intBuffer);// the original dictionary was produced by C code and is
// little endian; Java is big endian, so the bytes must be converted
cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
if (cnt <= 0) {
continue;
}
total += cnt;
int j = 0;
while (j < cnt) {
dctFile.read(intBuffer);
buffer[0] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
.getInt();// frequency
dctFile.read(intBuffer);
buffer[1] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
.getInt();// length
dctFile.read(intBuffer);
// buffer[2] = ByteBuffer.wrap(intBuffer).order(
// ByteOrder.LITTLE_ENDIAN).getInt();// handle
length = buffer[1];
if (length > 0) {
byte[] lchBuffer = new byte[length];
dctFile.read(lchBuffer);
tmpword = new String(lchBuffer, "GB2312");
if (i != 3755 + GB2312_FIRST_CHAR) {
tmpword = currentStr + tmpword;
}
char carray[] = tmpword.toCharArray();
long hashId = hash1(carray);
int index = getAvaliableIndex(hashId, carray);
if (index != -1) {
if (bigramHashTable[index] == 0) {
bigramHashTable[index] = hashId;
// bigramStringTable[index] = tmpword;
}
frequencyTable[index] += buffer[0];
}
}
j++;
}
}
dctFile.close();
// log.info("load dictionary done! " + dctFilePath + " total:" + total);
}
/*
* public void test(String dctFilePath) throws IOException { int i, cnt,
* length, total = 0; int corrupt = 0, notFound = 0; //
* (the file covers only 6763 hanzi plus 5 empty slots, 3756~3760; slot 3756 stores symbol info) int[] buffer = new int[3];
* byte[] intBuffer = new byte[4]; String tmpword; RandomAccessFile dctFile =
* new RandomAccessFile(dctFilePath, "r");
*
* // in the dictionary file the first hanzi is at position 0, the last at 6768. for (i = GB2312_FIRST_CHAR; i <
* GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { String currentStr =
* getCCByGB2312Id(i); // if (i == 5231) // System.out.println(i);
*
* dctFile.read(intBuffer);// the original dictionary was written by C code as little // endian; Java is big
* endian, so convert. cnt =
* ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN) .getInt(); if
* (cnt <= 0) { continue; } total += cnt; int j = 0; while (j < cnt) {
* dctFile.read(intBuffer); buffer[0] = ByteBuffer.wrap(intBuffer).order(
* ByteOrder.LITTLE_ENDIAN).getInt();// frequency dctFile.read(intBuffer);
* buffer[1] = ByteBuffer.wrap(intBuffer).order(
* ByteOrder.LITTLE_ENDIAN).getInt();// length dctFile.read(intBuffer); //
* buffer[2] = ByteBuffer.wrap(intBuffer).order( //
* ByteOrder.LITTLE_ENDIAN).getInt();// handle
*
* length = buffer[1]; if (length > 0) { byte[] lchBuffer = new byte[length];
* dctFile.read(lchBuffer); tmpword = new String(lchBuffer, "GB2312"); if (i
* != 3755 + GB2312_FIRST_CHAR) { tmpword = currentStr + tmpword; } char
* carray[] = tmpword.toCharArray(); int index = getBigramItemIndex(carray);
* if (index != -1) { // if (!bigramStringTable[index].equals(tmpword)) { //
* System.out.println("corrupt: " + tmpword + "<->" // +
* bigramStringTable[index]); // corrupt++; // } } else {
* System.out.println("not found: " + tmpword); notFound++; } } j++; } }
* dctFile.close(); System.out.println("num not found:" + notFound);
* System.out.println("num corrupt:" + corrupt);
*
* log.info("test dictionary done! " + dctFilePath + " total:" + total); cnt =
* 0; for (int j = 0; j < PRIME_BIGRAM_LENGTH; j++) { if (bigramHashTable[j]
* != 0) { cnt++; } } System.out.println("total num in bigramTable: " + cnt);
* }
*/
private int getAvaliableIndex(long hashId, char carray[]) {
int hash1 = (int) (hashId % PRIME_BIGRAM_LENGTH);
int hash2 = hash2(carray) % PRIME_BIGRAM_LENGTH;
if (hash1 < 0)
hash1 = PRIME_BIGRAM_LENGTH + hash1;
if (hash2 < 0)
hash2 = PRIME_BIGRAM_LENGTH + hash2;
int index = hash1;
int i = 1;
while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
&& i < PRIME_BIGRAM_LENGTH) {
index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH;
i++;
}
// System.out.println(i - 1);
if (i < PRIME_BIGRAM_LENGTH
&& (bigramHashTable[index] == 0 || bigramHashTable[index] == hashId)) {
return index;
} else
return -1;
}
/**
* @param c
* @return
*/
private int getBigramItemIndex(char carray[]) {
long hashId = hash1(carray);
int hash1 = (int) (hashId % PRIME_BIGRAM_LENGTH);
int hash2 = hash2(carray) % PRIME_BIGRAM_LENGTH;
if (hash1 < 0)
hash1 = PRIME_BIGRAM_LENGTH + hash1;
if (hash2 < 0)
hash2 = PRIME_BIGRAM_LENGTH + hash2;
int index = hash1;
int i = 1;
repeat++;
while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
&& i < PRIME_BIGRAM_LENGTH) {
index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH;
i++;
repeat++;
if (i > max)
max = i;
}
// System.out.println(i - 1);
if (i < PRIME_BIGRAM_LENGTH && bigramHashTable[index] == hashId) {
return index;
} else
return -1;
}
public int getFrequency(char[] carray) {
int index = getBigramItemIndex(carray);
if (index != -1)
return frequencyTable[index];
return 0;
}
public static void main(String[] args) throws FileNotFoundException,
UnsupportedEncodingException, IOException {
CopyOfBigramDictionary dic = new CopyOfBigramDictionary();
dic.load("D:/analysis-data");
// dic.test("D:/analysis-data/BigramDict.dct");
System.out.println("max:" + dic.max);
System.out.println("average repeat:" + (double) dic.repeat / 328856);
System.out.println("end");
}
}

@@ -0,0 +1,541 @@
/**
* Copyright 2009 www.imdict.net
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn.smart.hhmm;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
import org.apache.lucene.analysis.cn.smart.Utility;
public class CopyOfWordDictionary extends AbstractDictionary {
private CopyOfWordDictionary() {
}
private static CopyOfWordDictionary singleInstance;
/**
* A fairly large prime, ensuring the double-hash probe sequence can reach every slot.
*/
public static final int PRIME_INDEX_LENGTH = 12071;
/**
* wordIndexTable hashes every hanzi in Unicode into an array of length
* PRIME_INDEX_LENGTH. Collisions are possible, but this program only handles the
* GB2312 subset (6768 characters) plus some ASCII characters, for which the
* scheme works. To keep lookups exact, the original character is also kept in
* charIndexTable so a probe can be verified.
*/
private short[] wordIndexTable;
private char[] charIndexTable;
/**
* The data structure that stores all the dictionary contents. To save space, two
* separate multi-dimensional arrays hold the words and the frequencies: each word
* is a char[] (one char per hanzi or other character) and each frequency an int.
* The first two indices of the two arrays correspond one-to-one, so
* wordItem_charArrayTable[i][j] looks up a word and wordItem_frequencyTable[i][j]
* its frequency.
*/
private char[][][] wordItem_charArrayTable;
private int[][] wordItem_frequencyTable;
// static Logger log = Logger.getLogger(WordDictionary.class);
public synchronized static CopyOfWordDictionary getInstance() {
if (singleInstance == null) {
singleInstance = new CopyOfWordDictionary();
String wordDictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
singleInstance.load(wordDictRoot);
}
return singleInstance;
}
/**
* Load the dictionary files.
*
* @param dctFileRoot path of the directory holding the dictionary files
*/
public void load(String dctFileRoot) {
String dctFilePath = dctFileRoot + "/coredict.dct";
File serialObj = new File(dctFileRoot + "/coredict.mem");
if (!(serialObj.exists() && loadFromObj(serialObj))) {
try {
wordIndexTable = new short[PRIME_INDEX_LENGTH];
charIndexTable = new char[PRIME_INDEX_LENGTH];
for (int i = 0; i < PRIME_INDEX_LENGTH; i++) {
charIndexTable[i] = 0;
wordIndexTable[i] = -1;
}
wordItem_charArrayTable = new char[GB2312_CHAR_NUM][][];
wordItem_frequencyTable = new int[GB2312_CHAR_NUM][];
// int total =
loadMainDataFromFile(dctFilePath);
expandDelimiterData();
mergeSameWords();
sortEachItems();
// log.info("load dictionary: " + dctFilePath + " total:" + total);
} catch (IOException e) {
throw new RuntimeException(e.getMessage());
}
saveToObj(serialObj);
}
}
private boolean loadFromObj(File serialObj) {
boolean loadFromObject = false;
try {
ObjectInputStream input = new ObjectInputStream(new FileInputStream(
serialObj));
wordIndexTable = (short[]) input.readObject();
charIndexTable = (char[]) input.readObject();
wordItem_charArrayTable = (char[][][]) input.readObject();
wordItem_frequencyTable = (int[][]) input.readObject();
// log.info("load core dict from serialization.");
input.close();
loadFromObject = true;
} catch (Exception e) {
// log.warn(e.getMessage());
}
return loadFromObject;
}
private void saveToObj(File serialObj) {
try {
ObjectOutputStream output = new ObjectOutputStream(new FileOutputStream(
serialObj));
output.writeObject(wordIndexTable);
output.writeObject(charIndexTable);
output.writeObject(wordItem_charArrayTable);
output.writeObject(wordItem_frequencyTable);
output.close();
// log.info("serialize core dict.");
} catch (Exception e) {
// log.warn(e.getMessage());
}
}
/**
* Load the core dictionary file into this class's data structures. This only
* loads; no merging or modification is performed.
*
* @param dctFilePath path of the dictionary file
* @return the total number of words loaded
* @throws FileNotFoundException
* @throws IOException
* @throws UnsupportedEncodingException
*/
private int loadMainDataFromFile(String dctFilePath)
throws FileNotFoundException, IOException, UnsupportedEncodingException {
int i, cnt, length, total = 0;
// the file covers only 6763 Chinese characters plus 5 empty slots (3756~3760); slot 3756 is used to store the delimiter information
int[] buffer = new int[3];
byte[] intBuffer = new byte[4];
String tmpword;
RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");
// in the dictionary file the first Chinese character is at position 0, the last at 6768
for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
// if (i == 5231)
// System.out.println(i);
dctFile.read(intBuffer);// the original dictionary was developed in C, so the file is little
// endian; Java is big endian, so the bytes must be converted
cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
if (cnt <= 0) {
wordItem_charArrayTable[i] = null;
wordItem_frequencyTable[i] = null;
continue;
}
wordItem_charArrayTable[i] = new char[cnt][];
wordItem_frequencyTable[i] = new int[cnt];
total += cnt;
int j = 0;
while (j < cnt) {
// wordItemTable[i][j] = new WordItem();
dctFile.read(intBuffer);
buffer[0] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
.getInt();// frequency
dctFile.read(intBuffer);
buffer[1] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
.getInt();// length
dctFile.read(intBuffer);
buffer[2] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
.getInt();// handle
// wordItemTable[i][j].frequency = buffer[0];
wordItem_frequencyTable[i][j] = buffer[0];
length = buffer[1];
if (length > 0) {
byte[] lchBuffer = new byte[length];
dctFile.read(lchBuffer);
tmpword = new String(lchBuffer, "GB2312");
// indexTable[i].wordItems[j].word = tmpword;
// wordItemTable[i][j].charArray = tmpword.toCharArray();
wordItem_charArrayTable[i][j] = tmpword.toCharArray();
} else {
// wordItemTable[i][j].charArray = null;
wordItem_charArrayTable[i][j] = null;
}
// System.out.println(indexTable[i].wordItems[j]);
j++;
}
String str = getCCByGB2312Id(i);
setTableIndex(str.charAt(0), i);
}
dctFile.close();
return total;
}
/**
 * The original dictionary merges the entries of all delimiters into a single list (at position
 * 3755, counting from 1); expand it here and distribute the entries to each delimiter's own row.
*/
private void expandDelimiterData() {
int i;
int cnt;
// delimiters sit at position 3755 (counting from 1); redistribute their combined dictionary row to the individual delimiter characters
int delimiterIndex = 3755 + GB2312_FIRST_CHAR;
i = 0;
while (i < wordItem_charArrayTable[delimiterIndex].length) {
char c = wordItem_charArrayTable[delimiterIndex][i][0];
int j = getGB2312Id(c);// the row index this delimiter should occupy
if (wordItem_charArrayTable[j] == null) {
int k = i;
// starting from i, count the following wordItems that begin with the same delimiter c
while (k < wordItem_charArrayTable[delimiterIndex].length
&& wordItem_charArrayTable[delimiterIndex][k][0] == c) {
k++;
}
// now k-i is the number of wordItems belonging to the delimiter with id j
cnt = k - i;
if (cnt != 0) {
wordItem_charArrayTable[j] = new char[cnt][];
wordItem_frequencyTable[j] = new int[cnt];
}
// fill in each wordItem
for (k = 0; k < cnt; k++, i++) {
// wordItemTable[j][k] = new WordItem();
wordItem_frequencyTable[j][k] = wordItem_frequencyTable[delimiterIndex][i];
wordItem_charArrayTable[j][k] = new char[wordItem_charArrayTable[delimiterIndex][i].length - 1];
System.arraycopy(wordItem_charArrayTable[delimiterIndex][i], 1,
wordItem_charArrayTable[j][k], 0,
wordItem_charArrayTable[j][k].length);
}
setTableIndex(c, j);
}
}
// drop the arrays of the original combined delimiter row
wordItem_charArrayTable[delimiterIndex] = null;
wordItem_frequencyTable[delimiterIndex] = null;
}
/**
 * This program does no part-of-speech tagging, so the frequencies of identical words with
 * different parts of speech are merged under one word, reducing storage and speeding up search.
*/
private void mergeSameWords() {
int i;
for (i = 0; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
if (wordItem_charArrayTable[i] == null)
continue;
int len = 1;
for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
wordItem_charArrayTable[i][j - 1], 0) != 0)
len++;
}
if (len < wordItem_charArrayTable[i].length) {
char[][] tempArray = new char[len][];
int[] tempFreq = new int[len];
int k = 0;
tempArray[0] = wordItem_charArrayTable[i][0];
tempFreq[0] = wordItem_frequencyTable[i][0];
for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
tempArray[k], 0) != 0) {
k++;
// temp[k] = wordItemTable[i][j];
tempArray[k] = wordItem_charArrayTable[i][j];
tempFreq[k] = wordItem_frequencyTable[i][j];
} else {
// temp[k].frequency += wordItemTable[i][j].frequency;
tempFreq[k] += wordItem_frequencyTable[i][j];
}
}
// wordItemTable[i] = temp;
wordItem_charArrayTable[i] = tempArray;
wordItem_frequencyTable[i] = tempFreq;
}
}
}
private void sortEachItems() {
char[] tmpArray;
int tmpFreq;
for (int i = 0; i < wordItem_charArrayTable.length; i++) {
if (wordItem_charArrayTable[i] != null
&& wordItem_charArrayTable[i].length > 1) {
for (int j = 0; j < wordItem_charArrayTable[i].length - 1; j++) {
for (int j2 = j + 1; j2 < wordItem_charArrayTable[i].length; j2++) {
if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
wordItem_charArrayTable[i][j2], 0) > 0) {
tmpArray = wordItem_charArrayTable[i][j];
tmpFreq = wordItem_frequencyTable[i][j];
wordItem_charArrayTable[i][j] = wordItem_charArrayTable[i][j2];
wordItem_frequencyTable[i][j] = wordItem_frequencyTable[i][j2];
wordItem_charArrayTable[i][j2] = tmpArray;
wordItem_frequencyTable[i][j2] = tmpFreq;
}
}
}
}
}
}
/**
 * Compute the slot character c should occupy in the hash table, then initialize that slot in
 * the index tables.
 *
 * @param c the character to register
 * @param j the dictionary row index of c
 * @return true if a slot could be assigned, false if the table is full
*/
private boolean setTableIndex(char c, int j) {
int index = getAvaliableTableIndex(c);
if (index != -1) {
charIndexTable[index] = c;
wordIndexTable[index] = (short) j;
return true;
} else
return false;
}
private short getAvaliableTableIndex(char c) {
int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
if (hash1 < 0)
hash1 = PRIME_INDEX_LENGTH + hash1;
if (hash2 < 0)
hash2 = PRIME_INDEX_LENGTH + hash2;
int index = hash1;
int i = 1;
while (charIndexTable[index] != 0 && charIndexTable[index] != c
&& i < PRIME_INDEX_LENGTH) {
index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
i++;
}
// System.out.println(i - 1);
if (i < PRIME_INDEX_LENGTH
&& (charIndexTable[index] == 0 || charIndexTable[index] == c)) {
return (short) index;
} else
return -1;
}
/**
 * Look up the hash-table slot that holds character c.
 *
 * @param c the character to look up
 * @return the slot index, or -1 if c is not present
*/
private short getWordItemTableIndex(char c) {
int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
if (hash1 < 0)
hash1 = PRIME_INDEX_LENGTH + hash1;
if (hash2 < 0)
hash2 = PRIME_INDEX_LENGTH + hash2;
int index = hash1;
int i = 1;
while (charIndexTable[index] != 0 && charIndexTable[index] != c
&& i < PRIME_INDEX_LENGTH) {
index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
i++;
}
if (i < PRIME_INDEX_LENGTH && charIndexTable[index] == c) {
return (short) index;
} else
return -1;
}
/**
 * Look up the word whose char array is charArray in the dictionary and return its position in
 * the word list.
 *
 * @param charArray char array of the word to look up
 * @return position of the word in the word array, or -1 if not found
*/
private int findInTable(char[] charArray) {
if (charArray == null || charArray.length == 0)
return -1;
short index = getWordItemTableIndex(charArray[0]);
if (index == -1)
return -1;
return findInTable(index, charArray);
}
/**
 * Look up the word whose char array is charArray in the dictionary and return its position in
 * the word list.
 *
 * @param knownHashIndex hash-table slot of the word's first character charArray[0]; if it has
 *        not been computed yet, use findInTable(char[] charArray) instead
 * @param charArray char array of the word to look up
 * @return position of the word in the word array, or -1 if not found
*/
private int findInTable(short knownHashIndex, char[] charArray) {
if (charArray == null || charArray.length == 0)
return -1;
char[][] items = wordItem_charArrayTable[wordIndexTable[knownHashIndex]];
int start = 0, end = items.length - 1;
int mid = (start + end) / 2, cmpResult;
// Binary search for the index of idArray
while (start <= end) {
cmpResult = Utility.compareArray(items[mid], 0, charArray, 1);
if (cmpResult == 0)
return mid;// found it
else if (cmpResult < 0)
start = mid + 1;
else if (cmpResult > 0)
end = mid - 1;
mid = (start + end) / 2;
}
return -1;
}
/**
 * Whether the word given by charArray occurs in the WordDictionary.
 *
 * @param charArray char array of the word to test
 * @return true if it exists, false otherwise
*/
public boolean isExist(char[] charArray) {
return findInTable(charArray) != -1;
}
/**
 * @see #getPrefixMatch(char[], int)
 * @param charArray the prefix word
 * @return position of the first word having charArray as prefix, or -1 if none
*/
public int getPrefixMatch(char[] charArray) {
return getPrefixMatch(charArray, 0);
}
/**
 * Find the position in the dictionary of the words having the word given by charArray as a
 * prefix, and return the first matching position. To reduce the search cost, the start of the
 * search can be set from prior knowledge; if unknown, it defaults to 0.
 *
 * @see #getPrefixMatch(char[])
 * @param charArray the prefix word
 * @param knownStart known start position of the search
 * @return position of the first word satisfying the prefix condition, or -1 if none
*/
public int getPrefixMatch(char[] charArray, int knownStart) {
short index = getWordItemTableIndex(charArray[0]);
if (index == -1)
return -1;
char[][] items = wordItem_charArrayTable[wordIndexTable[index]];
int start = knownStart, end = items.length - 1;
int mid = (start + end) / 2, cmpResult;
// Binary search for the index of idArray
while (start <= end) {
cmpResult = Utility.compareArrayByPrefix(charArray, 1, items[mid], 0);
if (cmpResult == 0) {
// Get the first item which match the current word
while (mid >= 0
&& Utility.compareArrayByPrefix(charArray, 1, items[mid], 0) == 0)
mid--;
mid++;
return mid;// found the first word having charArray as prefix
} else if (cmpResult < 0)
end = mid - 1;
else
start = mid + 1;
mid = (start + end) / 2;
}
return -1;
}
/**
 * Get the frequency of the word given by charArray; frequencies of all parts of speech were
 * merged beforehand (see mergeSameWords).
 *
 * @param charArray char array of the input word
 * @return frequency of the word, or 0 if it is not in the dictionary
*/
public int getFrequency(char[] charArray) {
short hashIndex = getWordItemTableIndex(charArray[0]);
if (hashIndex == -1)
return 0;
int itemIndex = findInTable(hashIndex, charArray);
if (itemIndex != -1)
return wordItem_frequencyTable[wordIndexTable[hashIndex]][itemIndex];
return 0;
}
/**
 * Whether the string given by charArray equals the entry at position itemIndex in the
 * dictionary row of charArray[0], i.e. whether looking charArray up would yield exactly
 * itemIndex.
 *
 * @param charArray input char array; its first character selects the dictionary row
 * @param itemIndex position within that row
 * @return whether they are equal
*/
public boolean isEqual(char[] charArray, int itemIndex) {
short hashIndex = getWordItemTableIndex(charArray[0]);
return Utility.compareArray(charArray, 1,
wordItem_charArrayTable[wordIndexTable[hashIndex]][itemIndex], 0) == 0;
}
public static void main(String[] args) throws FileNotFoundException,
IOException {
CopyOfWordDictionary dic = new CopyOfWordDictionary();
dic.load("D:/analysis-data");
Utility.getCharType('。');
Utility.getCharType('汗');
Utility.getCharType(' ');// 0020
Utility.getCharType('\u3000');// 3000
Utility.getCharType('\uE095');// E095
Utility.getCharType('\u3000');// 3000
Utility.getCharType('\r');// 000D
Utility.getCharType('\n');// 000A
Utility.getCharType('\t');// 0009
}
}

View File

@ -0,0 +1,193 @@
/**
* Copyright 2009 www.imdict.net
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn.smart.hhmm;
import java.util.List;
import org.apache.lucene.analysis.cn.smart.CharType;
import org.apache.lucene.analysis.cn.smart.Utility;
import org.apache.lucene.analysis.cn.smart.WordType;
public class HHMMSegmenter {
private static WordDictionary wordDict = WordDictionary.getInstance();
/**
 * Find all possible tokens in the sentence, then append the two special tokens "始##始" and
 * "末##末"; the "始##始" token starts at -1, and the "末##末" token starts at the sentence length.
 *
 * @param sentence input sentence, containing neither "始##始" nor "末##末"
 * @return graph of all possible tokens
*/
private SegGraph createSegGraph(String sentence) {
int i = 0, j;
int length = sentence.length();
int foundIndex;
int[] charTypeArray = getCharTypes(sentence);
StringBuffer wordBuf = new StringBuffer();
SegToken token;
int frequency = 0; // occurrence frequency of the current word
boolean hasFullWidth;
int wordType;
char[] charArray;
SegGraph segGraph = new SegGraph();
while (i < length) {
hasFullWidth = false;
switch (charTypeArray[i]) {
case CharType.SPACE_LIKE:
i++;
break;
case CharType.HANZI:
j = i + 1;
wordBuf.delete(0, wordBuf.length());
// store each single Chinese character in segGraph whether or not it forms a word by itself; otherwise the segmentation graph would break apart there
wordBuf.append(sentence.charAt(i));
charArray = new char[] { sentence.charAt(i) };
frequency = wordDict.getFrequency(charArray);
token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
frequency);
segGraph.addToken(token);
foundIndex = wordDict.getPrefixMatch(charArray);
while (j <= length && foundIndex != -1) {
if (wordDict.isEqual(charArray, foundIndex) && charArray.length > 1) {
// the word we are looking for: a token for a multi-character word spanning i to j
frequency = wordDict.getFrequency(charArray);
token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
frequency);
segGraph.addToken(token);
}
while (j < length && charTypeArray[j] == CharType.SPACE_LIKE)
j++;
if (j < length && charTypeArray[j] == CharType.HANZI) {
wordBuf.append(sentence.charAt(j));
charArray = new char[wordBuf.length()];
wordBuf.getChars(0, charArray.length, charArray, 0);
// the prefix has already been matched (foundIndex != -1), so the lengthened
// word can only appear at or after foundIndex;
// hence continue the search from foundIndex
foundIndex = wordDict.getPrefixMatch(charArray, foundIndex);
j++;
} else {
break;
}
}
i++;
break;
case CharType.FULLWIDTH_LETTER:
hasFullWidth = true;
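// intentional fall-through: full-width letters are collected together with ASCII letters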
case CharType.LETTER:
j = i + 1;
while (j < length
&& (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER)) {
if (charTypeArray[j] == CharType.FULLWIDTH_LETTER)
hasFullWidth = true;
j++;
}
// found a token of type LETTER spanning i to j
charArray = Utility.STRING_CHAR_ARRAY;
frequency = wordDict.getFrequency(charArray);
wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING;
token = new SegToken(charArray, i, j, wordType, frequency);
segGraph.addToken(token);
i = j;
break;
case CharType.FULLWIDTH_DIGIT:
hasFullWidth = true;
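// intentional fall-through: full-width digits are collected together with ASCII digits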
case CharType.DIGIT:
j = i + 1;
while (j < length
&& (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT)) {
if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT)
hasFullWidth = true;
j++;
}
// found a token of type NUMBER spanning i to j
charArray = Utility.NUMBER_CHAR_ARRAY;
frequency = wordDict.getFrequency(charArray);
wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER;
token = new SegToken(charArray, i, j, wordType, frequency);
segGraph.addToken(token);
i = j;
break;
case CharType.DELIMITER:
j = i + 1;
// no need to look up a delimiter's weight; just pick the maximum frequency
frequency = Utility.MAX_FREQUENCE;
charArray = new char[] { sentence.charAt(i) };
token = new SegToken(charArray, i, j, WordType.DELIMITER, frequency);
segGraph.addToken(token);
i = j;
break;
default:
j = i + 1;
// treat unrecognized characters (e.g. those outside GB2312) as unknown strings, one character at a time
charArray = Utility.STRING_CHAR_ARRAY;
frequency = wordDict.getFrequency(charArray);
token = new SegToken(charArray, i, j, WordType.STRING, frequency);
segGraph.addToken(token);
i = j;
break;
}
}
// add the two sentinel tokens "始##始" and "末##末" to segGraph
charArray = Utility.START_CHAR_ARRAY;
frequency = wordDict.getFrequency(charArray);
token = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency);
segGraph.addToken(token);
// "末##末"
charArray = Utility.END_CHAR_ARRAY;
frequency = wordDict.getFrequency(charArray);
token = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END,
frequency);
segGraph.addToken(token);
return segGraph;
}
/**
 * Determine a unique character type for each character in the sentence.
 *
 * @see Utility#getCharType(char)
 * @param sentence the complete input sentence
 * @return array of character types, one per character
*/
private static int[] getCharTypes(String sentence) {
int length = sentence.length();
int[] charTypeArray = new int[length];
// build the character-type array, one entry per character
for (int i = 0; i < length; i++) {
charTypeArray[i] = Utility.getCharType(sentence.charAt(i));
}
return charTypeArray;
}
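// Usage sketch (illustration only; assumes the list returned by process() holds SegToken
// instances and that java.util.Iterator is imported):
//   HHMMSegmenter segmenter = new HHMMSegmenter();
//   List tokens = segmenter.process("我购买了道具和服装");
//   for (Iterator iter = tokens.iterator(); iter.hasNext();) {
//     SegToken t = (SegToken) iter.next();
//     System.out.println(String.valueOf(t.charArray) + " [" + t.startOffset + "," + t.endOffset + ")");
//   }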
public List process(String sentence) {
SegGraph segGraph = createSegGraph(sentence);
BiSegGraph biSegGraph = new BiSegGraph(segGraph);
List shortPath = biSegGraph.getShortPath();
return shortPath;
}
}

View File

@ -0,0 +1,33 @@
/**
* Copyright 2009 www.imdict.net
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn.smart.hhmm;
public class PathNode implements Comparable {
public double weight;
public int preNode;
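// preNode presumably holds the index of the predecessor node on the current minimum-weight
// path; BiSegGraph.getShortPath() would use it to backtrack the chosen segmentation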
public int compareTo(Object p) {
PathNode pn = (PathNode) p;
if (weight < pn.weight)
return -1;
else if (weight == pn.weight)
return 0;
else
return 1;
}
}

View File

@ -0,0 +1,144 @@
/**
* Copyright 2009 www.imdict.net
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn.smart.hhmm;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
public class SegGraph {
/**
 * An ArrayList records the tokens sharing a startOffset; that startOffset is the tokens' key.
*/
private Map tokenListTable = new HashMap();
private int maxStart = -1;
/**
 * Check whether any token with startOffset s exists; if not, either no token starts at s or
 * none has been added yet.
 *
 * @param s startOffset
 * @return true if some token starts at offset s
*/
public boolean isStartExist(int s) {
return tokenListTable.get(new Integer(s)) != null;
}
/**
 * Return all tokens whose startOffset is s, or null if there are none.
 *
 * @param s startOffset
 * @return list of all tokens sharing that startOffset
*/
public List getStartList(int s) {
return (List) tokenListTable.get(new Integer(s));
}
public int getMaxStart() {
return maxStart;
}
/**
 * Assign a uniform index, starting from 0, to all tokens in the SegGraph: tokens are ordered
 * by increasing startOffset, and tokens with the same startOffset keep their insertion order.
*/
public List makeIndex() {
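// e.g. if startOffsets -1, 0 and 1 each hold a single token, those tokens get indexes 0, 1, 2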
List result = new ArrayList();
int s = -1, count = 0, size = tokenListTable.size();
List tokenList;
short index = 0;
while (count < size) {
if (isStartExist(s)) {
tokenList = (List) tokenListTable.get(new Integer(s));
for (Iterator iter = tokenList.iterator(); iter.hasNext();) {
SegToken st = (SegToken) iter.next();
st.index = index;
result.add(st);
index++;
}
count++;
}
s++;
}
return result;
}
/**
 * Add a token to the map; tokens with the same startOffset go into the same list.
 *
 * @param token the token to add
*/
public void addToken(SegToken token) {
int s = token.startOffset;
if (!isStartExist(s)) {
ArrayList newlist = new ArrayList();
newlist.add(token);
tokenListTable.put(new Integer(s), newlist);
} else {
List tokenList = (List) tokenListTable.get(new Integer(s));
tokenList.add(token);
}
if (s > maxStart)
maxStart = s;
}
/**
 * Get the number of distinct start positions of tokens in the SegGraph; each start position
 * may hold several tokens, so this count differs from the token count.
 *
 * @return number of distinct start positions
*/
public int getStartCount() {
return tokenListTable.size();
}
/**
 * Assemble all tokens stored in the map into a single list, ordered by increasing start
 * position.
 *
 * @return list of all tokens
*/
public List toTokenList() {
List result = new ArrayList();
int s = -1, count = 0, size = tokenListTable.size();
List tokenList;
while (count < size) {
if (isStartExist(s)) {
tokenList = (List) tokenListTable.get(new Integer(s));
for (Iterator iter = tokenList.iterator(); iter.hasNext();) {
SegToken st = (SegToken) iter.next();
result.add(st);
}
count++;
}
s++;
}
return result;
}
public String toString() {
List tokenList = this.toTokenList();
StringBuffer sb = new StringBuffer();
for (Iterator iter = tokenList.iterator(); iter.hasNext();) {
SegToken t = (SegToken) iter.next();
sb.append(t + "\n");
}
return sb.toString();
}
}

View File

@ -0,0 +1,64 @@
/**
* Copyright 2009 www.imdict.net
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn.smart.hhmm;
public class SegToken {
public char[] charArray;
public int startOffset;
public int endOffset;
public int wordType;
public int weight;
public int index;
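// startOffset/endOffset are character positions in the sentence, end exclusive: in
// "我购买了道具和服装" the token for "购买" has startOffset 1 and endOffset 3;
// index is the global number later assigned by SegGraph.makeIndex()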
public SegToken(String word, int start, int end, int wordType, int weight) {
this.charArray = word.toCharArray();
this.startOffset = start;
this.endOffset = end;
this.wordType = wordType;
this.weight = weight;
}
public SegToken(char[] idArray, int start, int end, int wordType, int weight) {
this.charArray = idArray;
this.startOffset = start;
this.endOffset = end;
this.wordType = wordType;
this.weight = weight;
}
// public String toString() {
// return String.valueOf(charArray) + "/s(" + startOffset + ")e("
// + endOffset + ")/w(" + weight + ")t(" + wordType + ")";
// }
/**
 * Two tokens are equal exactly when their offsets are equal, since then they cover the same
 * content of the original sentence; pos and weight can both be looked up in the dictionary
 * and may be many-valued, so a single token suffices.
 *
 * @param t the token to compare against
 * @return whether the two tokens are equal
*/
// public boolean equals(RawToken t) {
// return this.startOffset == t.startOffset
// && this.endOffset == t.endOffset;
// }
}

View File

@ -0,0 +1,50 @@
/**
* Copyright 2009 www.imdict.net
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn.smart.hhmm;
import org.apache.lucene.analysis.cn.smart.Utility;
import org.apache.lucene.analysis.cn.smart.WordType;
public class SegTokenFilter {
public SegToken filter(SegToken token) {
switch (token.wordType) {
case WordType.FULLWIDTH_NUMBER:
case WordType.FULLWIDTH_STRING:
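// full-width forms (U+FF01..U+FF5E) map to their ASCII counterparts by subtracting 0xFEE0,
// e.g. '０' (U+FF10) - 0xFEE0 = '0' (U+0030); the second test below then lowercases
// 'A'..'Z' by adding 0x20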
for (int i = 0; i < token.charArray.length; i++) {
if (token.charArray[i] >= 0xFF10)
token.charArray[i] -= 0xFEE0;
if (token.charArray[i] >= 0x0041 && token.charArray[i] <= 0x005A)
token.charArray[i] += 0x0020;
}
break;
case WordType.STRING:
for (int i = 0; i < token.charArray.length; i++) {
if (token.charArray[i] >= 0x0041 && token.charArray[i] <= 0x005A)
token.charArray[i] += 0x0020;
}
break;
case WordType.DELIMITER:
token.charArray = Utility.COMMON_DELIMITER;
break;
default:
break;
}
return token;
}
}

View File

@ -0,0 +1,48 @@
/**
* Copyright 2009 www.imdict.net
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn.smart.hhmm;
public class SegTokenPair {
public char[] charArray;
/**
 * from and to are token indexes: the positions of this pair's two tokens in the segGraph.
*/
public int from;
public int to;
public double weight;
public SegTokenPair(char[] idArray, int from, int to, double weight) {
this.charArray = idArray;
this.from = from;
this.to = to;
this.weight = weight;
}
// public String toString() {
// return String.valueOf(charArray) + ":f(" + from + ")t(" + to + "):"
// + weight;
// }
// public boolean equals(SegTokenPair tp) {
// return this.from == tp.from && this.to == tp.to;
// }
}

View File

@ -0,0 +1,568 @@
/**
* Copyright 2009 www.imdict.net
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn.smart.hhmm;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
import org.apache.lucene.analysis.cn.smart.Utility;
public class WordDictionary extends AbstractDictionary {
private WordDictionary() {
}
private static WordDictionary singleInstance;
/**
 * A fairly large prime, chosen so that the hash probe sequence can reach every slot.
*/
public static final int PRIME_INDEX_LENGTH = 12071;
/**
 * wordIndexTable hashes every Chinese character code point in Unicode into an array of length
 * PRIME_INDEX_LENGTH. Collisions are possible, but in practice this program only handles the
 * GB2312 range (6768 characters plus some ASCII), for which the table is adequate. To keep
 * comparisons exact, the original character is kept in charIndexTable so a lookup can verify it
 * found the right slot.
*/
private short[] wordIndexTable;
private char[] charIndexTable;
/**
 * The actual data structures holding the entire dictionary. To save space, two separate
 * multi-dimensional arrays store the words and their frequencies: each word is a char[]
 * (one char per Chinese character or other symbol), and each frequency is an int.
 * The first two subscripts of the two arrays correspond one to one, so
 * wordItem_charArrayTable[i][j] looks up a word and wordItem_frequencyTable[i][j] its frequency.
*/
private char[][][] wordItem_charArrayTable;
private int[][] wordItem_frequencyTable;
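// Stored entries omit their leading character, which is implied by the row: findInTable
// compares stored items against the query starting at offset 1, and expandDelimiterData
// strips the first character explicitly when redistributing delimiter entries.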
// static Logger log = Logger.getLogger(WordDictionary.class);
public synchronized static WordDictionary getInstance() {
if (singleInstance == null) {
singleInstance = new WordDictionary();
try {
singleInstance.load();
} catch (IOException e) {
String wordDictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
singleInstance.load(wordDictRoot);
} catch (ClassNotFoundException e) {
throw new RuntimeException(e);
}
}
return singleInstance;
}
/**
 * Load the dictionary files from the external directory dctFileRoot: first check for a
 * coredict.mem file and, if it exists, load it directly as a serialized object; otherwise
 * load the dictionary source file coredict.dct.
 *
 * @param dctFileRoot path of the directory holding the dictionary files
*/
public void load(String dctFileRoot) {
String dctFilePath = dctFileRoot + "/coredict.dct";
File serialObj = new File(dctFileRoot + "/coredict.mem");
if (!(serialObj.exists() && loadFromObj(serialObj))) {
try {
wordIndexTable = new short[PRIME_INDEX_LENGTH];
charIndexTable = new char[PRIME_INDEX_LENGTH];
for (int i = 0; i < PRIME_INDEX_LENGTH; i++) {
charIndexTable[i] = 0;
wordIndexTable[i] = -1;
}
wordItem_charArrayTable = new char[GB2312_CHAR_NUM][][];
wordItem_frequencyTable = new int[GB2312_CHAR_NUM][];
// int total =
loadMainDataFromFile(dctFilePath);
expandDelimiterData();
mergeSameWords();
sortEachItems();
// log.info("load dictionary: " + dctFilePath + " total:" + total);
} catch (IOException e) {
throw new RuntimeException(e.getMessage());
}
saveToObj(serialObj);
}
}
/**
 * Load the dictionary from inside the jar: a coredict.mem file must sit on the classpath
 * next to the WordDictionary class, and it is loaded as a serialized object.
 *
* @throws ClassNotFoundException
* @throws IOException
*/
public void load() throws IOException, ClassNotFoundException {
InputStream input = this.getClass().getResourceAsStream("coredict.mem");
loadFromObjectInputStream(input);
}
private boolean loadFromObj(File serialObj) {
try {
loadFromObjectInputStream(new FileInputStream(serialObj));
return true;
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (ClassNotFoundException e) {
e.printStackTrace();
}
return false;
}
private void loadFromObjectInputStream(InputStream serialObjectInputStream)
throws IOException, ClassNotFoundException {
ObjectInputStream input = new ObjectInputStream(serialObjectInputStream);
wordIndexTable = (short[]) input.readObject();
charIndexTable = (char[]) input.readObject();
wordItem_charArrayTable = (char[][][]) input.readObject();
wordItem_frequencyTable = (int[][]) input.readObject();
// log.info("load core dict from serialization.");
input.close();
}
private void saveToObj(File serialObj) {
try {
ObjectOutputStream output = new ObjectOutputStream(new FileOutputStream(
serialObj));
output.writeObject(wordIndexTable);
output.writeObject(charIndexTable);
output.writeObject(wordItem_charArrayTable);
output.writeObject(wordItem_frequencyTable);
output.close();
// log.info("serialize core dict.");
} catch (Exception e) {
// log.warn(e.getMessage());
}
}
/**
 * Load the dictionary file into the WordDictionary data structures. This only loads; no
 * merging or other modification is performed here.
 *
 * @param dctFilePath path of the coredict.dct file
 * @return total number of words loaded
* @throws FileNotFoundException
* @throws IOException
* @throws UnsupportedEncodingException
*/
private int loadMainDataFromFile(String dctFilePath)
throws FileNotFoundException, IOException, UnsupportedEncodingException {
int i, cnt, length, total = 0;
// the file covers only 6763 Chinese characters plus 5 empty slots (3756~3760); slot 3756 is used to store the delimiter information
int[] buffer = new int[3];
byte[] intBuffer = new byte[4];
String tmpword;
RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");
// in the dictionary file the first Chinese character is at position 0, the last at 6768
for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
// if (i == 5231)
// System.out.println(i);
dctFile.read(intBuffer);// the original dictionary was developed in C, so the file is little
// endian; Java is big endian, so the bytes must be converted
cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
if (cnt <= 0) {
wordItem_charArrayTable[i] = null;
wordItem_frequencyTable[i] = null;
continue;
}
wordItem_charArrayTable[i] = new char[cnt][];
wordItem_frequencyTable[i] = new int[cnt];
total += cnt;
int j = 0;
while (j < cnt) {
// wordItemTable[i][j] = new WordItem();
dctFile.read(intBuffer);
buffer[0] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
.getInt();// frequency
dctFile.read(intBuffer);
buffer[1] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
.getInt();// length
dctFile.read(intBuffer);
buffer[2] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
.getInt();// handle
// wordItemTable[i][j].frequency = buffer[0];
wordItem_frequencyTable[i][j] = buffer[0];
length = buffer[1];
if (length > 0) {
byte[] lchBuffer = new byte[length];
dctFile.read(lchBuffer);
tmpword = new String(lchBuffer, "GB2312");
// indexTable[i].wordItems[j].word = tmpword;
// wordItemTable[i][j].charArray = tmpword.toCharArray();
wordItem_charArrayTable[i][j] = tmpword.toCharArray();
} else {
// wordItemTable[i][j].charArray = null;
wordItem_charArrayTable[i][j] = null;
}
// System.out.println(indexTable[i].wordItems[j]);
j++;
}
String str = getCCByGB2312Id(i);
setTableIndex(str.charAt(0), i);
}
dctFile.close();
return total;
}
/**
 * The original dictionary merges the entries of all delimiters into a single list (at position
 * 3755, counting from 1); expand it here and distribute the entries to each delimiter's own row.
*/
private void expandDelimiterData() {
int i;
int cnt;
// delimiters sit at position 3755 (counting from 1); redistribute their combined dictionary row to the individual delimiter characters
int delimiterIndex = 3755 + GB2312_FIRST_CHAR;
i = 0;
while (i < wordItem_charArrayTable[delimiterIndex].length) {
char c = wordItem_charArrayTable[delimiterIndex][i][0];
int j = getGB2312Id(c);// the row index this delimiter should occupy
if (wordItem_charArrayTable[j] == null) {
int k = i;
// starting from i, count the following wordItems that begin with the same delimiter c
while (k < wordItem_charArrayTable[delimiterIndex].length
&& wordItem_charArrayTable[delimiterIndex][k][0] == c) {
k++;
}
// now k-i is the number of wordItems belonging to the delimiter with id j
cnt = k - i;
if (cnt != 0) {
wordItem_charArrayTable[j] = new char[cnt][];
wordItem_frequencyTable[j] = new int[cnt];
}
// fill in each wordItem
for (k = 0; k < cnt; k++, i++) {
// wordItemTable[j][k] = new WordItem();
wordItem_frequencyTable[j][k] = wordItem_frequencyTable[delimiterIndex][i];
wordItem_charArrayTable[j][k] = new char[wordItem_charArrayTable[delimiterIndex][i].length - 1];
System.arraycopy(wordItem_charArrayTable[delimiterIndex][i], 1,
wordItem_charArrayTable[j][k], 0,
wordItem_charArrayTable[j][k].length);
}
setTableIndex(c, j);
}
}
// drop the arrays of the original combined delimiter row
wordItem_charArrayTable[delimiterIndex] = null;
wordItem_frequencyTable[delimiterIndex] = null;
}
/**
 * This program does no part-of-speech tagging, so the frequencies of identical words with
 * different parts of speech are merged under one word, reducing storage and speeding up search.
*/
private void mergeSameWords() {
int i;
for (i = 0; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
if (wordItem_charArrayTable[i] == null)
continue;
int len = 1;
for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
wordItem_charArrayTable[i][j - 1], 0) != 0)
len++;
}
if (len < wordItem_charArrayTable[i].length) {
char[][] tempArray = new char[len][];
int[] tempFreq = new int[len];
int k = 0;
tempArray[0] = wordItem_charArrayTable[i][0];
tempFreq[0] = wordItem_frequencyTable[i][0];
for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
tempArray[k], 0) != 0) {
k++;
// temp[k] = wordItemTable[i][j];
tempArray[k] = wordItem_charArrayTable[i][j];
tempFreq[k] = wordItem_frequencyTable[i][j];
} else {
// temp[k].frequency += wordItemTable[i][j].frequency;
tempFreq[k] += wordItem_frequencyTable[i][j];
}
}
// wordItemTable[i] = temp;
wordItem_charArrayTable[i] = tempArray;
wordItem_frequencyTable[i] = tempFreq;
}
}
}
private void sortEachItems() {
char[] tmpArray;
int tmpFreq;
for (int i = 0; i < wordItem_charArrayTable.length; i++) {
if (wordItem_charArrayTable[i] != null
&& wordItem_charArrayTable[i].length > 1) {
for (int j = 0; j < wordItem_charArrayTable[i].length - 1; j++) {
for (int j2 = j + 1; j2 < wordItem_charArrayTable[i].length; j2++) {
if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
wordItem_charArrayTable[i][j2], 0) > 0) {
tmpArray = wordItem_charArrayTable[i][j];
tmpFreq = wordItem_frequencyTable[i][j];
wordItem_charArrayTable[i][j] = wordItem_charArrayTable[i][j2];
wordItem_frequencyTable[i][j] = wordItem_frequencyTable[i][j2];
wordItem_charArrayTable[i][j2] = tmpArray;
wordItem_frequencyTable[i][j2] = tmpFreq;
}
}
}
}
}
}
/**
 * Compute the slot character c should occupy in the hash table, then initialize that slot in
 * the index tables.
 *
 * @param c the character to register
 * @param j the dictionary row index of c
 * @return true if a slot could be assigned, false if the table is full
*/
private boolean setTableIndex(char c, int j) {
int index = getAvaliableTableIndex(c);
if (index != -1) {
charIndexTable[index] = c;
wordIndexTable[index] = (short) j;
return true;
} else
return false;
}
private short getAvaliableTableIndex(char c) {
int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
if (hash1 < 0)
hash1 = PRIME_INDEX_LENGTH + hash1;
if (hash2 < 0)
hash2 = PRIME_INDEX_LENGTH + hash2;
int index = hash1;
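// open addressing with double hashing: probe (hash1 + i * hash2) % PRIME_INDEX_LENGTH for
// i = 1, 2, ... until an empty slot or the slot already holding c is reached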
int i = 1;
while (charIndexTable[index] != 0 && charIndexTable[index] != c
&& i < PRIME_INDEX_LENGTH) {
index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
i++;
}
// System.out.println(i - 1);
if (i < PRIME_INDEX_LENGTH
&& (charIndexTable[index] == 0 || charIndexTable[index] == c)) {
return (short) index;
} else
return -1;
}
/**
 * Look up the hash-table slot that holds character c.
 *
 * @param c the character to look up
 * @return the slot index, or -1 if c is not present
*/
private short getWordItemTableIndex(char c) {
int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
if (hash1 < 0)
hash1 = PRIME_INDEX_LENGTH + hash1;
if (hash2 < 0)
hash2 = PRIME_INDEX_LENGTH + hash2;
int index = hash1;
int i = 1;
while (charIndexTable[index] != 0 && charIndexTable[index] != c
&& i < PRIME_INDEX_LENGTH) {
index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
i++;
}
if (i < PRIME_INDEX_LENGTH && charIndexTable[index] == c) {
return (short) index;
} else
return -1;
}
/**
 * Look up the word whose char array is charArray in the dictionary and return its position in
 * the word list.
 *
 * @param charArray char array of the word to look up
 * @return position of the word in the word array, or -1 if not found
*/
private int findInTable(char[] charArray) {
if (charArray == null || charArray.length == 0)
return -1;
short index = getWordItemTableIndex(charArray[0]);
if (index == -1)
return -1;
return findInTable(index, charArray);
}
/**
 * Look up the word whose char array is charArray in the dictionary and return its position in
 * the word list.
 *
 * @param knownHashIndex hash-table slot of the word's first character charArray[0]; if it has
 *        not been computed yet, use findInTable(char[] charArray) instead
 * @param charArray char array of the word to look up
 * @return position of the word in the word array, or -1 if not found
*/
private int findInTable(short knownHashIndex, char[] charArray) {
if (charArray == null || charArray.length == 0)
return -1;
char[][] items = wordItem_charArrayTable[wordIndexTable[knownHashIndex]];
int start = 0, end = items.length - 1;
int mid = (start + end) / 2, cmpResult;
// Binary search for the index of idArray
while (start <= end) {
cmpResult = Utility.compareArray(items[mid], 0, charArray, 1);
if (cmpResult == 0)
return mid;// found it
else if (cmpResult < 0)
start = mid + 1;
else if (cmpResult > 0)
end = mid - 1;
mid = (start + end) / 2;
}
return -1;
}
/**
 * Whether the word given by charArray occurs in the WordDictionary.
 *
 * @param charArray char array of the word to test
 * @return true if it exists, false otherwise
*/
public boolean isExist(char[] charArray) {
return findInTable(charArray) != -1;
}
/**
 * @see #getPrefixMatch(char[], int)
 * @param charArray the prefix word
 * @return position of the first word having charArray as prefix, or -1 if none
*/
public int getPrefixMatch(char[] charArray) {
return getPrefixMatch(charArray, 0);
}
/**
 * Find the position in the dictionary of the words having the word given by charArray as a
 * prefix, and return the first matching position. To reduce the search cost, the start of the
 * search can be set from prior knowledge; if unknown, it defaults to 0.
 *
 * @see #getPrefixMatch(char[])
 * @param charArray the prefix word
 * @param knownStart known start position of the search
 * @return position of the first word satisfying the prefix condition, or -1 if none
*/
public int getPrefixMatch(char[] charArray, int knownStart) {
short index = getWordItemTableIndex(charArray[0]);
if (index == -1)
return -1;
char[][] items = wordItem_charArrayTable[wordIndexTable[index]];
int start = knownStart, end = items.length - 1;
int mid = (start + end) / 2, cmpResult;
// Binary search for the index of idArray
while (start <= end) {
cmpResult = Utility.compareArrayByPrefix(charArray, 1, items[mid], 0);
if (cmpResult == 0) {
// Get the first item which match the current word
while (mid >= 0
&& Utility.compareArrayByPrefix(charArray, 1, items[mid], 0) == 0)
mid--;
mid++;
return mid;// found the first word having charArray as prefix
} else if (cmpResult < 0)
end = mid - 1;
else
start = mid + 1;
mid = (start + end) / 2;
}
return -1;
}
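// Typical scan (see HHMMSegmenter.createSegGraph): call getPrefixMatch(word) once, then
// extend word by one character and pass the previous result as knownStart so each
// subsequent search starts where the shorter prefix was found.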
/**
 * Get the frequency of the word given by charArray; frequencies of all parts of speech were
 * merged beforehand (see mergeSameWords).
 *
 * @param charArray char array of the input word
 * @return frequency of the word, or 0 if it is not in the dictionary
*/
public int getFrequency(char[] charArray) {
short hashIndex = getWordItemTableIndex(charArray[0]);
if (hashIndex == -1)
return 0;
int itemIndex = findInTable(hashIndex, charArray);
if (itemIndex != -1)
return wordItem_frequencyTable[wordIndexTable[hashIndex]][itemIndex];
return 0;
}
/**
 * Whether the string given by charArray equals the entry at position itemIndex in the
 * dictionary row of charArray[0], i.e. whether looking charArray up would yield exactly
 * itemIndex.
 *
 * @param charArray input char array; its first character selects the dictionary row
 * @param itemIndex position within that row
 * @return whether they are equal
*/
public boolean isEqual(char[] charArray, int itemIndex) {
short hashIndex = getWordItemTableIndex(charArray[0]);
return Utility.compareArray(charArray, 1,
wordItem_charArrayTable[wordIndexTable[hashIndex]][itemIndex], 0) == 0;
}
public static void main(String[] args) throws FileNotFoundException,
IOException {
WordDictionary dic = new WordDictionary();
dic.load("D:/analysis-data");
Utility.getCharType('。');
Utility.getCharType('汗');
Utility.getCharType(' ');// 0020
Utility.getCharType('\u3000');// 3000
Utility.getCharType('\uE095');// E095
Utility.getCharType('\u3000');// 3000
Utility.getCharType('\r');// 000D
Utility.getCharType('\n');// 000A
Utility.getCharType('\t');// 0009
}
}

View File

@ -0,0 +1,58 @@
////////// strip all punctuation ////////////////
,
.
`
-
_
=
?
'
|
"
(
)
{
}
[
]
<
>
*
#
&
^
$
@
!
~
:
;
+
/
\
·
　//Chinese full-width space character (U+3000)
//////////////// English stopwords ////////////////
//////////////// Chinese stopwords ////////////////

View File

@ -0,0 +1,86 @@
/**
* Copyright 2009 www.imdict.net
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.util.Date;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
public class TestSmartChineseAnalyzer extends TestCase {
public void testChineseAnalyzer() throws IOException {
Token nt = new Token();
Analyzer ca = new SmartChineseAnalyzer(true);
Reader sentence = new StringReader("我购买了道具和服装。");
String[] result = { "我", "购买", "了", "道具", "和", "服装" };
TokenStream ts = ca.tokenStream("sentence", sentence);
int i = 0;
nt = ts.next(nt);
while (nt != null) {
assertEquals(result[i], nt.term());
i++;
nt = ts.next(nt);
}
ts.close();
}
/**
* @param args
* @throws IOException
*/
public static void main(String[] args) throws IOException {
new TestSmartChineseAnalyzer().sampleMethod();
}
/**
* @throws UnsupportedEncodingException
* @throws FileNotFoundException
* @throws IOException
*/
private void sampleMethod() throws UnsupportedEncodingException,
FileNotFoundException, IOException {
Token nt = new Token();
Analyzer ca = new SmartChineseAnalyzer(true);
Reader sentence = new StringReader(
"我从小就不由自主地认为自己长大以后一定得成为一个象我父亲一样的画家, 可能是父母潜移默化的影响。其实我根本不知道作为画家意味着什么,我是否喜欢,最重要的是否适合我,我是否有这个才华。其实人到中年的我还是不确定我最喜欢什么,最想做的是什么?我相信很多人和我一样有同样的烦恼。毕竟不是每个人都能成为作文里的宇航员,科学家和大教授。知道自己适合做什么,喜欢做什么,能做好什么其实是个非常困难的问题。"
+ "幸运的是,我想我的孩子不会为这个太过烦恼。通过老大,我慢慢发现美国高中的一个重要功能就是帮助学生分析他们的专长和兴趣,从而帮助他们选择大学的专业和未来的职业。我觉得帮助一个未成形的孩子找到她未来成长的方向是个非常重要的过程。"
+ "美国高中都有专门的职业顾问,通过接触不同的课程,和各种心理,个性,兴趣很多方面的问答来帮助每个学生找到最感兴趣的专业。这样的教育一般是要到高年级才开始, 可老大因为今年上计算机的课程就是研究一个职业走向的软件项目所以她提前做了这些考试和面试。看来以后这样的教育会慢慢由电脑来测试了。老大带回家了一些试卷我挑出一些给大家看看。这门课她花了2个多月才做完这里只是很小的一部分。"
+ "在测试里有这样的一些问题:"
+ "你是个喜欢动手的人吗? 你喜欢修东西吗?你喜欢体育运动吗?你喜欢在室外工作吗?你是个喜欢思考的人吗?你喜欢数学和科学课吗?你喜欢一个人工作吗?你对自己的智力自信吗?你的创造能力很强吗?你喜欢艺术,音乐和戏剧吗? 你喜欢自由自在的工作环境吗?你喜欢尝试新的东西吗? 你喜欢帮助别人吗?你喜欢教别人吗?你喜欢和机器和工具打交道吗?你喜欢当领导吗?你喜欢组织活动吗?你什么和数字打交道吗?");
TokenStream ts = ca.tokenStream("sentence", sentence);
System.out.println("start: " + (new Date()));
long before = System.currentTimeMillis();
nt = ts.next(nt);
while (nt != null) {
System.out.println(nt.term());
nt = ts.next(nt);
}
ts.close();
long now = System.currentTimeMillis();
System.out.println("time: " + (now - before) / 1000.0 + " s");
}
}