mirror of https://github.com/apache/lucene.git

LUCENE-1629: adding new contrib analyzer SmartChineseAnalyzer

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@774718 13f79535-47bb-0310-9956-ffa450edef68

parent cc240447d4
commit e01aad89fe
@@ -3,14 +3,14 @@ Lucene Build Instructions

$Id$

Basic steps:
-  0) Install JDK 1.4 (or greater), Ant 1.6.2 (or greater)
+  0) Install JDK 1.4 (or greater), Ant 1.6.3 (or greater)
  1) Download Lucene from Apache and unpack it
  2) Connect to the top-level of your Lucene installation
  3) Install JavaCC (optional)
  4) Run ant

Step 0) Set up your development environment (JDK 1.4 or greater,
-Ant 1.6.2 or greater)
+Ant 1.6.3 or greater)

We'll assume that you know how to get and set up the JDK - if you
don't, then we suggest starting at http://java.sun.com and learning
@@ -18,7 +18,7 @@ more about Java, before returning to this README. Lucene runs with
JDK 1.4 and later.

Like many Open Source java projects, Lucene uses Apache Ant for build
-control. Specifically, you MUST use Ant version 1.6.2 or greater.
+control. Specifically, you MUST use Ant version 1.6.3 or greater.

Ant is "kind of like make without make's wrinkles". Ant is
implemented in java and uses XML-based configuration files. You can
@@ -308,6 +308,12 @@ Bug fixes
    cross-correlate Spans from different fields.
    (Paul Cowan and Chris Hostetter)

+25. LUCENE-1629: Add SmartChineseAnalyzer to contrib/analyzers. It
+    improves on CJKAnalyzer and ChineseAnalyzer by handling Chinese
+    sentences properly. SmartChineseAnalyzer uses a Hidden Markov
+    Model to tokenize Chinese words in a more intelligent way.
+    (Xiaoping Gao via Mike McCandless)
+
Optimizations

1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing
@@ -233,6 +233,12 @@
        destdir="${build.dir}/classes/java">
      <classpath refid="classpath"/>
    </compile>
+
+    <!-- Copy the resources folder (if existent) -->
+    <copy todir="${build.dir}/classes/java" includeEmptyDirs="false">
+      <globmapper from="resources/*" to="*" handledirsep="yes"/>
+      <fileset dir="src" includes="resources/**"/>
+    </copy>
  </target>

  <target name="compile" depends="compile-core">
@@ -0,0 +1,129 @@ org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java
/**
 * Copyright 2009 www.imdict.net
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.cn;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.PorterStemFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
import org.apache.lucene.analysis.cn.smart.WordSegmenter;
import org.apache.lucene.analysis.cn.smart.WordTokenizer;

/**
 * SmartChineseAnalyzer is an intelligent Chinese word-segmentation module. It uses
 * probabilities to compute the optimal segmentation of a Chinese sentence, and it
 * embeds an English tokenizer, so mixed Chinese/English text is handled effectively.
 *
 * It is based on the Hidden Markov Model (HMM) from natural language processing:
 * word frequencies and transition probabilities are estimated from a large training
 * corpus, and from these statistics the maximum-likelihood segmentation of the whole
 * sentence is computed.
 *
 * Because intelligent segmentation needs a dictionary holding these word statistics,
 * SmartChineseAnalyzer needs to know the dictionary location; see
 * org.apache.lucene.analysis.cn.smart.AnalyzerProfile for how to specify it.
 *
 * The algorithm and the corpus dictionary of SmartChineseAnalyzer come from the
 * ictclas1.0 project (http://www.ictclas.org). The dictionary has been licensed by
 * www.ictclas.org under the Apache License v2 (APLv2), and users are welcome to use
 * it under the terms of APLv2. Many thanks to www.ictclas.org and the people behind
 * the ictclas segmenter for their generous contribution!
 *
 * @see org.apache.lucene.analysis.cn.smart.AnalyzerProfile
 */
public class SmartChineseAnalyzer extends Analyzer {

  private Set stopWords = null;

  private WordSegmenter wordSegment;

  public SmartChineseAnalyzer() {
    this(false);
  }

  /**
   * SmartChineseAnalyzer ships with a default stop-word list, consisting mostly of
   * punctuation. Set useDefaultStopWords to true to keep punctuation out of the
   * results; with useDefaultStopWords set to false, no stop words are used at all.
   *
   * @param useDefaultStopWords whether to load the built-in stop-word list
   */
  public SmartChineseAnalyzer(boolean useDefaultStopWords) {
    if (useDefaultStopWords) {
      stopWords = loadStopWords(this.getClass().getResourceAsStream(
          "stopwords.txt"));
    }
    wordSegment = new WordSegmenter();
  }

  /**
   * Uses a custom stop-word set instead of the built-in one. Stop words can be
   * loaded with SmartChineseAnalyzer.loadStopWords(InputStream).
   *
   * @param stopWords the custom stop-word set
   * @see SmartChineseAnalyzer#loadStopWords(InputStream)
   */
  public SmartChineseAnalyzer(Set stopWords) {
    this.stopWords = stopWords;
    wordSegment = new WordSegmenter();
  }

  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new SentenceTokenizer(reader);
    result = new WordTokenizer(result, wordSegment);
    // result = new LowerCaseFilter(result);
    // LowerCaseFilter is no longer needed: SegTokenFilter already lower-cases
    // all Latin letters.
    // Stemming is quite strict here. This is not a bug, it's a feature :)
    result = new PorterStemFilter(result);
    if (stopWords != null) {
      result = new StopFilter(result, stopWords, false);
    }
    return result;
  }

  /**
   * Loads stop words from a stop-word file: a plain UTF-8 text file with one stop
   * word per line, where "//" starts a comment. The stop words include Chinese
   * punctuation, the Chinese (full-width) space, and words used so often that they
   * add little value to the index.
   *
   * @param input the stop-word file
   * @return a HashSet of the stop words
   */
  public static Set loadStopWords(InputStream input) {
    String line;
    Set stopWords = new HashSet();
    try {
      BufferedReader br = new BufferedReader(new InputStreamReader(input,
          "UTF-8"));
      while ((line = br.readLine()) != null) {
        if (line.indexOf("//") != -1) {
          line = line.substring(0, line.indexOf("//"));
        }
        line = line.trim();
        if (line.length() != 0)
          stopWords.add(line.toLowerCase());
      }
      br.close();
    } catch (IOException e) {
      System.err.println("WARNING: cannot open stop words list!");
    }
    return stopWords;
  }

}
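For context, here is a minimal sketch (not part of the commit) of driving the analyzer through the Lucene 2.x Token API used above. The demo class and the field name "content" are illustrative, the dictionary data is assumed to be available (see the package documentation below), and the expected output follows the example given there.

import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.SmartChineseAnalyzer;

public class SmartChineseAnalyzerDemo {
  public static void main(String[] args) throws Exception {
    // true: use the built-in stop-word list, filtering out punctuation
    SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer(true);
    TokenStream ts = analyzer.tokenStream("content", new StringReader("我是中国人"));
    for (Token t = ts.next(); t != null; t = ts.next()) {
      System.out.println(t.term()); // expected: 我 是 中国 人
    }
    ts.close();
  }
}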
@@ -1,5 +1,51 @@ org/apache/lucene/analysis/cn/package.html
-<html><head></head>
<html>
<head></head>
<body>
Analyzer for Chinese.

<h2>About SmartChineseAnalyzer</h2>
<p>SmartChineseAnalyzer is an intelligent Chinese word-segmentation module. Unlike
ChineseAnalyzer (which splits out every single Chinese character) and CJKAnalyzer
(which combines every two adjacent Chinese characters), it uses probabilities to
compute the optimal segmentation of a Chinese sentence, and it embeds an English
tokenizer, so mixed Chinese/English text is handled effectively. At present the
SmartChineseAnalyzer dictionary only supports Simplified Chinese.</p>

<p>It is based on the Hidden Markov Model (HMM) from natural language processing:
word frequencies and transition probabilities are estimated from a large training
corpus, and from these statistics the maximum-likelihood segmentation of the whole
sentence is computed.</p>

<p>Comparing the output of the three segmentation modules shows that intelligent
segmentation stays closest to the original meaning of the sentence and therefore
improves search precision.
<pre>Sentence: 我是中国人</pre>
<ol>
<li>SmartChineseAnalyzer: 我-是-中国-人</li>
<li>ChineseAnalyzer: 我-是-中-国-人</li>
<li>CJKAnalyzer: 我是-是中-中国-国人</li>
</ol>
</p>

<h3>Configuring the segmentation dictionary</h3>
<p>Intelligent segmentation needs a dictionary to hold the word statistics. By
default SmartChineseAnalyzer uses its built-in dictionary; to use a specific
dictionary instead, its location must be given. See
org.apache.lucene.analysis.cn.smart.AnalyzerProfile for how to specify it.</p>

<p><b>The dictionary can be downloaded from <a
href="http://code.google.com/p/imdict-chinese-analyzer/downloads/list">http://code.google.com/p/imdict-chinese-analyzer/downloads/list</a>
</b>; save analysis-data.zip locally and unzip it, and it is ready to use.</p>

<p>The simplest way to specify the dictionary location is the runtime parameter -Danalysis.data.dir
<pre>e.g.: java -Danalysis.data.dir=/path/to/analysis-data com.example.YourApplication</pre>
</p>

<h3>Version requirements</h3>
<p>SmartChineseAnalyzer requires a Java 1.4 or later JVM and Lucene 2.4.0 or later.
Lucene 2.3.x should also work, but this has not been tested; users who need it can
test it themselves.</p>

<h3>Source files and text encoding</h3>
Apart from specific binary files, all text and Java sources of SmartChineseAnalyzer
are encoded in UTF-8, so take care to use the correct encoding when reading the text
and compiling the Java sources, to avoid character-corruption errors.

<h3>License of SmartChineseAnalyzer</h3>
<p>The algorithm and the corpus dictionary of SmartChineseAnalyzer come from the
ictclas1.0 project (<a href="http://www.ictclas.org">http://www.ictclas.org</a>).
The dictionary is released, with the permission of its copyright holder
www.ictclas.org, under the Apache License v2 (APLv2); users are welcome to use it
under the terms of APLv2. Many thanks to www.ictclas.org and the people behind the
ictclas segmenter for their hard work and generous contribution!</p>
</body>
</html>
@@ -0,0 +1,112 @@ org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java
/**
 * Copyright 2009 www.imdict.net
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.cn.smart;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Properties;

/**
 * By default SmartChineseAnalyzer ships with a built-in dictionary and a default
 * stop-word list, already packaged, so it can be used directly.
 *
 * In the special case where a specific dictionary and stop-word list are needed,
 * delete coredict.mem and bigramdict.mem under
 * org.apache.lucene.analysis.cn.smart.hhmm, then use AnalyzerProfile to specify the
 * dictionary directory.
 *
 * AnalyzerProfile locates the directory holding the segmentation dictionaries and
 * the stop-word data; that directory should contain bigramdict.dct, coredict.dct and
 * stopwords_utf8.txt. The lookup proceeds in the following order:
 *
 * <ol>
 * <li>Read the runtime system property -Danalysis.data.dir=/path/to/analysis-data; if absent, continue</li>
 * <li>An analysis-data directory in the directory the command was run from</li>
 * <li>An analysis-data directory under lib/ in the directory the command was run from</li>
 * <li>An analysis.properties file in the directory the command was run from</li>
 * <li>An analysis.properties file under lib/ in the directory the command was run from</li>
 * </ol>
 *
 * In analysis.properties, the property analysis.data.dir names the location of the
 * analysis-data directory. Example analysis.properties content:
 *
 * <pre>
 * analysis.data.dir=D:/path/to/analysis-data/
 * </pre>
 *
 * When the analysis-data directory cannot be found, ANALYSIS_DATA_DIR is set to "",
 * so before use the data directory must then be set explicitly in code, for example:
 *
 * <pre>
 * AnalyzerProfile.ANALYSIS_DATA_DIR = "/path/to/analysis-data";
 * </pre>
 */
public class AnalyzerProfile {

  public static String ANALYSIS_DATA_DIR = "";

  static {
    init();
  }

  private static void init() {
    String dirName = "analysis-data";
    String propName = "analysis.properties";

    // Read the system property, supplied at runtime as
    // -Danalysis.data.dir=/path/to/analysis-data
    ANALYSIS_DATA_DIR = System.getProperty("analysis.data.dir", "");
    if (ANALYSIS_DATA_DIR.length() != 0)
      return;

    File[] cadidateFiles = new File[] { new File("./" + dirName),
        new File("./lib/" + dirName), new File("./" + propName),
        new File("./lib/" + propName) };
    for (int i = 0; i < cadidateFiles.length; i++) {
      File file = cadidateFiles[i];
      if (file.exists()) {
        if (file.isDirectory()) {
          ANALYSIS_DATA_DIR = file.getAbsolutePath();
        } else if (file.isFile() && getAnalysisDataDir(file).length() != 0) {
          ANALYSIS_DATA_DIR = getAnalysisDataDir(file);
        }
        break;
      }
    }

    if (ANALYSIS_DATA_DIR.length() == 0) {
      // Tell the user that the dictionary directory was not found
      System.err
          .println("WARNING: Cannot find the lexical dictionary directory!");
      System.err
          .println("WARNING: This will cause unpredictable exceptions in your application!");
      System.err
          .println("WARNING: Please refer to the manual to download the dictionaries.");
    }

  }

  private static String getAnalysisDataDir(File propFile) {
    Properties prop = new Properties();
    try {
      FileInputStream input = new FileInputStream(propFile);
      prop.load(input);
      String dir = prop.getProperty("analysis.data.dir", "");
      input.close();
      return dir;
    } catch (IOException e) {
    }
    return "";
  }

}
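A note on ordering, with a minimal sketch (not part of the commit): because init() runs in a static initializer, the analysis.data.dir system property is only consulted when AnalyzerProfile is first loaded; after that, only assigning the field directly has any effect. The path below is illustrative.

import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;

public class DictConfigSketch {
  public static void main(String[] args) {
    // effective only if it runs before AnalyzerProfile is first loaded:
    System.setProperty("analysis.data.dir", "/path/to/analysis-data");
    // once the class has loaded (its static init() has already run),
    // only direct assignment changes the location:
    AnalyzerProfile.ANALYSIS_DATA_DIR = "/path/to/analysis-data";
  }
}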
@@ -0,0 +1,38 @@ org/apache/lucene/analysis/cn/smart/CharType.java
/**
 * Copyright 2009 www.imdict.net
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.cn.smart;

public class CharType {

  public final static int DELIMITER = 0; // punctuation, full- or half-width

  public final static int LETTER = 1; // half-width letter

  public final static int DIGIT = 2; // half-width digit

  public final static int HANZI = 3; // Chinese character

  public final static int SPACE_LIKE = 4; // space or line-break character, e.g. "\t\r\n"

  public final static int FULLWIDTH_LETTER = 5; // full-width letter

  public final static int FULLWIDTH_DIGIT = 6; // full-width digit

  public final static int OTHER = 7;

}
@@ -0,0 +1,102 @@ org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
/**
 * Copyright 2009 www.imdict.net
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.cn.smart;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;

/**
 * Produces Tokens that each contain one complete sentence read from the input;
 * these sentences are the units the next segmentation step operates on.
 */
public class SentenceTokenizer extends Tokenizer {

  /**
   * Punctuation marks that end a sentence: 。,!?;,!?;
   */
  public final static String PUNCTION = "。,!?;,!?;";

  private StringBuffer buffer = new StringBuffer();

  private BufferedReader bufferInput;

  private int tokenStart = 0, tokenEnd = 0;

  private Token t = new Token();

  public SentenceTokenizer(Reader reader) {
    bufferInput = new BufferedReader(reader, 2048);
  }

  public Token next() throws IOException {
    buffer.setLength(0);
    int ci;
    char ch, pch;
    boolean atBegin = true;
    tokenStart = tokenEnd;
    ci = bufferInput.read();
    ch = (char) ci;

    while (true) {
      if (ci == -1) {
        break;
      } else if (PUNCTION.indexOf(ch) != -1) {
        // found the end of the sentence
        buffer.append(ch);
        tokenEnd++;
        break;
      } else if (atBegin && Utility.SPACES.indexOf(ch) != -1) {
        tokenStart++;
        tokenEnd++;
        ci = bufferInput.read();
        ch = (char) ci;
      } else {
        buffer.append(ch);
        atBegin = false;
        tokenEnd++;
        pch = ch;
        ci = bufferInput.read();
        ch = (char) ci;
        // If two consecutive skippable characters occur (two line breaks, two
        // spaces, a line break plus a space, and so on), treat them as the end of
        // the sentence, so that an overlong sentence cannot exhaust memory.
        if (Utility.SPACES.indexOf(ch) != -1
            && Utility.SPACES.indexOf(pch) != -1) {
          // buffer.append(ch);
          tokenEnd++;
          break;
        }
      }
    }
    if (buffer.length() == 0)
      return null;
    else {
      t.clear();
      t.reinit(buffer.toString(), tokenStart, tokenEnd, "sentence");
      return t;
    }
  }

  public void close() throws IOException {
    bufferInput.close();
  }

}
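A small sketch (not part of the commit) of the sentence splitting above: both sentence-ending punctuation from PUNCTION and runs of two space-like characters end a token. The input string is illustrative.

import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;

public class SentenceTokenizerSketch {
  public static void main(String[] args) throws Exception {
    SentenceTokenizer st = new SentenceTokenizer(
        new StringReader("今天天气真好!我们走吧。"));
    for (Token t = st.next(); t != null; t = st.next()) {
      System.out.println(t.term()); // prints "今天天气真好!" then "我们走吧。"
    }
    st.close();
  }
}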
@@ -0,0 +1,165 @@ org/apache/lucene/analysis/cn/smart/Utility.java
/**
 * Copyright 2009 www.imdict.net
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.cn.smart;

public class Utility {

  public static final char[] STRING_CHAR_ARRAY = new String("未##串")
      .toCharArray();

  public static final char[] NUMBER_CHAR_ARRAY = new String("未##数")
      .toCharArray();

  public static final char[] START_CHAR_ARRAY = new String("始##始")
      .toCharArray();

  public static final char[] END_CHAR_ARRAY = new String("末##末").toCharArray();

  public static final char[] COMMON_DELIMITER = new char[] { ',' };

  /**
   * Characters to skip, such as tab, carriage return, and line feed.
   */
  public static final String SPACES = " \t\r\n";

  public static final int MAX_FREQUENCE = 2079997 + 80000;

  /**
   * Compares two char arrays element by element, each from a given start position.
   * When they match all the way and both reach their ends, they compare as equal;
   * otherwise the array that has not reached its end is the greater one. When a
   * mismatch occurs before either end, the array with the greater element at that
   * position is the greater array.
   *
   * @param larray
   * @param lstartIndex start position in larray
   * @param rarray
   * @param rstartIndex start position in rarray
   * @return 0 if equal, 1 if larray &gt; rarray, -1 if larray &lt; rarray
   */
  public static int compareArray(char[] larray, int lstartIndex, char[] rarray,
      int rstartIndex) {

    if (larray == null) {
      if (rarray == null || rstartIndex >= rarray.length)
        return 0;
      else
        return -1;
    } else {
      // larray != null
      if (rarray == null) {
        if (lstartIndex >= larray.length)
          return 0;
        else
          return 1;
      }
    }

    int li = lstartIndex, ri = rstartIndex;
    while (li < larray.length && ri < rarray.length && larray[li] == rarray[ri]) {
      li++;
      ri++;
    }
    if (li == larray.length) {
      if (ri == rarray.length) {
        // equal all the way to the end of both arrays, so return 0 (equal)
        return 0;
      } else {
        // ri > rarray.length is impossible here, so ri < rarray.length:
        // larray has ended but rarray has not, hence larray < rarray; return -1
        return -1;
      }
    } else {
      // li > larray.length is impossible here, so li < larray.length:
      // li has not reached the end of larray
      if (ri == rarray.length) {
        // larray has not ended but rarray has, hence larray > rarray
        return 1;
      } else {
        // ri > rarray.length is impossible here, so ri < rarray.length:
        // neither array has ended, so compare the next element
        if (larray[li] > rarray[ri])
          return 1;
        else
          return -1;
      }
    }
  }

  /**
   * Compares two char arrays by prefix: when the former is a prefix of the latter,
   * they compare as equal; when it is not, they are compared like ordinary strings.
   *
   * @param shortArray
   * @param shortIndex
   * @param longArray
   * @param longIndex
   * @return 0 if shortArray is a prefix of longArray, otherwise 1 or -1 as in an
   *         ordinary comparison
   */
  public static int compareArrayByPrefix(char[] shortArray, int shortIndex,
      char[] longArray, int longIndex) {

    // the empty array is a prefix of every array, regardless of index
    if (shortArray == null)
      return 0;
    else if (longArray == null)
      return (shortIndex < shortArray.length) ? 1 : 0;

    int si = shortIndex, li = longIndex;
    while (si < shortArray.length && li < longArray.length
        && shortArray[si] == longArray[li]) {
      si++;
      li++;
    }
    if (si == shortArray.length) {
      // shortArray is a prefix of longArray
      return 0;
    } else {
      // si > shortArray.length is impossible here, so si < shortArray.length:
      // si has not reached the end of shortArray

      // if longArray has ended but shortArray has not, then shortArray > longArray
      if (li == longArray.length)
        return 1;
      else
        // li > longArray.length is impossible here, so li < longArray.length:
        // neither array has ended, so compare the next element
        return (shortArray[si] > longArray[li]) ? 1 : -1;
    }
  }

  public static int getCharType(char ch) {
    // Hanzi are by far the most common case
    if (ch >= 0x4E00 && ch <= 0x9FA5)
      return CharType.HANZI;
    if ((ch >= 0x0041 && ch <= 0x005A) || (ch >= 0x0061 && ch <= 0x007A))
      return CharType.LETTER;
    if (ch >= 0x0030 && ch <= 0x0039)
      return CharType.DIGIT;
    if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n' || ch == '　')
      return CharType.SPACE_LIKE; // '　' is the full-width (ideographic) space
    // everything else in the leading ranges is punctuation
    if ((ch >= 0x0021 && ch <= 0x00BB) || (ch >= 0x2010 && ch <= 0x2642)
        || (ch >= 0x3001 && ch <= 0x301E))
      return CharType.DELIMITER;

    // full-width character ranges
    if ((ch >= 0xFF21 && ch <= 0xFF3A) || (ch >= 0xFF41 && ch <= 0xFF5A))
      return CharType.FULLWIDTH_LETTER;
    if (ch >= 0xFF10 && ch <= 0xFF19)
      return CharType.FULLWIDTH_DIGIT;
    if (ch >= 0xFE30 && ch <= 0xFF63)
      return CharType.DELIMITER;
    return CharType.OTHER;

  }
}
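To make the two comparison conventions concrete, a short sketch (not part of the commit):

import org.apache.lucene.analysis.cn.smart.Utility;

public class CompareSketch {
  public static void main(String[] args) {
    char[] shorter = "中国".toCharArray();
    char[] longer = "中国人".toCharArray();
    // prefix comparison: "中国" is a prefix of "中国人", so they compare as equal
    System.out.println(Utility.compareArrayByPrefix(shorter, 0, longer, 0)); // 0
    // plain comparison: shorter reaches its end first, so it is the smaller array
    System.out.println(Utility.compareArray(shorter, 0, longer, 0)); // -1
    System.out.println(Utility.compareArray(longer, 0, shorter, 0)); // 1
  }
}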
@@ -0,0 +1,87 @@ org/apache/lucene/analysis/cn/smart/WordSegmenter.java
/**
 * Copyright 2009 www.imdict.net
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.cn.smart;

import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter;
import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;
import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter;

public class WordSegmenter {

  private HHMMSegmenter hhmmSegmenter = new HHMMSegmenter();

  private SegTokenFilter tokenFilter = new SegTokenFilter();

  /**
   * Segments the current sentence Token by invoking the HHMM segmenter, and returns
   * the result in a Token List.
   *
   * @param sentenceToken the Token holding the sentence
   * @param shortPathCount the number of pre-optimization shortest paths the HHMM
   *          algorithm keeps; larger values generally give more precise results at
   *          a higher computational cost
   * @return the Token List holding the segmentation result
   */
  public List segmentSentence(Token sentenceToken, int shortPathCount) {
    String sentence = sentenceToken.term();

    List segTokenList = hhmmSegmenter.process(sentence);

    List result = new ArrayList();

    // i runs from 1 to segTokenList.size() - 2, i.e. the two sentinel tokens
    // "始##始" (start) and "末##末" (end) are dropped
    for (int i = 1; i < segTokenList.size() - 1; i++) {
      result.add(convertSegToken((SegToken) segTokenList.get(i), sentence,
          sentenceToken.startOffset(), "word"));
    }
    return result;

  }

  /**
   * Converts a SegToken into the Token type needed for indexing. Indexing needs the
   * SegToken's text from the original sentence, so the original sentence must be
   * supplied for the conversion.
   *
   * @param st
   * @param sentence the sentence text needed for the conversion
   * @param sentenceStartOffset the start offset of the sentence within the document
   * @param type the token type, normally "word"
   * @return
   */
  public Token convertSegToken(SegToken st, String sentence,
      int sentenceStartOffset, String type) {
    Token result;
    switch (st.wordType) {
      case WordType.STRING:
      case WordType.NUMBER:
      case WordType.FULLWIDTH_NUMBER:
      case WordType.FULLWIDTH_STRING:
        st.charArray = sentence.substring(st.startOffset, st.endOffset)
            .toCharArray();
        break;
      default:
        break;
    }

    st = tokenFilter.filter(st);

    result = new Token(st.charArray, 0, st.charArray.length, st.startOffset
        + sentenceStartOffset, st.endOffset + sentenceStartOffset);
    return result;
  }
}
@@ -0,0 +1,87 @@ org/apache/lucene/analysis/cn/smart/WordTokenizer.java
/**
 * Copyright 2009 www.imdict.net
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.cn.smart;

import java.io.IOException;
import java.util.Iterator;
import java.util.List;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;

public class WordTokenizer extends Tokenizer {

  /**
   * The segmenter proper, supplied when the WordTokenizer is constructed.
   */
  private WordSegmenter wordSegmenter;

  private TokenStream in;

  private Iterator tokenIter;

  private List tokenBuffer;

  private Token sentenceToken = new Token();

  /**
   * By design this is the processing layer after SentenceTokenizer: it reads the
   * sentences out of the SentenceTokenizer, segments each sentence with the HHMM
   * segmenter, and returns the segmentation results.
   *
   * @param in the stream of sentence Tokens
   * @param wordSegmenter the segmenter used to split each sentence into words
   */
  public WordTokenizer(TokenStream in, WordSegmenter wordSegmenter) {
    this.in = in;
    this.wordSegmenter = wordSegmenter;
  }

  public Token next() throws IOException {
    if (tokenIter != null && tokenIter.hasNext())
      return (Token) tokenIter.next();
    else {
      if (processNextSentence()) {
        return (Token) tokenIter.next();
      } else
        return null;
    }
  }

  /**
   * Called when the current sentence has been fully segmented and consumed: asks
   * the upstream SentenceTokenizer to load the next sentence Token, segments it,
   * and stores the resulting Tokens in tokenBuffer.
   *
   * @return whether the next sentence was read and processed successfully; false
   *         means the input is exhausted and there are no more Tokens
   * @throws IOException
   */
  private boolean processNextSentence() throws IOException {
    sentenceToken = in.next(sentenceToken);
    if (sentenceToken == null)
      return false;
    tokenBuffer = wordSegmenter.segmentSentence(sentenceToken, 1);
    tokenIter = tokenBuffer.iterator();
    return tokenBuffer != null && tokenIter.hasNext();
  }

  public void close() throws IOException {
    in.close();
  }

}
@@ -0,0 +1,37 @@ org/apache/lucene/analysis/cn/smart/WordType.java
/**
 * Copyright 2009 www.imdict.net
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.cn.smart;

public class WordType {

  public final static int SENTENCE_BEGIN = 0; // start of a sentence

  public final static int SENTENCE_END = 1; // end of a sentence

  public final static int CHINESE_WORD = 2; // Chinese word

  public final static int STRING = 3; // ASCII string

  public final static int NUMBER = 4; // ASCII number

  public final static int DELIMITER = 5; // punctuation of any kind

  public final static int FULLWIDTH_STRING = 6; // string containing full-width characters

  public final static int FULLWIDTH_NUMBER = 7; // number containing full-width digits

}
@@ -0,0 +1,195 @@ org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java
/**
 * Copyright 2009 www.imdict.net
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.cn.smart.hhmm;

import java.io.UnsupportedEncodingException;

public abstract class AbstractDictionary {
  /**
   * The first Hanzi is "啊"; 15 zones of 94 characters each precede it.
   */
  public static final int GB2312_FIRST_CHAR = 1410;

  /**
   * Only zones 01-87 of the GB2312 character set can hold valid characters,
   * 87 * 94 = 8178 positions in total.
   */
  public static final int GB2312_CHAR_NUM = 87 * 94;

  /**
   * The dictionary file holds word-frequency statistics for 6768 Hanzi.
   */
  public static final int CHAR_NUM_IN_FILE = 6768;

  // =====================================================
  // code +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +A +B +C +D +E +F
  // B0A0    啊 阿 埃 挨 哎 唉 哀 皑 癌 蔼 矮 艾 碍 爱 隘
  // B0B0 鞍 氨 安 俺 按 暗 岸 胺 案 肮 昂 盎 凹 敖 熬 翱
  // B0C0 袄 傲 奥 懊 澳 芭 捌 扒 叭 吧 笆 八 疤 巴 拔 跋
  // B0D0 靶 把 耙 坝 霸 罢 爸 白 柏 百 摆 佰 败 拜 稗 斑
  // B0E0 班 搬 扳 般 颁 板 版 扮 拌 伴 瓣 半 办 绊 邦 帮
  // B0F0 梆 榜 膀 绑 棒 磅 蚌 镑 傍 谤 苞 胞 包 褒 剥
  // =====================================================
  //
  // Zone allocation of the GB2312 character set:
  // zone    count  category
  // 01      94     general symbols
  // 02      72     sequence numbers
  // 03      94     Latin letters
  // 04      83     Japanese hiragana
  // 05      86     katakana
  // 06      48     Greek letters
  // 07      66     Cyrillic letters
  // 08      63     Hanyu Pinyin symbols
  // 09      76     graphical symbols
  // 10-15          reserved
  // 16-55   3755   level-1 Hanzi, ordered by pinyin
  // 56-87   3008   level-2 Hanzi, ordered by stroke count
  // 88-94          reserved
  // ======================================================

  /**
   * GB2312 contains 7445 characters in total: 6763 simplified Hanzi plus 682
   * letters and symbols.
   *
   * GB2312 arranges its characters in 94 zones, numbered 01 to 94, each holding 94
   * characters at positions 01 to 94; position 01 corresponds to byte 0xA1 and
   * position 94 to 0xFE. Every GB2312 character is uniquely determined by its zone
   * and position. For example, the Hanzi "啊" is zone 16, position 01.
   */
  /**
   * @param ccid the GB2312 index (zone/position id) of the character
   * @return the corresponding character as a String, or "" when ccid is out of range
   */
  public String getCCByGB2312Id(int ccid) {
    if (ccid < 0 || ccid > WordDictionary.GB2312_CHAR_NUM)
      return "";
    int cc1 = ccid / 94 + 161;
    int cc2 = ccid % 94 + 161;
    byte[] buffer = new byte[2];
    buffer[0] = (byte) cc1;
    buffer[1] = (byte) cc2;
    try {
      String cchar = new String(buffer, "GB2312");
      return cchar;
    } catch (UnsupportedEncodingException e) {
      return "";
    }
  }

  /**
   * Returns the GB2312 index (or ASCII code) of the given Unicode character.
   *
   * @param ch a GB2312 Chinese character or one of the 128 ASCII characters
   * @return the position of ch in GB2312, or -1 when the character is not recognized
   */
  public short getGB2312Id(char ch) {
    try {
      byte[] buffer = Character.toString(ch).getBytes("GB2312");
      if (buffer.length != 2) {
        // Normally the buffer holds two bytes; anything else means ch is not a
        // GB2312 character, i.e. it is not recognized, so return -1.
        return -1;
      }
      int b0 = (int) (buffer[0] & 0x0FF) - 161; // the encoding starts at 0xA1 = 161
      int b1 = (int) (buffer[1] & 0x0FF) - 161; // the first and last positions hold
                                                // no Hanzi, so each zone holds
                                                // 16 * 6 - 2 = 94 characters
      return (short) (b0 * 94 + b1);
    } catch (UnsupportedEncodingException e) {
      e.printStackTrace();
    }
    return -1;
  }

  /**
   * A modified FNV hash (64-bit variant), used as the first hash function in this
   * program. The first and second hash functions are combined to compute
   * hash-table slots, spreading entries evenly and avoiding the long probe times
   * caused by an overly dense table.
   *
   * @param c the Unicode character to hash
   * @return the hash value of c
   * @see AbstractDictionary#hash2(char)
   */
  public long hash1(char c) {
    final long p = 1099511628211L;
    long hash = 0xcbf29ce484222325L;
    hash = (hash ^ (c & 0x00FF)) * p;
    hash = (hash ^ (c >> 8)) * p;
    hash += hash << 13;
    hash ^= hash >> 7;
    hash += hash << 3;
    hash ^= hash >> 17;
    hash += hash << 5;
    return hash;
  }

  /**
   * @see AbstractDictionary#hash1(char)
   * @param carray
   * @return
   */
  public long hash1(char carray[]) {
    final long p = 1099511628211L;
    long hash = 0xcbf29ce484222325L;
    for (int i = 0; i < carray.length; i++) {
      char d = carray[i];
      hash = (hash ^ (d & 0x00FF)) * p;
      hash = (hash ^ (d >> 8)) * p;
    }

    // hash += hash << 13;
    // hash ^= hash >> 7;
    // hash += hash << 3;
    // hash ^= hash >> 17;
    // hash += hash << 5;
    return hash;
  }

  /**
   * The djb2 hash algorithm, used as the second hash function in this program.
   *
   * djb2 hash algorithm: this algorithm (k=33) was first reported by dan
   * bernstein many years ago in comp.lang.c. another version of this algorithm
   * (now favored by bernstein) uses xor: hash(i) = hash(i - 1) * 33 ^ str[i];
   * the magic of number 33 (why it works better than many other constants,
   * prime or not) has never been adequately explained.
   *
   * @param c
   * @return
   */
  public int hash2(char c) {
    int hash = 5381;

    /* hash 33 + c */
    hash = ((hash << 5) + hash) + c & 0x00FF;
    hash = ((hash << 5) + hash) + c >> 8;

    return hash;
  }

  /**
   * @see AbstractDictionary#hash2(char)
   * @param carray
   * @return
   */
  public int hash2(char carray[]) {
    int hash = 5381;

    /* hash 33 + c */
    for (int i = 0; i < carray.length; i++) {
      char d = carray[i];
      hash = ((hash << 5) + hash) + d & 0x00FF;
      hash = ((hash << 5) + hash) + d >> 8;
    }

    return hash;
  }
}
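The zone/position arithmetic can be checked with a quick round trip. A sketch (not part of the commit): AbstractDictionary has no abstract members, so an anonymous subclass suffices, and WordDictionary from the same commit is assumed to be on the classpath.

import org.apache.lucene.analysis.cn.smart.hhmm.AbstractDictionary;

public class GB2312Sketch {
  public static void main(String[] args) {
    AbstractDictionary dict = new AbstractDictionary() {};
    // "啊" is zone 16, position 01: id = (16 - 1) * 94 + (1 - 1) = 1410,
    // which is exactly GB2312_FIRST_CHAR
    short id = dict.getGB2312Id('啊');
    System.out.println(id);                      // 1410
    System.out.println(dict.getCCByGB2312Id(id)); // 啊
  }
}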
@@ -0,0 +1,237 @@ org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java
/**
 * Copyright 2009 www.imdict.net
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.cn.smart.hhmm;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.cn.smart.Utility;

public class BiSegGraph {

  private Map tokenPairListTable = new HashMap();

  private List segTokenList;

  private static BigramDictionary bigramDict = BigramDictionary.getInstance();

  public BiSegGraph(SegGraph segGraph) {
    segTokenList = segGraph.makeIndex();
    generateBiSegGraph(segGraph);
  }

  /**
   * Generates the pairwise bigram graph between adjacent words, storing the pairs
   * keyed by their end position.
   *
   * @param segGraph the graph holding all Tokens
   */
  private void generateBiSegGraph(SegGraph segGraph) {
    double smooth = 0.1;
    int wordPairFreq = 0;
    int maxStart = segGraph.getMaxStart();
    double oneWordFreq, weight, tinyDouble = 1.0 / Utility.MAX_FREQUENCE;

    int next;
    char[] idBuffer;
    // assign an index to every element in segGraph
    segTokenList = segGraph.makeIndex();
    // the start token ("始##始") begins at -1, so key -1 retrieves the start token
    int key = -1;
    List nextTokens = null;
    while (key < maxStart) {
      if (segGraph.isStartExist(key)) {

        List tokenList = segGraph.getStartList(key);

        // process every Token that starts at this key
        for (Iterator iter = tokenList.iterator(); iter.hasNext();) {
          SegToken t1 = (SegToken) iter.next();
          oneWordFreq = t1.weight;
          next = t1.endOffset;
          nextTokens = null;
          // Find the Tokens that follow; e.g. in "阳光海岸", if the current Token
          // is "阳光", the next Token can be "海" or "海岸". If no following Token
          // is found, the end has been reached; restart the loop.
          while (next <= maxStart) {
            // the end token starts at sentenceLen, so next == sentenceLen still
            // finds the end token
            if (segGraph.isStartExist(next)) {
              nextTokens = segGraph.getStartList(next);
              break;
            }
            next++;
          }
          if (nextTokens == null) {
            break;
          }
          for (Iterator iter2 = nextTokens.iterator(); iter2.hasNext();) {
            SegToken t2 = (SegToken) iter2.next();
            idBuffer = new char[t1.charArray.length + t2.charArray.length + 1];
            System.arraycopy(t1.charArray, 0, idBuffer, 0, t1.charArray.length);
            idBuffer[t1.charArray.length] = BigramDictionary.WORD_SEGMENT_CHAR;
            System.arraycopy(t2.charArray, 0, idBuffer,
                t1.charArray.length + 1, t2.charArray.length);

            // Two linked Words frequency
            wordPairFreq = bigramDict.getFrequency(idBuffer);

            // Smoothing

            // -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
            weight = -Math
                .log(smooth
                    * (1.0 + oneWordFreq)
                    / (Utility.MAX_FREQUENCE + 0.0)
                    + (1.0 - smooth)
                    * ((1.0 - tinyDouble) * wordPairFreq / (1.0 + oneWordFreq) + tinyDouble));

            SegTokenPair tokenPair = new SegTokenPair(idBuffer, t1.index,
                t2.index, weight);
            this.addSegTokenPair(tokenPair);
          }
        }
      }
      key++;
    }

  }

  /**
   * Returns whether any SegTokenPair ends at position to (i.e. SegTokenPair.to ==
   * to); false means either that no SegTokenPair ends there or that none has been
   * added yet.
   *
   * @param to SegTokenPair.to
   * @return whether a SegTokenPair ending at to exists
   */
  public boolean isToExist(int to) {
    return tokenPairListTable.get(new Integer(to)) != null;
  }

  /**
   * Returns all SegTokenPairs whose SegTokenPair.to equals to, or null when there
   * are none.
   *
   * @param to
   * @return the list of all SegTokenPairs sharing that SegTokenPair.to
   */
  public List getToList(int to) {
    return (List) tokenPairListTable.get(new Integer(to));
  }

  /**
   * Adds a SegTokenPair to the BiSegGraph; pairs with the same SegTokenPair.to are
   * kept in the same ArrayList.
   *
   * @param tokenPair
   */
  public void addSegTokenPair(SegTokenPair tokenPair) {
    int to = tokenPair.to;
    if (!isToExist(to)) {
      ArrayList newlist = new ArrayList();
      newlist.add(tokenPair);
      tokenPairListTable.put(new Integer(to), newlist);
    } else {
      List tokenPairList = (List) tokenPairListTable.get(new Integer(to));
      tokenPairList.add(tokenPair);
    }
  }

  /**
   * @return the number of to-columns, i.e. the number of distinct end positions
   *         among the TokenPairs in the map.
   */
  public int getToCount() {
    return tokenPairListTable.size();
  }

  /**
   * Computes the shortest path from the start to the end with the Viterbi
   * algorithm.
   *
   * @return the list of SegTokens on the shortest path
   */
  public List getShortPath() {
    int current;
    int nodeCount = getToCount();
    List path = new ArrayList();
    PathNode zeroPath = new PathNode();
    zeroPath.weight = 0;
    zeroPath.preNode = 0;
    path.add(zeroPath);
    for (current = 1; current <= nodeCount; current++) {
      double weight;
      List edges = getToList(current);

      double minWeight = Double.MAX_VALUE;
      SegTokenPair minEdge = null;
      for (Iterator iter1 = edges.iterator(); iter1.hasNext();) {
        SegTokenPair edge = (SegTokenPair) iter1.next();
        weight = edge.weight;
        PathNode preNode = (PathNode) path.get(edge.from);
        if (preNode.weight + weight < minWeight) {
          minWeight = preNode.weight + weight;
          minEdge = edge;
        }
      }
      PathNode newNode = new PathNode();
      newNode.weight = minWeight;
      newNode.preNode = minEdge.from;
      path.add(newNode);
    }

    // Now recover the actual start-to-end path from the path nodes
    int preNode, lastNode;
    lastNode = path.size() - 1;
    current = lastNode;
    List rpath = new ArrayList();
    List resultPath = new ArrayList();

    rpath.add(new Integer(current));
    while (current != 0) {
      PathNode currentPathNode = (PathNode) path.get(current);
      preNode = currentPathNode.preNode;
      rpath.add(new Integer(preNode));
      current = preNode;
    }
    for (int j = rpath.size() - 1; j >= 0; j--) {
      Integer idInteger = (Integer) rpath.get(j);
      int id = idInteger.intValue();
      SegToken t = (SegToken) segTokenList.get(id);
      resultPath.add(t);
    }
    return resultPath;

  }

  public String toString() {
    StringBuffer sb = new StringBuffer();
    Collection values = tokenPairListTable.values();
    for (Iterator iter1 = values.iterator(); iter1.hasNext();) {
      List segList = (List) iter1.next();
      for (Iterator iter2 = segList.iterator(); iter2.hasNext();) {
        SegTokenPair pair = (SegTokenPair) iter2.next();
        sb.append(pair + "\n");
      }
    }
    return sb.toString();
  }

}
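The edge weight above is the negative log of a smoothed transition probability, -log{a*P(Ci-1) + (1-a)*P(Ci|Ci-1)}. A worked instance (a sketch, not part of the commit; the two frequencies are illustrative):

import org.apache.lucene.analysis.cn.smart.Utility;

public class WeightSketch {
  public static void main(String[] args) {
    double smooth = 0.1;
    double tinyDouble = 1.0 / Utility.MAX_FREQUENCE;
    double oneWordFreq = 100.0; // frequency of the first word t1 (illustrative)
    int wordPairFreq = 20;      // frequency of the pair "t1@t2" (illustrative)
    double weight = -Math.log(smooth * (1.0 + oneWordFreq)
        / (Utility.MAX_FREQUENCE + 0.0)
        + (1.0 - smooth)
        * ((1.0 - tinyDouble) * wordPairFreq / (1.0 + oneWordFreq) + tinyDouble));
    // roughly -log(4.7e-6 + 0.9 * 20/101), about 1.72; a smaller weight means a
    // more likely transition, and getShortPath() minimizes the summed weights
    System.out.println(weight);
  }
}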
@@ -0,0 +1,321 @@ org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java
/**
 * Copyright 2009 www.imdict.net
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.cn.smart.hhmm;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;

public class BigramDictionary extends AbstractDictionary {

  private BigramDictionary() {
  }

  public static final char WORD_SEGMENT_CHAR = '@';

  private static BigramDictionary singleInstance;

  public static final int PRIME_BIGRAM_LENGTH = 402137;

  /**
   * The bigram table stores the transition frequencies between pairs of words;
   * bigramHashTable and frequencyTable are the data structures that hold these
   * frequencies. To speed up lookup and save memory, the lookup key is a hash of
   * the linked pair rather than the pair itself; the linked pair is the string
   * (fromWord + '@' + toWord), and its hash is computed with the FNV1 algorithm
   * and stored in bigramHashTable. Standing in for the pair, the hash can collide
   * with some small probability, but the 64-bit hash value makes that probability
   * extremely low. bigramHashTable[i] corresponds one-to-one to frequencyTable[i].
   */
  private long[] bigramHashTable;

  private int[] frequencyTable;

  private int max = 0;

  private int repeat = 0;

  // static Logger log = Logger.getLogger(BigramDictionary.class);

  public synchronized static BigramDictionary getInstance() {
    if (singleInstance == null) {
      singleInstance = new BigramDictionary();
      try {
        singleInstance.load();
      } catch (IOException e) {
        String dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
        singleInstance.load(dictRoot);
      } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
      }
    }
    return singleInstance;
  }

  private boolean loadFromObj(File serialObj) {
    try {
      loadFromInputStream(new FileInputStream(serialObj));
      return true;
    } catch (FileNotFoundException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    } catch (ClassNotFoundException e) {
      e.printStackTrace();
    }
    return false;
  }

  private void loadFromInputStream(InputStream serialObjectInputStream)
      throws IOException, ClassNotFoundException {
    ObjectInputStream input = new ObjectInputStream(serialObjectInputStream);
    bigramHashTable = (long[]) input.readObject();
    frequencyTable = (int[]) input.readObject();
    // log.info("load bigram dict from serialization.");
    input.close();
  }

  private void saveToObj(File serialObj) {
    try {
      ObjectOutputStream output = new ObjectOutputStream(new FileOutputStream(
          serialObj));
      output.writeObject(bigramHashTable);
      output.writeObject(frequencyTable);
      output.close();
      // log.info("serialize bigram dict.");
    } catch (Exception e) {
      // log.warn(e.getMessage());
    }
  }

  private void load() throws IOException, ClassNotFoundException {
    InputStream input = this.getClass().getResourceAsStream("bigramdict.mem");
    loadFromInputStream(input);
  }

  private void load(String dictRoot) {
    String bigramDictPath = dictRoot + "/bigramdict.dct";

    File serialObj = new File(dictRoot + "/bigramdict.mem");

    if (serialObj.exists() && loadFromObj(serialObj)) {

    } else {
      try {
        bigramHashTable = new long[PRIME_BIGRAM_LENGTH];
        frequencyTable = new int[PRIME_BIGRAM_LENGTH];
        for (int i = 0; i < PRIME_BIGRAM_LENGTH; i++) {
          // Strictly speaking, using 0 as the empty-slot marker is slightly
          // wrong, since some string could hash to 0; but the probability is so
          // small that in practice it does not matter.
          bigramHashTable[i] = 0;
          frequencyTable[i] = 0;
        }
        loadFromFile(bigramDictPath);
      } catch (IOException e) {
        throw new RuntimeException(e.getMessage());
      }
      saveToObj(serialObj);
    }
  }

  /**
   * Loads the dictionary file into this dictionary's data structures; it only
   * loads, performing no merging or modification.
   *
   * @param dctFilePath
   * @throws FileNotFoundException
   * @throws IOException
   * @throws UnsupportedEncodingException
   */
  public void loadFromFile(String dctFilePath) throws FileNotFoundException,
      IOException, UnsupportedEncodingException {

    int i, cnt, length, total = 0;
    // The file covers only the 6763 Hanzi plus 5 empty slots (3756-3760), of
    // which slot 3756 is used to store symbol information.
    int[] buffer = new int[3];
    byte[] intBuffer = new byte[4];
    String tmpword;
    RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");

    // in the dictionary file the first Hanzi is at position 0 and the last at 6768
    for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
      String currentStr = getCCByGB2312Id(i);
      // if (i == 5231)
      // System.out.println(i);

      dctFile.read(intBuffer); // the original dictionary was developed in C, so
                               // the file is little-endian, while Java is
                               // big-endian; it has to be converted
      cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
      if (cnt <= 0) {
        continue;
      }
      total += cnt;
      int j = 0;
      while (j < cnt) {
        dctFile.read(intBuffer);
        buffer[0] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
            .getInt();// frequency
        dctFile.read(intBuffer);
        buffer[1] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
            .getInt();// length
        dctFile.read(intBuffer);
        // buffer[2] = ByteBuffer.wrap(intBuffer).order(
        // ByteOrder.LITTLE_ENDIAN).getInt();// handle

        length = buffer[1];
        if (length > 0) {
          byte[] lchBuffer = new byte[length];
          dctFile.read(lchBuffer);
          tmpword = new String(lchBuffer, "GB2312");
          if (i != 3755 + GB2312_FIRST_CHAR) {
            tmpword = currentStr + tmpword;
          }
          char carray[] = tmpword.toCharArray();
          long hashId = hash1(carray);
          int index = getAvaliableIndex(hashId, carray);
          if (index != -1) {
            if (bigramHashTable[index] == 0) {
              bigramHashTable[index] = hashId;
              // bigramStringTable[index] = tmpword;
            }
            frequencyTable[index] += buffer[0];
          }
        }
        j++;
      }
    }
    dctFile.close();
    // log.info("load dictionary done! " + dctFilePath + " total:" + total);
  }

  /*
   * public void test(String dctFilePath) throws IOException { int i, cnt,
   * length, total = 0; int corrupt = 0, notFound = 0; //
   * the file covers only the 6763 Hanzi plus 5 empty slots (3756-3760), of which
   * slot 3756 stores symbol information. int[] buffer = new int[3];
   * byte[] intBuffer = new byte[4]; String tmpword; RandomAccessFile dctFile =
   * new RandomAccessFile(dctFilePath, "r");
   *
   * // in the dictionary file the first Hanzi is at position 0, the last at 6768
   * for (i = GB2312_FIRST_CHAR; i <
   * GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { String currentStr =
   * getCCByGB2312Id(i); // if (i == 5231) // System.out.println(i);
   *
   * dctFile.read(intBuffer);// the original dictionary was developed in C, so
   * the file is little-endian, // while Java is big-endian; it must be
   * converted cnt =
   * ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN) .getInt(); if
   * (cnt <= 0) { continue; } total += cnt; int j = 0; while (j < cnt) {
   * dctFile.read(intBuffer); buffer[0] = ByteBuffer.wrap(intBuffer).order(
   * ByteOrder.LITTLE_ENDIAN).getInt();// frequency dctFile.read(intBuffer);
   * buffer[1] = ByteBuffer.wrap(intBuffer).order(
   * ByteOrder.LITTLE_ENDIAN).getInt();// length dctFile.read(intBuffer); //
   * buffer[2] = ByteBuffer.wrap(intBuffer).order( //
   * ByteOrder.LITTLE_ENDIAN).getInt();// handle
   *
   * length = buffer[1]; if (length > 0) { byte[] lchBuffer = new byte[length];
   * dctFile.read(lchBuffer); tmpword = new String(lchBuffer, "GB2312"); if (i
   * != 3755 + GB2312_FIRST_CHAR) { tmpword = currentStr + tmpword; } char
   * carray[] = tmpword.toCharArray(); int index = getBigramItemIndex(carray);
   * if (index != -1) { // if (!bigramStringTable[index].equals(tmpword)) { //
   * System.out.println("corrupt: " + tmpword + "<->" // +
   * bigramStringTable[index]); // corrupt++; // } } else {
   * System.out.println("not found: " + tmpword); notFound++; } } j++; } }
   * dctFile.close(); System.out.println("num not found:" + notFound);
   * System.out.println("num corrupt:" + corrupt);
   *
   * log.info("test dictionary done! " + dctFilePath + " total:" + total); cnt =
   * 0; for (int j = 0; j < PRIME_BIGRAM_LENGTH; j++) { if (bigramHashTable[j]
   * != 0) { cnt++; } } System.out.println("total num in bigramTable: " + cnt);
   * }
   */

  private int getAvaliableIndex(long hashId, char carray[]) {
    int hash1 = (int) (hashId % PRIME_BIGRAM_LENGTH);
    int hash2 = hash2(carray) % PRIME_BIGRAM_LENGTH;
    if (hash1 < 0)
      hash1 = PRIME_BIGRAM_LENGTH + hash1;
    if (hash2 < 0)
      hash2 = PRIME_BIGRAM_LENGTH + hash2;
    int index = hash1;
    int i = 1;
    while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
        && i < PRIME_BIGRAM_LENGTH) {
      index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH;
      i++;
    }
    // System.out.println(i - 1);

    if (i < PRIME_BIGRAM_LENGTH
        && (bigramHashTable[index] == 0 || bigramHashTable[index] == hashId)) {
      return index;
    } else
      return -1;
  }

  /**
   * @param carray the characters of the (fromWord + '@' + toWord) key
   * @return the slot index of the key, or -1 when it is not present
   */
  private int getBigramItemIndex(char carray[]) {
    long hashId = hash1(carray);
    int hash1 = (int) (hashId % PRIME_BIGRAM_LENGTH);
    int hash2 = hash2(carray) % PRIME_BIGRAM_LENGTH;
    if (hash1 < 0)
      hash1 = PRIME_BIGRAM_LENGTH + hash1;
    if (hash2 < 0)
      hash2 = PRIME_BIGRAM_LENGTH + hash2;
    int index = hash1;
    int i = 1;
    repeat++;
    while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
        && i < PRIME_BIGRAM_LENGTH) {
      index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH;
      i++;
      repeat++;
      if (i > max)
        max = i;
    }
    // System.out.println(i - 1);

    if (i < PRIME_BIGRAM_LENGTH && bigramHashTable[index] == hashId) {
      return index;
    } else
      return -1;
  }

  public int getFrequency(char[] carray) {
    int index = getBigramItemIndex(carray);
    if (index != -1)
      return frequencyTable[index];
    return 0;
  }

  public static void main(String[] args) throws FileNotFoundException,
      UnsupportedEncodingException, IOException {
    BigramDictionary dic = new BigramDictionary();
    dic.load("D:/analysis-data");
    // dic.test("D:/analysis-data/BigramDict.dct");
    System.out.println("max:" + dic.max);
    System.out.println("average repeat:" + (double) dic.repeat / 328856);
    System.out.println("end");
  }
}
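A usage sketch (not part of the commit): lookups key on (fromWord + WORD_SEGMENT_CHAR + toWord). This assumes the dictionary data is installed, since getInstance() loads it on first use; the word pair is illustrative.

import org.apache.lucene.analysis.cn.smart.hhmm.BigramDictionary;

public class BigramLookupSketch {
  public static void main(String[] args) {
    BigramDictionary dict = BigramDictionary.getInstance();
    // frequency of "阳光" followed by "海岸"; the actual value depends on the data
    int freq = dict.getFrequency("阳光@海岸".toCharArray());
    System.out.println(freq);
  }
}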
|
|
@@ -0,0 +1,193 @@
/**
 * Copyright 2009 www.imdict.net
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.cn.smart.hhmm;

import java.util.List;

import org.apache.lucene.analysis.cn.smart.CharType;
import org.apache.lucene.analysis.cn.smart.Utility;
import org.apache.lucene.analysis.cn.smart.WordType;

public class HHMMSegmenter {

  private static WordDictionary wordDict = WordDictionary.getInstance();

  /**
   * Find all possible Tokens in the sentence, then append two special Tokens,
   * "始##始" (sentence begin) and "末##末" (sentence end). The begin Token
   * starts at -1, and the end Token starts at the length of the sentence.
   *
   * @param sentence the input sentence, without "始##始" or "末##末"
   * @return all possible Tokens
   * @see MultiTokenMap
   */
  private SegGraph createSegGraph(String sentence) {
    int i = 0, j;
    int length = sentence.length();
    int foundIndex;
    int[] charTypeArray = getCharTypes(sentence);
    StringBuffer wordBuf = new StringBuffer();
    SegToken token;
    int frequency = 0; // occurrence count of the current word
    boolean hasFullWidth;
    int wordType;
    char[] charArray;

    SegGraph segGraph = new SegGraph();
    while (i < length) {
      hasFullWidth = false;
      switch (charTypeArray[i]) {
        case CharType.SPACE_LIKE:
          i++;
          break;
        case CharType.HANZI:
          j = i + 1;
          wordBuf.delete(0, wordBuf.length());
          // Store every single CJK character in segGraph, whether or not it
          // forms a word by itself; otherwise the segmentation graph would
          // have gaps at that character.
          wordBuf.append(sentence.charAt(i));
          charArray = new char[] { sentence.charAt(i) };
          frequency = wordDict.getFrequency(charArray);
          token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
              frequency);
          segGraph.addToken(token);

          foundIndex = wordDict.getPrefixMatch(charArray);
          while (j <= length && foundIndex != -1) {
            if (wordDict.isEqual(charArray, foundIndex) && charArray.length > 1) {
              // A word running from i to j was found in the dictionary, and
              // it is not a single-character word, so add it as a SegToken.
              frequency = wordDict.getFrequency(charArray);
              token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
                  frequency);
              segGraph.addToken(token);
            }

            while (j < length && charTypeArray[j] == CharType.SPACE_LIKE)
              j++;

            if (j < length && charTypeArray[j] == CharType.HANZI) {
              wordBuf.append(sentence.charAt(j));
              charArray = new char[wordBuf.length()];
              wordBuf.getChars(0, charArray.length, charArray, 0);
              // charArray was already found as a prefix (foundIndex != -1),
              // so the extended charArray can only occur at or after
              // foundIndex; start searching from there.
              foundIndex = wordDict.getPrefixMatch(charArray, foundIndex);
              j++;
            } else {
              break;
            }
          }
          i++;
          break;
        case CharType.FULLWIDTH_LETTER:
          hasFullWidth = true;
          // fall through
        case CharType.LETTER:
          j = i + 1;
          while (j < length
              && (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER)) {
            if (charTypeArray[j] == CharType.FULLWIDTH_LETTER)
              hasFullWidth = true;
            j++;
          }
          // A Token of type LETTER was found, running from i to j.
          charArray = Utility.STRING_CHAR_ARRAY;
          frequency = wordDict.getFrequency(charArray);
          wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING;
          token = new SegToken(charArray, i, j, wordType, frequency);
          segGraph.addToken(token);
          i = j;
          break;
        case CharType.FULLWIDTH_DIGIT:
          hasFullWidth = true;
          // fall through
        case CharType.DIGIT:
          j = i + 1;
          while (j < length
              && (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT)) {
            if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT)
              hasFullWidth = true;
            j++;
          }
          // A Token of type NUMBER was found, running from i to j.
          charArray = Utility.NUMBER_CHAR_ARRAY;
          frequency = wordDict.getFrequency(charArray);
          wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER;
          token = new SegToken(charArray, i, j, wordType, frequency);
          segGraph.addToken(token);
          i = j;
          break;
        case CharType.DELIMITER:
          j = i + 1;
          // No need to look up the weight of a delimiter;
          // just use the maximum frequency.
          frequency = Utility.MAX_FREQUENCE;
          charArray = new char[] { sentence.charAt(i) };
          token = new SegToken(charArray, i, j, WordType.DELIMITER, frequency);
          segGraph.addToken(token);
          i = j;
          break;
        default:
          j = i + 1;
          // Treat unrecognized characters (e.g. characters outside GB2312)
          // as unknown strings, one character at a time.
          charArray = Utility.STRING_CHAR_ARRAY;
          frequency = wordDict.getFrequency(charArray);
          token = new SegToken(charArray, i, j, WordType.STRING, frequency);
          segGraph.addToken(token);
          i = j;
          break;
      }
    }

    // Add two new Tokens to segGraph: "始##始" (begin) and "末##末" (end).
    charArray = Utility.START_CHAR_ARRAY;
    frequency = wordDict.getFrequency(charArray);
    token = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency);
    segGraph.addToken(token);

    // "末##末"
    charArray = Utility.END_CHAR_ARRAY;
    frequency = wordDict.getFrequency(charArray);
    token = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END,
        frequency);
    segGraph.addToken(token);

    return segGraph;
  }

  /**
   * Determine a unique character type for every character in the sentence.
   *
   * @see Utility.getCharType(char)
   * @param sentence the complete input sentence
   * @return the array of character types; null if the input is null
   */
  private static int[] getCharTypes(String sentence) {
    int length = sentence.length();
    int[] charTypeArray = new int[length];
    // Build the character type array, one entry per character.
    for (int i = 0; i < length; i++) {
      charTypeArray[i] = Utility.getCharType(sentence.charAt(i));
    }

    return charTypeArray;
  }

  public List process(String sentence) {
    SegGraph segGraph = createSegGraph(sentence);
    BiSegGraph biSegGraph = new BiSegGraph(segGraph);
    List shortPath = biSegGraph.getShortPath();
    return shortPath;
  }
}
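A hypothetical driver for the segmenter, assuming the dictionary files are installed where AnalyzerProfile.ANALYSIS_DATA_DIR points; the sample sentence is made up, and process is assumed to return the SegTokens of the selected shortest path:

import java.util.Iterator;
import java.util.List;

import org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter;
import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;

public class SegmenterSketch {
  public static void main(String[] args) {
    HHMMSegmenter segmenter = new HHMMSegmenter();
    // Segment one sentence against the loaded core and bigram dictionaries.
    List tokens = segmenter.process("我是中国人");
    for (Iterator iter = tokens.iterator(); iter.hasNext();) {
      SegToken token = (SegToken) iter.next();
      System.out.println(String.valueOf(token.charArray) + " ["
          + token.startOffset + "," + token.endOffset + ")");
    }
  }
}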
@@ -0,0 +1,33 @@
/**
 * Copyright 2009 www.imdict.net
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.cn.smart.hhmm;

public class PathNode implements Comparable {
  public double weight;

  public int preNode;

  public int compareTo(Object p) {
    PathNode pn = (PathNode) p;
    if (weight < pn.weight)
      return -1;
    else if (weight == pn.weight)
      return 0;
    else
      return 1;
  }
}
@@ -0,0 +1,144 @@
/**
 * Copyright 2009 www.imdict.net
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.cn.smart.hhmm;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

public class SegGraph {

  /**
   * Tokens that share a startOffset are kept in one ArrayList; that
   * startOffset is the Token's key in the map.
   */
  private Map tokenListTable = new HashMap();

  private int maxStart = -1;

  /**
   * Check whether any Token with startOffset s exists; if not, either there
   * is no Token at s or none has been added yet.
   *
   * @param s startOffset
   * @return true if some Token starts at s
   */
  public boolean isStartExist(int s) {
    return tokenListTable.get(new Integer(s)) != null;
  }

  /**
   * Get all Tokens whose startOffset is s, or null if there are none.
   *
   * @param s startOffset
   * @return the list of all Tokens sharing this startOffset
   */
  public List getStartList(int s) {
    return (List) tokenListTable.get(new Integer(s));
  }

  public int getMaxStart() {
    return maxStart;
  }

  /**
   * Assign a single global index, starting at 0, to all Tokens in the
   * SegGraph, ordered by increasing startOffset; Tokens with the same
   * startOffset keep their insertion order.
   */
  public List makeIndex() {
    List result = new ArrayList();
    int s = -1, count = 0, size = tokenListTable.size();
    List tokenList;
    short index = 0;
    while (count < size) {
      if (isStartExist(s)) {
        tokenList = (List) tokenListTable.get(new Integer(s));
        for (Iterator iter = tokenList.iterator(); iter.hasNext();) {
          SegToken st = (SegToken) iter.next();
          st.index = index;
          result.add(st);
          index++;
        }
        count++;
      }
      s++;
    }
    return result;
  }

  /**
   * Add a Token to the map; Tokens with the same startOffset go into the
   * same list.
   *
   * @param token the Token to add
   */
  public void addToken(SegToken token) {
    int s = token.startOffset;
    if (!isStartExist(s)) {
      ArrayList newlist = new ArrayList();
      newlist.add(token);
      tokenListTable.put((Object) (new Integer(s)), newlist);
    } else {
      List tokenList = (List) tokenListTable.get((Object) (new Integer(s)));
      tokenList.add(token);
    }
    if (s > maxStart)
      maxStart = s;
  }

  /**
   * Get the number of distinct start positions in the SegGraph. A start
   * position may hold several Tokens, so this differs from the Token count.
   *
   * @return the number of distinct start positions
   */
  public int getStartCount() {
    return tokenListTable.size();
  }

  /**
   * Collect all Tokens stored in the map into one list, ordered by
   * increasing start position.
   *
   * @return all Tokens, ordered by start position
   */
  public List toTokenList() {
    List result = new ArrayList();
    int s = -1, count = 0, size = tokenListTable.size();
    List tokenList;

    while (count < size) {
      if (isStartExist(s)) {
        tokenList = (List) tokenListTable.get(new Integer(s));
        for (Iterator iter = tokenList.iterator(); iter.hasNext();) {
          SegToken st = (SegToken) iter.next();
          result.add(st);
        }
        count++;
      }
      s++;
    }
    return result;
  }

  public String toString() {
    List tokenList = this.toTokenList();
    StringBuffer sb = new StringBuffer();
    for (Iterator iter = tokenList.iterator(); iter.hasNext();) {
      SegToken t = (SegToken) iter.next();
      sb.append(t + "\n");
    }
    return sb.toString();
  }
}
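A minimal sketch of the ordering contract of makeIndex, using made-up words and weights: Tokens may be added in any order, and makeIndex then numbers them by increasing startOffset, preserving insertion order within one position.

import java.util.List;

import org.apache.lucene.analysis.cn.smart.WordType;
import org.apache.lucene.analysis.cn.smart.hhmm.SegGraph;
import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;

public class SegGraphSketch {
  public static void main(String[] args) {
    SegGraph graph = new SegGraph();
    // startOffsets are deliberately added out of order.
    graph.addToken(new SegToken("国人", 2, 4, WordType.CHINESE_WORD, 10));
    graph.addToken(new SegToken("中国", 1, 3, WordType.CHINESE_WORD, 20));
    graph.addToken(new SegToken("中", 1, 2, WordType.CHINESE_WORD, 5));
    List ordered = graph.makeIndex();
    // Indices now run 中国=0, 中=1, 国人=2: increasing startOffset first,
    // insertion order within the same position.
    for (int i = 0; i < ordered.size(); i++) {
      SegToken t = (SegToken) ordered.get(i);
      System.out.println(t.index + ": " + String.valueOf(t.charArray));
    }
  }
}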
@@ -0,0 +1,64 @@
/**
 * Copyright 2009 www.imdict.net
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.cn.smart.hhmm;

public class SegToken {
  public char[] charArray;

  public int startOffset;

  public int endOffset;

  public int wordType;

  public int weight;

  public int index;

  public SegToken(String word, int start, int end, int wordType, int weight) {
    this.charArray = word.toCharArray();
    this.startOffset = start;
    this.endOffset = end;
    this.wordType = wordType;
    this.weight = weight;
  }

  public SegToken(char[] idArray, int start, int end, int wordType, int weight) {
    this.charArray = idArray;
    this.startOffset = start;
    this.endOffset = end;
    this.wordType = wordType;
    this.weight = weight;
  }

  // public String toString() {
  // return String.valueOf(charArray) + "/s(" + startOffset + ")e("
  // + endOffset + ")/w(" + weight + ")t(" + wordType + ")";
  // }

  /**
   * Two Tokens are equal exactly when their offsets are equal, since then
   * they cover the same content of the original sentence; pos and weight can
   * each map to several dictionary entries and can be represented
   * one-to-many, so a single Token per offset pair is enough.
   *
   * @param t the Token to compare against
   * @return whether the two Tokens are equal
   */
  // public boolean equals(RawToken t) {
  // return this.startOffset == t.startOffset
  // && this.endOffset == t.endOffset;
  // }
}
@@ -0,0 +1,50 @@
/**
 * Copyright 2009 www.imdict.net
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.cn.smart.hhmm;

import org.apache.lucene.analysis.cn.smart.Utility;
import org.apache.lucene.analysis.cn.smart.WordType;

public class SegTokenFilter {

  public SegToken filter(SegToken token) {
    switch (token.wordType) {
      case WordType.FULLWIDTH_NUMBER:
      case WordType.FULLWIDTH_STRING:
        for (int i = 0; i < token.charArray.length; i++) {
          // Fold full-width forms (U+FF10 and above) down to ASCII.
          if (token.charArray[i] >= 0xFF10)
            token.charArray[i] -= 0xFEE0;

          // Lower-case ASCII letters A-Z.
          if (token.charArray[i] >= 0x0041 && token.charArray[i] <= 0x005A)
            token.charArray[i] += 0x0020;
        }
        break;
      case WordType.STRING:
        for (int i = 0; i < token.charArray.length; i++) {
          // Lower-case ASCII letters A-Z.
          if (token.charArray[i] >= 0x0041 && token.charArray[i] <= 0x005A)
            token.charArray[i] += 0x0020;
        }
        break;
      case WordType.DELIMITER:
        // Normalize every delimiter to the common delimiter character.
        token.charArray = Utility.COMMON_DELIMITER;
        break;
      default:
        break;
    }
    return token;
  }
}
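The two magic offsets in the filter come straight from Unicode: the full-width forms U+FF01..U+FF5E sit exactly 0xFEE0 above their ASCII counterparts U+0021..U+007E, and an ASCII capital letter is 0x20 below its lower-case form. A quick standalone check:

public class WidthFoldSketch {
  public static void main(String[] args) {
    char fullWidthOne = '\uFF11'; // full-width digit '1'
    char fullWidthA = '\uFF21';   // full-width letter 'A'
    // Full-width to ASCII: subtract 0xFEE0.
    System.out.println((char) (fullWidthOne - 0xFEE0)); // prints 1
    // Fold to ASCII, then add 0x20 to lower-case, as the filter does.
    System.out.println((char) (fullWidthA - 0xFEE0 + 0x0020)); // prints a
  }
}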
@@ -0,0 +1,48 @@
/**
 * Copyright 2009 www.imdict.net
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.cn.smart.hhmm;

public class SegTokenPair {

  public char[] charArray;

  /**
   * from and to are the indices of this pair's two Tokens in the segGraph.
   */
  public int from;

  public int to;

  public double weight;

  public SegTokenPair(char[] idArray, int from, int to, double weight) {
    this.charArray = idArray;
    this.from = from;
    this.to = to;
    this.weight = weight;
  }

  // public String toString() {
  // return String.valueOf(charArray) + ":f(" + from + ")t(" + to + "):"
  // + weight;
  // }

  // public boolean equals(SegTokenPair tp) {
  // return this.from == tp.from && this.to == tp.to;
  // }

}
@ -0,0 +1,568 @@
|
|||
/**
|
||||
* Copyright 2009 www.imdict.net
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.cn.smart.hhmm;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.ObjectInputStream;
|
||||
import java.io.ObjectOutputStream;
|
||||
import java.io.RandomAccessFile;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
|
||||
import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
|
||||
import org.apache.lucene.analysis.cn.smart.Utility;
|
||||
|
||||
public class WordDictionary extends AbstractDictionary {
|
||||
|
||||
private WordDictionary() {
|
||||
}
|
||||
|
||||
private static WordDictionary singleInstance;
|
||||
|
||||
/**
|
||||
* 一个较大的素数,保证hash查找能够遍历所有位置
|
||||
*/
|
||||
public static final int PRIME_INDEX_LENGTH = 12071;
|
||||
|
||||
/**
|
||||
* wordIndexTable保证将Unicode中的所有汉字编码hash到PRIME_INDEX_LENGTH长度的数组中,
|
||||
* 当然会有冲突,但实际上本程序只处理GB2312字符部分,6768个字符加上一些ASCII字符,
|
||||
* 因此对这些字符是有效的,为了保证比较的准确性,保留原来的字符在charIndexTable中以确定查找的准确性
|
||||
*/
|
||||
private short[] wordIndexTable;
|
||||
|
||||
private char[] charIndexTable;
|
||||
|
||||
/**
|
||||
* 存储所有词库的真正数据结构,为了避免占用空间太多,用了两个单独的多维数组来存储词组和频率。
|
||||
* 每个词放在一个char[]中,每个char对应一个汉字或其他字符,每个频率放在一个int中,
|
||||
* 这两个数组的前两个下表是一一对应的。因此可以利用wordItem_charArrayTable[i][j]来查词,
|
||||
* 用wordItem_frequencyTable[i][j]来查询对应的频率
|
||||
*/
|
||||
private char[][][] wordItem_charArrayTable;
|
||||
|
||||
private int[][] wordItem_frequencyTable;
|
||||
|
||||
// static Logger log = Logger.getLogger(WordDictionary.class);
|
||||
|
||||
public synchronized static WordDictionary getInstance() {
|
||||
if (singleInstance == null) {
|
||||
singleInstance = new WordDictionary();
|
||||
try {
|
||||
singleInstance.load();
|
||||
} catch (IOException e) {
|
||||
String wordDictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
|
||||
singleInstance.load(wordDictRoot);
|
||||
} catch (ClassNotFoundException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
return singleInstance;
|
||||
}
|
||||
|
||||
/**
|
||||
* 从外部文件夹dctFileRoot加载词典库文件,首先测试是否有coredict.mem文件, 如果有则直接作为序列化对象加载,
|
||||
* 如果没有则加载词典库源文件coredict.dct
|
||||
*
|
||||
* @param dctFileName 词典库文件的路径
|
||||
*/
|
||||
public void load(String dctFileRoot) {
|
||||
String dctFilePath = dctFileRoot + "/coredict.dct";
|
||||
File serialObj = new File(dctFileRoot + "/coredict.mem");
|
||||
|
||||
if (serialObj.exists() && loadFromObj(serialObj)) {
|
||||
|
||||
} else {
|
||||
try {
|
||||
wordIndexTable = new short[PRIME_INDEX_LENGTH];
|
||||
charIndexTable = new char[PRIME_INDEX_LENGTH];
|
||||
for (int i = 0; i < PRIME_INDEX_LENGTH; i++) {
|
||||
charIndexTable[i] = 0;
|
||||
wordIndexTable[i] = -1;
|
||||
}
|
||||
wordItem_charArrayTable = new char[GB2312_CHAR_NUM][][];
|
||||
wordItem_frequencyTable = new int[GB2312_CHAR_NUM][];
|
||||
// int total =
|
||||
loadMainDataFromFile(dctFilePath);
|
||||
expandDelimiterData();
|
||||
mergeSameWords();
|
||||
sortEachItems();
|
||||
// log.info("load dictionary: " + dctFilePath + " total:" + total);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e.getMessage());
|
||||
}
|
||||
|
||||
saveToObj(serialObj);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 从jar内部加载词典库文件,要求保证WordDictionary类当前路径中有coredict.mem文件,以将其作为序列化对象加载
|
||||
*
|
||||
* @param dctFileName 词典库文件的路径
|
||||
* @throws ClassNotFoundException
|
||||
* @throws IOException
|
||||
*/
|
||||
public void load() throws IOException, ClassNotFoundException {
|
||||
InputStream input = this.getClass().getResourceAsStream("coredict.mem");
|
||||
loadFromObjectInputStream(input);
|
||||
}
|
||||
|
||||
private boolean loadFromObj(File serialObj) {
|
||||
try {
|
||||
loadFromObjectInputStream(new FileInputStream(serialObj));
|
||||
return true;
|
||||
} catch (FileNotFoundException e) {
|
||||
e.printStackTrace();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
} catch (ClassNotFoundException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private void loadFromObjectInputStream(InputStream serialObjectInputStream)
|
||||
throws IOException, ClassNotFoundException {
|
||||
ObjectInputStream input = new ObjectInputStream(serialObjectInputStream);
|
||||
wordIndexTable = (short[]) input.readObject();
|
||||
charIndexTable = (char[]) input.readObject();
|
||||
wordItem_charArrayTable = (char[][][]) input.readObject();
|
||||
wordItem_frequencyTable = (int[][]) input.readObject();
|
||||
// log.info("load core dict from serialization.");
|
||||
input.close();
|
||||
}
|
||||
|
||||
private void saveToObj(File serialObj) {
|
||||
try {
|
||||
ObjectOutputStream output = new ObjectOutputStream(new FileOutputStream(
|
||||
serialObj));
|
||||
output.writeObject(wordIndexTable);
|
||||
output.writeObject(charIndexTable);
|
||||
output.writeObject(wordItem_charArrayTable);
|
||||
output.writeObject(wordItem_frequencyTable);
|
||||
output.close();
|
||||
// log.info("serialize core dict.");
|
||||
} catch (Exception e) {
|
||||
// log.warn(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 将词库文件加载到WordDictionary的相关数据结构中,只是加载,没有进行合并和修改操作
|
||||
*
|
||||
* @param dctFilePath
|
||||
* @return
|
||||
* @throws FileNotFoundException
|
||||
* @throws IOException
|
||||
* @throws UnsupportedEncodingException
|
||||
*/
|
||||
private int loadMainDataFromFile(String dctFilePath)
|
||||
throws FileNotFoundException, IOException, UnsupportedEncodingException {
|
||||
int i, cnt, length, total = 0;
|
||||
// 文件中只统计了6763个汉字加5个空汉字符3756~3760,其中第3756个用来存储符号信息。
|
||||
int[] buffer = new int[3];
|
||||
byte[] intBuffer = new byte[4];
|
||||
String tmpword;
|
||||
RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");
|
||||
|
||||
// 字典文件中第一个汉字出现的位置是0,最后一个是6768
|
||||
for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
|
||||
// if (i == 5231)
|
||||
// System.out.println(i);
|
||||
|
||||
dctFile.read(intBuffer);// 原词库文件在c下开发,所以写入的文件为little
|
||||
// endian编码,而java为big endian,必须转换过来
|
||||
cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
|
||||
if (cnt <= 0) {
|
||||
wordItem_charArrayTable[i] = null;
|
||||
wordItem_frequencyTable[i] = null;
|
||||
continue;
|
||||
}
|
||||
wordItem_charArrayTable[i] = new char[cnt][];
|
||||
wordItem_frequencyTable[i] = new int[cnt];
|
||||
total += cnt;
|
||||
int j = 0;
|
||||
while (j < cnt) {
|
||||
// wordItemTable[i][j] = new WordItem();
|
||||
dctFile.read(intBuffer);
|
||||
buffer[0] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
|
||||
.getInt();// frequency
|
||||
dctFile.read(intBuffer);
|
||||
buffer[1] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
|
||||
.getInt();// length
|
||||
dctFile.read(intBuffer);
|
||||
buffer[2] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
|
||||
.getInt();// handle
|
||||
|
||||
// wordItemTable[i][j].frequency = buffer[0];
|
||||
wordItem_frequencyTable[i][j] = buffer[0];
|
||||
|
||||
length = buffer[1];
|
||||
if (length > 0) {
|
||||
byte[] lchBuffer = new byte[length];
|
||||
dctFile.read(lchBuffer);
|
||||
tmpword = new String(lchBuffer, "GB2312");
|
||||
// indexTable[i].wordItems[j].word = tmpword;
|
||||
// wordItemTable[i][j].charArray = tmpword.toCharArray();
|
||||
wordItem_charArrayTable[i][j] = tmpword.toCharArray();
|
||||
} else {
|
||||
// wordItemTable[i][j].charArray = null;
|
||||
wordItem_charArrayTable[i][j] = null;
|
||||
}
|
||||
// System.out.println(indexTable[i].wordItems[j]);
|
||||
j++;
|
||||
}
|
||||
|
||||
String str = getCCByGB2312Id(i);
|
||||
setTableIndex(str.charAt(0), i);
|
||||
}
|
||||
dctFile.close();
|
||||
return total;
|
||||
}
|
||||
|
||||
/**
|
||||
* 原词库将所有标点符号的信息合并到一个列表里(从1开始的3755处)。这里将其展开,分别放到各个符号对应的列表中
|
||||
*/
|
||||
private void expandDelimiterData() {
|
||||
int i;
|
||||
int cnt;
|
||||
// 标点符号在从1开始的3755处,将原始的标点符号对应的字典分配到对应的标点符号中
|
||||
int delimiterIndex = 3755 + GB2312_FIRST_CHAR;
|
||||
i = 0;
|
||||
while (i < wordItem_charArrayTable[delimiterIndex].length) {
|
||||
char c = wordItem_charArrayTable[delimiterIndex][i][0];
|
||||
int j = getGB2312Id(c);// 该标点符号应该所在的index值
|
||||
if (wordItem_charArrayTable[j] == null) {
|
||||
|
||||
int k = i;
|
||||
// 从i开始计数后面以j开头的符号的worditem的个数
|
||||
while (k < wordItem_charArrayTable[delimiterIndex].length
|
||||
&& wordItem_charArrayTable[delimiterIndex][k][0] == c) {
|
||||
k++;
|
||||
}
|
||||
// 此时k-i为id为j的标点符号对应的wordItem的个数
|
||||
cnt = k - i;
|
||||
if (cnt != 0) {
|
||||
wordItem_charArrayTable[j] = new char[cnt][];
|
||||
wordItem_frequencyTable[j] = new int[cnt];
|
||||
}
|
||||
|
||||
// 为每一个wordItem赋值
|
||||
for (k = 0; k < cnt; k++, i++) {
|
||||
// wordItemTable[j][k] = new WordItem();
|
||||
wordItem_frequencyTable[j][k] = wordItem_frequencyTable[delimiterIndex][i];
|
||||
wordItem_charArrayTable[j][k] = new char[wordItem_charArrayTable[delimiterIndex][i].length - 1];
|
||||
System.arraycopy(wordItem_charArrayTable[delimiterIndex][i], 1,
|
||||
wordItem_charArrayTable[j][k], 0,
|
||||
wordItem_charArrayTable[j][k].length);
|
||||
}
|
||||
setTableIndex(c, j);
|
||||
}
|
||||
}
|
||||
// 将原符号对应的数组删除
|
||||
wordItem_charArrayTable[delimiterIndex] = null;
|
||||
wordItem_frequencyTable[delimiterIndex] = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* 本程序不做词性标注,因此将相同词不同词性的频率合并到同一个词下,以减小存储空间,加快搜索速度
|
||||
*/
|
||||
private void mergeSameWords() {
|
||||
int i;
|
||||
for (i = 0; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
|
||||
if (wordItem_charArrayTable[i] == null)
|
||||
continue;
|
||||
int len = 1;
|
||||
for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
|
||||
if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
|
||||
wordItem_charArrayTable[i][j - 1], 0) != 0)
|
||||
len++;
|
||||
|
||||
}
|
||||
if (len < wordItem_charArrayTable[i].length) {
|
||||
char[][] tempArray = new char[len][];
|
||||
int[] tempFreq = new int[len];
|
||||
int k = 0;
|
||||
tempArray[0] = wordItem_charArrayTable[i][0];
|
||||
tempFreq[0] = wordItem_frequencyTable[i][0];
|
||||
for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
|
||||
if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
|
||||
tempArray[k], 0) != 0) {
|
||||
k++;
|
||||
// temp[k] = wordItemTable[i][j];
|
||||
tempArray[k] = wordItem_charArrayTable[i][j];
|
||||
tempFreq[k] = wordItem_frequencyTable[i][j];
|
||||
} else {
|
||||
// temp[k].frequency += wordItemTable[i][j].frequency;
|
||||
tempFreq[k] += wordItem_frequencyTable[i][j];
|
||||
}
|
||||
}
|
||||
// wordItemTable[i] = temp;
|
||||
wordItem_charArrayTable[i] = tempArray;
|
||||
wordItem_frequencyTable[i] = tempFreq;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
  /**
   * Sorts each word list with a simple exchange sort, so that the binary
   * searches in findInTable() and getPrefixMatch() are valid.
   */
  private void sortEachItems() {
    char[] tmpArray;
    int tmpFreq;
    for (int i = 0; i < wordItem_charArrayTable.length; i++) {
      if (wordItem_charArrayTable[i] != null
          && wordItem_charArrayTable[i].length > 1) {
        for (int j = 0; j < wordItem_charArrayTable[i].length - 1; j++) {
          for (int j2 = j + 1; j2 < wordItem_charArrayTable[i].length; j2++) {
            if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
                wordItem_charArrayTable[i][j2], 0) > 0) {
              tmpArray = wordItem_charArrayTable[i][j];
              tmpFreq = wordItem_frequencyTable[i][j];
              wordItem_charArrayTable[i][j] = wordItem_charArrayTable[i][j2];
              wordItem_frequencyTable[i][j] = wordItem_frequencyTable[i][j2];
              wordItem_charArrayTable[i][j2] = tmpArray;
              wordItem_frequencyTable[i][j2] = tmpFreq;
            }
          }
        }
      }
    }
  }

  /**
   * Compute the hash-table slot for character c and initialize that slot,
   * recording c and the position j of its word list.
   *
   * @param c the character to index
   * @param j the position of the word list for c
   * @return true if a slot was found and set, false if the table is full
   */
  private boolean setTableIndex(char c, int j) {
    int index = getAvaliableTableIndex(c);
    if (index != -1) {
      charIndexTable[index] = c;
      wordIndexTable[index] = (short) j;
      return true;
    } else
      return false;
  }

  private short getAvaliableTableIndex(char c) {
    int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
    int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
    if (hash1 < 0)
      hash1 = PRIME_INDEX_LENGTH + hash1;
    if (hash2 < 0)
      hash2 = PRIME_INDEX_LENGTH + hash2;
    int index = hash1;
    int i = 1;
    // double hashing: on a collision, step by hash2 until a free or matching
    // slot is found, or the whole table has been probed
    while (charIndexTable[index] != 0 && charIndexTable[index] != c
        && i < PRIME_INDEX_LENGTH) {
      index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
      i++;
    }
    // System.out.println(i - 1);

    if (i < PRIME_INDEX_LENGTH
        && (charIndexTable[index] == 0 || charIndexTable[index] == c)) {
      return (short) index;
    } else
      return -1;
  }

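  // Illustrative sketch (not part of the original source) of the double-hash
  // probe sequence used above:
  //   index(0) = hash1
  //   index(i) = (hash1 + i * hash2) % PRIME_INDEX_LENGTH,  i = 1, 2, ...
  // Since PRIME_INDEX_LENGTH is, as its name suggests, prime, the sequence
  // can reach every slot (whenever hash2 % PRIME_INDEX_LENGTH != 0) before
  // the search gives up.
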
  /**
   * Look up the hash-table slot assigned to character c.
   *
   * @param c the character to look up
   * @return the slot index, or -1 if c is not in the table
   */
  private short getWordItemTableIndex(char c) {
    int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
    int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
    if (hash1 < 0)
      hash1 = PRIME_INDEX_LENGTH + hash1;
    if (hash2 < 0)
      hash2 = PRIME_INDEX_LENGTH + hash2;
    int index = hash1;
    int i = 1;
    while (charIndexTable[index] != 0 && charIndexTable[index] != c
        && i < PRIME_INDEX_LENGTH) {
      index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
      i++;
    }

    if (i < PRIME_INDEX_LENGTH && charIndexTable[index] == c) {
      return (short) index;
    } else
      return -1;
  }

  /**
   * Look up the word whose characters are given by charArray in the
   * dictionary.
   *
   * @param charArray the char array of the word to look up
   * @return the position of the word within its word list, or -1 if not found
   */
  private int findInTable(char[] charArray) {
    if (charArray == null || charArray.length == 0)
      return -1;
    short index = getWordItemTableIndex(charArray[0]);
    if (index == -1)
      return -1;

    return findInTable(index, charArray);
  }

  /**
   * Look up the word given by charArray, assuming the hash-table slot of its
   * first character is already known.
   *
   * @param knownHashIndex the hash-table slot of charArray[0]; if it has not
   *        been computed yet, use findInTable(char[] charArray) instead
   * @param charArray the char array of the word to look up
   * @return the position of the word within its word list, or -1 if not found
   */
  private int findInTable(short knownHashIndex, char[] charArray) {
    if (charArray == null || charArray.length == 0)
      return -1;

    char[][] items = wordItem_charArrayTable[wordIndexTable[knownHashIndex]];
    int start = 0, end = items.length - 1;
    int mid = (start + end) / 2, cmpResult;

    // Binary search for charArray. Entries are stored without their first
    // character (it is implied by the slot), hence the offset of 1 below.
    while (start <= end) {
      cmpResult = Utility.compareArray(items[mid], 0, charArray, 1);

      if (cmpResult == 0)
        return mid;// found it
      else if (cmpResult < 0)
        start = mid + 1;
      else if (cmpResult > 0)
        end = mid - 1;

      mid = (start + end) / 2;
    }
    return -1;
  }

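  // Illustrative sketch (not part of the original source): looking up the
  // hypothetical word "中国" proceeds roughly as
  //   short h = getWordItemTableIndex('中');   // hash slot of the first char
  //   char[][] items = wordItem_charArrayTable[wordIndexTable[h]];
  //   int pos = findInTable(h, new char[] {'中', '国'});
  // items stores each word minus its first character, which is why
  // compareArray() above reads charArray starting from offset 1.
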
  /**
   * Whether the word given by charArray occurs in the WordDictionary.
   *
   * @param charArray the char array of the word
   * @return true if the word exists, false otherwise
   */
  public boolean isExist(char[] charArray) {
    return findInTable(charArray) != -1;
  }

  /**
   * @see #getPrefixMatch(char[], int)
   * @param charArray the prefix word
   * @return the position of the first word having charArray as a prefix, or
   *         -1 if there is none
   */
  public int getPrefixMatch(char[] charArray) {
    return getPrefixMatch(charArray, 0);
  }

  /**
   * Search the dictionary for words that have the word in charArray as a
   * prefix, and return the position of the first match. To reduce search
   * cost, the caller may supply a known start position; when no start
   * position is known, use 0 (or the single-argument overload).
   *
   * @see #getPrefixMatch(char[])
   * @param charArray the prefix word
   * @param knownStart a known lower bound for the search
   * @return the position of the first word matching the prefix, or -1
   */
  public int getPrefixMatch(char[] charArray, int knownStart) {
    short index = getWordItemTableIndex(charArray[0]);
    if (index == -1)
      return -1;
    char[][] items = wordItem_charArrayTable[wordIndexTable[index]];
    int start = knownStart, end = items.length - 1;

    int mid = (start + end) / 2, cmpResult;

    // Binary search for any word carrying the prefix
    while (start <= end) {
      cmpResult = Utility.compareArrayByPrefix(charArray, 1, items[mid], 0);
      if (cmpResult == 0) {
        // scan backwards to the first item that matches the prefix
        while (mid >= 0
            && Utility.compareArrayByPrefix(charArray, 1, items[mid], 0) == 0)
          mid--;
        mid++;
        return mid;// the first word having charArray as a prefix
      } else if (cmpResult < 0)
        end = mid - 1;
      else
        start = mid + 1;
      mid = (start + end) / 2;
    }
    return -1;
  }

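  // Illustrative usage sketch (not part of the original source; `dic` is a
  // loaded WordDictionary). A segmenter can grow a candidate word one
  // character at a time, passing the previous result as knownStart, because
  // a longer prefix can never match before a shorter one in the sorted list:
  //   int p = dic.getPrefixMatch(new char[] {'中'});
  //   if (p != -1)
  //     p = dic.getPrefixMatch(new char[] {'中', '国'}, p);
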
  /**
   * Get the frequency of the word given by charArray. Because
   * mergeSameWords() merges entries across parts of speech, this is the
   * combined frequency over all parts of speech.
   *
   * @param charArray the char array of the word
   * @return the word's frequency, or 0 if the word is not in the dictionary
   */
  public int getFrequency(char[] charArray) {
    short hashIndex = getWordItemTableIndex(charArray[0]);
    if (hashIndex == -1)
      return 0;
    int itemIndex = findInTable(hashIndex, charArray);
    if (itemIndex != -1)
      return wordItem_frequencyTable[wordIndexTable[hashIndex]][itemIndex];
    return 0;
  }

  /**
   * Whether the word in charArray equals the dictionary entry at itemIndex
   * within the word list selected by charArray[0]; in other words, whether
   * looking up charArray would yield exactly itemIndex.
   *
   * @param charArray the input word; its first character selects the word list
   * @param itemIndex the position within that word list
   * @return true if the entries are equal
   */
  public boolean isEqual(char[] charArray, int itemIndex) {
    short hashIndex = getWordItemTableIndex(charArray[0]);
    return Utility.compareArray(charArray, 1,
        wordItem_charArrayTable[wordIndexTable[hashIndex]][itemIndex], 0) == 0;
  }

  public static void main(String[] args) throws FileNotFoundException,
      IOException {
    // ad-hoc smoke test: load the dictionary and probe a few character types
    WordDictionary dic = new WordDictionary();
    dic.load("D:/analysis-data");
    Utility.getCharType('。');
    Utility.getCharType('汗');
    Utility.getCharType(' ');// 0020
    Utility.getCharType('　');// 3000
    Utility.getCharType('');// E095
    Utility.getCharType('　');// 3000
    Utility.getCharType('\r');// 000D
    Utility.getCharType('\n');// 000A
    Utility.getCharType('\t');// 0009
  }
}
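// Illustrative sketch (not part of the original source): the expected
// lifecycle of the dictionary, assuming "analysis-data" holds the dictionary
// files shipped with this contrib module (the path is hypothetical).
//
//   WordDictionary dic = new WordDictionary();
//   dic.load("analysis-data");
//   char[] word = new char[] {'面', '对'};
//   if (dic.isExist(word))
//     System.out.println(dic.getFrequency(word));
//
// While loading, expandDelimiterData(), mergeSameWords() and sortEachItems()
// are presumably run to normalize the tables so that the binary searches
// above are valid.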
Binary file not shown.
Binary file not shown.
@ -0,0 +1,58 @@
////////// strip out all punctuation ////////////////
,
.
`
-
_
=
?
'

"
(
)
{
}
[
]
<
>
*
#
&
^
$
@
!
~
:
;
+
/
\
《
》
—
-
,
。
、
:
;
!
·
?
“
”
)
(
【
】
[
]
●
// the Chinese full-width space character
　

//////////////// English stopwords ////////////////

//////////////// Chinese stopwords ////////////////
@ -0,0 +1,86 @@
/**
 * Copyright 2009 www.imdict.net
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.cn;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.util.Date;

import junit.framework.TestCase;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

public class TestSmartChineseAnalyzer extends TestCase {

  public void testChineseAnalyzer() throws IOException {
    Token nt = new Token();
    Analyzer ca = new SmartChineseAnalyzer(true);
    Reader sentence = new StringReader("我购买了道具和服装。");
    String[] result = { "我", "购买", "了", "道具", "和", "服装" };
    TokenStream ts = ca.tokenStream("sentence", sentence);
    int i = 0;
    nt = ts.next(nt);
    while (nt != null) {
      assertEquals(result[i], nt.term());
      i++;
      nt = ts.next(nt);
    }
    ts.close();
  }

  /**
   * @param args
   * @throws IOException
   */
  public static void main(String[] args) throws IOException {
    new TestSmartChineseAnalyzer().sampleMethod();
  }

  /**
   * Ad-hoc benchmark: tokenize a long passage, printing each token and the
   * elapsed time.
   *
   * @throws UnsupportedEncodingException
   * @throws FileNotFoundException
   * @throws IOException
   */
  private void sampleMethod() throws UnsupportedEncodingException,
      FileNotFoundException, IOException {
    Token nt = new Token();
    Analyzer ca = new SmartChineseAnalyzer(true);
    Reader sentence = new StringReader(
        "我从小就不由自主地认为自己长大以后一定得成为一个象我父亲一样的画家, 可能是父母潜移默化的影响。其实我根本不知道作为画家意味着什么,我是否喜欢,最重要的是否适合我,我是否有这个才华。其实人到中年的我还是不确定我最喜欢什么,最想做的是什么?我相信很多人和我一样有同样的烦恼。毕竟不是每个人都能成为作文里的宇航员,科学家和大教授。知道自己适合做什么,喜欢做什么,能做好什么其实是个非常困难的问题。"
            + "幸运的是,我想我的孩子不会为这个太过烦恼。通过老大,我慢慢发现美国高中的一个重要功能就是帮助学生分析他们的专长和兴趣,从而帮助他们选择大学的专业和未来的职业。我觉得帮助一个未成形的孩子找到她未来成长的方向是个非常重要的过程。"
            + "美国高中都有专门的职业顾问,通过接触不同的课程,和各种心理,个性,兴趣很多方面的问答来帮助每个学生找到最感兴趣的专业。这样的教育一般是要到高年级才开始, 可老大因为今年上计算机的课程就是研究一个职业走向的软件项目,所以她提前做了这些考试和面试。看来以后这样的教育会慢慢由电脑来测试了。老大带回家了一些试卷,我挑出一些给大家看看。这门课她花了2个多月才做完,这里只是很小的一部分。"
            + "在测试里有这样的一些问题:"
            + "你是个喜欢动手的人吗? 你喜欢修东西吗?你喜欢体育运动吗?你喜欢在室外工作吗?你是个喜欢思考的人吗?你喜欢数学和科学课吗?你喜欢一个人工作吗?你对自己的智力自信吗?你的创造能力很强吗?你喜欢艺术,音乐和戏剧吗? 你喜欢自由自在的工作环境吗?你喜欢尝试新的东西吗? 你喜欢帮助别人吗?你喜欢教别人吗?你喜欢和机器和工具打交道吗?你喜欢当领导吗?你喜欢组织活动吗?你什么和数字打交道吗?");
    TokenStream ts = ca.tokenStream("sentence", sentence);

    System.out.println("start: " + (new Date()));
    long before = System.currentTimeMillis();
    nt = ts.next(nt);
    while (nt != null) {
      System.out.println(nt.term());
      nt = ts.next(nt);
    }
    ts.close();
    long now = System.currentTimeMillis();
    System.out.println("time: " + (now - before) / 1000.0 + " s");
  }
}
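// A minimal indexing sketch (not part of the original test), assuming the
// Lucene 2.x API of this era; the directory and field names are illustrative:
//
//   IndexWriter writer = new IndexWriter(dir, new SmartChineseAnalyzer(true),
//       true, IndexWriter.MaxFieldLength.LIMITED);
//   Document doc = new Document();
//   doc.add(new Field("sentence", "我购买了道具和服装。",
//       Field.Store.YES, Field.Index.ANALYZED));
//   writer.addDocument(doc);
//   writer.close();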