Initial commit

magese 2018-05-10 13:52:58 +08:00
commit 5a497d99d0
34 changed files with 557678 additions and 0 deletions

10
.gitignore vendored Normal file

@@ -0,0 +1,10 @@
*.bak
*.class
*.log
.classpath
.project
.settings
target
.idea
*.iml

0
README.md Normal file

64
pom.xml Normal file

@@ -0,0 +1,64 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.wltea.ik-analyzer</groupId>
<artifactId>ik-analyzer-solr</artifactId>
<version>7.x</version>
<packaging>jar</packaging>
<name>ik-analyzer-solr</name>
<url>http://code.google.com/p/ik-analyzer/</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<lucene.version>7.3.0</lucene.version>
<javac.src.version>1.8</javac.src.version>
<javac.target.version>1.8</javac.target.version>
<maven.compiler.plugin.version>3.3</maven.compiler.plugin.version>
</properties>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>${lucene.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>${javac.src.version}</source>
<target>${javac.target.version}</target>
</configuration>
<version>${maven.compiler.plugin.version}</version>
</plugin>
</plugins>
</build>
</project>
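
With this POM, the analyzer builds as a plain Maven jar. A sketch of the invocation (assumes JDK 1.8 and access to the Lucene 7.3.0 artifacts):

mvn clean package
# produces target/ik-analyzer-solr-7.x.jar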

61
src/main/java/org/wltea/analyzer/cfg/Configuration.java Normal file

@@ -0,0 +1,61 @@
/*
* IK Chinese word segmentation, version 7.0
* IK Analyzer release 7.0
* update by 高志成(magese@live.cn)
*/
package org.wltea.analyzer.cfg;
import java.util.List;
/**
 * Configuration manager interface
 */
public interface Configuration {
/**
 * Returns the useSmart flag.
 * useSmart = true: the segmenter uses the smart strategy; false: fine-grained segmentation
 *
 * @return useSmart
 */
boolean useSmart();
/**
 * Sets the useSmart flag.
 *
 * @param useSmart true: use the smart segmentation strategy; false: use fine-grained segmentation
 */
void setUseSmart(boolean useSmart);
/**
 * Gets the main dictionary path
 *
 * @return String main dictionary path
 */
String getMainDictionary();
/**
 * Gets the quantifier dictionary path
 *
 * @return String quantifier dictionary path
 */
String getQuantifierDicionary();
/**
 * Gets the configured extension dictionary paths
 *
 * @return List<String> paths relative to the class loader
 */
List<String> getExtDictionarys();
/**
 * Gets the configured extension stop-word dictionary paths
 *
 * @return List<String> paths relative to the class loader
 */
List<String> getExtStopWordDictionarys();
}

145
src/main/java/org/wltea/analyzer/cfg/DefaultConfig.java Normal file

@@ -0,0 +1,145 @@
/*
* IK Chinese word segmentation, version 7.0
* IK Analyzer release 7.0
* update by 高志成(magese@live.cn)
*/
package org.wltea.analyzer.cfg;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
/**
 * Default implementation of Configuration
 */
public class DefaultConfig implements Configuration {
/*
 * Default dictionary paths for the segmenter
 */
private static final String PATH_DIC_MAIN = "org/wltea/analyzer/dic/main2012.dic";
private static final String PATH_DIC_QUANTIFIER = "org/wltea/analyzer/dic/quantifier.dic";
/*
 * Segmenter configuration file path
 */
private static final String FILE_NAME = "IKAnalyzer.cfg.xml";
//configuration key for extension dictionaries
private static final String EXT_DICT = "ext_dict";
//configuration key for extension stop-word dictionaries
private static final String EXT_STOP = "ext_stopwords";
private Properties props;
/*
 * Whether to segment in smart mode
 */
private boolean useSmart;
/**
 * Returns a Configuration instance (a new instance on each call)
 *
 * @return Configuration instance
 */
public static Configuration getInstance() {
return new DefaultConfig();
}
/*
 * Initializes from the configuration file
 */
private DefaultConfig() {
props = new Properties();
InputStream input = this.getClass().getClassLoader().getResourceAsStream(FILE_NAME);
if (input != null) {
try {
props.loadFromXML(input);
} catch (IOException e) {
e.printStackTrace();
}
}
}
/**
 * Returns the useSmart flag.
 * useSmart = true: use the smart segmentation strategy; false: use fine-grained segmentation
 *
 * @return useSmart
 */
public boolean useSmart() {
return useSmart;
}
/**
 * Sets the useSmart flag.
 *
 * @param useSmart true: use the smart segmentation strategy; false: use fine-grained segmentation
 */
public void setUseSmart(boolean useSmart) {
this.useSmart = useSmart;
}
/**
 * Gets the main dictionary path
 *
 * @return String main dictionary path
 */
public String getMainDictionary() {
return PATH_DIC_MAIN;
}
/**
 * Gets the quantifier dictionary path
 *
 * @return String quantifier dictionary path
 */
public String getQuantifierDicionary() {
return PATH_DIC_QUANTIFIER;
}
/**
 * Gets the configured extension dictionary paths
 *
 * @return List<String> paths relative to the class loader
 */
public List<String> getExtDictionarys() {
List<String> extDictFiles = new ArrayList<>(2);
String extDictCfg = props.getProperty(EXT_DICT);
if (extDictCfg != null) {
//multiple extension dictionary paths are separated with ;
String[] filePaths = extDictCfg.split(";");
for (String filePath : filePaths) {
if (filePath != null && !"".equals(filePath.trim())) {
extDictFiles.add(filePath.trim());
}
}
}
return extDictFiles;
}
/**
 * Gets the configured extension stop-word dictionary paths
 *
 * @return List<String> paths relative to the class loader
 */
public List<String> getExtStopWordDictionarys() {
List<String> extStopWordDictFiles = new ArrayList<>(2);
String extStopWordDictCfg = props.getProperty(EXT_STOP);
if (extStopWordDictCfg != null) {
//multiple extension dictionary paths are separated with ;
String[] filePaths = extStopWordDictCfg.split(";");
for (String filePath : filePaths) {
if (filePath != null && !"".equals(filePath.trim())) {
extStopWordDictFiles.add(filePath.trim());
}
}
}
return extStopWordDictFiles;
}
}
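
DefaultConfig reads its settings with Properties.loadFromXML, so the IKAnalyzer.cfg.xml on the classpath must use the standard Java XML-properties format. A minimal sketch follows; the dictionary file names are illustrative, and multiple paths under one key are separated with ;:

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
<comment>IK Analyzer extension configuration (file names are examples)</comment>
<entry key="ext_dict">ext.dic;mydict.dic</entry>
<entry key="ext_stopwords">stopword.dic</entry>
</properties>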

364
src/main/java/org/wltea/analyzer/core/AnalyzeContext.java Normal file

@@ -0,0 +1,364 @@
/*
* IK Chinese word segmentation, version 7.0
* IK Analyzer release 7.0
* update by 高志成(magese@live.cn)
*/
package org.wltea.analyzer.core;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.dic.Dictionary;
/**
 * Segmentation context state
 */
class AnalyzeContext {
//default buffer size
private static final int BUFF_SIZE = 4096;
//threshold at which the buffer counts as nearly exhausted
private static final int BUFF_EXHAUST_CRITICAL = 100;
//character read buffer
private char[] segmentBuff;
//character type array
private int[] charTypes;
//total length of the Reader content analyzed so far;
//when analyzing in multiple passes, this accumulates the offset of the current segmentBuff relative to the start of the reader
private int buffOffset;
//current position pointer within the buffer
private int cursor;
//length of the most recently read, processable character sequence
private int available;
//sub-segmenter locks;
//a non-empty set means some sub-segmenter is still occupying segmentBuff
private Set<String> buffLocker;
//raw segmentation results, before ambiguity resolution
private QuickSortSet orgLexemes;
//index of LexemePath by starting position
private Map<Integer, LexemePath> pathMap;
//final segmentation results
private LinkedList<Lexeme> results;
//segmenter configuration
private Configuration cfg;
AnalyzeContext(Configuration cfg) {
this.cfg = cfg;
this.segmentBuff = new char[BUFF_SIZE];
this.charTypes = new int[BUFF_SIZE];
this.buffLocker = new HashSet<>();
this.orgLexemes = new QuickSortSet();
this.pathMap = new HashMap<>();
this.results = new LinkedList<>();
}
int getCursor() {
return this.cursor;
}
char[] getSegmentBuff() {
return this.segmentBuff;
}
char getCurrentChar() {
return this.segmentBuff[this.cursor];
}
int getCurrentCharType() {
return this.charTypes[this.cursor];
}
int getBufferOffset() {
return this.buffOffset;
}
/**
 * Fills segmentBuff according to the context state
 *
 * @param reader input reader
 * @return the number of valid characters available for analysis
 */
int fillBuffer(Reader reader) throws IOException {
int readCount = 0;
if (this.buffOffset == 0) {
//first read from the reader
readCount = reader.read(segmentBuff);
} else {
int offset = this.available - this.cursor;
if (offset > 0) {
//more was read than processed last time; copy the unprocessed characters to the head of segmentBuff
System.arraycopy(this.segmentBuff, this.cursor, this.segmentBuff, 0, offset);
readCount = offset;
}
//keep reading from the reader, filling the rest of segmentBuff after the carried-over characters
readCount += reader.read(this.segmentBuff, offset, BUFF_SIZE - offset);
}
//record the number of usable characters read from the Reader this time
this.available = readCount;
//reset the current pointer
this.cursor = 0;
return readCount;
}
/**
 * Initializes the buffer pointer and processes the first character
 */
void initCursor() {
this.cursor = 0;
this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]);
this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
}
/**
 * Advances the pointer by one and processes the character there.
 * Returns true on success, or false if the pointer has reached the end of the buffer and cannot advance
 */
boolean moveCursor() {
if (this.cursor < this.available - 1) {
this.cursor++;
this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]);
this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
return true;
} else {
return false;
}
}
/**
 * Marks segmentBuff as locked.
 * Adds the occupying sub-segmenter's name to indicate segmentBuff is in use
 */
void lockBuffer(String segmenterName) {
this.buffLocker.add(segmenterName);
}
/**
 * Removes the given sub-segmenter name, releasing its hold on segmentBuff
 */
void unlockBuffer(String segmenterName) {
this.buffLocker.remove(segmenterName);
}
/**
 * The buffer counts as locked as long as buffLocker contains any segmenter name
 *
 * @return boolean whether the buffer is locked
 */
private boolean isBufferLocked() {
return this.buffLocker.size() > 0;
}
/**
 * Checks whether segmentBuff has been fully consumed,
 * i.e. the cursor has moved to the end of segmentBuff (this.available - 1)
 */
boolean isBufferConsumed() {
return this.cursor == this.available - 1;
}
/**
 * Checks whether segmentBuff needs new data.
 * <p>
 * True when all of the following hold:
 * 1. available == BUFF_SIZE, i.e. the buffer is fully loaded
 * 2. buffIndex < available - 1 && buffIndex > available - BUFF_EXHAUST_CRITICAL, i.e. the cursor is inside the critical zone
 * 3. !context.isBufferLocked(), i.e. no segmenter is occupying the buffer
 * To refill, the current loop must be interrupted and the buffer shifted before reading more data
 */
boolean needRefillBuffer() {
return this.available == BUFF_SIZE
&& this.cursor < this.available - 1
&& this.cursor > this.available - BUFF_EXHAUST_CRITICAL
&& !this.isBufferLocked();
}
/**
 * Accumulates the offset of the current segmentBuff relative to the start of the reader
 */
void markBufferOffset() {
this.buffOffset += this.cursor;
}
/**
 * Adds a lexeme to the raw results set
 *
 * @param lexeme the lexeme
 */
void addLexeme(Lexeme lexeme) {
this.orgLexemes.addLexeme(lexeme);
}
/**
 * Adds a segmentation result path.
 * Maps path start position ---> path
 *
 * @param path segmentation result path
 */
void addLexemePath(LexemePath path) {
if (path != null) {
this.pathMap.put(path.getPathBegin(), path);
}
}
/**
 * Returns the raw segmentation results
 */
QuickSortSet getOrgLexemes() {
return this.orgLexemes;
}
/**
 * Pushes segmentation results into the result list.
 * 1. Walks the buffer from the head to this.cursor (the processed range)
 * 2. Lexeme paths present in the map are pushed into results
 * 3. CJK characters not covered by the map are pushed into results as single characters
 */
void outputToResult() {
int index = 0;
for (; index <= this.cursor; ) {
//skip non-CJK characters
if (CharacterUtil.CHAR_USELESS == this.charTypes[index]) {
index++;
continue;
}
//look up the LexemePath starting at this index in pathMap
LexemePath path = this.pathMap.get(index);
if (path != null) {
//emit the lexemes of this LexemePath into the results list
Lexeme l = path.pollFirst();
while (l != null) {
this.results.add(l);
//move index past the lexeme
index = l.getBegin() + l.getLength();
l = path.pollFirst();
if (l != null) {
//emit single characters skipped between lexemes within the path
for (; index < l.getBegin(); index++) {
this.outputSingleCJK(index);
}
}
}
} else {//pathMap has no LexemePath for this index
//emit as a single character
this.outputSingleCJK(index);
index++;
}
}
//clear the current map
this.pathMap.clear();
}
/**
 * Emits a CJK character as a single-character lexeme
 *
 * @param index character index
 */
private void outputSingleCJK(int index) {
if (CharacterUtil.CHAR_CHINESE == this.charTypes[index]) {
Lexeme singleCharLexeme = new Lexeme(this.buffOffset, index, 1, Lexeme.TYPE_CNCHAR);
this.results.add(singleCharLexeme);
} else if (CharacterUtil.CHAR_OTHER_CJK == this.charTypes[index]) {
Lexeme singleCharLexeme = new Lexeme(this.buffOffset, index, 1, Lexeme.TYPE_OTHER_CJK);
this.results.add(singleCharLexeme);
}
}
/**
 * Returns the next lexeme
 * <p>
 * Also handles compound merging
 */
Lexeme getNextLexeme() {
//take and remove the first Lexeme from the result list
Lexeme result = this.results.pollFirst();
while (result != null) {
//merge numerals and classifiers
this.compound(result);
if (Dictionary.getSingleton().isStopWord(this.segmentBuff, result.getBegin(), result.getLength())) {
//it is a stop word; move on to the next one in the list
result = this.results.pollFirst();
} else {
//not a stop word: generate the lexeme text and emit it
result.setLexemeText(String.valueOf(segmentBuff, result.getBegin(), result.getLength()));
break;
}
}
return result;
}
/**
 * Resets the segmentation context state
 */
void reset() {
this.buffLocker.clear();
this.orgLexemes = new QuickSortSet();
this.available = 0;
this.buffOffset = 0;
this.charTypes = new int[BUFF_SIZE];
this.cursor = 0;
this.results.clear();
this.segmentBuff = new char[BUFF_SIZE];
this.pathMap.clear();
}
/**
 * Compounds adjacent lexemes
 */
private void compound(Lexeme result) {
if (!this.cfg.useSmart()) {
return;
}
//merge numerals and classifiers
if (!this.results.isEmpty()) {
if (Lexeme.TYPE_ARABIC == result.getLexemeType()) {
Lexeme nextLexeme = this.results.peekFirst();
boolean appendOk = false;
if (Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()) {
//merge an Arabic numeral with a Chinese numeral
appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM);
} else if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
//merge an Arabic numeral with a Chinese classifier
appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
}
if (appendOk) {
//pop it
this.results.pollFirst();
}
}
//a second round of merging may be possible
if (Lexeme.TYPE_CNUM == result.getLexemeType() && !this.results.isEmpty()) {
Lexeme nextLexeme = this.results.peekFirst();
boolean appendOk = false;
if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
//merge a Chinese numeral with a Chinese classifier
appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
}
if (appendOk) {
//pop it
this.results.pollFirst();
}
}
}
}
}

106
src/main/java/org/wltea/analyzer/core/CJKSegmenter.java Normal file

@@ -0,0 +1,106 @@
/*
* IK Chinese word segmentation, version 7.0
* IK Analyzer release 7.0
* update by 高志成(magese@live.cn)
*/
package org.wltea.analyzer.core;
import java.util.LinkedList;
import java.util.List;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.dic.Hit;
/**
 * Sub-segmenter for Chinese and Japanese/Korean text
 */
class CJKSegmenter implements ISegmenter {
//sub-segmenter tag
private static final String SEGMENTER_NAME = "CJK_SEGMENTER";
//queue of pending dictionary hits
private List<Hit> tmpHits;
CJKSegmenter(){
this.tmpHits = new LinkedList<>();
}
/* (non-Javadoc)
* @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext)
*/
public void analyze(AnalyzeContext context) {
if(CharacterUtil.CHAR_USELESS != context.getCurrentCharType()){
//process the hits already in tmpHits first
if(!this.tmpHits.isEmpty()){
//process the queue of partial matches
Hit[] tmpArray = this.tmpHits.toArray(new Hit[0]);
for(Hit hit : tmpArray){
hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
if(hit.isMatch()){
//emit the current word
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD);
context.addLexeme(newLexeme);
if(!hit.isPrefix()){//not a word prefix; the hit needs no further matching, remove it
this.tmpHits.remove(hit);
}
}else if(hit.isUnmatch()){
//the hit is not a word; remove it
this.tmpHits.remove(hit);
}
}
}
//*********************************
//then match the character at the current cursor position on its own
Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
if(singleCharHit.isMatch()){//the single character is a word
//emit the current word
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD);
context.addLexeme(newLexeme);
//it may also be a word prefix
if(singleCharHit.isPrefix()){
//on a prefix match, add the hit to the queue
this.tmpHits.add(singleCharHit);
}
}else if(singleCharHit.isPrefix()){//the single character is a word prefix
//on a prefix match, add the hit to the queue
this.tmpHits.add(singleCharHit);
}
}else{
//met a CHAR_USELESS character
//clear the queue
this.tmpHits.clear();
}
//check whether the buffer has been fully consumed
if(context.isBufferConsumed()){
//clear the queue
this.tmpHits.clear();
}
//decide whether to lock the buffer
if(this.tmpHits.size() == 0){
context.unlockBuffer(SEGMENTER_NAME);
}else{
context.lockBuffer(SEGMENTER_NAME);
}
}
/* (non-Javadoc)
* @see org.wltea.analyzer.core.ISegmenter#reset()
*/
public void reset() {
//clear the queue
this.tmpHits.clear();
}
}

219
src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java Normal file

@@ -0,0 +1,219 @@
/*
* IK Chinese word segmentation, version 7.0
* IK Analyzer release 7.0
* update by 高志成(magese@live.cn)
*/
package org.wltea.analyzer.core;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.dic.Hit;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
/**
 * Sub-segmenter for Chinese numerals and classifiers
 */
class CN_QuantifierSegmenter implements ISegmenter{
//sub-segmenter tag
private static final String SEGMENTER_NAME = "QUAN_SEGMENTER";
private static Set<Character> ChnNumberChars = new HashSet<>();
static{
//Chinese numeral characters
//Cnum
String chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";
char[] ca = chn_Num.toCharArray();
for(char nChar : ca){
ChnNumberChars.add(nChar);
}
}
/*
 * Start position of the lexeme.
 * Also serves as the sub-segmenter's state flag:
 * nStart > -1 means this segmenter is currently processing characters
 */
private int nStart;
/*
 * End position of the lexeme.
 * nEnd records the position where the last valid numeral in the lexeme ends
 */
private int nEnd;
//queue of pending classifier hits
private List<Hit> countHits;
CN_QuantifierSegmenter(){
nStart = -1;
nEnd = -1;
this.countHits = new LinkedList<>();
}
/**
 * Segmentation
 */
public void analyze(AnalyzeContext context) {
//process Chinese numerals
this.processCNumber(context);
//process Chinese classifiers
this.processCount(context);
//decide whether to lock the buffer
if(this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()){
//unlock the buffer
context.unlockBuffer(SEGMENTER_NAME);
}else{
context.lockBuffer(SEGMENTER_NAME);
}
}
/**
 * Resets the sub-segmenter state
 */
public void reset() {
nStart = -1;
nEnd = -1;
countHits.clear();
}
/**
 * Processes Chinese numerals
 */
private void processCNumber(AnalyzeContext context){
if(nStart == -1 && nEnd == -1){//initial state
if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
&& ChnNumberChars.contains(context.getCurrentChar())){
//record the start and end positions of the numeral
nStart = context.getCursor();
nEnd = context.getCursor();
}
}else{//processing state
if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
&& ChnNumberChars.contains(context.getCurrentChar())){
//record the end position of the numeral
nEnd = context.getCursor();
}else{
//emit the numeral
this.outputNumLexeme(context);
//reset the head and tail pointers
nStart = -1;
nEnd = -1;
}
}
//the buffer is exhausted but a numeral is still pending
if(context.isBufferConsumed()){
if(nStart != -1 && nEnd != -1){
//emit the numeral
outputNumLexeme(context);
//reset the head and tail pointers
nStart = -1;
nEnd = -1;
}
}
}
/**
 * Processes Chinese classifiers
 * @param context the content to process
 */
private void processCount(AnalyzeContext context){
// check whether a classifier scan is needed
if(!this.needCountScan(context)){
return;
}
if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()){
//process the hits already in countHits first
if(!this.countHits.isEmpty()){
//process the queue of partial matches
Hit[] tmpArray = this.countHits.toArray(new Hit[0]);
for(Hit hit : tmpArray){
hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
if(hit.isMatch()){
//emit the current word
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT);
context.addLexeme(newLexeme);
if(!hit.isPrefix()){//not a word prefix; the hit needs no further matching, remove it
this.countHits.remove(hit);
}
}else if(hit.isUnmatch()){
//the hit is not a word; remove it
this.countHits.remove(hit);
}
}
}
//*********************************
//then match the character at the current cursor position on its own
Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
if(singleCharHit.isMatch()){//the single character is a classifier
//emit the current word
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT);
context.addLexeme(newLexeme);
//it may also be a word prefix
if(singleCharHit.isPrefix()){
//on a prefix match, add the hit to the queue
this.countHits.add(singleCharHit);
}
}else if(singleCharHit.isPrefix()){//the single character is a classifier prefix
//on a prefix match, add the hit to the queue
this.countHits.add(singleCharHit);
}
}else{
//the input is not a Chinese character
//discard incomplete classifiers
this.countHits.clear();
}
//the buffer is exhausted but classifiers are still pending
if(context.isBufferConsumed()){
//discard incomplete classifiers
this.countHits.clear();
}
}
/**
 * Checks whether a classifier scan is needed
 */
private boolean needCountScan(AnalyzeContext context){
if((nStart != -1 && nEnd != -1 ) || !countHits.isEmpty()){
//a Chinese numeral or a classifier is currently being processed
return true;
}else{
//look for an adjacent preceding numeral
if(!context.getOrgLexemes().isEmpty()){
Lexeme l = context.getOrgLexemes().peekLast();
if(Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType()){
return l.getBegin() + l.getLength() == context.getCursor();
}
}
}
return false;
}
/**
 * Adds a numeral lexeme to the results
 * @param context the segmentation context to add the lexeme to
 */
private void outputNumLexeme(AnalyzeContext context){
if(nStart > -1 && nEnd > -1){
//emit the numeral
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , nStart , nEnd - nStart + 1 , Lexeme.TYPE_CNUM);
context.addLexeme(newLexeme);
}
}
}

82
src/main/java/org/wltea/analyzer/core/CharacterUtil.java Normal file

@@ -0,0 +1,82 @@
/*
* IK 中文分词 版本 7.0
* IK Analyzer release 7.0
* update by 高志成(magese@live.cn)
*/
package org.wltea.analyzer.core;
/**
 * Character set identification utility
 */
class CharacterUtil {
static final int CHAR_USELESS = 0;
static final int CHAR_ARABIC = 0X00000001;
static final int CHAR_ENGLISH = 0X00000002;
static final int CHAR_CHINESE = 0X00000004;
static final int CHAR_OTHER_CJK = 0X00000008;
/**
 * Identifies the character type
 * @param input the character to identify
 * @return int one of the character type constants defined in CharacterUtil
 */
static int identifyCharType(char input){
if(input >= '0' && input <= '9'){
return CHAR_ARABIC;
}else if((input >= 'a' && input <= 'z')
|| (input >= 'A' && input <= 'Z')){
return CHAR_ENGLISH;
}else {
Character.UnicodeBlock ub = Character.UnicodeBlock.of(input);
if(ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
|| ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
|| ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A){
//currently known Unicode blocks of Chinese characters
return CHAR_CHINESE;
}else if(ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS //full-width forms and Japanese/Korean punctuation
//Korean character blocks
|| ub == Character.UnicodeBlock.HANGUL_SYLLABLES
|| ub == Character.UnicodeBlock.HANGUL_JAMO
|| ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO
//Japanese character blocks
|| ub == Character.UnicodeBlock.HIRAGANA //hiragana
|| ub == Character.UnicodeBlock.KATAKANA //katakana
|| ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS){
return CHAR_OTHER_CJK;
}
}
//all other characters are left unprocessed
return CHAR_USELESS;
}
/**
 * Regularizes a character: full-width to half-width, upper case to lower case
 * @param input the character to convert
 * @return char
 */
static char regularize(char input){
if (input == 12288) {
input = (char) 32;
}else if (input > 65280 && input < 65375) {
input = (char) (input - 65248);
}else if (input >= 'A' && input <= 'Z') {
input += 32;
}
return input;
}
}
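
A quick sketch of the rules above; since CharacterUtil is package-private, this assumes a class inside org.wltea.analyzer.core, and note that each regularize call applies exactly one rule (full-width to half-width, or upper to lower case):

//hypothetical in-package demo of CharacterUtil
class CharacterUtilDemo {
public static void main(String[] args) {
System.out.println(CharacterUtil.regularize('　') == ' ');  //full-width space (12288) -> 32
System.out.println(CharacterUtil.regularize('Ａ') == 'A');  //full-width letter (65313) -> 65313 - 65248 = 'A'
System.out.println(CharacterUtil.regularize('A') == 'a');   //ASCII upper case -> lower case
System.out.println(CharacterUtil.identifyCharType('中') == CharacterUtil.CHAR_CHINESE);  //true
}
}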

129
src/main/java/org/wltea/analyzer/core/IKArbitrator.java Normal file

@@ -0,0 +1,129 @@
/*
* IK Chinese word segmentation, version 7.0
* IK Analyzer release 7.0
* update by 高志成(magese@live.cn)
*/
package org.wltea.analyzer.core;
import java.util.Stack;
import java.util.TreeSet;
/**
 * IK segmentation ambiguity arbitrator
 */
class IKArbitrator {
IKArbitrator() {
}
/**
 * Resolves segmentation ambiguity
 *
 * @param context the analysis context
 * @param useSmart whether to use the smart segmentation strategy
 */
void process(AnalyzeContext context, boolean useSmart) {
QuickSortSet orgLexemes = context.getOrgLexemes();
Lexeme orgLexeme = orgLexemes.pollFirst();
LexemePath crossPath = new LexemePath();
while (orgLexeme != null) {
if (!crossPath.addCrossLexeme(orgLexeme)) {
//found the next lexeme that does not cross crossPath
if (crossPath.size() == 1 || !useSmart) {
//crossPath has no ambiguity, or ambiguity resolution is disabled
//output the current crossPath directly
context.addLexemePath(crossPath);
} else {
//resolve the ambiguity of the current crossPath
QuickSortSet.Cell headCell = crossPath.getHead();
LexemePath judgeResult = this.judge(headCell);
//output the resolution result, judgeResult
context.addLexemePath(judgeResult);
}
//put orgLexeme into a new crossPath
crossPath = new LexemePath();
crossPath.addCrossLexeme(orgLexeme);
}
orgLexeme = orgLexemes.pollFirst();
}
//process the last path
if (crossPath.size() == 1 || !useSmart) {
//crossPath has no ambiguity, or ambiguity resolution is disabled
//output the current crossPath directly
context.addLexemePath(crossPath);
} else {
//resolve the ambiguity of the current crossPath
QuickSortSet.Cell headCell = crossPath.getHead();
LexemePath judgeResult = this.judge(headCell);
//output the resolution result, judgeResult
context.addLexemePath(judgeResult);
}
}
/**
 * Ambiguity resolution
 *
 * @param lexemeCell head of the ambiguous path's lexeme chain
 */
private LexemePath judge(QuickSortSet.Cell lexemeCell) {
//set of candidate paths
TreeSet<LexemePath> pathOptions = new TreeSet<>();
//current candidate path
LexemePath option = new LexemePath();
//traverse crossPath once, returning a stack of the conflicting lexemes found along the way
Stack<QuickSortSet.Cell> lexemeStack = this.forwardPath(lexemeCell, option);
//the current lexeme chain may not be optimal; add a copy to the candidate set
pathOptions.add(option.copy());
//process the ambiguous lexemes
QuickSortSet.Cell c;
while (!lexemeStack.isEmpty()) {
c = lexemeStack.pop();
//roll back the lexeme chain
this.backPath(c.getLexeme(), option);
//generate alternative candidates starting from the ambiguous lexeme
this.forwardPath(c, option);
pathOptions.add(option.copy());
}
//return the best candidate in the set
return pathOptions.first();
}
/**
 * Traverses forward, adding lexemes to build a conflict-free lexeme combination
 */
private Stack<QuickSortSet.Cell> forwardPath(QuickSortSet.Cell lexemeCell, LexemePath option) {
//stack of conflicting lexemes
Stack<QuickSortSet.Cell> conflictStack = new Stack<>();
QuickSortSet.Cell c = lexemeCell;
//iterate over the lexeme chain
while (c != null && c.getLexeme() != null) {
if (!option.addNotCrossLexeme(c.getLexeme())) {
//the lexeme crosses; adding failed, push it onto the conflict stack
conflictStack.push(c);
}
c = c.getNext();
}
return conflictStack;
}
/**
 * Rolls back the lexeme chain until it can accept the given lexeme
 */
private void backPath(Lexeme l, LexemePath option) {
while (option.checkCross(l)) {
option.removeTail();
}
}
}

153
src/main/java/org/wltea/analyzer/core/IKSegmenter.java Normal file

@@ -0,0 +1,153 @@
/*
* IK Chinese word segmentation, version 7.0
* IK Analyzer release 7.0
* update by 高志成(magese@live.cn)
*/
package org.wltea.analyzer.core;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.cfg.DefaultConfig;
import org.wltea.analyzer.dic.Dictionary;
/**
 * Main class of the IK segmenter
 */
public final class IKSegmenter {
//character reader
private Reader input;
//segmenter configuration
private Configuration cfg;
//segmentation context
private AnalyzeContext context;
//list of sub-segmenters
private List<ISegmenter> segmenters;
//segmentation ambiguity arbitrator
private IKArbitrator arbitrator;
/**
 * IK segmenter constructor
 *
 * @param input input reader
 * @param useSmart true to use the smart segmentation strategy
 * <p>
 * Non-smart segmentation: fine-grained output of all possible splits
 * Smart segmentation: merges numerals and classifiers, and resolves ambiguity in the results
 */
public IKSegmenter(Reader input, boolean useSmart) {
this.input = input;
this.cfg = DefaultConfig.getInstance();
this.cfg.setUseSmart(useSmart);
this.init();
}
/**
 * IK segmenter constructor
 *
 * @param input input reader
 * @param cfg a custom Configuration for the segmenter
 */
@SuppressWarnings("unused")
public IKSegmenter(Reader input, Configuration cfg) {
this.input = input;
this.cfg = cfg;
this.init();
}
/**
 * Initialization
 */
private void init() {
//initialize the dictionary singleton
Dictionary.initial(this.cfg);
//initialize the segmentation context
this.context = new AnalyzeContext(this.cfg);
//load the sub-segmenters
this.segmenters = this.loadSegmenters();
//load the ambiguity arbitrator
this.arbitrator = new IKArbitrator();
}
/**
 * Loads the sub-segmenter implementations
 *
 * @return List<ISegmenter>
 */
private List<ISegmenter> loadSegmenters() {
List<ISegmenter> segmenters = new ArrayList<>(4);
//sub-segmenter for letters and digits
segmenters.add(new LetterSegmenter());
//sub-segmenter for Chinese numerals and classifiers
segmenters.add(new CN_QuantifierSegmenter());
//sub-segmenter for Chinese words
segmenters.add(new CJKSegmenter());
return segmenters;
}
/**
 * Gets the next lexeme
 *
 * @return Lexeme the lexeme object
 */
public synchronized Lexeme next() throws IOException {
Lexeme l;
while ((l = context.getNextLexeme()) == null) {
/*
 * Read data from the reader to fill the buffer.
 * If the reader is consumed in several passes, the buffer must be shifted
 * so that data read earlier but not yet processed is carried over
 */
int available = context.fillBuffer(this.input);
if (available <= 0) {
//the reader is exhausted
context.reset();
return null;
} else {
//initialize the pointer
context.initCursor();
do {
//run every sub-segmenter
for (ISegmenter segmenter : segmenters) {
segmenter.analyze(context);
}
//the buffer is nearly consumed; new characters must be read in
if (context.needRefillBuffer()) {
break;
}
//advance the pointer
} while (context.moveCursor());
//reset the sub-segmenters so they are initialized for the next round
for (ISegmenter segmenter : segmenters) {
segmenter.reset();
}
}
//resolve segmentation ambiguity
this.arbitrator.process(context, this.cfg.useSmart());
//push the results to the result list and handle any unsegmented single CJK characters
context.outputToResult();
//record the buffer offset of this pass
context.markBufferOffset();
}
return l;
}
/**
 * Resets the segmenter to its initial state
 *
 * @param input input reader
 */
public synchronized void reset(Reader input) {
this.input = input;
context.reset();
for (ISegmenter segmenter : segmenters) {
segmenter.reset();
}
}
}
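
For orientation, a minimal standalone usage sketch of IKSegmenter (the sample sentence and class name are arbitrary):

import java.io.IOException;
import java.io.StringReader;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

//hypothetical demo: feed a string through IKSegmenter and print every lexeme
public class IKSegmenterDemo {
public static void main(String[] args) throws IOException {
IKSegmenter ik = new IKSegmenter(new StringReader("IK Analyzer是一个开源的中文分词工具包"), true);
Lexeme lexeme;
while ((lexeme = ik.next()) != null) {
System.out.println(lexeme.getBeginPosition() + "-" + lexeme.getEndPosition()
+ " : " + lexeme.getLexemeText() + " : " + lexeme.getLexemeTypeString());
}
}
}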

27
src/main/java/org/wltea/analyzer/core/ISegmenter.java Normal file

@@ -0,0 +1,27 @@
/*
* IK Chinese word segmentation, version 7.0
* IK Analyzer release 7.0
* update by 高志成(magese@live.cn)
*/
package org.wltea.analyzer.core;
/**
 * Sub-segmenter interface
 */
interface ISegmenter {
/**
 * Reads the next candidate lexeme from the analyzer
 * @param context the segmentation context
 */
void analyze(AnalyzeContext context);
/**
 * Resets the sub-segmenter state
 */
void reset();
}

261
src/main/java/org/wltea/analyzer/core/LetterSegmenter.java Normal file

@@ -0,0 +1,261 @@
/*
* IK Chinese word segmentation, version 7.0
* IK Analyzer release 7.0
* update by 高志成(magese@live.cn)
*/
package org.wltea.analyzer.core;
import java.util.Arrays;
/**
 * Sub-segmenter for English letters and Arabic digits
 */
class LetterSegmenter implements ISegmenter {
//sub-segmenter tag
private static final String SEGMENTER_NAME = "LETTER_SEGMENTER";
//connector characters for letters
private static final char[] Letter_Connector = new char[]{'#', '&', '+', '-', '.', '@', '_'};
//connector characters for digits
private static final char[] Num_Connector = new char[]{',', '.'};
/*
 * Start position of the lexeme.
 * Also serves as the sub-segmenter's state flag:
 * start > -1 means this segmenter is currently processing characters
 */
private int start;
/*
 * End position of the lexeme.
 * end records the position of the last letter character in the lexeme that is not a connector
 */
private int end;
/*
 * Start position of a letter run
 */
private int englishStart;
/*
 * End position of a letter run
 */
private int englishEnd;
/*
 * Start position of an Arabic digit run
 */
private int arabicStart;
/*
 * End position of an Arabic digit run
 */
private int arabicEnd;
LetterSegmenter() {
Arrays.sort(Letter_Connector);
Arrays.sort(Num_Connector);
this.start = -1;
this.end = -1;
this.englishStart = -1;
this.englishEnd = -1;
this.arabicStart = -1;
this.arabicEnd = -1;
}
/* (non-Javadoc)
* @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext)
*/
public void analyze(AnalyzeContext context) {
boolean bufferLockFlag;
//process English letters
bufferLockFlag = this.processEnglishLetter(context);
//process Arabic digits
bufferLockFlag = this.processArabicLetter(context) || bufferLockFlag;
//process mixed letters and digits (must come last so QuickSortSet can drop duplicates)
bufferLockFlag = this.processMixLetter(context) || bufferLockFlag;
//decide whether to lock the buffer
if (bufferLockFlag) {
context.lockBuffer(SEGMENTER_NAME);
} else {
//unlock the buffer
context.unlockBuffer(SEGMENTER_NAME);
}
}
/* (non-Javadoc)
* @see org.wltea.analyzer.core.ISegmenter#reset()
*/
public void reset() {
this.start = -1;
this.end = -1;
this.englishStart = -1;
this.englishEnd = -1;
this.arabicStart = -1;
this.arabicEnd = -1;
}
/**
 * Processes mixed letter-and-digit output,
 * e.g. windows2000 | linliangyi2005@gmail.com
 *
 * @param context analysis context
 */
private boolean processMixLetter(AnalyzeContext context) {
boolean needLock;
if (this.start == -1) {//this segmenter is not yet processing characters
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
|| CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
//record the start pointer position, marking the segmenter as active
this.start = context.getCursor();
this.end = start;
}
} else {//this segmenter is processing characters
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
|| CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
//record a possible end position
this.end = context.getCursor();
} else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
&& this.isLetterConnector(context.getCurrentChar())) {
//record a possible end position
this.end = context.getCursor();
} else {
//met a non-letter character; emit the lexeme
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start + 1, Lexeme.TYPE_LETTER);
context.addLexeme(newLexeme);
this.start = -1;
this.end = -1;
}
}
//check whether the buffer has been fully consumed
if (context.isBufferConsumed()) {
if (this.start != -1 && this.end != -1) {
//the buffer is consumed; emit the lexeme
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start + 1, Lexeme.TYPE_LETTER);
context.addLexeme(newLexeme);
this.start = -1;
this.end = -1;
}
}
//decide whether to lock the buffer
needLock = this.start != -1 || this.end != -1;
return needLock;
}
/**
 * Processes pure English letter output
 *
 * @param context analysis context
 */
private boolean processEnglishLetter(AnalyzeContext context) {
boolean needLock;
if (this.englishStart == -1) {//this segmenter is not yet processing English characters
if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
//record the start pointer position, marking the segmenter as active
this.englishStart = context.getCursor();
this.englishEnd = this.englishStart;
}
} else {//this segmenter is processing English characters
if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
//record the current cursor position as the end position
this.englishEnd = context.getCursor();
} else {
//met a non-English character; emit the lexeme
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd - this.englishStart + 1, Lexeme.TYPE_ENGLISH);
context.addLexeme(newLexeme);
this.englishStart = -1;
this.englishEnd = -1;
}
}
//check whether the buffer has been fully consumed
if (context.isBufferConsumed()) {
if (this.englishStart != -1 && this.englishEnd != -1) {
//the buffer is consumed; emit the lexeme
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd - this.englishStart + 1, Lexeme.TYPE_ENGLISH);
context.addLexeme(newLexeme);
this.englishStart = -1;
this.englishEnd = -1;
}
}
//decide whether to lock the buffer
needLock = this.englishStart != -1 || this.englishEnd != -1;
return needLock;
}
/**
 * Processes Arabic digit output
 *
 * @param context analysis context
 */
private boolean processArabicLetter(AnalyzeContext context) {
boolean needLock;
if (this.arabicStart == -1) {//this segmenter is not yet processing digit characters
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
//record the start pointer position, marking the segmenter as active
this.arabicStart = context.getCursor();
this.arabicEnd = this.arabicStart;
}
} else {//this segmenter is processing digit characters
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
//record the current cursor position as the end position
this.arabicEnd = context.getCursor();
}/* else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
&& this.isNumConnector(context.getCurrentChar())) {
//do not emit the digits, but do not mark the end either
}*/ else {
//met a non-Arabic character; emit the lexeme
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd - this.arabicStart + 1, Lexeme.TYPE_ARABIC);
context.addLexeme(newLexeme);
this.arabicStart = -1;
this.arabicEnd = -1;
}
}
//check whether the buffer has been fully consumed
if (context.isBufferConsumed()) {
if (this.arabicStart != -1 && this.arabicEnd != -1) {//emit the segmented lexeme
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd - this.arabicStart + 1, Lexeme.TYPE_ARABIC);
context.addLexeme(newLexeme);
this.arabicStart = -1;
this.arabicEnd = -1;
}
}
//decide whether to lock the buffer
needLock = this.arabicStart != -1 || this.arabicEnd != -1;
return needLock;
}
/**
 * Checks whether a character is a letter connector
 *
 * @param input the character
 */
private boolean isLetterConnector(char input) {
int index = Arrays.binarySearch(Letter_Connector, input);
return index >= 0;
}
/**
 * Checks whether a character is a digit connector
 *
 * @param input the character
 */
@SuppressWarnings("unused")
private boolean isNumConnector(char input) {
int index = Arrays.binarySearch(Num_Connector, input);
return index >= 0;
}
}

250
src/main/java/org/wltea/analyzer/core/Lexeme.java Normal file

@@ -0,0 +1,250 @@
/*
* IK Chinese word segmentation, version 7.0
* IK Analyzer release 7.0
* update by 高志成(magese@live.cn)
*/
package org.wltea.analyzer.core;
/**
 * IK lexeme object
 */
@SuppressWarnings("unused")
public class Lexeme implements Comparable<Lexeme>{
//English
static final int TYPE_ENGLISH = 1;
//Arabic digits
static final int TYPE_ARABIC = 2;
//mixed English letters and digits
static final int TYPE_LETTER = 3;
//Chinese word
static final int TYPE_CNWORD = 4;
//single Chinese character
static final int TYPE_CNCHAR = 64;
//Japanese/Korean characters
static final int TYPE_OTHER_CJK = 8;
//Chinese numeral
static final int TYPE_CNUM = 16;
//Chinese classifier
static final int TYPE_COUNT = 32;
//Chinese numeral-plus-classifier compound
static final int TYPE_CQUAN = 48;
//start offset of the lexeme's buffer
private int offset;
//relative start position of the lexeme
private int begin;
//length of the lexeme
private int length;
//lexeme text
private String lexemeText;
//lexeme type
private int lexemeType;
public Lexeme(int offset , int begin , int length , int lexemeType){
this.offset = offset;
this.begin = begin;
if(length < 0){
throw new IllegalArgumentException("length < 0");
}
this.length = length;
this.lexemeType = lexemeType;
}
/*
 * Lexeme equality:
 * two lexemes are equal when their offset, begin and length all match
 * @see java.lang.Object#equals(Object o)
 */
public boolean equals(Object o){
if(o == null){
return false;
}
if(this == o){
return true;
}
if(o instanceof Lexeme){
Lexeme other = (Lexeme)o;
return this.offset == other.getOffset()
&& this.begin == other.getBegin()
&& this.length == other.getLength();
}else{
return false;
}
}
/*
 * Lexeme hash code
 * @see java.lang.Object#hashCode()
 */
public int hashCode(){
int absBegin = getBeginPosition();
int absEnd = getEndPosition();
return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11;
}
/*
 * Comparison of lexemes within a sorted set
 * @see java.lang.Comparable#compareTo(java.lang.Object)
 */
public int compareTo(Lexeme other) {
//the earlier start position comes first
if(this.begin < other.getBegin()){
return -1;
}else if(this.begin == other.getBegin()){
//the longer lexeme comes first
//this.length < other.getLength()
return Integer.compare(other.getLength(), this.length);
}else{//this.begin > other.getBegin()
return 1;
}
}
private int getOffset() {
return offset;
}
public void setOffset(int offset) {
this.offset = offset;
}
int getBegin() {
return begin;
}
/**
 * Gets the lexeme's start position in the text
 * @return int
 */
public int getBeginPosition(){
return offset + begin;
}
public void setBegin(int begin) {
this.begin = begin;
}
/**
 * Gets the lexeme's end position in the text
 * @return int
 */
public int getEndPosition(){
return offset + begin + length;
}
/**
 * Gets the lexeme's character length
 * @return int
 */
public int getLength(){
return this.length;
}
public void setLength(int length) {
if(length < 0){
throw new IllegalArgumentException("length < 0");
}
this.length = length;
}
/**
 * Gets the lexeme text
 * @return String
 */
public String getLexemeText() {
if(lexemeText == null){
return "";
}
return lexemeText;
}
void setLexemeText(String lexemeText) {
if(lexemeText == null){
this.lexemeText = "";
this.length = 0;
}else{
this.lexemeText = lexemeText;
this.length = lexemeText.length();
}
}
/**
 * Gets the lexeme type
 * @return int
 */
int getLexemeType() {
return lexemeType;
}
/**
 * Gets the lexeme type as a display string
 * @return String
 */
public String getLexemeTypeString(){
switch(lexemeType) {
case TYPE_ENGLISH :
return "ENGLISH";
case TYPE_ARABIC :
return "ARABIC";
case TYPE_LETTER :
return "LETTER";
case TYPE_CNWORD :
return "CN_WORD";
case TYPE_CNCHAR :
return "CN_CHAR";
case TYPE_OTHER_CJK :
return "OTHER_CJK";
case TYPE_COUNT :
return "COUNT";
case TYPE_CNUM :
return "TYPE_CNUM";
case TYPE_CQUAN:
return "TYPE_CQUAN";
default :
return "UNKNOWN";
}
}
public void setLexemeType(int lexemeType) {
this.lexemeType = lexemeType;
}
/**
 * Merges two adjacent lexemes
 * @return boolean whether the merge succeeded
 */
boolean append(Lexeme l, int lexemeType){
if(l != null && this.getEndPosition() == l.getBeginPosition()){
this.length += l.getLength();
this.lexemeType = lexemeType;
return true;
}else {
return false;
}
}
/**
*
*/
public String toString(){
return String.valueOf(this.getBeginPosition()) + "-" + this.getEndPosition() +
" : " + this.lexemeText + " : \t" +
this.getLexemeTypeString();
}
}

230
src/main/java/org/wltea/analyzer/core/LexemePath.java Normal file

@@ -0,0 +1,230 @@
/*
* IK Chinese word segmentation, version 7.0
* IK Analyzer release 7.0
* update by 高志成(magese@live.cn)
*/
package org.wltea.analyzer.core;
/**
 * A path formed by a chain of lexemes
 */
@SuppressWarnings("unused")
class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
//start position of the path
private int pathBegin;
//end position of the path
private int pathEnd;
//effective character length covered by the lexeme chain
private int payloadLength;
LexemePath() {
this.pathBegin = -1;
this.pathEnd = -1;
this.payloadLength = 0;
}
/**
 * Appends a crossing (overlapping) lexeme to the LexemePath
 */
boolean addCrossLexeme(Lexeme lexeme) {
if (this.isEmpty()) {
this.addLexeme(lexeme);
this.pathBegin = lexeme.getBegin();
this.pathEnd = lexeme.getBegin() + lexeme.getLength();
this.payloadLength += lexeme.getLength();
return true;
} else if (this.checkCross(lexeme)) {
this.addLexeme(lexeme);
if (lexeme.getBegin() + lexeme.getLength() > this.pathEnd) {
this.pathEnd = lexeme.getBegin() + lexeme.getLength();
}
this.payloadLength = this.pathEnd - this.pathBegin;
return true;
} else {
return false;
}
}
/**
 * Appends a non-crossing lexeme to the LexemePath
 */
boolean addNotCrossLexeme(Lexeme lexeme) {
if (this.isEmpty()) {
this.addLexeme(lexeme);
this.pathBegin = lexeme.getBegin();
this.pathEnd = lexeme.getBegin() + lexeme.getLength();
this.payloadLength += lexeme.getLength();
return true;
} else if (this.checkCross(lexeme)) {
return false;
} else {
this.addLexeme(lexeme);
this.payloadLength += lexeme.getLength();
Lexeme head = this.peekFirst();
this.pathBegin = head.getBegin();
Lexeme tail = this.peekLast();
this.pathEnd = tail.getBegin() + tail.getLength();
return true;
}
}
/**
 * Removes the tail lexeme
 *
 */
void removeTail() {
Lexeme tail = this.pollLast();
if (this.isEmpty()) {
this.pathBegin = -1;
this.pathEnd = -1;
this.payloadLength = 0;
} else {
this.payloadLength -= tail.getLength();
Lexeme newTail = this.peekLast();
this.pathEnd = newTail.getBegin() + newTail.getLength();
}
}
/**
 * Detects positional overlap between lexemes, i.e. an ambiguous split
 *
 */
boolean checkCross(Lexeme lexeme) {
return (lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd)
|| (this.pathBegin >= lexeme.getBegin() && this.pathBegin < lexeme.getBegin() + lexeme.getLength());
}
int getPathBegin() {
return pathBegin;
}
int getPathEnd() {
return pathEnd;
}
/**
 * Gets the path's effective word length
 */
int getPayloadLength() {
return this.payloadLength;
}
/**
 * Gets the LexemePath's span length
 *
 */
private int getPathLength() {
return this.pathEnd - this.pathBegin;
}
/**
 * X weight: the product of the lexeme lengths
 *
 */
private int getXWeight() {
int product = 1;
Cell c = this.getHead();
while (c != null && c.getLexeme() != null) {
product *= c.getLexeme().getLength();
c = c.getNext();
}
return product;
}
/**
 * Lexeme position weight
 */
private int getPWeight() {
int pWeight = 0;
int p = 0;
Cell c = this.getHead();
while (c != null && c.getLexeme() != null) {
p++;
pWeight += p * c.getLexeme().getLength();
c = c.getNext();
}
return pWeight;
}
LexemePath copy() {
LexemePath theCopy = new LexemePath();
theCopy.pathBegin = this.pathBegin;
theCopy.pathEnd = this.pathEnd;
theCopy.payloadLength = this.payloadLength;
Cell c = this.getHead();
while (c != null && c.getLexeme() != null) {
theCopy.addLexeme(c.getLexeme());
c = c.getNext();
}
return theCopy;
}
public int compareTo(LexemePath o) {
//compare effective text length: longer is better
if (this.payloadLength > o.payloadLength) {
return -1;
} else if (this.payloadLength < o.payloadLength) {
return 1;
} else {
//compare lexeme count: fewer is better
if (this.size() < o.size()) {
return -1;
} else if (this.size() > o.size()) {
return 1;
} else {
//compare path span: larger is better
if (this.getPathLength() > o.getPathLength()) {
return -1;
} else if (this.getPathLength() < o.getPathLength()) {
return 1;
} else {
//statistically, reverse segmentation beats forward segmentation, so later positions are preferred
if (this.pathEnd > o.pathEnd) {
return -1;
} else if (pathEnd < o.pathEnd) {
return 1;
} else {
//more even lexeme lengths are better
if (this.getXWeight() > o.getXWeight()) {
return -1;
} else if (this.getXWeight() < o.getXWeight()) {
return 1;
} else {
//compare lexeme position weights
if (this.getPWeight() > o.getPWeight()) {
return -1;
} else if (this.getPWeight() < o.getPWeight()) {
return 1;
}
}
}
}
}
}
return 0;
}
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("pathBegin : ").append(pathBegin).append("\r\n");
sb.append("pathEnd : ").append(pathEnd).append("\r\n");
sb.append("payloadLength : ").append(payloadLength).append("\r\n");
Cell head = this.getHead();
while (head != null) {
sb.append("lexeme : ").append(head.getLexeme()).append("\r\n");
head = head.getNext();
}
return sb.toString();
}
}

186
src/main/java/org/wltea/analyzer/core/QuickSortSet.java Normal file

@@ -0,0 +1,186 @@
/*
* IK Chinese word segmentation, version 7.0
* IK Analyzer release 7.0
* update by 高志成(magese@live.cn)
*/
package org.wltea.analyzer.core;
/**
 * Quick-sorted Lexeme set, specific to the IK segmenter
 */
class QuickSortSet {
//head of the linked list
private Cell head;
//tail of the linked list
private Cell tail;
//actual size of the linked list
private int size;
QuickSortSet(){
this.size = 0;
}
/**
 * Adds a lexeme to the set
 */
void addLexeme(Lexeme lexeme){
Cell newCell = new Cell(lexeme);
if(this.size == 0){
this.head = newCell;
this.tail = newCell;
this.size++;
}else{
/*if(this.tail.compareTo(newCell) == 0){//the lexeme equals the tail lexeme; do not add it
}else */if(this.tail.compareTo(newCell) < 0){//append the lexeme at the tail
this.tail.next = newCell;
newCell.prev = this.tail;
this.tail = newCell;
this.size++;
}else if(this.head.compareTo(newCell) > 0){//prepend the lexeme at the head
this.head.prev = newCell;
newCell.next = this.head;
this.head = newCell;
this.size++;
}else{
//walk backwards from the tail
Cell index = this.tail;
while(index != null && index.compareTo(newCell) > 0){
index = index.prev;
}
/*if(index.compareTo(newCell) == 0){//the lexeme duplicates one already in the set; do not add it
}else */if((index != null ? index.compareTo(newCell) : 1) < 0){//insert the lexeme somewhere inside the list
newCell.prev = index;
newCell.next = index.next;
index.next.prev = newCell;
index.next = newCell;
this.size++;
}
}
}
}
/**
 * Returns the head element of the list
 */
Lexeme peekFirst(){
if(this.head != null){
return this.head.lexeme;
}
return null;
}
/**
 * Removes and returns the first element of the list
 * @return Lexeme
 */
Lexeme pollFirst(){
if(this.size == 1){
Lexeme first = this.head.lexeme;
this.head = null;
this.tail = null;
this.size--;
return first;
}else if(this.size > 1){
Lexeme first = this.head.lexeme;
this.head = this.head.next;
this.size --;
return first;
}else{
return null;
}
}
/**
 * Returns the tail element of the list
 */
Lexeme peekLast(){
if(this.tail != null){
return this.tail.lexeme;
}
return null;
}
/**
 * Removes and returns the last element of the list
 * @return Lexeme
 */
Lexeme pollLast(){
if(this.size == 1){
Lexeme last = this.head.lexeme;
this.head = null;
this.tail = null;
this.size--;
return last;
}else if(this.size > 1){
Lexeme last = this.tail.lexeme;
this.tail = this.tail.prev;
this.size--;
return last;
}else{
return null;
}
}
/**
 * Returns the size of the set
 */
int size(){
return this.size;
}
/**
 * Checks whether the set is empty
 */
boolean isEmpty(){
return this.size == 0;
}
/**
 * Returns the head of the lexeme chain
 */
Cell getHead(){
return this.head;
}
/*
 * IK Chinese word segmentation, version 7.0
 * IK Analyzer release 7.0
 * update by 高志成(magese@live.cn)
 */
@SuppressWarnings("unused")
class Cell implements Comparable<Cell>{
private Cell prev;
private Cell next;
private Lexeme lexeme;
Cell(Lexeme lexeme){
if(lexeme == null){
throw new IllegalArgumentException("lexeme must not be null");
}
this.lexeme = lexeme;
}
public int compareTo(Cell o) {
return this.lexeme.compareTo(o.lexeme);
}
public Cell getPrev(){
return this.prev;
}
Cell getNext(){
return this.next;
}
public Lexeme getLexeme(){
return this.lexeme;
}
}
}

298
src/main/java/org/wltea/analyzer/dic/DictSegment.java Normal file

@@ -0,0 +1,298 @@
/*
* IK Chinese word segmentation, version 7.0
* IK Analyzer release 7.0
* update by 高志成(magese@live.cn)
*/
package org.wltea.analyzer.dic;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
/**
 * A dictionary trie segment, representing one branch of the dictionary trie
 */
@SuppressWarnings("unused")
class DictSegment implements Comparable<DictSegment> {
//shared character table, storing the Chinese characters
private static final Map<Character, Character> charMap = new HashMap<>(16, 0.95f);
//upper limit of the array storage
private static final int ARRAY_LENGTH_LIMIT = 3;
//map-based child storage
private Map<Character, DictSegment> childrenMap;
//array-based child storage
private DictSegment[] childrenArray;
//the character stored at this node
private Character nodeChar;
//number of child segments stored at this node:
//storeSize <= ARRAY_LENGTH_LIMIT uses array storage; storeSize > ARRAY_LENGTH_LIMIT uses map storage
private int storeSize = 0;
//state of this DictSegment: 0 by default; 1 means the path from the root to this node spells a word
private int nodeState = 0;
DictSegment(Character nodeChar) {
if (nodeChar == null) {
throw new IllegalArgumentException("the node character must not be null");
}
this.nodeChar = nodeChar;
}
Character getNodeChar() {
return nodeChar;
}
/*
 * Checks whether this node has children
 */
private boolean hasNextNode() {
return this.storeSize > 0;
}
/**
 * Matches a word segment
 *
 * @return Hit
 */
Hit match(char[] charArray) {
return this.match(charArray, 0, charArray.length, null);
}
/**
 * Matches a word segment
 *
 * @return Hit
 */
Hit match(char[] charArray, int begin, int length) {
return this.match(charArray, begin, length, null);
}
/**
 * Matches a word segment
 *
 * @return Hit
 */
Hit match(char[] charArray, int begin, int length, Hit searchHit) {
if (searchHit == null) {
//if the hit is null, create a new one
searchHit = new Hit();
//set the hit's start position in the text
searchHit.setBegin(begin);
} else {
//otherwise reset the hit's state
searchHit.setUnmatch();
}
//set the hit's current processing position
searchHit.setEnd(begin);
Character keyChar = charArray[begin];
DictSegment ds = null;
//copy the instance fields into locals to avoid synchronization issues with concurrent updates during lookup
DictSegment[] segmentArray = this.childrenArray;
Map<Character, DictSegment> segmentMap = this.childrenMap;
//STEP 1: look up the DictSegment for keyChar at this node
if (segmentArray != null) {
//search the array
DictSegment keySegment = new DictSegment(keyChar);
int position = Arrays.binarySearch(segmentArray, 0, this.storeSize, keySegment);
if (position >= 0) {
ds = segmentArray[position];
}
} else if (segmentMap != null) {
//search the map
ds = segmentMap.get(keyChar);
}
//STEP 2: if a DictSegment was found, check the match state and decide whether to recurse or return
if (ds != null) {
if (length > 1) {
//the word is not fully matched yet; keep searching
return ds.match(charArray, begin + 1, length - 1, searchHit);
} else if (length == 1) {
//matching the last char
if (ds.nodeState == 1) {
//mark the hit as a full match
searchHit.setMatch();
}
if (ds.hasNextNode()) {
//mark the hit as a prefix match
searchHit.setPrefix();
//remember the DictSegment at the current position
searchHit.setMatchedDictSegment(ds);
}
return searchHit;
}
}
//STEP 3: no DictSegment found; leave the hit as unmatch
return searchHit;
}
/**
 * Loads a word into the dictionary trie
 */
void fillSegment(char[] charArray) {
this.fillSegment(charArray, 0, charArray.length, 1);
}
/**
 * Disables a word in the dictionary
 */
void disableSegment(char[] charArray) {
this.fillSegment(charArray, 0, charArray.length, 0);
}
/**
 * Loads a word segment into the dictionary trie
 */
private synchronized void fillSegment(char[] charArray, int begin, int length, int enabled) {
//get the character object from the shared character table
Character beginChar = charArray[begin];
Character keyChar = charMap.get(beginChar);
//if the character is not in the table yet, add it
if (keyChar == null) {
charMap.put(beginChar, beginChar);
keyChar = beginChar;
}
//search this node's storage for the segment matching keyChar, creating it if absent
DictSegment ds = lookforSegment(keyChar, enabled);
if (ds != null) {
//process the segment for keyChar
if (length > 1) {
//the word is not fully inserted into the trie yet
ds.fillSegment(charArray, begin + 1, length - 1, enabled);
} else if (length == 1) {
//this is the last char of the word; set this node's state to enabled
//enabled=1 marks a complete word; enabled=0 disables the word in the dictionary
ds.nodeState = enabled;
}
}
}
/**
 * Finds the child segment for keyChar under this node
 *
 * @param create 1: create a new segment if none is found; 0: return null if none is found
 */
private DictSegment lookforSegment(Character keyChar, int create) {
DictSegment ds = null;
if (this.storeSize <= ARRAY_LENGTH_LIMIT) {
//get the array container, creating it if needed
DictSegment[] segmentArray = getChildrenArray();
//search the array
DictSegment keySegment = new DictSegment(keyChar);
int position = Arrays.binarySearch(segmentArray, 0, this.storeSize, keySegment);
if (position >= 0) {
ds = segmentArray[position];
}
//no matching segment found in the array
if (ds == null && create == 1) {
ds = keySegment;
if (this.storeSize < ARRAY_LENGTH_LIMIT) {
//the array still has room; keep using array storage
segmentArray[this.storeSize] = ds;
//segment count +1
this.storeSize++;
Arrays.sort(segmentArray, 0, this.storeSize);
} else {
//the array is full; switch to map storage
//get the map container, creating it if needed
Map<Character, DictSegment> segmentMap = getChildrenMap();
//migrate the segments from the array into the map
migrate(segmentArray, segmentMap);
//store the new segment
segmentMap.put(keyChar, ds);
//segment count +1; storeSize++ must run before the array is released so a reader never sees an empty container
this.storeSize++;
//release the array reference
this.childrenArray = null;
}
}
} else {
//get the map container, creating it if needed
Map<Character, DictSegment> segmentMap = getChildrenMap();
//search the map
ds = segmentMap.get(keyChar);
if (ds == null && create == 1) {
//construct a new segment
ds = new DictSegment(keyChar);
segmentMap.put(keyChar, ds);
//this node's segment count +1
this.storeSize++;
}
}
return ds;
}
/**
 * Gets the array container;
 * thread-safe
 */
private DictSegment[] getChildrenArray() {
if (this.childrenArray == null) {
synchronized (this) {
if (this.childrenArray == null) {
this.childrenArray = new DictSegment[ARRAY_LENGTH_LIMIT];
}
}
}
return this.childrenArray;
}
/**
 * Gets the map container;
 * thread-safe
 */
private Map<Character, DictSegment> getChildrenMap() {
if (this.childrenMap == null) {
synchronized (this) {
if (this.childrenMap == null) {
this.childrenMap = new HashMap<>(ARRAY_LENGTH_LIMIT * 2, 0.8f);
}
}
}
return this.childrenMap;
}
/**
 * Migrates the segments from the array into the map
 */
private void migrate(DictSegment[] segmentArray, Map<Character, DictSegment> segmentMap) {
for (DictSegment segment : segmentArray) {
if (segment != null) {
segmentMap.put(segment.nodeChar, segment);
}
}
}
/**
 * Implements Comparable
 *
 * @return int
 */
public int compareTo(DictSegment o) {
//compare the characters stored at the nodes
return this.nodeChar.compareTo(o.nodeChar);
}
}
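
A sketch of how the trie is filled and queried; DictSegment is package-private, so this assumes code inside org.wltea.analyzer.dic, and the words are arbitrary:

//hypothetical in-package demo of the dictionary trie
DictSegment root = new DictSegment((char) 0);
root.fillSegment("中国".toCharArray());
root.fillSegment("中国人".toCharArray());
Hit hit = root.match("中国人".toCharArray(), 0, 2);  //match the first two characters, "中国"
System.out.println(hit.isMatch());   //true: "中国" is a complete word
System.out.println(hit.isPrefix());  //true: it is also a prefix of "中国人"
hit = hit.getMatchedDictSegment().match("中国人".toCharArray(), 2, 1, hit);  //continue with the third character
System.out.println(hit.isMatch());   //true: "中国人" is a complete word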

355
src/main/java/org/wltea/analyzer/dic/Dictionary.java Normal file

@@ -0,0 +1,355 @@
/*
* IK Chinese word segmentation, version 7.0
* IK Analyzer release 7.0
* update by 高志成(magese@live.cn)
*/
package org.wltea.analyzer.dic;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Collection;
import java.util.List;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.cfg.DefaultConfig;
/**
 * Dictionary manager, implemented as a singleton
 */
@SuppressWarnings("unused")
public class Dictionary {
/*
 * Dictionary singleton instance
 */
private static Dictionary singleton;
/*
 * Main dictionary object
 */
private DictSegment _MainDict;
/*
 * Stop-word dictionary
 */
private DictSegment _StopWordDict;
/*
 * Quantifier dictionary
 */
private DictSegment _QuantifierDict;
/**
 * Configuration object
 */
private Configuration cfg;
private Dictionary(Configuration cfg) {
this.cfg = cfg;
this.loadMainDict();
this.loadStopWordDict();
this.loadQuantifierDict();
}
/**
 * Initializes the dictionary.
 * IK Analyzer initializes its dictionaries through this static method of the Dictionary class,
 * and the dictionaries are only loaded when the Dictionary class is actually used,
 * which prolongs the first segmentation call.
 * This method lets an application initialize the dictionaries during its loading phase instead
 */
public static void initial(Configuration cfg) {
if (singleton == null) {
synchronized (Dictionary.class) {
if (singleton == null) {
singleton = new Dictionary(cfg);
}
}
}
}
/**
 * Gets the dictionary singleton
 *
 * @return Dictionary singleton instance
 */
public static Dictionary getSingleton() {
if (singleton == null) {
throw new IllegalStateException("the dictionary has not been initialized; call the initial method first");
}
return singleton;
}
/**
 * Reloads the dictionary.
 * Stop words rarely change and adding them is discouraged, so only the dynamically extended word dictionary is updated here
 */
public static void reloadDic(List<InputStream> inputStreamList) {
if (singleton == null) {
Configuration cfg = DefaultConfig.getInstance();
initial(cfg);
}
for (InputStream is : inputStreamList) {
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
String theWord;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
singleton._MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
}
} while (theWord != null);
} catch (IOException ioe) {
System.err.println("Other Dictionary loading exception.");
ioe.printStackTrace();
} finally {
try {
if (is != null) {
is.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
/**
 * Loads new words in batch
 *
 * @param words Collection<String> the word list
 */
public void addWords(Collection<String> words) {
if (words != null) {
for (String word : words) {
if (word != null) {
//batch-load the words into the in-memory main dictionary
singleton._MainDict.fillSegment(word.trim().toLowerCase().toCharArray());
}
}
}
}
/**
 * Disables (masks) words in batch
 */
public void disableWords(Collection<String> words) {
if (words != null) {
for (String word : words) {
if (word != null) {
//batch-disable the words
singleton._MainDict.disableSegment(word.trim().toLowerCase().toCharArray());
}
}
}
}
/**
 * Matches against the main dictionary
 *
 * @return Hit match result
 */
public Hit matchInMainDict(char[] charArray) {
return singleton._MainDict.match(charArray);
}
/**
 * Matches against the main dictionary
 *
 * @return Hit match result
 */
public Hit matchInMainDict(char[] charArray, int begin, int length) {
return singleton._MainDict.match(charArray, begin, length);
}
/**
 * Matches against the quantifier dictionary
 *
 * @return Hit match result
 */
public Hit matchInQuantifierDict(char[] charArray, int begin, int length) {
return singleton._QuantifierDict.match(charArray, begin, length);
}
/**
 * Continues matching downward from a matched Hit's DictSegment
 *
 * @return Hit
 */
public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) {
DictSegment ds = matchedHit.getMatchedDictSegment();
return ds.match(charArray, currentIndex, 1, matchedHit);
}
/**
 * Checks whether the given characters form a stop word
 *
 * @return boolean
 */
public boolean isStopWord(char[] charArray, int begin, int length) {
return singleton._StopWordDict.match(charArray, begin, length).isMatch();
}
/**
 * Loads the main dictionary and the extension dictionaries
 */
private void loadMainDict() {
//create the main dictionary instance
_MainDict = new DictSegment((char) 0);
//read the main dictionary file
InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getMainDictionary());
if (is == null) {
throw new RuntimeException("Main Dictionary not found!!!");
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
String theWord;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
_MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
}
} while (theWord != null);
} catch (IOException ioe) {
System.err.println("Main Dictionary loading exception.");
ioe.printStackTrace();
} finally {
try {
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
//load the extension dictionaries
this.loadExtDict();
}
/**
 * Loads the user-configured extension dictionaries into the main dictionary
 */
private void loadExtDict() {
//load the extension dictionary configuration
List<String> extDictFiles = cfg.getExtDictionarys();
if (extDictFiles != null) {
InputStream is;
for (String extDictName : extDictFiles) {
//read the extension dictionary file
System.out.println("loading extension dictionary: " + extDictName);
is = this.getClass().getClassLoader().getResourceAsStream(extDictName);
//ignore missing extension dictionaries
if (is == null) {
continue;
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
String theWord;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
//load the extension dictionary entries into the in-memory main dictionary
//System.out.println(theWord);
_MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
}
} while (theWord != null);
} catch (IOException ioe) {
System.err.println("Extension Dictionary loading exception.");
ioe.printStackTrace();
} finally {
try {
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
}
/**
 * Loads the user-extended stop-word dictionaries
 */
private void loadStopWordDict() {
//create the stop-word dictionary instance
_StopWordDict = new DictSegment((char) 0);
//load the extension stop-word dictionaries
List<String> extStopWordDictFiles = cfg.getExtStopWordDictionarys();
if (extStopWordDictFiles != null) {
InputStream is;
for (String extStopWordDictName : extStopWordDictFiles) {
System.out.println("loading extension stop-word dictionary: " + extStopWordDictName);
//read the extension dictionary file
is = this.getClass().getClassLoader().getResourceAsStream(extStopWordDictName);
//ignore missing extension dictionaries
if (is == null) {
continue;
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
String theWord;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
//System.out.println(theWord);
//load the extension stop-word entries into the in-memory dictionary
_StopWordDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
}
} while (theWord != null);
} catch (IOException ioe) {
System.err.println("Extension Stop word Dictionary loading exception.");
ioe.printStackTrace();
} finally {
try {
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
}
/**
 * Loads the quantifier dictionary
 */
private void loadQuantifierDict() {
//create the quantifier dictionary instance
_QuantifierDict = new DictSegment((char) 0);
//read the quantifier dictionary file
InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getQuantifierDicionary());
if (is == null) {
throw new RuntimeException("Quantifier Dictionary not found!!!");
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
String theWord;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
_QuantifierDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
}
} while (theWord != null);
} catch (IOException ioe) {
System.err.println("Quantifier Dictionary loading exception.");
ioe.printStackTrace();
} finally {
try {
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
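
A short sketch of eager initialization and runtime word management through the public API above (the word lists are illustrative):

import java.util.Arrays;
import org.wltea.analyzer.cfg.DefaultConfig;
import org.wltea.analyzer.dic.Dictionary;

//hypothetical demo: load the dictionaries at application startup, then manage words at runtime
public class DictionaryDemo {
public static void main(String[] args) {
//load all dictionaries up front instead of on the first segmentation call
Dictionary.initial(DefaultConfig.getInstance());
Dictionary dict = Dictionary.getSingleton();
//register an application-specific word
dict.addWords(Arrays.asList("云计算平台"));
//hide an existing word from the main dictionary
dict.disableWords(Arrays.asList("的"));
System.out.println(dict.matchInMainDict("云计算平台".toCharArray(), 0, 5).isMatch());  //true
}
}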

98
src/main/java/org/wltea/analyzer/dic/Hit.java Normal file

@@ -0,0 +1,98 @@
/*
* IK Chinese word segmentation, version 7.0
* IK Analyzer release 7.0
* update by 高志成(magese@live.cn)
*/
package org.wltea.analyzer.dic;
/**
 * Represents one dictionary-match hit
 */
@SuppressWarnings("unused")
public class Hit {
//the hit does not match
private static final int UNMATCH = 0x00000000;
//the hit is a full match
private static final int MATCH = 0x00000001;
//the hit is a prefix match
private static final int PREFIX = 0x00000010;
//current state of this hit, unmatched by default
private int hitState = UNMATCH;
//the dictionary branch node reached so far during matching
private DictSegment matchedDictSegment;
/*
 * Start position of the word segment
 */
private int begin;
/*
 * End position of the word segment
 */
private int end;
/**
 * Checks for a full match
 */
public boolean isMatch() {
return (this.hitState & MATCH) > 0;
}
/**
 *
 */
void setMatch() {
this.hitState = this.hitState | MATCH;
}
/**
 * Checks whether this is a word prefix
 */
public boolean isPrefix() {
return (this.hitState & PREFIX) > 0;
}
/**
 *
 */
void setPrefix() {
this.hitState = this.hitState | PREFIX;
}
/**
 * Checks for no match
 */
public boolean isUnmatch() {
return this.hitState == UNMATCH;
}
/**
*
*/
void setUnmatch() {
this.hitState = UNMATCH;
}
DictSegment getMatchedDictSegment() {
return matchedDictSegment;
}
void setMatchedDictSegment(DictSegment matchedDictSegment) {
this.matchedDictSegment = matchedDictSegment;
}
public int getBegin() {
return begin;
}
void setBegin(int begin) {
this.begin = begin;
}
public int getEnd() {
return end;
}
void setEnd(int end) {
this.end = end;
}
}
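Since MATCH (0x00000001) and PREFIX (0x00000010) occupy separate bits, a hit can simultaneously be a full dictionary match and the prefix of a longer word, which is what lets the segmenter keep extending a match. setUnmatch() clears both flags at once. A minimal sketch of the flag semantics (assuming same-package access, since the setters are package-private):

Hit hit = new Hit();
hit.setMatch();
hit.setPrefix();
// Both checks pass because the two flags are independent bits
assert hit.isMatch() && hit.isPrefix();
hit.setUnmatch();
// hitState is back to 0x00000000
assert hit.isUnmatch();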

File diff suppressed because it is too large

View File

@ -0,0 +1,316 @@
世纪
位数
像素
克拉
公亩
公克
公分
公升
公尺
公担
公斤
公里
公顷
分钟
分米
加仑
千克
千米
厘米
周年
小时
平方
平方公尺
平方公里
平方分米
平方厘米
平方码
平方米
平方英寸
平方英尺
平方英里
平米
年代
年级
月份
毫升
毫米
毫克
海里
点钟
盎司
秒钟
立方公尺
立方分米
立方厘米
立方码
立方米
立方英寸
立方英尺
英亩
英寸
英尺
英里
阶段

View File

@ -0,0 +1,55 @@
/*
* IK 中文分词 版本 7.0
* IK Analyzer release 7.0
* update by 高志成(magese@live.cn)
*/
package org.wltea.analyzer.lucene;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
/**
* IK analyzer implementation of the Lucene Analyzer interface
*/
@SuppressWarnings("unused")
public final class IKAnalyzer extends Analyzer {
private boolean useSmart;
private boolean useSmart() {
return useSmart;
}
public void setUseSmart(boolean useSmart) {
this.useSmart = useSmart;
}
/**
* IK analyzer implementation of the Lucene Analyzer interface
*
* Uses the fine-grained segmentation algorithm by default
*/
public IKAnalyzer() {
this(false);
}
/**
* IK analyzer implementation of the Lucene Analyzer interface
*
* @param useSmart when true, the analyzer uses smart segmentation
*/
public IKAnalyzer(boolean useSmart) {
super();
this.useSmart = useSmart;
}
/**
* Overrides Analyzer#createComponents to build the tokenization components
*/
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer _IKTokenizer = new IKTokenizer(this.useSmart());
return new TokenStreamComponents(_IKTokenizer);
}
}

View File

@ -0,0 +1,108 @@
/*
* IK 中文分词 版本 7.0
* IK Analyzer release 7.0
* update by 高志成(magese@live.cn)
*/
package org.wltea.analyzer.lucene;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeFactory;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import java.io.IOException;
/**
* IK tokenizer: Lucene Tokenizer adapter class
* Compatible with Lucene 4.0 and later
*/
@SuppressWarnings("unused")
public final class IKTokenizer extends Tokenizer {
//IK segmenter implementation
private IKSegmenter _IKImplement;
//Token text attribute
private CharTermAttribute termAtt;
//Token offset attribute
private OffsetAttribute offsetAtt;
//Token type attribute; see the type constants in org.wltea.analyzer.core.Lexeme
private TypeAttribute typeAtt;
//End position of the last token
private int endPosition;
/**
* Lucene Tokenizer adapter constructors
*/
public IKTokenizer() {
this(false);
}
IKTokenizer(boolean useSmart) {
super();
init(useSmart);
}
public IKTokenizer(AttributeFactory factory) {
this(factory, false);
}
IKTokenizer(AttributeFactory factory, boolean useSmart) {
super(factory);
init(useSmart);
}
private void init(boolean useSmart) {
offsetAtt = addAttribute(OffsetAttribute.class);
termAtt = addAttribute(CharTermAttribute.class);
typeAtt = addAttribute(TypeAttribute.class);
_IKImplement = new IKSegmenter(input, useSmart);
}
/* (non-Javadoc)
* @see org.apache.lucene.analysis.TokenStream#incrementToken()
*/
@Override
public boolean incrementToken() throws IOException {
//Clear all token attributes
clearAttributes();
Lexeme nextLexeme = _IKImplement.next();
if (nextLexeme != null) {
//Convert the Lexeme into Lucene attributes
//Set the token text
termAtt.append(nextLexeme.getLexemeText());
//Set the token length
termAtt.setLength(nextLexeme.getLength());
//Set the token offsets
offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
//Record the end position of this token
endPosition = nextLexeme.getEndPosition();
//Record the token type
typeAtt.setType(nextLexeme.getLexemeTypeString());
//Return true to signal that more tokens may follow
return true;
}
//Return false to signal that token output is finished
return false;
}
/*
* (non-Javadoc)
* @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
*/
@Override
public void reset() throws IOException {
super.reset();
_IKImplement.reset(input);
}
@Override
public final void end() {
// set final offset
int finalOffset = correctOffset(this.endPosition);
offsetAtt.setOffset(finalOffset, finalOffset);
}
}

View File

@ -0,0 +1,116 @@
/*
* IK 中文分词 版本 7.0
* IK Analyzer release 7.0
* update by 高志成(magese@live.cn)
*/
package org.wltea.analyzer.lucene;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
import org.wltea.analyzer.dic.Dictionary;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;
/**
* @author <a href="mailto:magese@live.cn">Magese</a>
*/
@SuppressWarnings("unchecked")
public class IKTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware, UpdateKeeper.UpdateJob {
private boolean useSmart;
private ResourceLoader loader;
private long lastUpdateTime = -1L;
private String conf = "ik.conf";
public IKTokenizerFactory(Map<String, String> args) {
super(args);
String useSmartArg = args.get("useSmart");
this.setUseSmart(Boolean.parseBoolean(useSmartArg));
}
@Override
public Tokenizer create(AttributeFactory factory) {
return new IKTokenizer(factory, useSmart());
}
@Override
public void inform(ResourceLoader resourceLoader) throws IOException {
System.out.println(String.format(":::ik:::inform:::::::::::::::::::::::: %s", this.conf));
this.loader = resourceLoader;
update();
if ((this.conf != null) && (!this.conf.trim().isEmpty())) {
UpdateKeeper.getInstance().register(this);
}
}
@Override
public void update() throws IOException {
Properties p = canUpdate();
if (p != null) {
List<String> dicPaths = SplitFileNames(p.getProperty("files"));
List inputStreamList = new ArrayList();
for (String path : dicPaths) {
if ((path != null) && (!path.isEmpty())) {
InputStream is = this.loader.openResource(path);
if (is != null) {
inputStreamList.add(is);
}
}
}
if (!inputStreamList.isEmpty())
Dictionary.reloadDic(inputStreamList);
}
}
/**
* 检查是否要更新
*/
private Properties canUpdate() {
try {
if (this.conf == null)
return null;
Properties p = new Properties();
InputStream confStream = this.loader.openResource(this.conf);
p.load(confStream);
confStream.close();
String lastupdate = p.getProperty("lastupdate", "0");
Long t = new Long(lastupdate);
if (t > this.lastUpdateTime) {
this.lastUpdateTime = t;
String paths = p.getProperty("files");
if ((paths == null) || (paths.trim().isEmpty()))
return null;
System.out.println("loading conf files success.");
return p;
}
this.lastUpdateTime = t;
return null;
} catch (Exception e) {
System.err.println("IK parsing conf NullPointerException~~~~~" + Arrays.toString(e.getStackTrace()));
}
return null;
}
private static List<String> SplitFileNames(String fileNames) {
if (fileNames == null) {
return Collections.emptyList();
}
List result = new ArrayList();
Collections.addAll(result, fileNames.split("[,\\s]+"));
return result;
}
private boolean useSmart() {
return useSmart;
}
private void setUseSmart(boolean useSmart) {
this.useSmart = useSmart;
}
}
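For context, this factory is the class a Solr schema points at. The snippet below is a typical fieldType declaration; the field type name is a placeholder, not part of this commit, while the useSmart attribute is the one parsed in the constructor above:

<fieldType name="text_ik" class="solr.TextField">
  <analyzer type="index">
    <tokenizer class="org.wltea.analyzer.lucene.IKTokenizerFactory" useSmart="false"/>
  </analyzer>
  <analyzer type="query">
    <tokenizer class="org.wltea.analyzer.lucene.IKTokenizerFactory" useSmart="true"/>
  </analyzer>
</fieldType>

canUpdate() reads ik.conf as a java.util.Properties file with two keys; raising lastupdate to any larger value makes the background thread reload the dictionaries listed under files (the dictionary name below is illustrative):

lastupdate=1
files=dynamicdic.txt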

View File

@ -0,0 +1,65 @@
/*
* IK 中文分词 版本 7.0
* IK Analyzer release 7.0
* update by 高志成(magese@live.cn)
*/
package org.wltea.analyzer.lucene;
import java.io.IOException;
import java.util.Vector;
public class UpdateKeeper implements Runnable {
private static final long INTERVAL = 60000L;
private static volatile UpdateKeeper singleton;
private Vector<UpdateJob> filterFactories;
private UpdateKeeper() {
this.filterFactories = new Vector<>();
Thread worker = new Thread(this);
worker.setDaemon(true);
worker.start();
}
static UpdateKeeper getInstance() {
if (singleton == null) {
synchronized (UpdateKeeper.class) {
if (singleton == null) {
singleton = new UpdateKeeper();
}
}
}
return singleton;
}
void register(UpdateJob filterFactory) {
this.filterFactories.add(filterFactory);
}
@Override
public void run() {
//noinspection InfiniteLoopStatement
while (true) {
//Poll once a minute and trigger every registered update job
try {
Thread.sleep(INTERVAL);
} catch (InterruptedException e) {
e.printStackTrace();
}
if (!this.filterFactories.isEmpty()) {
for (UpdateJob factory : this.filterFactories) {
try {
factory.update();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
}
public interface UpdateJob {
void update() throws IOException;
}
}
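UpdateKeeper runs a single daemon thread that wakes every 60 seconds (INTERVAL) and calls update() on each registered job; IKTokenizerFactory registers itself in inform(). Since UpdateJob has a single abstract method, any component in the same package (getInstance() and register() are package-private) could hook in with a lambda, as in this sketch:

UpdateKeeper.getInstance().register(() -> {
    // Re-check a watched resource; invoked at most once per minute
    System.out.println("periodic dictionary check");
});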

View File

@ -0,0 +1,683 @@
/*
* IK 中文分词 版本 7.0
* IK Analyzer release 7.0
* update by 高志成(magese@live.cn)
*/
package org.wltea.analyzer.query;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.util.BytesRef;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Stack;
/**
* IK simple query expression parser,
* combined with the SWMCQuery algorithm
* <p>
* Example expression:
* (id='1231231' && title:'monkey') || (content:'你好吗' || ulr='www.ik.com') - name:'helloword'
*
* @author linliangyi
*/
public class IKQueryExpressionParser {
private List<Element> elements = new ArrayList<>();
private Stack<Query> querys = new Stack<>();
private Stack<Element> operates = new Stack<>();
/**
* Parse the query expression and build a Lucene Query object
*
* @return Lucene query
*/
private Query parseExp(String expression) {
Query luceneQuery = null;
if (expression != null && !"".equals(expression.trim())) {
try {
//Lexical analysis
this.splitElements(expression);
//Syntax analysis
this.parseSyntax();
if (this.querys.size() == 1) {
luceneQuery = this.querys.pop();
} else {
throw new IllegalStateException("Expression error: missing logical operator or unmatched parenthesis");
}
} finally {
elements.clear();
querys.clear();
operates.clear();
}
}
return luceneQuery;
}
/**
* Lexical analysis of the expression
*/
private void splitElements(String expression) {
if (expression == null) {
return;
}
Element currentElement = null;
char[] expChars = expression.toCharArray();
for (char expChar : expChars) {
switch (expChar) {
case '&':
case '|':
//Logical operators: '&&' and '||' are two-character tokens
if (currentElement == null) {
currentElement = new Element();
currentElement.type = expChar;
currentElement.append(expChar);
} else if (currentElement.type == expChar) {
currentElement.append(expChar);
this.elements.add(currentElement);
currentElement = null;
} else if (currentElement.type == '\'') {
currentElement.append(expChar);
} else {
this.elements.add(currentElement);
currentElement = new Element();
currentElement.type = expChar;
currentElement.append(expChar);
}
break;
case '-':
case '(':
case ')':
case ':':
case '=':
case '[':
case ']':
case '{':
case '}':
case ',':
//Single-character tokens share one handler: flush any pending element,
//then emit the character as its own element
if (currentElement != null) {
if (currentElement.type == '\'') {
currentElement.append(expChar);
continue;
}
this.elements.add(currentElement);
}
currentElement = new Element();
currentElement.type = expChar;
currentElement.append(expChar);
this.elements.add(currentElement);
currentElement = null;
break;
case ' ':
//Whitespace ends the current token unless it is inside quotes
if (currentElement != null) {
if (currentElement.type == '\'') {
currentElement.append(expChar);
} else {
this.elements.add(currentElement);
currentElement = null;
}
}
break;
case '\'':
//Quotes open and close quoted values; the quote characters themselves
//are not stored in the element text
if (currentElement == null) {
currentElement = new Element();
currentElement.type = '\'';
} else if (currentElement.type == '\'') {
this.elements.add(currentElement);
currentElement = null;
} else {
this.elements.add(currentElement);
currentElement = new Element();
currentElement.type = '\'';
}
break;
default:
//Field names and other literal text
if (currentElement == null) {
currentElement = new Element();
currentElement.type = 'F';
currentElement.append(expChar);
} else if (currentElement.type == 'F' || currentElement.type == '\'') {
currentElement.append(expChar);
} else {
this.elements.add(currentElement);
currentElement = new Element();
currentElement.type = 'F';
currentElement.append(expChar);
}
}
}
if (currentElement != null) {
this.elements.add(currentElement);
}
}
/**
* Syntax analysis
*/
private void parseSyntax() {
for (int i = 0; i < this.elements.size(); i++) {
Element e = this.elements.get(i);
if ('F' == e.type) {
Element e2 = this.elements.get(i + 1);
if ('=' != e2.type && ':' != e2.type) {
throw new IllegalStateException("Expression error: missing '=' or ':'");
}
Element e3 = this.elements.get(i + 2);
//Handle the '=' and ':' operators
if ('\'' == e3.type) {
i += 2;
if ('=' == e2.type) {
TermQuery tQuery = new TermQuery(new Term(e.toString(), e3.toString()));
this.querys.push(tQuery);
} else {
String keyword = e3.toString();
//SWMCQuery Here
Query _SWMCQuery = SWMCQueryBuilder.create(e.toString(), keyword);
this.querys.push(_SWMCQuery);
}
} else if ('[' == e3.type || '{' == e3.type) {
i += 2;
//Handle the '[' and '{' range brackets
LinkedList<Element> eQueue = new LinkedList<>();
eQueue.add(e3);
for (i++; i < this.elements.size(); i++) {
Element eN = this.elements.get(i);
eQueue.add(eN);
if (']' == eN.type || '}' == eN.type) {
break;
}
}
//Translate into a RangeQuery
Query rangeQuery = this.toTermRangeQuery(e, eQueue);
this.querys.push(rangeQuery);
} else {
throw new IllegalStateException("Expression error: missing match value");
}
} else if ('(' == e.type) {
this.operates.push(e);
} else if (')' == e.type) {
boolean doPop = true;
while (doPop && !this.operates.empty()) {
Element op = this.operates.pop();
if ('(' == op.type) {
doPop = false;
} else {
Query q = toBooleanQuery(op);
this.querys.push(q);
}
}
} else {
if (this.operates.isEmpty()) {
this.operates.push(e);
} else {
boolean doPeek = true;
while (doPeek && !this.operates.isEmpty()) {
Element eleOnTop = this.operates.peek();
if ('(' == eleOnTop.type) {
doPeek = false;
this.operates.push(e);
} else if (compare(e, eleOnTop) == 1) {
this.operates.push(e);
doPeek = false;
} else {
//Equal or lower precedence: reduce the operator on top of the stack
Query q = toBooleanQuery(eleOnTop);
this.operates.pop();
this.querys.push(q);
}
}
if (doPeek && this.operates.empty()) {
this.operates.push(e);
}
}
}
}
while (!this.operates.isEmpty()) {
Element eleOnTop = this.operates.pop();
Query q = toBooleanQuery(eleOnTop);
this.querys.push(q);
}
}
/**
* Build a BooleanQuery for the given logical operator
*/
private Query toBooleanQuery(Element op) {
if (this.querys.size() == 0) {
return null;
}
BooleanQuery.Builder resultQuery = new BooleanQuery.Builder();
if (this.querys.size() == 1) {
return this.querys.get(0);
}
Query q2 = this.querys.pop();
Query q1 = this.querys.pop();
if ('&' == op.type) {
if (q1 != null) {
if (q1 instanceof BooleanQuery) {
List<BooleanClause> clauses = ((BooleanQuery) q1).clauses();
if (clauses.size() > 0
&& clauses.get(0).getOccur() == Occur.MUST) {
for (BooleanClause c : clauses) {
resultQuery.add(c);
}
} else {
resultQuery.add(q1, Occur.MUST);
}
} else {
//q1 instanceof TermQuery
//q1 instanceof TermRangeQuery
//q1 instanceof PhraseQuery
//others
resultQuery.add(q1, Occur.MUST);
}
}
if (q2 != null) {
if (q2 instanceof BooleanQuery) {
List<BooleanClause> clauses = ((BooleanQuery) q2).clauses();
if (clauses.size() > 0
&& clauses.get(0).getOccur() == Occur.MUST) {
for (BooleanClause c : clauses) {
resultQuery.add(c);
}
} else {
resultQuery.add(q2, Occur.MUST);
}
} else {
//q2 instanceof TermQuery
//q2 instanceof TermRangeQuery
//q2 instanceof PhraseQuery
//others
resultQuery.add(q2, Occur.MUST);
}
}
} else if ('|' == op.type) {
if (q1 != null) {
if (q1 instanceof BooleanQuery) {
List<BooleanClause> clauses = ((BooleanQuery) q1).clauses();
if (clauses.size() > 0
&& clauses.get(0).getOccur() == Occur.SHOULD) {
for (BooleanClause c : clauses) {
resultQuery.add(c);
}
} else {
resultQuery.add(q1, Occur.SHOULD);
}
} else {
//q1 instanceof TermQuery
//q1 instanceof TermRangeQuery
//q1 instanceof PhraseQuery
//others
resultQuery.add(q1, Occur.SHOULD);
}
}
if (q2 != null) {
if (q2 instanceof BooleanQuery) {
List<BooleanClause> clauses = ((BooleanQuery) q2).clauses();
if (clauses.size() > 0
&& clauses.get(0).getOccur() == Occur.SHOULD) {
for (BooleanClause c : clauses) {
resultQuery.add(c);
}
} else {
resultQuery.add(q2, Occur.SHOULD);
}
} else {
//q2 instanceof TermQuery
//q2 instanceof TermRangeQuery
//q2 instanceof PhraseQuery
//others
resultQuery.add(q2, Occur.SHOULD);
}
}
} else if ('-' == op.type) {
if (q1 == null || q2 == null) {
throw new IllegalStateException("Expression error: sub-query count mismatch");
}
if (q1 instanceof BooleanQuery) {
List<BooleanClause> clauses = ((BooleanQuery) q1).clauses();
if (clauses.size() > 0) {
for (BooleanClause c : clauses) {
resultQuery.add(c);
}
} else {
resultQuery.add(q1, Occur.MUST);
}
} else {
//q1 instanceof TermQuery
//q1 instanceof TermRangeQuery
//q1 instanceof PhraseQuery
//others
resultQuery.add(q1, Occur.MUST);
}
resultQuery.add(q2, Occur.MUST_NOT);
}
return resultQuery.build();
}
/**
* Assemble a TermRangeQuery
*/
private TermRangeQuery toTermRangeQuery(Element fieldNameEle, LinkedList<Element> elements) {
boolean includeFirst;
boolean includeLast;
String firstValue;
String lastValue = null;
//Check whether the first element is '[' or '{'
Element first = elements.getFirst();
if ('[' == first.type) {
includeFirst = true;
} else if ('{' == first.type) {
includeFirst = false;
} else {
throw new IllegalStateException("Expression error: RangeQuery must open with '[' or '{'");
}
//Check whether the last element is ']' or '}'
Element last = elements.getLast();
if (']' == last.type) {
includeLast = true;
} else if ('}' == last.type) {
includeLast = false;
} else {
throw new IllegalStateException("Expression error: RangeQuery is missing its closing bracket");
}
if (elements.size() < 4 || elements.size() > 5) {
throw new IllegalStateException("Expression error: malformed RangeQuery");
}
//Read the middle part
Element e2 = elements.get(1);
if ('\'' == e2.type) {
firstValue = e2.toString();
//
Element e3 = elements.get(2);
if (',' != e3.type) {
throw new IllegalStateException("Expression error: RangeQuery is missing its comma separator");
}
//
Element e4 = elements.get(3);
if ('\'' == e4.type) {
lastValue = e4.toString();
} else if (e4 != last) {
throw new IllegalStateException("Expression error: malformed RangeQuery");
}
} else if (',' == e2.type) {
firstValue = null;
//
Element e3 = elements.get(2);
if ('\'' == e3.type) {
lastValue = e3.toString();
} else {
throw new IllegalStateException("Expression error: malformed RangeQuery");
}
} else {
throw new IllegalStateException("Expression error: malformed RangeQuery");
}
//A null bound becomes an open-ended range; TermRangeQuery accepts a null BytesRef
return new TermRangeQuery(fieldNameEle.toString(),
firstValue == null ? null : new BytesRef(firstValue),
lastValue == null ? null : new BytesRef(lastValue),
includeFirst, includeLast);
}
/**
* Compare operator precedence
*/
private int compare(Element e1, Element e2) {
if ('&' == e1.type) {
if ('&' == e2.type) {
return 0;
} else {
return 1;
}
} else if ('|' == e1.type) {
if ('&' == e2.type) {
return -1;
} else if ('|' == e2.type) {
return 0;
} else {
return 1;
}
} else {
if ('-' == e2.type) {
return 0;
} else {
return -1;
}
}
}
/**
* Expression element: an operator, a field name, or a field value
*
* @author linliangyi
* May 20, 2010
*/
private static class Element {
char type = 0;
StringBuffer eleTextBuff;
Element() {
eleTextBuff = new StringBuffer();
}
void append(char c) {
this.eleTextBuff.append(c);
}
public String toString() {
return this.eleTextBuff.toString();
}
}
public static void main(String[] args) {
IKQueryExpressionParser parser = new IKQueryExpressionParser();
//String ikQueryExp = "newsTitle:'的两款《魔兽世界》插件Bigfoot和月光宝盒'";
String ikQueryExp = "(id='ABcdRf' && date:{'20010101','20110101'} && keyword:'魔兽中国') || (content:'KSHT-KSH-A001-18' || ulr='www.ik.com') - name:'林良益'";
Query result = parser.parseExp(ikQueryExp);
System.out.println(result);
}
}
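One consequence of compare() worth spelling out: '&&' gets the highest precedence, '||' the middle, and '-' the lowest, so the operator-stack loop in parseSyntax() reduces tighter operators first. Tracing a hypothetical expression:

a:'1' || b:'2' && c:'3' - d:'4'

is grouped as

(a:'1' || (b:'2' && c:'3')) - d:'4'

with the '-' clause becoming a MUST_NOT in toBooleanQuery().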

View File

@ -0,0 +1,120 @@
/*
* IK 中文分词 版本 7.0
* IK Analyzer release 7.0
* update by 高志成(magese@live.cn)
*/
package org.wltea.analyzer.query;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
/**
* Single Word Multi Char Query Builder,
* dedicated to the IK segmentation algorithm
*
* @author linliangyi
*/
class SWMCQueryBuilder {
/**
* Build a SWMCQuery
*
* @return Lucene Query
*/
static Query create(String fieldName, String keywords) {
if (fieldName == null || keywords == null) {
throw new IllegalArgumentException("fieldName and keywords must not be null");
}
//1. Tokenize the keywords
List<Lexeme> lexemes = doAnalyze(keywords);
//2. Build the SWMCQuery from the tokenization result
return getSWMCQuery(fieldName, lexemes);
}
/**
* Tokenize the keywords and return the lexeme list
*/
private static List<Lexeme> doAnalyze(String keywords) {
List<Lexeme> lexemes = new ArrayList<>();
IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords), true);
try {
Lexeme l;
while ((l = ikSeg.next()) != null) {
lexemes.add(l);
}
} catch (IOException e) {
e.printStackTrace();
}
return lexemes;
}
/**
* Build the SWMC query from the tokenization result
*/
private static Query getSWMCQuery(String fieldName, List<Lexeme> lexemes) {
//Buffer for the full SWMC query expression
StringBuilder keywordBuffer = new StringBuilder();
//Buffer for the condensed SWMC query expression
StringBuilder keywordBuffer_Short = new StringBuilder();
//Length of the previous lexeme
int lastLexemeLength = 0;
//End position of the previous lexeme
int lastLexemeEnd = -1;
int shortCount = 0;
int totalCount = 0;
for (Lexeme l : lexemes) {
totalCount += l.getLength();
//The condensed expression keeps only multi-character lexemes
if (l.getLength() > 1) {
keywordBuffer_Short.append(' ').append(l.getLexemeText());
shortCount += l.getLength();
}
if (lastLexemeLength == 0) {
keywordBuffer.append(l.getLexemeText());
} else if (lastLexemeLength == 1 && l.getLength() == 1
&& lastLexemeEnd == l.getBeginPosition()) {//merge adjacent single-character lexemes
keywordBuffer.append(l.getLexemeText());
} else {
keywordBuffer.append(' ').append(l.getLexemeText());
}
lastLexemeLength = l.getLength();
lastLexemeEnd = l.getEndPosition();
}
//Use the Lucene QueryParser to build the SWMC Query
QueryParser qp = new QueryParser(fieldName, new StandardAnalyzer());
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
qp.setAutoGeneratePhraseQueries(true);
if ((shortCount * 1.0f / totalCount) > 0.5f) {
try {
return qp.parse(keywordBuffer_Short.toString());
} catch (ParseException e) {
e.printStackTrace();
}
} else {
if (keywordBuffer.length() > 0) {
try {
return qp.parse(keywordBuffer.toString());
} catch (ParseException e) {
e.printStackTrace();
}
}
}
return null;
}
}
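The heuristic in getSWMCQuery(): shortCount sums the characters covered by multi-character lexemes and totalCount all lexeme characters, so when more than half the input belongs to multi-character words, the condensed expression (single characters dropped) is parsed instead of the full one. For example, an input split into one 4-character word plus two single characters gives 4/6 > 0.5, so only the 4-character word survives. A usage sketch (create() is package-private, so this assumes same-package code):

Query q = SWMCQueryBuilder.create("content", "中文分词工具包");
System.out.println(q);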

View File

@ -0,0 +1,64 @@
/*
* IK 中文分词 版本 7.0
* IK Analyzer release 7.0
* update by 高志成(magese@live.cn)
*/
package org.wltea.analyzer.sample;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;
import java.io.IOException;
import java.io.StringReader;
/**
* Demo of tokenizing with IKAnalyzer
* 2012-10-22
*/
public class IKAnalzyerDemo {
public static void main(String[] args) {
//Build the IK analyzer in smart segmentation mode
Analyzer analyzer = new IKAnalyzer(true);
//Obtain a Lucene TokenStream
TokenStream ts = null;
try {
ts = analyzer.tokenStream("myfield", new StringReader("这是一个中文分词的例子你可以直接运行它IKAnalyer can analysis english text too"));
//Token offset attribute
OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
//Token text attribute
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
//Token type attribute
TypeAttribute type = ts.addAttribute(TypeAttribute.class);
//Reset the TokenStream (and the underlying StringReader)
ts.reset();
//Iterate over the tokenization result
while (ts.incrementToken()) {
System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type());
}
//End the TokenStream (and the underlying StringReader)
ts.end(); // Perform end-of-stream operations, e.g. set the final offset.
} catch (IOException e) {
e.printStackTrace();
} finally {
//Release all TokenStream resources
if (ts != null) {
try {
ts.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
}

View File

@ -0,0 +1,115 @@
/*
* IK 中文分词 版本 7.0
* IK Analyzer release 7.0
* update by 高志成(magese@live.cn)
*/
package org.wltea.analyzer.sample;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;
import java.io.IOException;
/**
* Demo of Lucene indexing and search with IKAnalyzer
* 2012-3-2
* <p>
* Written against the Lucene 4.0+ API
*/
public class LuceneIndexAndSearchDemo {
/**
* Simulation:
* create a single-document index and search it
*
*/
public static void main(String[] args) {
//Lucene Document field name
String fieldName = "text";
//Content to index
String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";
//Instantiate the IK analyzer
Analyzer analyzer = new IKAnalyzer(true);
Directory directory = null;
IndexWriter iwriter;
IndexReader ireader = null;
IndexSearcher isearcher;
try {
//Build an in-memory index
directory = new RAMDirectory();
//Configure the IndexWriterConfig
IndexWriterConfig iwConfig = new IndexWriterConfig(analyzer);
iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
iwriter = new IndexWriter(directory, iwConfig);
//Write the index
Document doc = new Document();
doc.add(new StringField("ID", "10000", Field.Store.YES));
doc.add(new TextField(fieldName, text, Field.Store.YES));
iwriter.addDocument(doc);
iwriter.close();
//Search phase**********************************
//Instantiate the searcher
ireader = DirectoryReader.open(directory);
isearcher = new IndexSearcher(ireader);
String keyword = "中文分词工具包";
//Build a Query object with the QueryParser
QueryParser qp = new QueryParser(fieldName, analyzer);
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
Query query = qp.parse(keyword);
System.out.println("Query = " + query);
//Fetch the 5 highest-scoring hits
TopDocs topDocs = isearcher.search(query, 5);
System.out.println("Hits: " + topDocs.totalHits);
//Print the results; iterate over scoreDocs rather than totalHits,
//since totalHits may exceed the number of returned documents
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
for (ScoreDoc scoreDoc : scoreDocs) {
Document targetDoc = isearcher.doc(scoreDoc.doc);
System.out.println("Document: " + targetDoc.toString());
}
} catch (ParseException | IOException e) {
e.printStackTrace();
} finally {
if (ireader != null) {
try {
ireader.close();
} catch (IOException e) {
e.printStackTrace();
}
}
if (directory != null) {
try {
directory.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
}

View File

@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
<comment>IK Analyzer extension configuration</comment>
<!-- Users can configure their own extension dictionaries here -->
<entry key="ext_dict">ext.dic;</entry>
<!-- Users can configure their own extension stop-word dictionaries here -->
<entry key="ext_stopwords">stopword.dic;</entry>
</properties>
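A note on the entry values above: the trailing semicolons matter, as the stock DefaultConfig splits each value on ';' (an assumption about DefaultConfig, which is not shown in this hunk), so several dictionaries can be chained; the second file here is illustrative:

<entry key="ext_dict">ext.dic;mydict.dic;</entry>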

View File

@ -0,0 +1,4 @@
诛仙
诛仙2
梦幻诛仙
梦幻诛仙2

File diff suppressed because it is too large

View File

@ -0,0 +1,316 @@
世纪
位数
像素
克拉
公亩
公克
公分
公升
公尺
公担
公斤
公里
公顷
分钟
分米
加仑
千克
千米
厘米
周年
小时
平方
平方公尺
平方公里
平方分米
平方厘米
平方码
平方米
平方英寸
平方英尺
平方英里
平米
年代
年级
月份
毫升
毫米
毫克
海里
点钟
盎司
秒钟
立方公尺
立方分米
立方厘米
立方码
立方米
立方英寸
立方英尺
英亩
英寸
英尺
英里
阶段

File diff suppressed because it is too large