更新Lucene版本为8.3.1;
This commit is contained in:
parent
4dd4a86b4d
commit
8b04070253
12
README.md
12
README.md
@ -22,15 +22,16 @@ ik-analyzer for solr 7.x-8.x
|
||||
| word | 64.2万 | 2014年 |
|
||||
| jieba | 58.4万 | 2012年 |
|
||||
| jcesg | 16.6万 | 2018年 |
|
||||
| sougou词库 | 115.2万 | 2019年 |
|
||||
| sougou词库 | 115.2万 | 2020年 |
|
||||
#### 将以上词库进行整理后约187.1万条词汇;
|
||||
#### 添加动态加载词典表功能,在不需要重启solr服务的情况下加载新增的词典。
|
||||
> <small>关闭默认主词典请在`IKAnalyzer.cfg.xml`配置文件中设置`use_main_dict`为`false`。</small>
|
||||
* IKAnalyzer的原作者为林良益<linliangyi2007@gmail.com>,项目网站为<http://code.google.com/p/ik-analyzer>
|
||||
* 该项目动态加载功能根据博主[@星火燎原智勇](http://www.cnblogs.com/liang1101/articles/6395016.html)的博客进行修改,其GITHUB地址为[@liang68](https://github.com/liang68)
|
||||
|
||||
|
||||
## 使用说明
|
||||
* jar包下载地址:[ik-analyzer-8.3.0.jar](https://search.maven.org/remotecontent?filepath=com/github/magese/ik-analyzer/8.3.0/ik-analyzer-8.3.0.jar)
|
||||
* jar包下载地址:[ik-analyzer-8.3.1.jar](https://search.maven.org/remotecontent?filepath=com/github/magese/ik-analyzer/8.3.1/ik-analyzer-8.3.1.jar)
|
||||
* 历史版本:[Maven Central](https://search.maven.org/search?q=g:com.github.magese%20AND%20a:ik-analyzer&core=gav)
|
||||
|
||||
```console
|
||||
@ -38,7 +39,7 @@ ik-analyzer for solr 7.x-8.x
|
||||
<dependency>
|
||||
<groupId>com.github.magese</groupId>
|
||||
<artifactId>ik-analyzer</artifactId>
|
||||
<version>8.3.0</version>
|
||||
<version>8.3.1</version>
|
||||
</dependency>
|
||||
```
|
||||
|
||||
@ -79,7 +80,7 @@ ik-analyzer for solr 7.x-8.x
|
||||
5. `IKAnalyzer.cfg.xml`配置文件说明:
|
||||
|
||||
| 名称 | 类型 | 描述 | 默认 |
|
||||
| :------: | :------: | :------: | :------: |
|
||||
| ------ | ------ | ------ | ------ |
|
||||
| use_main_dict | boolean | 是否使用默认主词典 | true |
|
||||
| ext_dict | String | 扩展词典文件名称,多个用分号隔开 | ext.dic; |
|
||||
| ext_stopwords | String | 停用词典文件名称,多个用分号隔开 | stopword.dic; |
|
||||
@ -100,6 +101,9 @@ ik-analyzer for solr 7.x-8.x
|
||||
|
||||
|
||||
## 更新说明
|
||||
- `2020-12-30:`
|
||||
- 升级lucene版本为`8.3.1`
|
||||
- 更新词库
|
||||
- `2019-11-12:`
|
||||
- 升级lucene版本为`8.3.0`
|
||||
- `IKAnalyzer.cfg.xml`增加配置项`use_main_dict`,用于配置是否启用默认主词典
|
||||
|
5
pom.xml
5
pom.xml
@ -4,7 +4,7 @@
|
||||
|
||||
<groupId>com.github.magese</groupId>
|
||||
<artifactId>ik-analyzer</artifactId>
|
||||
<version>8.3.0</version>
|
||||
<version>8.3.1</version>
|
||||
<packaging>jar</packaging>
|
||||
|
||||
<name>ik-analyzer-solr</name>
|
||||
@ -13,7 +13,7 @@
|
||||
|
||||
<properties>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<lucene.version>8.3.0</lucene.version>
|
||||
<lucene.version>8.3.1</lucene.version>
|
||||
<javac.src.version>1.8</javac.src.version>
|
||||
<javac.target.version>1.8</javac.target.version>
|
||||
<maven.compiler.plugin.version>3.3</maven.compiler.plugin.version>
|
||||
@ -152,4 +152,3 @@
|
||||
</profile>
|
||||
</profiles>
|
||||
</project>
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.3.0
|
||||
* IK Analyzer release 8.3.0
|
||||
* IK 中文分词 版本 8.3.1
|
||||
* IK Analyzer release 8.3.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.3.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.0 update by Magese(magese@live.cn)
|
||||
* 8.3.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.cfg;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.3.0
|
||||
* IK Analyzer release 8.3.0
|
||||
* IK 中文分词 版本 8.3.1
|
||||
* IK Analyzer release 8.3.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.3.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.0 update by Magese(magese@live.cn)
|
||||
* 8.3.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.cfg;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.3.0
|
||||
* IK Analyzer release 8.3.0
|
||||
* IK 中文分词 版本 8.3.1
|
||||
* IK Analyzer release 8.3.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,23 +21,19 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.3.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.0 update by Magese(magese@live.cn)
|
||||
* 8.3.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.core;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.wltea.analyzer.cfg.Configuration;
|
||||
import org.wltea.analyzer.dic.Dictionary;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* 分词器上下文状态
|
||||
*/
|
||||
@ -66,17 +62,17 @@ class AnalyzeContext {
|
||||
|
||||
//子分词器锁
|
||||
//该集合非空,说明有子分词器在占用segmentBuff
|
||||
private Set<String> buffLocker;
|
||||
private final Set<String> buffLocker;
|
||||
|
||||
//原始分词结果集合,未经歧义处理
|
||||
private QuickSortSet orgLexemes;
|
||||
//LexemePath位置索引表
|
||||
private Map<Integer, LexemePath> pathMap;
|
||||
private final Map<Integer, LexemePath> pathMap;
|
||||
//最终分词结果集
|
||||
private LinkedList<Lexeme> results;
|
||||
private final LinkedList<Lexeme> results;
|
||||
|
||||
//分词器配置项
|
||||
private Configuration cfg;
|
||||
private final Configuration cfg;
|
||||
|
||||
AnalyzeContext(Configuration cfg) {
|
||||
this.cfg = cfg;
|
||||
@ -254,7 +250,7 @@ class AnalyzeContext {
|
||||
*/
|
||||
void outputToResult() {
|
||||
int index = 0;
|
||||
for (; index <= this.cursor; ) {
|
||||
while (index <= this.cursor) {
|
||||
//跳过非CJK字符
|
||||
if (CharacterUtil.CHAR_USELESS == this.charTypes[index]) {
|
||||
index++;
|
||||
@ -353,12 +349,14 @@ class AnalyzeContext {
|
||||
if (Lexeme.TYPE_ARABIC == result.getLexemeType()) {
|
||||
Lexeme nextLexeme = this.results.peekFirst();
|
||||
boolean appendOk = false;
|
||||
if (Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()) {
|
||||
//合并英文数词+中文数词
|
||||
appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM);
|
||||
} else if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
|
||||
//合并英文数词+中文量词
|
||||
appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
|
||||
if (nextLexeme != null) {
|
||||
if (Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()) {
|
||||
//合并英文数词+中文数词
|
||||
appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM);
|
||||
} else if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
|
||||
//合并英文数词+中文量词
|
||||
appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
|
||||
}
|
||||
}
|
||||
if (appendOk) {
|
||||
//弹出
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.3.0
|
||||
* IK Analyzer release 8.3.0
|
||||
* IK 中文分词 版本 8.3.1
|
||||
* IK Analyzer release 8.3.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.3.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.0 update by Magese(magese@live.cn)
|
||||
* 8.3.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.core;
|
||||
@ -38,13 +38,13 @@ import org.wltea.analyzer.dic.Hit;
|
||||
* 中文-日韩文子分词器
|
||||
*/
|
||||
class CJKSegmenter implements ISegmenter {
|
||||
|
||||
|
||||
//子分词器标签
|
||||
private static final String SEGMENTER_NAME = "CJK_SEGMENTER";
|
||||
//待处理的分词hit队列
|
||||
private List<Hit> tmpHits;
|
||||
|
||||
|
||||
|
||||
|
||||
CJKSegmenter(){
|
||||
this.tmpHits = new LinkedList<>();
|
||||
}
|
||||
@ -54,7 +54,7 @@ class CJKSegmenter implements ISegmenter {
|
||||
*/
|
||||
public void analyze(AnalyzeContext context) {
|
||||
if(CharacterUtil.CHAR_USELESS != context.getCurrentCharType()){
|
||||
|
||||
|
||||
//优先处理tmpHits中的hit
|
||||
if(!this.tmpHits.isEmpty()){
|
||||
//处理词段队列
|
||||
@ -65,18 +65,18 @@ class CJKSegmenter implements ISegmenter {
|
||||
//输出当前的词
|
||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD);
|
||||
context.addLexeme(newLexeme);
|
||||
|
||||
|
||||
if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除
|
||||
this.tmpHits.remove(hit);
|
||||
}
|
||||
|
||||
|
||||
}else if(hit.isUnmatch()){
|
||||
//hit不是词,移除
|
||||
this.tmpHits.remove(hit);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//*********************************
|
||||
//再对当前指针位置的字符进行单字匹配
|
||||
Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
|
||||
@ -94,24 +94,24 @@ class CJKSegmenter implements ISegmenter {
|
||||
//前缀匹配则放入hit列表
|
||||
this.tmpHits.add(singleCharHit);
|
||||
}
|
||||
|
||||
|
||||
|
||||
}else{
|
||||
//遇到CHAR_USELESS字符
|
||||
//清空队列
|
||||
this.tmpHits.clear();
|
||||
}
|
||||
|
||||
|
||||
//判断缓冲区是否已经读完
|
||||
if(context.isBufferConsumed()){
|
||||
//清空队列
|
||||
this.tmpHits.clear();
|
||||
}
|
||||
|
||||
|
||||
//判断是否锁定缓冲区
|
||||
if(this.tmpHits.size() == 0){
|
||||
context.unlockBuffer(SEGMENTER_NAME);
|
||||
|
||||
|
||||
}else{
|
||||
context.lockBuffer(SEGMENTER_NAME);
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.3.0
|
||||
* IK Analyzer release 8.3.0
|
||||
* IK 中文分词 版本 8.3.1
|
||||
* IK Analyzer release 8.3.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.3.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.0 update by Magese(magese@live.cn)
|
||||
* 8.3.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.core;
|
||||
@ -36,11 +36,11 @@ import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
*
|
||||
*
|
||||
* 中文数量词子分词器
|
||||
*/
|
||||
class CN_QuantifierSegmenter implements ISegmenter{
|
||||
|
||||
|
||||
//子分词器标签
|
||||
private static final String SEGMENTER_NAME = "QUAN_SEGMENTER";
|
||||
|
||||
@ -54,7 +54,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
|
||||
ChnNumberChars.add(nChar);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* 词元的开始位置,
|
||||
* 同时作为子分词器状态标识
|
||||
@ -69,14 +69,14 @@ class CN_QuantifierSegmenter implements ISegmenter{
|
||||
|
||||
//待处理的量词hit队列
|
||||
private List<Hit> countHits;
|
||||
|
||||
|
||||
|
||||
|
||||
CN_QuantifierSegmenter(){
|
||||
nStart = -1;
|
||||
nEnd = -1;
|
||||
this.countHits = new LinkedList<>();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 分词
|
||||
*/
|
||||
@ -85,7 +85,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
|
||||
this.processCNumber(context);
|
||||
//处理中文量词
|
||||
this.processCount(context);
|
||||
|
||||
|
||||
//判断是否锁定缓冲区
|
||||
if(this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()){
|
||||
//对缓冲区解锁
|
||||
@ -94,7 +94,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
|
||||
context.lockBuffer(SEGMENTER_NAME);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* 重置子分词器状态
|
||||
@ -104,20 +104,20 @@ class CN_QuantifierSegmenter implements ISegmenter{
|
||||
nEnd = -1;
|
||||
countHits.clear();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 处理数词
|
||||
*/
|
||||
private void processCNumber(AnalyzeContext context){
|
||||
if(nStart == -1 && nEnd == -1){//初始状态
|
||||
if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
|
||||
if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
|
||||
&& ChnNumberChars.contains(context.getCurrentChar())){
|
||||
//记录数词的起始、结束位置
|
||||
nStart = context.getCursor();
|
||||
nEnd = context.getCursor();
|
||||
}
|
||||
}else{//正在处理状态
|
||||
if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
|
||||
if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
|
||||
&& ChnNumberChars.contains(context.getCurrentChar())){
|
||||
//记录数词的结束位置
|
||||
nEnd = context.getCursor();
|
||||
@ -129,7 +129,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
|
||||
nEnd = -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//缓冲区已经用完,还有尚未输出的数词
|
||||
if(context.isBufferConsumed()){
|
||||
if(nStart != -1 && nEnd != -1){
|
||||
@ -139,9 +139,9 @@ class CN_QuantifierSegmenter implements ISegmenter{
|
||||
nStart = -1;
|
||||
nEnd = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 处理中文量词
|
||||
* @param context 需要处理的内容
|
||||
@ -151,9 +151,9 @@ class CN_QuantifierSegmenter implements ISegmenter{
|
||||
if(!this.needCountScan(context)){
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()){
|
||||
|
||||
|
||||
//优先处理countHits中的hit
|
||||
if(!this.countHits.isEmpty()){
|
||||
//处理词段队列
|
||||
@ -164,17 +164,17 @@ class CN_QuantifierSegmenter implements ISegmenter{
|
||||
//输出当前的词
|
||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT);
|
||||
context.addLexeme(newLexeme);
|
||||
|
||||
|
||||
if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除
|
||||
this.countHits.remove(hit);
|
||||
}
|
||||
|
||||
|
||||
}else if(hit.isUnmatch()){
|
||||
//hit不是词,移除
|
||||
this.countHits.remove(hit);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//*********************************
|
||||
//对当前指针位置的字符进行单字匹配
|
||||
@ -193,21 +193,21 @@ class CN_QuantifierSegmenter implements ISegmenter{
|
||||
//前缀匹配则放入hit列表
|
||||
this.countHits.add(singleCharHit);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}else{
|
||||
//输入的不是中文字符
|
||||
//清空未成形的量词
|
||||
this.countHits.clear();
|
||||
}
|
||||
|
||||
|
||||
//缓冲区数据已经读完,还有尚未输出的量词
|
||||
if(context.isBufferConsumed()){
|
||||
//清空未成形的量词
|
||||
this.countHits.clear();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 判断是否需要扫描量词
|
||||
*/
|
||||
@ -226,7 +226,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 添加数词词元到结果集
|
||||
* @param context 需要添加的词元
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.3.0
|
||||
* IK Analyzer release 8.3.0
|
||||
* IK 中文分词 版本 8.3.1
|
||||
* IK Analyzer release 8.3.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.3.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.0 update by Magese(magese@live.cn)
|
||||
* 8.3.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.core;
|
||||
@ -32,18 +32,18 @@ package org.wltea.analyzer.core;
|
||||
* 字符集识别工具类
|
||||
*/
|
||||
class CharacterUtil {
|
||||
|
||||
|
||||
static final int CHAR_USELESS = 0;
|
||||
|
||||
|
||||
static final int CHAR_ARABIC = 0X00000001;
|
||||
|
||||
|
||||
static final int CHAR_ENGLISH = 0X00000002;
|
||||
|
||||
|
||||
static final int CHAR_CHINESE = 0X00000004;
|
||||
|
||||
|
||||
static final int CHAR_OTHER_CJK = 0X00000008;
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* 识别字符类型
|
||||
* @param input 需要识别的字符
|
||||
@ -52,23 +52,23 @@ class CharacterUtil {
|
||||
static int identifyCharType(char input){
|
||||
if(input >= '0' && input <= '9'){
|
||||
return CHAR_ARABIC;
|
||||
|
||||
|
||||
}else if((input >= 'a' && input <= 'z')
|
||||
|| (input >= 'A' && input <= 'Z')){
|
||||
return CHAR_ENGLISH;
|
||||
|
||||
|
||||
}else {
|
||||
Character.UnicodeBlock ub = Character.UnicodeBlock.of(input);
|
||||
|
||||
if(ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
|
||||
|| ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
|
||||
|
||||
if(ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
|
||||
|| ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
|
||||
|| ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A){
|
||||
//目前已知的中文字符UTF-8集合
|
||||
return CHAR_CHINESE;
|
||||
|
||||
|
||||
}else if(ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS //全角数字字符和日韩字符
|
||||
//韩文字符集
|
||||
|| ub == Character.UnicodeBlock.HANGUL_SYLLABLES
|
||||
|| ub == Character.UnicodeBlock.HANGUL_SYLLABLES
|
||||
|| ub == Character.UnicodeBlock.HANGUL_JAMO
|
||||
|| ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO
|
||||
//日文字符集
|
||||
@ -76,13 +76,13 @@ class CharacterUtil {
|
||||
|| ub == Character.UnicodeBlock.KATAKANA //片假名
|
||||
|| ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS){
|
||||
return CHAR_OTHER_CJK;
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
//其他的不做处理的字符
|
||||
return CHAR_USELESS;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 进行字符规格化(全角转半角,大写转小写处理)
|
||||
* @param input 需要转换的字符
|
||||
@ -91,14 +91,14 @@ class CharacterUtil {
|
||||
static char regularize(char input){
|
||||
if (input == 12288) {
|
||||
input = (char) 32;
|
||||
|
||||
|
||||
}else if (input > 65280 && input < 65375) {
|
||||
input = (char) (input - 65248);
|
||||
|
||||
|
||||
}else if (input >= 'A' && input <= 'Z') {
|
||||
input += 32;
|
||||
}
|
||||
|
||||
|
||||
return input;
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.3.0
|
||||
* IK Analyzer release 8.3.0
|
||||
* IK 中文分词 版本 8.3.1
|
||||
* IK Analyzer release 8.3.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.3.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.0 update by Magese(magese@live.cn)
|
||||
* 8.3.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.core;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.3.0
|
||||
* IK Analyzer release 8.3.0
|
||||
* IK 中文分词 版本 8.3.1
|
||||
* IK Analyzer release 8.3.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.3.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.0 update by Magese(magese@live.cn)
|
||||
* 8.3.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.core;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.3.0
|
||||
* IK Analyzer release 8.3.0
|
||||
* IK 中文分词 版本 8.3.1
|
||||
* IK Analyzer release 8.3.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,26 +21,26 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.3.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.0 update by Magese(magese@live.cn)
|
||||
* 8.3.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.core;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
*
|
||||
* 子分词器接口
|
||||
*/
|
||||
interface ISegmenter {
|
||||
|
||||
|
||||
/**
|
||||
* 从分析器读取下一个可能分解的词元对象
|
||||
* @param context 分词算法上下文
|
||||
*/
|
||||
void analyze(AnalyzeContext context);
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* 重置子分析器状态
|
||||
*/
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.3.0
|
||||
* IK Analyzer release 8.3.0
|
||||
* IK 中文分词 版本 8.3.1
|
||||
* IK Analyzer release 8.3.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.3.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.0 update by Magese(magese@live.cn)
|
||||
* 8.3.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.core;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.3.0
|
||||
* IK Analyzer release 8.3.0
|
||||
* IK 中文分词 版本 8.3.1
|
||||
* IK Analyzer release 8.3.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,14 +21,14 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.3.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.0 update by Magese(magese@live.cn)
|
||||
* 8.3.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.core;
|
||||
|
||||
/**
|
||||
* IK词元对象
|
||||
* IK词元对象
|
||||
*/
|
||||
@SuppressWarnings("unused")
|
||||
public class Lexeme implements Comparable<Lexeme>{
|
||||
@ -50,7 +50,7 @@ public class Lexeme implements Comparable<Lexeme>{
|
||||
static final int TYPE_COUNT = 32;
|
||||
//中文数量词
|
||||
static final int TYPE_CQUAN = 48;
|
||||
|
||||
|
||||
//词元的起始位移
|
||||
private int offset;
|
||||
//词元的相对起始位置
|
||||
@ -61,8 +61,8 @@ public class Lexeme implements Comparable<Lexeme>{
|
||||
private String lexemeText;
|
||||
//词元类型
|
||||
private int lexemeType;
|
||||
|
||||
|
||||
|
||||
|
||||
public Lexeme(int offset , int begin , int length , int lexemeType){
|
||||
this.offset = offset;
|
||||
this.begin = begin;
|
||||
@ -72,7 +72,7 @@ public class Lexeme implements Comparable<Lexeme>{
|
||||
this.length = length;
|
||||
this.lexemeType = lexemeType;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* 判断词元相等算法
|
||||
* 起始位置偏移、起始位置、终止位置相同
|
||||
@ -82,21 +82,21 @@ public class Lexeme implements Comparable<Lexeme>{
|
||||
if(o == null){
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
if(this == o){
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
if(o instanceof Lexeme){
|
||||
Lexeme other = (Lexeme)o;
|
||||
return this.offset == other.getOffset()
|
||||
&& this.begin == other.getBegin()
|
||||
&& this.length == other.getLength();
|
||||
}else{
|
||||
}else{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* 词元哈希编码算法
|
||||
* @see java.lang.Object#hashCode()
|
||||
@ -106,7 +106,7 @@ public class Lexeme implements Comparable<Lexeme>{
|
||||
int absEnd = getEndPosition();
|
||||
return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* 词元在排序集合中的比较算法
|
||||
* @see java.lang.Comparable#compareTo(java.lang.Object)
|
||||
@ -119,12 +119,12 @@ public class Lexeme implements Comparable<Lexeme>{
|
||||
//词元长度优先
|
||||
//this.length < other.getLength()
|
||||
return Integer.compare(other.getLength(), this.length);
|
||||
|
||||
|
||||
}else{//this.begin > other.getBegin()
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private int getOffset() {
|
||||
return offset;
|
||||
}
|
||||
@ -155,22 +155,22 @@ public class Lexeme implements Comparable<Lexeme>{
|
||||
public int getEndPosition(){
|
||||
return offset + begin + length;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 获取词元的字符长度
|
||||
* @return int
|
||||
*/
|
||||
public int getLength(){
|
||||
return this.length;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public void setLength(int length) {
|
||||
if(this.length < 0){
|
||||
throw new IllegalArgumentException("length < 0");
|
||||
}
|
||||
this.length = length;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 获取词元的文本内容
|
||||
* @return String
|
||||
@ -199,7 +199,7 @@ public class Lexeme implements Comparable<Lexeme>{
|
||||
int getLexemeType() {
|
||||
return lexemeType;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 获取词元类型标示字符串
|
||||
* @return String
|
||||
@ -209,41 +209,41 @@ public class Lexeme implements Comparable<Lexeme>{
|
||||
|
||||
case TYPE_ENGLISH :
|
||||
return "ENGLISH";
|
||||
|
||||
|
||||
case TYPE_ARABIC :
|
||||
return "ARABIC";
|
||||
|
||||
|
||||
case TYPE_LETTER :
|
||||
return "LETTER";
|
||||
|
||||
case TYPE_CNWORD :
|
||||
|
||||
case TYPE_CNWORD :
|
||||
return "CN_WORD";
|
||||
|
||||
case TYPE_CNCHAR :
|
||||
|
||||
case TYPE_CNCHAR :
|
||||
return "CN_CHAR";
|
||||
|
||||
|
||||
case TYPE_OTHER_CJK :
|
||||
return "OTHER_CJK";
|
||||
|
||||
|
||||
case TYPE_COUNT :
|
||||
return "COUNT";
|
||||
|
||||
|
||||
case TYPE_CNUM :
|
||||
return "TYPE_CNUM";
|
||||
|
||||
case TYPE_CQUAN:
|
||||
|
||||
case TYPE_CQUAN:
|
||||
return "TYPE_CQUAN";
|
||||
|
||||
|
||||
default :
|
||||
return "UNKONW";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
public void setLexemeType(int lexemeType) {
|
||||
this.lexemeType = lexemeType;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 合并两个相邻的词元
|
||||
* @return boolean 词元是否成功合并
|
||||
@ -257,16 +257,16 @@ public class Lexeme implements Comparable<Lexeme>{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
*
|
||||
*/
|
||||
public String toString(){
|
||||
return this.getBeginPosition() + "-" + this.getEndPosition() +
|
||||
" : " + this.lexemeText + " : \t" +
|
||||
this.getLexemeTypeString();
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.3.0
|
||||
* IK Analyzer release 8.3.0
|
||||
* IK 中文分词 版本 8.3.1
|
||||
* IK Analyzer release 8.3.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.3.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.0 update by Magese(magese@live.cn)
|
||||
* 8.3.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.core;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.3.0
|
||||
* IK Analyzer release 8.2.0
|
||||
* IK 中文分词 版本 8.3.1
|
||||
* IK Analyzer release 8.3.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -31,178 +31,182 @@ package org.wltea.analyzer.core;
|
||||
* IK分词器专用的Lexeme快速排序集合
|
||||
*/
|
||||
class QuickSortSet {
|
||||
//链表头
|
||||
private Cell head;
|
||||
//链表尾
|
||||
private Cell tail;
|
||||
//链表的实际大小
|
||||
private int size;
|
||||
|
||||
QuickSortSet(){
|
||||
this.size = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* 向链表集合添加词元
|
||||
*/
|
||||
void addLexeme(Lexeme lexeme){
|
||||
Cell newCell = new Cell(lexeme);
|
||||
if(this.size == 0){
|
||||
this.head = newCell;
|
||||
this.tail = newCell;
|
||||
this.size++;
|
||||
//链表头
|
||||
private Cell head;
|
||||
//链表尾
|
||||
private Cell tail;
|
||||
//链表的实际大小
|
||||
private int size;
|
||||
|
||||
}else{
|
||||
QuickSortSet() {
|
||||
this.size = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* 向链表集合添加词元
|
||||
*/
|
||||
void addLexeme(Lexeme lexeme) {
|
||||
Cell newCell = new Cell(lexeme);
|
||||
if (this.size == 0) {
|
||||
this.head = newCell;
|
||||
this.tail = newCell;
|
||||
this.size++;
|
||||
|
||||
} else {
|
||||
/*if(this.tail.compareTo(newCell) == 0){//词元与尾部词元相同,不放入集合
|
||||
|
||||
}else */if(this.tail.compareTo(newCell) < 0){//词元接入链表尾部
|
||||
this.tail.next = newCell;
|
||||
newCell.prev = this.tail;
|
||||
this.tail = newCell;
|
||||
this.size++;
|
||||
}else */
|
||||
if (this.tail.compareTo(newCell) < 0) {//词元接入链表尾部
|
||||
this.tail.next = newCell;
|
||||
newCell.prev = this.tail;
|
||||
this.tail = newCell;
|
||||
this.size++;
|
||||
|
||||
}else if(this.head.compareTo(newCell) > 0){//词元接入链表头部
|
||||
this.head.prev = newCell;
|
||||
newCell.next = this.head;
|
||||
this.head = newCell;
|
||||
this.size++;
|
||||
} else if (this.head.compareTo(newCell) > 0) {//词元接入链表头部
|
||||
this.head.prev = newCell;
|
||||
newCell.next = this.head;
|
||||
this.head = newCell;
|
||||
this.size++;
|
||||
|
||||
}else{
|
||||
//从尾部上逆
|
||||
Cell index = this.tail;
|
||||
while(index != null && index.compareTo(newCell) > 0){
|
||||
index = index.prev;
|
||||
}
|
||||
} else {
|
||||
//从尾部上逆
|
||||
Cell index = this.tail;
|
||||
while (index != null && index.compareTo(newCell) > 0) {
|
||||
index = index.prev;
|
||||
}
|
||||
/*if(index.compareTo(newCell) == 0){//词元与集合中的词元重复,不放入集合
|
||||
|
||||
}else */if((index != null ? index.compareTo(newCell) : 1) < 0){//词元插入链表中的某个位置
|
||||
newCell.prev = index;
|
||||
newCell.next = index.next;
|
||||
index.next.prev = newCell;
|
||||
index.next = newCell;
|
||||
this.size++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 返回链表头部元素
|
||||
*/
|
||||
Lexeme peekFirst(){
|
||||
if(this.head != null){
|
||||
return this.head.lexeme;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* 取出链表集合的第一个元素
|
||||
* @return Lexeme
|
||||
*/
|
||||
Lexeme pollFirst(){
|
||||
if(this.size == 1){
|
||||
Lexeme first = this.head.lexeme;
|
||||
this.head = null;
|
||||
this.tail = null;
|
||||
this.size--;
|
||||
return first;
|
||||
}else if(this.size > 1){
|
||||
Lexeme first = this.head.lexeme;
|
||||
this.head = this.head.next;
|
||||
this.size --;
|
||||
return first;
|
||||
}else{
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 返回链表尾部元素
|
||||
*/
|
||||
Lexeme peekLast(){
|
||||
if(this.tail != null){
|
||||
return this.tail.lexeme;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* 取出链表集合的最后一个元素
|
||||
* @return Lexeme
|
||||
*/
|
||||
Lexeme pollLast(){
|
||||
if(this.size == 1){
|
||||
Lexeme last = this.head.lexeme;
|
||||
this.head = null;
|
||||
this.tail = null;
|
||||
this.size--;
|
||||
return last;
|
||||
|
||||
}else if(this.size > 1){
|
||||
Lexeme last = this.tail.lexeme;
|
||||
this.tail = this.tail.prev;
|
||||
this.size--;
|
||||
return last;
|
||||
|
||||
}else{
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 返回集合大小
|
||||
*/
|
||||
int size(){
|
||||
return this.size;
|
||||
}
|
||||
|
||||
/**
|
||||
* 判断集合是否为空
|
||||
*/
|
||||
boolean isEmpty(){
|
||||
return this.size == 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* 返回lexeme链的头部
|
||||
*/
|
||||
Cell getHead(){
|
||||
return this.head;
|
||||
}
|
||||
}else */
|
||||
if ((index != null ? index.compareTo(newCell) : 1) < 0) {//词元插入链表中的某个位置
|
||||
newCell.prev = index;
|
||||
newCell.next = index.next;
|
||||
index.next.prev = newCell;
|
||||
index.next = newCell;
|
||||
this.size++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* IK 中文分词 版本 7.0
|
||||
* IK Analyzer release 7.0
|
||||
* update by Magese(magese@live.cn)
|
||||
*/
|
||||
@SuppressWarnings("unused")
|
||||
class Cell implements Comparable<Cell>{
|
||||
private Cell prev;
|
||||
private Cell next;
|
||||
private Lexeme lexeme;
|
||||
|
||||
/**
 * Creates a cell that wraps the given lexeme.
 *
 * @param lexeme the lexeme stored in this cell
 * @throws IllegalArgumentException if {@code lexeme} is null
 */
Cell(Lexeme lexeme){
|
||||
if(lexeme == null){
|
||||
throw new IllegalArgumentException("lexeme must not be null");
|
||||
}
|
||||
this.lexeme = lexeme;
|
||||
}
|
||||
/**
|
||||
* 返回链表头部元素
|
||||
*/
|
||||
Lexeme peekFirst() {
|
||||
if (this.head != null) {
|
||||
return this.head.lexeme;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public int compareTo(Cell o) {
|
||||
return this.lexeme.compareTo(o.lexeme);
|
||||
}
|
||||
/**
|
||||
* 取出链表集合的第一个元素
|
||||
*
|
||||
* @return Lexeme
|
||||
*/
|
||||
Lexeme pollFirst() {
|
||||
if (this.size == 1) {
|
||||
Lexeme first = this.head.lexeme;
|
||||
this.head = null;
|
||||
this.tail = null;
|
||||
this.size--;
|
||||
return first;
|
||||
} else if (this.size > 1) {
|
||||
Lexeme first = this.head.lexeme;
|
||||
this.head = this.head.next;
|
||||
this.size--;
|
||||
return first;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns the cell stored in this cell's prev link (null when none is set). */
public Cell getPrev(){
|
||||
return this.prev;
|
||||
}
|
||||
|
||||
/** Returns the cell stored in this cell's next link (null when none is set). */
Cell getNext(){
|
||||
return this.next;
|
||||
}
|
||||
|
||||
/** Returns the lexeme wrapped by this cell. */
public Lexeme getLexeme(){
|
||||
return this.lexeme;
|
||||
}
|
||||
}
|
||||
/**
|
||||
* 返回链表尾部元素
|
||||
*/
|
||||
Lexeme peekLast() {
|
||||
if (this.tail != null) {
|
||||
return this.tail.lexeme;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* 取出链表集合的最后一个元素
|
||||
*
|
||||
* @return Lexeme
|
||||
*/
|
||||
Lexeme pollLast() {
|
||||
if (this.size == 1) {
|
||||
Lexeme last = this.head.lexeme;
|
||||
this.head = null;
|
||||
this.tail = null;
|
||||
this.size--;
|
||||
return last;
|
||||
|
||||
} else if (this.size > 1) {
|
||||
Lexeme last = this.tail.lexeme;
|
||||
this.tail = this.tail.prev;
|
||||
this.size--;
|
||||
return last;
|
||||
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the value of the size counter, i.e. the number of cells currently in this set.
|
||||
*/
|
||||
int size() {
|
||||
return this.size;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reports whether the set is empty, i.e. whether the size counter is exactly zero.
|
||||
*/
|
||||
boolean isEmpty() {
|
||||
return this.size == 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the head cell of the lexeme chain (null when no head is set).
|
||||
*/
|
||||
Cell getHead() {
|
||||
return this.head;
|
||||
}
|
||||
|
||||
/*
|
||||
* IK 中文分词 版本 8.3.1
|
||||
* IK Analyzer release 8.3.1
|
||||
* update by Magese(magese@live.cn)
|
||||
*/
|
||||
@SuppressWarnings("unused")
|
||||
static class Cell implements Comparable<Cell> {
|
||||
private Cell prev;
|
||||
private Cell next;
|
||||
private final Lexeme lexeme;
|
||||
|
||||
Cell(Lexeme lexeme) {
|
||||
if (lexeme == null) {
|
||||
throw new IllegalArgumentException("lexeme must not be null");
|
||||
}
|
||||
this.lexeme = lexeme;
|
||||
}
|
||||
|
||||
public int compareTo(Cell o) {
|
||||
return this.lexeme.compareTo(o.lexeme);
|
||||
}
|
||||
|
||||
public Cell getPrev() {
|
||||
return this.prev;
|
||||
}
|
||||
|
||||
Cell getNext() {
|
||||
return this.next;
|
||||
}
|
||||
|
||||
public Lexeme getLexeme() {
|
||||
return this.lexeme;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.3.0
|
||||
* IK Analyzer release 8.3.0
|
||||
* IK 中文分词 版本 8.3.1
|
||||
* IK Analyzer release 8.3.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.3.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.0 update by Magese(magese@live.cn)
|
||||
* 8.3.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.dic;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.3.0
|
||||
* IK Analyzer release 8.3.0
|
||||
* IK 中文分词 版本 8.3.1
|
||||
* IK Analyzer release 8.3.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.3.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.0 update by Magese(magese@live.cn)
|
||||
* 8.3.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.dic;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.3.0
|
||||
* IK Analyzer release 8.3.0
|
||||
* IK 中文分词 版本 8.3.1
|
||||
* IK Analyzer release 8.3.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.3.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.0 update by Magese(magese@live.cn)
|
||||
* 8.3.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.dic;
|
||||
@ -38,13 +38,13 @@ public class Hit {
|
||||
private static final int MATCH = 0x00000001;
|
||||
//Hit前缀匹配
|
||||
private static final int PREFIX = 0x00000010;
|
||||
|
||||
|
||||
|
||||
|
||||
//该HIT当前状态,默认未匹配
|
||||
private int hitState = UNMATCH;
|
||||
|
||||
|
||||
//记录词典匹配过程中,当前匹配到的词典分支节点
|
||||
private DictSegment matchedDictSegment;
|
||||
private DictSegment matchedDictSegment;
|
||||
/*
|
||||
* 词段开始位置
|
||||
*/
|
||||
@ -53,8 +53,8 @@ public class Hit {
|
||||
* 词段的结束位置
|
||||
*/
|
||||
private int end;
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* 判断是否完全匹配
|
||||
*/
|
||||
@ -62,7 +62,7 @@ public class Hit {
|
||||
return (this.hitState & MATCH) > 0;
|
||||
}
|
||||
/**
|
||||
*
|
||||
*
|
||||
*/
|
||||
void setMatch() {
|
||||
this.hitState = this.hitState | MATCH;
|
||||
@ -75,7 +75,7 @@ public class Hit {
|
||||
return (this.hitState & PREFIX) > 0;
|
||||
}
|
||||
/**
|
||||
*
|
||||
*
|
||||
*/
|
||||
void setPrefix() {
|
||||
this.hitState = this.hitState | PREFIX;
|
||||
@ -87,34 +87,34 @@ public class Hit {
|
||||
return this.hitState == UNMATCH ;
|
||||
}
|
||||
/**
|
||||
* Resets this hit's state to UNMATCH, overwriting any MATCH or PREFIX flag set earlier.
|
||||
*
|
||||
*/
|
||||
void setUnmatch() {
|
||||
this.hitState = UNMATCH;
|
||||
}
|
||||
|
||||
|
||||
/** Returns the dictionary branch node recorded for this hit during dictionary matching. */
DictSegment getMatchedDictSegment() {
|
||||
return matchedDictSegment;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Records the dictionary branch node currently matched for this hit.
 *
 * @param matchedDictSegment the node to store
 */
void setMatchedDictSegment(DictSegment matchedDictSegment) {
|
||||
this.matchedDictSegment = matchedDictSegment;
|
||||
}
|
||||
|
||||
|
||||
/** Returns the start position of the word segment. */
public int getBegin() {
|
||||
return begin;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Sets the start position of the word segment.
 *
 * @param begin the start position to store
 */
void setBegin(int begin) {
|
||||
this.begin = begin;
|
||||
}
|
||||
|
||||
|
||||
/** Returns the end position of the word segment. */
public int getEnd() {
|
||||
return end;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Sets the end position of the word segment.
 *
 * @param end the end position to store
 */
void setEnd(int end) {
|
||||
this.end = end;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.3.0
|
||||
* IK Analyzer release 8.3.0
|
||||
* IK 中文分词 版本 8.3.1
|
||||
* IK Analyzer release 8.3.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.3.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.0 update by Magese(magese@live.cn)
|
||||
* 8.3.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.lucene;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.3.0
|
||||
* IK Analyzer release 8.3.0
|
||||
* IK 中文分词 版本 8.3.1
|
||||
* IK Analyzer release 8.3.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.3.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.0 update by Magese(magese@live.cn)
|
||||
* 8.3.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.lucene;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.3.0
|
||||
* IK Analyzer release 8.3.0
|
||||
* IK 中文分词 版本 8.3.1
|
||||
* IK Analyzer release 8.3.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.3.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.0 update by Magese(magese@live.cn)
|
||||
* 8.3.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.lucene;
|
||||
@ -74,7 +74,7 @@ public class IKTokenizerFactory extends TokenizerFactory implements ResourceLoad
|
||||
*/
|
||||
@Override
|
||||
public void inform(ResourceLoader resourceLoader) throws IOException {
|
||||
System.out.println(String.format("IKTokenizerFactory " + this.hashCode() + " inform conf: %s", getConf()));
|
||||
System.out.printf("IKTokenizerFactory " + this.hashCode() + " inform conf: %s%n", getConf());
|
||||
this.loader = resourceLoader;
|
||||
update();
|
||||
if ((getConf() != null) && (!getConf().trim().isEmpty())) {
|
||||
@ -174,4 +174,4 @@ public class IKTokenizerFactory extends TokenizerFactory implements ResourceLoad
|
||||
/**
 * Stores the given value in the {@code conf} field.
 *
 * @param conf the configuration value to keep
 */
private void setConf(String conf) {
|
||||
this.conf = conf;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.3.0
|
||||
* IK Analyzer release 8.3.0
|
||||
* IK 中文分词 版本 8.3.1
|
||||
* IK Analyzer release 8.3.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.3.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.0 update by Magese(magese@live.cn)
|
||||
* 8.3.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.lucene;
|
||||
@ -35,7 +35,7 @@ import java.util.Vector;
|
||||
*/
|
||||
public class UpdateThread implements Runnable {
|
||||
private static final long INTERVAL = 30000L; // 循环等待时间
|
||||
private Vector<UpdateJob> filterFactorys; // 更新任务集合
|
||||
private final Vector<UpdateJob> filterFactorys; // 更新任务集合
|
||||
|
||||
/**
|
||||
* 私有化构造器,阻止外部进行实例化
|
||||
@ -51,7 +51,7 @@ public class UpdateThread implements Runnable {
|
||||
* 静态内部类,实现线程安全单例模式
|
||||
*/
|
||||
private static class Builder {
|
||||
private static UpdateThread singleton = new UpdateThread();
|
||||
private static final UpdateThread singleton = new UpdateThread();
|
||||
}
|
||||
|
||||
/**
|
||||
@ -81,6 +81,7 @@ public class UpdateThread implements Runnable {
|
||||
//noinspection InfiniteLoopStatement
|
||||
while (true) {
|
||||
try {
|
||||
//noinspection BusyWait
|
||||
Thread.sleep(INTERVAL);
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.3.0
|
||||
* IK Analyzer release 8.3.0
|
||||
* IK 中文分词 版本 8.3.1
|
||||
* IK Analyzer release 8.3.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.3.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.0 update by Magese(magese@live.cn)
|
||||
* 8.3.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.query;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.3.0
|
||||
* IK Analyzer release 8.3.0
|
||||
* IK 中文分词 版本 8.3.1
|
||||
* IK Analyzer release 8.3.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.3.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.0 update by Magese(magese@live.cn)
|
||||
* 8.3.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.query;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.3.0
|
||||
* IK Analyzer release 8.3.0
|
||||
* IK 中文分词 版本 8.3.1
|
||||
* IK Analyzer release 8.3.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.3.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.0 update by Magese(magese@live.cn)
|
||||
* 8.3.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.sample;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.3.0
|
||||
* IK Analyzer release 8.3.0
|
||||
* IK 中文分词 版本 8.3.1
|
||||
* IK Analyzer release 8.3.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.3.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.0 update by Magese(magese@live.cn)
|
||||
* 8.3.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.3.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.sample;
|
||||
@ -76,6 +76,7 @@ public class LuceneIndexAndSearchDemo {
|
||||
IndexSearcher isearcher;
|
||||
try {
|
||||
//建立内存索引对象
|
||||
//noinspection deprecation
|
||||
directory = new RAMDirectory();
|
||||
|
||||
//配置IndexWriterConfig
|
||||
|
Loading…
x
Reference in New Issue
Block a user