更新Lucene版本为8.3.1;

This commit is contained in:
gaozhicheng 2020-12-30 10:59:19 +08:00
parent 4dd4a86b4d
commit 8b04070253
26 changed files with 407 additions and 400 deletions

View File

@ -22,15 +22,16 @@ ik-analyzer for solr 7.x-8.x
| word | 64.2万 | 2014年 |
| jieba | 58.4万 | 2012年 |
| jcesg | 16.6万 | 2018年 |
| sougou词库 | 115.2万 | 2019年 |
| sougou词库 | 115.2万 | 2020年 |
#### 将以上词库进行整理后约187.1万条词汇;
#### 添加动态加载词典表功能,在不需要重启solr服务的情况下加载新增的词典。
> <small>关闭默认主词典,请在`IKAnalyzer.cfg.xml`配置文件中设置`use_main_dict`为`false`。</small>
* IKAnalyzer的原作者为林良益<linliangyi2007@gmail.com>,项目网站为<http://code.google.com/p/ik-analyzer>
* 该项目动态加载功能根据博主[@星火燎原智勇](http://www.cnblogs.com/liang1101/articles/6395016.html)的博客进行修改,其GITHUB地址为[@liang68](https://github.com/liang68)
## 使用说明
* jar包下载地址[![GitHub version](https://img.shields.io/badge/version-8.3.0-519dd9.svg)](https://search.maven.org/remotecontent?filepath=com/github/magese/ik-analyzer/8.3.0/ik-analyzer-8.3.0.jar)
* jar包下载地址[![GitHub version](https://img.shields.io/badge/version-8.3.1-519dd9.svg)](https://search.maven.org/remotecontent?filepath=com/github/magese/ik-analyzer/8.3.1/ik-analyzer-8.3.1.jar)
* 历史版本:[![GitHub version](https://img.shields.io/maven-central/v/com.github.magese/ik-analyzer.svg?style=flat-square)](https://search.maven.org/search?q=g:com.github.magese%20AND%20a:ik-analyzer&core=gav)
```console
@ -38,7 +39,7 @@ ik-analyzer for solr 7.x-8.x
<dependency>
<groupId>com.github.magese</groupId>
<artifactId>ik-analyzer</artifactId>
<version>8.3.0</version>
<version>8.3.1</version>
</dependency>
```
@ -79,7 +80,7 @@ ik-analyzer for solr 7.x-8.x
5. `IKAnalyzer.cfg.xml`配置文件说明:
| 名称 | 类型 | 描述 | 默认 |
| :------: | :------: | :------: | :------: |
| ------ | ------ | ------ | ------ |
| use_main_dict | boolean | 是否使用默认主词典 | true |
| ext_dict | String | 扩展词典文件名称,多个用分号隔开 | ext.dic; |
| ext_stopwords | String | 停用词典文件名称,多个用分号隔开 | stopword.dic; |
@ -100,6 +101,9 @@ ik-analyzer for solr 7.x-8.x
## 更新说明
- `2020-12-30:`
- 升级lucene版本为`8.3.1`
- 更新词库
- `2019-11-12:`
- 升级lucene版本为`8.3.0`
- `IKAnalyzer.cfg.xml`增加配置项`use_main_dict`,用于配置是否启用默认主词典

View File

@ -4,7 +4,7 @@
<groupId>com.github.magese</groupId>
<artifactId>ik-analyzer</artifactId>
<version>8.3.0</version>
<version>8.3.1</version>
<packaging>jar</packaging>
<name>ik-analyzer-solr</name>
@ -13,7 +13,7 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<lucene.version>8.3.0</lucene.version>
<lucene.version>8.3.1</lucene.version>
<javac.src.version>1.8</javac.src.version>
<javac.target.version>1.8</javac.target.version>
<maven.compiler.plugin.version>3.3</maven.compiler.plugin.version>
@ -152,4 +152,3 @@
</profile>
</profiles>
</project>

View File

@ -1,6 +1,6 @@
/*
* IK 中文分词 版本 8.3.0
* IK Analyzer release 8.3.0
* IK 中文分词 版本 8.3.1
* IK Analyzer release 8.3.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.3.0版本 Magese (magese@live.cn) 更新
* release 8.3.0 update by Magese(magese@live.cn)
* 8.3.1版本 Magese (magese@live.cn) 更新
* release 8.3.1 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.cfg;

View File

@ -1,6 +1,6 @@
/*
* IK 中文分词 版本 8.3.0
* IK Analyzer release 8.3.0
* IK 中文分词 版本 8.3.1
* IK Analyzer release 8.3.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.3.0版本 Magese (magese@live.cn) 更新
* release 8.3.0 update by Magese(magese@live.cn)
* 8.3.1版本 Magese (magese@live.cn) 更新
* release 8.3.1 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.cfg;

View File

@ -1,6 +1,6 @@
/*
* IK 中文分词 版本 8.3.0
* IK Analyzer release 8.3.0
* IK 中文分词 版本 8.3.1
* IK Analyzer release 8.3.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -21,23 +21,19 @@
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.3.0版本 Magese (magese@live.cn) 更新
* release 8.3.0 update by Magese(magese@live.cn)
* 8.3.1版本 Magese (magese@live.cn) 更新
* release 8.3.1 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.core;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.dic.Dictionary;
import java.io.IOException;
import java.io.Reader;
import java.util.*;
/**
* 分词器上下文状态
*/
@ -66,17 +62,17 @@ class AnalyzeContext {
//子分词器锁
//该集合非空说明有子分词器在占用segmentBuff
private Set<String> buffLocker;
private final Set<String> buffLocker;
//原始分词结果集合未经歧义处理
private QuickSortSet orgLexemes;
//LexemePath位置索引表
private Map<Integer, LexemePath> pathMap;
private final Map<Integer, LexemePath> pathMap;
//最终分词结果集
private LinkedList<Lexeme> results;
private final LinkedList<Lexeme> results;
//分词器配置项
private Configuration cfg;
private final Configuration cfg;
AnalyzeContext(Configuration cfg) {
this.cfg = cfg;
@ -254,7 +250,7 @@ class AnalyzeContext {
*/
void outputToResult() {
int index = 0;
for (; index <= this.cursor; ) {
while (index <= this.cursor) {
//跳过非CJK字符
if (CharacterUtil.CHAR_USELESS == this.charTypes[index]) {
index++;
@ -353,12 +349,14 @@ class AnalyzeContext {
if (Lexeme.TYPE_ARABIC == result.getLexemeType()) {
Lexeme nextLexeme = this.results.peekFirst();
boolean appendOk = false;
if (Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()) {
//合并英文数词+中文数词
appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM);
} else if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
//合并英文数词+中文量词
appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
if (nextLexeme != null) {
if (Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()) {
//合并英文数词+中文数词
appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM);
} else if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
//合并英文数词+中文量词
appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
}
}
if (appendOk) {
//弹出

View File

@ -1,6 +1,6 @@
/*
* IK 中文分词 版本 8.3.0
* IK Analyzer release 8.3.0
* IK 中文分词 版本 8.3.1
* IK Analyzer release 8.3.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.3.0版本 Magese (magese@live.cn) 更新
* release 8.3.0 update by Magese(magese@live.cn)
* 8.3.1版本 Magese (magese@live.cn) 更新
* release 8.3.1 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.core;
@ -38,13 +38,13 @@ import org.wltea.analyzer.dic.Hit;
* 中文-日韩文子分词器
*/
class CJKSegmenter implements ISegmenter {
//子分词器标签
private static final String SEGMENTER_NAME = "CJK_SEGMENTER";
//待处理的分词hit队列
private List<Hit> tmpHits;
CJKSegmenter(){
this.tmpHits = new LinkedList<>();
}
@ -54,7 +54,7 @@ class CJKSegmenter implements ISegmenter {
*/
public void analyze(AnalyzeContext context) {
if(CharacterUtil.CHAR_USELESS != context.getCurrentCharType()){
//优先处理tmpHits中的hit
if(!this.tmpHits.isEmpty()){
//处理词段队列
@ -65,18 +65,18 @@ class CJKSegmenter implements ISegmenter {
//输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD);
context.addLexeme(newLexeme);
if(!hit.isPrefix()){//不是词前缀hit不需要继续匹配移除
this.tmpHits.remove(hit);
}
}else if(hit.isUnmatch()){
//hit不是词移除
this.tmpHits.remove(hit);
}
}
}
}
}
//*********************************
//再对当前指针位置的字符进行单字匹配
Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
@ -94,24 +94,24 @@ class CJKSegmenter implements ISegmenter {
//前缀匹配则放入hit列表
this.tmpHits.add(singleCharHit);
}
}else{
//遇到CHAR_USELESS字符
//清空队列
this.tmpHits.clear();
}
//判断缓冲区是否已经读完
if(context.isBufferConsumed()){
//清空队列
this.tmpHits.clear();
}
//判断是否锁定缓冲区
if(this.tmpHits.size() == 0){
context.unlockBuffer(SEGMENTER_NAME);
}else{
context.lockBuffer(SEGMENTER_NAME);
}

View File

@ -1,6 +1,6 @@
/*
* IK 中文分词 版本 8.3.0
* IK Analyzer release 8.3.0
* IK 中文分词 版本 8.3.1
* IK Analyzer release 8.3.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.3.0版本 Magese (magese@live.cn) 更新
* release 8.3.0 update by Magese(magese@live.cn)
* 8.3.1版本 Magese (magese@live.cn) 更新
* release 8.3.1 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.core;
@ -36,11 +36,11 @@ import java.util.List;
import java.util.Set;
/**
*
*
* 中文数量词子分词器
*/
class CN_QuantifierSegmenter implements ISegmenter{
//子分词器标签
private static final String SEGMENTER_NAME = "QUAN_SEGMENTER";
@ -54,7 +54,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
ChnNumberChars.add(nChar);
}
}
/*
* 词元的开始位置
* 同时作为子分词器状态标识
@ -69,14 +69,14 @@ class CN_QuantifierSegmenter implements ISegmenter{
//待处理的量词hit队列
private List<Hit> countHits;
/**
 * Creates the segmenter in its idle state with an empty queue of pending
 * quantifier hits.
 */
CN_QuantifierSegmenter(){
// -1 / -1 is the value pair processCNumber tests for to detect the initial (idle) state
nStart = -1;
nEnd = -1;
// queue of quantifier hits still waiting to be processed
this.countHits = new LinkedList<>();
}
/**
* 分词
*/
@ -85,7 +85,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
this.processCNumber(context);
//处理中文量词
this.processCount(context);
//判断是否锁定缓冲区
if(this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()){
//对缓冲区解锁
@ -94,7 +94,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
context.lockBuffer(SEGMENTER_NAME);
}
}
/**
* 重置子分词器状态
@ -104,20 +104,20 @@ class CN_QuantifierSegmenter implements ISegmenter{
nEnd = -1;
countHits.clear();
}
/**
* 处理数词
*/
private void processCNumber(AnalyzeContext context){
if(nStart == -1 && nEnd == -1){//初始状态
if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
&& ChnNumberChars.contains(context.getCurrentChar())){
//记录数词的起始结束位置
nStart = context.getCursor();
nEnd = context.getCursor();
}
}else{//正在处理状态
if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
&& ChnNumberChars.contains(context.getCurrentChar())){
//记录数词的结束位置
nEnd = context.getCursor();
@ -129,7 +129,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
nEnd = -1;
}
}
//缓冲区已经用完还有尚未输出的数词
if(context.isBufferConsumed()){
if(nStart != -1 && nEnd != -1){
@ -139,9 +139,9 @@ class CN_QuantifierSegmenter implements ISegmenter{
nStart = -1;
nEnd = -1;
}
}
}
}
/**
* 处理中文量词
* @param context 需要处理的内容
@ -151,9 +151,9 @@ class CN_QuantifierSegmenter implements ISegmenter{
if(!this.needCountScan(context)){
return;
}
if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()){
//优先处理countHits中的hit
if(!this.countHits.isEmpty()){
//处理词段队列
@ -164,17 +164,17 @@ class CN_QuantifierSegmenter implements ISegmenter{
//输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT);
context.addLexeme(newLexeme);
if(!hit.isPrefix()){//不是词前缀hit不需要继续匹配移除
this.countHits.remove(hit);
}
}else if(hit.isUnmatch()){
//hit不是词移除
this.countHits.remove(hit);
}
}
}
}
}
//*********************************
//对当前指针位置的字符进行单字匹配
@ -193,21 +193,21 @@ class CN_QuantifierSegmenter implements ISegmenter{
//前缀匹配则放入hit列表
this.countHits.add(singleCharHit);
}
}else{
//输入的不是中文字符
//清空未成形的量词
this.countHits.clear();
}
//缓冲区数据已经读完还有尚未输出的量词
if(context.isBufferConsumed()){
//清空未成形的量词
this.countHits.clear();
}
}
/**
* 判断是否需要扫描量词
*/
@ -226,7 +226,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
}
return false;
}
/**
* 添加数词词元到结果集
* @param context 需要添加的词元

View File

@ -1,6 +1,6 @@
/*
* IK 中文分词 版本 8.3.0
* IK Analyzer release 8.3.0
* IK 中文分词 版本 8.3.1
* IK Analyzer release 8.3.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.3.0版本 Magese (magese@live.cn) 更新
* release 8.3.0 update by Magese(magese@live.cn)
* 8.3.1版本 Magese (magese@live.cn) 更新
* release 8.3.1 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.core;
@ -32,18 +32,18 @@ package org.wltea.analyzer.core;
* 字符集识别工具类
*/
class CharacterUtil {
static final int CHAR_USELESS = 0;
static final int CHAR_ARABIC = 0X00000001;
static final int CHAR_ENGLISH = 0X00000002;
static final int CHAR_CHINESE = 0X00000004;
static final int CHAR_OTHER_CJK = 0X00000008;
/**
* 识别字符类型
* @param input 需要识别的字符
@ -52,23 +52,23 @@ class CharacterUtil {
static int identifyCharType(char input){
if(input >= '0' && input <= '9'){
return CHAR_ARABIC;
}else if((input >= 'a' && input <= 'z')
|| (input >= 'A' && input <= 'Z')){
return CHAR_ENGLISH;
}else {
Character.UnicodeBlock ub = Character.UnicodeBlock.of(input);
if(ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
|| ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
if(ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
|| ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
|| ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A){
//目前已知的中文字符UTF-8集合
return CHAR_CHINESE;
}else if(ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS //全角数字字符和日韩字符
//韩文字符集
|| ub == Character.UnicodeBlock.HANGUL_SYLLABLES
|| ub == Character.UnicodeBlock.HANGUL_SYLLABLES
|| ub == Character.UnicodeBlock.HANGUL_JAMO
|| ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO
//日文字符集
@ -76,13 +76,13 @@ class CharacterUtil {
|| ub == Character.UnicodeBlock.KATAKANA //片假名
|| ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS){
return CHAR_OTHER_CJK;
}
}
//其他的不做处理的字符
return CHAR_USELESS;
}
/**
* 进行字符规格化全角转半角大写转小写处理
* @param input 需要转换的字符
@ -91,14 +91,14 @@ class CharacterUtil {
static char regularize(char input){
    // Ideographic (full-width) space U+3000 becomes an ASCII space.
    if (input == 12288) {
        return (char) 32;
    }
    // Code points 65281..65374 are shifted down by 65248 onto their ASCII
    // counterparts. The result is returned as-is: a converted upper-case
    // letter is NOT lower-cased afterwards.
    if (input > 65280 && input < 65375) {
        return (char) (input - 65248);
    }
    // ASCII upper-case letters are folded to lower case.
    if (input >= 'A' && input <= 'Z') {
        return (char) (input + 32);
    }
    // Everything else passes through unchanged.
    return input;
}
}

View File

@ -1,6 +1,6 @@
/*
* IK 中文分词 版本 8.3.0
* IK Analyzer release 8.3.0
* IK 中文分词 版本 8.3.1
* IK Analyzer release 8.3.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.3.0版本 Magese (magese@live.cn) 更新
* release 8.3.0 update by Magese(magese@live.cn)
* 8.3.1版本 Magese (magese@live.cn) 更新
* release 8.3.1 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.core;

View File

@ -1,6 +1,6 @@
/*
* IK 中文分词 版本 8.3.0
* IK Analyzer release 8.3.0
* IK 中文分词 版本 8.3.1
* IK Analyzer release 8.3.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.3.0版本 Magese (magese@live.cn) 更新
* release 8.3.0 update by Magese(magese@live.cn)
* 8.3.1版本 Magese (magese@live.cn) 更新
* release 8.3.1 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.core;

View File

@ -1,6 +1,6 @@
/*
* IK 中文分词 版本 8.3.0
* IK Analyzer release 8.3.0
* IK 中文分词 版本 8.3.1
* IK Analyzer release 8.3.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -21,26 +21,26 @@
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.3.0版本 Magese (magese@live.cn) 更新
* release 8.3.0 update by Magese(magese@live.cn)
* 8.3.1版本 Magese (magese@live.cn) 更新
* release 8.3.1 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.core;
/**
*
*
* 子分词器接口
*/
interface ISegmenter {
/**
* 从分析器读取下一个可能分解的词元对象
* @param context 分词算法上下文
*/
void analyze(AnalyzeContext context);
/**
* 重置子分析器状态
*/

View File

@ -1,6 +1,6 @@
/*
* IK 中文分词 版本 8.3.0
* IK Analyzer release 8.3.0
* IK 中文分词 版本 8.3.1
* IK Analyzer release 8.3.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.3.0版本 Magese (magese@live.cn) 更新
* release 8.3.0 update by Magese(magese@live.cn)
* 8.3.1版本 Magese (magese@live.cn) 更新
* release 8.3.1 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.core;

View File

@ -1,6 +1,6 @@
/*
* IK 中文分词 版本 8.3.0
* IK Analyzer release 8.3.0
* IK 中文分词 版本 8.3.1
* IK Analyzer release 8.3.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -21,14 +21,14 @@
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.3.0版本 Magese (magese@live.cn) 更新
* release 8.3.0 update by Magese(magese@live.cn)
* 8.3.1版本 Magese (magese@live.cn) 更新
* release 8.3.1 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.core;
/**
* IK词元对象
* IK词元对象
*/
@SuppressWarnings("unused")
public class Lexeme implements Comparable<Lexeme>{
@ -50,7 +50,7 @@ public class Lexeme implements Comparable<Lexeme>{
static final int TYPE_COUNT = 32;
//中文数量词
static final int TYPE_CQUAN = 48;
//词元的起始位移
private int offset;
//词元的相对起始位置
@ -61,8 +61,8 @@ public class Lexeme implements Comparable<Lexeme>{
private String lexemeText;
//词元类型
private int lexemeType;
public Lexeme(int offset , int begin , int length , int lexemeType){
this.offset = offset;
this.begin = begin;
@ -72,7 +72,7 @@ public class Lexeme implements Comparable<Lexeme>{
this.length = length;
this.lexemeType = lexemeType;
}
/*
* 判断词元相等算法
* 起始位置偏移起始位置终止位置相同
@ -82,21 +82,21 @@ public class Lexeme implements Comparable<Lexeme>{
if(o == null){
return false;
}
if(this == o){
return true;
}
if(o instanceof Lexeme){
Lexeme other = (Lexeme)o;
return this.offset == other.getOffset()
&& this.begin == other.getBegin()
&& this.length == other.getLength();
}else{
}else{
return false;
}
}
/*
* 词元哈希编码算法
* @see java.lang.Object#hashCode()
@ -106,7 +106,7 @@ public class Lexeme implements Comparable<Lexeme>{
int absEnd = getEndPosition();
return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11;
}
/*
* 词元在排序集合中的比较算法
* @see java.lang.Comparable#compareTo(java.lang.Object)
@ -119,12 +119,12 @@ public class Lexeme implements Comparable<Lexeme>{
//词元长度优先
//this.length < other.getLength()
return Integer.compare(other.getLength(), this.length);
}else{//this.begin > other.getBegin()
return 1;
}
}
private int getOffset() {
return offset;
}
@ -155,22 +155,22 @@ public class Lexeme implements Comparable<Lexeme>{
public int getEndPosition(){
return offset + begin + length;
}
/**
* 获取词元的字符长度
* @return int
*/
public int getLength(){
return this.length;
}
}
/**
 * Sets the character length of this lexeme.
 *
 * @param length the new length; must not be negative
 * @throws IllegalArgumentException if {@code length} is negative
 */
public void setLength(int length) {
    // Validate the incoming argument. The previous check tested this.length
    // (the current field value), so a negative argument was stored silently
    // and only rejected on the NEXT call.
    if (length < 0) {
        throw new IllegalArgumentException("length < 0");
    }
    this.length = length;
}
/**
* 获取词元的文本内容
* @return String
@ -199,7 +199,7 @@ public class Lexeme implements Comparable<Lexeme>{
int getLexemeType() {
return lexemeType;
}
/**
* 获取词元类型标示字符串
* @return String
@ -209,41 +209,41 @@ public class Lexeme implements Comparable<Lexeme>{
case TYPE_ENGLISH :
return "ENGLISH";
case TYPE_ARABIC :
return "ARABIC";
case TYPE_LETTER :
return "LETTER";
case TYPE_CNWORD :
case TYPE_CNWORD :
return "CN_WORD";
case TYPE_CNCHAR :
case TYPE_CNCHAR :
return "CN_CHAR";
case TYPE_OTHER_CJK :
return "OTHER_CJK";
case TYPE_COUNT :
return "COUNT";
case TYPE_CNUM :
return "TYPE_CNUM";
case TYPE_CQUAN:
case TYPE_CQUAN:
return "TYPE_CQUAN";
default :
return "UNKONW";
}
}
public void setLexemeType(int lexemeType) {
this.lexemeType = lexemeType;
}
/**
* 合并两个相邻的词元
* @return boolean 词元是否成功合并
@ -257,16 +257,16 @@ public class Lexeme implements Comparable<Lexeme>{
return false;
}
}
/**
*
*
*/
public String toString(){
return this.getBeginPosition() + "-" + this.getEndPosition() +
" : " + this.lexemeText + " : \t" +
this.getLexemeTypeString();
}
}

View File

@ -1,6 +1,6 @@
/*
* IK 中文分词 版本 8.3.0
* IK Analyzer release 8.3.0
* IK 中文分词 版本 8.3.1
* IK Analyzer release 8.3.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.3.0版本 Magese (magese@live.cn) 更新
* release 8.3.0 update by Magese(magese@live.cn)
* 8.3.1版本 Magese (magese@live.cn) 更新
* release 8.3.1 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.core;

View File

@ -1,6 +1,6 @@
/*
* IK 中文分词 版本 8.3.0
* IK Analyzer release 8.2.0
* IK 中文分词 版本 8.3.1
* IK Analyzer release 8.3.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -31,178 +31,182 @@ package org.wltea.analyzer.core;
* IK分词器专用的Lexem快速排序集合
*/
class QuickSortSet {
//链表头
private Cell head;
//链表尾
private Cell tail;
//链表的实际大小
private int size;
QuickSortSet(){
this.size = 0;
}
/**
* 向链表集合添加词元
*/
void addLexeme(Lexeme lexeme){
Cell newCell = new Cell(lexeme);
if(this.size == 0){
this.head = newCell;
this.tail = newCell;
this.size++;
//链表头
private Cell head;
//链表尾
private Cell tail;
//链表的实际大小
private int size;
}else{
QuickSortSet() {
this.size = 0;
}
/**
* 向链表集合添加词元
*/
void addLexeme(Lexeme lexeme) {
Cell newCell = new Cell(lexeme);
if (this.size == 0) {
this.head = newCell;
this.tail = newCell;
this.size++;
} else {
/*if(this.tail.compareTo(newCell) == 0){//词元与尾部词元相同不放入集合
}else */if(this.tail.compareTo(newCell) < 0){//词元接入链表尾部
this.tail.next = newCell;
newCell.prev = this.tail;
this.tail = newCell;
this.size++;
}else */
if (this.tail.compareTo(newCell) < 0) {//词元接入链表尾部
this.tail.next = newCell;
newCell.prev = this.tail;
this.tail = newCell;
this.size++;
}else if(this.head.compareTo(newCell) > 0){//词元接入链表头部
this.head.prev = newCell;
newCell.next = this.head;
this.head = newCell;
this.size++;
} else if (this.head.compareTo(newCell) > 0) {//词元接入链表头部
this.head.prev = newCell;
newCell.next = this.head;
this.head = newCell;
this.size++;
}else{
//从尾部上逆
Cell index = this.tail;
while(index != null && index.compareTo(newCell) > 0){
index = index.prev;
}
} else {
//从尾部上逆
Cell index = this.tail;
while (index != null && index.compareTo(newCell) > 0) {
index = index.prev;
}
/*if(index.compareTo(newCell) == 0){//词元与集合中的词元重复不放入集合
}else */if((index != null ? index.compareTo(newCell) : 1) < 0){//词元插入链表中的某个位置
newCell.prev = index;
newCell.next = index.next;
index.next.prev = newCell;
index.next = newCell;
this.size++;
}
}
}
}
/**
* 返回链表头部元素
*/
Lexeme peekFirst(){
if(this.head != null){
return this.head.lexeme;
}
return null;
}
/**
* 取出链表集合的第一个元素
* @return Lexeme
*/
Lexeme pollFirst(){
if(this.size == 1){
Lexeme first = this.head.lexeme;
this.head = null;
this.tail = null;
this.size--;
return first;
}else if(this.size > 1){
Lexeme first = this.head.lexeme;
this.head = this.head.next;
this.size --;
return first;
}else{
return null;
}
}
/**
* 返回链表尾部元素
*/
Lexeme peekLast(){
if(this.tail != null){
return this.tail.lexeme;
}
return null;
}
/**
* 取出链表集合的最后一个元素
* @return Lexeme
*/
Lexeme pollLast(){
if(this.size == 1){
Lexeme last = this.head.lexeme;
this.head = null;
this.tail = null;
this.size--;
return last;
}else if(this.size > 1){
Lexeme last = this.tail.lexeme;
this.tail = this.tail.prev;
this.size--;
return last;
}else{
return null;
}
}
/**
* 返回集合大小
*/
int size(){
return this.size;
}
/**
* 判断集合是否为空
*/
boolean isEmpty(){
return this.size == 0;
}
/**
* 返回lexeme链的头部
*/
Cell getHead(){
return this.head;
}
}else */
if ((index != null ? index.compareTo(newCell) : 1) < 0) {//词元插入链表中的某个位置
newCell.prev = index;
newCell.next = index.next;
index.next.prev = newCell;
index.next = newCell;
this.size++;
}
}
}
}
/*
* IK 中文分词 版本 7.0
* IK Analyzer release 7.0
* update by Magese(magese@live.cn)
*/
@SuppressWarnings("unused")
class Cell implements Comparable<Cell>{
private Cell prev;
private Cell next;
private Lexeme lexeme;
Cell(Lexeme lexeme){
if(lexeme == null){
throw new IllegalArgumentException("lexeme must not be null");
}
this.lexeme = lexeme;
}
/**
* 返回链表头部元素
*/
Lexeme peekFirst() {
if (this.head != null) {
return this.head.lexeme;
}
return null;
}
public int compareTo(Cell o) {
return this.lexeme.compareTo(o.lexeme);
}
/**
* 取出链表集合的第一个元素
*
* @return Lexeme
*/
Lexeme pollFirst() {
if (this.size == 1) {
Lexeme first = this.head.lexeme;
this.head = null;
this.tail = null;
this.size--;
return first;
} else if (this.size > 1) {
Lexeme first = this.head.lexeme;
this.head = this.head.next;
this.size--;
return first;
} else {
return null;
}
}
public Cell getPrev(){
return this.prev;
}
Cell getNext(){
return this.next;
}
public Lexeme getLexeme(){
return this.lexeme;
}
}
/**
* 返回链表尾部元素
*/
Lexeme peekLast() {
if (this.tail != null) {
return this.tail.lexeme;
}
return null;
}
/**
* 取出链表集合的最后一个元素
*
* @return Lexeme
*/
Lexeme pollLast() {
if (this.size == 1) {
Lexeme last = this.head.lexeme;
this.head = null;
this.tail = null;
this.size--;
return last;
} else if (this.size > 1) {
Lexeme last = this.tail.lexeme;
this.tail = this.tail.prev;
this.size--;
return last;
} else {
return null;
}
}
/**
* 返回集合大小
*/
int size() {
return this.size;
}
/**
* 判断集合是否为空
*/
boolean isEmpty() {
return this.size == 0;
}
/**
* 返回lexeme链的头部
*/
Cell getHead() {
return this.head;
}
/*
* IK 中文分词 版本 8.3.1
* IK Analyzer release 8.3.1
* update by Magese(magese@live.cn)
*/
@SuppressWarnings("unused")
static class Cell implements Comparable<Cell> {
private Cell prev;
private Cell next;
private final Lexeme lexeme;
Cell(Lexeme lexeme) {
if (lexeme == null) {
throw new IllegalArgumentException("lexeme must not be null");
}
this.lexeme = lexeme;
}
public int compareTo(Cell o) {
return this.lexeme.compareTo(o.lexeme);
}
public Cell getPrev() {
return this.prev;
}
Cell getNext() {
return this.next;
}
public Lexeme getLexeme() {
return this.lexeme;
}
}
}

View File

@ -1,6 +1,6 @@
/*
* IK 中文分词 版本 8.3.0
* IK Analyzer release 8.3.0
* IK 中文分词 版本 8.3.1
* IK Analyzer release 8.3.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.3.0版本 Magese (magese@live.cn) 更新
* release 8.3.0 update by Magese(magese@live.cn)
* 8.3.1版本 Magese (magese@live.cn) 更新
* release 8.3.1 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.dic;

View File

@ -1,6 +1,6 @@
/*
* IK 中文分词 版本 8.3.0
* IK Analyzer release 8.3.0
* IK 中文分词 版本 8.3.1
* IK Analyzer release 8.3.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.3.0版本 Magese (magese@live.cn) 更新
* release 8.3.0 update by Magese(magese@live.cn)
* 8.3.1版本 Magese (magese@live.cn) 更新
* release 8.3.1 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.dic;

View File

@ -1,6 +1,6 @@
/*
* IK 中文分词 版本 8.3.0
* IK Analyzer release 8.3.0
* IK 中文分词 版本 8.3.1
* IK Analyzer release 8.3.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.3.0版本 Magese (magese@live.cn) 更新
* release 8.3.0 update by Magese(magese@live.cn)
* 8.3.1版本 Magese (magese@live.cn) 更新
* release 8.3.1 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.dic;
@ -38,13 +38,13 @@ public class Hit {
private static final int MATCH = 0x00000001;
//Hit前缀匹配
private static final int PREFIX = 0x00000010;
//该HIT当前状态默认未匹配
private int hitState = UNMATCH;
//记录词典匹配过程中当前匹配到的词典分支节点
private DictSegment matchedDictSegment;
private DictSegment matchedDictSegment;
/*
* 词段开始位置
*/
@ -53,8 +53,8 @@ public class Hit {
* 词段的结束位置
*/
private int end;
/**
* 判断是否完全匹配
*/
@ -62,7 +62,7 @@ public class Hit {
return (this.hitState & MATCH) > 0;
}
/**
*
*
*/
void setMatch() {
this.hitState = this.hitState | MATCH;
@ -75,7 +75,7 @@ public class Hit {
return (this.hitState & PREFIX) > 0;
}
/**
*
*
*/
void setPrefix() {
this.hitState = this.hitState | PREFIX;
@ -87,34 +87,34 @@ public class Hit {
return this.hitState == UNMATCH ;
}
/**
*
*
*/
void setUnmatch() {
this.hitState = UNMATCH;
}
/**
 * Returns the dictionary branch node currently matched during the
 * dictionary matching process.
 */
DictSegment getMatchedDictSegment() {
return matchedDictSegment;
}
/**
 * Records the dictionary branch node matched so far.
 *
 * @param matchedDictSegment the matched dictionary branch node
 */
void setMatchedDictSegment(DictSegment matchedDictSegment) {
this.matchedDictSegment = matchedDictSegment;
}
/**
 * Returns the start position of the matched segment.
 */
public int getBegin() {
return begin;
}
/**
 * Sets the start position of the matched segment.
 *
 * @param begin the start position
 */
void setBegin(int begin) {
this.begin = begin;
}
/**
 * Returns the end position of the matched segment.
 */
public int getEnd() {
return end;
}
/**
 * Sets the end position of the matched segment.
 *
 * @param end the end position
 */
void setEnd(int end) {
this.end = end;
}
}
}

View File

@ -1,6 +1,6 @@
/*
* IK 中文分词 版本 8.3.0
* IK Analyzer release 8.3.0
* IK 中文分词 版本 8.3.1
* IK Analyzer release 8.3.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.3.0版本 Magese (magese@live.cn) 更新
* release 8.3.0 update by Magese(magese@live.cn)
* 8.3.1版本 Magese (magese@live.cn) 更新
* release 8.3.1 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.lucene;

View File

@ -1,6 +1,6 @@
/*
* IK 中文分词 版本 8.3.0
* IK Analyzer release 8.3.0
* IK 中文分词 版本 8.3.1
* IK Analyzer release 8.3.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.3.0版本 Magese (magese@live.cn) 更新
* release 8.3.0 update by Magese(magese@live.cn)
* 8.3.1版本 Magese (magese@live.cn) 更新
* release 8.3.1 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.lucene;

View File

@ -1,6 +1,6 @@
/*
* IK 中文分词 版本 8.3.0
* IK Analyzer release 8.3.0
* IK 中文分词 版本 8.3.1
* IK Analyzer release 8.3.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.3.0版本 Magese (magese@live.cn) 更新
* release 8.3.0 update by Magese(magese@live.cn)
* 8.3.1版本 Magese (magese@live.cn) 更新
* release 8.3.1 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.lucene;
@ -74,7 +74,7 @@ public class IKTokenizerFactory extends TokenizerFactory implements ResourceLoad
*/
@Override
public void inform(ResourceLoader resourceLoader) throws IOException {
System.out.println(String.format("IKTokenizerFactory " + this.hashCode() + " inform conf: %s", getConf()));
System.out.printf("IKTokenizerFactory " + this.hashCode() + " inform conf: %s%n", getConf());
this.loader = resourceLoader;
update();
if ((getConf() != null) && (!getConf().trim().isEmpty())) {
@ -174,4 +174,4 @@ public class IKTokenizerFactory extends TokenizerFactory implements ResourceLoad
private void setConf(String conf) {
this.conf = conf;
}
}
}

View File

@ -1,6 +1,6 @@
/*
* IK 中文分词 版本 8.3.0
* IK Analyzer release 8.3.0
* IK 中文分词 版本 8.3.1
* IK Analyzer release 8.3.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.3.0版本 Magese (magese@live.cn) 更新
* release 8.3.0 update by Magese(magese@live.cn)
* 8.3.1版本 Magese (magese@live.cn) 更新
* release 8.3.1 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.lucene;
@ -35,7 +35,7 @@ import java.util.Vector;
*/
public class UpdateThread implements Runnable {
private static final long INTERVAL = 30000L; // 循环等待时间
private Vector<UpdateJob> filterFactorys; // 更新任务集合
private final Vector<UpdateJob> filterFactorys; // 更新任务集合
/**
* 私有化构造器阻止外部进行实例化
@ -51,7 +51,7 @@ public class UpdateThread implements Runnable {
* 静态内部类实现线程安全单例模式
*/
private static class Builder {
private static UpdateThread singleton = new UpdateThread();
private static final UpdateThread singleton = new UpdateThread();
}
/**
@ -81,6 +81,7 @@ public class UpdateThread implements Runnable {
//noinspection InfiniteLoopStatement
while (true) {
try {
//noinspection BusyWait
Thread.sleep(INTERVAL);
} catch (InterruptedException e) {
e.printStackTrace();

View File

@ -1,6 +1,6 @@
/*
* IK 中文分词 版本 8.3.0
* IK Analyzer release 8.3.0
* IK 中文分词 版本 8.3.1
* IK Analyzer release 8.3.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.3.0版本 Magese (magese@live.cn) 更新
* release 8.3.0 update by Magese(magese@live.cn)
* 8.3.1版本 Magese (magese@live.cn) 更新
* release 8.3.1 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.query;

View File

@ -1,6 +1,6 @@
/*
* IK 中文分词 版本 8.3.0
* IK Analyzer release 8.3.0
* IK 中文分词 版本 8.3.1
* IK Analyzer release 8.3.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.3.0版本 Magese (magese@live.cn) 更新
* release 8.3.0 update by Magese(magese@live.cn)
* 8.3.1版本 Magese (magese@live.cn) 更新
* release 8.3.1 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.query;

View File

@ -1,6 +1,6 @@
/*
* IK 中文分词 版本 8.3.0
* IK Analyzer release 8.3.0
* IK 中文分词 版本 8.3.1
* IK Analyzer release 8.3.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.3.0版本 Magese (magese@live.cn) 更新
* release 8.3.0 update by Magese(magese@live.cn)
* 8.3.1版本 Magese (magese@live.cn) 更新
* release 8.3.1 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.sample;

View File

@ -1,6 +1,6 @@
/*
* IK 中文分词 版本 8.3.0
* IK Analyzer release 8.3.0
* IK 中文分词 版本 8.3.1
* IK Analyzer release 8.3.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.3.0版本 Magese (magese@live.cn) 更新
* release 8.3.0 update by Magese(magese@live.cn)
* 8.3.1版本 Magese (magese@live.cn) 更新
* release 8.3.1 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.sample;
@ -76,6 +76,7 @@ public class LuceneIndexAndSearchDemo {
IndexSearcher isearcher;
try {
//建立内存索引对象
//noinspection deprecation
directory = new RAMDirectory();
//配置IndexWriterConfig