Compare commits

..

No commits in common. "master" and "v8.4.0" have entirely different histories.

27 changed files with 1238 additions and 1090 deletions

View File

@ -6,6 +6,7 @@ ik-analyzer for solr 7.x-8.x
[![Open Source Love](https://badges.frapsoft.com/os/v1/open-source.svg?v=103)](https://github.com/magese/ik-analyzer-solr/releases) [![Open Source Love](https://badges.frapsoft.com/os/v1/open-source.svg?v=103)](https://github.com/magese/ik-analyzer-solr/releases)
[![Crates.io](https://img.shields.io/crates/l/rustc-serialize.svg)](./LICENSE) [![Crates.io](https://img.shields.io/crates/l/rustc-serialize.svg)](./LICENSE)
[![Build Status](https://travis-ci.org/magese/ik-analyzer-solr.svg?branch=master)](https://travis-ci.org/magese/ik-analyzer-solr) [![Build Status](https://travis-ci.org/magese/ik-analyzer-solr.svg?branch=master)](https://travis-ci.org/magese/ik-analyzer-solr)
[![HitCount](http://hits.dwyl.io/magese/ik-analyzer-solr.svg)](http://hits.dwyl.io/magese/ik-analyzer-solr)
[![GitHub forks](https://img.shields.io/github/forks/magese/ik-analyzer-solr.svg?style=social&label=Fork)](https://github.com/magese/ik-analyzer-solr/network/members) [![GitHub forks](https://img.shields.io/github/forks/magese/ik-analyzer-solr.svg?style=social&label=Fork)](https://github.com/magese/ik-analyzer-solr/network/members)
[![GitHub stars](https://img.shields.io/github/stars/magese/ik-analyzer-solr.svg?style=social&label=Star)](https://github.com/magese/ik-analyzer-solr/stargazers) [![GitHub stars](https://img.shields.io/github/stars/magese/ik-analyzer-solr.svg?style=social&label=Star)](https://github.com/magese/ik-analyzer-solr/stargazers)
@ -35,7 +36,7 @@ ik-analyzer for solr 7.x-8.x
## 使用说明 ## 使用说明
* jar包下载地址[![GitHub version](https://img.shields.io/badge/version-8.5.0-519dd9.svg)](https://search.maven.org/remotecontent?filepath=com/github/magese/ik-analyzer/8.5.0/ik-analyzer-8.5.0.jar) * jar包下载地址[![GitHub version](https://img.shields.io/badge/version-8.4.0-519dd9.svg)](https://search.maven.org/remotecontent?filepath=com/github/magese/ik-analyzer/8.4.0/ik-analyzer-8.4.0.jar)
* 历史版本:[![GitHub version](https://img.shields.io/maven-central/v/com.github.magese/ik-analyzer.svg?style=flat-square)](https://search.maven.org/search?q=g:com.github.magese%20AND%20a:ik-analyzer&core=gav) * 历史版本:[![GitHub version](https://img.shields.io/maven-central/v/com.github.magese/ik-analyzer.svg?style=flat-square)](https://search.maven.org/search?q=g:com.github.magese%20AND%20a:ik-analyzer&core=gav)
```xml ```xml
@ -43,7 +44,7 @@ ik-analyzer for solr 7.x-8.x
<dependency> <dependency>
<groupId>com.github.magese</groupId> <groupId>com.github.magese</groupId>
<artifactId>ik-analyzer</artifactId> <artifactId>ik-analyzer</artifactId>
<version>8.5.0</version> <version>8.4.0</version>
</dependency> </dependency>
``` ```
@ -105,7 +106,6 @@ ik-analyzer for solr 7.x-8.x
## 更新说明 ## 更新说明
- **2021-12-23:** 升级lucene版本为`8.5.0`
- **2021-03-22:** 升级lucene版本为`8.4.0` - **2021-03-22:** 升级lucene版本为`8.4.0`
- **2020-12-30:** - **2020-12-30:**
- 升级lucene版本为`8.3.1` - 升级lucene版本为`8.3.1`
@ -129,7 +129,7 @@ ik-analyzer for solr 7.x-8.x
- **2018-12-04:** 整理更新词库列表`magese.dic` - **2018-12-04:** 整理更新词库列表`magese.dic`
- **2018-10-10:** 升级lucene版本为`7.5.0` - **2018-10-10:** 升级lucene版本为`7.5.0`
- **2018-09-03:** 优化注释与输出信息取消部分中文输出避免不同字符集乱码现会打印被调用inform方法的hashcode - **2018-09-03:** 优化注释与输出信息取消部分中文输出避免不同字符集乱码现会打印被调用inform方法的hashcode
- **2018-08-23:** - **2018-08-23: **
- 完善了动态更新词库代码注释; - 完善了动态更新词库代码注释;
- 将ik.conf配置文件中的lastUpdate属性改为long类型现已支持时间戳形式 - 将ik.conf配置文件中的lastUpdate属性改为long类型现已支持时间戳形式
- **2018-08-13:** 更新maven仓库地址 - **2018-08-13:** 更新maven仓库地址

11
pom.xml
View File

@ -4,7 +4,7 @@
<groupId>com.github.magese</groupId> <groupId>com.github.magese</groupId>
<artifactId>ik-analyzer</artifactId> <artifactId>ik-analyzer</artifactId>
<version>8.5.0</version> <version>8.4.0</version>
<packaging>jar</packaging> <packaging>jar</packaging>
<name>ik-analyzer-solr</name> <name>ik-analyzer-solr</name>
@ -13,13 +13,20 @@
<properties> <properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<lucene.version>8.5.0</lucene.version> <lucene.version>8.4.0</lucene.version>
<javac.src.version>1.8</javac.src.version> <javac.src.version>1.8</javac.src.version>
<javac.target.version>1.8</javac.target.version> <javac.target.version>1.8</javac.target.version>
<maven.compiler.plugin.version>3.3</maven.compiler.plugin.version> <maven.compiler.plugin.version>3.3</maven.compiler.plugin.version>
</properties> </properties>
<dependencies> <dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.13.1</version>
<scope>test</scope>
</dependency>
<dependency> <dependency>
<groupId>org.apache.lucene</groupId> <groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId> <artifactId>lucene-core</artifactId>

View File

@ -1,6 +1,6 @@
/* /*
* IK 中文分词 版本 8.5.0 * IK 中文分词 版本 8.4.0
* IK Analyzer release 8.5.0 * IK Analyzer release 8.4.0
* *
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室 * 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio * provided by Linliangyi and copyright 2012 by Oolong studio
* *
* 8.5.0版本 Magese (magese@live.cn) 更新 * 8.4.0版本 Magese (magese@live.cn) 更新
* release 8.5.0 update by Magese(magese@live.cn) * release 8.4.0 update by Magese(magese@live.cn)
* *
*/ */
package org.wltea.analyzer.cfg; package org.wltea.analyzer.cfg;
@ -76,7 +76,7 @@ public interface Configuration {
* *
* @return String 量词词典路径 * @return String 量词词典路径
*/ */
String getQuantifierDictionary(); String getQuantifierDicionary();
/** /**
* 获取扩展字典配置路径 * 获取扩展字典配置路径

View File

@ -1,6 +1,6 @@
/* /*
* IK 中文分词 版本 8.5.0 * IK 中文分词 版本 8.4.0
* IK Analyzer release 8.5.0 * IK Analyzer release 8.4.0
* *
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室 * 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio * provided by Linliangyi and copyright 2012 by Oolong studio
* *
* 8.5.0版本 Magese (magese@live.cn) 更新 * 8.4.0版本 Magese (magese@live.cn) 更新
* release 8.5.0 update by Magese(magese@live.cn) * release 8.4.0 update by Magese(magese@live.cn)
* *
*/ */
package org.wltea.analyzer.cfg; package org.wltea.analyzer.cfg;
@ -145,7 +145,7 @@ public class DefaultConfig implements Configuration {
* *
* @return String 量词词典路径 * @return String 量词词典路径
*/ */
public String getQuantifierDictionary() { public String getQuantifierDicionary() {
return PATH_DIC_QUANTIFIER; return PATH_DIC_QUANTIFIER;
} }

View File

@ -1,6 +1,6 @@
/* /*
* IK 中文分词 版本 8.5.0 * IK 中文分词 版本 8.4.0
* IK Analyzer release 8.5.0 * IK Analyzer release 8.4.0
* *
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室 * 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio * provided by Linliangyi and copyright 2012 by Oolong studio
* *
* 8.5.0版本 Magese (magese@live.cn) 更新 * 8.4.0版本 Magese (magese@live.cn) 更新
* release 8.5.0 update by Magese(magese@live.cn) * release 8.4.0 update by Magese(magese@live.cn)
* *
*/ */
package org.wltea.analyzer.core; package org.wltea.analyzer.core;
@ -39,39 +39,39 @@ import java.util.*;
*/ */
class AnalyzeContext { class AnalyzeContext {
// 默认缓冲区大小 //默认缓冲区大小
private static final int BUFF_SIZE = 4096; private static final int BUFF_SIZE = 4096;
// 缓冲区耗尽的临界值 //缓冲区耗尽的临界值
private static final int BUFF_EXHAUST_CRITICAL = 100; private static final int BUFF_EXHAUST_CRITICAL = 100;
// 字符窜读取缓冲 //字符窜读取缓冲
private char[] segmentBuff; private char[] segmentBuff;
// 字符类型数组 //字符类型数组
private int[] charTypes; private int[] charTypes;
// 记录Reader内已分析的字串总长度 //记录Reader内已分析的字串总长度
// 在分多段分析词元时该变量累计当前的segmentBuff相对于reader起始位置的位移 //在分多段分析词元时该变量累计当前的segmentBuff相对于reader起始位置的位移
private int buffOffset; private int buffOffset;
// 当前缓冲区位置指针 //当前缓冲区位置指针
private int cursor; private int cursor;
// 最近一次读入的,可处理的字串长度 //最近一次读入的,可处理的字串长度
private int available; private int available;
// 子分词器锁 //子分词器锁
// 该集合非空说明有子分词器在占用segmentBuff //该集合非空说明有子分词器在占用segmentBuff
private final Set<String> buffLocker; private final Set<String> buffLocker;
// 原始分词结果集合未经歧义处理 //原始分词结果集合未经歧义处理
private QuickSortSet orgLexemes; private QuickSortSet orgLexemes;
// LexemePath位置索引表 //LexemePath位置索引表
private final Map<Integer, LexemePath> pathMap; private final Map<Integer, LexemePath> pathMap;
// 最终分词结果集 //最终分词结果集
private final LinkedList<Lexeme> results; private final LinkedList<Lexeme> results;
// 分词器配置项 //分词器配置项
private final Configuration cfg; private final Configuration cfg;
AnalyzeContext(Configuration cfg) { AnalyzeContext(Configuration cfg) {
@ -113,21 +113,21 @@ class AnalyzeContext {
int fillBuffer(Reader reader) throws IOException { int fillBuffer(Reader reader) throws IOException {
int readCount = 0; int readCount = 0;
if (this.buffOffset == 0) { if (this.buffOffset == 0) {
// 首次读取reader //首次读取reader
readCount = reader.read(segmentBuff); readCount = reader.read(segmentBuff);
} else { } else {
int offset = this.available - this.cursor; int offset = this.available - this.cursor;
if (offset > 0) { if (offset > 0) {
// 最近一次读取的>最近一次处理的将未处理的字串拷贝到segmentBuff头部 //最近一次读取的>最近一次处理的将未处理的字串拷贝到segmentBuff头部
System.arraycopy(this.segmentBuff, this.cursor, this.segmentBuff, 0, offset); System.arraycopy(this.segmentBuff, this.cursor, this.segmentBuff, 0, offset);
readCount = offset; readCount = offset;
} }
// 继续读取reader 以onceReadIn - onceAnalyzed为起始位置继续填充segmentBuff剩余的部分 //继续读取reader 以onceReadIn - onceAnalyzed为起始位置继续填充segmentBuff剩余的部分
readCount += reader.read(this.segmentBuff, offset, BUFF_SIZE - offset); readCount += reader.read(this.segmentBuff, offset, BUFF_SIZE - offset);
} }
// 记录最后一次从Reader中读入的可用字符长度 //记录最后一次从Reader中读入的可用字符长度
this.available = readCount; this.available = readCount;
// 重置当前指针 //重置当前指针
this.cursor = 0; this.cursor = 0;
return readCount; return readCount;
} }
@ -251,35 +251,35 @@ class AnalyzeContext {
void outputToResult() { void outputToResult() {
int index = 0; int index = 0;
while (index <= this.cursor) { while (index <= this.cursor) {
// 跳过非CJK字符 //跳过非CJK字符
if (CharacterUtil.CHAR_USELESS == this.charTypes[index]) { if (CharacterUtil.CHAR_USELESS == this.charTypes[index]) {
index++; index++;
continue; continue;
} }
// 从pathMap找出对应index位置的LexemePath //从pathMap找出对应index位置的LexemePath
LexemePath path = this.pathMap.get(index); LexemePath path = this.pathMap.get(index);
if (path != null) { if (path != null) {
// 输出LexemePath中的lexeme到results集合 //输出LexemePath中的lexeme到results集合
Lexeme l = path.pollFirst(); Lexeme l = path.pollFirst();
while (l != null) { while (l != null) {
this.results.add(l); this.results.add(l);
// 将index移至lexeme后 //将index移至lexeme后
index = l.getBegin() + l.getLength(); index = l.getBegin() + l.getLength();
l = path.pollFirst(); l = path.pollFirst();
if (l != null) { if (l != null) {
// 输出path内部词元间遗漏的单字 //输出path内部词元间遗漏的单字
for (; index < l.getBegin(); index++) { for (; index < l.getBegin(); index++) {
this.outputSingleCJK(index); this.outputSingleCJK(index);
} }
} }
} }
} else {// pathMap中找不到index对应的LexemePath } else {//pathMap中找不到index对应的LexemePath
// 单字输出 //单字输出
this.outputSingleCJK(index); this.outputSingleCJK(index);
index++; index++;
} }
} }
// 清空当前的Map //清空当前的Map
this.pathMap.clear(); this.pathMap.clear();
} }
@ -304,16 +304,16 @@ class AnalyzeContext {
* 同时处理合并 * 同时处理合并
*/ */
Lexeme getNextLexeme() { Lexeme getNextLexeme() {
// 从结果集取出并移除第一个Lexme //从结果集取出并移除第一个Lexme
Lexeme result = this.results.pollFirst(); Lexeme result = this.results.pollFirst();
while (result != null) { while (result != null) {
// 数量词合并 //数量词合并
this.compound(result); this.compound(result);
if (Dictionary.getSingleton().isStopWord(this.segmentBuff, result.getBegin(), result.getLength())) { if (Dictionary.getSingleton().isStopWord(this.segmentBuff, result.getBegin(), result.getLength())) {
// 是停止词继续取列表的下一个 //是停止词继续取列表的下一个
result = this.results.pollFirst(); result = this.results.pollFirst();
} else { } else {
// 不是停止词, 生成lexeme的词元文本,输出 //不是停止词, 生成lexeme的词元文本,输出
result.setLexemeText(String.valueOf(segmentBuff, result.getBegin(), result.getLength())); result.setLexemeText(String.valueOf(segmentBuff, result.getBegin(), result.getLength()));
break; break;
} }
@ -343,7 +343,7 @@ class AnalyzeContext {
if (!this.cfg.useSmart()) { if (!this.cfg.useSmart()) {
return; return;
} }
// 数量词合并处理 //数量词合并处理
if (!this.results.isEmpty()) { if (!this.results.isEmpty()) {
if (Lexeme.TYPE_ARABIC == result.getLexemeType()) { if (Lexeme.TYPE_ARABIC == result.getLexemeType()) {
@ -351,29 +351,29 @@ class AnalyzeContext {
boolean appendOk = false; boolean appendOk = false;
if (nextLexeme != null) { if (nextLexeme != null) {
if (Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()) { if (Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()) {
// 合并英文数词+中文数词 //合并英文数词+中文数词
appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM); appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM);
} else if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) { } else if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
// 合并英文数词+中文量词 //合并英文数词+中文量词
appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN); appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
} }
} }
if (appendOk) { if (appendOk) {
// 弹出 //弹出
this.results.pollFirst(); this.results.pollFirst();
} }
} }
// 可能存在第二轮合并 //可能存在第二轮合并
if (Lexeme.TYPE_CNUM == result.getLexemeType() && !this.results.isEmpty()) { if (Lexeme.TYPE_CNUM == result.getLexemeType() && !this.results.isEmpty()) {
Lexeme nextLexeme = this.results.peekFirst(); Lexeme nextLexeme = this.results.peekFirst();
boolean appendOk = false; boolean appendOk = false;
if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) { if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
// 合并中文数词+中文量词 //合并中文数词+中文量词
appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN); appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
} }
if (appendOk) { if (appendOk) {
// 弹出 //弹出
this.results.pollFirst(); this.results.pollFirst();
} }
} }

View File

@ -1,6 +1,6 @@
/* /*
* IK 中文分词 版本 8.5.0 * IK 中文分词 版本 8.4.0
* IK Analyzer release 8.5.0 * IK Analyzer release 8.4.0
* *
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -21,107 +21,108 @@
* 版权声明 2012乌龙茶工作室 * 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio * provided by Linliangyi and copyright 2012 by Oolong studio
* *
* 8.5.0版本 Magese (magese@live.cn) 更新 * 8.4.0版本 Magese (magese@live.cn) 更新
* release 8.5.0 update by Magese(magese@live.cn) * release 8.4.0 update by Magese(magese@live.cn)
* *
*/ */
package org.wltea.analyzer.core; package org.wltea.analyzer.core;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.dic.Hit;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.dic.Hit;
/** /**
* 中文-日韩文子分词器 * 中文-日韩文子分词器
*/ */
class CJKSegmenter implements ISegmenter { class CJKSegmenter implements ISegmenter {
// 子分词器标签 //子分词器标签
private static final String SEGMENTER_NAME = "CJK_SEGMENTER"; private static final String SEGMENTER_NAME = "CJK_SEGMENTER";
// 待处理的分词hit队列 //待处理的分词hit队列
private final List<Hit> tmpHits; private List<Hit> tmpHits;
CJKSegmenter() { CJKSegmenter(){
this.tmpHits = new LinkedList<>(); this.tmpHits = new LinkedList<>();
} }
/* (non-Javadoc) /* (non-Javadoc)
* @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext) * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext)
*/ */
public void analyze(AnalyzeContext context) { public void analyze(AnalyzeContext context) {
if (CharacterUtil.CHAR_USELESS != context.getCurrentCharType()) { if(CharacterUtil.CHAR_USELESS != context.getCurrentCharType()){
// 优先处理tmpHits中的hit //优先处理tmpHits中的hit
if (!this.tmpHits.isEmpty()) { if(!this.tmpHits.isEmpty()){
// 处理词段队列 //处理词段队列
Hit[] tmpArray = this.tmpHits.toArray(new Hit[0]); Hit[] tmpArray = this.tmpHits.toArray(new Hit[0]);
for (Hit hit : tmpArray) { for(Hit hit : tmpArray){
hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor(), hit); hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
if (hit.isMatch()) { if(hit.isMatch()){
// 输出当前的词 //输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), hit.getBegin(), context.getCursor() - hit.getBegin() + 1, Lexeme.TYPE_CNWORD); Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD);
context.addLexeme(newLexeme); context.addLexeme(newLexeme);
if (!hit.isPrefix()) {// 不是词前缀hit不需要继续匹配移除 if(!hit.isPrefix()){//不是词前缀hit不需要继续匹配移除
this.tmpHits.remove(hit); this.tmpHits.remove(hit);
} }
} else if (hit.isUnmatch()) { }else if(hit.isUnmatch()){
// hit不是词移除 //hit不是词移除
this.tmpHits.remove(hit); this.tmpHits.remove(hit);
} }
} }
} }
// ********************************* //*********************************
// 再对当前指针位置的字符进行单字匹配 //再对当前指针位置的字符进行单字匹配
Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1); Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
if(singleCharHit.isMatch()){//首字成词
//输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD);
context.addLexeme(newLexeme);
// 首字为词前缀 //同时也是词前缀
if (singleCharHit.isMatch()) { if(singleCharHit.isPrefix()){
// 输出当前的词 //前缀匹配则放入hit列表
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), context.getCursor(), 1, Lexeme.TYPE_CNWORD); this.tmpHits.add(singleCharHit);
context.addLexeme(newLexeme); }
} }else if(singleCharHit.isPrefix()){//首字为词前缀
//前缀匹配则放入hit列表
// 前缀匹配则放入hit列表 this.tmpHits.add(singleCharHit);
if (singleCharHit.isPrefix()) { }
// 前缀匹配则放入hit列表
this.tmpHits.add(singleCharHit);
}
} else { }else{
// 遇到CHAR_USELESS字符 //遇到CHAR_USELESS字符
// 清空队列 //清空队列
this.tmpHits.clear(); this.tmpHits.clear();
} }
// 判断缓冲区是否已经读完 //判断缓冲区是否已经读完
if (context.isBufferConsumed()) { if(context.isBufferConsumed()){
// 清空队列 //清空队列
this.tmpHits.clear(); this.tmpHits.clear();
} }
// 判断是否锁定缓冲区 //判断是否锁定缓冲区
if (this.tmpHits.size() == 0) { if(this.tmpHits.size() == 0){
context.unlockBuffer(SEGMENTER_NAME); context.unlockBuffer(SEGMENTER_NAME);
} else { }else{
context.lockBuffer(SEGMENTER_NAME); context.lockBuffer(SEGMENTER_NAME);
} }
} }
/* (non-Javadoc) /* (non-Javadoc)
* @see org.wltea.analyzer.core.ISegmenter#reset() * @see org.wltea.analyzer.core.ISegmenter#reset()
*/ */
public void reset() { public void reset() {
// 清空队列 //清空队列
this.tmpHits.clear(); this.tmpHits.clear();
} }
} }

View File

@ -1,6 +1,6 @@
/* /*
* IK 中文分词 版本 8.5.0 * IK 中文分词 版本 8.4.0
* IK Analyzer release 8.5.0 * IK Analyzer release 8.4.0
* *
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室 * 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio * provided by Linliangyi and copyright 2012 by Oolong studio
* *
* 8.5.0版本 Magese (magese@live.cn) 更新 * 8.4.0版本 Magese (magese@live.cn) 更新
* release 8.5.0 update by Magese(magese@live.cn) * release 8.4.0 update by Magese(magese@live.cn)
* *
*/ */
package org.wltea.analyzer.core; package org.wltea.analyzer.core;
@ -36,205 +36,206 @@ import java.util.List;
import java.util.Set; import java.util.Set;
/** /**
*
* 中文数量词子分词器 * 中文数量词子分词器
*/ */
class CN_QuantifierSegmenter implements ISegmenter { class CN_QuantifierSegmenter implements ISegmenter{
// 子分词器标签 //子分词器标签
private static final String SEGMENTER_NAME = "QUAN_SEGMENTER"; private static final String SEGMENTER_NAME = "QUAN_SEGMENTER";
private static final Set<Character> CHN_NUMBER_CHARS = new HashSet<>(); private static Set<Character> ChnNumberChars = new HashSet<>();
static{
//中文数词
//Cnum
String chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";
char[] ca = chn_Num.toCharArray();
for(char nChar : ca){
ChnNumberChars.add(nChar);
}
}
static { /*
// 中文数词 * 词元的开始位置
String chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿"; * 同时作为子分词器状态标识
char[] ca = chn_Num.toCharArray(); * 当start > -1 标识当前的分词器正在处理字符
for (char nChar : ca) { */
CHN_NUMBER_CHARS.add(nChar); private int nStart;
} /*
} * 记录词元结束位置
* end记录的是在词元中最后一个出现的合理的数词结束
*/
private int nEnd;
/* //待处理的量词hit队列
* 词元的开始位置 private List<Hit> countHits;
* 同时作为子分词器状态标识
* 当start > -1 标识当前的分词器正在处理字符
*/
private int nStart;
/*
* 记录词元结束位置
* end记录的是在词元中最后一个出现的合理的数词结束
*/
private int nEnd;
// 待处理的量词hit队列
private final List<Hit> countHits;
CN_QuantifierSegmenter() { CN_QuantifierSegmenter(){
nStart = -1; nStart = -1;
nEnd = -1; nEnd = -1;
this.countHits = new LinkedList<>(); this.countHits = new LinkedList<>();
} }
/** /**
* 分词 * 分词
*/ */
public void analyze(AnalyzeContext context) { public void analyze(AnalyzeContext context) {
// 处理中文数词 //处理中文数词
this.processCNumber(context); this.processCNumber(context);
// 处理中文量词 //处理中文量词
this.processCount(context); this.processCount(context);
// 判断是否锁定缓冲区 //判断是否锁定缓冲区
if (this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()) { if(this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()){
// 对缓冲区解锁 //对缓冲区解锁
context.unlockBuffer(SEGMENTER_NAME); context.unlockBuffer(SEGMENTER_NAME);
} else { }else{
context.lockBuffer(SEGMENTER_NAME); context.lockBuffer(SEGMENTER_NAME);
} }
} }
/** /**
* 重置子分词器状态 * 重置子分词器状态
*/ */
public void reset() { public void reset() {
nStart = -1; nStart = -1;
nEnd = -1; nEnd = -1;
countHits.clear(); countHits.clear();
} }
/** /**
* 处理数词 * 处理数词
*/ */
private void processCNumber(AnalyzeContext context) { private void processCNumber(AnalyzeContext context){
if (nStart == -1 && nEnd == -1) {// 初始状态 if(nStart == -1 && nEnd == -1){//初始状态
if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType() if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
&& CHN_NUMBER_CHARS.contains(context.getCurrentChar())) { && ChnNumberChars.contains(context.getCurrentChar())){
// 记录数词的起始结束位置 //记录数词的起始结束位置
nStart = context.getCursor(); nStart = context.getCursor();
nEnd = context.getCursor(); nEnd = context.getCursor();
} }
} else {// 正在处理状态 }else{//正在处理状态
if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType() if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
&& CHN_NUMBER_CHARS.contains(context.getCurrentChar())) { && ChnNumberChars.contains(context.getCurrentChar())){
// 记录数词的结束位置 //记录数词的结束位置
nEnd = context.getCursor(); nEnd = context.getCursor();
} else { }else{
// 输出数词 //输出数词
this.outputNumLexeme(context); this.outputNumLexeme(context);
// 重置头尾指针 //重置头尾指针
nStart = -1; nStart = -1;
nEnd = -1; nEnd = -1;
} }
} }
// 缓冲区已经用完还有尚未输出的数词 //缓冲区已经用完还有尚未输出的数词
if (context.isBufferConsumed()) { if(context.isBufferConsumed()){
if (nStart != -1 && nEnd != -1) { if(nStart != -1 && nEnd != -1){
// 输出数词 //输出数词
outputNumLexeme(context); outputNumLexeme(context);
// 重置头尾指针 //重置头尾指针
nStart = -1; nStart = -1;
nEnd = -1; nEnd = -1;
} }
} }
} }
/** /**
* 处理中文量词 * 处理中文量词
* * @param context 需要处理的内容
* @param context 需要处理的内容 */
*/ private void processCount(AnalyzeContext context){
private void processCount(AnalyzeContext context) { // 判断是否需要启动量词扫描
// 判断是否需要启动量词扫描 if(!this.needCountScan(context)){
if (!this.needCountScan(context)) { return;
return; }
}
if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()) { if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()){
// 优先处理countHits中的hit //优先处理countHits中的hit
if (!this.countHits.isEmpty()) { if(!this.countHits.isEmpty()){
// 处理词段队列 //处理词段队列
Hit[] tmpArray = this.countHits.toArray(new Hit[0]); Hit[] tmpArray = this.countHits.toArray(new Hit[0]);
for (Hit hit : tmpArray) { for(Hit hit : tmpArray){
hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor(), hit); hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
if (hit.isMatch()) { if(hit.isMatch()){
// 输出当前的词 //输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), hit.getBegin(), context.getCursor() - hit.getBegin() + 1, Lexeme.TYPE_COUNT); Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT);
context.addLexeme(newLexeme); context.addLexeme(newLexeme);
if (!hit.isPrefix()) {// 不是词前缀hit不需要继续匹配移除 if(!hit.isPrefix()){//不是词前缀hit不需要继续匹配移除
this.countHits.remove(hit); this.countHits.remove(hit);
} }
} else if (hit.isUnmatch()) { }else if(hit.isUnmatch()){
// hit不是词移除 //hit不是词移除
this.countHits.remove(hit); this.countHits.remove(hit);
} }
} }
} }
// ********************************* //*********************************
// 对当前指针位置的字符进行单字匹配 //对当前指针位置的字符进行单字匹配
Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1); Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
if(singleCharHit.isMatch()){//首字成量词词
//输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT);
context.addLexeme(newLexeme);
// 首字为量词前缀 //同时也是词前缀
if (singleCharHit.isMatch()) { if(singleCharHit.isPrefix()){
// 输出当前的词 //前缀匹配则放入hit列表
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), context.getCursor(), 1, Lexeme.TYPE_COUNT); this.countHits.add(singleCharHit);
context.addLexeme(newLexeme); }
} }else if(singleCharHit.isPrefix()){//首字为量词前缀
//前缀匹配则放入hit列表
this.countHits.add(singleCharHit);
}
// 前缀匹配则放入hit列表
if (singleCharHit.isPrefix()) {
// 前缀匹配则放入hit列表
this.countHits.add(singleCharHit);
}
} else { }else{
// 输入的不是中文字符 //输入的不是中文字符
// 清空未成形的量词 //清空未成形的量词
this.countHits.clear(); this.countHits.clear();
} }
// 缓冲区数据已经读完还有尚未输出的量词 //缓冲区数据已经读完还有尚未输出的量词
if (context.isBufferConsumed()) { if(context.isBufferConsumed()){
// 清空未成形的量词 //清空未成形的量词
this.countHits.clear(); this.countHits.clear();
} }
} }
/** /**
* 判断是否需要扫描量词 * 判断是否需要扫描量词
*/ */
private boolean needCountScan(AnalyzeContext context) { private boolean needCountScan(AnalyzeContext context){
if ((nStart != -1 && nEnd != -1) || !countHits.isEmpty()) { if((nStart != -1 && nEnd != -1 ) || !countHits.isEmpty()){
// 正在处理中文数词,或者正在处理量词 //正在处理中文数词,或者正在处理量词
return true; return true;
} else { }else{
// 找到一个相邻的数词 //找到一个相邻的数词
if (!context.getOrgLexemes().isEmpty()) { if(!context.getOrgLexemes().isEmpty()){
Lexeme l = context.getOrgLexemes().peekLast(); Lexeme l = context.getOrgLexemes().peekLast();
if (Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType()) { if(Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType()){
return l.getBegin() + l.getLength() == context.getCursor(); return l.getBegin() + l.getLength() == context.getCursor();
} }
} }
} }
return false; return false;
} }
/** /**
* 添加数词词元到结果集 * 添加数词词元到结果集
* * @param context 需要添加的词元
* @param context 需要添加的词元 */
*/ private void outputNumLexeme(AnalyzeContext context){
private void outputNumLexeme(AnalyzeContext context) { if(nStart > -1 && nEnd > -1){
if (nStart > -1 && nEnd > -1) { //输出数词
// 输出数词 Lexeme newLexeme = new Lexeme(context.getBufferOffset() , nStart , nEnd - nStart + 1 , Lexeme.TYPE_CNUM);
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), nStart, nEnd - nStart + 1, Lexeme.TYPE_CNUM); context.addLexeme(newLexeme);
context.addLexeme(newLexeme); }
} }
}
} }

View File

@ -1,6 +1,6 @@
/* /*
* IK 中文分词 版本 8.5.0 * IK 中文分词 版本 8.4.0
* IK Analyzer release 8.5.0 * IK Analyzer release 8.4.0
* *
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -21,85 +21,84 @@
* 版权声明 2012乌龙茶工作室 * 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio * provided by Linliangyi and copyright 2012 by Oolong studio
* *
* 8.5.0版本 Magese (magese@live.cn) 更新 * 8.4.0版本 Magese (magese@live.cn) 更新
* release 8.5.0 update by Magese(magese@live.cn) * release 8.4.0 update by Magese(magese@live.cn)
* *
*/ */
package org.wltea.analyzer.core; package org.wltea.analyzer.core;
/** /**
*
* 字符集识别工具类 * 字符集识别工具类
*/ */
class CharacterUtil { class CharacterUtil {
static final int CHAR_USELESS = 0; static final int CHAR_USELESS = 0;
static final int CHAR_ARABIC = 0X00000001; static final int CHAR_ARABIC = 0X00000001;
static final int CHAR_ENGLISH = 0X00000002; static final int CHAR_ENGLISH = 0X00000002;
static final int CHAR_CHINESE = 0X00000004; static final int CHAR_CHINESE = 0X00000004;
static final int CHAR_OTHER_CJK = 0X00000008; static final int CHAR_OTHER_CJK = 0X00000008;
/** /**
* 识别字符类型 * 识别字符类型
* * @param input 需要识别的字符
* @param input 需要识别的字符 * @return int CharacterUtil定义的字符类型常量
* @return int CharacterUtil定义的字符类型常量 */
*/ static int identifyCharType(char input){
static int identifyCharType(char input) { if(input >= '0' && input <= '9'){
if (input >= '0' && input <= '9') { return CHAR_ARABIC;
return CHAR_ARABIC;
} else if ((input >= 'a' && input <= 'z') }else if((input >= 'a' && input <= 'z')
|| (input >= 'A' && input <= 'Z')) { || (input >= 'A' && input <= 'Z')){
return CHAR_ENGLISH; return CHAR_ENGLISH;
} else { }else {
Character.UnicodeBlock ub = Character.UnicodeBlock.of(input); Character.UnicodeBlock ub = Character.UnicodeBlock.of(input);
if (ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS if(ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
|| ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
|| ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A) { || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A){
//目前已知的中文字符UTF-8集合 //目前已知的中文字符UTF-8集合
return CHAR_CHINESE; return CHAR_CHINESE;
} else if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS //全角数字字符和日韩字符 }else if(ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS //全角数字字符和日韩字符
//韩文字符集 //韩文字符集
|| ub == Character.UnicodeBlock.HANGUL_SYLLABLES || ub == Character.UnicodeBlock.HANGUL_SYLLABLES
|| ub == Character.UnicodeBlock.HANGUL_JAMO || ub == Character.UnicodeBlock.HANGUL_JAMO
|| ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO || ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO
//日文字符集 //日文字符集
|| ub == Character.UnicodeBlock.HIRAGANA //平假名 || ub == Character.UnicodeBlock.HIRAGANA //平假名
|| ub == Character.UnicodeBlock.KATAKANA //片假名 || ub == Character.UnicodeBlock.KATAKANA //片假名
|| ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS) { || ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS){
return CHAR_OTHER_CJK; return CHAR_OTHER_CJK;
} }
} }
//其他的不做处理的字符 //其他的不做处理的字符
return CHAR_USELESS; return CHAR_USELESS;
} }
/** /**
* 进行字符规格化全角转半角大写转小写处理 * 进行字符规格化全角转半角大写转小写处理
* * @param input 需要转换的字符
* @param input 需要转换的字符 * @return char
* @return char */
*/ static char regularize(char input){
static char regularize(char input) {
if (input == 12288) { if (input == 12288) {
input = (char) 32; input = (char) 32;
} else if (input > 65280 && input < 65375) { }else if (input > 65280 && input < 65375) {
input = (char) (input - 65248); input = (char) (input - 65248);
} else if (input >= 'A' && input <= 'Z') { }else if (input >= 'A' && input <= 'Z') {
input += 32; input += 32;
} }
return input; return input;
} }
} }

View File

@ -1,6 +1,6 @@
/* /*
* IK 中文分词 版本 8.5.0 * IK 中文分词 版本 8.4.0
* IK Analyzer release 8.5.0 * IK Analyzer release 8.4.0
* *
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室 * 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio * provided by Linliangyi and copyright 2012 by Oolong studio
* *
* 8.5.0版本 Magese (magese@live.cn) 更新 * 8.4.0版本 Magese (magese@live.cn) 更新
* release 8.5.0 update by Magese(magese@live.cn) * release 8.4.0 update by Magese(magese@live.cn)
* *
*/ */
package org.wltea.analyzer.core; package org.wltea.analyzer.core;
@ -35,7 +35,9 @@ import java.util.TreeSet;
*/ */
class IKArbitrator { class IKArbitrator {
IKArbitrator() {} IKArbitrator() {
}
/** /**
* 分词歧义处理 * 分词歧义处理
@ -50,20 +52,20 @@ class IKArbitrator {
LexemePath crossPath = new LexemePath(); LexemePath crossPath = new LexemePath();
while (orgLexeme != null) { while (orgLexeme != null) {
if (!crossPath.addCrossLexeme(orgLexeme)) { if (!crossPath.addCrossLexeme(orgLexeme)) {
// 找到与crossPath不相交的下一个crossPath //找到与crossPath不相交的下一个crossPath
if (crossPath.size() == 1 || !useSmart) { if (crossPath.size() == 1 || !useSmart) {
// crossPath没有歧义 或者 不做歧义处理 //crossPath没有歧义 或者 不做歧义处理
// 直接输出当前crossPath //直接输出当前crossPath
context.addLexemePath(crossPath); context.addLexemePath(crossPath);
} else { } else {
// 对当前的crossPath进行歧义处理 //对当前的crossPath进行歧义处理
QuickSortSet.Cell headCell = crossPath.getHead(); QuickSortSet.Cell headCell = crossPath.getHead();
LexemePath judgeResult = this.judge(headCell); LexemePath judgeResult = this.judge(headCell);
// 输出歧义处理结果judgeResult //输出歧义处理结果judgeResult
context.addLexemePath(judgeResult); context.addLexemePath(judgeResult);
} }
// 把orgLexeme加入新的crossPath中 //把orgLexeme加入新的crossPath中
crossPath = new LexemePath(); crossPath = new LexemePath();
crossPath.addCrossLexeme(orgLexeme); crossPath.addCrossLexeme(orgLexeme);
} }
@ -71,16 +73,16 @@ class IKArbitrator {
} }
// 处理最后的path //处理最后的path
if (crossPath.size() == 1 || !useSmart) { if (crossPath.size() == 1 || !useSmart) {
// crossPath没有歧义 或者 不做歧义处理 //crossPath没有歧义 或者 不做歧义处理
// 直接输出当前crossPath //直接输出当前crossPath
context.addLexemePath(crossPath); context.addLexemePath(crossPath);
} else { } else {
// 对当前的crossPath进行歧义处理 //对当前的crossPath进行歧义处理
QuickSortSet.Cell headCell = crossPath.getHead(); QuickSortSet.Cell headCell = crossPath.getHead();
LexemePath judgeResult = this.judge(headCell); LexemePath judgeResult = this.judge(headCell);
// 输出歧义处理结果judgeResult //输出歧义处理结果judgeResult
context.addLexemePath(judgeResult); context.addLexemePath(judgeResult);
} }
} }
@ -91,29 +93,29 @@ class IKArbitrator {
* @param lexemeCell 歧义路径链表头 * @param lexemeCell 歧义路径链表头
*/ */
private LexemePath judge(QuickSortSet.Cell lexemeCell) { private LexemePath judge(QuickSortSet.Cell lexemeCell) {
// 候选路径集合 //候选路径集合
TreeSet<LexemePath> pathOptions = new TreeSet<>(); TreeSet<LexemePath> pathOptions = new TreeSet<>();
// 候选结果路径 //候选结果路径
LexemePath option = new LexemePath(); LexemePath option = new LexemePath();
// 对crossPath进行一次遍历,同时返回本次遍历中有冲突的Lexeme栈 //对crossPath进行一次遍历,同时返回本次遍历中有冲突的Lexeme栈
Stack<QuickSortSet.Cell> lexemeStack = this.forwardPath(lexemeCell, option); Stack<QuickSortSet.Cell> lexemeStack = this.forwardPath(lexemeCell, option);
// 当前词元链并非最理想的加入候选路径集合 //当前词元链并非最理想的加入候选路径集合
pathOptions.add(option.copy()); pathOptions.add(option.copy());
// 存在歧义词处理 //存在歧义词处理
QuickSortSet.Cell c; QuickSortSet.Cell c;
while (!lexemeStack.isEmpty()) { while (!lexemeStack.isEmpty()) {
c = lexemeStack.pop(); c = lexemeStack.pop();
// 回滚词元链 //回滚词元链
this.backPath(c.getLexeme(), option); this.backPath(c.getLexeme(), option);
// 从歧义词位置开始递归生成可选方案 //从歧义词位置开始递归生成可选方案
this.forwardPath(c, option); this.forwardPath(c, option);
pathOptions.add(option.copy()); pathOptions.add(option.copy());
} }
// 返回集合中的最优方案 //返回集合中的最优方案
return pathOptions.first(); return pathOptions.first();
} }
@ -122,13 +124,13 @@ class IKArbitrator {
* 向前遍历添加词元构造一个无歧义词元组合 * 向前遍历添加词元构造一个无歧义词元组合
*/ */
private Stack<QuickSortSet.Cell> forwardPath(QuickSortSet.Cell lexemeCell, LexemePath option) { private Stack<QuickSortSet.Cell> forwardPath(QuickSortSet.Cell lexemeCell, LexemePath option) {
// 发生冲突的Lexeme栈 //发生冲突的Lexeme栈
Stack<QuickSortSet.Cell> conflictStack = new Stack<>(); Stack<QuickSortSet.Cell> conflictStack = new Stack<>();
QuickSortSet.Cell c = lexemeCell; QuickSortSet.Cell c = lexemeCell;
// 迭代遍历Lexeme链表 //迭代遍历Lexeme链表
while (c != null && c.getLexeme() != null) { while (c != null && c.getLexeme() != null) {
if (!option.addNotCrossLexeme(c.getLexeme())) { if (!option.addNotCrossLexeme(c.getLexeme())) {
// 词元交叉添加失败则加入lexemeStack栈 //词元交叉添加失败则加入lexemeStack栈
conflictStack.push(c); conflictStack.push(c);
} }
c = c.getNext(); c = c.getNext();

View File

@ -1,6 +1,6 @@
/* /*
* IK 中文分词 版本 8.5.0 * IK 中文分词 版本 8.4.0
* IK Analyzer release 8.5.0 * IK Analyzer release 8.4.0
* *
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -21,7 +21,7 @@
* 版权声明 2012乌龙茶工作室 * 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio * provided by Linliangyi and copyright 2012 by Oolong studio
* *
* 8.5.0版本 Magese (magese@live.cn) 更新 * 8.4.0版本 Magese (magese@live.cn) 更新
* release 8.3.1 update by Magese(magese@live.cn) * release 8.3.1 update by Magese(magese@live.cn)
* *
*/ */
@ -41,25 +41,15 @@ import java.util.List;
*/ */
public final class IKSegmenter { public final class IKSegmenter {
/** //字符窜reader
* 字符窜reader
*/
private Reader input; private Reader input;
/** //分词器配置项
* 分词器配置项 private Configuration cfg;
*/ //分词器上下文
private final Configuration cfg;
/**
* 分词器上下文
*/
private AnalyzeContext context; private AnalyzeContext context;
/** //分词处理器列表
* 分词处理器列表
*/
private List<ISegmenter> segmenters; private List<ISegmenter> segmenters;
/** //分词歧义裁决器
* 分词歧义裁决器
*/
private IKArbitrator arbitrator; private IKArbitrator arbitrator;
@ -95,13 +85,13 @@ public final class IKSegmenter {
* 初始化 * 初始化
*/ */
private void init() { private void init() {
// 初始化词典单例 //初始化词典单例
Dictionary.initial(this.cfg); Dictionary.initial(this.cfg);
// 初始化分词上下文 //初始化分词上下文
this.context = new AnalyzeContext(this.cfg); this.context = new AnalyzeContext(this.cfg);
// 加载子分词器 //加载子分词器
this.segmenters = this.loadSegmenters(); this.segmenters = this.loadSegmenters();
// 加载歧义裁决器 //加载歧义裁决器
this.arbitrator = new IKArbitrator(); this.arbitrator = new IKArbitrator();
} }
@ -112,11 +102,11 @@ public final class IKSegmenter {
*/ */
private List<ISegmenter> loadSegmenters() { private List<ISegmenter> loadSegmenters() {
List<ISegmenter> segmenters = new ArrayList<>(4); List<ISegmenter> segmenters = new ArrayList<>(4);
// 处理字母的子分词器 //处理字母的子分词器
segmenters.add(new LetterSegmenter()); segmenters.add(new LetterSegmenter());
// 处理中文数量词的子分词器 //处理中文数量词的子分词器
segmenters.add(new CN_QuantifierSegmenter()); segmenters.add(new CN_QuantifierSegmenter());
// 处理中文词的子分词器 //处理中文词的子分词器
segmenters.add(new CJKSegmenter()); segmenters.add(new CJKSegmenter());
return segmenters; return segmenters;
} }
@ -136,34 +126,34 @@ public final class IKSegmenter {
*/ */
int available = context.fillBuffer(this.input); int available = context.fillBuffer(this.input);
if (available <= 0) { if (available <= 0) {
// reader已经读完 //reader已经读完
context.reset(); context.reset();
return null; return null;
} else { } else {
// 初始化指针 //初始化指针
context.initCursor(); context.initCursor();
do { do {
// 遍历子分词器 //遍历子分词器
for (ISegmenter segmenter : segmenters) { for (ISegmenter segmenter : segmenters) {
segmenter.analyze(context); segmenter.analyze(context);
} }
// 字符缓冲区接近读完需要读入新的字符 //字符缓冲区接近读完需要读入新的字符
if (context.needRefillBuffer()) { if (context.needRefillBuffer()) {
break; break;
} }
// 向前移动指针 //向前移动指针
} while (context.moveCursor()); } while (context.moveCursor());
// 重置子分词器为下轮循环进行初始化 //重置子分词器为下轮循环进行初始化
for (ISegmenter segmenter : segmenters) { for (ISegmenter segmenter : segmenters) {
segmenter.reset(); segmenter.reset();
} }
} }
// 对分词进行歧义处理 //对分词进行歧义处理
this.arbitrator.process(context, this.cfg.useSmart()); this.arbitrator.process(context, this.cfg.useSmart());
// 将分词结果输出到结果集并处理未切分的单个CJK字符 //将分词结果输出到结果集并处理未切分的单个CJK字符
context.outputToResult(); context.outputToResult();
// 记录本次分词的缓冲区位移 //记录本次分词的缓冲区位移
context.markBufferOffset(); context.markBufferOffset();
} }
return l; return l;

View File

@ -1,6 +1,6 @@
/* /*
* IK 中文分词 版本 8.5.0 * IK 中文分词 版本 8.4.0
* IK Analyzer release 8.5.0 * IK Analyzer release 8.4.0
* *
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -21,29 +21,29 @@
* 版权声明 2012乌龙茶工作室 * 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio * provided by Linliangyi and copyright 2012 by Oolong studio
* *
* 8.5.0版本 Magese (magese@live.cn) 更新 * 8.4.0版本 Magese (magese@live.cn) 更新
* release 8.5.0 update by Magese(magese@live.cn) * release 8.4.0 update by Magese(magese@live.cn)
* *
*/ */
package org.wltea.analyzer.core; package org.wltea.analyzer.core;
/** /**
*
* 子分词器接口 * 子分词器接口
*/ */
interface ISegmenter { interface ISegmenter {
/** /**
* 从分析器读取下一个可能分解的词元对象 * 从分析器读取下一个可能分解的词元对象
* * @param context 分词算法上下文
* @param context 分词算法上下文 */
*/ void analyze(AnalyzeContext context);
void analyze(AnalyzeContext context);
/** /**
* 重置子分析器状态 * 重置子分析器状态
*/ */
void reset(); void reset();
} }

View File

@ -1,6 +1,6 @@
/* /*
* IK 中文分词 版本 8.5.0 * IK 中文分词 版本 8.4.0
* IK Analyzer release 8.5.0 * IK Analyzer release 8.4.0
* *
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室 * 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio * provided by Linliangyi and copyright 2012 by Oolong studio
* *
* 8.5.0版本 Magese (magese@live.cn) 更新 * 8.4.0版本 Magese (magese@live.cn) 更新
* release 8.5.0 update by Magese(magese@live.cn) * release 8.4.0 update by Magese(magese@live.cn)
* *
*/ */
package org.wltea.analyzer.core; package org.wltea.analyzer.core;
@ -34,18 +34,14 @@ import java.util.Arrays;
*/ */
class LetterSegmenter implements ISegmenter { class LetterSegmenter implements ISegmenter {
/** //子分词器标签
* 子分词器标签
*/
private static final String SEGMENTER_NAME = "LETTER_SEGMENTER"; private static final String SEGMENTER_NAME = "LETTER_SEGMENTER";
/** //链接符号
* 链接符号
*/
private static final char[] Letter_Connector = new char[]{'#', '&', '+', '-', '.', '@', '_'}; private static final char[] Letter_Connector = new char[]{'#', '&', '+', '-', '.', '@', '_'};
/**
* 数字符号 //数字符号
*/
private static final char[] Num_Connector = new char[]{',', '.'}; private static final char[] Num_Connector = new char[]{',', '.'};
/* /*
* 词元的开始位置 * 词元的开始位置
* 同时作为子分词器状态标识 * 同时作为子分词器状态标识
@ -57,18 +53,22 @@ class LetterSegmenter implements ISegmenter {
* end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置 * end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置
*/ */
private int end; private int end;
/* /*
* 字母起始位置 * 字母起始位置
*/ */
private int englishStart; private int englishStart;
/* /*
* 字母结束位置 * 字母结束位置
*/ */
private int englishEnd; private int englishEnd;
/* /*
* 阿拉伯数字起始位置 * 阿拉伯数字起始位置
*/ */
private int arabicStart; private int arabicStart;
/* /*
* 阿拉伯数字结束位置 * 阿拉伯数字结束位置
*/ */
@ -91,18 +91,18 @@ class LetterSegmenter implements ISegmenter {
*/ */
public void analyze(AnalyzeContext context) { public void analyze(AnalyzeContext context) {
boolean bufferLockFlag; boolean bufferLockFlag;
// 处理英文字母 //处理英文字母
bufferLockFlag = this.processEnglishLetter(context); bufferLockFlag = this.processEnglishLetter(context);
// 处理阿拉伯字母 //处理阿拉伯字母
bufferLockFlag = this.processArabicLetter(context) || bufferLockFlag; bufferLockFlag = this.processArabicLetter(context) || bufferLockFlag;
// 处理混合字母(这个要放最后处理可以通过QuickSortSet排除重复) //处理混合字母(这个要放最后处理可以通过QuickSortSet排除重复)
bufferLockFlag = this.processMixLetter(context) || bufferLockFlag; bufferLockFlag = this.processMixLetter(context) || bufferLockFlag;
// 判断是否锁定缓冲区 //判断是否锁定缓冲区
if (bufferLockFlag) { if (bufferLockFlag) {
context.lockBuffer(SEGMENTER_NAME); context.lockBuffer(SEGMENTER_NAME);
} else { } else {
// 对缓冲区解锁 //对缓冲区解锁
context.unlockBuffer(SEGMENTER_NAME); context.unlockBuffer(SEGMENTER_NAME);
} }
} }
@ -128,26 +128,26 @@ class LetterSegmenter implements ISegmenter {
private boolean processMixLetter(AnalyzeContext context) { private boolean processMixLetter(AnalyzeContext context) {
boolean needLock; boolean needLock;
if (this.start == -1) {// 当前的分词器尚未开始处理字符 if (this.start == -1) {//当前的分词器尚未开始处理字符
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType() if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
|| CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) { || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
// 记录起始指针的位置,标明分词器进入处理状态 //记录起始指针的位置,标明分词器进入处理状态
this.start = context.getCursor(); this.start = context.getCursor();
this.end = start; this.end = start;
} }
} else {// 当前的分词器正在处理字符 } else {//当前的分词器正在处理字符
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType() if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
|| CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) { || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
// 记录下可能的结束位置 //记录下可能的结束位置
this.end = context.getCursor(); this.end = context.getCursor();
} else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType() } else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
&& this.isLetterConnector(context.getCurrentChar())) { && this.isLetterConnector(context.getCurrentChar())) {
// 记录下可能的结束位置 //记录下可能的结束位置
this.end = context.getCursor(); this.end = context.getCursor();
} else { } else {
// 遇到非Letter字符输出词元 //遇到非Letter字符输出词元
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start + 1, Lexeme.TYPE_LETTER); Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start + 1, Lexeme.TYPE_LETTER);
context.addLexeme(newLexeme); context.addLexeme(newLexeme);
this.start = -1; this.start = -1;
@ -155,10 +155,10 @@ class LetterSegmenter implements ISegmenter {
} }
} }
// 判断缓冲区是否已经读完 //判断缓冲区是否已经读完
if (context.isBufferConsumed()) { if (context.isBufferConsumed()) {
if (this.start != -1 && this.end != -1) { if (this.start != -1 && this.end != -1) {
// 缓冲以读完输出词元 //缓冲以读完输出词元
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start + 1, Lexeme.TYPE_LETTER); Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start + 1, Lexeme.TYPE_LETTER);
context.addLexeme(newLexeme); context.addLexeme(newLexeme);
this.start = -1; this.start = -1;
@ -166,7 +166,7 @@ class LetterSegmenter implements ISegmenter {
} }
} }
// 判断是否锁定缓冲区 //判断是否锁定缓冲区
needLock = this.start != -1 || this.end != -1; needLock = this.start != -1 || this.end != -1;
return needLock; return needLock;
} }
@ -179,18 +179,18 @@ class LetterSegmenter implements ISegmenter {
private boolean processEnglishLetter(AnalyzeContext context) { private boolean processEnglishLetter(AnalyzeContext context) {
boolean needLock; boolean needLock;
if (this.englishStart == -1) {// 当前的分词器尚未开始处理英文字符 if (this.englishStart == -1) {//当前的分词器尚未开始处理英文字符
if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) { if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
// 记录起始指针的位置,标明分词器进入处理状态 //记录起始指针的位置,标明分词器进入处理状态
this.englishStart = context.getCursor(); this.englishStart = context.getCursor();
this.englishEnd = this.englishStart; this.englishEnd = this.englishStart;
} }
} else {// 当前的分词器正在处理英文字符 } else {//当前的分词器正在处理英文字符
if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) { if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
// 记录当前指针位置为结束位置 //记录当前指针位置为结束位置
this.englishEnd = context.getCursor(); this.englishEnd = context.getCursor();
} else { } else {
// 遇到非English字符,输出词元 //遇到非English字符,输出词元
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd - this.englishStart + 1, Lexeme.TYPE_ENGLISH); Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd - this.englishStart + 1, Lexeme.TYPE_ENGLISH);
context.addLexeme(newLexeme); context.addLexeme(newLexeme);
this.englishStart = -1; this.englishStart = -1;
@ -198,10 +198,10 @@ class LetterSegmenter implements ISegmenter {
} }
} }
// 判断缓冲区是否已经读完 //判断缓冲区是否已经读完
if (context.isBufferConsumed()) { if (context.isBufferConsumed()) {
if (this.englishStart != -1 && this.englishEnd != -1) { if (this.englishStart != -1 && this.englishEnd != -1) {
// 缓冲以读完输出词元 //缓冲以读完输出词元
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd - this.englishStart + 1, Lexeme.TYPE_ENGLISH); Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd - this.englishStart + 1, Lexeme.TYPE_ENGLISH);
context.addLexeme(newLexeme); context.addLexeme(newLexeme);
this.englishStart = -1; this.englishStart = -1;
@ -209,7 +209,7 @@ class LetterSegmenter implements ISegmenter {
} }
} }
// 判断是否锁定缓冲区 //判断是否锁定缓冲区
needLock = this.englishStart != -1 || this.englishEnd != -1; needLock = this.englishStart != -1 || this.englishEnd != -1;
return needLock; return needLock;
} }
@ -222,21 +222,21 @@ class LetterSegmenter implements ISegmenter {
private boolean processArabicLetter(AnalyzeContext context) { private boolean processArabicLetter(AnalyzeContext context) {
boolean needLock; boolean needLock;
if (this.arabicStart == -1) {// 当前的分词器尚未开始处理数字字符 if (this.arabicStart == -1) {//当前的分词器尚未开始处理数字字符
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) { if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
// 记录起始指针的位置,标明分词器进入处理状态 //记录起始指针的位置,标明分词器进入处理状态
this.arabicStart = context.getCursor(); this.arabicStart = context.getCursor();
this.arabicEnd = this.arabicStart; this.arabicEnd = this.arabicStart;
} }
} else {// 当前的分词器正在处理数字字符 } else {//当前的分词器正在处理数字字符
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) { if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
// 记录当前指针位置为结束位置 //记录当前指针位置为结束位置
this.arabicEnd = context.getCursor(); this.arabicEnd = context.getCursor();
}/* else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType() }/* else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
&& this.isNumConnector(context.getCurrentChar())) { && this.isNumConnector(context.getCurrentChar())) {
// 不输出数字但不标记结束 //不输出数字但不标记结束
}*/ else { }*/ else {
// //遇到非Arabic字符,输出词元 ////遇到非Arabic字符,输出词元
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd - this.arabicStart + 1, Lexeme.TYPE_ARABIC); Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd - this.arabicStart + 1, Lexeme.TYPE_ARABIC);
context.addLexeme(newLexeme); context.addLexeme(newLexeme);
this.arabicStart = -1; this.arabicStart = -1;
@ -244,10 +244,10 @@ class LetterSegmenter implements ISegmenter {
} }
} }
// 判断缓冲区是否已经读完 //判断缓冲区是否已经读完
if (context.isBufferConsumed()) { if (context.isBufferConsumed()) {
if (this.arabicStart != -1 && this.arabicEnd != -1) { if (this.arabicStart != -1 && this.arabicEnd != -1) {
// 生成已切分的词元 //生成已切分的词元
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd - this.arabicStart + 1, Lexeme.TYPE_ARABIC); Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd - this.arabicStart + 1, Lexeme.TYPE_ARABIC);
context.addLexeme(newLexeme); context.addLexeme(newLexeme);
this.arabicStart = -1; this.arabicStart = -1;
@ -255,7 +255,7 @@ class LetterSegmenter implements ISegmenter {
} }
} }
// 判断是否锁定缓冲区 //判断是否锁定缓冲区
needLock = this.arabicStart != -1 || this.arabicEnd != -1; needLock = this.arabicStart != -1 || this.arabicEnd != -1;
return needLock; return needLock;
} }

View File

@ -1,6 +1,6 @@
/* /*
* IK 中文分词 版本 8.5.0 * IK 中文分词 版本 8.4.0
* IK Analyzer release 8.5.0 * IK Analyzer release 8.4.0
* *
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室 * 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio * provided by Linliangyi and copyright 2012 by Oolong studio
* *
* 8.5.0版本 Magese (magese@live.cn) 更新 * 8.4.0版本 Magese (magese@live.cn) 更新
* release 8.5.0 update by Magese(magese@live.cn) * release 8.4.0 update by Magese(magese@live.cn)
* *
*/ */
package org.wltea.analyzer.core; package org.wltea.analyzer.core;
@ -31,278 +31,242 @@ package org.wltea.analyzer.core;
* IK词元对象 * IK词元对象
*/ */
@SuppressWarnings("unused") @SuppressWarnings("unused")
public class Lexeme implements Comparable<Lexeme> { public class Lexeme implements Comparable<Lexeme>{
/** //英文
* 英文 static final int TYPE_ENGLISH = 1;
*/ //数字
static final int TYPE_ENGLISH = 1; static final int TYPE_ARABIC = 2;
/** //英文数字混合
* 数字 static final int TYPE_LETTER = 3;
*/ //中文词元
static final int TYPE_ARABIC = 2; static final int TYPE_CNWORD = 4;
/** //中文单字
* 英文数字混合 static final int TYPE_CNCHAR = 64;
*/ //日韩文字
static final int TYPE_LETTER = 3; static final int TYPE_OTHER_CJK = 8;
/** //中文数词
* 中文词元 static final int TYPE_CNUM = 16;
*/ //中文量词
static final int TYPE_CNWORD = 4; static final int TYPE_COUNT = 32;
/** //中文数量词
* 中文单字 static final int TYPE_CQUAN = 48;
*/
static final int TYPE_CNCHAR = 64; //词元的起始位移
/** private int offset;
* 日韩文字 //词元的相对起始位置
*/
static final int TYPE_OTHER_CJK = 8;
/**
* 中文数词
*/
static final int TYPE_CNUM = 16;
/**
* 中文量词
*/
static final int TYPE_COUNT = 32;
/**
* 中文数量词
*/
static final int TYPE_CQUAN = 48;
/**
* 词元的起始位移
*/
private int offset;
/**
* 词元的相对起始位置
*/
private int begin; private int begin;
/** //词元的长度
* 词元的长度
*/
private int length; private int length;
/** //词元文本
* 词元文本
*/
private String lexemeText; private String lexemeText;
/** //词元类型
* 词元类型
*/
private int lexemeType; private int lexemeType;
public Lexeme(int offset, int begin, int length, int lexemeType) { public Lexeme(int offset , int begin , int length , int lexemeType){
this.offset = offset; this.offset = offset;
this.begin = begin; this.begin = begin;
if (length < 0) { if(length < 0){
throw new IllegalArgumentException("length < 0"); throw new IllegalArgumentException("length < 0");
} }
this.length = length; this.length = length;
this.lexemeType = lexemeType; this.lexemeType = lexemeType;
} }
/* /*
* 判断词元相等算法 * 判断词元相等算法
* 起始位置偏移起始位置终止位置相同 * 起始位置偏移起始位置终止位置相同
* @see java.lang.Object#equals(Object o) * @see java.lang.Object#equals(Object o)
*/ */
public boolean equals(Object o) { public boolean equals(Object o){
if (o == null) { if(o == null){
return false; return false;
} }
if (this == o) { if(this == o){
return true; return true;
} }
if (o instanceof Lexeme) { if(o instanceof Lexeme){
Lexeme other = (Lexeme) o; Lexeme other = (Lexeme)o;
return this.offset == other.getOffset() return this.offset == other.getOffset()
&& this.begin == other.getBegin() && this.begin == other.getBegin()
&& this.length == other.getLength(); && this.length == other.getLength();
} else { }else{
return false; return false;
} }
} }
/* /*
* 词元哈希编码算法 * 词元哈希编码算法
* @see java.lang.Object#hashCode() * @see java.lang.Object#hashCode()
*/ */
public int hashCode() { public int hashCode(){
int absBegin = getBeginPosition(); int absBegin = getBeginPosition();
int absEnd = getEndPosition(); int absEnd = getEndPosition();
return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11; return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11;
} }
/* /*
* 词元在排序集合中的比较算法 * 词元在排序集合中的比较算法
* @see java.lang.Comparable#compareTo(java.lang.Object) * @see java.lang.Comparable#compareTo(java.lang.Object)
*/ */
public int compareTo(Lexeme other) { public int compareTo(Lexeme other) {
// 起始位置优先 //起始位置优先
if (this.begin < other.getBegin()) { if(this.begin < other.getBegin()){
return -1; return -1;
} else if (this.begin == other.getBegin()) { }else if(this.begin == other.getBegin()){
// 词元长度优先 //词元长度优先
// this.length < other.getLength() //this.length < other.getLength()
return Integer.compare(other.getLength(), this.length); return Integer.compare(other.getLength(), this.length);
} else { }else{//this.begin > other.getBegin()
return 1; return 1;
} }
} }
private int getOffset() { private int getOffset() {
return offset; return offset;
} }
public void setOffset(int offset) { public void setOffset(int offset) {
this.offset = offset; this.offset = offset;
} }
int getBegin() { int getBegin() {
return begin; return begin;
} }
/**
* 获取词元在文本中的起始位置
* @return int
*/
public int getBeginPosition(){
return offset + begin;
}
/** public void setBegin(int begin) {
* 获取词元在文本中的起始位置 this.begin = begin;
* }
* @return int
*/
public int getBeginPosition() {
return offset + begin;
}
public void setBegin(int begin) { /**
this.begin = begin; * 获取词元在文本中的结束位置
} * @return int
*/
public int getEndPosition(){
return offset + begin + length;
}
/** /**
* 获取词元在文本中的结束位置 * 获取词元的字符长度
* * @return int
* @return int */
*/ public int getLength(){
public int getEndPosition() { return this.length;
return offset + begin + length; }
}
/** public void setLength(int length) {
* 获取词元的字符长度 if(this.length < 0){
* throw new IllegalArgumentException("length < 0");
* @return int }
*/ this.length = length;
public int getLength() { }
return this.length;
}
public void setLength(int length) { /**
if (this.length < 0) { * 获取词元的文本内容
throw new IllegalArgumentException("length < 0"); * @return String
} */
this.length = length; public String getLexemeText() {
} if(lexemeText == null){
return "";
}
return lexemeText;
}
/** void setLexemeText(String lexemeText) {
* 获取词元的文本内容 if(lexemeText == null){
* this.lexemeText = "";
* @return String this.length = 0;
*/ }else{
public String getLexemeText() { this.lexemeText = lexemeText;
if (lexemeText == null) { this.length = lexemeText.length();
return ""; }
} }
return lexemeText;
}
void setLexemeText(String lexemeText) { /**
if (lexemeText == null) { * 获取词元类型
this.lexemeText = ""; * @return int
this.length = 0; */
} else { int getLexemeType() {
this.lexemeText = lexemeText; return lexemeType;
this.length = lexemeText.length(); }
}
}
/** /**
* 获取词元类型 * 获取词元类型标示字符串
* * @return String
* @return int */
*/ public String getLexemeTypeString(){
int getLexemeType() { switch(lexemeType) {
return lexemeType;
}
/** case TYPE_ENGLISH :
* 获取词元类型标示字符串 return "ENGLISH";
*
* @return String
*/
public String getLexemeTypeString() {
switch (lexemeType) {
case TYPE_ENGLISH: case TYPE_ARABIC :
return "ENGLISH"; return "ARABIC";
case TYPE_ARABIC: case TYPE_LETTER :
return "ARABIC"; return "LETTER";
case TYPE_LETTER: case TYPE_CNWORD :
return "LETTER"; return "CN_WORD";
case TYPE_CNWORD: case TYPE_CNCHAR :
return "CN_WORD"; return "CN_CHAR";
case TYPE_CNCHAR: case TYPE_OTHER_CJK :
return "CN_CHAR"; return "OTHER_CJK";
case TYPE_OTHER_CJK: case TYPE_COUNT :
return "OTHER_CJK"; return "COUNT";
case TYPE_COUNT: case TYPE_CNUM :
return "COUNT"; return "TYPE_CNUM";
case TYPE_CNUM: case TYPE_CQUAN:
return "TYPE_CNUM"; return "TYPE_CQUAN";
case TYPE_CQUAN: default :
return "TYPE_CQUAN"; return "UNKONW";
}
default: }
return "UNKNOWN";
}
}
public void setLexemeType(int lexemeType) { public void setLexemeType(int lexemeType) {
this.lexemeType = lexemeType; this.lexemeType = lexemeType;
} }
/** /**
* 合并两个相邻的词元 * 合并两个相邻的词元
* * @return boolean 词元是否成功合并
* @return boolean 词元是否成功合并 */
*/ boolean append(Lexeme l, int lexemeType){
boolean append(Lexeme l, int lexemeType) { if(l != null && this.getEndPosition() == l.getBeginPosition()){
if (l != null && this.getEndPosition() == l.getBeginPosition()) { this.length += l.getLength();
this.length += l.getLength(); this.lexemeType = lexemeType;
this.lexemeType = lexemeType; return true;
return true; }else {
} else { return false;
return false; }
} }
}
/**
* ToString 方法 /**
* *
* @return 字符串输出 */
*/ public String toString(){
public String toString() { return this.getBeginPosition() + "-" + this.getEndPosition() +
return this.getBeginPosition() + "-" + this.getEndPosition() + " : " + this.lexemeText + " : \t" +
" : " + this.lexemeText + " : \t" + this.getLexemeTypeString();
this.getLexemeTypeString(); }
}
} }

View File

@ -1,6 +1,6 @@
/* /*
* IK 中文分词 版本 8.5.0 * IK 中文分词 版本 8.4.0
* IK Analyzer release 8.5.0 * IK Analyzer release 8.4.0
* *
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室 * 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio * provided by Linliangyi and copyright 2012 by Oolong studio
* *
* 8.5.0版本 Magese (magese@live.cn) 更新 * 8.4.0版本 Magese (magese@live.cn) 更新
* release 8.5.0 update by Magese(magese@live.cn) * release 8.4.0 update by Magese(magese@live.cn)
* *
*/ */
package org.wltea.analyzer.core; package org.wltea.analyzer.core;
@ -34,17 +34,11 @@ package org.wltea.analyzer.core;
@SuppressWarnings("unused") @SuppressWarnings("unused")
class LexemePath extends QuickSortSet implements Comparable<LexemePath> { class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
/** //起始位置
* 起始位置
*/
private int pathBegin; private int pathBegin;
/** //结束
* 结束
*/
private int pathEnd; private int pathEnd;
/** //词元链的有效字符长度
* 词元链的有效字符长度
*/
private int payloadLength; private int payloadLength;
LexemePath() { LexemePath() {
@ -106,6 +100,7 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
/** /**
* 移除尾部的Lexeme * 移除尾部的Lexeme
*
*/ */
void removeTail() { void removeTail() {
Lexeme tail = this.pollLast(); Lexeme tail = this.pollLast();
@ -122,6 +117,7 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
/** /**
* 检测词元位置交叉有歧义的切分 * 检测词元位置交叉有歧义的切分
*
*/ */
boolean checkCross(Lexeme lexeme) { boolean checkCross(Lexeme lexeme) {
return (lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd) return (lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd)
@ -145,6 +141,7 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
/** /**
* 获取LexemePath的路径长度 * 获取LexemePath的路径长度
*
*/ */
private int getPathLength() { private int getPathLength() {
return this.pathEnd - this.pathBegin; return this.pathEnd - this.pathBegin;
@ -153,6 +150,7 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
/** /**
* X权重词元长度积 * X权重词元长度积
*
*/ */
private int getXWeight() { private int getXWeight() {
int product = 1; int product = 1;
@ -193,48 +191,48 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
} }
public int compareTo(LexemePath o) { public int compareTo(LexemePath o) {
// 比较有效文本长度 //比较有效文本长度
if (this.payloadLength > o.payloadLength) { if (this.payloadLength > o.payloadLength) {
return -1; return -1;
} else if (this.payloadLength < o.payloadLength) { } else if (this.payloadLength < o.payloadLength) {
return 1; return 1;
} } else {
//比较词元个数越少越好
if (this.size() < o.size()) {
return -1;
} else if (this.size() > o.size()) {
return 1;
} else {
//路径跨度越大越好
if (this.getPathLength() > o.getPathLength()) {
return -1;
} else if (this.getPathLength() < o.getPathLength()) {
return 1;
} else {
//根据统计学结论逆向切分概率高于正向切分因此位置越靠后的优先
if (this.pathEnd > o.pathEnd) {
return -1;
} else if (pathEnd < o.pathEnd) {
return 1;
} else {
//词长越平均越好
if (this.getXWeight() > o.getXWeight()) {
return -1;
} else if (this.getXWeight() < o.getXWeight()) {
return 1;
} else {
//词元位置权重比较
if (this.getPWeight() > o.getPWeight()) {
return -1;
} else if (this.getPWeight() < o.getPWeight()) {
return 1;
}
// 比较词元个数越少越好 }
if (this.size() < o.size()) { }
return -1; }
} else if (this.size() > o.size()) { }
return 1;
} }
// 路径跨度越大越好
if (this.getPathLength() > o.getPathLength()) {
return -1;
} else if (this.getPathLength() < o.getPathLength()) {
return 1;
}
// 根据统计学结论逆向切分概率高于正向切分因此位置越靠后的优先
if (this.pathEnd > o.pathEnd) {
return -1;
} else if (pathEnd < o.pathEnd) {
return 1;
}
// 词长越平均越好
if (this.getXWeight() > o.getXWeight()) {
return -1;
} else if (this.getXWeight() < o.getXWeight()) {
return 1;
}
// 词元位置权重比较
if (this.getPWeight() > o.getPWeight()) {
return -1;
} else if (this.getPWeight() < o.getPWeight()) {
return 1;
}
return 0; return 0;
} }

View File

@ -1,6 +1,6 @@
/* /*
* IK 中文分词 版本 8.5.0 * IK 中文分词 版本 8.4.0
* IK Analyzer release 8.5.0 * IK Analyzer release 8.4.0
* *
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -28,20 +28,14 @@
package org.wltea.analyzer.core; package org.wltea.analyzer.core;
/** /**
* IK分词器专用的Lexeme快速排序集合 * IK分词器专用的Lexem快速排序集合
*/ */
class QuickSortSet { class QuickSortSet {
/** //链表头
* 链表头
*/
private Cell head; private Cell head;
/** //链表尾
* 链表尾
*/
private Cell tail; private Cell tail;
/** //链表的实际大小
* 链表的实际大小
*/
private int size; private int size;
QuickSortSet() { QuickSortSet() {
@ -59,29 +53,31 @@ class QuickSortSet {
this.size++; this.size++;
} else { } else {
if (this.tail.compareTo(newCell) < 0) { /*if(this.tail.compareTo(newCell) == 0){//词元与尾部词元相同不放入集合
// 词元接入链表尾部
}else */
if (this.tail.compareTo(newCell) < 0) {//词元接入链表尾部
this.tail.next = newCell; this.tail.next = newCell;
newCell.prev = this.tail; newCell.prev = this.tail;
this.tail = newCell; this.tail = newCell;
this.size++; this.size++;
} else if (this.head.compareTo(newCell) > 0) { } else if (this.head.compareTo(newCell) > 0) {//词元接入链表头部
// 词元接入链表头部
this.head.prev = newCell; this.head.prev = newCell;
newCell.next = this.head; newCell.next = this.head;
this.head = newCell; this.head = newCell;
this.size++; this.size++;
} else { } else {
// 从尾部上逆 //从尾部上逆
Cell index = this.tail; Cell index = this.tail;
while (index != null && index.compareTo(newCell) > 0) { while (index != null && index.compareTo(newCell) > 0) {
index = index.prev; index = index.prev;
} }
/*if(index.compareTo(newCell) == 0){//词元与集合中的词元重复不放入集合
// 词元插入链表中的某个位置 }else */
if ((index != null ? index.compareTo(newCell) : 1) < 0) { if ((index != null ? index.compareTo(newCell) : 1) < 0) {//词元插入链表中的某个位置
newCell.prev = index; newCell.prev = index;
newCell.next = index.next; newCell.next = index.next;
index.next.prev = newCell; index.next.prev = newCell;
@ -180,8 +176,8 @@ class QuickSortSet {
} }
/* /*
* IK 中文分词 版本 8.5.0 * IK 中文分词 版本 8.4.0
* IK Analyzer release 8.5.0 * IK Analyzer release 8.4.0
* update by Magese(magese@live.cn) * update by Magese(magese@live.cn)
*/ */
@SuppressWarnings("unused") @SuppressWarnings("unused")

View File

@ -1,6 +1,6 @@
/* /*
* IK 中文分词 版本 8.5.0 * IK 中文分词 版本 8.4.0
* IK Analyzer release 8.5.0 * IK Analyzer release 8.4.0
* *
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室 * 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio * provided by Linliangyi and copyright 2012 by Oolong studio
* *
* 8.5.0版本 Magese (magese@live.cn) 更新 * 8.4.0版本 Magese (magese@live.cn) 更新
* release 8.5.0 update by Magese(magese@live.cn) * release 8.4.0 update by Magese(magese@live.cn)
* *
*/ */
package org.wltea.analyzer.dic; package org.wltea.analyzer.dic;
@ -37,38 +37,24 @@ import java.util.Map;
@SuppressWarnings("unused") @SuppressWarnings("unused")
class DictSegment implements Comparable<DictSegment> { class DictSegment implements Comparable<DictSegment> {
/** //公用字典表存储汉字
* 公用字典表存储汉字
*/
private static final Map<Character, Character> charMap = new HashMap<>(16, 0.95f); private static final Map<Character, Character> charMap = new HashMap<>(16, 0.95f);
/** //数组大小上限
* 数组大小上限
*/
private static final int ARRAY_LENGTH_LIMIT = 3; private static final int ARRAY_LENGTH_LIMIT = 3;
/** //Map存储结构
* Map存储结构 private Map<Character, DictSegment> childrenMap;
*/ //数组方式存储结构
private volatile Map<Character, DictSegment> childrenMap; private DictSegment[] childrenArray;
/**
* 数组方式存储结构
*/
private volatile DictSegment[] childrenArray;
/** //当前节点上存储的字符
* 当前节点上存储的字符 private Character nodeChar;
*/ //当前节点存储的Segment数目
private final Character nodeChar; //storeSize <=ARRAY_LENGTH_LIMIT 使用数组存储 storeSize >ARRAY_LENGTH_LIMIT ,则使用Map存储
/**
* 当前节点存储的Segment数目
* storeSize <=ARRAY_LENGTH_LIMIT 使用数组存储 storeSize >ARRAY_LENGTH_LIMIT ,则使用Map存储
*/
private int storeSize = 0; private int storeSize = 0;
/** //当前DictSegment状态 ,默认 0 , 1表示从根节点到当前节点的路径表示一个词
* 当前DictSegment状态 ,默认 0 , 1表示从根节点到当前节点的路径表示一个词
*/
private int nodeState = 0; private int nodeState = 0;

View File

@ -1,6 +1,6 @@
/* /*
* IK 中文分词 版本 8.5.0 * IK 中文分词 版本 8.4.0
* IK Analyzer release 8.5.0 * IK Analyzer release 8.4.0
* *
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -21,20 +21,20 @@
* 版权声明 2012乌龙茶工作室 * 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio * provided by Linliangyi and copyright 2012 by Oolong studio
* *
* 8.5.0版本 Magese (magese@live.cn) 更新 * 8.4.0版本 Magese (magese@live.cn) 更新
* release 8.5.0 update by Magese(magese@live.cn) * release 8.4.0 update by Magese(magese@live.cn)
* *
*/ */
package org.wltea.analyzer.dic; package org.wltea.analyzer.dic;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.cfg.DefaultConfig;
import java.io.*; import java.io.*;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.Collection; import java.util.Collection;
import java.util.List; import java.util.List;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.cfg.DefaultConfig;
/** /**
* 词典管理类单例模式 * 词典管理类单例模式
*/ */
@ -44,7 +44,7 @@ public class Dictionary {
/* /*
* 词典单子实例 * 词典单子实例
*/ */
private static volatile Dictionary singleton; private static Dictionary singleton;
/* /*
* 主词典对象 * 主词典对象
@ -63,7 +63,7 @@ public class Dictionary {
/** /**
* 配置对象 * 配置对象
*/ */
private final Configuration cfg; private Configuration cfg;
/** /**
* 私有构造方法阻止外部直接实例化本类 * 私有构造方法阻止外部直接实例化本类
@ -326,7 +326,7 @@ public class Dictionary {
// 建立一个量词典实例 // 建立一个量词典实例
_QuantifierDict = new DictSegment((char) 0); _QuantifierDict = new DictSegment((char) 0);
// 读取量词词典文件 // 读取量词词典文件
InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getQuantifierDictionary()); InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getQuantifierDicionary());
if (is == null) { if (is == null) {
throw new RuntimeException("Quantifier Dictionary not found!!!"); throw new RuntimeException("Quantifier Dictionary not found!!!");
} }

View File

@ -1,6 +1,6 @@
/* /*
* IK 中文分词 版本 8.5.0 * IK 中文分词 版本 8.4.0
* IK Analyzer release 8.5.0 * IK Analyzer release 8.4.0
* *
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室 * 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio * provided by Linliangyi and copyright 2012 by Oolong studio
* *
* 8.5.0版本 Magese (magese@live.cn) 更新 * 8.4.0版本 Magese (magese@live.cn) 更新
* release 8.5.0 update by Magese(magese@live.cn) * release 8.4.0 update by Magese(magese@live.cn)
* *
*/ */
package org.wltea.analyzer.dic; package org.wltea.analyzer.dic;
@ -32,33 +32,24 @@ package org.wltea.analyzer.dic;
*/ */
@SuppressWarnings("unused") @SuppressWarnings("unused")
public class Hit { public class Hit {
/** //Hit不匹配
* Hit不匹配
*/
private static final int UNMATCH = 0x00000000; private static final int UNMATCH = 0x00000000;
/** //Hit完全匹配
* Hit完全匹配
*/
private static final int MATCH = 0x00000001; private static final int MATCH = 0x00000001;
/** //Hit前缀匹配
* Hit前缀匹配
*/
private static final int PREFIX = 0x00000010; private static final int PREFIX = 0x00000010;
/** //该HIT当前状态默认未匹配
* 该HIT当前状态默认未匹配
*/
private int hitState = UNMATCH; private int hitState = UNMATCH;
/**
* 记录词典匹配过程中当前匹配到的词典分支节点 //记录词典匹配过程中当前匹配到的词典分支节点
*/
private DictSegment matchedDictSegment; private DictSegment matchedDictSegment;
/** /*
* 词段开始位置 * 词段开始位置
*/ */
private int begin; private int begin;
/** /*
* 词段的结束位置 * 词段的结束位置
*/ */
private int end; private int end;
@ -95,7 +86,9 @@ public class Hit {
public boolean isUnmatch() { public boolean isUnmatch() {
return this.hitState == UNMATCH ; return this.hitState == UNMATCH ;
} }
/**
*
*/
void setUnmatch() { void setUnmatch() {
this.hitState = UNMATCH; this.hitState = UNMATCH;
} }

View File

@ -1,6 +1,6 @@
/* /*
* IK 中文分词 版本 8.5.0 * IK 中文分词 版本 8.4.0
* IK Analyzer release 8.5.0 * IK Analyzer release 8.4.0
* *
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室 * 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio * provided by Linliangyi and copyright 2012 by Oolong studio
* *
* 8.5.0版本 Magese (magese@live.cn) 更新 * 8.4.0版本 Magese (magese@live.cn) 更新
* release 8.5.0 update by Magese(magese@live.cn) * release 8.4.0 update by Magese(magese@live.cn)
* *
*/ */
package org.wltea.analyzer.lucene; package org.wltea.analyzer.lucene;
@ -36,7 +36,7 @@ import org.apache.lucene.analysis.Tokenizer;
@SuppressWarnings("unused") @SuppressWarnings("unused")
public final class IKAnalyzer extends Analyzer { public final class IKAnalyzer extends Analyzer {
private final boolean useSmart; private boolean useSmart;
private boolean useSmart() { private boolean useSmart() {
return useSmart; return useSmart;

View File

@ -1,6 +1,6 @@
/* /*
* IK 中文分词 版本 8.5.0 * IK 中文分词 版本 8.4.0
* IK Analyzer release 8.5.0 * IK Analyzer release 8.4.0
* *
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室 * 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio * provided by Linliangyi and copyright 2012 by Oolong studio
* *
* 8.5.0版本 Magese (magese@live.cn) 更新 * 8.4.0版本 Magese (magese@live.cn) 更新
* release 8.5.0 update by Magese(magese@live.cn) * release 8.4.0 update by Magese(magese@live.cn)
* *
*/ */
package org.wltea.analyzer.lucene; package org.wltea.analyzer.lucene;
@ -39,30 +39,21 @@ import java.io.IOException;
/** /**
* IK分词器 Lucene Tokenizer适配器类 * IK分词器 Lucene Tokenizer适配器类
* 兼容Lucene 4.0版本
*/ */
@SuppressWarnings({"unused", "FinalMethodInFinalClass"}) @SuppressWarnings("unused")
public final class IKTokenizer extends Tokenizer { public final class IKTokenizer extends Tokenizer {
/** //IK分词器实现
* IK分词器实现
*/
private IKSegmenter _IKImplement; private IKSegmenter _IKImplement;
/** //词元文本属性
* 词元文本属性
*/
private CharTermAttribute termAtt; private CharTermAttribute termAtt;
/** //词元位移属性
* 词元位移属性
*/
private OffsetAttribute offsetAtt; private OffsetAttribute offsetAtt;
/** //词元分类属性该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量
* 词元分类属性该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量
*/
private TypeAttribute typeAtt; private TypeAttribute typeAtt;
/** //记录最后一个词元的结束位置
* 记录最后一个词元的结束位置
*/
private int endPosition; private int endPosition;
/** /**
@ -93,31 +84,30 @@ public final class IKTokenizer extends Tokenizer {
_IKImplement = new IKSegmenter(input, useSmart); _IKImplement = new IKSegmenter(input, useSmart);
} }
/* /* (non-Javadoc)
* (non-Javadoc)
* @see org.apache.lucene.analysis.TokenStream#incrementToken() * @see org.apache.lucene.analysis.TokenStream#incrementToken()
*/ */
@Override @Override
public boolean incrementToken() throws IOException { public boolean incrementToken() throws IOException {
// 清除所有的词元属性 //清除所有的词元属性
clearAttributes(); clearAttributes();
Lexeme nextLexeme = _IKImplement.next(); Lexeme nextLexeme = _IKImplement.next();
if (nextLexeme != null) { if (nextLexeme != null) {
// 将Lexeme转成Attributes //将Lexeme转成Attributes
// 设置词元文本 //设置词元文本
termAtt.append(nextLexeme.getLexemeText()); termAtt.append(nextLexeme.getLexemeText());
// 设置词元长度 //设置词元长度
termAtt.setLength(nextLexeme.getLength()); termAtt.setLength(nextLexeme.getLength());
// 设置词元位移 //设置词元位移
offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition()); offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
// 记录分词的最后位置 //记录分词的最后位置
endPosition = nextLexeme.getEndPosition(); endPosition = nextLexeme.getEndPosition();
// 记录词元分类 //记录词元分类
typeAtt.setType(nextLexeme.getLexemeTypeString()); typeAtt.setType(nextLexeme.getLexemeTypeString());
// 返会true告知还有下个词元 //返会true告知还有下个词元
return true; return true;
} }
// 返会false告知词元输出完毕 //返会false告知词元输出完毕
return false; return false;
} }

View File

@ -1,6 +1,6 @@
/* /*
* IK 中文分词 版本 8.5.0 * IK 中文分词 版本 8.4.0
* IK Analyzer release 8.5.0 * IK Analyzer release 8.4.0
* *
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室 * 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio * provided by Linliangyi and copyright 2012 by Oolong studio
* *
* 8.5.0版本 Magese (magese@live.cn) 更新 * 8.3.1版本 Magese (magese@live.cn) 更新
* release 8.5.0 update by Magese(magese@live.cn) * release 8.3.1 update by Magese(magese@live.cn)
* *
*/ */
package org.wltea.analyzer.lucene; package org.wltea.analyzer.lucene;
@ -44,8 +44,6 @@ import java.nio.charset.StandardCharsets;
import java.util.*; import java.util.*;
/** /**
* 分词器工厂类
*
* @author <a href="magese@live.cn">Magese</a> * @author <a href="magese@live.cn">Magese</a>
*/ */
public class IKTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware, UpdateThread.UpdateJob { public class IKTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware, UpdateThread.UpdateJob {

View File

@ -1,6 +1,6 @@
/* /*
* IK 中文分词 版本 8.5.0 * IK 中文分词 版本 8.4.0
* IK Analyzer release 8.5.0 * IK Analyzer release 8.4.0
* *
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室 * 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio * provided by Linliangyi and copyright 2012 by Oolong studio
* *
* 8.5.0版本 Magese (magese@live.cn) 更新 * 8.4.0版本 Magese (magese@live.cn) 更新
* release 8.5.0 update by Magese(magese@live.cn) * release 8.4.0 update by Magese(magese@live.cn)
* *
*/ */
package org.wltea.analyzer.lucene; package org.wltea.analyzer.lucene;

View File

@ -1,6 +1,6 @@
/* /*
* IK 中文分词 版本 8.5.0 * IK 中文分词 版本 8.4.0
* IK Analyzer release 8.5.0 * IK Analyzer release 8.4.0
* *
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室 * 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio * provided by Linliangyi and copyright 2012 by Oolong studio
* *
* 8.5.0版本 Magese (magese@live.cn) 更新 * 8.4.0版本 Magese (magese@live.cn) 更新
* release 8.5.0 update by Magese(magese@live.cn) * release 8.4.0 update by Magese(magese@live.cn)
* *
*/ */
package org.wltea.analyzer.query; package org.wltea.analyzer.query;
@ -46,11 +46,11 @@ import java.util.Stack;
public class IKQueryExpressionParser { public class IKQueryExpressionParser {
private final List<Element> elements = new ArrayList<>(); private List<Element> elements = new ArrayList<>();
private final Stack<Query> querys = new Stack<>(); private Stack<Query> querys = new Stack<>();
private final Stack<Element> operates = new Stack<>(); private Stack<Element> operates = new Stack<>();
/** /**
* 解析查询表达式生成Lucene Query对象 * 解析查询表达式生成Lucene Query对象
@ -61,9 +61,9 @@ public class IKQueryExpressionParser {
Query lucenceQuery = null; Query lucenceQuery = null;
if (expression != null && !"".equals(expression.trim())) { if (expression != null && !"".equals(expression.trim())) {
try { try {
// 文法解析 //文法解析
this.splitElements(expression); this.splitElements(expression);
// 语法解析 //语法解析
this.parseSyntax(); this.parseSyntax();
if (this.querys.size() == 1) { if (this.querys.size() == 1) {
lucenceQuery = this.querys.pop(); lucenceQuery = this.querys.pop();
@ -87,263 +87,263 @@ public class IKQueryExpressionParser {
if (expression == null) { if (expression == null) {
return; return;
} }
Element currentElement = null; Element curretElement = null;
char[] expChars = expression.toCharArray(); char[] expChars = expression.toCharArray();
for (char expChar : expChars) { for (char expChar : expChars) {
switch (expChar) { switch (expChar) {
case '&': case '&':
if (currentElement == null) { if (curretElement == null) {
currentElement = new Element(); curretElement = new Element();
currentElement.type = '&'; curretElement.type = '&';
currentElement.append(expChar); curretElement.append(expChar);
} else if (currentElement.type == '&') { } else if (curretElement.type == '&') {
currentElement.append(expChar); curretElement.append(expChar);
this.elements.add(currentElement); this.elements.add(curretElement);
currentElement = null; curretElement = null;
} else if (currentElement.type == '\'') { } else if (curretElement.type == '\'') {
currentElement.append(expChar); curretElement.append(expChar);
} else { } else {
this.elements.add(currentElement); this.elements.add(curretElement);
currentElement = new Element(); curretElement = new Element();
currentElement.type = '&'; curretElement.type = '&';
currentElement.append(expChar); curretElement.append(expChar);
} }
break; break;
case '|': case '|':
if (currentElement == null) { if (curretElement == null) {
currentElement = new Element(); curretElement = new Element();
currentElement.type = '|'; curretElement.type = '|';
currentElement.append(expChar); curretElement.append(expChar);
} else if (currentElement.type == '|') { } else if (curretElement.type == '|') {
currentElement.append(expChar); curretElement.append(expChar);
this.elements.add(currentElement); this.elements.add(curretElement);
currentElement = null; curretElement = null;
} else if (currentElement.type == '\'') { } else if (curretElement.type == '\'') {
currentElement.append(expChar); curretElement.append(expChar);
} else { } else {
this.elements.add(currentElement); this.elements.add(curretElement);
currentElement = new Element(); curretElement = new Element();
currentElement.type = '|'; curretElement.type = '|';
currentElement.append(expChar); curretElement.append(expChar);
} }
break; break;
case '-': case '-':
if (currentElement != null) { if (curretElement != null) {
if (currentElement.type == '\'') { if (curretElement.type == '\'') {
currentElement.append(expChar); curretElement.append(expChar);
continue; continue;
} else { } else {
this.elements.add(currentElement); this.elements.add(curretElement);
} }
} }
currentElement = new Element(); curretElement = new Element();
currentElement.type = '-'; curretElement.type = '-';
currentElement.append(expChar); curretElement.append(expChar);
this.elements.add(currentElement); this.elements.add(curretElement);
currentElement = null; curretElement = null;
break; break;
case '(': case '(':
if (currentElement != null) { if (curretElement != null) {
if (currentElement.type == '\'') { if (curretElement.type == '\'') {
currentElement.append(expChar); curretElement.append(expChar);
continue; continue;
} else { } else {
this.elements.add(currentElement); this.elements.add(curretElement);
} }
} }
currentElement = new Element(); curretElement = new Element();
currentElement.type = '('; curretElement.type = '(';
currentElement.append(expChar); curretElement.append(expChar);
this.elements.add(currentElement); this.elements.add(curretElement);
currentElement = null; curretElement = null;
break; break;
case ')': case ')':
if (currentElement != null) { if (curretElement != null) {
if (currentElement.type == '\'') { if (curretElement.type == '\'') {
currentElement.append(expChar); curretElement.append(expChar);
continue; continue;
} else { } else {
this.elements.add(currentElement); this.elements.add(curretElement);
} }
} }
currentElement = new Element(); curretElement = new Element();
currentElement.type = ')'; curretElement.type = ')';
currentElement.append(expChar); curretElement.append(expChar);
this.elements.add(currentElement); this.elements.add(curretElement);
currentElement = null; curretElement = null;
break; break;
case ':': case ':':
if (currentElement != null) { if (curretElement != null) {
if (currentElement.type == '\'') { if (curretElement.type == '\'') {
currentElement.append(expChar); curretElement.append(expChar);
continue; continue;
} else { } else {
this.elements.add(currentElement); this.elements.add(curretElement);
} }
} }
currentElement = new Element(); curretElement = new Element();
currentElement.type = ':'; curretElement.type = ':';
currentElement.append(expChar); curretElement.append(expChar);
this.elements.add(currentElement); this.elements.add(curretElement);
currentElement = null; curretElement = null;
break; break;
case '=': case '=':
if (currentElement != null) { if (curretElement != null) {
if (currentElement.type == '\'') { if (curretElement.type == '\'') {
currentElement.append(expChar); curretElement.append(expChar);
continue; continue;
} else { } else {
this.elements.add(currentElement); this.elements.add(curretElement);
} }
} }
currentElement = new Element(); curretElement = new Element();
currentElement.type = '='; curretElement.type = '=';
currentElement.append(expChar); curretElement.append(expChar);
this.elements.add(currentElement); this.elements.add(curretElement);
currentElement = null; curretElement = null;
break; break;
case ' ': case ' ':
if (currentElement != null) { if (curretElement != null) {
if (currentElement.type == '\'') { if (curretElement.type == '\'') {
currentElement.append(expChar); curretElement.append(expChar);
} else { } else {
this.elements.add(currentElement); this.elements.add(curretElement);
currentElement = null; curretElement = null;
} }
} }
break; break;
case '\'': case '\'':
if (currentElement == null) { if (curretElement == null) {
currentElement = new Element(); curretElement = new Element();
currentElement.type = '\''; curretElement.type = '\'';
} else if (currentElement.type == '\'') { } else if (curretElement.type == '\'') {
this.elements.add(currentElement); this.elements.add(curretElement);
currentElement = null; curretElement = null;
} else { } else {
this.elements.add(currentElement); this.elements.add(curretElement);
currentElement = new Element(); curretElement = new Element();
currentElement.type = '\''; curretElement.type = '\'';
} }
break; break;
case '[': case '[':
if (currentElement != null) { if (curretElement != null) {
if (currentElement.type == '\'') { if (curretElement.type == '\'') {
currentElement.append(expChar); curretElement.append(expChar);
continue; continue;
} else { } else {
this.elements.add(currentElement); this.elements.add(curretElement);
} }
} }
currentElement = new Element(); curretElement = new Element();
currentElement.type = '['; curretElement.type = '[';
currentElement.append(expChar); curretElement.append(expChar);
this.elements.add(currentElement); this.elements.add(curretElement);
currentElement = null; curretElement = null;
break; break;
case ']': case ']':
if (currentElement != null) { if (curretElement != null) {
if (currentElement.type == '\'') { if (curretElement.type == '\'') {
currentElement.append(expChar); curretElement.append(expChar);
continue; continue;
} else { } else {
this.elements.add(currentElement); this.elements.add(curretElement);
} }
} }
currentElement = new Element(); curretElement = new Element();
currentElement.type = ']'; curretElement.type = ']';
currentElement.append(expChar); curretElement.append(expChar);
this.elements.add(currentElement); this.elements.add(curretElement);
currentElement = null; curretElement = null;
break; break;
case '{': case '{':
if (currentElement != null) { if (curretElement != null) {
if (currentElement.type == '\'') { if (curretElement.type == '\'') {
currentElement.append(expChar); curretElement.append(expChar);
continue; continue;
} else { } else {
this.elements.add(currentElement); this.elements.add(curretElement);
} }
} }
currentElement = new Element(); curretElement = new Element();
currentElement.type = '{'; curretElement.type = '{';
currentElement.append(expChar); curretElement.append(expChar);
this.elements.add(currentElement); this.elements.add(curretElement);
currentElement = null; curretElement = null;
break; break;
case '}': case '}':
if (currentElement != null) { if (curretElement != null) {
if (currentElement.type == '\'') { if (curretElement.type == '\'') {
currentElement.append(expChar); curretElement.append(expChar);
continue; continue;
} else { } else {
this.elements.add(currentElement); this.elements.add(curretElement);
} }
} }
currentElement = new Element(); curretElement = new Element();
currentElement.type = '}'; curretElement.type = '}';
currentElement.append(expChar); curretElement.append(expChar);
this.elements.add(currentElement); this.elements.add(curretElement);
currentElement = null; curretElement = null;
break; break;
case ',': case ',':
if (currentElement != null) { if (curretElement != null) {
if (currentElement.type == '\'') { if (curretElement.type == '\'') {
currentElement.append(expChar); curretElement.append(expChar);
continue; continue;
} else { } else {
this.elements.add(currentElement); this.elements.add(curretElement);
} }
} }
currentElement = new Element(); curretElement = new Element();
currentElement.type = ','; curretElement.type = ',';
currentElement.append(expChar); curretElement.append(expChar);
this.elements.add(currentElement); this.elements.add(curretElement);
currentElement = null; curretElement = null;
break; break;
default: default:
if (currentElement == null) { if (curretElement == null) {
currentElement = new Element(); curretElement = new Element();
currentElement.type = 'F'; curretElement.type = 'F';
currentElement.append(expChar); curretElement.append(expChar);
} else if (currentElement.type == 'F') { } else if (curretElement.type == 'F') {
currentElement.append(expChar); curretElement.append(expChar);
} else if (currentElement.type == '\'') { } else if (curretElement.type == '\'') {
currentElement.append(expChar); curretElement.append(expChar);
} else { } else {
this.elements.add(currentElement); this.elements.add(curretElement);
currentElement = new Element(); curretElement = new Element();
currentElement.type = 'F'; curretElement.type = 'F';
currentElement.append(expChar); curretElement.append(expChar);
} }
} }
} }
if (currentElement != null) { if (curretElement != null) {
this.elements.add(currentElement); this.elements.add(curretElement);
} }
} }
@ -359,7 +359,7 @@ public class IKQueryExpressionParser {
throw new IllegalStateException("表达式异常: = 或 号丢失"); throw new IllegalStateException("表达式异常: = 或 号丢失");
} }
Element e3 = this.elements.get(i + 2); Element e3 = this.elements.get(i + 2);
// 处理 = 运算 //处理 = 运算
if ('\'' == e3.type) { if ('\'' == e3.type) {
i += 2; i += 2;
if ('=' == e2.type) { if ('=' == e2.type) {
@ -367,14 +367,14 @@ public class IKQueryExpressionParser {
this.querys.push(tQuery); this.querys.push(tQuery);
} else { } else {
String keyword = e3.toString(); String keyword = e3.toString();
// SWMCQuery Here //SWMCQuery Here
Query _SWMCQuery = SWMCQueryBuilder.create(e.toString(), keyword); Query _SWMCQuery = SWMCQueryBuilder.create(e.toString(), keyword);
this.querys.push(_SWMCQuery); this.querys.push(_SWMCQuery);
} }
} else if ('[' == e3.type || '{' == e3.type) { } else if ('[' == e3.type || '{' == e3.type) {
i += 2; i += 2;
// 处理 [] {} //处理 [] {}
LinkedList<Element> eQueue = new LinkedList<>(); LinkedList<Element> eQueue = new LinkedList<>();
eQueue.add(e3); eQueue.add(e3);
for (i++; i < this.elements.size(); i++) { for (i++; i < this.elements.size(); i++) {
@ -384,7 +384,7 @@ public class IKQueryExpressionParser {
break; break;
} }
} }
// 翻译RangeQuery //翻译RangeQuery
Query rangeQuery = this.toTermRangeQuery(e, eQueue); Query rangeQuery = this.toTermRangeQuery(e, eQueue);
this.querys.push(rangeQuery); this.querys.push(rangeQuery);
} else { } else {
@ -475,10 +475,10 @@ public class IKQueryExpressionParser {
} }
} else { } else {
// q1 instanceof TermQuery //q1 instanceof TermQuery
// q1 instanceof TermRangeQuery //q1 instanceof TermRangeQuery
// q1 instanceof PhraseQuery //q1 instanceof PhraseQuery
// others //others
resultQuery.add(q1, Occur.MUST); resultQuery.add(q1, Occur.MUST);
} }
} }
@ -496,10 +496,10 @@ public class IKQueryExpressionParser {
} }
} else { } else {
// q1 instanceof TermQuery //q1 instanceof TermQuery
// q1 instanceof TermRangeQuery //q1 instanceof TermRangeQuery
// q1 instanceof PhraseQuery //q1 instanceof PhraseQuery
// others //others
resultQuery.add(q2, Occur.MUST); resultQuery.add(q2, Occur.MUST);
} }
} }
@ -518,10 +518,10 @@ public class IKQueryExpressionParser {
} }
} else { } else {
// q1 instanceof TermQuery //q1 instanceof TermQuery
// q1 instanceof TermRangeQuery //q1 instanceof TermRangeQuery
// q1 instanceof PhraseQuery //q1 instanceof PhraseQuery
// others //others
resultQuery.add(q1, Occur.SHOULD); resultQuery.add(q1, Occur.SHOULD);
} }
} }
@ -538,10 +538,10 @@ public class IKQueryExpressionParser {
resultQuery.add(q2, Occur.SHOULD); resultQuery.add(q2, Occur.SHOULD);
} }
} else { } else {
// q2 instanceof TermQuery //q2 instanceof TermQuery
// q2 instanceof TermRangeQuery //q2 instanceof TermRangeQuery
// q2 instanceof PhraseQuery //q2 instanceof PhraseQuery
// others //others
resultQuery.add(q2, Occur.SHOULD); resultQuery.add(q2, Occur.SHOULD);
} }
@ -563,10 +563,10 @@ public class IKQueryExpressionParser {
} }
} else { } else {
// q1 instanceof TermQuery //q1 instanceof TermQuery
// q1 instanceof TermRangeQuery //q1 instanceof TermRangeQuery
// q1 instanceof PhraseQuery //q1 instanceof PhraseQuery
// others //others
resultQuery.add(q1, Occur.MUST); resultQuery.add(q1, Occur.MUST);
} }
@ -584,7 +584,7 @@ public class IKQueryExpressionParser {
boolean includeLast; boolean includeLast;
String firstValue; String firstValue;
String lastValue = null; String lastValue = null;
// 检查第一个元素是否是[或者{ //检查第一个元素是否是[或者{
Element first = elements.getFirst(); Element first = elements.getFirst();
if ('[' == first.type) { if ('[' == first.type) {
includeFirst = true; includeFirst = true;
@ -593,7 +593,7 @@ public class IKQueryExpressionParser {
} else { } else {
throw new IllegalStateException("表达式异常"); throw new IllegalStateException("表达式异常");
} }
// 检查最后一个元素是否是]或者} //检查最后一个元素是否是]或者}
Element last = elements.getLast(); Element last = elements.getLast();
if (']' == last.type) { if (']' == last.type) {
includeLast = true; includeLast = true;
@ -605,7 +605,7 @@ public class IKQueryExpressionParser {
if (elements.size() < 4 || elements.size() > 5) { if (elements.size() < 4 || elements.size() > 5) {
throw new IllegalStateException("表达式异常, RangeQuery 错误"); throw new IllegalStateException("表达式异常, RangeQuery 错误");
} }
// 读出中间部分 //读出中间部分
Element e2 = elements.get(1); Element e2 = elements.get(1);
if ('\'' == e2.type) { if ('\'' == e2.type) {
firstValue = e2.toString(); firstValue = e2.toString();
@ -673,7 +673,7 @@ public class IKQueryExpressionParser {
* @author linliangyi * @author linliangyi
* May 20, 2010 * May 20, 2010
*/ */
private static class Element { private class Element {
char type = 0; char type = 0;
StringBuffer eleTextBuff; StringBuffer eleTextBuff;
@ -692,9 +692,11 @@ public class IKQueryExpressionParser {
public static void main(String[] args) { public static void main(String[] args) {
IKQueryExpressionParser parser = new IKQueryExpressionParser(); IKQueryExpressionParser parser = new IKQueryExpressionParser();
//String ikQueryExp = "newsTitle:'的两款《魔兽世界》插件Bigfoot和月光宝盒'";
String ikQueryExp = "(id='ABcdRf' && date:{'20010101','20110101'} && keyword:'魔兽中国') || (content:'KSHT-KSH-A001-18' || ulr='www.ik.com') - name:'林良益'"; String ikQueryExp = "(id='ABcdRf' && date:{'20010101','20110101'} && keyword:'魔兽中国') || (content:'KSHT-KSH-A001-18' || ulr='www.ik.com') - name:'林良益'";
Query result = parser.parseExp(ikQueryExp); Query result = parser.parseExp(ikQueryExp);
System.out.println(result); System.out.println(result);
} }
} }

View File

@ -1,6 +1,6 @@
/* /*
* IK 中文分词 版本 8.5.0 * IK 中文分词 版本 8.4.0
* IK Analyzer release 8.5.0 * IK Analyzer release 8.4.0
* *
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室 * 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio * provided by Linliangyi and copyright 2012 by Oolong studio
* *
* 8.5.0版本 Magese (magese@live.cn) 更新 * 8.4.0版本 Magese (magese@live.cn) 更新
* release 8.5.0 update by Magese(magese@live.cn) * release 8.4.0 update by Magese(magese@live.cn)
* *
*/ */
package org.wltea.analyzer.query; package org.wltea.analyzer.query;
@ -45,7 +45,6 @@ import java.util.List;
* *
* @author linliangyi * @author linliangyi
*/ */
@SuppressWarnings("unused")
class SWMCQueryBuilder { class SWMCQueryBuilder {
/** /**
@ -57,9 +56,9 @@ class SWMCQueryBuilder {
if (fieldName == null || keywords == null) { if (fieldName == null || keywords == null) {
throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null."); throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null.");
} }
// 1.对keywords进行分词处理 //1.对keywords进行分词处理
List<Lexeme> lexemes = doAnalyze(keywords); List<Lexeme> lexemes = doAnalyze(keywords);
// 2.根据分词结果生成SWMCQuery //2.根据分词结果生成SWMCQuery
return getSWMCQuery(fieldName, lexemes); return getSWMCQuery(fieldName, lexemes);
} }
@ -85,20 +84,20 @@ class SWMCQueryBuilder {
* 根据分词结果生成SWMC搜索 * 根据分词结果生成SWMC搜索
*/ */
private static Query getSWMCQuery(String fieldName, List<Lexeme> lexemes) { private static Query getSWMCQuery(String fieldName, List<Lexeme> lexemes) {
// 构造SWMC的查询表达式 //构造SWMC的查询表达式
StringBuilder keywordBuffer = new StringBuilder(); StringBuilder keywordBuffer = new StringBuilder();
// 精简的SWMC的查询表达式 //精简的SWMC的查询表达式
StringBuilder keywordBuffer_Short = new StringBuilder(); StringBuilder keywordBuffer_Short = new StringBuilder();
// 记录最后词元长度 //记录最后词元长度
int lastLexemeLength = 0; int lastLexemeLength = 0;
// 记录最后词元结束位置 //记录最后词元结束位置
int lastLexemeEnd = -1; int lastLexemeEnd = -1;
int shortCount = 0; int shortCount = 0;
int totalCount = 0; int totalCount = 0;
for (Lexeme l : lexemes) { for (Lexeme l : lexemes) {
totalCount += l.getLength(); totalCount += l.getLength();
// 精简表达式 //精简表达式
if (l.getLength() > 1) { if (l.getLength() > 1) {
keywordBuffer_Short.append(' ').append(l.getLexemeText()); keywordBuffer_Short.append(' ').append(l.getLexemeText());
shortCount += l.getLength(); shortCount += l.getLength();
@ -107,7 +106,7 @@ class SWMCQueryBuilder {
if (lastLexemeLength == 0) { if (lastLexemeLength == 0) {
keywordBuffer.append(l.getLexemeText()); keywordBuffer.append(l.getLexemeText());
} else if (lastLexemeLength == 1 && l.getLength() == 1 } else if (lastLexemeLength == 1 && l.getLength() == 1
&& lastLexemeEnd == l.getBeginPosition()) {// 单字位置相邻长度为一合并) && lastLexemeEnd == l.getBeginPosition()) {//单字位置相邻长度为一合并)
keywordBuffer.append(l.getLexemeText()); keywordBuffer.append(l.getLexemeText());
} else { } else {
keywordBuffer.append(' ').append(l.getLexemeText()); keywordBuffer.append(' ').append(l.getLexemeText());
@ -117,10 +116,10 @@ class SWMCQueryBuilder {
lastLexemeEnd = l.getEndPosition(); lastLexemeEnd = l.getEndPosition();
} }
// 借助lucene queryparser 生成SWMC Query //借助lucene queryparser 生成SWMC Query
QueryParser qp = new QueryParser(fieldName, new StandardAnalyzer()); QueryParser qp = new QueryParser(fieldName, new StandardAnalyzer());
qp.setAutoGeneratePhraseQueries(false);
qp.setDefaultOperator(QueryParser.AND_OPERATOR); qp.setDefaultOperator(QueryParser.AND_OPERATOR);
qp.setAutoGeneratePhraseQueries(true);
if ((shortCount * 1.0f / totalCount) > 0.5f) { if ((shortCount * 1.0f / totalCount) > 0.5f) {
try { try {

View File

@ -0,0 +1,86 @@
/*
* IK 中文分词 版本 8.4.0
* IK Analyzer release 8.4.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.4.0版本 Magese (magese@live.cn) 更新
* release 8.4.0 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.sample;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;
import java.io.IOException;
import java.io.StringReader;
/**
* 使用IKAnalyzer进行分词的演示
* 2012-10-22
*/
public class IKAnalzyerDemo {

    /**
     * Demonstrates tokenizing a mixed Chinese/English sentence with {@link IKAnalyzer}
     * in smart segmentation mode, printing each token's offsets, text and type.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Build the IK analyzer; 'true' selects smart (coarse-grained) segmentation mode.
        Analyzer analyzer = new IKAnalyzer(true);
        // try-with-resources closes the TokenStream (and its underlying StringReader)
        // even if tokenization fails, replacing the original manual finally block.
        try (TokenStream ts = analyzer.tokenStream("myfield", new StringReader("这是一个中文分词的例子你可以直接运行它IKAnalyer can analysis english text too"))) {
            // Token start/end character offsets.
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            // Token text.
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            // Token type (e.g. CN_WORD, ENGLISH).
            TypeAttribute type = ts.addAttribute(TypeAttribute.class);
            // Lucene contract: reset() before the first incrementToken().
            ts.reset();
            // Iterate over the tokens produced by the analyzer.
            while (ts.incrementToken()) {
                System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type());
            }
            // Perform end-of-stream operations, e.g. set the final offset.
            ts.end();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

View File

@ -0,0 +1,136 @@
/*
* IK 中文分词 版本 8.4.0
* IK Analyzer release 8.4.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.4.0版本 Magese (magese@live.cn) 更新
* release 8.4.0 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.sample;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;
import java.io.IOException;
/**
* 使用IKAnalyzer进行Lucene索引和查询的演示
* 2012-3-2
* <p>
* 以下是结合Lucene4.0 API的写法
*/
public class LuceneIndexAndSearchDemo {

    /**
     * Demo: builds an in-memory Lucene index containing a single document analyzed
     * with {@link IKAnalyzer}, then runs an AND-mode keyword query against it and
     * prints the matching documents.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Lucene document field name.
        String fieldName = "text";
        // Content to be indexed.
        String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";
        // Instantiate the IK analyzer in smart segmentation mode.
        Analyzer analyzer = new IKAnalyzer(true);
        Directory directory = null;
        IndexWriter iwriter;
        IndexReader ireader = null;
        IndexSearcher isearcher;
        try {
            // In-memory index (RAMDirectory is deprecated but kept for this demo).
            //noinspection deprecation
            directory = new RAMDirectory();
            // Configure the IndexWriter with the IK analyzer.
            IndexWriterConfig iwConfig = new IndexWriterConfig(analyzer);
            iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
            iwriter = new IndexWriter(directory, iwConfig);
            // Index one document with an ID field and an analyzed text field.
            Document doc = new Document();
            doc.add(new StringField("ID", "10000", Field.Store.YES));
            doc.add(new TextField(fieldName, text, Field.Store.YES));
            iwriter.addDocument(doc);
            iwriter.close();
            // Search phase **********************************
            ireader = DirectoryReader.open(directory);
            isearcher = new IndexSearcher(ireader);
            String keyword = "中文分词工具包";
            // Build the Query via QueryParser; AND operator requires all terms to match.
            QueryParser qp = new QueryParser(fieldName, analyzer);
            qp.setDefaultOperator(QueryParser.AND_OPERATOR);
            Query query = qp.parse(keyword);
            System.out.println("Query = " + query);
            // Retrieve the 5 highest-scoring hits.
            TopDocs topDocs = isearcher.search(query, 5);
            long totalHits = topDocs.totalHits.value;
            System.out.println("命中:" + totalHits);
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
            // FIX: iterate over the returned ScoreDocs, not up to totalHits —
            // totalHits may exceed the requested top-5 while scoreDocs holds at
            // most 5 entries, which previously risked ArrayIndexOutOfBoundsException.
            for (ScoreDoc scoreDoc : scoreDocs) {
                Document targetDoc = isearcher.doc(scoreDoc.doc);
                System.out.println("内容:" + targetDoc.toString());
            }
        } catch (ParseException | IOException e) {
            e.printStackTrace();
        } finally {
            // Release the reader and directory; each close is guarded so one
            // failure does not prevent the other resource from being released.
            if (ireader != null) {
                try {
                    ireader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (directory != null) {
                try {
                    directory.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}

View File

@ -1,3 +1,3 @@
Wed Aug 01 00:00:00 CST 2021 Wed Aug 01 11:21:30 CST 2018
files=dynamicdic.txt files=dynamicdic.txt
lastupdate=0 lastupdate=0