Compare commits
No commits in common. "master" and "v8.1.1" have entirely different histories.
@ -1,4 +1,3 @@
|
||||
language: java
|
||||
|
||||
jdk:
|
||||
- openjdk8
|
||||
- oraclejdk8
|
@ -25,7 +25,7 @@
|
||||
```
|
||||
|
||||
4. 配置Solr的`managed-schema`,添加`ik分词器`,示例如下;
|
||||
```xml
|
||||
```console
|
||||
<!-- ik分词器 -->
|
||||
<fieldType name="text_ik" class="solr.TextField">
|
||||
<analyzer type="index">
|
||||
|
78
README.md
78
README.md
@ -6,16 +6,15 @@ ik-analyzer for solr 7.x-8.x
|
||||
[GitHub release](https://github.com/magese/ik-analyzer-solr/releases)
|
||||
[License](./LICENSE)
|
||||
[Build Status](https://travis-ci.org/magese/ik-analyzer-solr)
|
||||
[HitCount](http://hits.dwyl.io/magese/ik-analyzer-solr)
|
||||
|
||||
[GitHub forks](https://github.com/magese/ik-analyzer-solr/network/members)
|
||||
[GitHub stars](https://github.com/magese/ik-analyzer-solr/stargazers)
|
||||
<!-- /Badges section end. -->
|
||||
|
||||
## 简介
|
||||
**适配最新版本solr 7&8;**
|
||||
|
||||
**扩展IK原有词库:**
|
||||
|
||||
#### 适配最新版本solr 7&8;
|
||||
#### 扩展IK原有词库:
|
||||
| 分词工具 | 词库中词的数量 | 最后更新时间 |
|
||||
| :------: | :------: | :------: |
|
||||
| ik | 27.5万 | 2012年 |
|
||||
@ -23,27 +22,23 @@ ik-analyzer for solr 7.x-8.x
|
||||
| word | 64.2万 | 2014年 |
|
||||
| jieba | 58.4万 | 2012年 |
|
||||
| jcesg | 16.6万 | 2018年 |
|
||||
| sougou词库 | 115.2万 | 2020年 |
|
||||
|
||||
**将以上词库进行整理后约187.1万条词汇;**
|
||||
|
||||
**添加动态加载词典表功能,在不需要重启solr服务的情况下加载新增的词典。**
|
||||
|
||||
> <small>关闭默认主词典请在`IKAnalyzer.cfg.xml`配置文件中设置`use_main_dict`为`false`。</small>
|
||||
> * IKAnalyzer的原作者为林良益<linliangyi2007@gmail.com>,项目网站为<http://code.google.com/p/ik-analyzer>
|
||||
> * 该项目动态加载功能根据博主[@星火燎原智勇](http://www.cnblogs.com/liang1101/articles/6395016.html)的博客进行修改,其GITHUB地址为[@liang68](https://github.com/liang68)
|
||||
| sougou词库 | 115.2万 | 2019年 |
|
||||
#### 将以上词库进行整理后约187.1万条词汇;
|
||||
#### 添加动态加载词典表功能,在不需要重启solr服务的情况下加载新增的词典。
|
||||
* IKAnalyzer的原作者为林良益<linliangyi2007@gmail.com>,项目网站为<http://code.google.com/p/ik-analyzer>
|
||||
* 该项目动态加载功能根据博主[@星火燎原智勇](http://www.cnblogs.com/liang1101/articles/6395016.html)的博客进行修改,其GITHUB地址为[@liang68](https://github.com/liang68)
|
||||
|
||||
|
||||
## 使用说明
|
||||
* jar包下载地址:[ik-analyzer-8.5.0.jar](https://search.maven.org/remotecontent?filepath=com/github/magese/ik-analyzer/8.5.0/ik-analyzer-8.5.0.jar)
|
||||
* jar包下载地址:[ik-analyzer-8.1.1.jar](https://search.maven.org/remotecontent?filepath=com/github/magese/ik-analyzer/8.1.1/ik-analyzer-8.1.1.jar)
|
||||
* 历史版本:[Maven Central](https://search.maven.org/search?q=g:com.github.magese%20AND%20a:ik-analyzer&core=gav)
|
||||
|
||||
```xml
|
||||
```console
|
||||
<!-- Maven仓库地址 -->
|
||||
<dependency>
|
||||
<groupId>com.github.magese</groupId>
|
||||
<artifactId>ik-analyzer</artifactId>
|
||||
<version>8.5.0</version>
|
||||
<version>8.1.1</version>
|
||||
</dependency>
|
||||
```
|
||||
|
||||
@ -63,7 +58,7 @@ ik-analyzer for solr 7.x-8.x
|
||||
```
|
||||
|
||||
3. 配置Solr的`managed-schema`,添加`ik分词器`,示例如下;
|
||||
```xml
|
||||
```console
|
||||
<!-- ik分词器 -->
|
||||
<fieldType name="text_ik" class="solr.TextField">
|
||||
<analyzer type="index">
|
||||
@ -81,16 +76,8 @@ ik-analyzer for solr 7.x-8.x
|
||||
|
||||

|
||||
|
||||
5. `IKAnalyzer.cfg.xml`配置文件说明:
|
||||
|
||||
| 名称 | 类型 | 描述 | 默认 |
|
||||
| ------ | ------ | ------ | ------ |
|
||||
| use_main_dict | boolean | 是否使用默认主词典 | true |
|
||||
| ext_dict | String | 扩展词典文件名称,多个用分号隔开 | ext.dic; |
|
||||
| ext_stopwords | String | 停用词典文件名称,多个用分号隔开 | stopword.dic; |
|
||||
|
||||
6. `ik.conf`文件说明:
|
||||
```properties
|
||||
5. `ik.conf`文件说明:
|
||||
```console
|
||||
files=dynamicdic.txt
|
||||
lastupdate=0
|
||||
```
|
||||
@ -98,43 +85,34 @@ ik-analyzer for solr 7.x-8.x
|
||||
1. `files`为动态词典列表,可以设置多个词典表,用逗号进行分隔,默认动态词典表为`dynamicdic.txt`;
|
||||
2. `lastupdate`默认值为`0`,每次对动态词典表修改后请+1,不然不会将词典表中新的词语添加到内存中。<s>`lastupdate`采用的是`int`类型,不支持时间戳,如果使用时间戳的朋友可以把源码中的`int`改成`long`即可;</s> `2018-08-23` 已将源码中`lastUpdate`改为`long`类型,现可以用时间戳了。
|
||||
|
||||
7. `dynamicdic.txt` 为动态词典
|
||||
6. `dynamicdic.txt` 为动态词典
|
||||
|
||||
在此文件配置的词语不需重启服务即可加载进内存中。
|
||||
以`#`开头的词语视为注释,将不会加载到内存中。
|
||||
|
||||
|
||||
## 更新说明
|
||||
- **2021-12-23:** 升级lucene版本为`8.5.0`
|
||||
- **2021-03-22:** 升级lucene版本为`8.4.0`
|
||||
- **2020-12-30:**
|
||||
- 升级lucene版本为`8.3.1`
|
||||
- 更新词库
|
||||
- **2019-11-12:**
|
||||
- 升级lucene版本为`8.3.0`
|
||||
- `IKAnalyzer.cfg.xml`增加配置项`use_main_dict`,用于配置是否启用默认主词典
|
||||
- **2019-09-27:** 升级lucene版本为`8.2.0`
|
||||
- **2019-07-11:** 升级lucene版本为`8.1.1`
|
||||
- **2019-05-27:**
|
||||
- `2019-07-11:` 升级lucene版本为`8.1.1`
|
||||
- `2019-05-27:`
|
||||
- 升级lucene版本为`8.1.0`
|
||||
- 优化原词典部分重复词语
|
||||
- 更新搜狗2019最新流行词汇词典,约20k词汇量
|
||||
- **2019-05-15:** 升级lucene版本为`8.0.0`,并支持Solr8使用
|
||||
- **2019-03-01:** 升级lucene版本为`7.7.1`
|
||||
- **2019-02-15:** 升级lucene版本为`7.7.0`
|
||||
- **2018-12-26:**
|
||||
- `2019-05-15:` 升级lucene版本为`8.0.0`,并支持Solr8使用
|
||||
- `2019-03-01:` 升级lucene版本为`7.7.1`
|
||||
- `2019-02-15:` 升级lucene版本为`7.7.0`
|
||||
- `2018-12-26:`
|
||||
- 升级lucene版本为`7.6.0`
|
||||
- 兼容solr-cloud,动态词典配置文件及动态词典可交由`zookeeper`进行管理
|
||||
- 动态词典增加注释功能,以`#`开头的行将视为注释
|
||||
- **2018-12-04:** 整理更新词库列表`magese.dic`
|
||||
- **2018-10-10:** 升级lucene版本为`7.5.0`
|
||||
- **2018-09-03:** 优化注释与输出信息,取消部分中文输出避免不同字符集乱码,现会打印被调用inform方法的hashcode
|
||||
- **2018-08-23:**
|
||||
- `2018-12-04:` 整理更新词库列表`magese.dic`
|
||||
- `2018-10-10:` 升级lucene版本为`7.5.0`
|
||||
- `2018-09-03:` 优化注释与输出信息,取消部分中文输出避免不同字符集乱码,现会打印被调用inform方法的hashcode
|
||||
- `2018-08-23: `
|
||||
- 完善了动态更新词库代码注释;
|
||||
- 将ik.conf配置文件中的lastUpdate属性改为long类型,现已支持时间戳形式
|
||||
- **2018-08-13:** 更新maven仓库地址
|
||||
- **2018-08-01:** 移除默认的扩展词与停用词
|
||||
- **2018-07-23:** 升级lucene版本为`7.4.0`
|
||||
- `2018-08-13:` 更新maven仓库地址
|
||||
- `2018-08-01:` 移除默认的扩展词与停用词
|
||||
- `2018-07-23:` 升级lucene版本为`7.4.0`
|
||||
|
||||
|
||||
## 感谢 Thanks
|
||||
|
12
pom.xml
12
pom.xml
@ -4,7 +4,7 @@
|
||||
|
||||
<groupId>com.github.magese</groupId>
|
||||
<artifactId>ik-analyzer</artifactId>
|
||||
<version>8.5.0</version>
|
||||
<version>8.1.1</version>
|
||||
<packaging>jar</packaging>
|
||||
|
||||
<name>ik-analyzer-solr</name>
|
||||
@ -13,13 +13,20 @@
|
||||
|
||||
<properties>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<lucene.version>8.5.0</lucene.version>
|
||||
<lucene.version>8.1.1</lucene.version>
|
||||
<javac.src.version>1.8</javac.src.version>
|
||||
<javac.target.version>1.8</javac.target.version>
|
||||
<maven.compiler.plugin.version>3.3</maven.compiler.plugin.version>
|
||||
</properties>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>4.11</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-core</artifactId>
|
||||
@ -145,3 +152,4 @@
|
||||
</profile>
|
||||
</profiles>
|
||||
</project>
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.5.0
|
||||
* IK Analyzer release 8.5.0
|
||||
* IK 中文分词 版本 8.1.1
|
||||
* IK Analyzer release 8.1.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.5.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.5.0 update by Magese(magese@live.cn)
|
||||
* 8.1.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.1.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.cfg;
|
||||
@ -50,19 +50,6 @@ public interface Configuration {
|
||||
*/
|
||||
void setUseSmart(boolean useSmart);
|
||||
|
||||
/**
|
||||
* 获取是否使用主词典
|
||||
*
|
||||
* @return = true 默认加载主词典, = false 不加载主词典
|
||||
*/
|
||||
boolean useMainDict();
|
||||
|
||||
/**
|
||||
* 设置是否使用主词典
|
||||
*
|
||||
* @param useMainDic = true 默认加载主词典, = false 不加载主词典
|
||||
*/
|
||||
void setUseMainDict(boolean useMainDic);
|
||||
|
||||
/**
|
||||
* 获取主词典路径
|
||||
@ -76,7 +63,7 @@ public interface Configuration {
|
||||
*
|
||||
* @return String 量词词典路径
|
||||
*/
|
||||
String getQuantifierDictionary();
|
||||
String getQuantifierDicionary();
|
||||
|
||||
/**
|
||||
* 获取扩展字典配置路径
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.5.0
|
||||
* IK Analyzer release 8.5.0
|
||||
* IK 中文分词 版本 8.1.1
|
||||
* IK Analyzer release 8.1.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.5.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.5.0 update by Magese(magese@live.cn)
|
||||
* 8.1.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.1.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.cfg;
|
||||
@ -41,28 +41,24 @@ public class DefaultConfig implements Configuration {
|
||||
/*
|
||||
* 分词器默认字典路径
|
||||
*/
|
||||
private static final String PATH_DIC_MAIN = "dict/main_dic_2020.dic";
|
||||
private static final String PATH_DIC_MAIN = "dict/magese.dic";
|
||||
private static final String PATH_DIC_QUANTIFIER = "dict/quantifier.dic";
|
||||
|
||||
/*
|
||||
* 分词器配置文件路径
|
||||
*/
|
||||
private static final String FILE_NAME = "IKAnalyzer.cfg.xml";
|
||||
// 配置属性——是否使用主词典
|
||||
private static final String USE_MAIN = "use_main_dict";
|
||||
// 配置属性——扩展字典
|
||||
private static final String EXT_DICT = "ext_dict";
|
||||
// 配置属性——扩展停止词典
|
||||
private static final String EXT_STOP = "ext_stopwords";
|
||||
|
||||
private final Properties props;
|
||||
|
||||
// 是否使用smart方式分词
|
||||
private Properties props;
|
||||
/*
|
||||
* 是否使用smart方式分词
|
||||
*/
|
||||
private boolean useSmart;
|
||||
|
||||
// 是否加载主词典
|
||||
private boolean useMainDict = true;
|
||||
|
||||
/**
|
||||
* 返回单例
|
||||
*
|
||||
@ -104,33 +100,10 @@ public class DefaultConfig implements Configuration {
|
||||
*
|
||||
* @param useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分
|
||||
*/
|
||||
@Override
|
||||
public void setUseSmart(boolean useSmart) {
|
||||
this.useSmart = useSmart;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取是否使用主词典
|
||||
*
|
||||
* @return = true 默认加载主词典, = false 不加载主词典
|
||||
*/
|
||||
public boolean useMainDict() {
|
||||
String useMainDictCfg = props.getProperty(USE_MAIN);
|
||||
if (useMainDictCfg != null && useMainDictCfg.trim().length() > 0)
|
||||
setUseMainDict(Boolean.parseBoolean(useMainDictCfg));
|
||||
return useMainDict;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置是否使用主词典
|
||||
*
|
||||
* @param useMainDict = true 默认加载主词典, = false 不加载主词典
|
||||
*/
|
||||
@Override
|
||||
public void setUseMainDict(boolean useMainDict) {
|
||||
this.useMainDict = useMainDict;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取主词典路径
|
||||
*
|
||||
@ -145,7 +118,7 @@ public class DefaultConfig implements Configuration {
|
||||
*
|
||||
* @return String 量词词典路径
|
||||
*/
|
||||
public String getQuantifierDictionary() {
|
||||
public String getQuantifierDicionary() {
|
||||
return PATH_DIC_QUANTIFIER;
|
||||
}
|
||||
|
||||
@ -169,6 +142,7 @@ public class DefaultConfig implements Configuration {
|
||||
return extDictFiles;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 获取扩展停止词典配置路径
|
||||
*
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.5.0
|
||||
* IK Analyzer release 8.5.0
|
||||
* IK 中文分词 版本 8.1.1
|
||||
* IK Analyzer release 8.1.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,18 +21,22 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.5.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.5.0 update by Magese(magese@live.cn)
|
||||
* 8.1.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.1.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.core;
|
||||
|
||||
import org.wltea.analyzer.cfg.Configuration;
|
||||
import org.wltea.analyzer.dic.Dictionary;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.*;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.wltea.analyzer.cfg.Configuration;
|
||||
import org.wltea.analyzer.dic.Dictionary;
|
||||
|
||||
/**
|
||||
* 分词器上下文状态
|
||||
@ -62,17 +66,17 @@ class AnalyzeContext {
|
||||
|
||||
//子分词器锁
|
||||
//该集合非空,说明有子分词器在占用segmentBuff
|
||||
private final Set<String> buffLocker;
|
||||
private Set<String> buffLocker;
|
||||
|
||||
//原始分词结果集合,未经歧义处理
|
||||
private QuickSortSet orgLexemes;
|
||||
//LexemePath位置索引表
|
||||
private final Map<Integer, LexemePath> pathMap;
|
||||
private Map<Integer, LexemePath> pathMap;
|
||||
//最终分词结果集
|
||||
private final LinkedList<Lexeme> results;
|
||||
private LinkedList<Lexeme> results;
|
||||
|
||||
//分词器配置项
|
||||
private final Configuration cfg;
|
||||
private Configuration cfg;
|
||||
|
||||
AnalyzeContext(Configuration cfg) {
|
||||
this.cfg = cfg;
|
||||
@ -250,7 +254,7 @@ class AnalyzeContext {
|
||||
*/
|
||||
void outputToResult() {
|
||||
int index = 0;
|
||||
while (index <= this.cursor) {
|
||||
for (; index <= this.cursor; ) {
|
||||
//跳过非CJK字符
|
||||
if (CharacterUtil.CHAR_USELESS == this.charTypes[index]) {
|
||||
index++;
|
||||
@ -349,7 +353,6 @@ class AnalyzeContext {
|
||||
if (Lexeme.TYPE_ARABIC == result.getLexemeType()) {
|
||||
Lexeme nextLexeme = this.results.peekFirst();
|
||||
boolean appendOk = false;
|
||||
if (nextLexeme != null) {
|
||||
if (Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()) {
|
||||
//合并英文数词+中文数词
|
||||
appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM);
|
||||
@ -357,7 +360,6 @@ class AnalyzeContext {
|
||||
//合并英文数词+中文量词
|
||||
appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
|
||||
}
|
||||
}
|
||||
if (appendOk) {
|
||||
//弹出
|
||||
this.results.pollFirst();
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.5.0
|
||||
* IK Analyzer release 8.5.0
|
||||
* IK 中文分词 版本 8.1.1
|
||||
* IK Analyzer release 8.1.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,18 +21,18 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.5.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.5.0 update by Magese(magese@live.cn)
|
||||
* 8.1.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.1.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.core;
|
||||
|
||||
import org.wltea.analyzer.dic.Dictionary;
|
||||
import org.wltea.analyzer.dic.Hit;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import org.wltea.analyzer.dic.Dictionary;
|
||||
import org.wltea.analyzer.dic.Hit;
|
||||
|
||||
|
||||
/**
|
||||
* 中文-日韩文子分词器
|
||||
@ -42,7 +42,7 @@ class CJKSegmenter implements ISegmenter {
|
||||
//子分词器标签
|
||||
private static final String SEGMENTER_NAME = "CJK_SEGMENTER";
|
||||
//待处理的分词hit队列
|
||||
private final List<Hit> tmpHits;
|
||||
private List<Hit> tmpHits;
|
||||
|
||||
|
||||
CJKSegmenter(){
|
||||
@ -80,19 +80,20 @@ class CJKSegmenter implements ISegmenter {
|
||||
//*********************************
|
||||
//再对当前指针位置的字符进行单字匹配
|
||||
Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
|
||||
|
||||
// 首字成词
|
||||
if (singleCharHit.isMatch()) {
|
||||
if(singleCharHit.isMatch()){//首字成词
|
||||
//输出当前的词
|
||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD);
|
||||
context.addLexeme(newLexeme);
|
||||
}
|
||||
|
||||
// 前缀匹配则放入hit列表
|
||||
//同时也是词前缀
|
||||
if(singleCharHit.isPrefix()){
|
||||
//前缀匹配则放入hit列表
|
||||
this.tmpHits.add(singleCharHit);
|
||||
}
|
||||
}else if(singleCharHit.isPrefix()){//首字为词前缀
|
||||
//前缀匹配则放入hit列表
|
||||
this.tmpHits.add(singleCharHit);
|
||||
}
|
||||
|
||||
|
||||
}else{
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.5.0
|
||||
* IK Analyzer release 8.5.0
|
||||
* IK 中文分词 版本 8.1.1
|
||||
* IK Analyzer release 8.1.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.5.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.5.0 update by Magese(magese@live.cn)
|
||||
* 8.1.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.1.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.core;
|
||||
@ -36,6 +36,7 @@ import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
*
|
||||
* 中文数量词子分词器
|
||||
*/
|
||||
class CN_QuantifierSegmenter implements ISegmenter{
|
||||
@ -43,14 +44,14 @@ class CN_QuantifierSegmenter implements ISegmenter {
|
||||
//子分词器标签
|
||||
private static final String SEGMENTER_NAME = "QUAN_SEGMENTER";
|
||||
|
||||
private static final Set<Character> CHN_NUMBER_CHARS = new HashSet<>();
|
||||
|
||||
private static Set<Character> ChnNumberChars = new HashSet<>();
|
||||
static{
|
||||
//中文数词
|
||||
//Cnum
|
||||
String chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";
|
||||
char[] ca = chn_Num.toCharArray();
|
||||
for(char nChar : ca){
|
||||
CHN_NUMBER_CHARS.add(nChar);
|
||||
ChnNumberChars.add(nChar);
|
||||
}
|
||||
}
|
||||
|
||||
@ -67,7 +68,7 @@ class CN_QuantifierSegmenter implements ISegmenter {
|
||||
private int nEnd;
|
||||
|
||||
//待处理的量词hit队列
|
||||
private final List<Hit> countHits;
|
||||
private List<Hit> countHits;
|
||||
|
||||
|
||||
CN_QuantifierSegmenter(){
|
||||
@ -110,14 +111,14 @@ class CN_QuantifierSegmenter implements ISegmenter {
|
||||
private void processCNumber(AnalyzeContext context){
|
||||
if(nStart == -1 && nEnd == -1){//初始状态
|
||||
if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
|
||||
&& CHN_NUMBER_CHARS.contains(context.getCurrentChar())) {
|
||||
&& ChnNumberChars.contains(context.getCurrentChar())){
|
||||
//记录数词的起始、结束位置
|
||||
nStart = context.getCursor();
|
||||
nEnd = context.getCursor();
|
||||
}
|
||||
}else{//正在处理状态
|
||||
if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
|
||||
&& CHN_NUMBER_CHARS.contains(context.getCurrentChar())) {
|
||||
&& ChnNumberChars.contains(context.getCurrentChar())){
|
||||
//记录数词的结束位置
|
||||
nEnd = context.getCursor();
|
||||
}else{
|
||||
@ -143,7 +144,6 @@ class CN_QuantifierSegmenter implements ISegmenter {
|
||||
|
||||
/**
|
||||
* 处理中文量词
|
||||
*
|
||||
* @param context 需要处理的内容
|
||||
*/
|
||||
private void processCount(AnalyzeContext context){
|
||||
@ -179,19 +179,21 @@ class CN_QuantifierSegmenter implements ISegmenter {
|
||||
//*********************************
|
||||
//对当前指针位置的字符进行单字匹配
|
||||
Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
|
||||
|
||||
// 首字成量词
|
||||
if (singleCharHit.isMatch()) {
|
||||
if(singleCharHit.isMatch()){//首字成量词词
|
||||
//输出当前的词
|
||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT);
|
||||
context.addLexeme(newLexeme);
|
||||
}
|
||||
|
||||
// 前缀匹配则放入hit列表
|
||||
//同时也是词前缀
|
||||
if(singleCharHit.isPrefix()){
|
||||
//前缀匹配则放入hit列表
|
||||
this.countHits.add(singleCharHit);
|
||||
}
|
||||
}else if(singleCharHit.isPrefix()){//首字为量词前缀
|
||||
//前缀匹配则放入hit列表
|
||||
this.countHits.add(singleCharHit);
|
||||
}
|
||||
|
||||
|
||||
}else{
|
||||
//输入的不是中文字符
|
||||
@ -227,7 +229,6 @@ class CN_QuantifierSegmenter implements ISegmenter {
|
||||
|
||||
/**
|
||||
* 添加数词词元到结果集
|
||||
*
|
||||
* @param context 需要添加的词元
|
||||
*/
|
||||
private void outputNumLexeme(AnalyzeContext context){
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.5.0
|
||||
* IK Analyzer release 8.5.0
|
||||
* IK 中文分词 版本 8.1.1
|
||||
* IK Analyzer release 8.1.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,13 +21,14 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.5.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.5.0 update by Magese(magese@live.cn)
|
||||
* 8.1.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.1.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.core;
|
||||
|
||||
/**
|
||||
*
|
||||
* 字符集识别工具类
|
||||
*/
|
||||
class CharacterUtil {
|
||||
@ -45,7 +46,6 @@ class CharacterUtil {
|
||||
|
||||
/**
|
||||
* 识别字符类型
|
||||
*
|
||||
* @param input 需要识别的字符
|
||||
* @return int CharacterUtil定义的字符类型常量
|
||||
*/
|
||||
@ -85,7 +85,6 @@ class CharacterUtil {
|
||||
|
||||
/**
|
||||
* 进行字符规格化(全角转半角,大写转小写处理)
|
||||
*
|
||||
* @param input 需要转换的字符
|
||||
* @return char
|
||||
*/
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.5.0
|
||||
* IK Analyzer release 8.5.0
|
||||
* IK 中文分词 版本 8.1.1
|
||||
* IK Analyzer release 8.1.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.5.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.5.0 update by Magese(magese@live.cn)
|
||||
* 8.1.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.1.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.core;
|
||||
@ -35,7 +35,9 @@ import java.util.TreeSet;
|
||||
*/
|
||||
class IKArbitrator {
|
||||
|
||||
IKArbitrator() {}
|
||||
IKArbitrator() {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 分词歧义处理
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.5.0
|
||||
* IK Analyzer release 8.5.0
|
||||
* IK 中文分词 版本 8.1.1
|
||||
* IK Analyzer release 8.1.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,45 +21,35 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.5.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.5.0 update by Magese(magese@live.cn)
|
||||
* 8.1.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.1.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.core;
|
||||
|
||||
import org.wltea.analyzer.cfg.Configuration;
|
||||
import org.wltea.analyzer.cfg.DefaultConfig;
|
||||
import org.wltea.analyzer.dic.Dictionary;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.wltea.analyzer.cfg.Configuration;
|
||||
import org.wltea.analyzer.cfg.DefaultConfig;
|
||||
import org.wltea.analyzer.dic.Dictionary;
|
||||
|
||||
/**
|
||||
* IK分词器主类
|
||||
*/
|
||||
public final class IKSegmenter {
|
||||
|
||||
/**
|
||||
* 字符串reader
|
||||
*/
|
||||
//字符串reader
|
||||
private Reader input;
|
||||
/**
|
||||
* 分词器配置项
|
||||
*/
|
||||
private final Configuration cfg;
|
||||
/**
|
||||
* 分词器上下文
|
||||
*/
|
||||
//分词器配置项
|
||||
private Configuration cfg;
|
||||
//分词器上下文
|
||||
private AnalyzeContext context;
|
||||
/**
|
||||
* 分词处理器列表
|
||||
*/
|
||||
//分词处理器列表
|
||||
private List<ISegmenter> segmenters;
|
||||
/**
|
||||
* 分词歧义裁决器
|
||||
*/
|
||||
//分词歧义裁决器
|
||||
private IKArbitrator arbitrator;
|
||||
|
||||
|
||||
@ -68,6 +58,7 @@ public final class IKSegmenter {
|
||||
*
|
||||
* @param input 读取流
|
||||
* @param useSmart 为true,使用智能分词策略
|
||||
* <p>
|
||||
* 非智能分词:细粒度输出所有可能的切分结果
|
||||
* 智能分词: 合并数词和量词,对分词结果进行歧义判断
|
||||
*/
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.5.0
|
||||
* IK Analyzer release 8.5.0
|
||||
* IK 中文分词 版本 8.1.1
|
||||
* IK Analyzer release 8.1.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,21 +21,21 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.5.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.5.0 update by Magese(magese@live.cn)
|
||||
* 8.1.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.1.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.core;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* 子分词器接口
|
||||
*/
|
||||
interface ISegmenter {
|
||||
|
||||
/**
|
||||
* 从分析器读取下一个可能分解的词元对象
|
||||
*
|
||||
* @param context 分词算法上下文
|
||||
*/
|
||||
void analyze(AnalyzeContext context);
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.5.0
|
||||
* IK Analyzer release 8.5.0
|
||||
* IK 中文分词 版本 8.1.1
|
||||
* IK Analyzer release 8.1.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.5.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.5.0 update by Magese(magese@live.cn)
|
||||
* 8.1.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.1.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.core;
|
||||
@ -34,18 +34,14 @@ import java.util.Arrays;
|
||||
*/
|
||||
class LetterSegmenter implements ISegmenter {
|
||||
|
||||
/**
|
||||
* 子分词器标签
|
||||
*/
|
||||
//子分词器标签
|
||||
private static final String SEGMENTER_NAME = "LETTER_SEGMENTER";
|
||||
/**
|
||||
* 链接符号
|
||||
*/
|
||||
//链接符号
|
||||
private static final char[] Letter_Connector = new char[]{'#', '&', '+', '-', '.', '@', '_'};
|
||||
/**
|
||||
* 数字符号
|
||||
*/
|
||||
|
||||
//数字符号
|
||||
private static final char[] Num_Connector = new char[]{',', '.'};
|
||||
|
||||
/*
|
||||
* 词元的开始位置,
|
||||
* 同时作为子分词器状态标识
|
||||
@ -57,18 +53,22 @@ class LetterSegmenter implements ISegmenter {
|
||||
* end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置
|
||||
*/
|
||||
private int end;
|
||||
|
||||
/*
|
||||
* 字母起始位置
|
||||
*/
|
||||
private int englishStart;
|
||||
|
||||
/*
|
||||
* 字母结束位置
|
||||
*/
|
||||
private int englishEnd;
|
||||
|
||||
/*
|
||||
* 阿拉伯数字起始位置
|
||||
*/
|
||||
private int arabicStart;
|
||||
|
||||
/*
|
||||
* 阿拉伯数字结束位置
|
||||
*/
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.5.0
|
||||
* IK Analyzer release 8.5.0
|
||||
* IK 中文分词 版本 8.1.1
|
||||
* IK Analyzer release 8.1.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.5.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.5.0 update by Magese(magese@live.cn)
|
||||
* 8.1.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.1.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.core;
|
||||
@ -32,61 +32,34 @@ package org.wltea.analyzer.core;
|
||||
*/
|
||||
@SuppressWarnings("unused")
|
||||
public class Lexeme implements Comparable<Lexeme>{
|
||||
/**
|
||||
* 英文
|
||||
*/
|
||||
//英文
|
||||
static final int TYPE_ENGLISH = 1;
|
||||
/**
|
||||
* 数字
|
||||
*/
|
||||
//数字
|
||||
static final int TYPE_ARABIC = 2;
|
||||
/**
|
||||
* 英文数字混合
|
||||
*/
|
||||
//英文数字混合
|
||||
static final int TYPE_LETTER = 3;
|
||||
/**
|
||||
* 中文词元
|
||||
*/
|
||||
//中文词元
|
||||
static final int TYPE_CNWORD = 4;
|
||||
/**
|
||||
* 中文单字
|
||||
*/
|
||||
//中文单字
|
||||
static final int TYPE_CNCHAR = 64;
|
||||
/**
|
||||
* 日韩文字
|
||||
*/
|
||||
//日韩文字
|
||||
static final int TYPE_OTHER_CJK = 8;
|
||||
/**
|
||||
* 中文数词
|
||||
*/
|
||||
//中文数词
|
||||
static final int TYPE_CNUM = 16;
|
||||
/**
|
||||
* 中文量词
|
||||
*/
|
||||
//中文量词
|
||||
static final int TYPE_COUNT = 32;
|
||||
/**
|
||||
* 中文数量词
|
||||
*/
|
||||
//中文数量词
|
||||
static final int TYPE_CQUAN = 48;
|
||||
/**
|
||||
* 词元的起始位移
|
||||
*/
|
||||
|
||||
//词元的起始位移
|
||||
private int offset;
|
||||
/**
|
||||
* 词元的相对起始位置
|
||||
*/
|
||||
//词元的相对起始位置
|
||||
private int begin;
|
||||
/**
|
||||
* 词元的长度
|
||||
*/
|
||||
//词元的长度
|
||||
private int length;
|
||||
/**
|
||||
* 词元文本
|
||||
*/
|
||||
//词元文本
|
||||
private String lexemeText;
|
||||
/**
|
||||
* 词元类型
|
||||
*/
|
||||
//词元类型
|
||||
private int lexemeType;
|
||||
|
||||
|
||||
@ -147,7 +120,7 @@ public class Lexeme implements Comparable<Lexeme> {
|
||||
//this.length < other.getLength()
|
||||
return Integer.compare(other.getLength(), this.length);
|
||||
|
||||
} else {
|
||||
}else{//this.begin > other.getBegin()
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
@ -163,10 +136,8 @@ public class Lexeme implements Comparable<Lexeme> {
|
||||
int getBegin() {
|
||||
return begin;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取词元在文本中的起始位置
|
||||
*
|
||||
* @return int
|
||||
*/
|
||||
public int getBeginPosition(){
|
||||
@ -179,7 +150,6 @@ public class Lexeme implements Comparable<Lexeme> {
|
||||
|
||||
/**
|
||||
* 获取词元在文本中的结束位置
|
||||
*
|
||||
* @return int
|
||||
*/
|
||||
public int getEndPosition(){
|
||||
@ -188,7 +158,6 @@ public class Lexeme implements Comparable<Lexeme> {
|
||||
|
||||
/**
|
||||
* 获取词元的字符长度
|
||||
*
|
||||
* @return int
|
||||
*/
|
||||
public int getLength(){
|
||||
@ -204,7 +173,6 @@ public class Lexeme implements Comparable<Lexeme> {
|
||||
|
||||
/**
|
||||
* 获取词元的文本内容
|
||||
*
|
||||
* @return String
|
||||
*/
|
||||
public String getLexemeText() {
|
||||
@ -226,7 +194,6 @@ public class Lexeme implements Comparable<Lexeme> {
|
||||
|
||||
/**
|
||||
* 获取词元类型
|
||||
*
|
||||
* @return int
|
||||
*/
|
||||
int getLexemeType() {
|
||||
@ -235,7 +202,6 @@ public class Lexeme implements Comparable<Lexeme> {
|
||||
|
||||
/**
|
||||
* 获取词元类型标示字符串
|
||||
*
|
||||
* @return String
|
||||
*/
|
||||
public String getLexemeTypeString(){
|
||||
@ -269,7 +235,7 @@ public class Lexeme implements Comparable<Lexeme> {
|
||||
return "TYPE_CQUAN";
|
||||
|
||||
default :
|
||||
return "UNKNOWN";
|
||||
return "UNKONW";
|
||||
}
|
||||
}
|
||||
|
||||
@ -280,7 +246,6 @@ public class Lexeme implements Comparable<Lexeme> {
|
||||
|
||||
/**
|
||||
* 合并两个相邻的词元
|
||||
*
|
||||
* @return boolean 词元是否成功合并
|
||||
*/
|
||||
boolean append(Lexeme l, int lexemeType){
|
||||
@ -293,10 +258,9 @@ public class Lexeme implements Comparable<Lexeme> {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* ToString 方法
|
||||
*
|
||||
* @return 字符串输出
|
||||
*/
|
||||
public String toString(){
|
||||
return this.getBeginPosition() + "-" + this.getEndPosition() +
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.5.0
|
||||
* IK Analyzer release 8.5.0
|
||||
* IK 中文分词 版本 8.1.1
|
||||
* IK Analyzer release 8.1.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.5.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.5.0 update by Magese(magese@live.cn)
|
||||
* 8.1.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.1.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.core;
|
||||
@ -34,17 +34,11 @@ package org.wltea.analyzer.core;
|
||||
@SuppressWarnings("unused")
|
||||
class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
|
||||
|
||||
/**
|
||||
* 起始位置
|
||||
*/
|
||||
//起始位置
|
||||
private int pathBegin;
|
||||
/**
|
||||
* 结束
|
||||
*/
|
||||
//结束
|
||||
private int pathEnd;
|
||||
/**
|
||||
* 词元链的有效字符长度
|
||||
*/
|
||||
//词元链的有效字符长度
|
||||
private int payloadLength;
|
||||
|
||||
LexemePath() {
|
||||
@ -106,6 +100,7 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
|
||||
|
||||
/**
|
||||
* 移除尾部的Lexeme
|
||||
*
|
||||
*/
|
||||
void removeTail() {
|
||||
Lexeme tail = this.pollLast();
|
||||
@ -122,6 +117,7 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
|
||||
|
||||
/**
|
||||
* 检测词元位置交叉(有歧义的切分)
|
||||
*
|
||||
*/
|
||||
boolean checkCross(Lexeme lexeme) {
|
||||
return (lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd)
|
||||
@ -145,6 +141,7 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
|
||||
|
||||
/**
|
||||
* 获取LexemePath的路径长度
|
||||
*
|
||||
*/
|
||||
private int getPathLength() {
|
||||
return this.pathEnd - this.pathBegin;
|
||||
@ -153,6 +150,7 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
|
||||
|
||||
/**
|
||||
* X权重(词元长度积)
|
||||
*
|
||||
*/
|
||||
private int getXWeight() {
|
||||
int product = 1;
|
||||
@ -198,36 +196,31 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
|
||||
return -1;
|
||||
} else if (this.payloadLength < o.payloadLength) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
} else {
|
||||
//比较词元个数,越少越好
|
||||
if (this.size() < o.size()) {
|
||||
return -1;
|
||||
} else if (this.size() > o.size()) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
} else {
|
||||
//路径跨度越大越好
|
||||
if (this.getPathLength() > o.getPathLength()) {
|
||||
return -1;
|
||||
} else if (this.getPathLength() < o.getPathLength()) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
} else {
|
||||
//根据统计学结论,逆向切分概率高于正向切分,因此位置越靠后的优先
|
||||
if (this.pathEnd > o.pathEnd) {
|
||||
return -1;
|
||||
} else if (pathEnd < o.pathEnd) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
} else {
|
||||
//词长越平均越好
|
||||
if (this.getXWeight() > o.getXWeight()) {
|
||||
return -1;
|
||||
} else if (this.getXWeight() < o.getXWeight()) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
} else {
|
||||
//词元位置权重比较
|
||||
if (this.getPWeight() > o.getPWeight()) {
|
||||
return -1;
|
||||
@ -235,6 +228,11 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
|
||||
return 1;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.5.0
|
||||
* IK Analyzer release 8.5.0
|
||||
* IK 中文分词 版本 8.1.1
|
||||
* IK Analyzer release 8.1.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,27 +21,21 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.2.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.2.0 update by Magese(magese@live.cn)
|
||||
* 8.1.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.1.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.core;
|
||||
|
||||
/**
|
||||
* IK分词器专用的Lexeme快速排序集合
|
||||
* IK分词器专用的Lexem快速排序集合
|
||||
*/
|
||||
class QuickSortSet {
|
||||
/**
|
||||
* 链表头
|
||||
*/
|
||||
//链表头
|
||||
private Cell head;
|
||||
/**
|
||||
* 链表尾
|
||||
*/
|
||||
//链表尾
|
||||
private Cell tail;
|
||||
/**
|
||||
* 链表的实际大小
|
||||
*/
|
||||
//链表的实际大小
|
||||
private int size;
|
||||
|
||||
QuickSortSet(){
|
||||
@ -59,15 +53,15 @@ class QuickSortSet {
|
||||
this.size++;
|
||||
|
||||
}else{
|
||||
if (this.tail.compareTo(newCell) < 0) {
|
||||
// 词元接入链表尾部
|
||||
/*if(this.tail.compareTo(newCell) == 0){//词元与尾部词元相同,不放入集合
|
||||
|
||||
}else */if(this.tail.compareTo(newCell) < 0){//词元接入链表尾部
|
||||
this.tail.next = newCell;
|
||||
newCell.prev = this.tail;
|
||||
this.tail = newCell;
|
||||
this.size++;
|
||||
|
||||
} else if (this.head.compareTo(newCell) > 0) {
|
||||
// 词元接入链表头部
|
||||
}else if(this.head.compareTo(newCell) > 0){//词元接入链表头部
|
||||
this.head.prev = newCell;
|
||||
newCell.next = this.head;
|
||||
this.head = newCell;
|
||||
@ -79,9 +73,9 @@ class QuickSortSet {
|
||||
while(index != null && index.compareTo(newCell) > 0){
|
||||
index = index.prev;
|
||||
}
|
||||
/*if(index.compareTo(newCell) == 0){//词元与集合中的词元重复,不放入集合
|
||||
|
||||
// 词元插入链表中的某个位置
|
||||
if ((index != null ? index.compareTo(newCell) : 1) < 0) {
|
||||
}else */if((index != null ? index.compareTo(newCell) : 1) < 0){//词元插入链表中的某个位置
|
||||
newCell.prev = index;
|
||||
newCell.next = index.next;
|
||||
index.next.prev = newCell;
|
||||
@ -104,7 +98,6 @@ class QuickSortSet {
|
||||
|
||||
/**
|
||||
* 取出链表集合的第一个元素
|
||||
*
|
||||
* @return Lexeme
|
||||
*/
|
||||
Lexeme pollFirst(){
|
||||
@ -136,7 +129,6 @@ class QuickSortSet {
|
||||
|
||||
/**
|
||||
* 取出链表集合的最后一个元素
|
||||
*
|
||||
* @return Lexeme
|
||||
*/
|
||||
Lexeme pollLast(){
|
||||
@ -180,15 +172,15 @@ class QuickSortSet {
|
||||
}
|
||||
|
||||
/*
|
||||
* IK 中文分词 版本 8.5.0
|
||||
* IK Analyzer release 8.5.0
|
||||
* IK 中文分词 版本 7.0
|
||||
* IK Analyzer release 7.0
|
||||
* update by Magese(magese@live.cn)
|
||||
*/
|
||||
@SuppressWarnings("unused")
|
||||
static class Cell implements Comparable<Cell> {
|
||||
class Cell implements Comparable<Cell>{
|
||||
private Cell prev;
|
||||
private Cell next;
|
||||
private final Lexeme lexeme;
|
||||
private Lexeme lexeme;
|
||||
|
||||
Cell(Lexeme lexeme){
|
||||
if(lexeme == null){
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.5.0
|
||||
* IK Analyzer release 8.5.0
|
||||
* IK 中文分词 版本 8.1.1
|
||||
* IK Analyzer release 8.1.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.5.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.5.0 update by Magese(magese@live.cn)
|
||||
* 8.1.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.1.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.dic;
|
||||
@ -37,38 +37,24 @@ import java.util.Map;
|
||||
@SuppressWarnings("unused")
|
||||
class DictSegment implements Comparable<DictSegment> {
|
||||
|
||||
/**
|
||||
* 公用字典表,存储汉字
|
||||
*/
|
||||
//公用字典表,存储汉字
|
||||
private static final Map<Character, Character> charMap = new HashMap<>(16, 0.95f);
|
||||
/**
|
||||
* 数组大小上限
|
||||
*/
|
||||
//数组大小上限
|
||||
private static final int ARRAY_LENGTH_LIMIT = 3;
|
||||
|
||||
|
||||
/**
|
||||
* Map存储结构
|
||||
*/
|
||||
private volatile Map<Character, DictSegment> childrenMap;
|
||||
/**
|
||||
* 数组方式存储结构
|
||||
*/
|
||||
private volatile DictSegment[] childrenArray;
|
||||
//Map存储结构
|
||||
private Map<Character, DictSegment> childrenMap;
|
||||
//数组方式存储结构
|
||||
private DictSegment[] childrenArray;
|
||||
|
||||
|
||||
/**
|
||||
* 当前节点上存储的字符
|
||||
*/
|
||||
private final Character nodeChar;
|
||||
/**
|
||||
* 当前节点存储的Segment数目
|
||||
* storeSize <=ARRAY_LENGTH_LIMIT ,使用数组存储, storeSize >ARRAY_LENGTH_LIMIT ,则使用Map存储
|
||||
*/
|
||||
//当前节点上存储的字符
|
||||
private Character nodeChar;
|
||||
//当前节点存储的Segment数目
|
||||
//storeSize <=ARRAY_LENGTH_LIMIT ,使用数组存储, storeSize >ARRAY_LENGTH_LIMIT ,则使用Map存储
|
||||
private int storeSize = 0;
|
||||
/**
|
||||
* 当前DictSegment状态 ,默认 0 , 1表示从根节点到当前节点的路径表示一个词
|
||||
*/
|
||||
//当前DictSegment状态 ,默认 0 , 1表示从根节点到当前节点的路径表示一个词
|
||||
private int nodeState = 0;
|
||||
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.5.0
|
||||
* IK Analyzer release 8.5.0
|
||||
* IK 中文分词 版本 8.1.1
|
||||
* IK Analyzer release 8.1.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,20 +21,20 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.5.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.5.0 update by Magese(magese@live.cn)
|
||||
* 8.1.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.1.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.dic;
|
||||
|
||||
import org.wltea.analyzer.cfg.Configuration;
|
||||
import org.wltea.analyzer.cfg.DefaultConfig;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
import org.wltea.analyzer.cfg.Configuration;
|
||||
import org.wltea.analyzer.cfg.DefaultConfig;
|
||||
|
||||
/**
|
||||
* 词典管理类,单例模式
|
||||
*/
|
||||
@ -44,7 +44,7 @@ public class Dictionary {
|
||||
/*
|
||||
* 词典单子实例
|
||||
*/
|
||||
private static volatile Dictionary singleton;
|
||||
private static Dictionary singleton;
|
||||
|
||||
/*
|
||||
* 主词典对象
|
||||
@ -63,7 +63,7 @@ public class Dictionary {
|
||||
/**
|
||||
* 配置对象
|
||||
*/
|
||||
private final Configuration cfg;
|
||||
private Configuration cfg;
|
||||
|
||||
/**
|
||||
* 私有构造方法,阻止外部直接实例化本类
|
||||
@ -226,15 +226,22 @@ public class Dictionary {
|
||||
private void loadMainDict() {
|
||||
// 建立一个主词典实例
|
||||
_MainDict = new DictSegment((char) 0);
|
||||
// 获取是否加载主词典
|
||||
if (cfg.useMainDict()) {
|
||||
// 读取主词典文件
|
||||
InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getMainDictionary());
|
||||
if (is == null) {
|
||||
throw new RuntimeException("Main Dictionary not found!!!");
|
||||
}
|
||||
|
||||
try {
|
||||
readDict(is, _MainDict);
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
|
||||
String theWord;
|
||||
do {
|
||||
theWord = br.readLine();
|
||||
if (theWord != null && !"".equals(theWord.trim())) {
|
||||
_MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
|
||||
}
|
||||
} while (theWord != null);
|
||||
|
||||
} catch (IOException ioe) {
|
||||
System.err.println("Main Dictionary loading exception.");
|
||||
ioe.printStackTrace();
|
||||
@ -246,7 +253,6 @@ public class Dictionary {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
// 加载扩展词典
|
||||
this.loadExtDict();
|
||||
}
|
||||
@ -268,7 +274,17 @@ public class Dictionary {
|
||||
continue;
|
||||
}
|
||||
try {
|
||||
readDict(is, _MainDict);
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
|
||||
String theWord;
|
||||
do {
|
||||
theWord = br.readLine();
|
||||
if (theWord != null && !"".equals(theWord.trim())) {
|
||||
// 加载扩展词典数据到主内存词典中
|
||||
// System.out.println(theWord);
|
||||
_MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
|
||||
}
|
||||
} while (theWord != null);
|
||||
|
||||
} catch (IOException ioe) {
|
||||
System.err.println("Extension Dictionary loading exception.");
|
||||
ioe.printStackTrace();
|
||||
@ -303,7 +319,17 @@ public class Dictionary {
|
||||
continue;
|
||||
}
|
||||
try {
|
||||
readDict(is, _StopWordDict);
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
|
||||
String theWord;
|
||||
do {
|
||||
theWord = br.readLine();
|
||||
if (theWord != null && !"".equals(theWord.trim())) {
|
||||
// System.out.println(theWord);
|
||||
// 加载扩展停止词典数据到内存中
|
||||
_StopWordDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
|
||||
}
|
||||
} while (theWord != null);
|
||||
|
||||
} catch (IOException ioe) {
|
||||
System.err.println("Extension Stop word Dictionary loading exception.");
|
||||
ioe.printStackTrace();
|
||||
@ -326,12 +352,20 @@ public class Dictionary {
|
||||
// 建立一个量词典实例
|
||||
_QuantifierDict = new DictSegment((char) 0);
|
||||
// 读取量词词典文件
|
||||
InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getQuantifierDictionary());
|
||||
InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getQuantifierDicionary());
|
||||
if (is == null) {
|
||||
throw new RuntimeException("Quantifier Dictionary not found!!!");
|
||||
}
|
||||
try {
|
||||
readDict(is, _QuantifierDict);
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
|
||||
String theWord;
|
||||
do {
|
||||
theWord = br.readLine();
|
||||
if (theWord != null && !"".equals(theWord.trim())) {
|
||||
_QuantifierDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
|
||||
}
|
||||
} while (theWord != null);
|
||||
|
||||
} catch (IOException ioe) {
|
||||
System.err.println("Quantifier Dictionary loading exception.");
|
||||
ioe.printStackTrace();
|
||||
@ -345,21 +379,4 @@ public class Dictionary {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 读取词典文件到词典树中
|
||||
*
|
||||
* @param is 文件输入流
|
||||
* @param dictSegment 词典树分段
|
||||
* @throws IOException 读取异常
|
||||
*/
|
||||
private void readDict(InputStream is, DictSegment dictSegment) throws IOException {
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
|
||||
String theWord;
|
||||
do {
|
||||
theWord = br.readLine();
|
||||
if (theWord != null && !"".equals(theWord.trim())) {
|
||||
dictSegment.fillSegment(theWord.trim().toLowerCase().toCharArray());
|
||||
}
|
||||
} while (theWord != null);
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.5.0
|
||||
* IK Analyzer release 8.5.0
|
||||
* IK 中文分词 版本 8.1.1
|
||||
* IK Analyzer release 8.1.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.5.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.5.0 update by Magese(magese@live.cn)
|
||||
* 8.1.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.1.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.dic;
|
||||
@ -32,33 +32,24 @@ package org.wltea.analyzer.dic;
|
||||
*/
|
||||
@SuppressWarnings("unused")
|
||||
public class Hit {
|
||||
/**
|
||||
* Hit不匹配
|
||||
*/
|
||||
//Hit不匹配
|
||||
private static final int UNMATCH = 0x00000000;
|
||||
/**
|
||||
* Hit完全匹配
|
||||
*/
|
||||
//Hit完全匹配
|
||||
private static final int MATCH = 0x00000001;
|
||||
/**
|
||||
* Hit前缀匹配
|
||||
*/
|
||||
//Hit前缀匹配
|
||||
private static final int PREFIX = 0x00000010;
|
||||
|
||||
|
||||
/**
|
||||
* 该HIT当前状态,默认未匹配
|
||||
*/
|
||||
//该HIT当前状态,默认未匹配
|
||||
private int hitState = UNMATCH;
|
||||
/**
|
||||
* 记录词典匹配过程中,当前匹配到的词典分支节点
|
||||
*/
|
||||
|
||||
//记录词典匹配过程中,当前匹配到的词典分支节点
|
||||
private DictSegment matchedDictSegment;
|
||||
/**
|
||||
/*
|
||||
* 词段开始位置
|
||||
*/
|
||||
private int begin;
|
||||
/**
|
||||
/*
|
||||
* 词段的结束位置
|
||||
*/
|
||||
private int end;
|
||||
@ -95,7 +86,9 @@ public class Hit {
|
||||
public boolean isUnmatch() {
|
||||
return this.hitState == UNMATCH ;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
void setUnmatch() {
|
||||
this.hitState = UNMATCH;
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.5.0
|
||||
* IK Analyzer release 8.5.0
|
||||
* IK 中文分词 版本 8.1.1
|
||||
* IK Analyzer release 8.1.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.5.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.5.0 update by Magese(magese@live.cn)
|
||||
* 8.1.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.1.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.lucene;
|
||||
@ -36,15 +36,19 @@ import org.apache.lucene.analysis.Tokenizer;
|
||||
@SuppressWarnings("unused")
|
||||
public final class IKAnalyzer extends Analyzer{
|
||||
|
||||
private final boolean useSmart;
|
||||
private boolean useSmart;
|
||||
|
||||
private boolean useSmart() {
|
||||
return useSmart;
|
||||
}
|
||||
|
||||
public void setUseSmart(boolean useSmart) {
|
||||
this.useSmart = useSmart;
|
||||
}
|
||||
|
||||
/**
|
||||
* IK分词器Lucene Analyzer接口实现类
|
||||
*
|
||||
* 默认细粒度切分算法
|
||||
*/
|
||||
public IKAnalyzer(){
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.5.0
|
||||
* IK Analyzer release 8.5.0
|
||||
* IK 中文分词 版本 8.1.1
|
||||
* IK Analyzer release 8.1.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.5.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.5.0 update by Magese(magese@live.cn)
|
||||
* 8.1.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.1.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.lucene;
|
||||
@ -39,30 +39,21 @@ import java.io.IOException;
|
||||
|
||||
/**
|
||||
* IK分词器 Lucene Tokenizer适配器类
|
||||
* 兼容Lucene 4.0版本
|
||||
*/
|
||||
@SuppressWarnings({"unused", "FinalMethodInFinalClass"})
|
||||
@SuppressWarnings("unused")
|
||||
public final class IKTokenizer extends Tokenizer {
|
||||
|
||||
/**
|
||||
* IK分词器实现
|
||||
*/
|
||||
//IK分词器实现
|
||||
private IKSegmenter _IKImplement;
|
||||
|
||||
/**
|
||||
* 词元文本属性
|
||||
*/
|
||||
//词元文本属性
|
||||
private CharTermAttribute termAtt;
|
||||
/**
|
||||
* 词元位移属性
|
||||
*/
|
||||
//词元位移属性
|
||||
private OffsetAttribute offsetAtt;
|
||||
/**
|
||||
* 词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量)
|
||||
*/
|
||||
//词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量)
|
||||
private TypeAttribute typeAtt;
|
||||
/**
|
||||
* 记录最后一个词元的结束位置
|
||||
*/
|
||||
//记录最后一个词元的结束位置
|
||||
private int endPosition;
|
||||
|
||||
/**
|
||||
@ -93,8 +84,7 @@ public final class IKTokenizer extends Tokenizer {
|
||||
_IKImplement = new IKSegmenter(input , useSmart);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
/* (non-Javadoc)
|
||||
* @see org.apache.lucene.analysis.TokenStream#incrementToken()
|
||||
*/
|
||||
@Override
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.5.0
|
||||
* IK Analyzer release 8.5.0
|
||||
* IK 中文分词 版本 8.1.1
|
||||
* IK Analyzer release 8.1.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.5.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.5.0 update by Magese(magese@live.cn)
|
||||
* 8.1.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.1.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.lucene;
|
||||
@ -44,8 +44,6 @@ import java.nio.charset.StandardCharsets;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* 分词器工厂类
|
||||
*
|
||||
* @author <a href="magese@live.cn">Magese</a>
|
||||
*/
|
||||
public class IKTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware, UpdateThread.UpdateJob {
|
||||
@ -76,7 +74,7 @@ public class IKTokenizerFactory extends TokenizerFactory implements ResourceLoad
|
||||
*/
|
||||
@Override
|
||||
public void inform(ResourceLoader resourceLoader) throws IOException {
|
||||
System.out.printf("IKTokenizerFactory " + this.hashCode() + " inform conf: %s%n", getConf());
|
||||
System.out.println(String.format("IKTokenizerFactory " + this.hashCode() + " inform conf: %s", getConf()));
|
||||
this.loader = resourceLoader;
|
||||
update();
|
||||
if ((getConf() != null) && (!getConf().trim().isEmpty())) {
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.5.0
|
||||
* IK Analyzer release 8.5.0
|
||||
* IK 中文分词 版本 8.1.1
|
||||
* IK Analyzer release 8.1.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.5.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.5.0 update by Magese(magese@live.cn)
|
||||
* 8.1.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.1.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.lucene;
|
||||
@ -35,7 +35,7 @@ import java.util.Vector;
|
||||
*/
|
||||
public class UpdateThread implements Runnable {
|
||||
private static final long INTERVAL = 30000L; // 循环等待时间
|
||||
private final Vector<UpdateJob> filterFactorys; // 更新任务集合
|
||||
private Vector<UpdateJob> filterFactorys; // 更新任务集合
|
||||
|
||||
/**
|
||||
* 私有化构造器,阻止外部进行实例化
|
||||
@ -51,7 +51,7 @@ public class UpdateThread implements Runnable {
|
||||
* 静态内部类,实现线程安全单例模式
|
||||
*/
|
||||
private static class Builder {
|
||||
private static final UpdateThread singleton = new UpdateThread();
|
||||
private static UpdateThread singleton = new UpdateThread();
|
||||
}
|
||||
|
||||
/**
|
||||
@ -81,7 +81,6 @@ public class UpdateThread implements Runnable {
|
||||
//noinspection InfiniteLoopStatement
|
||||
while (true) {
|
||||
try {
|
||||
//noinspection BusyWait
|
||||
Thread.sleep(INTERVAL);
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.5.0
|
||||
* IK Analyzer release 8.5.0
|
||||
* IK 中文分词 版本 8.1.1
|
||||
* IK Analyzer release 8.1.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.5.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.5.0 update by Magese(magese@live.cn)
|
||||
* 8.1.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.1.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.query;
|
||||
@ -46,11 +46,11 @@ import java.util.Stack;
|
||||
public class IKQueryExpressionParser {
|
||||
|
||||
|
||||
private final List<Element> elements = new ArrayList<>();
|
||||
private List<Element> elements = new ArrayList<>();
|
||||
|
||||
private final Stack<Query> querys = new Stack<>();
|
||||
private Stack<Query> querys = new Stack<>();
|
||||
|
||||
private final Stack<Element> operates = new Stack<>();
|
||||
private Stack<Element> operates = new Stack<>();
|
||||
|
||||
/**
|
||||
* 解析查询表达式,生成Lucene Query对象
|
||||
@ -87,263 +87,263 @@ public class IKQueryExpressionParser {
|
||||
if (expression == null) {
|
||||
return;
|
||||
}
|
||||
Element currentElement = null;
|
||||
Element curretElement = null;
|
||||
|
||||
char[] expChars = expression.toCharArray();
|
||||
for (char expChar : expChars) {
|
||||
switch (expChar) {
|
||||
case '&':
|
||||
if (currentElement == null) {
|
||||
currentElement = new Element();
|
||||
currentElement.type = '&';
|
||||
currentElement.append(expChar);
|
||||
} else if (currentElement.type == '&') {
|
||||
currentElement.append(expChar);
|
||||
this.elements.add(currentElement);
|
||||
currentElement = null;
|
||||
} else if (currentElement.type == '\'') {
|
||||
currentElement.append(expChar);
|
||||
if (curretElement == null) {
|
||||
curretElement = new Element();
|
||||
curretElement.type = '&';
|
||||
curretElement.append(expChar);
|
||||
} else if (curretElement.type == '&') {
|
||||
curretElement.append(expChar);
|
||||
this.elements.add(curretElement);
|
||||
curretElement = null;
|
||||
} else if (curretElement.type == '\'') {
|
||||
curretElement.append(expChar);
|
||||
} else {
|
||||
this.elements.add(currentElement);
|
||||
currentElement = new Element();
|
||||
currentElement.type = '&';
|
||||
currentElement.append(expChar);
|
||||
this.elements.add(curretElement);
|
||||
curretElement = new Element();
|
||||
curretElement.type = '&';
|
||||
curretElement.append(expChar);
|
||||
}
|
||||
break;
|
||||
|
||||
case '|':
|
||||
if (currentElement == null) {
|
||||
currentElement = new Element();
|
||||
currentElement.type = '|';
|
||||
currentElement.append(expChar);
|
||||
} else if (currentElement.type == '|') {
|
||||
currentElement.append(expChar);
|
||||
this.elements.add(currentElement);
|
||||
currentElement = null;
|
||||
} else if (currentElement.type == '\'') {
|
||||
currentElement.append(expChar);
|
||||
if (curretElement == null) {
|
||||
curretElement = new Element();
|
||||
curretElement.type = '|';
|
||||
curretElement.append(expChar);
|
||||
} else if (curretElement.type == '|') {
|
||||
curretElement.append(expChar);
|
||||
this.elements.add(curretElement);
|
||||
curretElement = null;
|
||||
} else if (curretElement.type == '\'') {
|
||||
curretElement.append(expChar);
|
||||
} else {
|
||||
this.elements.add(currentElement);
|
||||
currentElement = new Element();
|
||||
currentElement.type = '|';
|
||||
currentElement.append(expChar);
|
||||
this.elements.add(curretElement);
|
||||
curretElement = new Element();
|
||||
curretElement.type = '|';
|
||||
curretElement.append(expChar);
|
||||
}
|
||||
break;
|
||||
|
||||
case '-':
|
||||
if (currentElement != null) {
|
||||
if (currentElement.type == '\'') {
|
||||
currentElement.append(expChar);
|
||||
if (curretElement != null) {
|
||||
if (curretElement.type == '\'') {
|
||||
curretElement.append(expChar);
|
||||
continue;
|
||||
} else {
|
||||
this.elements.add(currentElement);
|
||||
this.elements.add(curretElement);
|
||||
}
|
||||
}
|
||||
currentElement = new Element();
|
||||
currentElement.type = '-';
|
||||
currentElement.append(expChar);
|
||||
this.elements.add(currentElement);
|
||||
currentElement = null;
|
||||
curretElement = new Element();
|
||||
curretElement.type = '-';
|
||||
curretElement.append(expChar);
|
||||
this.elements.add(curretElement);
|
||||
curretElement = null;
|
||||
break;
|
||||
|
||||
case '(':
|
||||
if (currentElement != null) {
|
||||
if (currentElement.type == '\'') {
|
||||
currentElement.append(expChar);
|
||||
if (curretElement != null) {
|
||||
if (curretElement.type == '\'') {
|
||||
curretElement.append(expChar);
|
||||
continue;
|
||||
} else {
|
||||
this.elements.add(currentElement);
|
||||
this.elements.add(curretElement);
|
||||
}
|
||||
}
|
||||
currentElement = new Element();
|
||||
currentElement.type = '(';
|
||||
currentElement.append(expChar);
|
||||
this.elements.add(currentElement);
|
||||
currentElement = null;
|
||||
curretElement = new Element();
|
||||
curretElement.type = '(';
|
||||
curretElement.append(expChar);
|
||||
this.elements.add(curretElement);
|
||||
curretElement = null;
|
||||
break;
|
||||
|
||||
case ')':
|
||||
if (currentElement != null) {
|
||||
if (currentElement.type == '\'') {
|
||||
currentElement.append(expChar);
|
||||
if (curretElement != null) {
|
||||
if (curretElement.type == '\'') {
|
||||
curretElement.append(expChar);
|
||||
continue;
|
||||
} else {
|
||||
this.elements.add(currentElement);
|
||||
this.elements.add(curretElement);
|
||||
}
|
||||
}
|
||||
currentElement = new Element();
|
||||
currentElement.type = ')';
|
||||
currentElement.append(expChar);
|
||||
this.elements.add(currentElement);
|
||||
currentElement = null;
|
||||
curretElement = new Element();
|
||||
curretElement.type = ')';
|
||||
curretElement.append(expChar);
|
||||
this.elements.add(curretElement);
|
||||
curretElement = null;
|
||||
break;
|
||||
|
||||
case ':':
|
||||
if (currentElement != null) {
|
||||
if (currentElement.type == '\'') {
|
||||
currentElement.append(expChar);
|
||||
if (curretElement != null) {
|
||||
if (curretElement.type == '\'') {
|
||||
curretElement.append(expChar);
|
||||
continue;
|
||||
} else {
|
||||
this.elements.add(currentElement);
|
||||
this.elements.add(curretElement);
|
||||
}
|
||||
}
|
||||
currentElement = new Element();
|
||||
currentElement.type = ':';
|
||||
currentElement.append(expChar);
|
||||
this.elements.add(currentElement);
|
||||
currentElement = null;
|
||||
curretElement = new Element();
|
||||
curretElement.type = ':';
|
||||
curretElement.append(expChar);
|
||||
this.elements.add(curretElement);
|
||||
curretElement = null;
|
||||
break;
|
||||
|
||||
case '=':
|
||||
if (currentElement != null) {
|
||||
if (currentElement.type == '\'') {
|
||||
currentElement.append(expChar);
|
||||
if (curretElement != null) {
|
||||
if (curretElement.type == '\'') {
|
||||
curretElement.append(expChar);
|
||||
continue;
|
||||
} else {
|
||||
this.elements.add(currentElement);
|
||||
this.elements.add(curretElement);
|
||||
}
|
||||
}
|
||||
currentElement = new Element();
|
||||
currentElement.type = '=';
|
||||
currentElement.append(expChar);
|
||||
this.elements.add(currentElement);
|
||||
currentElement = null;
|
||||
curretElement = new Element();
|
||||
curretElement.type = '=';
|
||||
curretElement.append(expChar);
|
||||
this.elements.add(curretElement);
|
||||
curretElement = null;
|
||||
break;
|
||||
|
||||
case ' ':
|
||||
if (currentElement != null) {
|
||||
if (currentElement.type == '\'') {
|
||||
currentElement.append(expChar);
|
||||
if (curretElement != null) {
|
||||
if (curretElement.type == '\'') {
|
||||
curretElement.append(expChar);
|
||||
} else {
|
||||
this.elements.add(currentElement);
|
||||
currentElement = null;
|
||||
this.elements.add(curretElement);
|
||||
curretElement = null;
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
case '\'':
|
||||
if (currentElement == null) {
|
||||
currentElement = new Element();
|
||||
currentElement.type = '\'';
|
||||
if (curretElement == null) {
|
||||
curretElement = new Element();
|
||||
curretElement.type = '\'';
|
||||
|
||||
} else if (currentElement.type == '\'') {
|
||||
this.elements.add(currentElement);
|
||||
currentElement = null;
|
||||
} else if (curretElement.type == '\'') {
|
||||
this.elements.add(curretElement);
|
||||
curretElement = null;
|
||||
|
||||
} else {
|
||||
this.elements.add(currentElement);
|
||||
currentElement = new Element();
|
||||
currentElement.type = '\'';
|
||||
this.elements.add(curretElement);
|
||||
curretElement = new Element();
|
||||
curretElement.type = '\'';
|
||||
|
||||
}
|
||||
break;
|
||||
|
||||
case '[':
|
||||
if (currentElement != null) {
|
||||
if (currentElement.type == '\'') {
|
||||
currentElement.append(expChar);
|
||||
if (curretElement != null) {
|
||||
if (curretElement.type == '\'') {
|
||||
curretElement.append(expChar);
|
||||
continue;
|
||||
} else {
|
||||
this.elements.add(currentElement);
|
||||
this.elements.add(curretElement);
|
||||
}
|
||||
}
|
||||
currentElement = new Element();
|
||||
currentElement.type = '[';
|
||||
currentElement.append(expChar);
|
||||
this.elements.add(currentElement);
|
||||
currentElement = null;
|
||||
curretElement = new Element();
|
||||
curretElement.type = '[';
|
||||
curretElement.append(expChar);
|
||||
this.elements.add(curretElement);
|
||||
curretElement = null;
|
||||
break;
|
||||
|
||||
case ']':
|
||||
if (currentElement != null) {
|
||||
if (currentElement.type == '\'') {
|
||||
currentElement.append(expChar);
|
||||
if (curretElement != null) {
|
||||
if (curretElement.type == '\'') {
|
||||
curretElement.append(expChar);
|
||||
continue;
|
||||
} else {
|
||||
this.elements.add(currentElement);
|
||||
this.elements.add(curretElement);
|
||||
}
|
||||
}
|
||||
currentElement = new Element();
|
||||
currentElement.type = ']';
|
||||
currentElement.append(expChar);
|
||||
this.elements.add(currentElement);
|
||||
currentElement = null;
|
||||
curretElement = new Element();
|
||||
curretElement.type = ']';
|
||||
curretElement.append(expChar);
|
||||
this.elements.add(curretElement);
|
||||
curretElement = null;
|
||||
|
||||
break;
|
||||
|
||||
case '{':
|
||||
if (currentElement != null) {
|
||||
if (currentElement.type == '\'') {
|
||||
currentElement.append(expChar);
|
||||
if (curretElement != null) {
|
||||
if (curretElement.type == '\'') {
|
||||
curretElement.append(expChar);
|
||||
continue;
|
||||
} else {
|
||||
this.elements.add(currentElement);
|
||||
this.elements.add(curretElement);
|
||||
}
|
||||
}
|
||||
currentElement = new Element();
|
||||
currentElement.type = '{';
|
||||
currentElement.append(expChar);
|
||||
this.elements.add(currentElement);
|
||||
currentElement = null;
|
||||
curretElement = new Element();
|
||||
curretElement.type = '{';
|
||||
curretElement.append(expChar);
|
||||
this.elements.add(curretElement);
|
||||
curretElement = null;
|
||||
break;
|
||||
|
||||
case '}':
|
||||
if (currentElement != null) {
|
||||
if (currentElement.type == '\'') {
|
||||
currentElement.append(expChar);
|
||||
if (curretElement != null) {
|
||||
if (curretElement.type == '\'') {
|
||||
curretElement.append(expChar);
|
||||
continue;
|
||||
} else {
|
||||
this.elements.add(currentElement);
|
||||
this.elements.add(curretElement);
|
||||
}
|
||||
}
|
||||
currentElement = new Element();
|
||||
currentElement.type = '}';
|
||||
currentElement.append(expChar);
|
||||
this.elements.add(currentElement);
|
||||
currentElement = null;
|
||||
curretElement = new Element();
|
||||
curretElement.type = '}';
|
||||
curretElement.append(expChar);
|
||||
this.elements.add(curretElement);
|
||||
curretElement = null;
|
||||
|
||||
break;
|
||||
case ',':
|
||||
if (currentElement != null) {
|
||||
if (currentElement.type == '\'') {
|
||||
currentElement.append(expChar);
|
||||
if (curretElement != null) {
|
||||
if (curretElement.type == '\'') {
|
||||
curretElement.append(expChar);
|
||||
continue;
|
||||
} else {
|
||||
this.elements.add(currentElement);
|
||||
this.elements.add(curretElement);
|
||||
}
|
||||
}
|
||||
currentElement = new Element();
|
||||
currentElement.type = ',';
|
||||
currentElement.append(expChar);
|
||||
this.elements.add(currentElement);
|
||||
currentElement = null;
|
||||
curretElement = new Element();
|
||||
curretElement.type = ',';
|
||||
curretElement.append(expChar);
|
||||
this.elements.add(curretElement);
|
||||
curretElement = null;
|
||||
|
||||
break;
|
||||
|
||||
default:
|
||||
if (currentElement == null) {
|
||||
currentElement = new Element();
|
||||
currentElement.type = 'F';
|
||||
currentElement.append(expChar);
|
||||
if (curretElement == null) {
|
||||
curretElement = new Element();
|
||||
curretElement.type = 'F';
|
||||
curretElement.append(expChar);
|
||||
|
||||
} else if (currentElement.type == 'F') {
|
||||
currentElement.append(expChar);
|
||||
} else if (curretElement.type == 'F') {
|
||||
curretElement.append(expChar);
|
||||
|
||||
} else if (currentElement.type == '\'') {
|
||||
currentElement.append(expChar);
|
||||
} else if (curretElement.type == '\'') {
|
||||
curretElement.append(expChar);
|
||||
|
||||
} else {
|
||||
this.elements.add(currentElement);
|
||||
currentElement = new Element();
|
||||
currentElement.type = 'F';
|
||||
currentElement.append(expChar);
|
||||
this.elements.add(curretElement);
|
||||
curretElement = new Element();
|
||||
curretElement.type = 'F';
|
||||
curretElement.append(expChar);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (currentElement != null) {
|
||||
this.elements.add(currentElement);
|
||||
if (curretElement != null) {
|
||||
this.elements.add(curretElement);
|
||||
}
|
||||
}
|
||||
|
||||
@ -673,7 +673,7 @@ public class IKQueryExpressionParser {
|
||||
* @author linliangyi
|
||||
* May 20, 2010
|
||||
*/
|
||||
private static class Element {
|
||||
private class Element {
|
||||
char type = 0;
|
||||
StringBuffer eleTextBuff;
|
||||
|
||||
@ -692,9 +692,11 @@ public class IKQueryExpressionParser {
|
||||
|
||||
public static void main(String[] args) {
|
||||
IKQueryExpressionParser parser = new IKQueryExpressionParser();
|
||||
//String ikQueryExp = "newsTitle:'的两款《魔兽世界》插件Bigfoot和月光宝盒'";
|
||||
String ikQueryExp = "(id='ABcdRf' && date:{'20010101','20110101'} && keyword:'魔兽中国') || (content:'KSHT-KSH-A001-18' || ulr='www.ik.com') - name:'林良益'";
|
||||
Query result = parser.parseExp(ikQueryExp);
|
||||
System.out.println(result);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.5.0
|
||||
* IK Analyzer release 8.5.0
|
||||
* IK 中文分词 版本 8.1.1
|
||||
* IK Analyzer release 8.1.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
@ -21,8 +21,8 @@
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.5.0版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.5.0 update by Magese(magese@live.cn)
|
||||
* 8.1.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.1.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.query;
|
||||
@ -45,7 +45,6 @@ import java.util.List;
|
||||
*
|
||||
* @author linliangyi
|
||||
*/
|
||||
@SuppressWarnings("unused")
|
||||
class SWMCQueryBuilder {
|
||||
|
||||
/**
|
||||
@ -119,8 +118,8 @@ class SWMCQueryBuilder {
|
||||
|
||||
//借助lucene queryparser 生成SWMC Query
|
||||
QueryParser qp = new QueryParser(fieldName, new StandardAnalyzer());
|
||||
qp.setAutoGeneratePhraseQueries(false);
|
||||
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
|
||||
qp.setAutoGeneratePhraseQueries(true);
|
||||
|
||||
if ((shortCount * 1.0f / totalCount) > 0.5f) {
|
||||
try {
|
||||
|
86
src/main/java/org/wltea/analyzer/sample/IKAnalzyerDemo.java
Normal file
86
src/main/java/org/wltea/analyzer/sample/IKAnalzyerDemo.java
Normal file
@ -0,0 +1,86 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.1.1
|
||||
* IK Analyzer release 8.1.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
* 源代码由林良益(linliangyi2005@gmail.com)提供
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.1.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.1.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.sample;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.wltea.analyzer.lucene.IKAnalyzer;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
/**
|
||||
* 使用IKAnalyzer进行分词的演示
|
||||
* 2012-10-22
|
||||
*/
|
||||
public class IKAnalzyerDemo {
|
||||
|
||||
public static void main(String[] args) {
|
||||
//构建IK分词器,使用smart分词模式
|
||||
Analyzer analyzer = new IKAnalyzer(true);
|
||||
|
||||
//获取Lucene的TokenStream对象
|
||||
TokenStream ts = null;
|
||||
try {
|
||||
ts = analyzer.tokenStream("myfield", new StringReader("这是一个中文分词的例子,你可以直接运行它!IKAnalyer can analysis english text too"));
|
||||
//获取词元位置属性
|
||||
OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
|
||||
//获取词元文本属性
|
||||
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
|
||||
//获取词元文本属性
|
||||
TypeAttribute type = ts.addAttribute(TypeAttribute.class);
|
||||
|
||||
|
||||
//重置TokenStream(重置StringReader)
|
||||
ts.reset();
|
||||
//迭代获取分词结果
|
||||
while (ts.incrementToken()) {
|
||||
System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type());
|
||||
}
|
||||
//关闭TokenStream(关闭StringReader)
|
||||
ts.end(); // Perform end-of-stream operations, e.g. set the final offset.
|
||||
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
//释放TokenStream的所有资源
|
||||
if (ts != null) {
|
||||
try {
|
||||
ts.close();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,135 @@
|
||||
/*
|
||||
* IK 中文分词 版本 8.1.1
|
||||
* IK Analyzer release 8.1.1
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
* 源代码由林良益(linliangyi2005@gmail.com)提供
|
||||
* 版权声明 2012,乌龙茶工作室
|
||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||
*
|
||||
* 8.1.1版本 由 Magese (magese@live.cn) 更新
|
||||
* release 8.1.1 update by Magese(magese@live.cn)
|
||||
*
|
||||
*/
|
||||
package org.wltea.analyzer.sample;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
import org.apache.lucene.queryparser.classic.ParseException;
|
||||
import org.apache.lucene.queryparser.classic.QueryParser;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.wltea.analyzer.lucene.IKAnalyzer;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
|
||||
/**
|
||||
* 使用IKAnalyzer进行Lucene索引和查询的演示
|
||||
* 2012-3-2
|
||||
* <p>
|
||||
* 以下是结合Lucene4.0 API的写法
|
||||
*/
|
||||
public class LuceneIndexAndSearchDemo {
|
||||
|
||||
|
||||
/**
|
||||
* 模拟:
|
||||
* 创建一个单条记录的索引,并对其进行搜索
|
||||
*
|
||||
*/
|
||||
public static void main(String[] args) {
|
||||
//Lucene Document的域名
|
||||
String fieldName = "text";
|
||||
//检索内容
|
||||
String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";
|
||||
|
||||
//实例化IKAnalyzer分词器
|
||||
Analyzer analyzer = new IKAnalyzer(true);
|
||||
|
||||
Directory directory = null;
|
||||
IndexWriter iwriter;
|
||||
IndexReader ireader = null;
|
||||
IndexSearcher isearcher;
|
||||
try {
|
||||
//建立内存索引对象
|
||||
directory = new RAMDirectory();
|
||||
|
||||
//配置IndexWriterConfig
|
||||
IndexWriterConfig iwConfig = new IndexWriterConfig(analyzer);
|
||||
iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
|
||||
iwriter = new IndexWriter(directory, iwConfig);
|
||||
//写入索引
|
||||
Document doc = new Document();
|
||||
doc.add(new StringField("ID", "10000", Field.Store.YES));
|
||||
doc.add(new TextField(fieldName, text, Field.Store.YES));
|
||||
iwriter.addDocument(doc);
|
||||
iwriter.close();
|
||||
|
||||
|
||||
//搜索过程**********************************
|
||||
//实例化搜索器
|
||||
ireader = DirectoryReader.open(directory);
|
||||
isearcher = new IndexSearcher(ireader);
|
||||
|
||||
String keyword = "中文分词工具包";
|
||||
//使用QueryParser查询分析器构造Query对象
|
||||
QueryParser qp = new QueryParser(fieldName, analyzer);
|
||||
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
|
||||
Query query = qp.parse(keyword);
|
||||
System.out.println("Query = " + query);
|
||||
|
||||
//搜索相似度最高的5条记录
|
||||
TopDocs topDocs = isearcher.search(query, 5);
|
||||
long totalHits = topDocs.totalHits.value;
|
||||
System.out.println("命中:" + totalHits);
|
||||
//输出结果
|
||||
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
|
||||
for (int i = 0; i < totalHits; i++) {
|
||||
Document targetDoc = isearcher.doc(scoreDocs[i].doc);
|
||||
System.out.println("内容:" + targetDoc.toString());
|
||||
}
|
||||
|
||||
} catch (ParseException | IOException e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
if (ireader != null) {
|
||||
try {
|
||||
ireader.close();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
if (directory != null) {
|
||||
try {
|
||||
directory.close();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -2,10 +2,10 @@
|
||||
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
|
||||
<properties>
|
||||
<comment>IK Analyzer 扩展配置</comment>
|
||||
<!-- 配置是否加载默认词典 -->
|
||||
<entry key="use_main_dict">true</entry>
|
||||
<!-- 配置自己的扩展字典,多个用分号分隔 -->
|
||||
<!--用户可以在这里配置自己的扩展字典 -->
|
||||
<entry key="ext_dict">ext.dic;</entry>
|
||||
<!-- 配置自己的扩展停止词字典,多个用分号分隔 -->
|
||||
|
||||
<!--用户可以在这里配置自己的扩展停止词字典-->
|
||||
<entry key="ext_stopwords">stopword.dic;</entry>
|
||||
|
||||
</properties>
|
@ -1,3 +1,3 @@
|
||||
Wed Aug 01 00:00:00 CST 2021
|
||||
Wed Aug 01 11:21:30 CST 2018
|
||||
files=dynamicdic.txt
|
||||
lastupdate=0
|
||||
|
Loading…
x
Reference in New Issue
Block a user