2018-05-10 13:52:58 +08:00
|
|
|
|
/*
|
2021-12-23 17:09:21 +08:00
|
|
|
|
* IK 中文分词 版本 8.5.0
|
|
|
|
|
* IK Analyzer release 8.5.0
|
2018-11-15 11:05:24 +08:00
|
|
|
|
*
|
|
|
|
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
|
|
|
* contributor license agreements. See the NOTICE file distributed with
|
|
|
|
|
* this work for additional information regarding copyright ownership.
|
|
|
|
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
|
|
|
* (the "License"); you may not use this file except in compliance with
|
|
|
|
|
* the License. You may obtain a copy of the License at
|
|
|
|
|
*
|
|
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
*
|
|
|
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
* See the License for the specific language governing permissions and
|
|
|
|
|
* limitations under the License.
|
|
|
|
|
*
|
|
|
|
|
* 源代码由林良益(linliangyi2005@gmail.com)提供
|
|
|
|
|
* 版权声明 2012,乌龙茶工作室
|
|
|
|
|
* provided by Linliangyi and copyright 2012 by Oolong studio
|
|
|
|
|
*
|
2021-12-23 17:09:21 +08:00
|
|
|
|
* 8.5.0版本 由 Magese (magese@live.cn) 更新
|
|
|
|
|
* release 8.5.0 update by Magese(magese@live.cn)
|
2018-11-15 11:05:24 +08:00
|
|
|
|
*
|
2018-05-10 13:52:58 +08:00
|
|
|
|
*/
|
|
|
|
|
package org.wltea.analyzer.dic;
|
|
|
|
|
|
|
|
|
|
import java.util.Arrays;
|
|
|
|
|
import java.util.HashMap;
|
|
|
|
|
import java.util.Map;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 词典树分段,表示词典树的一个分枝
|
|
|
|
|
*/
|
|
|
|
|
@SuppressWarnings("unused")
|
|
|
|
|
class DictSegment implements Comparable<DictSegment> {
|
|
|
|
|
|
|
|
|
|
//公用字典表,存储汉字
|
|
|
|
|
private static final Map<Character, Character> charMap = new HashMap<>(16, 0.95f);
|
|
|
|
|
//数组大小上限
|
|
|
|
|
private static final int ARRAY_LENGTH_LIMIT = 3;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//Map存储结构
|
|
|
|
|
private Map<Character, DictSegment> childrenMap;
|
|
|
|
|
//数组方式存储结构
|
|
|
|
|
private DictSegment[] childrenArray;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//当前节点上存储的字符
|
|
|
|
|
private Character nodeChar;
|
|
|
|
|
//当前节点存储的Segment数目
|
|
|
|
|
//storeSize <=ARRAY_LENGTH_LIMIT ,使用数组存储, storeSize >ARRAY_LENGTH_LIMIT ,则使用Map存储
|
|
|
|
|
private int storeSize = 0;
|
|
|
|
|
//当前DictSegment状态 ,默认 0 , 1表示从根节点到当前节点的路径表示一个词
|
|
|
|
|
private int nodeState = 0;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
DictSegment(Character nodeChar) {
|
|
|
|
|
if (nodeChar == null) {
|
|
|
|
|
throw new IllegalArgumentException("参数为空异常,字符不能为空");
|
|
|
|
|
}
|
|
|
|
|
this.nodeChar = nodeChar;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Character getNodeChar() {
|
|
|
|
|
return nodeChar;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* 判断是否有下一个节点
|
|
|
|
|
*/
|
|
|
|
|
private boolean hasNextNode() {
|
|
|
|
|
return this.storeSize > 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 匹配词段
|
|
|
|
|
*
|
|
|
|
|
* @return Hit
|
|
|
|
|
*/
|
|
|
|
|
Hit match(char[] charArray) {
|
|
|
|
|
return this.match(charArray, 0, charArray.length, null);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 匹配词段
|
|
|
|
|
*
|
|
|
|
|
* @return Hit
|
|
|
|
|
*/
|
|
|
|
|
Hit match(char[] charArray, int begin, int length) {
|
|
|
|
|
return this.match(charArray, begin, length, null);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 匹配词段
|
|
|
|
|
*
|
|
|
|
|
* @return Hit
|
|
|
|
|
*/
|
|
|
|
|
Hit match(char[] charArray, int begin, int length, Hit searchHit) {
|
|
|
|
|
|
|
|
|
|
if (searchHit == null) {
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 如果hit为空,新建
|
2018-05-10 13:52:58 +08:00
|
|
|
|
searchHit = new Hit();
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 设置hit的其实文本位置
|
2018-05-10 13:52:58 +08:00
|
|
|
|
searchHit.setBegin(begin);
|
|
|
|
|
} else {
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 否则要将HIT状态重置
|
2018-05-10 13:52:58 +08:00
|
|
|
|
searchHit.setUnmatch();
|
|
|
|
|
}
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 设置hit的当前处理位置
|
2018-05-10 13:52:58 +08:00
|
|
|
|
searchHit.setEnd(begin);
|
|
|
|
|
|
|
|
|
|
Character keyChar = charArray[begin];
|
|
|
|
|
DictSegment ds = null;
|
|
|
|
|
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 引用实例变量为本地变量,避免查询时遇到更新的同步问题
|
2018-05-10 13:52:58 +08:00
|
|
|
|
DictSegment[] segmentArray = this.childrenArray;
|
|
|
|
|
Map<Character, DictSegment> segmentMap = this.childrenMap;
|
|
|
|
|
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// STEP1 在节点中查找keyChar对应的DictSegment
|
2018-05-10 13:52:58 +08:00
|
|
|
|
if (segmentArray != null) {
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 在数组中查找
|
2018-05-10 13:52:58 +08:00
|
|
|
|
DictSegment keySegment = new DictSegment(keyChar);
|
|
|
|
|
int position = Arrays.binarySearch(segmentArray, 0, this.storeSize, keySegment);
|
|
|
|
|
if (position >= 0) {
|
|
|
|
|
ds = segmentArray[position];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
} else if (segmentMap != null) {
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 在map中查找
|
2018-05-10 13:52:58 +08:00
|
|
|
|
ds = segmentMap.get(keyChar);
|
|
|
|
|
}
|
|
|
|
|
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// STEP2 找到DictSegment,判断词的匹配状态,是否继续递归,还是返回结果
|
2018-05-10 13:52:58 +08:00
|
|
|
|
if (ds != null) {
|
|
|
|
|
if (length > 1) {
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 词未匹配完,继续往下搜索
|
2018-05-10 13:52:58 +08:00
|
|
|
|
return ds.match(charArray, begin + 1, length - 1, searchHit);
|
|
|
|
|
} else if (length == 1) {
|
|
|
|
|
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 搜索最后一个char
|
2018-05-10 13:52:58 +08:00
|
|
|
|
if (ds.nodeState == 1) {
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 添加HIT状态为完全匹配
|
2018-05-10 13:52:58 +08:00
|
|
|
|
searchHit.setMatch();
|
|
|
|
|
}
|
|
|
|
|
if (ds.hasNextNode()) {
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 添加HIT状态为前缀匹配
|
2018-05-10 13:52:58 +08:00
|
|
|
|
searchHit.setPrefix();
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 记录当前位置的DictSegment
|
2018-05-10 13:52:58 +08:00
|
|
|
|
searchHit.setMatchedDictSegment(ds);
|
|
|
|
|
}
|
|
|
|
|
return searchHit;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// STEP3 没有找到DictSegment, 将HIT设置为不匹配
|
2018-05-10 13:52:58 +08:00
|
|
|
|
return searchHit;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 加载填充词典片段
|
|
|
|
|
*/
|
|
|
|
|
void fillSegment(char[] charArray) {
|
|
|
|
|
this.fillSegment(charArray, 0, charArray.length, 1);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 屏蔽词典中的一个词
|
|
|
|
|
*/
|
|
|
|
|
void disableSegment(char[] charArray) {
|
|
|
|
|
this.fillSegment(charArray, 0, charArray.length, 0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 加载填充词典片段
|
|
|
|
|
*/
|
|
|
|
|
private synchronized void fillSegment(char[] charArray, int begin, int length, int enabled) {
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 获取字典表中的汉字对象
|
2018-05-10 13:52:58 +08:00
|
|
|
|
Character beginChar = charArray[begin];
|
|
|
|
|
Character keyChar = charMap.get(beginChar);
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 字典中没有该字,则将其添加入字典
|
2018-05-10 13:52:58 +08:00
|
|
|
|
if (keyChar == null) {
|
|
|
|
|
charMap.put(beginChar, beginChar);
|
|
|
|
|
keyChar = beginChar;
|
|
|
|
|
}
|
|
|
|
|
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 搜索当前节点的存储,查询对应keyChar的keyChar,如果没有则创建
|
2018-05-10 13:52:58 +08:00
|
|
|
|
DictSegment ds = lookforSegment(keyChar, enabled);
|
|
|
|
|
if (ds != null) {
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 处理keyChar对应的segment
|
2018-05-10 13:52:58 +08:00
|
|
|
|
if (length > 1) {
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 词元还没有完全加入词典树
|
2018-05-10 13:52:58 +08:00
|
|
|
|
ds.fillSegment(charArray, begin + 1, length - 1, enabled);
|
|
|
|
|
} else if (length == 1) {
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 已经是词元的最后一个char,设置当前节点状态为enabled,
|
|
|
|
|
// enabled=1表明一个完整的词,enabled=0表示从词典中屏蔽当前词
|
2018-05-10 13:52:58 +08:00
|
|
|
|
ds.nodeState = enabled;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 查找本节点下对应的keyChar的segment *
|
|
|
|
|
*
|
|
|
|
|
* @param create =1如果没有找到,则创建新的segment ; =0如果没有找到,不创建,返回null
|
|
|
|
|
*/
|
|
|
|
|
private DictSegment lookforSegment(Character keyChar, int create) {
|
|
|
|
|
|
|
|
|
|
DictSegment ds = null;
|
|
|
|
|
|
|
|
|
|
if (this.storeSize <= ARRAY_LENGTH_LIMIT) {
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 获取数组容器,如果数组未创建则创建数组
|
2018-05-10 13:52:58 +08:00
|
|
|
|
DictSegment[] segmentArray = getChildrenArray();
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 搜寻数组
|
2018-05-10 13:52:58 +08:00
|
|
|
|
DictSegment keySegment = new DictSegment(keyChar);
|
|
|
|
|
int position = Arrays.binarySearch(segmentArray, 0, this.storeSize, keySegment);
|
|
|
|
|
if (position >= 0) {
|
|
|
|
|
ds = segmentArray[position];
|
|
|
|
|
}
|
|
|
|
|
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 遍历数组后没有找到对应的segment
|
2018-05-10 13:52:58 +08:00
|
|
|
|
if (ds == null && create == 1) {
|
|
|
|
|
ds = keySegment;
|
|
|
|
|
if (this.storeSize < ARRAY_LENGTH_LIMIT) {
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 数组容量未满,使用数组存储
|
2018-05-10 13:52:58 +08:00
|
|
|
|
segmentArray[this.storeSize] = ds;
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// segment数目+1
|
2018-05-10 13:52:58 +08:00
|
|
|
|
this.storeSize++;
|
|
|
|
|
Arrays.sort(segmentArray, 0, this.storeSize);
|
|
|
|
|
|
|
|
|
|
} else {
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 数组容量已满,切换Map存储
|
|
|
|
|
// 获取Map容器,如果Map未创建,则创建Map
|
2018-05-10 13:52:58 +08:00
|
|
|
|
Map<Character, DictSegment> segmentMap = getChildrenMap();
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 将数组中的segment迁移到Map中
|
2018-05-10 13:52:58 +08:00
|
|
|
|
migrate(segmentArray, segmentMap);
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 存储新的segment
|
2018-05-10 13:52:58 +08:00
|
|
|
|
segmentMap.put(keyChar, ds);
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// segment数目+1 , 必须在释放数组前执行storeSize++ , 确保极端情况下,不会取到空的数组
|
2018-05-10 13:52:58 +08:00
|
|
|
|
this.storeSize++;
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 释放当前的数组引用
|
2018-05-10 13:52:58 +08:00
|
|
|
|
this.childrenArray = null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
} else {
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 获取Map容器,如果Map未创建,则创建Map
|
2018-05-10 13:52:58 +08:00
|
|
|
|
Map<Character, DictSegment> segmentMap = getChildrenMap();
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 搜索Map
|
2018-05-10 13:52:58 +08:00
|
|
|
|
ds = segmentMap.get(keyChar);
|
|
|
|
|
if (ds == null && create == 1) {
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 构造新的segment
|
2018-05-10 13:52:58 +08:00
|
|
|
|
ds = new DictSegment(keyChar);
|
|
|
|
|
segmentMap.put(keyChar, ds);
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 当前节点存储segment数目+1
|
2018-05-10 13:52:58 +08:00
|
|
|
|
this.storeSize++;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return ds;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 获取数组容器
|
|
|
|
|
* 线程同步方法
|
|
|
|
|
*/
|
|
|
|
|
private DictSegment[] getChildrenArray() {
|
|
|
|
|
if (this.childrenArray == null) {
|
|
|
|
|
synchronized (this) {
|
|
|
|
|
if (this.childrenArray == null) {
|
|
|
|
|
this.childrenArray = new DictSegment[ARRAY_LENGTH_LIMIT];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return this.childrenArray;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 获取Map容器
|
|
|
|
|
* 线程同步方法
|
|
|
|
|
*/
|
|
|
|
|
private Map<Character, DictSegment> getChildrenMap() {
|
|
|
|
|
if (this.childrenMap == null) {
|
|
|
|
|
synchronized (this) {
|
|
|
|
|
if (this.childrenMap == null) {
|
|
|
|
|
this.childrenMap = new HashMap<>(ARRAY_LENGTH_LIMIT * 2, 0.8f);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return this.childrenMap;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 将数组中的segment迁移到Map中
|
|
|
|
|
*/
|
|
|
|
|
private void migrate(DictSegment[] segmentArray, Map<Character, DictSegment> segmentMap) {
|
|
|
|
|
for (DictSegment segment : segmentArray) {
|
|
|
|
|
if (segment != null) {
|
|
|
|
|
segmentMap.put(segment.nodeChar, segment);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 实现Comparable接口
|
|
|
|
|
*
|
|
|
|
|
* @return int
|
|
|
|
|
*/
|
|
|
|
|
public int compareTo(DictSegment o) {
|
2018-08-23 09:44:57 +08:00
|
|
|
|
// 对当前节点存储的char进行比较
|
2018-05-10 13:52:58 +08:00
|
|
|
|
return this.nodeChar.compareTo(o.nodeChar);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|