2021-12-23 17:09:21 +08:00

321 lines
10 KiB
Java
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* IK 中文分词 版本 8.5.0
* IK Analyzer release 8.5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.5.0版本 由 Magese (magese@live.cn) 更新
* release 8.5.0 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.dic;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
/**
* 词典树分段,表示词典树的一个分枝
*/
@SuppressWarnings("unused")
class DictSegment implements Comparable<DictSegment> {
//公用字典表,存储汉字
private static final Map<Character, Character> charMap = new HashMap<>(16, 0.95f);
//数组大小上限
private static final int ARRAY_LENGTH_LIMIT = 3;
//Map存储结构
private Map<Character, DictSegment> childrenMap;
//数组方式存储结构
private DictSegment[] childrenArray;
//当前节点上存储的字符
private Character nodeChar;
//当前节点存储的Segment数目
//storeSize <=ARRAY_LENGTH_LIMIT ,使用数组存储, storeSize >ARRAY_LENGTH_LIMIT ,则使用Map存储
private int storeSize = 0;
//当前DictSegment状态 ,默认 0 , 1表示从根节点到当前节点的路径表示一个词
private int nodeState = 0;
DictSegment(Character nodeChar) {
if (nodeChar == null) {
throw new IllegalArgumentException("参数为空异常,字符不能为空");
}
this.nodeChar = nodeChar;
}
Character getNodeChar() {
return nodeChar;
}
/*
* 判断是否有下一个节点
*/
private boolean hasNextNode() {
return this.storeSize > 0;
}
/**
* 匹配词段
*
* @return Hit
*/
Hit match(char[] charArray) {
return this.match(charArray, 0, charArray.length, null);
}
/**
* 匹配词段
*
* @return Hit
*/
Hit match(char[] charArray, int begin, int length) {
return this.match(charArray, begin, length, null);
}
/**
* 匹配词段
*
* @return Hit
*/
Hit match(char[] charArray, int begin, int length, Hit searchHit) {
if (searchHit == null) {
// 如果hit为空新建
searchHit = new Hit();
// 设置hit的其实文本位置
searchHit.setBegin(begin);
} else {
// 否则要将HIT状态重置
searchHit.setUnmatch();
}
// 设置hit的当前处理位置
searchHit.setEnd(begin);
Character keyChar = charArray[begin];
DictSegment ds = null;
// 引用实例变量为本地变量,避免查询时遇到更新的同步问题
DictSegment[] segmentArray = this.childrenArray;
Map<Character, DictSegment> segmentMap = this.childrenMap;
// STEP1 在节点中查找keyChar对应的DictSegment
if (segmentArray != null) {
// 在数组中查找
DictSegment keySegment = new DictSegment(keyChar);
int position = Arrays.binarySearch(segmentArray, 0, this.storeSize, keySegment);
if (position >= 0) {
ds = segmentArray[position];
}
} else if (segmentMap != null) {
// 在map中查找
ds = segmentMap.get(keyChar);
}
// STEP2 找到DictSegment判断词的匹配状态是否继续递归还是返回结果
if (ds != null) {
if (length > 1) {
// 词未匹配完,继续往下搜索
return ds.match(charArray, begin + 1, length - 1, searchHit);
} else if (length == 1) {
// 搜索最后一个char
if (ds.nodeState == 1) {
// 添加HIT状态为完全匹配
searchHit.setMatch();
}
if (ds.hasNextNode()) {
// 添加HIT状态为前缀匹配
searchHit.setPrefix();
// 记录当前位置的DictSegment
searchHit.setMatchedDictSegment(ds);
}
return searchHit;
}
}
// STEP3 没有找到DictSegment 将HIT设置为不匹配
return searchHit;
}
/**
* 加载填充词典片段
*/
void fillSegment(char[] charArray) {
this.fillSegment(charArray, 0, charArray.length, 1);
}
/**
* 屏蔽词典中的一个词
*/
void disableSegment(char[] charArray) {
this.fillSegment(charArray, 0, charArray.length, 0);
}
/**
* 加载填充词典片段
*/
private synchronized void fillSegment(char[] charArray, int begin, int length, int enabled) {
// 获取字典表中的汉字对象
Character beginChar = charArray[begin];
Character keyChar = charMap.get(beginChar);
// 字典中没有该字,则将其添加入字典
if (keyChar == null) {
charMap.put(beginChar, beginChar);
keyChar = beginChar;
}
// 搜索当前节点的存储查询对应keyChar的keyChar如果没有则创建
DictSegment ds = lookforSegment(keyChar, enabled);
if (ds != null) {
// 处理keyChar对应的segment
if (length > 1) {
// 词元还没有完全加入词典树
ds.fillSegment(charArray, begin + 1, length - 1, enabled);
} else if (length == 1) {
// 已经是词元的最后一个char,设置当前节点状态为enabled
// enabled=1表明一个完整的词enabled=0表示从词典中屏蔽当前词
ds.nodeState = enabled;
}
}
}
/**
* 查找本节点下对应的keyChar的segment *
*
* @param create =1如果没有找到则创建新的segment ; =0如果没有找到不创建返回null
*/
private DictSegment lookforSegment(Character keyChar, int create) {
DictSegment ds = null;
if (this.storeSize <= ARRAY_LENGTH_LIMIT) {
// 获取数组容器,如果数组未创建则创建数组
DictSegment[] segmentArray = getChildrenArray();
// 搜寻数组
DictSegment keySegment = new DictSegment(keyChar);
int position = Arrays.binarySearch(segmentArray, 0, this.storeSize, keySegment);
if (position >= 0) {
ds = segmentArray[position];
}
// 遍历数组后没有找到对应的segment
if (ds == null && create == 1) {
ds = keySegment;
if (this.storeSize < ARRAY_LENGTH_LIMIT) {
// 数组容量未满,使用数组存储
segmentArray[this.storeSize] = ds;
// segment数目+1
this.storeSize++;
Arrays.sort(segmentArray, 0, this.storeSize);
} else {
// 数组容量已满切换Map存储
// 获取Map容器如果Map未创建,则创建Map
Map<Character, DictSegment> segmentMap = getChildrenMap();
// 将数组中的segment迁移到Map中
migrate(segmentArray, segmentMap);
// 存储新的segment
segmentMap.put(keyChar, ds);
// segment数目+1 必须在释放数组前执行storeSize++ 确保极端情况下,不会取到空的数组
this.storeSize++;
// 释放当前的数组引用
this.childrenArray = null;
}
}
} else {
// 获取Map容器如果Map未创建,则创建Map
Map<Character, DictSegment> segmentMap = getChildrenMap();
// 搜索Map
ds = segmentMap.get(keyChar);
if (ds == null && create == 1) {
// 构造新的segment
ds = new DictSegment(keyChar);
segmentMap.put(keyChar, ds);
// 当前节点存储segment数目+1
this.storeSize++;
}
}
return ds;
}
/**
* 获取数组容器
* 线程同步方法
*/
private DictSegment[] getChildrenArray() {
if (this.childrenArray == null) {
synchronized (this) {
if (this.childrenArray == null) {
this.childrenArray = new DictSegment[ARRAY_LENGTH_LIMIT];
}
}
}
return this.childrenArray;
}
/**
* 获取Map容器
* 线程同步方法
*/
private Map<Character, DictSegment> getChildrenMap() {
if (this.childrenMap == null) {
synchronized (this) {
if (this.childrenMap == null) {
this.childrenMap = new HashMap<>(ARRAY_LENGTH_LIMIT * 2, 0.8f);
}
}
}
return this.childrenMap;
}
/**
* 将数组中的segment迁移到Map中
*/
private void migrate(DictSegment[] segmentArray, Map<Character, DictSegment> segmentMap) {
for (DictSegment segment : segmentArray) {
if (segment != null) {
segmentMap.put(segment.nodeChar, segment);
}
}
}
/**
* 实现Comparable接口
*
* @return int
*/
public int compareTo(DictSegment o) {
// 对当前节点存储的char进行比较
return this.nodeChar.compareTo(o.nodeChar);
}
}