Format code and comments
commit 92cb2a28d6
parent f173925dc0
@@ -36,21 +36,21 @@ import java.util.List;
 import java.util.Set;

 /**
  *
  * 中文数量词子分词器
  */
-class CN_QuantifierSegmenter implements ISegmenter{
+class CN_QuantifierSegmenter implements ISegmenter {

-    //子分词器标签
+    // 子分词器标签
     private static final String SEGMENTER_NAME = "QUAN_SEGMENTER";

     private static Set<Character> ChnNumberChars = new HashSet<>();
-    static{
-        //中文数词
-        //Cnum
+    static {
+        // 中文数词
+        // Cnum
         String chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";
         char[] ca = chn_Num.toCharArray();
-        for(char nChar : ca){
+        for (char nChar : ca) {
             ChnNumberChars.add(nChar);
         }
     }
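The static block above only seeds a lookup set; deciding whether a character is a Chinese numeral is then a plain Set.contains check. A minimal standalone sketch of the same idea, reusing the character string from the hunk (the class name and the driver below are invented for illustration, not part of the commit):

import java.util.HashSet;
import java.util.Set;

// Standalone sketch: report which characters of an input are Chinese numerals,
// using the same character inventory that CN_QuantifierSegmenter loads in its static block.
class ChnNumberCheck {
    private static final Set<Character> CHN_NUMBER_CHARS = new HashSet<>();
    static {
        String chnNum = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";
        for (char c : chnNum.toCharArray()) {
            CHN_NUMBER_CHARS.add(c);
        }
    }

    public static void main(String[] args) {
        String input = "共三十五人";
        for (char c : input.toCharArray()) {
            // Prints true only for 三, 十, 五 with this input.
            System.out.println(c + " -> " + CHN_NUMBER_CHARS.contains(c));
        }
    }
}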
@@ -67,11 +67,11 @@ class CN_QuantifierSegmenter implements ISegmenter{
      */
     private int nEnd;

-    //待处理的量词hit队列
+    // 待处理的量词hit队列
     private List<Hit> countHits;

-    CN_QuantifierSegmenter(){
+    CN_QuantifierSegmenter() {
         nStart = -1;
         nEnd = -1;
         this.countHits = new LinkedList<>();
@@ -81,16 +81,16 @@ class CN_QuantifierSegmenter implements ISegmenter{
      * 分词
      */
     public void analyze(AnalyzeContext context) {
-        //处理中文数词
+        // 处理中文数词
         this.processCNumber(context);
-        //处理中文量词
+        // 处理中文量词
         this.processCount(context);

-        //判断是否锁定缓冲区
-        if(this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()){
-            //对缓冲区解锁
+        // 判断是否锁定缓冲区
+        if (this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()) {
+            // 对缓冲区解锁
             context.unlockBuffer(SEGMENTER_NAME);
-        }else{
+        } else {
             context.lockBuffer(SEGMENTER_NAME);
         }
     }
@@ -108,34 +108,34 @@ class CN_QuantifierSegmenter implements ISegmenter{
     /**
      * 处理数词
      */
-    private void processCNumber(AnalyzeContext context){
-        if(nStart == -1 && nEnd == -1){//初始状态
-            if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
-                    && ChnNumberChars.contains(context.getCurrentChar())){
-                //记录数词的起始、结束位置
+    private void processCNumber(AnalyzeContext context) {
+        if (nStart == -1 && nEnd == -1) {// 初始状态
+            if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
+                    && ChnNumberChars.contains(context.getCurrentChar())) {
+                // 记录数词的起始、结束位置
                 nStart = context.getCursor();
                 nEnd = context.getCursor();
             }
-        }else{//正在处理状态
-            if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
-                    && ChnNumberChars.contains(context.getCurrentChar())){
-                //记录数词的结束位置
+        } else {// 正在处理状态
+            if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
+                    && ChnNumberChars.contains(context.getCurrentChar())) {
+                // 记录数词的结束位置
                 nEnd = context.getCursor();
-            }else{
-                //输出数词
+            } else {
+                // 输出数词
                 this.outputNumLexeme(context);
-                //重置头尾指针
+                // 重置头尾指针
                 nStart = -1;
                 nEnd = -1;
             }
         }

-        //缓冲区已经用完,还有尚未输出的数词
-        if(context.isBufferConsumed()){
-            if(nStart != -1 && nEnd != -1){
-                //输出数词
+        // 缓冲区已经用完,还有尚未输出的数词
+        if (context.isBufferConsumed()) {
+            if (nStart != -1 && nEnd != -1) {
+                // 输出数词
                 outputNumLexeme(context);
-                //重置头尾指针
+                // 重置头尾指针
                 nStart = -1;
                 nEnd = -1;
             }
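processCNumber above is a small run scanner: nStart/nEnd mark the current stretch of numeral characters, and a CNUM lexeme is emitted when the run breaks or when the buffer is consumed. A rough standalone illustration of that bookkeeping over a plain string, without the AnalyzeContext machinery (the class, helper, and input below are made up for the sketch):

// Sketch of the nStart/nEnd run tracking used by processCNumber, applied to a plain string.
// Prints each maximal run of Chinese numeral characters.
class CNumRunScanner {
    static boolean isChnNumber(char c) {
        return "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿".indexOf(c) >= 0;
    }

    public static void main(String[] args) {
        String buff = "买了三十五个苹果和两箱梨";
        int nStart = -1, nEnd = -1;
        for (int cursor = 0; cursor < buff.length(); cursor++) {
            if (isChnNumber(buff.charAt(cursor))) {
                if (nStart == -1) nStart = cursor; // start of a numeral run
                nEnd = cursor;                     // extend the run
            } else if (nStart != -1) {
                // run broken: emit it, then reset the pointers
                System.out.println("CNUM: " + buff.substring(nStart, nEnd + 1));
                nStart = -1;
                nEnd = -1;
            }
        }
        if (nStart != -1) {
            // buffer consumed with a pending run, mirroring the isBufferConsumed() branch
            System.out.println("CNUM: " + buff.substring(nStart, nEnd + 1));
        }
    }
}

With this input it prints "CNUM: 三十五" and "CNUM: 两".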
@@ -144,66 +144,67 @@ class CN_QuantifierSegmenter implements ISegmenter{

     /**
      * 处理中文量词
      *
      * @param context 需要处理的内容
      */
-    private void processCount(AnalyzeContext context){
+    private void processCount(AnalyzeContext context) {
         // 判断是否需要启动量词扫描
-        if(!this.needCountScan(context)){
+        if (!this.needCountScan(context)) {
             return;
         }

-        if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()){
+        if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()) {

-            //优先处理countHits中的hit
-            if(!this.countHits.isEmpty()){
-                //处理词段队列
+            // 优先处理countHits中的hit
+            if (!this.countHits.isEmpty()) {
+                // 处理词段队列
                 Hit[] tmpArray = this.countHits.toArray(new Hit[0]);
-                for(Hit hit : tmpArray){
-                    hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
-                    if(hit.isMatch()){
-                        //输出当前的词
-                        Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT);
+                for (Hit hit : tmpArray) {
+                    hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor(), hit);
+                    if (hit.isMatch()) {
+                        // 输出当前的词
+                        Lexeme newLexeme = new Lexeme(context.getBufferOffset(), hit.getBegin(), context.getCursor() - hit.getBegin() + 1, Lexeme.TYPE_COUNT);
                         context.addLexeme(newLexeme);

-                        if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除
+                        if (!hit.isPrefix()) {// 不是词前缀,hit不需要继续匹配,移除
                             this.countHits.remove(hit);
                         }

-                    }else if(hit.isUnmatch()){
-                        //hit不是词,移除
+                    } else if (hit.isUnmatch()) {
+                        // hit不是词,移除
                         this.countHits.remove(hit);
                     }
                 }
             }

-            //*********************************
-            //对当前指针位置的字符进行单字匹配
+            // *********************************
+            // 对当前指针位置的字符进行单字匹配
             Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
-            if(singleCharHit.isMatch()){//首字成量词词
-                //输出当前的词
-                Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT);
+            if (singleCharHit.isMatch()) {// 首字成量词词
+                // 输出当前的词
+                Lexeme newLexeme = new Lexeme(context.getBufferOffset(), context.getCursor(), 1, Lexeme.TYPE_COUNT);
                 context.addLexeme(newLexeme);

-                //同时也是词前缀
-                if(singleCharHit.isPrefix()){
-                    //前缀匹配则放入hit列表
+                // 同时也是词前缀
+                if (singleCharHit.isPrefix()) {
+                    // 前缀匹配则放入hit列表
                     this.countHits.add(singleCharHit);
                 }
-            }else if(singleCharHit.isPrefix()){//首字为量词前缀
-                //前缀匹配则放入hit列表
+            } else if (singleCharHit.isPrefix()) {// 首字为量词前缀
+                // 前缀匹配则放入hit列表
                 this.countHits.add(singleCharHit);
             }


-        }else{
-            //输入的不是中文字符
-            //清空未成形的量词
+        } else {
+            // 输入的不是中文字符
+            // 清空未成形的量词
             this.countHits.clear();
         }

-        //缓冲区数据已经读完,还有尚未输出的量词
-        if(context.isBufferConsumed()){
-            //清空未成形的量词
+        // 缓冲区数据已经读完,还有尚未输出的量词
+        if (context.isBufferConsumed()) {
+            // 清空未成形的量词
             this.countHits.clear();
         }
     }
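processCount keeps partially matched dictionary hits in countHits so that multi-character quantifiers (e.g. 人次) can be completed on later characters, while single-character quantifiers are emitted immediately. A toy illustration of that match-versus-prefix bookkeeping against a tiny hard-coded word list (no real Dictionary or Hit objects involved; every name below is invented for the sketch):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// Toy prefix matcher: walks a string one character at a time and keeps "open" candidate
// matches alive, the way processCount keeps prefix Hits in countHits.
class ToyQuantifierMatch {
    static final List<String> DICT = Arrays.asList("个", "人", "人次", "公斤");

    public static void main(String[] args) {
        String buff = "三人次";
        List<Integer> openStarts = new ArrayList<>(); // start offsets of still-viable candidates
        for (int cursor = 0; cursor < buff.length(); cursor++) {
            openStarts.add(cursor); // the single char at the cursor is always a new candidate
            List<Integer> stillOpen = new ArrayList<>();
            for (int begin : openStarts) {
                String cand = buff.substring(begin, cursor + 1);
                if (DICT.contains(cand)) {
                    System.out.println("COUNT: " + cand); // full match: emit a lexeme
                }
                // keep the candidate only if some longer dictionary word still starts with it
                boolean isPrefix = DICT.stream().anyMatch(w -> w.length() > cand.length() && w.startsWith(cand));
                if (isPrefix) {
                    stillOpen.add(begin);
                }
            }
            openStarts = stillOpen;
        }
    }
}

For the input 三人次 it prints "COUNT: 人" and then "COUNT: 人次", mirroring how a prefix hit stays queued until the longer quantifier completes.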
@@ -211,15 +212,15 @@ class CN_QuantifierSegmenter implements ISegmenter{
     /**
      * 判断是否需要扫描量词
      */
-    private boolean needCountScan(AnalyzeContext context){
-        if((nStart != -1 && nEnd != -1 ) || !countHits.isEmpty()){
-            //正在处理中文数词,或者正在处理量词
+    private boolean needCountScan(AnalyzeContext context) {
+        if ((nStart != -1 && nEnd != -1) || !countHits.isEmpty()) {
+            // 正在处理中文数词,或者正在处理量词
             return true;
-        }else{
-            //找到一个相邻的数词
-            if(!context.getOrgLexemes().isEmpty()){
+        } else {
+            // 找到一个相邻的数词
+            if (!context.getOrgLexemes().isEmpty()) {
                 Lexeme l = context.getOrgLexemes().peekLast();
-                if(Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType()){
+                if (Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType()) {
                     return l.getBegin() + l.getLength() == context.getCursor();
                 }
             }
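The adjacency test in needCountScan fires only when the previous numeral lexeme ends exactly at the current cursor, i.e. the quantifier must immediately follow the number. A tiny arithmetic check of that condition (offsets invented for illustration):

// If a CNUM lexeme such as 三十五 starts at offset 4 with length 3, it covers offsets 4..6,
// so begin + length == cursor holds only when the cursor sits at 7, right after the numeral.
class AdjacencyCheck {
    public static void main(String[] args) {
        int begin = 4, length = 3;
        System.out.println(begin + length == 7); // true: quantifier scan starts
        System.out.println(begin + length == 8); // false: a character intervenes, no scan
    }
}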
@@ -229,12 +230,13 @@ class CN_QuantifierSegmenter implements ISegmenter{

     /**
      * 添加数词词元到结果集
      *
      * @param context 需要添加的词元
      */
-    private void outputNumLexeme(AnalyzeContext context){
-        if(nStart > -1 && nEnd > -1){
-            //输出数词
-            Lexeme newLexeme = new Lexeme(context.getBufferOffset() , nStart , nEnd - nStart + 1 , Lexeme.TYPE_CNUM);
+    private void outputNumLexeme(AnalyzeContext context) {
+        if (nStart > -1 && nEnd > -1) {
+            // 输出数词
+            Lexeme newLexeme = new Lexeme(context.getBufferOffset(), nStart, nEnd - nStart + 1, Lexeme.TYPE_CNUM);
             context.addLexeme(newLexeme);
         }
     }