/*
 * IK Analyzer (IK Chinese word segmenter), release 8.4.0
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Source code provided by Lin Liangyi (linliangyi2005@gmail.com)
 * Copyright 2012, Oolong Studio
 *
 * Release 8.4.0 updated by Magese (magese@live.cn)
 */
package org.wltea.analyzer.lucene;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeFactory;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.IOException;

/**
 * Lucene Tokenizer adapter for the IK Analyzer.
 * Compatible with Lucene 4.0.
 */
@SuppressWarnings("unused")
public final class IKTokenizer extends Tokenizer {

    // The IK segmenter implementation
    private IKSegmenter _IKImplement;

    // Lexeme text attribute
    private CharTermAttribute termAtt;
    // Lexeme offset attribute
    private OffsetAttribute offsetAtt;
    // Lexeme type attribute (see the type constants in org.wltea.analyzer.core.Lexeme)
    private TypeAttribute typeAtt;
    // End position of the last lexeme emitted
    private int endPosition;

    /**
     * Constructor for the Lucene 7.6 Tokenizer adapter.
     */
    public IKTokenizer() {
        this(false);
    }

    IKTokenizer(boolean useSmart) {
        super();
        init(useSmart);
    }

    public IKTokenizer(AttributeFactory factory) {
        this(factory, false);
    }

    IKTokenizer(AttributeFactory factory, boolean useSmart) {
        super(factory);
        init(useSmart);
    }

    private void init(boolean useSmart) {
        offsetAtt = addAttribute(OffsetAttribute.class);
        termAtt = addAttribute(CharTermAttribute.class);
        typeAtt = addAttribute(TypeAttribute.class);
        _IKImplement = new IKSegmenter(input, useSmart);
    }
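
    /*
     * Usage sketch (illustrative only, not part of this class): an Analyzer
     * typically wires this tokenizer up in createComponents. The class name
     * MyIKAnalyzer and the hard-coded smart-mode flag are assumptions for the
     * example; in this project that role is played by its own Analyzer class.
     *
     *     public class MyIKAnalyzer extends org.apache.lucene.analysis.Analyzer {
     *         @Override
     *         protected TokenStreamComponents createComponents(String fieldName) {
     *             // smart mode on; the public no-arg constructor defaults to off
     *             return new TokenStreamComponents(new IKTokenizer(true));
     *         }
     *     }
     */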

    /* (non-Javadoc)
     * @see org.apache.lucene.analysis.TokenStream#incrementToken()
     */
    @Override
    public boolean incrementToken() throws IOException {
        // Clear all lexeme attributes
        clearAttributes();
        Lexeme nextLexeme = _IKImplement.next();
        if (nextLexeme != null) {
            // Convert the Lexeme into Lucene attributes:
            // set the lexeme text
            termAtt.append(nextLexeme.getLexemeText());
            // set the lexeme length
            termAtt.setLength(nextLexeme.getLength());
            // set the lexeme offsets
            offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
            // record the end position of the last lexeme
            endPosition = nextLexeme.getEndPosition();
            // record the lexeme type
            typeAtt.setType(nextLexeme.getLexemeTypeString());
            // return true to signal that another lexeme is available
            return true;
        }
        // return false to signal that all lexemes have been emitted
        return false;
    }
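
    /*
     * Consumption sketch (illustrative only): driving this tokenizer by hand
     * follows the standard Lucene TokenStream contract; the sample text is an
     * arbitrary assumption.
     *
     *     Tokenizer tokenizer = new IKTokenizer();   // non-smart mode
     *     tokenizer.setReader(new java.io.StringReader("中华人民共和国"));
     *     CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
     *     tokenizer.reset();
     *     while (tokenizer.incrementToken()) {
     *         System.out.println(term);              // one lexeme per call
     *     }
     *     tokenizer.end();
     *     tokenizer.close();
     */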

    /*
     * (non-Javadoc)
     * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        _IKImplement.reset(input);
    }

    @Override
    public final void end() throws IOException {
        // The TokenStream contract requires overrides of end() to chain to super.end()
        super.end();
        // Set the final offset
        int finalOffset = correctOffset(this.endPosition);
        offsetAtt.setOffset(finalOffset, finalOffset);
    }
}