2021-03-22 11:41:13 +08:00

131 lines
4.2 KiB
Java
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* IK 中文分词 版本 8.4.0
* IK Analyzer release 8.4.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.4.0版本 由 Magese (magese@live.cn) 更新
* release 8.4.0 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.lucene;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeFactory;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import java.io.IOException;
/**
* IK分词器 Lucene Tokenizer适配器类
* 兼容Lucene 4.0版本
*/
@SuppressWarnings("unused")
public final class IKTokenizer extends Tokenizer {
//IK分词器实现
private IKSegmenter _IKImplement;
//词元文本属性
private CharTermAttribute termAtt;
//词元位移属性
private OffsetAttribute offsetAtt;
//词元分类属性该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量
private TypeAttribute typeAtt;
//记录最后一个词元的结束位置
private int endPosition;
/**
* Lucene 7.6 Tokenizer适配器类构造函数
*/
public IKTokenizer() {
this(false);
}
IKTokenizer(boolean useSmart) {
super();
init(useSmart);
}
public IKTokenizer(AttributeFactory factory) {
this(factory, false);
}
IKTokenizer(AttributeFactory factory, boolean useSmart) {
super(factory);
init(useSmart);
}
private void init(boolean useSmart) {
offsetAtt = addAttribute(OffsetAttribute.class);
termAtt = addAttribute(CharTermAttribute.class);
typeAtt = addAttribute(TypeAttribute.class);
_IKImplement = new IKSegmenter(input, useSmart);
}
/* (non-Javadoc)
* @see org.apache.lucene.analysis.TokenStream#incrementToken()
*/
@Override
public boolean incrementToken() throws IOException {
//清除所有的词元属性
clearAttributes();
Lexeme nextLexeme = _IKImplement.next();
if (nextLexeme != null) {
//将Lexeme转成Attributes
//设置词元文本
termAtt.append(nextLexeme.getLexemeText());
//设置词元长度
termAtt.setLength(nextLexeme.getLength());
//设置词元位移
offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
//记录分词的最后位置
endPosition = nextLexeme.getEndPosition();
//记录词元分类
typeAtt.setType(nextLexeme.getLexemeTypeString());
//返会true告知还有下个词元
return true;
}
//返会false告知词元输出完毕
return false;
}
/*
* (non-Javadoc)
* @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
*/
@Override
public void reset() throws IOException {
super.reset();
_IKImplement.reset(input);
}
@Override
public final void end() {
// set final offset
int finalOffset = correctOffset(this.endPosition);
offsetAtt.setOffset(finalOffset, finalOffset);
}
}