From 6ef47987522a8cd5c994c724e27efdd0e9a9ea51 Mon Sep 17 00:00:00 2001 From: Magese Date: Fri, 31 Dec 2021 17:43:40 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E5=8F=8A=E6=B3=A8=E9=87=8A?= =?UTF-8?q?=E6=A0=BC=E5=BC=8F=E5=8C=96=EF=BC=9B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/org/wltea/analyzer/core/Lexeme.java | 402 ++++++++++-------- 1 file changed, 219 insertions(+), 183 deletions(-) diff --git a/src/main/java/org/wltea/analyzer/core/Lexeme.java b/src/main/java/org/wltea/analyzer/core/Lexeme.java index 242f450..c120f12 100644 --- a/src/main/java/org/wltea/analyzer/core/Lexeme.java +++ b/src/main/java/org/wltea/analyzer/core/Lexeme.java @@ -31,242 +31,278 @@ package org.wltea.analyzer.core; * IK词元对象 */ @SuppressWarnings("unused") -public class Lexeme implements Comparable{ - //英文 - static final int TYPE_ENGLISH = 1; - //数字 - static final int TYPE_ARABIC = 2; - //英文数字混合 - static final int TYPE_LETTER = 3; - //中文词元 - static final int TYPE_CNWORD = 4; - //中文单字 - static final int TYPE_CNCHAR = 64; - //日韩文字 - static final int TYPE_OTHER_CJK = 8; - //中文数词 - static final int TYPE_CNUM = 16; - //中文量词 - static final int TYPE_COUNT = 32; - //中文数量词 - static final int TYPE_CQUAN = 48; - - //词元的起始位移 - private int offset; - //词元的相对起始位置 +public class Lexeme implements Comparable { + /** + * 英文 + */ + static final int TYPE_ENGLISH = 1; + /** + * 数字 + */ + static final int TYPE_ARABIC = 2; + /** + * 英文数字混合 + */ + static final int TYPE_LETTER = 3; + /** + * 中文词元 + */ + static final int TYPE_CNWORD = 4; + /** + * 中文单字 + */ + static final int TYPE_CNCHAR = 64; + /** + * 日韩文字 + */ + static final int TYPE_OTHER_CJK = 8; + /** + * 中文数词 + */ + static final int TYPE_CNUM = 16; + /** + * 中文量词 + */ + static final int TYPE_COUNT = 32; + /** + * 中文数量词 + */ + static final int TYPE_CQUAN = 48; + /** + * 词元的起始位移 + */ + private int offset; + /** + * 词元的相对起始位置 + */ private int begin; - //词元的长度 + /** + * 词元的长度 + */ private int length; - //词元文本 + /** + * 词元文本 + */ private String lexemeText; - //词元类型 + /** + * 词元类型 + */ private int lexemeType; - public Lexeme(int offset , int begin , int length , int lexemeType){ - this.offset = offset; - this.begin = begin; - if(length < 0){ - throw new IllegalArgumentException("length < 0"); - } - this.length = length; - this.lexemeType = lexemeType; - } + public Lexeme(int offset, int begin, int length, int lexemeType) { + this.offset = offset; + this.begin = begin; + if (length < 0) { + throw new IllegalArgumentException("length < 0"); + } + this.length = length; + this.lexemeType = lexemeType; + } /* * 判断词元相等算法 * 起始位置偏移、起始位置、终止位置相同 * @see java.lang.Object#equals(Object o) */ - public boolean equals(Object o){ - if(o == null){ - return false; - } + public boolean equals(Object o) { + if (o == null) { + return false; + } - if(this == o){ - return true; - } + if (this == o) { + return true; + } - if(o instanceof Lexeme){ - Lexeme other = (Lexeme)o; - return this.offset == other.getOffset() - && this.begin == other.getBegin() - && this.length == other.getLength(); - }else{ - return false; - } - } + if (o instanceof Lexeme) { + Lexeme other = (Lexeme) o; + return this.offset == other.getOffset() + && this.begin == other.getBegin() + && this.length == other.getLength(); + } else { + return false; + } + } /* * 词元哈希编码算法 * @see java.lang.Object#hashCode() */ - public int hashCode(){ - int absBegin = getBeginPosition(); - int absEnd = getEndPosition(); - return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11; + public int hashCode() { + int absBegin = getBeginPosition(); + int absEnd = getEndPosition(); + return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11; } /* * 词元在排序集合中的比较算法 * @see java.lang.Comparable#compareTo(java.lang.Object) */ - public int compareTo(Lexeme other) { - //起始位置优先 - if(this.begin < other.getBegin()){ + public int compareTo(Lexeme other) { + // 起始位置优先 + if (this.begin < other.getBegin()) { return -1; - }else if(this.begin == other.getBegin()){ - //词元长度优先 - //this.length < other.getLength() - return Integer.compare(other.getLength(), this.length); + } else if (this.begin == other.getBegin()) { + // 词元长度优先 + // this.length < other.getLength() + return Integer.compare(other.getLength(), this.length); - }else{//this.begin > other.getBegin() - return 1; + } else { + return 1; } - } + } - private int getOffset() { - return offset; - } + private int getOffset() { + return offset; + } - public void setOffset(int offset) { - this.offset = offset; - } + public void setOffset(int offset) { + this.offset = offset; + } - int getBegin() { - return begin; - } - /** - * 获取词元在文本中的起始位置 - * @return int - */ - public int getBeginPosition(){ - return offset + begin; - } + int getBegin() { + return begin; + } - public void setBegin(int begin) { - this.begin = begin; - } + /** + * 获取词元在文本中的起始位置 + * + * @return int + */ + public int getBeginPosition() { + return offset + begin; + } - /** - * 获取词元在文本中的结束位置 - * @return int - */ - public int getEndPosition(){ - return offset + begin + length; - } + public void setBegin(int begin) { + this.begin = begin; + } - /** - * 获取词元的字符长度 - * @return int - */ - public int getLength(){ - return this.length; - } + /** + * 获取词元在文本中的结束位置 + * + * @return int + */ + public int getEndPosition() { + return offset + begin + length; + } - public void setLength(int length) { - if(this.length < 0){ - throw new IllegalArgumentException("length < 0"); - } - this.length = length; - } + /** + * 获取词元的字符长度 + * + * @return int + */ + public int getLength() { + return this.length; + } - /** - * 获取词元的文本内容 - * @return String - */ - public String getLexemeText() { - if(lexemeText == null){ - return ""; - } - return lexemeText; - } + public void setLength(int length) { + if (this.length < 0) { + throw new IllegalArgumentException("length < 0"); + } + this.length = length; + } - void setLexemeText(String lexemeText) { - if(lexemeText == null){ - this.lexemeText = ""; - this.length = 0; - }else{ - this.lexemeText = lexemeText; - this.length = lexemeText.length(); - } - } + /** + * 获取词元的文本内容 + * + * @return String + */ + public String getLexemeText() { + if (lexemeText == null) { + return ""; + } + return lexemeText; + } - /** - * 获取词元类型 - * @return int - */ - int getLexemeType() { - return lexemeType; - } + void setLexemeText(String lexemeText) { + if (lexemeText == null) { + this.lexemeText = ""; + this.length = 0; + } else { + this.lexemeText = lexemeText; + this.length = lexemeText.length(); + } + } - /** - * 获取词元类型标示字符串 - * @return String - */ - public String getLexemeTypeString(){ - switch(lexemeType) { + /** + * 获取词元类型 + * + * @return int + */ + int getLexemeType() { + return lexemeType; + } - case TYPE_ENGLISH : - return "ENGLISH"; + /** + * 获取词元类型标示字符串 + * + * @return String + */ + public String getLexemeTypeString() { + switch (lexemeType) { - case TYPE_ARABIC : - return "ARABIC"; + case TYPE_ENGLISH: + return "ENGLISH"; - case TYPE_LETTER : - return "LETTER"; + case TYPE_ARABIC: + return "ARABIC"; - case TYPE_CNWORD : - return "CN_WORD"; + case TYPE_LETTER: + return "LETTER"; - case TYPE_CNCHAR : - return "CN_CHAR"; + case TYPE_CNWORD: + return "CN_WORD"; - case TYPE_OTHER_CJK : - return "OTHER_CJK"; + case TYPE_CNCHAR: + return "CN_CHAR"; - case TYPE_COUNT : - return "COUNT"; + case TYPE_OTHER_CJK: + return "OTHER_CJK"; - case TYPE_CNUM : - return "TYPE_CNUM"; + case TYPE_COUNT: + return "COUNT"; - case TYPE_CQUAN: - return "TYPE_CQUAN"; + case TYPE_CNUM: + return "TYPE_CNUM"; - default : - return "UNKONW"; - } - } + case TYPE_CQUAN: + return "TYPE_CQUAN"; + + default: + return "UNKNOWN"; + } + } - public void setLexemeType(int lexemeType) { - this.lexemeType = lexemeType; - } + public void setLexemeType(int lexemeType) { + this.lexemeType = lexemeType; + } - /** - * 合并两个相邻的词元 - * @return boolean 词元是否成功合并 - */ - boolean append(Lexeme l, int lexemeType){ - if(l != null && this.getEndPosition() == l.getBeginPosition()){ - this.length += l.getLength(); - this.lexemeType = lexemeType; - return true; - }else { - return false; - } - } + /** + * 合并两个相邻的词元 + * + * @return boolean 词元是否成功合并 + */ + boolean append(Lexeme l, int lexemeType) { + if (l != null && this.getEndPosition() == l.getBeginPosition()) { + this.length += l.getLength(); + this.lexemeType = lexemeType; + return true; + } else { + return false; + } + } - - /** - * - */ - public String toString(){ - return this.getBeginPosition() + "-" + this.getEndPosition() + - " : " + this.lexemeText + " : \t" + - this.getLexemeTypeString(); - } + /** + * ToString 方法 + * + * @return 字符串输出 + */ + public String toString() { + return this.getBeginPosition() + "-" + this.getEndPosition() + + " : " + this.lexemeText + " : \t" + + this.getLexemeTypeString(); + } }