LUCENE-1916: smartcn hhmm doc translation

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@821325 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2009-10-03 14:24:45 +00:00
parent 1f9088b038
commit 8da43c4bb8
6 changed files with 65 additions and 43 deletions

View File

@@ -27,6 +27,9 @@ Optimizations
Documentation
* LUCENE-1916: Translated documentation in the smartcn hhmm package.
(Patricia Peng via Robert Muir)
Build
Test Cases

View File

@@ -117,8 +117,9 @@ abstract class AbstractDictionary {
// Should be a two-byte character
return -1;
}
int b0 = (int) (buffer[0] & 0x0FF) - 161; // 编码从A1开始因此减去0xA1=161
int b1 = (int) (buffer[1] & 0x0FF) - 161; // 第一个字符和最后一个字符没有汉字因此每个区只收16*6-2=94个汉字
int b0 = (int) (buffer[0] & 0x0FF) - 161; // Code starts from A1, therefore subtract 0xA1=161
int b1 = (int) (buffer[1] & 0x0FF) - 161; // The first and last positions of each row hold no Chinese char,
// so each row only contains 16*6-2=94 characters.
return (short) (b0 * 94 + b1);
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
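For readers new to GB2312's row/cell layout, here is a minimal, self-contained sketch of the same arithmetic the translated comments describe. It is not part of the patch, the class name is made up for illustration, and it assumes the JRE ships a GB2312 charset:

```java
import java.io.UnsupportedEncodingException;

public class Gb2312IdDemo {
    // Mirrors the arithmetic above: both bytes lie in 0xA1..0xFE and each
    // row contains 94 usable cells, so id = row * 94 + cell.
    static short gb2312Id(char ch) throws UnsupportedEncodingException {
        byte[] buffer = String.valueOf(ch).getBytes("GB2312");
        if (buffer.length != 2) {
            return -1; // not a two-byte GB2312 character
        }
        int b0 = (buffer[0] & 0xFF) - 161; // row: the encoding starts at 0xA1
        int b1 = (buffer[1] & 0xFF) - 161; // cell within the row
        return (short) (b0 * 94 + b1);
    }

    public static void main(String[] args) throws UnsupportedEncodingException {
        // '啊' is encoded as 0xB0A1, the first cell of row 15 (zero-based):
        System.out.println(gb2312Id('啊')); // (0xB0-0xA1)*94 + 0 = 1410
    }
}
```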

View File

@@ -63,7 +63,7 @@ class BiSegGraph {
char[] idBuffer;
// get the list of tokens ordered and indexed
segTokenList = segGraph.makeIndex();
// 因为startToken"始##始"的起始位置是-1因此key为-1时可以取出startToken
// Because the start position of startToken ("始##始") is -1, startToken can be retrieved when key = -1
int key = -1;
List nextTokens = null;
while (key < maxStart) {
@@ -71,16 +71,17 @@
List tokenList = segGraph.getStartList(key);
// 为某一个key对应的所有Token都计算一次
// Calculate all tokens for a given key.
for (Iterator iter = tokenList.iterator(); iter.hasNext();) {
SegToken t1 = (SegToken) iter.next();
oneWordFreq = t1.weight;
next = t1.endOffset;
nextTokens = null;
// 找到下一个对应的Token例如阳光海岸当前Token是阳光 下一个Token可以是或者海岸
// 如果找不到下一个Token则说明到了末尾重新循环
// Find the next corresponding Token.
// For example: in "阳光海岸" (sunny coast), if the current Token is "阳光" (sunshine),
// the next Token can be "海" (sea) or "海岸" (coast).
// If the next Token cannot be found, we have reached the end; start the cycle over.
while (next <= maxStart) {
// 因为endToken的起始位置是sentenceLen因此等于sentenceLen是可以找到endToken
// Because the start position of endToken is sentenceLen, endToken can be found when next equals sentenceLen.
if (segGraph.isStartExist(next)) {
nextTokens = segGraph.getStartList(next);
break;
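As a reading aid for the comments above, here is a small sketch of the lookup they describe: the successors of a token are exactly the tokens whose start offset equals its end offset. The Token record is a hypothetical stand-in for SegToken (assumes Java 16+), not the real API:

```java
import java.util.*;

public class NextTokenDemo {
    // Hypothetical stand-in for SegToken: a word plus [start, end) offsets.
    record Token(String word, int start, int end) {}

    public static void main(String[] args) {
        // Tokens of "阳光海岸" indexed by start offset, as segGraph.makeIndex() produces.
        Map<Integer, List<Token>> byStart = new HashMap<>();
        for (Token t : List.of(new Token("始##始", -1, 0), new Token("阳光", 0, 2),
                               new Token("海", 2, 3), new Token("海岸", 2, 4),
                               new Token("末##末", 4, 5))) {
            byStart.computeIfAbsent(t.start(), k -> new ArrayList<>()).add(t);
        }
        Token current = byStart.get(0).get(0);          // "阳光", ends at offset 2
        System.out.println(byStart.get(current.end())); // its successors: 海, 海岸
    }
}
```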

View File

@@ -156,7 +156,8 @@ class BigramDictionary extends AbstractDictionary {
IOException, UnsupportedEncodingException {
int i, cnt, length, total = 0;
// 文件中只统计了6763个汉字加5个空汉字符3756~3760其中第3756个用来存储符号信息
// The file covers only the 6763 Chinese characters, plus 5 empty slots (3756~3760),
// of which slot 3756 is used to store symbol information.
int[] buffer = new int[3];
byte[] intBuffer = new byte[4];
String tmpword;
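The loader reads each numeric field as 4 raw bytes into intBuffer and reassembles an int. A hedged sketch of that step, assuming (this diff does not confirm it) that the .dct format stores little-endian ints; the class name and bytes are made up:

```java
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

public class IntFieldDemo {
    public static void main(String[] args) {
        // 4 raw bytes per field, reassembled into an int.
        byte[] intBuffer = {0x39, 0x05, 0x00, 0x00}; // hypothetical bytes from the file
        int cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
        System.out.println(cnt); // 1337
    }
}
```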

View File

@@ -49,7 +49,7 @@ public class HHMMSegmenter {
int[] charTypeArray = getCharTypes(sentence);
StringBuilder wordBuf = new StringBuilder();
SegToken token;
int frequency = 0; // word的出现次数
int frequency = 0; // the number of times the word appears.
boolean hasFullWidth;
int wordType;
char[] charArray;
@@ -64,7 +64,9 @@ public class HHMMSegmenter
case CharType.HANZI:
j = i + 1;
wordBuf.delete(0, wordBuf.length());
// 不管单个汉字能不能构成词都将单个汉字存到segGraph中去否则会造成分此图断字
// Whether or not a single Chinese character (Hanzi) can form a word by itself,
// store it in the segGraph; otherwise the segmentation graph would
// break apart between characters.
wordBuf.append(sentence.charAt(i));
charArray = new char[] { sentence.charAt(i) };
frequency = wordDict.getFrequency(charArray);
@@ -75,7 +77,8 @@ public class HHMMSegmenter
foundIndex = wordDict.getPrefixMatch(charArray);
while (j <= length && foundIndex != -1) {
if (wordDict.isEqual(charArray, foundIndex) && charArray.length > 1) {
// 就是我们要找的词 也就是说找到了从i到j的一个成词SegToken并且不是单字词
// This is the word we are looking for; that is, we found a complete word
// SegToken from i to j, and it is not a single-character word.
frequency = wordDict.getFrequency(charArray);
token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
frequency);
@@ -89,9 +92,9 @@ public class HHMMSegmenter
wordBuf.append(sentence.charAt(j));
charArray = new char[wordBuf.length()];
wordBuf.getChars(0, charArray.length, charArray, 0);
// idArray作为前缀已经找到过(foundWordIndex!=-1),
// 因此加长过后的idArray只可能出现在foundWordIndex以后,
// 故从foundWordIndex之后开始查找
// idArray has already been found as a prefix (foundWordIndex != -1), so the
// lengthened idArray can only occur at or after foundWordIndex.
// Therefore, start searching from foundWordIndex.
foundIndex = wordDict.getPrefixMatch(charArray, foundIndex);
j++;
} else {
@@ -110,7 +113,7 @@ public class HHMMSegmenter
hasFullWidth = true;
j++;
}
// 找到了从i到j的一个Token类型为LETTER的字符串
// Found a Token from i to j; its type is a LETTER string.
charArray = Utility.STRING_CHAR_ARRAY;
frequency = wordDict.getFrequency(charArray);
wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING;
@@ -128,7 +131,7 @@ public class HHMMSegmenter
hasFullWidth = true;
j++;
}
// 找到了从i到j的一个Token类型为NUMBER的字符串
// Found a Token from i to j; its type is a NUMBER string.
charArray = Utility.NUMBER_CHAR_ARRAY;
frequency = wordDict.getFrequency(charArray);
wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER;
@@ -138,7 +141,7 @@ public class HHMMSegmenter
break;
case CharType.DELIMITER:
j = i + 1;
// 标点符号的weight不用查了选个最大的频率即可
// No need to look up the weight for punctuation; picking the maximum frequency will do.
frequency = Utility.MAX_FREQUENCE;
charArray = new char[] { sentence.charAt(i) };
token = new SegToken(charArray, i, j, WordType.DELIMITER, frequency);
@@ -147,7 +150,8 @@ public class HHMMSegmenter
break;
default:
j = i + 1;
// 把不认识的字符当作未知串看待例如GB2312编码之外的字符每个字符当作一个
// Treat unrecognized characters as unknown strings, for example characters
// outside the GB2312 encoding; each such character is treated as one token.
charArray = Utility.STRING_CHAR_ARRAY;
frequency = wordDict.getFrequency(charArray);
token = new SegToken(charArray, i, j, WordType.STRING, frequency);
@@ -157,13 +161,13 @@ public class HHMMSegmenter
}
}
// 为segGraph增加两个新Token "始##始","末##末"
// Add two new Tokens to the segGraph: "始##始" (sentence begin) and "末##末" (sentence end).
charArray = Utility.START_CHAR_ARRAY;
frequency = wordDict.getFrequency(charArray);
token = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency);
segGraph.addToken(token);
// "末##末"
// "end xx end"
charArray = Utility.END_CHAR_ARRAY;
frequency = wordDict.getFrequency(charArray);
token = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END,
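To see why the two sentinel tokens help, here is an illustrative sketch, not the real SegToken/SegGraph API (the Tok record and walk method are hypothetical; assumes Java 16+): with "始##始" at [-1, 0) and "末##末" at [sentenceLen, sentenceLen+1), every segmentation is a path from start offset -1 past the end sentinel, so the graph traversal needs no special cases at the borders.

```java
import java.util.*;

public class SentinelDemo {
    // Hypothetical token type: a word plus [start, end) offsets.
    record Tok(String word, int start, int end) {}

    // Enumerate every path from the begin sentinel to the end sentinel.
    static void walk(Map<Integer, List<Tok>> byStart, int start, int sentenceLen,
                     Deque<String> path, List<String> out) {
        if (start == sentenceLen + 1) {   // we just consumed "末##末"
            out.add(String.join(" | ", path));
            return;
        }
        for (Tok t : byStart.getOrDefault(start, List.of())) {
            path.addLast(t.word());
            walk(byStart, t.end(), sentenceLen, path, out);
            path.removeLast();
        }
    }

    public static void main(String[] args) {
        int len = 4; // sentence "阳光海岸"
        Map<Integer, List<Tok>> byStart = new HashMap<>();
        for (Tok t : List.of(new Tok("始##始", -1, 0), new Tok("阳", 0, 1),
                             new Tok("阳光", 0, 2), new Tok("光", 1, 2),
                             new Tok("海", 2, 3), new Tok("海岸", 2, 4),
                             new Tok("岸", 3, 4), new Tok("末##末", len, len + 1))) {
            byStart.computeIfAbsent(t.start(), k -> new ArrayList<>()).add(t);
        }
        List<String> paths = new ArrayList<>();
        walk(byStart, -1, len, new ArrayDeque<>(), paths);
        paths.forEach(System.out::println); // every path runs sentinel to sentinel
    }
}
```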

View File

@@ -55,19 +55,24 @@ class WordDictionary extends AbstractDictionary {
public static final int PRIME_INDEX_LENGTH = 12071;
/**
* wordIndexTable保证将Unicode中的所有汉字编码hash到PRIME_INDEX_LENGTH长度的数组中
* 当然会有冲突但实际上本程序只处理GB2312字符部分6768个字符加上一些ASCII字符
* 因此对这些字符是有效的为了保证比较的准确性保留原来的字符在charIndexTable中以确定查找的准确性
* wordIndexTable hashes every Chinese character in Unicode into an array of
* length PRIME_INDEX_LENGTH. Collisions can occur, but in practice this
* program only handles the 6768 characters found in GB2312 plus some ASCII
* characters, for which the scheme is effective. To keep comparisons
* accurate, the original character is also kept in charIndexTable so that
* each lookup can be verified.
*/
private short[] wordIndexTable;
private char[] charIndexTable;
/**
* 存储所有词库的真正数据结构为了避免占用空间太多用了两个单独的多维数组来存储词组和频率
* 每个词放在一个char[]每个char对应一个汉字或其他字符每个频率放在一个int中
* 这两个数组的前两个下表是一一对应的因此可以利用wordItem_charArrayTable[i][j]来查词
* 用wordItem_frequencyTable[i][j]来查询对应的频率
* The real data structure that stores the whole lexicon. To avoid taking
* too much space, two separate multidimensional arrays store the words and
* the frequencies. Each word is placed in a char[], where each char is a
* Chinese character or other symbol, and each frequency is stored in an
* int. The first two indices of the two arrays correspond one-to-one, so
* wordItem_charArrayTable[i][j] looks up a word in the lexicon and
* wordItem_frequencyTable[i][j] looks up its corresponding frequency.
*/
private char[][][] wordItem_charArrayTable;
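An illustrative two-entry layout of the parallel tables described by that javadoc; the contents here are hypothetical, not the real lexicon data:

```java
public class ParallelArrayDemo {
    public static void main(String[] args) {
        // The same [i][j] indexes a word in one array and its frequency in the other.
        char[][][] wordItem_charArrayTable = {
            { {'海'}, {'海', '岸'} }   // all words whose first char maps to row 0
        };
        int[][] wordItem_frequencyTable = {
            { 1242, 587 }              // hypothetical frequencies, same positions
        };
        int i = 0, j = 1;
        System.out.println(new String(wordItem_charArrayTable[i][j]) // 海岸
            + " -> " + wordItem_frequencyTable[i][j]);               // -> 587
    }
}
```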
@@ -193,7 +198,8 @@ class WordDictionary extends AbstractDictionary {
private int loadMainDataFromFile(String dctFilePath)
throws FileNotFoundException, IOException, UnsupportedEncodingException {
int i, cnt, length, total = 0;
// 文件中只统计了6763个汉字加5个空汉字符3756~3760其中第3756个用来存储符号信息
// The file covers only the 6763 Chinese characters, plus 5 empty slots (3756~3760),
// of which slot 3756 is used to store symbol information.
int[] buffer = new int[3];
byte[] intBuffer = new byte[4];
String tmpword;
@@ -255,33 +261,37 @@ class WordDictionary extends AbstractDictionary {
}
/**
* 原词库将所有标点符号的信息合并到一个列表里(从1开始的3755处)这里将其展开分别放到各个符号对应的列表中
* The original lexicon merges the information for all punctuation into one
* list (at position 3755, counting from 1). Here that list is expanded, and
* each entry is placed into the list of its corresponding symbol.
*/
private void expandDelimiterData() {
int i;
int cnt;
// 标点符号在从1开始的3755处将原始的标点符号对应的字典分配到对应的标点符号中
// Punctuation sits at position 3755 (counting from 1); distribute the entries
// of the original punctuation dictionary to their corresponding punctuation characters.
int delimiterIndex = 3755 + GB2312_FIRST_CHAR;
i = 0;
while (i < wordItem_charArrayTable[delimiterIndex].length) {
char c = wordItem_charArrayTable[delimiterIndex][i][0];
int j = getGB2312Id(c);// 该标点符号应该所在的index值
int j = getGB2312Id(c);// the index where this punctuation character should be located
if (wordItem_charArrayTable[j] == null) {
int k = i;
// 从i开始计数后面以j开头的符号的worditem的个数
// Starting from i, count the following wordItems whose first character is c.
while (k < wordItem_charArrayTable[delimiterIndex].length
&& wordItem_charArrayTable[delimiterIndex][k][0] == c) {
k++;
}
// 此时k-i为id为j的标点符号对应的wordItem的个数
// At this point, k-i is the number of wordItems corresponding to the
// punctuation with id j.
cnt = k - i;
if (cnt != 0) {
wordItem_charArrayTable[j] = new char[cnt][];
wordItem_frequencyTable[j] = new int[cnt];
}
// 为每一个wordItem赋值
// Assign value for each wordItem.
for (k = 0; k < cnt; k++, i++) {
// wordItemTable[j][k] = new WordItem();
wordItem_frequencyTable[j][k] = wordItem_frequencyTable[delimiterIndex][i];
@@ -293,7 +303,7 @@ class WordDictionary extends AbstractDictionary {
setTableIndex(c, j);
}
}
// 将原符号对应的数组删除
// Delete the original corresponding symbol array.
wordItem_charArrayTable[delimiterIndex] = null;
wordItem_frequencyTable[delimiterIndex] = null;
}
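A compact sketch of the expansion performed above: walk the merged list, count each run of entries sharing a first character, and move the run into its own list. The data and class name are hypothetical, and the real tables also carry parallel frequencies:

```java
import java.util.*;

public class ExpandDelimiterDemo {
    public static void main(String[] args) {
        // Hypothetical merged delimiter entries, grouped by first character,
        // like wordItem_charArrayTable[delimiterIndex] before the expansion.
        char[][] merged = { {',', 'a'}, {',', 'b'}, {'!', 'x'} };
        Map<Character, List<char[]>> expanded = new LinkedHashMap<>();
        int i = 0;
        while (i < merged.length) {
            char c = merged[i][0];
            int k = i;
            // Count the run of consecutive entries sharing the first character c.
            while (k < merged.length && merged[k][0] == c) k++;
            expanded.put(c, new ArrayList<>(Arrays.asList(merged).subList(i, k)));
            i = k; // jump past the run
        }
        expanded.forEach((c, list) -> System.out.println(c + " -> " + list.size()));
        // prints: ", -> 2" then "! -> 1"
    }
}
```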
@@ -362,8 +372,8 @@ class WordDictionary extends AbstractDictionary {
}
/*
* 计算字符c在哈希表中应该在的位置然后将地址列表中该位置的值初始化
*
* Calculate character c's position in the hash table, then initialize the
* value at that position in the address table.
*/
private boolean setTableIndex(char c, int j) {
int index = getAvaliableTableIndex(c);
@@ -420,12 +430,14 @@ class WordDictionary extends AbstractDictionary {
}
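A minimal sketch of the hashing scheme these comments describe. The real hash and probing functions are not shown in this diff, so a simple modulus with linear probing stands in; only the idea of keeping the character in charIndexTable to verify lookups is taken from the source:

```java
public class CharHashDemo {
    static final int PRIME_INDEX_LENGTH = 12071;
    static final short[] wordIndexTable = new short[PRIME_INDEX_LENGTH];
    static final char[] charIndexTable = new char[PRIME_INDEX_LENGTH];

    // A stand-in hash; the real hash function is not shown in this diff.
    static int hash(char c) {
        return c % PRIME_INDEX_LENGTH;
    }

    // Linear probing: advance until we reach c's slot or an empty one.
    static int tableIndex(char c) {
        int index = hash(c);
        while (charIndexTable[index] != 0 && charIndexTable[index] != c) {
            index = (index + 1) % PRIME_INDEX_LENGTH;
        }
        return index;
    }

    public static void main(String[] args) {
        int slot = tableIndex('海');
        charIndexTable[slot] = '海'; // keep the char itself so lookups can be verified
        wordIndexTable[slot] = 42;   // hypothetical row in the word tables
        System.out.println(charIndexTable[tableIndex('海')] == '海'); // true
    }
}
```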
/**
* 在字典库中查找单词对应的char数组为charArray的字符串返回该单词在单词序列中的位置
* Look up the word whose char array is charArray in the dictionary, and
* return the position of the word in the word list.
*
* @param knownHashIndex 已知单词第一个字符charArray[0]在hash表中的位置如果未计算可以用函数int
* findInTable(char[] charArray) 代替
* @param charArray 查找单词对应的char数组
* @return 单词在单词数组中的位置如果没找到则返回-1
* @param knownHashIndex the already-computed position in the hash table of
* the first character charArray[0]; if it has not been computed yet,
* the function int findInTable(char[] charArray) can be used instead.
* @param charArray the char array of the word to look up.
* @return the position of the word in the word array, or -1 if it is not found.
*/
private int findInTable(short knownHashIndex, char[] charArray) {
if (charArray == null || charArray.length == 0)
@@ -488,7 +500,7 @@ class WordDictionary extends AbstractDictionary {
&& Utility.compareArrayByPrefix(charArray, 1, items[mid], 0) == 0)
mid--;
mid++;
return mid;// 找到第一个以charArray为前缀的单词
return mid;// Found the first word that has charArray as its prefix.
} else if (cmpResult < 0)
end = mid - 1;
else
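The hunk above ends mid-method, but the pattern is recognizable: binary-search any match, then scan backwards to the first entry carrying the given prefix. A self-contained sketch of that pattern over plain strings (the real code compares char arrays from offset 1 via compareArrayByPrefix; the class and method names here are made up):

```java
public class FirstPrefixMatchDemo {
    // Return the index of the FIRST sorted entry starting with prefix, or -1:
    // binary-search any match, then scan backwards over equal-prefix entries.
    static int firstWithPrefix(String[] sorted, String prefix) {
        int start = 0, end = sorted.length - 1;
        while (start <= end) {
            int mid = (start + end) / 2;
            if (sorted[mid].startsWith(prefix)) {
                while (mid > 0 && sorted[mid - 1].startsWith(prefix)) {
                    mid--; // the hit is not necessarily the first one
                }
                return mid;
            } else if (sorted[mid].compareTo(prefix) < 0) {
                start = mid + 1;
            } else {
                end = mid - 1;
            }
        }
        return -1;
    }

    public static void main(String[] args) {
        String[] words = {"海", "海岸", "海洋", "深圳"};
        System.out.println(firstWithPrefix(words, "海")); // 0
        System.out.println(firstWithPrefix(words, "深")); // 3
    }
}
```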