LUCENE-1629: remove unnecessary source files

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@775468 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2009-05-16 14:08:38 +00:00
parent f81f6796a2
commit af550281cb
2 changed files with 0 additions and 845 deletions

View File

@ -1,303 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn.smart.hhmm;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
/**
 * Bigram (word-pair) frequency dictionary used by the smart Chinese HHMM
 * segmenter. Loaded either from a serialized cache file (bigramdict.mem) or
 * by parsing the binary dictionary file (bigramdict.dct). Accessed as a
 * lazily-initialized singleton via {@link #getInstance()}.
 */
public class CopyOfBigramDictionary extends AbstractDictionary {

  /** Singleton: instances are only created through {@link #getInstance()}. */
  private CopyOfBigramDictionary() {
  }

  /** Separator placed between the two words of a bigram before hashing. */
  public static final char WORD_SEGMENT_CHAR = '@';

  private static CopyOfBigramDictionary singleInstance;

  /** Hash-table capacity; a prime so the double-hash probe can visit every slot. */
  public static final int PRIME_BIGRAM_LENGTH = 402137;

  /**
   * bigramHashTable and frequencyTable together store word-to-word transition
   * frequencies. To speed up lookup and save memory, the hash of the joined
   * pair (formWord + '@' + toWord) is stored instead of the pair itself; per
   * the original notes the hash is FNV1, computed by hash1() inherited from
   * AbstractDictionary (not visible here — TODO confirm). Replacing the string
   * with a hash can in principle collide, but the 64-bit (long) hash makes
   * that probability extremely low. bigramHashTable[i] corresponds one-to-one
   * with frequencyTable[i].
   */
  private long[] bigramHashTable;
  private int[] frequencyTable;

  // Probe-length statistics gathered by getBigramItemIndex(); printed by main().
  private int max = 0;
  private int repeat = 0;

  // static Logger log = Logger.getLogger(BigramDictionary.class);

  /**
   * Returns the shared instance, loading the dictionary from
   * AnalyzerProfile.ANALYSIS_DATA_DIR on first use.
   */
  public synchronized static CopyOfBigramDictionary getInstance() {
    if (singleInstance == null) {
      String dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
      singleInstance = new CopyOfBigramDictionary();
      singleInstance.load(dictRoot);
    }
    return singleInstance;
  }

  /**
   * Attempts to restore both tables from a previously serialized cache file.
   *
   * @param serialObj the .mem cache file
   * @return true if both tables were deserialized successfully
   */
  private boolean loadFromObj(File serialObj) {
    boolean loadFromObject = false;
    try {
      ObjectInputStream input = new ObjectInputStream(new FileInputStream(
          serialObj));
      bigramHashTable = (long[]) input.readObject();
      frequencyTable = (int[]) input.readObject();
      // log.info("load bigram dict from serialization.");
      loadFromObject = true;
      input.close();
    } catch (Exception e) {
      // Deliberate best-effort cache read: any failure makes the caller fall
      // back to parsing the .dct file. NOTE(review): the stream is not closed
      // on the failure path.
      // log.warn(e.getMessage());
    }
    return loadFromObject;
  }

  /** Serializes both tables to the cache file; failures are silently ignored. */
  private void saveToObj(File serialObj) {
    try {
      ObjectOutputStream output = new ObjectOutputStream(new FileOutputStream(
          serialObj));
      output.writeObject(bigramHashTable);
      output.writeObject(frequencyTable);
      output.close();
      // log.info("serialize bigram dict.");
    } catch (Exception e) {
      // Best-effort cache write; the in-memory tables are already populated.
      // log.warn(e.getMessage());
    }
  }

  /**
   * Loads the dictionary rooted at dictRoot: prefers the serialized cache
   * (bigramdict.mem); otherwise parses bigramdict.dct and then writes the
   * cache for next time.
   */
  private void load(String dictRoot) {
    String bigramDictPath = dictRoot + "/bigramdict.dct";
    File serialObj = new File(dictRoot + "/bigramdict.mem");
    if (serialObj.exists() && loadFromObj(serialObj)) {
    } else {
      try {
        bigramHashTable = new long[PRIME_BIGRAM_LENGTH];
        frequencyTable = new int[PRIME_BIGRAM_LENGTH];
        for (int i = 0; i < PRIME_BIGRAM_LENGTH; i++) {
          // Using 0 as the "empty slot" marker is slightly unsound (some
          // string could hash to 0), but the probability is tiny, so the
          // impact is negligible.
          bigramHashTable[i] = 0;
          frequencyTable[i] = 0;
        }
        loadFromFile(bigramDictPath);
      } catch (IOException e) {
        // NOTE(review): wrapping only getMessage() drops the cause and stack
        // trace; `new RuntimeException(e)` would preserve them.
        throw new RuntimeException(e.getMessage());
      }
      saveToObj(serialObj);
    }
  }

  /**
   * Loads the dictionary file into this object's data structures. This only
   * loads; no merging or modification of entries is performed.
   *
   * @param dctFilePath path of the binary .dct dictionary file
   * @throws FileNotFoundException if the file does not exist
   * @throws IOException on read failure
   * @throws UnsupportedEncodingException if the GB2312 charset is unavailable
   */
  public void loadFromFile(String dctFilePath) throws FileNotFoundException,
      IOException, UnsupportedEncodingException {
    int i, cnt, length, total = 0;
    // The file covers only the 6763 GB2312 Chinese characters plus 5 empty
    // slots (3756~3760), of which slot 3756 stores symbol information.
    int[] buffer = new int[3];
    byte[] intBuffer = new byte[4];
    String tmpword;
    RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");
    // In the dictionary file the first character is at position 0 and the
    // last at 6768.
    // NOTE(review): the return values of dctFile.read(...) below are ignored,
    // so short reads would be silently accepted; also the file is not closed
    // if an exception escapes the loop.
    for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
      String currentStr = getCCByGB2312Id(i);
      // if (i == 5231)
      // System.out.println(i);
      dctFile.read(intBuffer);// The source dictionary was produced by a C
      // program (little-endian); Java is big-endian, so bytes are converted.
      cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
      if (cnt <= 0) {
        continue;
      }
      total += cnt;
      int j = 0;
      while (j < cnt) {
        dctFile.read(intBuffer);
        buffer[0] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
            .getInt();// frequency
        dctFile.read(intBuffer);
        buffer[1] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
            .getInt();// length
        dctFile.read(intBuffer);
        // buffer[2] = ByteBuffer.wrap(intBuffer).order(
        // ByteOrder.LITTLE_ENDIAN).getInt();// handle
        length = buffer[1];
        if (length > 0) {
          byte[] lchBuffer = new byte[length];
          dctFile.read(lchBuffer);
          tmpword = new String(lchBuffer, "GB2312");
          if (i != 3755 + GB2312_FIRST_CHAR) {
            // Slot 3755 presumably stores symbol entries used as-is; every
            // other slot's entries are prefixed with the slot's own character
            // to form the full word — TODO confirm against the .dct format.
            tmpword = currentStr + tmpword;
          }
          char carray[] = tmpword.toCharArray();
          long hashId = hash1(carray);
          int index = getAvaliableIndex(hashId, carray);
          if (index != -1) {
            if (bigramHashTable[index] == 0) {
              bigramHashTable[index] = hashId;
              // bigramStringTable[index] = tmpword;
            }
            frequencyTable[index] += buffer[0];
          }
        }
        j++;
      }
    }
    dctFile.close();
    // log.info("load dictionary done! " + dctFilePath + " total:" + total);
  }

  /**
   * Double-hash probes for a slot that is either empty (0) or already holds
   * hashId.
   *
   * @param hashId primary hash of the bigram string
   * @param carray the bigram characters, used for the secondary hash
   * @return a usable slot index, or -1 if no slot was found
   */
  private int getAvaliableIndex(long hashId, char carray[]) {
    int hash1 = (int) (hashId % PRIME_BIGRAM_LENGTH);
    int hash2 = hash2(carray) % PRIME_BIGRAM_LENGTH;
    if (hash1 < 0)
      hash1 = PRIME_BIGRAM_LENGTH + hash1;
    if (hash2 < 0)
      hash2 = PRIME_BIGRAM_LENGTH + hash2;
    int index = hash1;
    int i = 1;
    // NOTE(review): (hash1 + i * hash2) can overflow int for large i, which
    // would make index negative — presumably never hit in practice; verify.
    while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
        && i < PRIME_BIGRAM_LENGTH) {
      index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH;
      i++;
    }
    // System.out.println(i - 1);
    if (i < PRIME_BIGRAM_LENGTH
        && (bigramHashTable[index] == 0 || bigramHashTable[index] == hashId)) {
      return index;
    } else
      return -1;
  }

  /**
   * Looks up the slot holding the hash of carray, updating the probe-length
   * statistics (repeat/max) as a side effect.
   *
   * @param carray the bigram characters
   * @return the slot index, or -1 if not present
   */
  private int getBigramItemIndex(char carray[]) {
    long hashId = hash1(carray);
    int hash1 = (int) (hashId % PRIME_BIGRAM_LENGTH);
    int hash2 = hash2(carray) % PRIME_BIGRAM_LENGTH;
    if (hash1 < 0)
      hash1 = PRIME_BIGRAM_LENGTH + hash1;
    if (hash2 < 0)
      hash2 = PRIME_BIGRAM_LENGTH + hash2;
    int index = hash1;
    int i = 1;
    repeat++;
    while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
        && i < PRIME_BIGRAM_LENGTH) {
      index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH;
      i++;
      repeat++;
      if (i > max)
        max = i;
    }
    // System.out.println(i - 1);
    if (i < PRIME_BIGRAM_LENGTH && bigramHashTable[index] == hashId) {
      return index;
    } else
      return -1;
  }

  /**
   * Returns the stored frequency for the bigram in carray, or 0 if the bigram
   * is not in the table.
   */
  public int getFrequency(char[] carray) {
    int index = getBigramItemIndex(carray);
    if (index != -1)
      return frequencyTable[index];
    return 0;
  }

  /** Ad-hoc manual test: loads from a hard-coded path and prints probe stats. */
  public static void main(String[] args) throws FileNotFoundException,
      UnsupportedEncodingException, IOException {
    CopyOfBigramDictionary dic = new CopyOfBigramDictionary();
    dic.load("D:/analysis-data");
    // dic.test("D:/analysis-data/BigramDict.dct");
    System.out.println("max:" + dic.max);
    System.out.println("average repeat:" + (double) dic.repeat / 328856);
    System.out.println("end");
  }
}

View File

@ -1,542 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn.smart.hhmm;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
import org.apache.lucene.analysis.cn.smart.Utility;
/**
 * Core word dictionary for the smart Chinese HHMM segmenter: maps each lead
 * character (via a hashed index) to the sorted list of words starting with it
 * and their frequencies. Loaded from a serialized cache (coredict.mem) or by
 * parsing the binary coredict.dct file. Lazily-initialized singleton via
 * {@link #getInstance()}.
 */
public class CopyOfWordDictionary extends AbstractDictionary {

  /** Singleton: instances are only created through {@link #getInstance()}. */
  private CopyOfWordDictionary() {
  }

  private static CopyOfWordDictionary singleInstance;

  /**
   * A fairly large prime, so the double-hash probe can reach every slot.
   */
  public static final int PRIME_INDEX_LENGTH = 12071;

  /**
   * wordIndexTable hashes Chinese characters into an array of length
   * PRIME_INDEX_LENGTH. Collisions are possible, but this program only
   * handles the GB2312 subset (6768 characters plus some ASCII), for which
   * this is effective. To keep lookups exact, the original character is also
   * stored in charIndexTable and compared on each probe.
   */
  private short[] wordIndexTable;
  private char[] charIndexTable;

  /**
   * The real storage for the whole lexicon. To avoid wasting space, two
   * separate jagged arrays hold the words and the frequencies: each word is a
   * char[] (one char per character), each frequency an int. The first two
   * indices of the arrays correspond one-to-one, so
   * wordItem_charArrayTable[i][j] is a word and wordItem_frequencyTable[i][j]
   * is its frequency.
   */
  private char[][][] wordItem_charArrayTable;
  private int[][] wordItem_frequencyTable;

  // static Logger log = Logger.getLogger(WordDictionary.class);

  /**
   * Returns the shared instance, loading the dictionary from
   * AnalyzerProfile.ANALYSIS_DATA_DIR on first use.
   */
  public synchronized static CopyOfWordDictionary getInstance() {
    if (singleInstance == null) {
      singleInstance = new CopyOfWordDictionary();
      String wordDictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
      singleInstance.load(wordDictRoot);
    }
    return singleInstance;
  }

  /**
   * Loads the dictionary: prefers the serialized cache (coredict.mem);
   * otherwise parses coredict.dct, post-processes it (expand delimiters,
   * merge duplicate words, sort), and writes the cache for next time.
   *
   * @param dctFileRoot directory containing coredict.dct / coredict.mem
   */
  public void load(String dctFileRoot) {
    String dctFilePath = dctFileRoot + "/coredict.dct";
    File serialObj = new File(dctFileRoot + "/coredict.mem");
    if (serialObj.exists() && loadFromObj(serialObj)) {
    } else {
      try {
        wordIndexTable = new short[PRIME_INDEX_LENGTH];
        charIndexTable = new char[PRIME_INDEX_LENGTH];
        for (int i = 0; i < PRIME_INDEX_LENGTH; i++) {
          charIndexTable[i] = 0;
          wordIndexTable[i] = -1;
        }
        wordItem_charArrayTable = new char[GB2312_CHAR_NUM][][];
        wordItem_frequencyTable = new int[GB2312_CHAR_NUM][];
        // int total =
        loadMainDataFromFile(dctFilePath);
        expandDelimiterData();
        mergeSameWords();
        sortEachItems();
        // log.info("load dictionary: " + dctFilePath + " total:" + total);
      } catch (IOException e) {
        // NOTE(review): wrapping only getMessage() drops the cause and stack
        // trace; `new RuntimeException(e)` would preserve them.
        throw new RuntimeException(e.getMessage());
      }
      saveToObj(serialObj);
    }
  }

  /**
   * Attempts to restore all four tables from a previously serialized cache.
   *
   * @return true if deserialization succeeded
   */
  private boolean loadFromObj(File serialObj) {
    boolean loadFromObject = false;
    try {
      ObjectInputStream input = new ObjectInputStream(new FileInputStream(
          serialObj));
      wordIndexTable = (short[]) input.readObject();
      charIndexTable = (char[]) input.readObject();
      wordItem_charArrayTable = (char[][][]) input.readObject();
      wordItem_frequencyTable = (int[][]) input.readObject();
      // log.info("load core dict from serialization.");
      input.close();
      loadFromObject = true;
    } catch (Exception e) {
      // Deliberate best-effort cache read: failure falls back to the .dct
      // file. NOTE(review): the stream is not closed on the failure path.
      // log.warn(e.getMessage());
    }
    return loadFromObject;
  }

  /** Serializes all four tables to the cache file; failures are ignored. */
  private void saveToObj(File serialObj) {
    try {
      ObjectOutputStream output = new ObjectOutputStream(new FileOutputStream(
          serialObj));
      output.writeObject(wordIndexTable);
      output.writeObject(charIndexTable);
      output.writeObject(wordItem_charArrayTable);
      output.writeObject(wordItem_frequencyTable);
      output.close();
      // log.info("serialize core dict.");
    } catch (Exception e) {
      // Best-effort cache write; the in-memory tables are already populated.
      // log.warn(e.getMessage());
    }
  }

  /**
   * Loads the dictionary file into this object's data structures. This only
   * loads; no merging or modification of entries is performed here.
   *
   * @param dctFilePath path of the binary .dct dictionary file
   * @return total number of word items read
   * @throws FileNotFoundException if the file does not exist
   * @throws IOException on read failure
   * @throws UnsupportedEncodingException if the GB2312 charset is unavailable
   */
  private int loadMainDataFromFile(String dctFilePath)
      throws FileNotFoundException, IOException, UnsupportedEncodingException {
    int i, cnt, length, total = 0;
    // The file covers only the 6763 GB2312 Chinese characters plus 5 empty
    // slots (3756~3760), of which slot 3756 stores symbol information.
    int[] buffer = new int[3];
    byte[] intBuffer = new byte[4];
    String tmpword;
    RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");
    // In the dictionary file the first character is at position 0 and the
    // last at 6768.
    // NOTE(review): the return values of dctFile.read(...) below are ignored,
    // so short reads would be silently accepted; also the file is not closed
    // if an exception escapes the loop.
    for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
      // if (i == 5231)
      // System.out.println(i);
      dctFile.read(intBuffer);// The source dictionary was produced by a C
      // program (little-endian); Java is big-endian, so bytes are converted.
      cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
      if (cnt <= 0) {
        wordItem_charArrayTable[i] = null;
        wordItem_frequencyTable[i] = null;
        continue;
      }
      wordItem_charArrayTable[i] = new char[cnt][];
      wordItem_frequencyTable[i] = new int[cnt];
      total += cnt;
      int j = 0;
      while (j < cnt) {
        // wordItemTable[i][j] = new WordItem();
        dctFile.read(intBuffer);
        buffer[0] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
            .getInt();// frequency
        dctFile.read(intBuffer);
        buffer[1] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
            .getInt();// length
        dctFile.read(intBuffer);
        buffer[2] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
            .getInt();// handle
        // wordItemTable[i][j].frequency = buffer[0];
        wordItem_frequencyTable[i][j] = buffer[0];
        length = buffer[1];
        if (length > 0) {
          byte[] lchBuffer = new byte[length];
          dctFile.read(lchBuffer);
          tmpword = new String(lchBuffer, "GB2312");
          // indexTable[i].wordItems[j].word = tmpword;
          // wordItemTable[i][j].charArray = tmpword.toCharArray();
          wordItem_charArrayTable[i][j] = tmpword.toCharArray();
        } else {
          // wordItemTable[i][j].charArray = null;
          wordItem_charArrayTable[i][j] = null;
        }
        // System.out.println(indexTable[i].wordItems[j]);
        j++;
      }
      String str = getCCByGB2312Id(i);
      setTableIndex(str.charAt(0), i);
    }
    dctFile.close();
    return total;
  }

  /**
   * The original dictionary merges all punctuation entries into a single
   * list (at offset 3755 from the first character); this expands that list,
   * distributing the entries to each punctuation character's own slot.
   */
  private void expandDelimiterData() {
    int i;
    int cnt;
    // Punctuation lives at offset 3755; hand its entries out to the slot of
    // each individual punctuation character.
    int delimiterIndex = 3755 + GB2312_FIRST_CHAR;
    i = 0;
    // NOTE(review): i only advances inside the `table[j] == null` branch, so
    // termination relies on every delimiter's target slot being initially
    // empty — presumably guaranteed by the data; verify.
    while (i < wordItem_charArrayTable[delimiterIndex].length) {
      char c = wordItem_charArrayTable[delimiterIndex][i][0];
      int j = getGB2312Id(c);// the slot this punctuation mark belongs in
      if (wordItem_charArrayTable[j] == null) {
        int k = i;
        // Starting from i, count the word items whose first char is c.
        while (k < wordItem_charArrayTable[delimiterIndex].length
            && wordItem_charArrayTable[delimiterIndex][k][0] == c) {
          k++;
        }
        // Now k-i is the number of word items belonging to punctuation id j.
        cnt = k - i;
        if (cnt != 0) {
          wordItem_charArrayTable[j] = new char[cnt][];
          wordItem_frequencyTable[j] = new int[cnt];
        }
        // Copy each word item, dropping the leading punctuation character.
        for (k = 0; k < cnt; k++, i++) {
          // wordItemTable[j][k] = new WordItem();
          wordItem_frequencyTable[j][k] = wordItem_frequencyTable[delimiterIndex][i];
          wordItem_charArrayTable[j][k] = new char[wordItem_charArrayTable[delimiterIndex][i].length - 1];
          System.arraycopy(wordItem_charArrayTable[delimiterIndex][i], 1,
              wordItem_charArrayTable[j][k], 0,
              wordItem_charArrayTable[j][k].length);
        }
        setTableIndex(c, j);
      }
    }
    // Drop the original merged punctuation slot.
    wordItem_charArrayTable[delimiterIndex] = null;
    wordItem_frequencyTable[delimiterIndex] = null;
  }

  /**
   * This program does no POS tagging, so the frequencies of identical words
   * that differ only in POS are merged under a single word, reducing storage
   * and speeding up search. Assumes each slot's entries are already grouped
   * so that duplicates are adjacent — TODO confirm against the file order.
   */
  private void mergeSameWords() {
    int i;
    for (i = 0; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
      if (wordItem_charArrayTable[i] == null)
        continue;
      int len = 1;
      // First pass: count the distinct adjacent words.
      for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
        if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
            wordItem_charArrayTable[i][j - 1], 0) != 0)
          len++;
      }
      if (len < wordItem_charArrayTable[i].length) {
        // Second pass: compact into new arrays, summing duplicate frequencies.
        char[][] tempArray = new char[len][];
        int[] tempFreq = new int[len];
        int k = 0;
        tempArray[0] = wordItem_charArrayTable[i][0];
        tempFreq[0] = wordItem_frequencyTable[i][0];
        for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
          if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
              tempArray[k], 0) != 0) {
            k++;
            // temp[k] = wordItemTable[i][j];
            tempArray[k] = wordItem_charArrayTable[i][j];
            tempFreq[k] = wordItem_frequencyTable[i][j];
          } else {
            // temp[k].frequency += wordItemTable[i][j].frequency;
            tempFreq[k] += wordItem_frequencyTable[i][j];
          }
        }
        // wordItemTable[i] = temp;
        wordItem_charArrayTable[i] = tempArray;
        wordItem_frequencyTable[i] = tempFreq;
      }
    }
  }

  /**
   * Sorts each slot's word list (and its parallel frequency list) with a
   * simple O(n^2) exchange sort so lookups can binary-search.
   */
  private void sortEachItems() {
    char[] tmpArray;
    int tmpFreq;
    for (int i = 0; i < wordItem_charArrayTable.length; i++) {
      if (wordItem_charArrayTable[i] != null
          && wordItem_charArrayTable[i].length > 1) {
        for (int j = 0; j < wordItem_charArrayTable[i].length - 1; j++) {
          for (int j2 = j + 1; j2 < wordItem_charArrayTable[i].length; j2++) {
            if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
                wordItem_charArrayTable[i][j2], 0) > 0) {
              tmpArray = wordItem_charArrayTable[i][j];
              tmpFreq = wordItem_frequencyTable[i][j];
              wordItem_charArrayTable[i][j] = wordItem_charArrayTable[i][j2];
              wordItem_frequencyTable[i][j] = wordItem_frequencyTable[i][j2];
              wordItem_charArrayTable[i][j2] = tmpArray;
              wordItem_frequencyTable[i][j2] = tmpFreq;
            }
          }
        }
      }
    }
  }

  /**
   * Computes the slot for character c in the hash index and records the
   * mapping (c -> word-table row j) there.
   *
   * @param c the lead character
   * @param j the row of wordItem_charArrayTable for words starting with c
   * @return true if a slot was found and initialized
   */
  private boolean setTableIndex(char c, int j) {
    int index = getAvaliableTableIndex(c);
    if (index != -1) {
      charIndexTable[index] = c;
      wordIndexTable[index] = (short) j;
      return true;
    } else
      return false;
  }

  /**
   * Double-hash probes for a slot that is empty (0) or already holds c.
   *
   * @return a usable slot index, or -1 if none was found
   */
  private short getAvaliableTableIndex(char c) {
    int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
    int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
    if (hash1 < 0)
      hash1 = PRIME_INDEX_LENGTH + hash1;
    if (hash2 < 0)
      hash2 = PRIME_INDEX_LENGTH + hash2;
    int index = hash1;
    int i = 1;
    while (charIndexTable[index] != 0 && charIndexTable[index] != c
        && i < PRIME_INDEX_LENGTH) {
      index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
      i++;
    }
    // System.out.println(i - 1);
    if (i < PRIME_INDEX_LENGTH
        && (charIndexTable[index] == 0 || charIndexTable[index] == c)) {
      return (short) index;
    } else
      return -1;
  }

  /**
   * Looks up the index-table slot for lead character c.
   *
   * @return the slot index, or -1 if c is not in the index
   */
  private short getWordItemTableIndex(char c) {
    int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
    int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
    if (hash1 < 0)
      hash1 = PRIME_INDEX_LENGTH + hash1;
    if (hash2 < 0)
      hash2 = PRIME_INDEX_LENGTH + hash2;
    int index = hash1;
    int i = 1;
    while (charIndexTable[index] != 0 && charIndexTable[index] != c
        && i < PRIME_INDEX_LENGTH) {
      index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
      i++;
    }
    if (i < PRIME_INDEX_LENGTH && charIndexTable[index] == c) {
      return (short) index;
    } else
      return -1;
  }

  /**
   * Finds the word whose char array is charArray and returns its position in
   * the word list of its lead character.
   *
   * @param charArray the word to look up
   * @return the word's position in its list, or -1 if not found
   */
  private int findInTable(char[] charArray) {
    if (charArray == null || charArray.length == 0)
      return -1;
    short index = getWordItemTableIndex(charArray[0]);
    if (index == -1)
      return -1;
    return findInTable(index, charArray);
  }

  /**
   * Finds the word whose char array is charArray and returns its position in
   * the word list of its lead character, given that the lead character's hash
   * slot is already known. Note the comparison starts at charArray offset 1:
   * the stored words omit the lead character.
   *
   * @param knownHashIndex hash slot of charArray[0]; if not yet computed, use
   *        findInTable(char[] charArray) instead
   * @param charArray the word to look up
   * @return the word's position in its list, or -1 if not found
   */
  private int findInTable(short knownHashIndex, char[] charArray) {
    if (charArray == null || charArray.length == 0)
      return -1;
    char[][] items = wordItem_charArrayTable[wordIndexTable[knownHashIndex]];
    int start = 0, end = items.length - 1;
    int mid = (start + end) / 2, cmpResult;
    // Binary search for the index of idArray
    while (start <= end) {
      cmpResult = Utility.compareArray(items[mid], 0, charArray, 1);
      if (cmpResult == 0)
        return mid;// find it
      else if (cmpResult < 0)
        start = mid + 1;
      else if (cmpResult > 0)
        end = mid - 1;
      mid = (start + end) / 2;
    }
    return -1;
  }

  /**
   * Tests whether the word corresponding to charArray appears in the
   * dictionary.
   *
   * @param charArray the word to test
   * @return true if present, false otherwise
   */
  public boolean isExist(char[] charArray) {
    return findInTable(charArray) != -1;
  }

  /**
   * Convenience overload of getPrefixMatch with a starting position of 0.
   *
   * @see #getPrefixMatch(char[], int)
   * @param charArray the prefix word
   * @return position of the first match, or -1
   */
  public int getPrefixMatch(char[] charArray) {
    return getPrefixMatch(charArray, 0);
  }

  /**
   * Searches the dictionary for words having the word in charArray as a
   * prefix, returning the position of the first match. To reduce search cost
   * the caller may supply a known starting position; 0 if unknown.
   *
   * @see #getPrefixMatch(char[])
   * @param charArray the prefix word
   * @param knownStart known starting position of the search
   * @return position of the first word matching the prefix, or -1
   */
  public int getPrefixMatch(char[] charArray, int knownStart) {
    short index = getWordItemTableIndex(charArray[0]);
    if (index == -1)
      return -1;
    char[][] items = wordItem_charArrayTable[wordIndexTable[index]];
    int start = knownStart, end = items.length - 1;
    int mid = (start + end) / 2, cmpResult;
    // Binary search for the index of idArray
    while (start <= end) {
      cmpResult = Utility.compareArrayByPrefix(charArray, 1, items[mid], 0);
      if (cmpResult == 0) {
        // Get the first item which match the current word
        while (mid >= 0
            && Utility.compareArrayByPrefix(charArray, 1, items[mid], 0) == 0)
          mid--;
        mid++;
        return mid;// position of the first word prefixed by charArray
      } else if (cmpResult < 0)
        end = mid - 1;
      else
        start = mid + 1;
      mid = (start + end) / 2;
    }
    return -1;
  }

  /**
   * Returns the frequency of the word in charArray (POS variants were merged
   * at load time), or 0 if the word is not in the dictionary.
   *
   * @param charArray the word to look up
   * @return the word's merged frequency
   */
  public int getFrequency(char[] charArray) {
    short hashIndex = getWordItemTableIndex(charArray[0]);
    if (hashIndex == -1)
      return 0;
    int itemIndex = findInTable(hashIndex, charArray);
    if (itemIndex != -1)
      return wordItem_frequencyTable[wordIndexTable[hashIndex]][itemIndex];
    return 0;
  }

  /**
   * Tests whether the dictionary entry at itemIndex (in the list for
   * charArray[0]) equals the rest of charArray — i.e. whether looking up
   * charArray would yield exactly itemIndex.
   * NOTE(review): if charArray[0] is not in the index, hashIndex is -1 and
   * this throws ArrayIndexOutOfBoundsException — presumably callers guarantee
   * presence; verify.
   *
   * @param charArray the word; position 0 selects the dictionary list
   * @param itemIndex position within that list
   * @return true if equal
   */
  public boolean isEqual(char[] charArray, int itemIndex) {
    short hashIndex = getWordItemTableIndex(charArray[0]);
    return Utility.compareArray(charArray, 1,
        wordItem_charArrayTable[wordIndexTable[hashIndex]][itemIndex], 0) == 0;
  }

  /** Ad-hoc manual test: loads from a hard-coded path and probes char types. */
  public static void main(String[] args) throws FileNotFoundException,
      IOException {
    CopyOfWordDictionary dic = new CopyOfWordDictionary();
    dic.load("D:/analysis-data");
    Utility.getCharType('。');
    Utility.getCharType('汗');
    Utility.getCharType(' ');// 0020
    Utility.getCharType(' ');// 3000
    Utility.getCharType('');// E095
    Utility.getCharType(' ');// 3000
    Utility.getCharType('\r');// 000D
    Utility.getCharType('\n');// 000A
    Utility.getCharType('\t');// 0009
  }
}