LUCENE-1257: port smartchineseanalyzer to java 5

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@831121 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2009-10-29 22:29:50 +00:00
parent de0aaadb81
commit 19e55ea991
7 changed files with 63 additions and 73 deletions

View File

@ -60,7 +60,7 @@ import org.apache.lucene.util.Version;
*/
public class SmartChineseAnalyzer extends Analyzer {
private final Set stopWords;
private final Set<?> stopWords;
private static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";

View File

@ -17,7 +17,7 @@
package org.apache.lucene.analysis.cn.smart;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter;
@ -45,18 +45,19 @@ class WordSegmenter {
* @param startOffset start offset of sentence
* @return {@link List} of {@link SegToken}
*/
public List segmentSentence(String sentence, int startOffset) {
List segTokenList = hhmmSegmenter.process(sentence);
List result = new ArrayList();
public List<SegToken> segmentSentence(String sentence, int startOffset) {
List<SegToken> segTokenList = hhmmSegmenter.process(sentence);
// tokens from sentence, excluding WordType.SENTENCE_BEGIN and WordType.SENTENCE_END
for (int i = 1; i < segTokenList.size() - 1; i++) {
result.add(convertSegToken((SegToken) segTokenList.get(i), sentence, startOffset));
}
return result;
List<SegToken> result = Collections.emptyList();
if (segTokenList.size() > 2) // if its not an empty sentence
result = segTokenList.subList(1, segTokenList.size() - 1);
for (SegToken st : result)
convertSegToken(st, sentence, startOffset);
return result;
}
/**

View File

@ -40,9 +40,9 @@ public final class WordTokenFilter extends TokenFilter {
private WordSegmenter wordSegmenter;
private Iterator tokenIter;
private Iterator<SegToken> tokenIter;
private List tokenBuffer;
private List<SegToken> tokenBuffer;
private TermAttribute termAtt;
private OffsetAttribute offsetAtt;
@ -81,7 +81,7 @@ public final class WordTokenFilter extends TokenFilter {
// WordTokenFilter must clear attributes, as it is creating new tokens.
clearAttributes();
// There are remaining tokens from the current sentence, return the next one.
SegToken nextWord = (SegToken) tokenIter.next();
SegToken nextWord = tokenIter.next();
termAtt.setTermBuffer(nextWord.charArray, 0, nextWord.charArray.length);
offsetAtt.setOffset(nextWord.startOffset, nextWord.endOffset);
typeAtt.setType("word");

View File

@ -20,7 +20,6 @@ package org.apache.lucene.analysis.cn.smart.hhmm;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@ -39,9 +38,9 @@ import org.apache.lucene.analysis.cn.smart.Utility;
*/
class BiSegGraph {
private Map tokenPairListTable = new HashMap();
private Map<Integer,ArrayList<SegTokenPair>> tokenPairListTable = new HashMap<Integer,ArrayList<SegTokenPair>>();
private List segTokenList;
private List<SegToken> segTokenList;
private static BigramDictionary bigramDict = BigramDictionary.getInstance();
@ -65,15 +64,14 @@ class BiSegGraph {
segTokenList = segGraph.makeIndex();
// Because the beginning position of startToken is -1, therefore startToken can be obtained when key = -1
int key = -1;
List nextTokens = null;
List<SegToken> nextTokens = null;
while (key < maxStart) {
if (segGraph.isStartExist(key)) {
List tokenList = segGraph.getStartList(key);
List<SegToken> tokenList = segGraph.getStartList(key);
// Calculate all tokens for a given key.
for (Iterator iter = tokenList.iterator(); iter.hasNext();) {
SegToken t1 = (SegToken) iter.next();
for (SegToken t1 : tokenList) {
oneWordFreq = t1.weight;
next = t1.endOffset;
nextTokens = null;
@ -91,8 +89,7 @@ class BiSegGraph {
if (nextTokens == null) {
break;
}
for (Iterator iter2 = nextTokens.iterator(); iter2.hasNext();) {
SegToken t2 = (SegToken) iter2.next();
for (SegToken t2 : nextTokens) {
idBuffer = new char[t1.charArray.length + t2.charArray.length + 1];
System.arraycopy(t1.charArray, 0, idBuffer, 0, t1.charArray.length);
idBuffer[t1.charArray.length] = BigramDictionary.WORD_SEGMENT_CHAR;
@ -139,8 +136,8 @@ class BiSegGraph {
* @param to index of the second token in the token pair
* @return {@link List} of token pairs.
*/
public List getToList(int to) {
return (List) tokenPairListTable.get(Integer.valueOf(to));
public List<SegTokenPair> getToList(int to) {
return tokenPairListTable.get(to);
}
/**
@ -151,11 +148,11 @@ class BiSegGraph {
public void addSegTokenPair(SegTokenPair tokenPair) {
int to = tokenPair.to;
if (!isToExist(to)) {
ArrayList newlist = new ArrayList();
ArrayList<SegTokenPair> newlist = new ArrayList<SegTokenPair>();
newlist.add(tokenPair);
tokenPairListTable.put(Integer.valueOf(to), newlist);
tokenPairListTable.put(to, newlist);
} else {
List tokenPairList = (List) tokenPairListTable.get(Integer.valueOf(to));
List<SegTokenPair> tokenPairList = tokenPairListTable.get(to);
tokenPairList.add(tokenPair);
}
}
@ -172,24 +169,23 @@ class BiSegGraph {
* Find the shortest path with the Viterbi algorithm.
* @return {@link List}
*/
public List getShortPath() {
public List<SegToken> getShortPath() {
int current;
int nodeCount = getToCount();
List path = new ArrayList();
List<PathNode> path = new ArrayList<PathNode>();
PathNode zeroPath = new PathNode();
zeroPath.weight = 0;
zeroPath.preNode = 0;
path.add(zeroPath);
for (current = 1; current <= nodeCount; current++) {
double weight;
List edges = getToList(current);
List<SegTokenPair> edges = getToList(current);
double minWeight = Double.MAX_VALUE;
SegTokenPair minEdge = null;
for (Iterator iter1 = edges.iterator(); iter1.hasNext();) {
SegTokenPair edge = (SegTokenPair) iter1.next();
for (SegTokenPair edge : edges) {
weight = edge.weight;
PathNode preNode = (PathNode) path.get(edge.from);
PathNode preNode = path.get(edge.from);
if (preNode.weight + weight < minWeight) {
minWeight = preNode.weight + weight;
minEdge = edge;
@ -205,10 +201,10 @@ class BiSegGraph {
int preNode, lastNode;
lastNode = path.size() - 1;
current = lastNode;
List rpath = new ArrayList();
List resultPath = new ArrayList();
List<Integer> rpath = new ArrayList<Integer>();
List<SegToken> resultPath = new ArrayList<SegToken>();
rpath.add(Integer.valueOf(current));
rpath.add(current);
while (current != 0) {
PathNode currentPathNode = (PathNode) path.get(current);
preNode = currentPathNode.preNode;
@ -218,7 +214,7 @@ class BiSegGraph {
for (int j = rpath.size() - 1; j >= 0; j--) {
Integer idInteger = (Integer) rpath.get(j);
int id = idInteger.intValue();
SegToken t = (SegToken) segTokenList.get(id);
SegToken t = segTokenList.get(id);
resultPath.add(t);
}
return resultPath;
@ -227,11 +223,9 @@ class BiSegGraph {
public String toString() {
StringBuilder sb = new StringBuilder();
Collection values = tokenPairListTable.values();
for (Iterator iter1 = values.iterator(); iter1.hasNext();) {
List segList = (List) iter1.next();
for (Iterator iter2 = segList.iterator(); iter2.hasNext();) {
SegTokenPair pair = (SegTokenPair) iter2.next();
Collection<ArrayList<SegTokenPair>> values = tokenPairListTable.values();
for (ArrayList<SegTokenPair> segList : values) {
for (SegTokenPair pair : segList) {
sb.append(pair + "\n");
}
}

View File

@ -22,7 +22,7 @@ import java.util.List;
import org.apache.lucene.analysis.cn.smart.CharType;
import org.apache.lucene.analysis.cn.smart.Utility;
import org.apache.lucene.analysis.cn.smart.WordType;
import org.apache.lucene.analysis.cn.smart.hhmm.PathNode;//javadoc @link
import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;//javadoc @link
/**
* Finds the optimal segmentation of a sentence into Chinese words
@ -196,14 +196,14 @@ public class HHMMSegmenter {
}
/**
* Return a list of {@link PathNode} representing the best segmentation of a sentence
* Return a list of {@link SegToken} representing the best segmentation of a sentence
* @param sentence input sentence
* @return best segmentation as a {@link List}
*/
public List process(String sentence) {
public List<SegToken> process(String sentence) {
SegGraph segGraph = createSegGraph(sentence);
BiSegGraph biSegGraph = new BiSegGraph(segGraph);
List shortPath = biSegGraph.getShortPath();
List<SegToken> shortPath = biSegGraph.getShortPath();
return shortPath;
}
}

View File

@ -28,13 +28,12 @@ package org.apache.lucene.analysis.cn.smart.hhmm;
* supported anymore in such a case.</font>
* </p>
*/
class PathNode implements Comparable {
class PathNode implements Comparable<PathNode> {
public double weight;
public int preNode;
public int compareTo(Object p) {
PathNode pn = (PathNode) p;
public int compareTo(PathNode pn) {
if (weight < pn.weight)
return -1;
else if (weight == pn.weight)

View File

@ -19,7 +19,6 @@ package org.apache.lucene.analysis.cn.smart.hhmm;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@ -39,7 +38,7 @@ class SegGraph {
/**
* Map of start offsets to ArrayList of tokens at that position
*/
private Map /* <Integer, ArrayList<SegToken>> */ tokenListTable = new HashMap();
private Map<Integer,ArrayList<SegToken>> tokenListTable = new HashMap<Integer,ArrayList<SegToken>>();
private int maxStart = -1;
@ -50,7 +49,7 @@ class SegGraph {
* @return true if there are tokens for the startOffset
*/
public boolean isStartExist(int s) {
return tokenListTable.get(Integer.valueOf(s)) != null;
return tokenListTable.get(s) != null;
}
/**
@ -59,8 +58,8 @@ class SegGraph {
* @param s startOffset
* @return List of tokens at the specified start offset.
*/
public List getStartList(int s) {
return (List) tokenListTable.get(Integer.valueOf(s));
public List<SegToken> getStartList(int s) {
return tokenListTable.get(s);
}
/**
@ -76,16 +75,15 @@ class SegGraph {
* Set the {@link SegToken#index} for each token, based upon its order by startOffset.
* @return a {@link List} of these ordered tokens.
*/
public List makeIndex() {
List result = new ArrayList();
public List<SegToken> makeIndex() {
List<SegToken> result = new ArrayList<SegToken>();
int s = -1, count = 0, size = tokenListTable.size();
List tokenList;
List<SegToken> tokenList;
short index = 0;
while (count < size) {
if (isStartExist(s)) {
tokenList = (List) tokenListTable.get(Integer.valueOf(s));
for (Iterator iter = tokenList.iterator(); iter.hasNext();) {
SegToken st = (SegToken) iter.next();
tokenList = tokenListTable.get(s);
for (SegToken st : tokenList) {
st.index = index;
result.add(st);
index++;
@ -104,11 +102,11 @@ class SegGraph {
public void addToken(SegToken token) {
int s = token.startOffset;
if (!isStartExist(s)) {
ArrayList newlist = new ArrayList();
ArrayList<SegToken> newlist = new ArrayList<SegToken>();
newlist.add(token);
tokenListTable.put((Object) (Integer.valueOf(s)), newlist);
tokenListTable.put(s, newlist);
} else {
List tokenList = (List) tokenListTable.get((Object) (Integer.valueOf(s)));
List<SegToken> tokenList = tokenListTable.get(s);
tokenList.add(token);
}
if (s > maxStart)
@ -120,16 +118,15 @@ class SegGraph {
*
* @return {@link List} of all tokens in the map.
*/
public List toTokenList() {
List result = new ArrayList();
public List<SegToken> toTokenList() {
List<SegToken> result = new ArrayList<SegToken>();
int s = -1, count = 0, size = tokenListTable.size();
List tokenList;
List<SegToken> tokenList;
while (count < size) {
if (isStartExist(s)) {
tokenList = (List) tokenListTable.get(Integer.valueOf(s));
for (Iterator iter = tokenList.iterator(); iter.hasNext();) {
SegToken st = (SegToken) iter.next();
tokenList = tokenListTable.get(s);
for (SegToken st : tokenList) {
result.add(st);
}
count++;
@ -140,10 +137,9 @@ class SegGraph {
}
public String toString() {
List tokenList = this.toTokenList();
List<SegToken> tokenList = this.toTokenList();
StringBuilder sb = new StringBuilder();
for (Iterator iter = tokenList.iterator(); iter.hasNext();) {
SegToken t = (SegToken) iter.next();
for (SegToken t : tokenList) {
sb.append(t + "\n");
}
return sb.toString();