mirror of
https://github.com/apache/lucene.git
synced 2025-03-04 07:19:18 +00:00
LUCENE-1257: port smartchineseanalyzer to java 5
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@831121 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
de0aaadb81
commit
19e55ea991
@ -60,7 +60,7 @@ import org.apache.lucene.util.Version;
|
||||
*/
|
||||
public class SmartChineseAnalyzer extends Analyzer {
|
||||
|
||||
private final Set stopWords;
|
||||
private final Set<?> stopWords;
|
||||
|
||||
private static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||
|
||||
|
@ -17,7 +17,7 @@
|
||||
|
||||
package org.apache.lucene.analysis.cn.smart;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter;
|
||||
@ -45,18 +45,19 @@ class WordSegmenter {
|
||||
* @param startOffset start offset of sentence
|
||||
* @return {@link List} of {@link SegToken}
|
||||
*/
|
||||
public List segmentSentence(String sentence, int startOffset) {
|
||||
|
||||
List segTokenList = hhmmSegmenter.process(sentence);
|
||||
|
||||
List result = new ArrayList();
|
||||
public List<SegToken> segmentSentence(String sentence, int startOffset) {
|
||||
|
||||
List<SegToken> segTokenList = hhmmSegmenter.process(sentence);
|
||||
// tokens from sentence, excluding WordType.SENTENCE_BEGIN and WordType.SENTENCE_END
|
||||
for (int i = 1; i < segTokenList.size() - 1; i++) {
|
||||
result.add(convertSegToken((SegToken) segTokenList.get(i), sentence, startOffset));
|
||||
}
|
||||
List<SegToken> result = Collections.emptyList();
|
||||
|
||||
if (segTokenList.size() > 2) // if its not an empty sentence
|
||||
result = segTokenList.subList(1, segTokenList.size() - 1);
|
||||
|
||||
for (SegToken st : result)
|
||||
convertSegToken(st, sentence, startOffset);
|
||||
|
||||
return result;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -40,9 +40,9 @@ public final class WordTokenFilter extends TokenFilter {
|
||||
|
||||
private WordSegmenter wordSegmenter;
|
||||
|
||||
private Iterator tokenIter;
|
||||
private Iterator<SegToken> tokenIter;
|
||||
|
||||
private List tokenBuffer;
|
||||
private List<SegToken> tokenBuffer;
|
||||
|
||||
private TermAttribute termAtt;
|
||||
private OffsetAttribute offsetAtt;
|
||||
@ -81,7 +81,7 @@ public final class WordTokenFilter extends TokenFilter {
|
||||
// WordTokenFilter must clear attributes, as it is creating new tokens.
|
||||
clearAttributes();
|
||||
// There are remaining tokens from the current sentence, return the next one.
|
||||
SegToken nextWord = (SegToken) tokenIter.next();
|
||||
SegToken nextWord = tokenIter.next();
|
||||
termAtt.setTermBuffer(nextWord.charArray, 0, nextWord.charArray.length);
|
||||
offsetAtt.setOffset(nextWord.startOffset, nextWord.endOffset);
|
||||
typeAtt.setType("word");
|
||||
|
@ -20,7 +20,6 @@ package org.apache.lucene.analysis.cn.smart.hhmm;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
@ -39,9 +38,9 @@ import org.apache.lucene.analysis.cn.smart.Utility;
|
||||
*/
|
||||
class BiSegGraph {
|
||||
|
||||
private Map tokenPairListTable = new HashMap();
|
||||
private Map<Integer,ArrayList<SegTokenPair>> tokenPairListTable = new HashMap<Integer,ArrayList<SegTokenPair>>();
|
||||
|
||||
private List segTokenList;
|
||||
private List<SegToken> segTokenList;
|
||||
|
||||
private static BigramDictionary bigramDict = BigramDictionary.getInstance();
|
||||
|
||||
@ -65,15 +64,14 @@ class BiSegGraph {
|
||||
segTokenList = segGraph.makeIndex();
|
||||
// Because the beginning position of startToken is -1, therefore startToken can be obtained when key = -1
|
||||
int key = -1;
|
||||
List nextTokens = null;
|
||||
List<SegToken> nextTokens = null;
|
||||
while (key < maxStart) {
|
||||
if (segGraph.isStartExist(key)) {
|
||||
|
||||
List tokenList = segGraph.getStartList(key);
|
||||
List<SegToken> tokenList = segGraph.getStartList(key);
|
||||
|
||||
// Calculate all tokens for a given key.
|
||||
for (Iterator iter = tokenList.iterator(); iter.hasNext();) {
|
||||
SegToken t1 = (SegToken) iter.next();
|
||||
for (SegToken t1 : tokenList) {
|
||||
oneWordFreq = t1.weight;
|
||||
next = t1.endOffset;
|
||||
nextTokens = null;
|
||||
@ -91,8 +89,7 @@ class BiSegGraph {
|
||||
if (nextTokens == null) {
|
||||
break;
|
||||
}
|
||||
for (Iterator iter2 = nextTokens.iterator(); iter2.hasNext();) {
|
||||
SegToken t2 = (SegToken) iter2.next();
|
||||
for (SegToken t2 : nextTokens) {
|
||||
idBuffer = new char[t1.charArray.length + t2.charArray.length + 1];
|
||||
System.arraycopy(t1.charArray, 0, idBuffer, 0, t1.charArray.length);
|
||||
idBuffer[t1.charArray.length] = BigramDictionary.WORD_SEGMENT_CHAR;
|
||||
@ -139,8 +136,8 @@ class BiSegGraph {
|
||||
* @param to index of the second token in the token pair
|
||||
* @return {@link List} of token pairs.
|
||||
*/
|
||||
public List getToList(int to) {
|
||||
return (List) tokenPairListTable.get(Integer.valueOf(to));
|
||||
public List<SegTokenPair> getToList(int to) {
|
||||
return tokenPairListTable.get(to);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -151,11 +148,11 @@ class BiSegGraph {
|
||||
public void addSegTokenPair(SegTokenPair tokenPair) {
|
||||
int to = tokenPair.to;
|
||||
if (!isToExist(to)) {
|
||||
ArrayList newlist = new ArrayList();
|
||||
ArrayList<SegTokenPair> newlist = new ArrayList<SegTokenPair>();
|
||||
newlist.add(tokenPair);
|
||||
tokenPairListTable.put(Integer.valueOf(to), newlist);
|
||||
tokenPairListTable.put(to, newlist);
|
||||
} else {
|
||||
List tokenPairList = (List) tokenPairListTable.get(Integer.valueOf(to));
|
||||
List<SegTokenPair> tokenPairList = tokenPairListTable.get(to);
|
||||
tokenPairList.add(tokenPair);
|
||||
}
|
||||
}
|
||||
@ -172,24 +169,23 @@ class BiSegGraph {
|
||||
* Find the shortest path with the Viterbi algorithm.
|
||||
* @return {@link List}
|
||||
*/
|
||||
public List getShortPath() {
|
||||
public List<SegToken> getShortPath() {
|
||||
int current;
|
||||
int nodeCount = getToCount();
|
||||
List path = new ArrayList();
|
||||
List<PathNode> path = new ArrayList<PathNode>();
|
||||
PathNode zeroPath = new PathNode();
|
||||
zeroPath.weight = 0;
|
||||
zeroPath.preNode = 0;
|
||||
path.add(zeroPath);
|
||||
for (current = 1; current <= nodeCount; current++) {
|
||||
double weight;
|
||||
List edges = getToList(current);
|
||||
List<SegTokenPair> edges = getToList(current);
|
||||
|
||||
double minWeight = Double.MAX_VALUE;
|
||||
SegTokenPair minEdge = null;
|
||||
for (Iterator iter1 = edges.iterator(); iter1.hasNext();) {
|
||||
SegTokenPair edge = (SegTokenPair) iter1.next();
|
||||
for (SegTokenPair edge : edges) {
|
||||
weight = edge.weight;
|
||||
PathNode preNode = (PathNode) path.get(edge.from);
|
||||
PathNode preNode = path.get(edge.from);
|
||||
if (preNode.weight + weight < minWeight) {
|
||||
minWeight = preNode.weight + weight;
|
||||
minEdge = edge;
|
||||
@ -205,10 +201,10 @@ class BiSegGraph {
|
||||
int preNode, lastNode;
|
||||
lastNode = path.size() - 1;
|
||||
current = lastNode;
|
||||
List rpath = new ArrayList();
|
||||
List resultPath = new ArrayList();
|
||||
List<Integer> rpath = new ArrayList<Integer>();
|
||||
List<SegToken> resultPath = new ArrayList<SegToken>();
|
||||
|
||||
rpath.add(Integer.valueOf(current));
|
||||
rpath.add(current);
|
||||
while (current != 0) {
|
||||
PathNode currentPathNode = (PathNode) path.get(current);
|
||||
preNode = currentPathNode.preNode;
|
||||
@ -218,7 +214,7 @@ class BiSegGraph {
|
||||
for (int j = rpath.size() - 1; j >= 0; j--) {
|
||||
Integer idInteger = (Integer) rpath.get(j);
|
||||
int id = idInteger.intValue();
|
||||
SegToken t = (SegToken) segTokenList.get(id);
|
||||
SegToken t = segTokenList.get(id);
|
||||
resultPath.add(t);
|
||||
}
|
||||
return resultPath;
|
||||
@ -227,11 +223,9 @@ class BiSegGraph {
|
||||
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
Collection values = tokenPairListTable.values();
|
||||
for (Iterator iter1 = values.iterator(); iter1.hasNext();) {
|
||||
List segList = (List) iter1.next();
|
||||
for (Iterator iter2 = segList.iterator(); iter2.hasNext();) {
|
||||
SegTokenPair pair = (SegTokenPair) iter2.next();
|
||||
Collection<ArrayList<SegTokenPair>> values = tokenPairListTable.values();
|
||||
for (ArrayList<SegTokenPair> segList : values) {
|
||||
for (SegTokenPair pair : segList) {
|
||||
sb.append(pair + "\n");
|
||||
}
|
||||
}
|
||||
|
@ -22,7 +22,7 @@ import java.util.List;
|
||||
import org.apache.lucene.analysis.cn.smart.CharType;
|
||||
import org.apache.lucene.analysis.cn.smart.Utility;
|
||||
import org.apache.lucene.analysis.cn.smart.WordType;
|
||||
import org.apache.lucene.analysis.cn.smart.hhmm.PathNode;//javadoc @link
|
||||
import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;//javadoc @link
|
||||
|
||||
/**
|
||||
* Finds the optimal segmentation of a sentence into Chinese words
|
||||
@ -196,14 +196,14 @@ public class HHMMSegmenter {
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a list of {@link PathNode} representing the best segmentation of a sentence
|
||||
* Return a list of {@link SegToken} representing the best segmentation of a sentence
|
||||
* @param sentence input sentence
|
||||
* @return best segmentation as a {@link List}
|
||||
*/
|
||||
public List process(String sentence) {
|
||||
public List<SegToken> process(String sentence) {
|
||||
SegGraph segGraph = createSegGraph(sentence);
|
||||
BiSegGraph biSegGraph = new BiSegGraph(segGraph);
|
||||
List shortPath = biSegGraph.getShortPath();
|
||||
List<SegToken> shortPath = biSegGraph.getShortPath();
|
||||
return shortPath;
|
||||
}
|
||||
}
|
||||
|
@ -28,13 +28,12 @@ package org.apache.lucene.analysis.cn.smart.hhmm;
|
||||
* supported anymore in such a case.</font>
|
||||
* </p>
|
||||
*/
|
||||
class PathNode implements Comparable {
|
||||
class PathNode implements Comparable<PathNode> {
|
||||
public double weight;
|
||||
|
||||
public int preNode;
|
||||
|
||||
public int compareTo(Object p) {
|
||||
PathNode pn = (PathNode) p;
|
||||
public int compareTo(PathNode pn) {
|
||||
if (weight < pn.weight)
|
||||
return -1;
|
||||
else if (weight == pn.weight)
|
||||
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.cn.smart.hhmm;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
@ -39,7 +38,7 @@ class SegGraph {
|
||||
/**
|
||||
* Map of start offsets to ArrayList of tokens at that position
|
||||
*/
|
||||
private Map /* <Integer, ArrayList<SegToken>> */ tokenListTable = new HashMap();
|
||||
private Map<Integer,ArrayList<SegToken>> tokenListTable = new HashMap<Integer,ArrayList<SegToken>>();
|
||||
|
||||
private int maxStart = -1;
|
||||
|
||||
@ -50,7 +49,7 @@ class SegGraph {
|
||||
* @return true if there are tokens for the startOffset
|
||||
*/
|
||||
public boolean isStartExist(int s) {
|
||||
return tokenListTable.get(Integer.valueOf(s)) != null;
|
||||
return tokenListTable.get(s) != null;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -59,8 +58,8 @@ class SegGraph {
|
||||
* @param s startOffset
|
||||
* @return List of tokens at the specified start offset.
|
||||
*/
|
||||
public List getStartList(int s) {
|
||||
return (List) tokenListTable.get(Integer.valueOf(s));
|
||||
public List<SegToken> getStartList(int s) {
|
||||
return tokenListTable.get(s);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -76,16 +75,15 @@ class SegGraph {
|
||||
* Set the {@link SegToken#index} for each token, based upon its order by startOffset.
|
||||
* @return a {@link List} of these ordered tokens.
|
||||
*/
|
||||
public List makeIndex() {
|
||||
List result = new ArrayList();
|
||||
public List<SegToken> makeIndex() {
|
||||
List<SegToken> result = new ArrayList<SegToken>();
|
||||
int s = -1, count = 0, size = tokenListTable.size();
|
||||
List tokenList;
|
||||
List<SegToken> tokenList;
|
||||
short index = 0;
|
||||
while (count < size) {
|
||||
if (isStartExist(s)) {
|
||||
tokenList = (List) tokenListTable.get(Integer.valueOf(s));
|
||||
for (Iterator iter = tokenList.iterator(); iter.hasNext();) {
|
||||
SegToken st = (SegToken) iter.next();
|
||||
tokenList = tokenListTable.get(s);
|
||||
for (SegToken st : tokenList) {
|
||||
st.index = index;
|
||||
result.add(st);
|
||||
index++;
|
||||
@ -104,11 +102,11 @@ class SegGraph {
|
||||
public void addToken(SegToken token) {
|
||||
int s = token.startOffset;
|
||||
if (!isStartExist(s)) {
|
||||
ArrayList newlist = new ArrayList();
|
||||
ArrayList<SegToken> newlist = new ArrayList<SegToken>();
|
||||
newlist.add(token);
|
||||
tokenListTable.put((Object) (Integer.valueOf(s)), newlist);
|
||||
tokenListTable.put(s, newlist);
|
||||
} else {
|
||||
List tokenList = (List) tokenListTable.get((Object) (Integer.valueOf(s)));
|
||||
List<SegToken> tokenList = tokenListTable.get(s);
|
||||
tokenList.add(token);
|
||||
}
|
||||
if (s > maxStart)
|
||||
@ -120,16 +118,15 @@ class SegGraph {
|
||||
*
|
||||
* @return {@link List} of all tokens in the map.
|
||||
*/
|
||||
public List toTokenList() {
|
||||
List result = new ArrayList();
|
||||
public List<SegToken> toTokenList() {
|
||||
List<SegToken> result = new ArrayList<SegToken>();
|
||||
int s = -1, count = 0, size = tokenListTable.size();
|
||||
List tokenList;
|
||||
List<SegToken> tokenList;
|
||||
|
||||
while (count < size) {
|
||||
if (isStartExist(s)) {
|
||||
tokenList = (List) tokenListTable.get(Integer.valueOf(s));
|
||||
for (Iterator iter = tokenList.iterator(); iter.hasNext();) {
|
||||
SegToken st = (SegToken) iter.next();
|
||||
tokenList = tokenListTable.get(s);
|
||||
for (SegToken st : tokenList) {
|
||||
result.add(st);
|
||||
}
|
||||
count++;
|
||||
@ -140,10 +137,9 @@ class SegGraph {
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
List tokenList = this.toTokenList();
|
||||
List<SegToken> tokenList = this.toTokenList();
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (Iterator iter = tokenList.iterator(); iter.hasNext();) {
|
||||
SegToken t = (SegToken) iter.next();
|
||||
for (SegToken t : tokenList) {
|
||||
sb.append(t + "\n");
|
||||
}
|
||||
return sb.toString();
|
||||
|
Loading…
x
Reference in New Issue
Block a user