mirror of https://github.com/apache/lucene.git
LUCENE-10493: factor out Viterbi algorithm and share it between kuromoji and nori (#805)
This commit is contained in:
parent
2a4c21bb58
commit
c89f8a7ea1
|
@ -60,6 +60,8 @@ Other
|
|||
All classes in `org.apache.lucene.analysis.[ja|ko].util` were moved to `org.apache.lucene.analysis.[ja|ko].dict`.
|
||||
(Tomoko Uchida)
|
||||
|
||||
* LUCENE-10493: Factor out Viterbi algorithm in Kuromoji and Nori to analysis-common. (Tomoko Uchida)
|
||||
|
||||
======================= Lucene 9.2.0 =======================
|
||||
|
||||
API Changes
|
||||
|
|
|
@ -14,22 +14,16 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ko;
|
||||
package org.apache.lucene.analysis.morph;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import org.apache.lucene.analysis.ko.KoreanTokenizer.Position;
|
||||
import org.apache.lucene.analysis.ko.KoreanTokenizer.WrappedPositionArray;
|
||||
import org.apache.lucene.analysis.ko.dict.ConnectionCosts;
|
||||
import org.apache.lucene.analysis.ko.dict.KoMorphData;
|
||||
import org.apache.lucene.analysis.morph.Dictionary;
|
||||
|
||||
// TODO: would be nice to show 2nd best path in a diff't
|
||||
// color...
|
||||
|
||||
/** Outputs the dot (graphviz) string for the viterbi lattice. */
|
||||
public class GraphvizFormatter {
|
||||
|
||||
public class GraphvizFormatter<T extends MorphData> {
|
||||
private static final String BOS_LABEL = "BOS";
|
||||
|
||||
private static final String EOS_LABEL = "EOS";
|
||||
|
@ -56,36 +50,39 @@ public class GraphvizFormatter {
|
|||
}
|
||||
|
||||
// Backtraces another incremental fragment:
|
||||
void onBacktrace(
|
||||
KoreanTokenizer tok,
|
||||
WrappedPositionArray positions,
|
||||
public void onBacktrace(
|
||||
DictionaryProvider<T> dictProvider,
|
||||
Viterbi.WrappedPositionArray<? extends Viterbi.Position> positions,
|
||||
int lastBackTracePos,
|
||||
Position endPosData,
|
||||
Viterbi.Position endPosData,
|
||||
int fromIDX,
|
||||
char[] fragment,
|
||||
boolean isEnd) {
|
||||
setBestPathMap(positions, lastBackTracePos, endPosData, fromIDX);
|
||||
sb.append(formatNodes(tok, positions, lastBackTracePos, endPosData, fragment));
|
||||
sb.append(formatNodes(dictProvider, positions, lastBackTracePos, endPosData, fragment));
|
||||
if (isEnd) {
|
||||
sb.append(" fini [style=invis]\n");
|
||||
sb.append(" ");
|
||||
sb.append(getNodeID(endPosData.pos, fromIDX));
|
||||
sb.append(getNodeID(endPosData.getPos(), fromIDX));
|
||||
sb.append(" -> fini [label=\"" + EOS_LABEL + "\"]");
|
||||
}
|
||||
}
|
||||
|
||||
// Records which arcs make up the best path:
|
||||
private void setBestPathMap(
|
||||
WrappedPositionArray positions, int startPos, Position endPosData, int fromIDX) {
|
||||
Viterbi.WrappedPositionArray<? extends Viterbi.Position> positions,
|
||||
int startPos,
|
||||
Viterbi.Position endPosData,
|
||||
int fromIDX) {
|
||||
bestPathMap.clear();
|
||||
|
||||
int pos = endPosData.pos;
|
||||
int pos = endPosData.getPos();
|
||||
int bestIDX = fromIDX;
|
||||
while (pos > startPos) {
|
||||
final Position posData = positions.get(pos);
|
||||
final Viterbi.Position posData = positions.get(pos);
|
||||
|
||||
final int backPos = posData.backPos[bestIDX];
|
||||
final int backIDX = posData.backIndex[bestIDX];
|
||||
final int backPos = posData.getBackPos(bestIDX);
|
||||
final int backIDX = posData.getBackIndex(bestIDX);
|
||||
|
||||
final String toNodeID = getNodeID(pos, bestIDX);
|
||||
final String fromNodeID = getNodeID(backPos, backIDX);
|
||||
|
@ -99,34 +96,34 @@ public class GraphvizFormatter {
|
|||
}
|
||||
|
||||
private String formatNodes(
|
||||
KoreanTokenizer tok,
|
||||
WrappedPositionArray positions,
|
||||
DictionaryProvider<T> dictProvider,
|
||||
Viterbi.WrappedPositionArray<? extends Viterbi.Position> positions,
|
||||
int startPos,
|
||||
Position endPosData,
|
||||
Viterbi.Position endPosData,
|
||||
char[] fragment) {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
// Output nodes
|
||||
for (int pos = startPos + 1; pos <= endPosData.pos; pos++) {
|
||||
final Position posData = positions.get(pos);
|
||||
for (int idx = 0; idx < posData.count; idx++) {
|
||||
for (int pos = startPos + 1; pos <= endPosData.getPos(); pos++) {
|
||||
final Viterbi.Position posData = positions.get(pos);
|
||||
for (int idx = 0; idx < posData.getCount(); idx++) {
|
||||
sb.append(" ");
|
||||
sb.append(getNodeID(pos, idx));
|
||||
sb.append(" [label=\"");
|
||||
sb.append(pos);
|
||||
sb.append(": ");
|
||||
sb.append(posData.lastRightID[idx]);
|
||||
sb.append(posData.getLastRightID(idx));
|
||||
sb.append("\"]\n");
|
||||
}
|
||||
}
|
||||
|
||||
// Output arcs
|
||||
for (int pos = endPosData.pos; pos > startPos; pos--) {
|
||||
final Position posData = positions.get(pos);
|
||||
for (int idx = 0; idx < posData.count; idx++) {
|
||||
final Position backPosData = positions.get(posData.backPos[idx]);
|
||||
for (int pos = endPosData.getPos(); pos > startPos; pos--) {
|
||||
final Viterbi.Position posData = positions.get(pos);
|
||||
for (int idx = 0; idx < posData.getCount(); idx++) {
|
||||
final Viterbi.Position backPosData = positions.get(posData.getBackPos(idx));
|
||||
final String toNodeID = getNodeID(pos, idx);
|
||||
final String fromNodeID = getNodeID(posData.backPos[idx], posData.backIndex[idx]);
|
||||
final String fromNodeID = getNodeID(posData.getBackPos(idx), posData.getBackIndex(idx));
|
||||
|
||||
sb.append(" ");
|
||||
sb.append(fromNodeID);
|
||||
|
@ -141,15 +138,15 @@ public class GraphvizFormatter {
|
|||
attrs = "";
|
||||
}
|
||||
|
||||
final Dictionary<? extends KoMorphData> dict = tok.getDict(posData.backType[idx]);
|
||||
final int wordCost = dict.getWordCost(posData.backID[idx]);
|
||||
final Dictionary<? extends T> dict = dictProvider.get(posData.getBackType(idx));
|
||||
final int wordCost = dict.getWordCost(posData.getBackID(idx));
|
||||
final int bgCost =
|
||||
costs.get(
|
||||
backPosData.lastRightID[posData.backIndex[idx]],
|
||||
dict.getLeftId(posData.backID[idx]));
|
||||
backPosData.getLastRightID(posData.getBackIndex(idx)),
|
||||
dict.getLeftId(posData.getBackID(idx)));
|
||||
|
||||
final String surfaceForm =
|
||||
new String(fragment, posData.backPos[idx] - startPos, pos - posData.backPos[idx]);
|
||||
new String(fragment, posData.getBackPos(idx) - startPos, pos - posData.getBackPos(idx));
|
||||
|
||||
sb.append(" [label=\"");
|
||||
sb.append(surfaceForm);
|
||||
|
@ -190,4 +187,10 @@ public class GraphvizFormatter {
|
|||
private String getNodeID(int pos, int idx) {
|
||||
return pos + "." + idx;
|
||||
}
|
||||
|
||||
/** Supplies the {@link Dictionary} to consult for a given {@link TokenType}. */
|
||||
@FunctionalInterface
|
||||
public interface DictionaryProvider<T extends MorphData> {
|
||||
Dictionary<? extends T> get(TokenType type);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,815 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.morph;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.lang.reflect.Array;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.analysis.util.RollingCharBuffer;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
|
||||
/**
|
||||
* Performs <a href="https://en.wikipedia.org/wiki/Viterbi_algorithm">Viterbi algorithm</a> for
|
||||
* morphological Tokenizers, which split texts by Hidden Markov Model or Conditional Random Fields.
|
||||
*
|
||||
* @param <T> output token class
|
||||
* @param <U> position class
|
||||
*/
|
||||
public abstract class Viterbi<T extends Token, U extends Viterbi.Position> {
|
||||
protected static final boolean VERBOSE = false;
|
||||
|
||||
// For safety:
|
||||
protected static final int MAX_UNKNOWN_WORD_LENGTH = 1024;
|
||||
private static final int MAX_BACKTRACE_GAP = 1024;
|
||||
|
||||
private final TokenInfoFST fst;
|
||||
private final BinaryDictionary<? extends MorphData> dictionary;
|
||||
private final Dictionary<? extends MorphData> userDictionary;
|
||||
protected final ConnectionCosts costs;
|
||||
|
||||
private final FST.Arc<Long> arc = new FST.Arc<>();
|
||||
private final FST.BytesReader fstReader;
|
||||
protected final IntsRef wordIdRef = new IntsRef();
|
||||
|
||||
private final FST.BytesReader userFSTReader;
|
||||
private final TokenInfoFST userFST;
|
||||
|
||||
protected final RollingCharBuffer buffer = new RollingCharBuffer();
|
||||
|
||||
protected final WrappedPositionArray<U> positions;
|
||||
|
||||
// True once we've hit the EOF from the input reader:
|
||||
protected boolean end;
|
||||
|
||||
// Last absolute position we backtraced from:
|
||||
protected int lastBackTracePos;
|
||||
|
||||
// Next absolute position to process:
|
||||
protected int pos;
|
||||
|
||||
// Already parsed, but not yet passed to caller, tokens:
|
||||
protected final List<T> pending = new ArrayList<>();
|
||||
|
||||
protected boolean outputNBest = false;
|
||||
|
||||
protected boolean enableSpacePenaltyFactor = false;
|
||||
|
||||
protected boolean outputLongestUserEntryOnly = false;
|
||||
|
||||
/**
 * Stores the dictionaries, FSTs and connection costs used by {@link #forward()} and creates the
 * position lattice.
 *
 * @param fst FST walked when looking up known-dictionary matches
 * @param fstReader bytes reader passed to {@code fst} lookups
 * @param dictionary dictionary that resolves FST outputs to word ids for known words
 * @param userFST FST walked for user-dictionary matches; {@link #forward()} skips the user
 *     dictionary lookup when this is {@code null}
 * @param userFSTReader bytes reader passed to {@code userFST} lookups
 * @param userDictionary dictionary supplying morph attributes for user-dictionary matches
 * @param costs connection costs between adjacent words
 * @param positionImpl concrete position class; instances are created reflectively through its
 *     no-arg constructor
 */
protected Viterbi(
    TokenInfoFST fst,
    FST.BytesReader fstReader,
    BinaryDictionary<? extends MorphData> dictionary,
    TokenInfoFST userFST,
    FST.BytesReader userFSTReader,
    Dictionary<? extends MorphData> userDictionary,
    ConnectionCosts costs,
    Class<U> positionImpl) {
  // System (known word) dictionary:
  this.dictionary = dictionary;
  this.fst = fst;
  this.fstReader = fstReader;

  // User dictionary:
  this.userDictionary = userDictionary;
  this.userFST = userFST;
  this.userFSTReader = userFSTReader;

  // Lattice state:
  this.costs = costs;
  this.positions = new WrappedPositionArray<>(positionImpl);
}
|
||||
|
||||
/**
|
||||
* Incrementally parse some more characters. This runs the viterbi search forwards "enough" so
|
||||
* that we generate some more tokens. How much forward depends on the chars coming in, since some
|
||||
* chars could cause longer-lasting ambiguity in the parsing. Once the ambiguity is resolved, then
|
||||
* we back trace, produce the pending tokens, and return.
|
||||
*/
|
||||
public final void forward() throws IOException {
|
||||
if (VERBOSE) {
|
||||
System.out.println("\nPARSE");
|
||||
}
|
||||
|
||||
// Index of the last character of unknown word:
|
||||
int unknownWordEndIndex = -1;
|
||||
|
||||
// Maximum posAhead of user word in the entire input
|
||||
int userWordMaxPosAhead = -1;
|
||||
|
||||
// Advances over each position (character):
|
||||
while (buffer.get(pos) != -1) {
|
||||
final Position posData = positions.get(pos);
|
||||
final boolean isFrontier = positions.getNextPos() == pos + 1;
|
||||
|
||||
if (posData.count == 0) {
|
||||
// No arcs arrive here; move to next position:
|
||||
if (VERBOSE) {
|
||||
System.out.println(" no arcs in; skip pos=" + pos);
|
||||
}
|
||||
pos++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (pos > lastBackTracePos && posData.count == 1 && isFrontier) {
|
||||
// We are at a "frontier", and only one node is
|
||||
// alive, so whatever the eventual best path is must
|
||||
// come through this node. So we can safely commit
|
||||
// to the prefix of the best path at this point:
|
||||
if (outputNBest) {
|
||||
backtraceNBest(posData, false);
|
||||
}
|
||||
backtrace(posData, 0);
|
||||
if (outputNBest) {
|
||||
fixupPendingList();
|
||||
}
|
||||
|
||||
// Re-base cost so we don't risk int overflow:
|
||||
posData.costs[0] = 0;
|
||||
if (pending.size() > 0) {
|
||||
return;
|
||||
} else {
|
||||
// This means the backtrace only produced
|
||||
// punctuation tokens, so we must keep parsing.
|
||||
}
|
||||
}
|
||||
|
||||
if (pos - lastBackTracePos >= MAX_BACKTRACE_GAP) {
|
||||
// Safety: if we've buffered too much, force a
|
||||
// backtrace now. We find the least-cost partial
|
||||
// path, across all paths, backtrace from it, and
|
||||
// then prune all others. Note that this, in
|
||||
// general, can produce the wrong result, if the
|
||||
// total best path did not in fact back trace
|
||||
// through this partial best path. But it's the
|
||||
// best we can do... (short of not having a
|
||||
// safety!).
|
||||
|
||||
// First pass: find least cost partial path so far,
|
||||
// including ending at future positions:
|
||||
int leastIDX = -1;
|
||||
int leastCost = Integer.MAX_VALUE;
|
||||
Position leastPosData = null;
|
||||
for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
|
||||
final Position posData2 = positions.get(pos2);
|
||||
for (int idx = 0; idx < posData2.count; idx++) {
|
||||
// System.out.println(" idx=" + idx + " cost=" + cost);
|
||||
final int cost = posData2.costs[idx];
|
||||
if (cost < leastCost) {
|
||||
leastCost = cost;
|
||||
leastIDX = idx;
|
||||
leastPosData = posData2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We will always have at least one live path:
|
||||
assert leastIDX != -1;
|
||||
|
||||
if (outputNBest) {
|
||||
backtraceNBest(leastPosData, false);
|
||||
}
|
||||
|
||||
// Second pass: prune all but the best path:
|
||||
for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
|
||||
final Position posData2 = positions.get(pos2);
|
||||
if (posData2 != leastPosData) {
|
||||
posData2.reset();
|
||||
} else {
|
||||
if (leastIDX != 0) {
|
||||
posData2.costs[0] = posData2.costs[leastIDX];
|
||||
posData2.lastRightID[0] = posData2.lastRightID[leastIDX];
|
||||
posData2.backPos[0] = posData2.backPos[leastIDX];
|
||||
posData2.backWordPos[0] = posData2.backWordPos[leastIDX];
|
||||
posData2.backIndex[0] = posData2.backIndex[leastIDX];
|
||||
posData2.backID[0] = posData2.backID[leastIDX];
|
||||
posData2.backType[0] = posData2.backType[leastIDX];
|
||||
}
|
||||
posData2.count = 1;
|
||||
}
|
||||
}
|
||||
|
||||
backtrace(leastPosData, 0);
|
||||
if (outputNBest) {
|
||||
fixupPendingList();
|
||||
}
|
||||
|
||||
// Re-base cost so we don't risk int overflow:
|
||||
Arrays.fill(leastPosData.costs, 0, leastPosData.count, 0);
|
||||
|
||||
if (pos != leastPosData.pos) {
|
||||
// We jumped into a future position:
|
||||
assert pos < leastPosData.pos;
|
||||
pos = leastPosData.pos;
|
||||
}
|
||||
if (pending.size() > 0) {
|
||||
return;
|
||||
} else {
|
||||
// This means the backtrace only produced
|
||||
// punctuation tokens, so we must keep parsing.
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println(
|
||||
"\n extend @ pos="
|
||||
+ pos
|
||||
+ " char="
|
||||
+ (char) buffer.get(pos)
|
||||
+ " hex="
|
||||
+ Integer.toHexString(buffer.get(pos)));
|
||||
}
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println(" " + posData.count + " arcs in");
|
||||
}
|
||||
|
||||
if (enableSpacePenaltyFactor
|
||||
&& Character.getType(buffer.get(pos)) == Character.SPACE_SEPARATOR) {
|
||||
// We add single space separator as prefixes of the terms that we extract.
|
||||
// This information is needed to compute the space penalty factor of each term.
|
||||
// These whitespace prefixes are removed when the final tokens are generated, or
|
||||
// added as separated tokens when discardPunctuation is unset.
|
||||
if (buffer.get(++pos) == -1) {
|
||||
pos = posData.pos;
|
||||
}
|
||||
}
|
||||
|
||||
boolean anyMatches = false;
|
||||
|
||||
// First try user dict:
|
||||
if (userFST != null) {
|
||||
userFST.getFirstArc(arc);
|
||||
int output = 0;
|
||||
int maxPosAhead = 0;
|
||||
int outputMaxPosAhead = 0;
|
||||
int arcFinalOutMaxPosAhead = 0;
|
||||
|
||||
for (int posAhead = pos; ; posAhead++) {
|
||||
final int ch = buffer.get(posAhead);
|
||||
if (ch == -1) {
|
||||
break;
|
||||
}
|
||||
if (userFST.findTargetArc(ch, arc, arc, posAhead == pos, userFSTReader) == null) {
|
||||
break;
|
||||
}
|
||||
output += arc.output().intValue();
|
||||
if (arc.isFinal()) {
|
||||
maxPosAhead = posAhead;
|
||||
outputMaxPosAhead = output;
|
||||
arcFinalOutMaxPosAhead = arc.nextFinalOutput().intValue();
|
||||
anyMatches = true;
|
||||
if (!outputLongestUserEntryOnly) {
|
||||
// add all matched user entries.
|
||||
add(
|
||||
userDictionary.getMorphAttributes(),
|
||||
posData,
|
||||
pos,
|
||||
posAhead + 1,
|
||||
output + arc.nextFinalOutput().intValue(),
|
||||
TokenType.USER,
|
||||
false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Longest matching for user word
|
||||
if (anyMatches && maxPosAhead > userWordMaxPosAhead) {
|
||||
if (outputLongestUserEntryOnly) {
|
||||
if (VERBOSE) {
|
||||
System.out.println(
|
||||
" USER word "
|
||||
+ new String(buffer.get(pos, maxPosAhead + 1))
|
||||
+ " toPos="
|
||||
+ (maxPosAhead + 1));
|
||||
}
|
||||
add(
|
||||
userDictionary.getMorphAttributes(),
|
||||
posData,
|
||||
pos,
|
||||
maxPosAhead + 1,
|
||||
outputMaxPosAhead + arcFinalOutMaxPosAhead,
|
||||
TokenType.USER,
|
||||
false);
|
||||
}
|
||||
userWordMaxPosAhead = Math.max(userWordMaxPosAhead, maxPosAhead);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: we can be more aggressive about user
|
||||
// matches? if we are "under" a user match then don't
|
||||
// extend KNOWN/UNKNOWN paths?
|
||||
|
||||
if (!anyMatches) {
|
||||
// Next, try known dictionary matches
|
||||
fst.getFirstArc(arc);
|
||||
int output = 0;
|
||||
|
||||
for (int posAhead = pos; ; posAhead++) {
|
||||
final int ch = buffer.get(posAhead);
|
||||
if (ch == -1) {
|
||||
break;
|
||||
}
|
||||
// System.out.println(" match " + (char) ch + " posAhead=" + posAhead);
|
||||
|
||||
if (fst.findTargetArc(ch, arc, arc, posAhead == pos, fstReader) == null) {
|
||||
break;
|
||||
}
|
||||
|
||||
output += arc.output().intValue();
|
||||
|
||||
// Optimization: for known words that are too-long
|
||||
// (compound), we should pre-compute the 2nd
|
||||
// best segmentation and store it in the
|
||||
// dictionary instead of recomputing it each time a
|
||||
// match is found.
|
||||
|
||||
if (arc.isFinal()) {
|
||||
dictionary.lookupWordIds(output + arc.nextFinalOutput().intValue(), wordIdRef);
|
||||
if (VERBOSE) {
|
||||
System.out.println(
|
||||
" KNOWN word "
|
||||
+ new String(buffer.get(pos, posAhead - pos + 1))
|
||||
+ " toPos="
|
||||
+ (posAhead + 1)
|
||||
+ " "
|
||||
+ wordIdRef.length
|
||||
+ " wordIDs");
|
||||
}
|
||||
for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
|
||||
add(
|
||||
dictionary.getMorphAttributes(),
|
||||
posData,
|
||||
pos,
|
||||
posAhead + 1,
|
||||
wordIdRef.ints[wordIdRef.offset + ofs],
|
||||
TokenType.KNOWN,
|
||||
false);
|
||||
anyMatches = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!shouldSkipProcessUnknownWord(unknownWordEndIndex, posData)) {
|
||||
int unknownWordLength = processUnknownWord(anyMatches, posData);
|
||||
unknownWordEndIndex = posData.pos + unknownWordLength;
|
||||
}
|
||||
pos++;
|
||||
}
|
||||
|
||||
end = true;
|
||||
|
||||
if (pos > 0) {
|
||||
|
||||
final Position endPosData = positions.get(pos);
|
||||
int leastCost = Integer.MAX_VALUE;
|
||||
int leastIDX = -1;
|
||||
if (VERBOSE) {
|
||||
System.out.println(" end: " + endPosData.count + " nodes");
|
||||
}
|
||||
for (int idx = 0; idx < endPosData.count; idx++) {
|
||||
// Add EOS cost:
|
||||
final int cost = endPosData.costs[idx] + costs.get(endPosData.lastRightID[idx], 0);
|
||||
// System.out.println(" idx=" + idx + " cost=" + cost + " (pathCost=" +
|
||||
// endPosData.costs[idx] + " bgCost=" + costs.get(endPosData.lastRightID[idx], 0) + ")
|
||||
// backPos=" + endPosData.backPos[idx]);
|
||||
if (cost < leastCost) {
|
||||
leastCost = cost;
|
||||
leastIDX = idx;
|
||||
}
|
||||
}
|
||||
|
||||
if (outputNBest) {
|
||||
backtraceNBest(endPosData, true);
|
||||
}
|
||||
backtrace(endPosData, leastIDX);
|
||||
if (outputNBest) {
|
||||
fixupPendingList();
|
||||
}
|
||||
} else {
|
||||
// No characters in the input string; return no tokens!
|
||||
}
|
||||
}
|
||||
|
||||
protected boolean shouldSkipProcessUnknownWord(int unknownWordEndIndex, Position posData) {
|
||||
return unknownWordEndIndex > posData.pos;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add unknown words to the position graph.
|
||||
*
|
||||
* @return word length
|
||||
*/
|
||||
protected abstract int processUnknownWord(boolean anyMatches, Position posData)
|
||||
throws IOException;
|
||||
|
||||
/**
|
||||
* Backtrace from the provided position, back to the last time we back-traced, accumulating the
|
||||
* resulting tokens to the pending list. The pending list is then in-reverse (last token should be
|
||||
* returned first).
|
||||
*/
|
||||
protected abstract void backtrace(final Position endPosData, final int fromIDX)
|
||||
throws IOException;
|
||||
|
||||
/**
|
||||
* Backtrace the n-best path. Subclasses that support n-best paths should implement this method.
|
||||
*/
|
||||
protected void backtraceNBest(final Position endPosData, final boolean useEOS)
|
||||
throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove duplicated tokens from the pending list; this is needed because {@link
|
||||
* #backtrace(Position, int)} and {@link #backtraceNBest(Position, boolean)} can add same tokens
|
||||
* to the list. Subclasses that support n-best paths should implement this method.
|
||||
*/
|
||||
protected void fixupPendingList() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
/** Add a token on the minimum cost path to the pending token list. */
|
||||
protected final void add(
|
||||
MorphData morphData,
|
||||
Position fromPosData,
|
||||
int wordPos,
|
||||
int endPos,
|
||||
int wordID,
|
||||
TokenType type,
|
||||
boolean addPenalty)
|
||||
throws IOException {
|
||||
final int wordCost = morphData.getWordCost(wordID);
|
||||
final int leftID = morphData.getLeftId(wordID);
|
||||
int leastCost = Integer.MAX_VALUE;
|
||||
int leastIDX = -1;
|
||||
assert fromPosData.count > 0;
|
||||
for (int idx = 0; idx < fromPosData.count; idx++) {
|
||||
// The number of spaces before the term
|
||||
int numSpaces = wordPos - fromPosData.pos;
|
||||
|
||||
// Cost is path cost so far, plus word cost (added at
|
||||
// end of loop), plus bigram cost and space penalty cost.
|
||||
final int cost =
|
||||
fromPosData.costs[idx]
|
||||
+ costs.get(fromPosData.lastRightID[idx], leftID)
|
||||
+ computeSpacePenalty(morphData, wordID, numSpaces);
|
||||
if (VERBOSE) {
|
||||
System.out.println(
|
||||
" fromIDX="
|
||||
+ idx
|
||||
+ ": cost="
|
||||
+ cost
|
||||
+ " (prevCost="
|
||||
+ fromPosData.costs[idx]
|
||||
+ " wordCost="
|
||||
+ wordCost
|
||||
+ " bgCost="
|
||||
+ costs.get(fromPosData.lastRightID[idx], leftID)
|
||||
+ " spacePenalty="
|
||||
+ computeSpacePenalty(morphData, wordID, numSpaces)
|
||||
+ ") leftID="
|
||||
+ leftID
|
||||
// + " leftPOS="
|
||||
// + leftPOS.name()
|
||||
+ ")");
|
||||
}
|
||||
if (cost < leastCost) {
|
||||
leastCost = cost;
|
||||
leastIDX = idx;
|
||||
if (VERBOSE) {
|
||||
System.out.println(" **");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
leastCost += wordCost;
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println(
|
||||
" + cost="
|
||||
+ leastCost
|
||||
+ " wordID="
|
||||
+ wordID
|
||||
+ " leftID="
|
||||
+ leftID
|
||||
+ " leastIDX="
|
||||
+ leastIDX
|
||||
+ " toPos="
|
||||
+ endPos
|
||||
+ " toPos.idx="
|
||||
+ positions.get(endPos).count);
|
||||
}
|
||||
|
||||
if (addPenalty && type != TokenType.USER) {
|
||||
final int penalty = computePenalty(fromPosData.pos, endPos - fromPosData.pos);
|
||||
if (VERBOSE) {
|
||||
if (penalty > 0) {
|
||||
System.out.println(" + penalty=" + penalty + " cost=" + (leastCost + penalty));
|
||||
}
|
||||
}
|
||||
leastCost += penalty;
|
||||
}
|
||||
|
||||
positions
|
||||
.get(endPos)
|
||||
.add(
|
||||
leastCost,
|
||||
morphData.getRightId(wordID),
|
||||
fromPosData.pos,
|
||||
wordPos,
|
||||
leastIDX,
|
||||
wordID,
|
||||
type);
|
||||
}
|
||||
|
||||
/** Returns the space penalty. */
|
||||
protected int computeSpacePenalty(MorphData morphData, int wordID, int numSpaces) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/** Returns the penalty for a specific input region */
|
||||
protected int computePenalty(int pos, int length) throws IOException {
|
||||
return 0;
|
||||
}
|
||||
|
||||
public int getPos() {
|
||||
return pos;
|
||||
}
|
||||
|
||||
public boolean isEnd() {
|
||||
return end;
|
||||
}
|
||||
|
||||
public List<T> getPending() {
|
||||
return pending;
|
||||
}
|
||||
|
||||
public boolean isOutputNBest() {
|
||||
return outputNBest;
|
||||
}
|
||||
|
||||
public void resetBuffer(Reader reader) {
|
||||
buffer.reset(reader);
|
||||
}
|
||||
|
||||
public void resetState() {
|
||||
positions.reset();
|
||||
pos = 0;
|
||||
end = false;
|
||||
lastBackTracePos = 0;
|
||||
pending.clear();
|
||||
|
||||
// Add BOS:
|
||||
positions.get(0).add(0, 0, -1, -1, -1, -1, TokenType.KNOWN);
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds all back pointers arriving to this position.
|
||||
*
|
||||
* <p>NOTE: This and subclasses must have no-arg constructor. See {@link WrappedPositionArray}.
|
||||
*/
|
||||
public static class Position {
|
||||
|
||||
int pos;
|
||||
|
||||
int count;
|
||||
|
||||
// maybe single int array * 5?
|
||||
int[] costs = new int[8];
|
||||
int[] lastRightID = new int[8];
|
||||
int[] backPos = new int[8];
|
||||
int[] backWordPos = new int[8];
|
||||
int[] backIndex = new int[8];
|
||||
int[] backID = new int[8];
|
||||
TokenType[] backType = new TokenType[8];
|
||||
|
||||
private void grow() {
|
||||
costs = ArrayUtil.grow(costs, 1 + count);
|
||||
lastRightID = ArrayUtil.grow(lastRightID, 1 + count);
|
||||
backPos = ArrayUtil.grow(backPos, 1 + count);
|
||||
backWordPos = ArrayUtil.grow(backWordPos, 1 + count);
|
||||
backIndex = ArrayUtil.grow(backIndex, 1 + count);
|
||||
backID = ArrayUtil.grow(backID, 1 + count);
|
||||
|
||||
// NOTE: sneaky: grow separately because
|
||||
// ArrayUtil.grow will otherwise pick a different
|
||||
// length than the int[]s we just grew:
|
||||
final TokenType[] newBackType = new TokenType[backID.length];
|
||||
System.arraycopy(backType, 0, newBackType, 0, backType.length);
|
||||
backType = newBackType;
|
||||
}
|
||||
|
||||
public void add(
|
||||
int cost,
|
||||
int lastRightID,
|
||||
int backPos,
|
||||
int backRPos,
|
||||
int backIndex,
|
||||
int backID,
|
||||
TokenType backType) {
|
||||
// NOTE: this isn't quite a true Viterbi search,
|
||||
// because we should check if lastRightID is
|
||||
// already present here, and only update if the new
|
||||
// cost is less than the current cost, instead of
|
||||
// simply appending. However, that will likely hurt
|
||||
// performance (usually we add a lastRightID only once),
|
||||
// and it means we actually create the full graph
|
||||
// intersection instead of a "normal" Viterbi lattice:
|
||||
if (count == costs.length) {
|
||||
grow();
|
||||
}
|
||||
this.costs[count] = cost;
|
||||
this.lastRightID[count] = lastRightID;
|
||||
this.backPos[count] = backPos;
|
||||
this.backWordPos[count] = backRPos;
|
||||
this.backIndex[count] = backIndex;
|
||||
this.backID[count] = backID;
|
||||
this.backType[count] = backType;
|
||||
count++;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
count = 0;
|
||||
}
|
||||
|
||||
public int getPos() {
|
||||
return pos;
|
||||
}
|
||||
|
||||
public int getCount() {
|
||||
return count;
|
||||
}
|
||||
|
||||
public void setCount(int count) {
|
||||
this.count = count;
|
||||
}
|
||||
|
||||
public int getCost(int index) {
|
||||
return costs[index];
|
||||
}
|
||||
|
||||
public int getBackPos(int index) {
|
||||
return backPos[index];
|
||||
}
|
||||
|
||||
public int getBackWordPos(int index) {
|
||||
return backWordPos[index];
|
||||
}
|
||||
|
||||
public int getBackID(int index) {
|
||||
return backID[index];
|
||||
}
|
||||
|
||||
public int getBackIndex(int index) {
|
||||
return backIndex[index];
|
||||
}
|
||||
|
||||
public TokenType getBackType(int index) {
|
||||
return backType[index];
|
||||
}
|
||||
|
||||
public int getLastRightID(int index) {
|
||||
return lastRightID[index];
|
||||
}
|
||||
}
|
||||
|
||||
/** Holds partial graph (array of positions) for calculating the minimum cost path */
|
||||
public static final class WrappedPositionArray<U extends Position> {
|
||||
private U[] positions;
|
||||
private final Class<U> clazz;
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
WrappedPositionArray(Class<U> clazz) {
|
||||
this.clazz = clazz;
|
||||
positions = (U[]) Array.newInstance(clazz, 8);
|
||||
for (int i = 0; i < positions.length; i++) {
|
||||
try {
|
||||
positions[i] = clazz.getConstructor().newInstance();
|
||||
} catch (ReflectiveOperationException e) {
|
||||
// shouldn't happen; Position class should have no-arg constructor.
|
||||
throw new IllegalStateException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Next array index to write to in positions:
|
||||
private int nextWrite;
|
||||
|
||||
// Next position to write:
|
||||
private int nextPos;
|
||||
|
||||
// How many valid Position instances are held in the
|
||||
// positions array:
|
||||
private int count;
|
||||
|
||||
void reset() {
|
||||
nextWrite--;
|
||||
while (count > 0) {
|
||||
if (nextWrite == -1) {
|
||||
nextWrite = positions.length - 1;
|
||||
}
|
||||
positions[nextWrite--].reset();
|
||||
count--;
|
||||
}
|
||||
nextWrite = 0;
|
||||
nextPos = 0;
|
||||
count = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get Position instance for this absolute position; this is allowed to be arbitrarily far "in
|
||||
* the future" but cannot be before the last freeBefore.
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
public U get(int pos) {
|
||||
while (pos >= nextPos) {
|
||||
// System.out.println("count=" + count + " vs len=" + positions.length);
|
||||
if (count == positions.length) {
|
||||
// Position[] newPositions =
|
||||
// new Position[ArrayUtil.oversize(1 + count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
|
||||
U[] newPositions =
|
||||
(U[])
|
||||
Array.newInstance(
|
||||
clazz, ArrayUtil.oversize(1 + count, RamUsageEstimator.NUM_BYTES_OBJECT_REF));
|
||||
// System.out.println("grow positions " + newPositions.length);
|
||||
System.arraycopy(positions, nextWrite, newPositions, 0, positions.length - nextWrite);
|
||||
System.arraycopy(positions, 0, newPositions, positions.length - nextWrite, nextWrite);
|
||||
for (int i = positions.length; i < newPositions.length; i++) {
|
||||
try {
|
||||
newPositions[i] = clazz.getConstructor().newInstance();
|
||||
} catch (ReflectiveOperationException e) {
|
||||
// shouldn't happen
|
||||
throw new IllegalStateException(e);
|
||||
}
|
||||
}
|
||||
nextWrite = positions.length;
|
||||
positions = newPositions;
|
||||
}
|
||||
if (nextWrite == positions.length) {
|
||||
nextWrite = 0;
|
||||
}
|
||||
// Should have already been reset:
|
||||
assert positions[nextWrite].count == 0;
|
||||
positions[nextWrite++].pos = nextPos++;
|
||||
count++;
|
||||
}
|
||||
assert inBounds(pos);
|
||||
final int index = getIndex(pos);
|
||||
assert positions[index].pos == pos;
|
||||
return positions[index];
|
||||
}
|
||||
|
||||
int getNextPos() {
|
||||
return nextPos;
|
||||
}
|
||||
|
||||
// For assert:
|
||||
private boolean inBounds(int pos) {
|
||||
return pos < nextPos && pos >= nextPos - count;
|
||||
}
|
||||
|
||||
private int getIndex(int pos) {
|
||||
int index = nextWrite - (nextPos - pos);
|
||||
if (index < 0) {
|
||||
index += positions.length;
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
public void freeBefore(int pos) {
|
||||
final int toFree = count - (nextPos - pos);
|
||||
assert toFree >= 0;
|
||||
assert toFree <= count;
|
||||
int index = nextWrite - count;
|
||||
if (index < 0) {
|
||||
index += positions.length;
|
||||
}
|
||||
for (int i = 0; i < toFree; i++) {
|
||||
if (index == positions.length) {
|
||||
index = 0;
|
||||
}
|
||||
// System.out.println(" fb idx=" + index);
|
||||
positions[index].reset();
|
||||
index++;
|
||||
}
|
||||
count -= toFree;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,195 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ja;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Position;
|
||||
import org.apache.lucene.analysis.ja.JapaneseTokenizer.WrappedPositionArray;
|
||||
import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
|
||||
import org.apache.lucene.analysis.ja.dict.JaMorphData;
|
||||
import org.apache.lucene.analysis.morph.Dictionary;
|
||||
|
||||
// TODO: would be nice to show 2nd best path in a diff't
|
||||
// color...
|
||||
|
||||
/** Outputs the dot (graphviz) string for the viterbi lattice. */
|
||||
public class GraphvizFormatter {
|
||||
|
||||
private static final String BOS_LABEL = "BOS";
|
||||
|
||||
private static final String EOS_LABEL = "EOS";
|
||||
|
||||
private static final String FONT_NAME = "Helvetica";
|
||||
|
||||
private final ConnectionCosts costs;
|
||||
|
||||
private final Map<String, String> bestPathMap;
|
||||
|
||||
private final StringBuilder sb = new StringBuilder();
|
||||
|
||||
public GraphvizFormatter(ConnectionCosts costs) {
|
||||
this.costs = costs;
|
||||
this.bestPathMap = new HashMap<>();
|
||||
sb.append(formatHeader());
|
||||
sb.append(" init [style=invis]\n");
|
||||
sb.append(" init -> 0.0 [label=\"" + BOS_LABEL + "\"]\n");
|
||||
}
|
||||
|
||||
public String finish() {
|
||||
sb.append(formatTrailer());
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
// Backtraces another incremental fragment:
|
||||
void onBacktrace(
|
||||
JapaneseTokenizer tok,
|
||||
WrappedPositionArray positions,
|
||||
int lastBackTracePos,
|
||||
Position endPosData,
|
||||
int fromIDX,
|
||||
char[] fragment,
|
||||
boolean isEnd) {
|
||||
setBestPathMap(positions, lastBackTracePos, endPosData, fromIDX);
|
||||
sb.append(formatNodes(tok, positions, lastBackTracePos, endPosData, fragment));
|
||||
if (isEnd) {
|
||||
sb.append(" fini [style=invis]\n");
|
||||
sb.append(" ");
|
||||
sb.append(getNodeID(endPosData.pos, fromIDX));
|
||||
sb.append(" -> fini [label=\"" + EOS_LABEL + "\"]");
|
||||
}
|
||||
}
|
||||
|
||||
// Records which arcs make up the best bath:
|
||||
private void setBestPathMap(
|
||||
WrappedPositionArray positions, int startPos, Position endPosData, int fromIDX) {
|
||||
bestPathMap.clear();
|
||||
|
||||
int pos = endPosData.pos;
|
||||
int bestIDX = fromIDX;
|
||||
while (pos > startPos) {
|
||||
final Position posData = positions.get(pos);
|
||||
|
||||
final int backPos = posData.backPos[bestIDX];
|
||||
final int backIDX = posData.backIndex[bestIDX];
|
||||
|
||||
final String toNodeID = getNodeID(pos, bestIDX);
|
||||
final String fromNodeID = getNodeID(backPos, backIDX);
|
||||
|
||||
assert !bestPathMap.containsKey(fromNodeID);
|
||||
assert !bestPathMap.containsValue(toNodeID);
|
||||
bestPathMap.put(fromNodeID, toNodeID);
|
||||
pos = backPos;
|
||||
bestIDX = backIDX;
|
||||
}
|
||||
}
|
||||
|
||||
private String formatNodes(
|
||||
JapaneseTokenizer tok,
|
||||
WrappedPositionArray positions,
|
||||
int startPos,
|
||||
Position endPosData,
|
||||
char[] fragment) {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
// Output nodes
|
||||
for (int pos = startPos + 1; pos <= endPosData.pos; pos++) {
|
||||
final Position posData = positions.get(pos);
|
||||
for (int idx = 0; idx < posData.count; idx++) {
|
||||
sb.append(" ");
|
||||
sb.append(getNodeID(pos, idx));
|
||||
sb.append(" [label=\"");
|
||||
sb.append(pos);
|
||||
sb.append(": ");
|
||||
sb.append(posData.lastRightID[idx]);
|
||||
sb.append("\"]\n");
|
||||
}
|
||||
}
|
||||
|
||||
// Output arcs
|
||||
for (int pos = endPosData.pos; pos > startPos; pos--) {
|
||||
final Position posData = positions.get(pos);
|
||||
for (int idx = 0; idx < posData.count; idx++) {
|
||||
final Position backPosData = positions.get(posData.backPos[idx]);
|
||||
final String toNodeID = getNodeID(pos, idx);
|
||||
final String fromNodeID = getNodeID(posData.backPos[idx], posData.backIndex[idx]);
|
||||
|
||||
sb.append(" ");
|
||||
sb.append(fromNodeID);
|
||||
sb.append(" -> ");
|
||||
sb.append(toNodeID);
|
||||
|
||||
final String attrs;
|
||||
if (toNodeID.equals(bestPathMap.get(fromNodeID))) {
|
||||
// This arc is on best path
|
||||
attrs = " color=\"#40e050\" fontcolor=\"#40a050\" penwidth=3 fontsize=20";
|
||||
} else {
|
||||
attrs = "";
|
||||
}
|
||||
|
||||
final Dictionary<? extends JaMorphData> dict = tok.getDict(posData.backType[idx]);
|
||||
final int wordCost = dict.getWordCost(posData.backID[idx]);
|
||||
final int bgCost =
|
||||
costs.get(
|
||||
backPosData.lastRightID[posData.backIndex[idx]],
|
||||
dict.getLeftId(posData.backID[idx]));
|
||||
|
||||
final String surfaceForm =
|
||||
new String(fragment, posData.backPos[idx] - startPos, pos - posData.backPos[idx]);
|
||||
|
||||
sb.append(" [label=\"");
|
||||
sb.append(surfaceForm);
|
||||
sb.append(' ');
|
||||
sb.append(wordCost);
|
||||
if (bgCost >= 0) {
|
||||
sb.append('+');
|
||||
}
|
||||
sb.append(bgCost);
|
||||
sb.append("\"");
|
||||
sb.append(attrs);
|
||||
sb.append("]\n");
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private String formatHeader() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("digraph viterbi {\n");
|
||||
sb.append(
|
||||
" graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\"];\n");
|
||||
// sb.append(" // A2 paper size\n");
|
||||
// sb.append(" size = \"34.4,16.5\";\n");
|
||||
// sb.append(" // try to fill paper\n");
|
||||
// sb.append(" ratio = fill;\n");
|
||||
sb.append(" edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n");
|
||||
sb.append(
|
||||
" node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\""
|
||||
+ FONT_NAME
|
||||
+ "\" ]\n");
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private String formatTrailer() {
|
||||
return "}";
|
||||
}
|
||||
|
||||
private String getNodeID(int pos, int idx) {
|
||||
return pos + "." + idx;
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -31,8 +31,10 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
|
||||
import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
|
||||
import org.apache.lucene.analysis.ja.dict.JaMorphData;
|
||||
import org.apache.lucene.analysis.ja.dict.UserDictionary;
|
||||
import org.apache.lucene.analysis.ja.tokenattributes.*;
|
||||
import org.apache.lucene.analysis.morph.GraphvizFormatter;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.tests.analysis.MockGraphTokenFilter;
|
||||
|
@ -518,7 +520,8 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testLatticeToDot() throws Exception {
|
||||
final GraphvizFormatter gv2 = new GraphvizFormatter(ConnectionCosts.getInstance());
|
||||
final GraphvizFormatter<JaMorphData> gv2 =
|
||||
new GraphvizFormatter<>(ConnectionCosts.getInstance());
|
||||
final Analyzer analyzer =
|
||||
new Analyzer() {
|
||||
@Override
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,447 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ko;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.EnumMap;
|
||||
import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
|
||||
import org.apache.lucene.analysis.ko.dict.KoMorphData;
|
||||
import org.apache.lucene.analysis.ko.dict.TokenInfoDictionary;
|
||||
import org.apache.lucene.analysis.ko.dict.UnknownDictionary;
|
||||
import org.apache.lucene.analysis.ko.dict.UserDictionary;
|
||||
import org.apache.lucene.analysis.morph.ConnectionCosts;
|
||||
import org.apache.lucene.analysis.morph.Dictionary;
|
||||
import org.apache.lucene.analysis.morph.GraphvizFormatter;
|
||||
import org.apache.lucene.analysis.morph.MorphData;
|
||||
import org.apache.lucene.analysis.morph.TokenInfoFST;
|
||||
import org.apache.lucene.analysis.morph.TokenType;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
|
||||
/** {@link org.apache.lucene.analysis.morph.Viterbi} subclass for Korean morphological analysis. */
|
||||
final class Viterbi
|
||||
extends org.apache.lucene.analysis.morph.Viterbi<
|
||||
Token, org.apache.lucene.analysis.morph.Viterbi.Position> {
|
||||
|
||||
private final EnumMap<TokenType, Dictionary<? extends KoMorphData>> dictionaryMap =
|
||||
new EnumMap<>(TokenType.class);
|
||||
|
||||
private final UnknownDictionary unkDictionary;
|
||||
private final CharacterDefinition characterDefinition;
|
||||
|
||||
private final boolean discardPunctuation;
|
||||
private final KoreanTokenizer.DecompoundMode mode;
|
||||
private final boolean outputUnknownUnigrams;
|
||||
|
||||
private GraphvizFormatter<KoMorphData> dotOut;
|
||||
|
||||
/**
 * Creates the Korean Viterbi. The FSTs, the system and user dictionaries, the connection costs
 * and {@code Position.class} are handed to the superclass; the remaining arguments are kept on
 * this instance. The space penalty factor and longest-user-entry-only output are switched on,
 * and the three dictionaries are registered under their token types.
 */
Viterbi(
    TokenInfoFST fst,
    FST.BytesReader fstReader,
    TokenInfoDictionary dictionary,
    TokenInfoFST userFST,
    FST.BytesReader userFSTReader,
    UserDictionary userDictionary,
    ConnectionCosts costs,
    UnknownDictionary unkDictionary,
    CharacterDefinition characterDefinition,
    boolean discardPunctuation,
    KoreanTokenizer.DecompoundMode mode,
    boolean outputUnknownUnigrams) {
  super(
      fst, fstReader, dictionary, userFST, userFSTReader, userDictionary, costs, Position.class);

  // Flags declared outside this constructor (no local declaration is visible here).
  this.enableSpacePenaltyFactor = true;
  this.outputLongestUserEntryOnly = true;

  // Korean-specific collaborators and options.
  this.characterDefinition = characterDefinition;
  this.unkDictionary = unkDictionary;
  this.mode = mode;
  this.discardPunctuation = discardPunctuation;
  this.outputUnknownUnigrams = outputUnknownUnigrams;

  // Dictionary lookup table used by getDict.
  dictionaryMap.put(TokenType.KNOWN, dictionary);
  dictionaryMap.put(TokenType.UNKNOWN, unkDictionary);
  dictionaryMap.put(TokenType.USER, userDictionary);
}
|
||||
|
||||
@Override
|
||||
protected int processUnknownWord(boolean anyMatches, Position posData) throws IOException {
|
||||
final char firstCharacter = (char) buffer.get(pos);
|
||||
if (!anyMatches || characterDefinition.isInvoke(firstCharacter)) {
|
||||
|
||||
// Find unknown match:
|
||||
int characterId = characterDefinition.getCharacterClass(firstCharacter);
|
||||
// NOTE: copied from UnknownDictionary.lookup:
|
||||
int unknownWordLength;
|
||||
if (!characterDefinition.isGroup(firstCharacter)) {
|
||||
unknownWordLength = 1;
|
||||
} else {
|
||||
// Extract unknown word. Characters with the same script are considered to be part of
|
||||
// unknown word
|
||||
unknownWordLength = 1;
|
||||
Character.UnicodeScript scriptCode = Character.UnicodeScript.of(firstCharacter);
|
||||
final boolean isPunct = isPunctuation(firstCharacter);
|
||||
final boolean isDigit = Character.isDigit(firstCharacter);
|
||||
for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++) {
|
||||
int next = buffer.get(posAhead);
|
||||
if (next == -1) {
|
||||
break;
|
||||
}
|
||||
char ch = (char) next;
|
||||
int chType = Character.getType(ch);
|
||||
Character.UnicodeScript sc = Character.UnicodeScript.of(next);
|
||||
boolean sameScript =
|
||||
isSameScript(scriptCode, sc)
|
||||
// Non-spacing marks inherit the script of their base character,
|
||||
// following recommendations from UTR #24.
|
||||
|| chType == Character.NON_SPACING_MARK;
|
||||
|
||||
if (sameScript
|
||||
// split on punctuation
|
||||
&& isPunctuation(ch, chType) == isPunct
|
||||
// split on digit
|
||||
&& Character.isDigit(ch) == isDigit
|
||||
&& characterDefinition.isGroup(ch)) {
|
||||
unknownWordLength++;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
// Update the script code and character class if the original script
|
||||
// is Inherited or Common.
|
||||
if (isCommonOrInherited(scriptCode) && isCommonOrInherited(sc) == false) {
|
||||
scriptCode = sc;
|
||||
characterId = characterDefinition.getCharacterClass(ch);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
unkDictionary.lookupWordIds(
|
||||
characterId, wordIdRef); // characters in input text are supposed to be the same
|
||||
if (VERBOSE) {
|
||||
System.out.println(
|
||||
" UNKNOWN word len=" + unknownWordLength + " " + wordIdRef.length + " wordIDs");
|
||||
}
|
||||
for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
|
||||
add(
|
||||
unkDictionary.getMorphAttributes(),
|
||||
posData,
|
||||
pos,
|
||||
pos + unknownWordLength,
|
||||
wordIdRef.ints[wordIdRef.offset + ofs],
|
||||
TokenType.UNKNOWN,
|
||||
false);
|
||||
}
|
||||
}
|
||||
// TODO: should return meaningful value?
|
||||
return 0;
|
||||
}
|
||||
|
||||
void setGraphvizFormatter(GraphvizFormatter<KoMorphData> dotOut) {
|
||||
this.dotOut = dotOut;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void backtrace(Position endPosData, int fromIDX) {
|
||||
final int endPos = endPosData.getPos();
|
||||
|
||||
if (endPos == lastBackTracePos) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println(
|
||||
"\n backtrace: endPos="
|
||||
+ endPos
|
||||
+ " pos="
|
||||
+ pos
|
||||
+ "; "
|
||||
+ (pos - lastBackTracePos)
|
||||
+ " characters; last="
|
||||
+ lastBackTracePos
|
||||
+ " cost="
|
||||
+ endPosData.getCost(fromIDX));
|
||||
}
|
||||
|
||||
final char[] fragment = buffer.get(lastBackTracePos, endPos - lastBackTracePos);
|
||||
|
||||
if (dotOut != null) {
|
||||
dotOut.onBacktrace(
|
||||
this::getDict, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
|
||||
}
|
||||
|
||||
int pos = endPos;
|
||||
int bestIDX = fromIDX;
|
||||
|
||||
// TODO: sort of silly to make Token instances here; the
|
||||
// back trace has all info needed to generate the
|
||||
// token. So, we could just directly set the attrs,
|
||||
// from the backtrace, in incrementToken w/o ever
|
||||
// creating Token; we'd have to defer calling freeBefore
|
||||
// until after the backtrace was fully "consumed" by
|
||||
// incrementToken.
|
||||
|
||||
while (pos > lastBackTracePos) {
|
||||
// System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX);
|
||||
final Position posData = positions.get(pos);
|
||||
assert bestIDX < posData.getCount();
|
||||
|
||||
int backPos = posData.getBackPos(bestIDX);
|
||||
int backWordPos = posData.getBackWordPos(bestIDX);
|
||||
assert backPos >= lastBackTracePos
|
||||
: "backPos=" + backPos + " vs lastBackTracePos=" + lastBackTracePos;
|
||||
// the length of the word without the whitespaces at the beginning.
|
||||
int length = pos - backWordPos;
|
||||
TokenType backType = posData.getBackType(bestIDX);
|
||||
int backID = posData.getBackID(bestIDX);
|
||||
int nextBestIDX = posData.getBackIndex(bestIDX);
|
||||
// the start of the word after the whitespace at the beginning.
|
||||
final int fragmentOffset = backWordPos - lastBackTracePos;
|
||||
assert fragmentOffset >= 0;
|
||||
|
||||
final Dictionary<? extends KoMorphData> dict = getDict(backType);
|
||||
|
||||
if (outputUnknownUnigrams && backType == TokenType.UNKNOWN) {
|
||||
// outputUnknownUnigrams converts unknown word into unigrams:
|
||||
for (int i = length - 1; i >= 0; i--) {
|
||||
int charLen = 1;
|
||||
if (i > 0 && Character.isLowSurrogate(fragment[fragmentOffset + i])) {
|
||||
i--;
|
||||
charLen = 2;
|
||||
}
|
||||
final DictionaryToken token =
|
||||
new DictionaryToken(
|
||||
TokenType.UNKNOWN,
|
||||
unkDictionary.getMorphAttributes(),
|
||||
CharacterDefinition.NGRAM,
|
||||
fragment,
|
||||
fragmentOffset + i,
|
||||
charLen,
|
||||
backWordPos + i,
|
||||
backWordPos + i + charLen);
|
||||
pending.add(token);
|
||||
if (VERBOSE) {
|
||||
System.out.println(" add token=" + pending.get(pending.size() - 1));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
final DictionaryToken token =
|
||||
new DictionaryToken(
|
||||
backType,
|
||||
dict.getMorphAttributes(),
|
||||
backID,
|
||||
fragment,
|
||||
fragmentOffset,
|
||||
length,
|
||||
backWordPos,
|
||||
backWordPos + length);
|
||||
if (token.getPOSType() == POS.Type.MORPHEME
|
||||
|| mode == KoreanTokenizer.DecompoundMode.NONE) {
|
||||
if (shouldFilterToken(token) == false) {
|
||||
pending.add(token);
|
||||
if (VERBOSE) {
|
||||
System.out.println(" add token=" + pending.get(pending.size() - 1));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
KoMorphData.Morpheme[] morphemes = token.getMorphemes();
|
||||
if (morphemes == null) {
|
||||
pending.add(token);
|
||||
if (VERBOSE) {
|
||||
System.out.println(" add token=" + pending.get(pending.size() - 1));
|
||||
}
|
||||
} else {
|
||||
int endOffset = backWordPos + length;
|
||||
int posLen = 0;
|
||||
// decompose the compound
|
||||
for (int i = morphemes.length - 1; i >= 0; i--) {
|
||||
final KoMorphData.Morpheme morpheme = morphemes[i];
|
||||
final Token compoundToken;
|
||||
if (token.getPOSType() == POS.Type.COMPOUND) {
|
||||
assert endOffset - morpheme.surfaceForm.length() >= 0;
|
||||
compoundToken =
|
||||
new DecompoundToken(
|
||||
morpheme.posTag,
|
||||
morpheme.surfaceForm,
|
||||
endOffset - morpheme.surfaceForm.length(),
|
||||
endOffset,
|
||||
backType);
|
||||
} else {
|
||||
compoundToken =
|
||||
new DecompoundToken(
|
||||
morpheme.posTag,
|
||||
morpheme.surfaceForm,
|
||||
token.getStartOffset(),
|
||||
token.getEndOffset(),
|
||||
backType);
|
||||
}
|
||||
if (i == 0 && mode == KoreanTokenizer.DecompoundMode.MIXED) {
|
||||
compoundToken.setPositionIncrement(0);
|
||||
}
|
||||
++posLen;
|
||||
endOffset -= morpheme.surfaceForm.length();
|
||||
pending.add(compoundToken);
|
||||
if (VERBOSE) {
|
||||
System.out.println(" add token=" + pending.get(pending.size() - 1));
|
||||
}
|
||||
}
|
||||
if (mode == KoreanTokenizer.DecompoundMode.MIXED) {
|
||||
token.setPositionLength(Math.max(1, posLen));
|
||||
pending.add(token);
|
||||
if (VERBOSE) {
|
||||
System.out.println(" add token=" + pending.get(pending.size() - 1));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (discardPunctuation == false && backWordPos != backPos) {
|
||||
// Add a token for whitespaces between terms
|
||||
int offset = backPos - lastBackTracePos;
|
||||
int len = backWordPos - backPos;
|
||||
// System.out.println(offset + " " + fragmentOffset + " " + len + " " + backWordPos + " " +
|
||||
// backPos);
|
||||
unkDictionary.lookupWordIds(characterDefinition.getCharacterClass(' '), wordIdRef);
|
||||
DictionaryToken spaceToken =
|
||||
new DictionaryToken(
|
||||
TokenType.UNKNOWN,
|
||||
unkDictionary.getMorphAttributes(),
|
||||
wordIdRef.ints[wordIdRef.offset],
|
||||
fragment,
|
||||
offset,
|
||||
len,
|
||||
backPos,
|
||||
backPos + len);
|
||||
pending.add(spaceToken);
|
||||
}
|
||||
|
||||
pos = backPos;
|
||||
bestIDX = nextBestIDX;
|
||||
}
|
||||
|
||||
lastBackTracePos = endPos;
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println(" freeBefore pos=" + endPos);
|
||||
}
|
||||
// Notify the circular buffers that we are done with
|
||||
// these positions:
|
||||
buffer.freeBefore(endPos);
|
||||
positions.freeBefore(endPos);
|
||||
}
|
||||
|
||||
/** Returns the space penalty associated with the provided {@link POS.Tag}. */
|
||||
@Override
|
||||
protected int computeSpacePenalty(MorphData morphData, int wordID, int numSpaces) {
|
||||
final POS.Tag leftPOS = ((KoMorphData) morphData).getLeftPOS(wordID);
|
||||
int spacePenalty = 0;
|
||||
if (numSpaces > 0) {
|
||||
// TODO we should extract the penalty (left-space-penalty-factor) from the dicrc file.
|
||||
switch (leftPOS) {
|
||||
case E:
|
||||
case J:
|
||||
case VCP:
|
||||
case XSA:
|
||||
case XSN:
|
||||
case XSV:
|
||||
spacePenalty = 3000;
|
||||
break;
|
||||
case IC:
|
||||
case MAG:
|
||||
case MAJ:
|
||||
case MM:
|
||||
case NA:
|
||||
case NNB:
|
||||
case NNBC:
|
||||
case NNG:
|
||||
case NNP:
|
||||
case NP:
|
||||
case NR:
|
||||
case SC:
|
||||
case SE:
|
||||
case SF:
|
||||
case SH:
|
||||
case SL:
|
||||
case SN:
|
||||
case SP:
|
||||
case SSC:
|
||||
case SSO:
|
||||
case SY:
|
||||
case UNA:
|
||||
case UNKNOWN:
|
||||
case VA:
|
||||
case VCN:
|
||||
case VSV:
|
||||
case VV:
|
||||
case VX:
|
||||
case XPN:
|
||||
case XR:
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
return spacePenalty;
|
||||
}
|
||||
|
||||
Dictionary<? extends KoMorphData> getDict(TokenType type) {
|
||||
return dictionaryMap.get(type);
|
||||
}
|
||||
|
||||
private boolean shouldFilterToken(Token token) {
|
||||
return discardPunctuation && isPunctuation(token.getSurfaceForm()[token.getOffset()]);
|
||||
}
|
||||
|
||||
private static boolean isPunctuation(char ch) {
|
||||
return isPunctuation(ch, Character.getType(ch));
|
||||
}
|
||||
|
||||
private static boolean isPunctuation(char ch, int cid) {
|
||||
// special case for Hangul Letter Araea (interpunct)
|
||||
if (ch == 0x318D) {
|
||||
return true;
|
||||
}
|
||||
switch (cid) {
|
||||
case Character.SPACE_SEPARATOR:
|
||||
case Character.LINE_SEPARATOR:
|
||||
case Character.PARAGRAPH_SEPARATOR:
|
||||
case Character.CONTROL:
|
||||
case Character.FORMAT:
|
||||
case Character.DASH_PUNCTUATION:
|
||||
case Character.START_PUNCTUATION:
|
||||
case Character.END_PUNCTUATION:
|
||||
case Character.CONNECTOR_PUNCTUATION:
|
||||
case Character.OTHER_PUNCTUATION:
|
||||
case Character.MATH_SYMBOL:
|
||||
case Character.CURRENCY_SYMBOL:
|
||||
case Character.MODIFIER_SYMBOL:
|
||||
case Character.OTHER_SYMBOL:
|
||||
case Character.INITIAL_QUOTE_PUNCTUATION:
|
||||
case Character.FINAL_QUOTE_PUNCTUATION:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean isCommonOrInherited(Character.UnicodeScript script) {
|
||||
return script == Character.UnicodeScript.INHERITED || script == Character.UnicodeScript.COMMON;
|
||||
}
|
||||
|
||||
/** Determine if two scripts are compatible. */
|
||||
private static boolean isSameScript(
|
||||
Character.UnicodeScript scriptOne, Character.UnicodeScript scriptTwo) {
|
||||
return scriptOne == scriptTwo
|
||||
|| isCommonOrInherited(scriptOne)
|
||||
|| isCommonOrInherited(scriptTwo);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue