mirror of https://github.com/apache/lucene.git
LUCENE-10493: factor out Viterbi algorithm and share it between kuromoji and nori (#805)
This commit is contained in:
parent
2a4c21bb58
commit
c89f8a7ea1
|
@ -60,6 +60,8 @@ Other
|
||||||
All classes in `org.apache.lucene.analysis.[ja|ko].util` was moved to `org.apache.lucene.analysis.[ja|ko].dict`.
|
All classes in `org.apache.lucene.analysis.[ja|ko].util` was moved to `org.apache.lucene.analysis.[ja|ko].dict`.
|
||||||
(Tomoko Uchida)
|
(Tomoko Uchida)
|
||||||
|
|
||||||
|
* LUCENE-10493: Factor out Viterbi algorithm in Kuromoji and Nori to analysis-common. (Tomoko Uchida)
|
||||||
|
|
||||||
======================= Lucene 9.2.0 =======================
|
======================= Lucene 9.2.0 =======================
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
|
|
@ -14,22 +14,16 @@
|
||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.analysis.ko;
|
package org.apache.lucene.analysis.morph;
|
||||||
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import org.apache.lucene.analysis.ko.KoreanTokenizer.Position;
|
|
||||||
import org.apache.lucene.analysis.ko.KoreanTokenizer.WrappedPositionArray;
|
|
||||||
import org.apache.lucene.analysis.ko.dict.ConnectionCosts;
|
|
||||||
import org.apache.lucene.analysis.ko.dict.KoMorphData;
|
|
||||||
import org.apache.lucene.analysis.morph.Dictionary;
|
|
||||||
|
|
||||||
// TODO: would be nice to show 2nd best path in a diff't
|
// TODO: would be nice to show 2nd best path in a diff't
|
||||||
// color...
|
// color...
|
||||||
|
|
||||||
/** Outputs the dot (graphviz) string for the viterbi lattice. */
|
/** Outputs the dot (graphviz) string for the viterbi lattice. */
|
||||||
public class GraphvizFormatter {
|
public class GraphvizFormatter<T extends MorphData> {
|
||||||
|
|
||||||
private static final String BOS_LABEL = "BOS";
|
private static final String BOS_LABEL = "BOS";
|
||||||
|
|
||||||
private static final String EOS_LABEL = "EOS";
|
private static final String EOS_LABEL = "EOS";
|
||||||
|
@ -56,36 +50,39 @@ public class GraphvizFormatter {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Backtraces another incremental fragment:
|
// Backtraces another incremental fragment:
|
||||||
void onBacktrace(
|
public void onBacktrace(
|
||||||
KoreanTokenizer tok,
|
DictionaryProvider<T> dictProvider,
|
||||||
WrappedPositionArray positions,
|
Viterbi.WrappedPositionArray<? extends Viterbi.Position> positions,
|
||||||
int lastBackTracePos,
|
int lastBackTracePos,
|
||||||
Position endPosData,
|
Viterbi.Position endPosData,
|
||||||
int fromIDX,
|
int fromIDX,
|
||||||
char[] fragment,
|
char[] fragment,
|
||||||
boolean isEnd) {
|
boolean isEnd) {
|
||||||
setBestPathMap(positions, lastBackTracePos, endPosData, fromIDX);
|
setBestPathMap(positions, lastBackTracePos, endPosData, fromIDX);
|
||||||
sb.append(formatNodes(tok, positions, lastBackTracePos, endPosData, fragment));
|
sb.append(formatNodes(dictProvider, positions, lastBackTracePos, endPosData, fragment));
|
||||||
if (isEnd) {
|
if (isEnd) {
|
||||||
sb.append(" fini [style=invis]\n");
|
sb.append(" fini [style=invis]\n");
|
||||||
sb.append(" ");
|
sb.append(" ");
|
||||||
sb.append(getNodeID(endPosData.pos, fromIDX));
|
sb.append(getNodeID(endPosData.getPos(), fromIDX));
|
||||||
sb.append(" -> fini [label=\"" + EOS_LABEL + "\"]");
|
sb.append(" -> fini [label=\"" + EOS_LABEL + "\"]");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Records which arcs make up the best bath:
|
// Records which arcs make up the best bath:
|
||||||
private void setBestPathMap(
|
private void setBestPathMap(
|
||||||
WrappedPositionArray positions, int startPos, Position endPosData, int fromIDX) {
|
Viterbi.WrappedPositionArray<? extends Viterbi.Position> positions,
|
||||||
|
int startPos,
|
||||||
|
Viterbi.Position endPosData,
|
||||||
|
int fromIDX) {
|
||||||
bestPathMap.clear();
|
bestPathMap.clear();
|
||||||
|
|
||||||
int pos = endPosData.pos;
|
int pos = endPosData.getPos();
|
||||||
int bestIDX = fromIDX;
|
int bestIDX = fromIDX;
|
||||||
while (pos > startPos) {
|
while (pos > startPos) {
|
||||||
final Position posData = positions.get(pos);
|
final Viterbi.Position posData = positions.get(pos);
|
||||||
|
|
||||||
final int backPos = posData.backPos[bestIDX];
|
final int backPos = posData.getBackPos(bestIDX);
|
||||||
final int backIDX = posData.backIndex[bestIDX];
|
final int backIDX = posData.getBackIndex(bestIDX);
|
||||||
|
|
||||||
final String toNodeID = getNodeID(pos, bestIDX);
|
final String toNodeID = getNodeID(pos, bestIDX);
|
||||||
final String fromNodeID = getNodeID(backPos, backIDX);
|
final String fromNodeID = getNodeID(backPos, backIDX);
|
||||||
|
@ -99,34 +96,34 @@ public class GraphvizFormatter {
|
||||||
}
|
}
|
||||||
|
|
||||||
private String formatNodes(
|
private String formatNodes(
|
||||||
KoreanTokenizer tok,
|
DictionaryProvider<T> dictProvider,
|
||||||
WrappedPositionArray positions,
|
Viterbi.WrappedPositionArray<? extends Viterbi.Position> positions,
|
||||||
int startPos,
|
int startPos,
|
||||||
Position endPosData,
|
Viterbi.Position endPosData,
|
||||||
char[] fragment) {
|
char[] fragment) {
|
||||||
|
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
// Output nodes
|
// Output nodes
|
||||||
for (int pos = startPos + 1; pos <= endPosData.pos; pos++) {
|
for (int pos = startPos + 1; pos <= endPosData.getPos(); pos++) {
|
||||||
final Position posData = positions.get(pos);
|
final Viterbi.Position posData = positions.get(pos);
|
||||||
for (int idx = 0; idx < posData.count; idx++) {
|
for (int idx = 0; idx < posData.getCount(); idx++) {
|
||||||
sb.append(" ");
|
sb.append(" ");
|
||||||
sb.append(getNodeID(pos, idx));
|
sb.append(getNodeID(pos, idx));
|
||||||
sb.append(" [label=\"");
|
sb.append(" [label=\"");
|
||||||
sb.append(pos);
|
sb.append(pos);
|
||||||
sb.append(": ");
|
sb.append(": ");
|
||||||
sb.append(posData.lastRightID[idx]);
|
sb.append(posData.getLastRightID(idx));
|
||||||
sb.append("\"]\n");
|
sb.append("\"]\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Output arcs
|
// Output arcs
|
||||||
for (int pos = endPosData.pos; pos > startPos; pos--) {
|
for (int pos = endPosData.getPos(); pos > startPos; pos--) {
|
||||||
final Position posData = positions.get(pos);
|
final Viterbi.Position posData = positions.get(pos);
|
||||||
for (int idx = 0; idx < posData.count; idx++) {
|
for (int idx = 0; idx < posData.getCount(); idx++) {
|
||||||
final Position backPosData = positions.get(posData.backPos[idx]);
|
final Viterbi.Position backPosData = positions.get(posData.getBackPos(idx));
|
||||||
final String toNodeID = getNodeID(pos, idx);
|
final String toNodeID = getNodeID(pos, idx);
|
||||||
final String fromNodeID = getNodeID(posData.backPos[idx], posData.backIndex[idx]);
|
final String fromNodeID = getNodeID(posData.getBackPos(idx), posData.getBackIndex(idx));
|
||||||
|
|
||||||
sb.append(" ");
|
sb.append(" ");
|
||||||
sb.append(fromNodeID);
|
sb.append(fromNodeID);
|
||||||
|
@ -141,15 +138,15 @@ public class GraphvizFormatter {
|
||||||
attrs = "";
|
attrs = "";
|
||||||
}
|
}
|
||||||
|
|
||||||
final Dictionary<? extends KoMorphData> dict = tok.getDict(posData.backType[idx]);
|
final Dictionary<? extends T> dict = dictProvider.get(posData.getBackType(idx));
|
||||||
final int wordCost = dict.getWordCost(posData.backID[idx]);
|
final int wordCost = dict.getWordCost(posData.getBackID(idx));
|
||||||
final int bgCost =
|
final int bgCost =
|
||||||
costs.get(
|
costs.get(
|
||||||
backPosData.lastRightID[posData.backIndex[idx]],
|
backPosData.getLastRightID(posData.getBackIndex(idx)),
|
||||||
dict.getLeftId(posData.backID[idx]));
|
dict.getLeftId(posData.getBackID(idx)));
|
||||||
|
|
||||||
final String surfaceForm =
|
final String surfaceForm =
|
||||||
new String(fragment, posData.backPos[idx] - startPos, pos - posData.backPos[idx]);
|
new String(fragment, posData.getBackPos(idx) - startPos, pos - posData.getBackPos(idx));
|
||||||
|
|
||||||
sb.append(" [label=\"");
|
sb.append(" [label=\"");
|
||||||
sb.append(surfaceForm);
|
sb.append(surfaceForm);
|
||||||
|
@ -190,4 +187,10 @@ public class GraphvizFormatter {
|
||||||
private String getNodeID(int pos, int idx) {
|
private String getNodeID(int pos, int idx) {
|
||||||
return pos + "." + idx;
|
return pos + "." + idx;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** {@link Dictionary} provider */
|
||||||
|
@FunctionalInterface
|
||||||
|
public interface DictionaryProvider<T extends MorphData> {
|
||||||
|
Dictionary<? extends T> get(TokenType type);
|
||||||
|
}
|
||||||
}
|
}
|
|
@ -0,0 +1,815 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.morph;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.lang.reflect.Array;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import org.apache.lucene.analysis.util.RollingCharBuffer;
|
||||||
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
|
import org.apache.lucene.util.IntsRef;
|
||||||
|
import org.apache.lucene.util.RamUsageEstimator;
|
||||||
|
import org.apache.lucene.util.fst.FST;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Performs <a href="https://en.wikipedia.org/wiki/Viterbi_algorithm">Viterbi algorithm</a> for
|
||||||
|
* morphological Tokenizers, which split texts by Hidden Markov Model or Conditional Random Fields.
|
||||||
|
*
|
||||||
|
* @param <T> output token class
|
||||||
|
* @param <U> position class
|
||||||
|
*/
|
||||||
|
public abstract class Viterbi<T extends Token, U extends Viterbi.Position> {
|
||||||
|
protected static final boolean VERBOSE = false;
|
||||||
|
|
||||||
|
// For safety:
|
||||||
|
protected static final int MAX_UNKNOWN_WORD_LENGTH = 1024;
|
||||||
|
private static final int MAX_BACKTRACE_GAP = 1024;
|
||||||
|
|
||||||
|
private final TokenInfoFST fst;
|
||||||
|
private final BinaryDictionary<? extends MorphData> dictionary;
|
||||||
|
private final Dictionary<? extends MorphData> userDictionary;
|
||||||
|
protected final ConnectionCosts costs;
|
||||||
|
|
||||||
|
private final FST.Arc<Long> arc = new FST.Arc<>();
|
||||||
|
private final FST.BytesReader fstReader;
|
||||||
|
protected final IntsRef wordIdRef = new IntsRef();
|
||||||
|
|
||||||
|
private final FST.BytesReader userFSTReader;
|
||||||
|
private final TokenInfoFST userFST;
|
||||||
|
|
||||||
|
protected final RollingCharBuffer buffer = new RollingCharBuffer();
|
||||||
|
|
||||||
|
protected final WrappedPositionArray<U> positions;
|
||||||
|
|
||||||
|
// True once we've hit the EOF from the input reader:
|
||||||
|
protected boolean end;
|
||||||
|
|
||||||
|
// Last absolute position we backtraced from:
|
||||||
|
protected int lastBackTracePos;
|
||||||
|
|
||||||
|
// Next absolute position to process:
|
||||||
|
protected int pos;
|
||||||
|
|
||||||
|
// Already parsed, but not yet passed to caller, tokens:
|
||||||
|
protected final List<T> pending = new ArrayList<>();
|
||||||
|
|
||||||
|
protected boolean outputNBest = false;
|
||||||
|
|
||||||
|
protected boolean enableSpacePenaltyFactor = false;
|
||||||
|
|
||||||
|
protected boolean outputLongestUserEntryOnly = false;
|
||||||
|
|
||||||
|
protected Viterbi(
|
||||||
|
TokenInfoFST fst,
|
||||||
|
FST.BytesReader fstReader,
|
||||||
|
BinaryDictionary<? extends MorphData> dictionary,
|
||||||
|
TokenInfoFST userFST,
|
||||||
|
FST.BytesReader userFSTReader,
|
||||||
|
Dictionary<? extends MorphData> userDictionary,
|
||||||
|
ConnectionCosts costs,
|
||||||
|
Class<U> positionImpl) {
|
||||||
|
this.fst = fst;
|
||||||
|
this.fstReader = fstReader;
|
||||||
|
this.dictionary = dictionary;
|
||||||
|
this.userFST = userFST;
|
||||||
|
this.userFSTReader = userFSTReader;
|
||||||
|
this.userDictionary = userDictionary;
|
||||||
|
this.costs = costs;
|
||||||
|
this.positions = new WrappedPositionArray<>(positionImpl);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Incrementally parse some more characters. This runs the viterbi search forwards "enough" so
|
||||||
|
* that we generate some more tokens. How much forward depends on the chars coming in, since some
|
||||||
|
* chars could cause longer-lasting ambiguity in the parsing. Once the ambiguity is resolved, then
|
||||||
|
* we back trace, produce the pending tokens, and return.
|
||||||
|
*/
|
||||||
|
public final void forward() throws IOException {
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println("\nPARSE");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Index of the last character of unknown word:
|
||||||
|
int unknownWordEndIndex = -1;
|
||||||
|
|
||||||
|
// Maximum posAhead of user word in the entire input
|
||||||
|
int userWordMaxPosAhead = -1;
|
||||||
|
|
||||||
|
// Advances over each position (character):
|
||||||
|
while (buffer.get(pos) != -1) {
|
||||||
|
final Position posData = positions.get(pos);
|
||||||
|
final boolean isFrontier = positions.getNextPos() == pos + 1;
|
||||||
|
|
||||||
|
if (posData.count == 0) {
|
||||||
|
// No arcs arrive here; move to next position:
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" no arcs in; skip pos=" + pos);
|
||||||
|
}
|
||||||
|
pos++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (pos > lastBackTracePos && posData.count == 1 && isFrontier) {
|
||||||
|
// We are at a "frontier", and only one node is
|
||||||
|
// alive, so whatever the eventual best path is must
|
||||||
|
// come through this node. So we can safely commit
|
||||||
|
// to the prefix of the best path at this point:
|
||||||
|
if (outputNBest) {
|
||||||
|
backtraceNBest(posData, false);
|
||||||
|
}
|
||||||
|
backtrace(posData, 0);
|
||||||
|
if (outputNBest) {
|
||||||
|
fixupPendingList();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Re-base cost so we don't risk int overflow:
|
||||||
|
posData.costs[0] = 0;
|
||||||
|
if (pending.size() > 0) {
|
||||||
|
return;
|
||||||
|
} else {
|
||||||
|
// This means the backtrace only produced
|
||||||
|
// punctuation tokens, so we must keep parsing.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (pos - lastBackTracePos >= MAX_BACKTRACE_GAP) {
|
||||||
|
// Safety: if we've buffered too much, force a
|
||||||
|
// backtrace now. We find the least-cost partial
|
||||||
|
// path, across all paths, backtrace from it, and
|
||||||
|
// then prune all others. Note that this, in
|
||||||
|
// general, can produce the wrong result, if the
|
||||||
|
// total best path did not in fact back trace
|
||||||
|
// through this partial best path. But it's the
|
||||||
|
// best we can do... (short of not having a
|
||||||
|
// safety!).
|
||||||
|
|
||||||
|
// First pass: find least cost partial path so far,
|
||||||
|
// including ending at future positions:
|
||||||
|
int leastIDX = -1;
|
||||||
|
int leastCost = Integer.MAX_VALUE;
|
||||||
|
Position leastPosData = null;
|
||||||
|
for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
|
||||||
|
final Position posData2 = positions.get(pos2);
|
||||||
|
for (int idx = 0; idx < posData2.count; idx++) {
|
||||||
|
// System.out.println(" idx=" + idx + " cost=" + cost);
|
||||||
|
final int cost = posData2.costs[idx];
|
||||||
|
if (cost < leastCost) {
|
||||||
|
leastCost = cost;
|
||||||
|
leastIDX = idx;
|
||||||
|
leastPosData = posData2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// We will always have at least one live path:
|
||||||
|
assert leastIDX != -1;
|
||||||
|
|
||||||
|
if (outputNBest) {
|
||||||
|
backtraceNBest(leastPosData, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Second pass: prune all but the best path:
|
||||||
|
for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
|
||||||
|
final Position posData2 = positions.get(pos2);
|
||||||
|
if (posData2 != leastPosData) {
|
||||||
|
posData2.reset();
|
||||||
|
} else {
|
||||||
|
if (leastIDX != 0) {
|
||||||
|
posData2.costs[0] = posData2.costs[leastIDX];
|
||||||
|
posData2.lastRightID[0] = posData2.lastRightID[leastIDX];
|
||||||
|
posData2.backPos[0] = posData2.backPos[leastIDX];
|
||||||
|
posData2.backWordPos[0] = posData2.backWordPos[leastIDX];
|
||||||
|
posData2.backIndex[0] = posData2.backIndex[leastIDX];
|
||||||
|
posData2.backID[0] = posData2.backID[leastIDX];
|
||||||
|
posData2.backType[0] = posData2.backType[leastIDX];
|
||||||
|
}
|
||||||
|
posData2.count = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
backtrace(leastPosData, 0);
|
||||||
|
if (outputNBest) {
|
||||||
|
fixupPendingList();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Re-base cost so we don't risk int overflow:
|
||||||
|
Arrays.fill(leastPosData.costs, 0, leastPosData.count, 0);
|
||||||
|
|
||||||
|
if (pos != leastPosData.pos) {
|
||||||
|
// We jumped into a future position:
|
||||||
|
assert pos < leastPosData.pos;
|
||||||
|
pos = leastPosData.pos;
|
||||||
|
}
|
||||||
|
if (pending.size() > 0) {
|
||||||
|
return;
|
||||||
|
} else {
|
||||||
|
// This means the backtrace only produced
|
||||||
|
// punctuation tokens, so we must keep parsing.
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(
|
||||||
|
"\n extend @ pos="
|
||||||
|
+ pos
|
||||||
|
+ " char="
|
||||||
|
+ (char) buffer.get(pos)
|
||||||
|
+ " hex="
|
||||||
|
+ Integer.toHexString(buffer.get(pos)));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" " + posData.count + " arcs in");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (enableSpacePenaltyFactor
|
||||||
|
&& Character.getType(buffer.get(pos)) == Character.SPACE_SEPARATOR) {
|
||||||
|
// We add single space separator as prefixes of the terms that we extract.
|
||||||
|
// This information is needed to compute the space penalty factor of each term.
|
||||||
|
// These whitespace prefixes are removed when the final tokens are generated, or
|
||||||
|
// added as separated tokens when discardPunctuation is unset.
|
||||||
|
if (buffer.get(++pos) == -1) {
|
||||||
|
pos = posData.pos;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean anyMatches = false;
|
||||||
|
|
||||||
|
// First try user dict:
|
||||||
|
if (userFST != null) {
|
||||||
|
userFST.getFirstArc(arc);
|
||||||
|
int output = 0;
|
||||||
|
int maxPosAhead = 0;
|
||||||
|
int outputMaxPosAhead = 0;
|
||||||
|
int arcFinalOutMaxPosAhead = 0;
|
||||||
|
|
||||||
|
for (int posAhead = pos; ; posAhead++) {
|
||||||
|
final int ch = buffer.get(posAhead);
|
||||||
|
if (ch == -1) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (userFST.findTargetArc(ch, arc, arc, posAhead == pos, userFSTReader) == null) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
output += arc.output().intValue();
|
||||||
|
if (arc.isFinal()) {
|
||||||
|
maxPosAhead = posAhead;
|
||||||
|
outputMaxPosAhead = output;
|
||||||
|
arcFinalOutMaxPosAhead = arc.nextFinalOutput().intValue();
|
||||||
|
anyMatches = true;
|
||||||
|
if (!outputLongestUserEntryOnly) {
|
||||||
|
// add all matched user entries.
|
||||||
|
add(
|
||||||
|
userDictionary.getMorphAttributes(),
|
||||||
|
posData,
|
||||||
|
pos,
|
||||||
|
posAhead + 1,
|
||||||
|
output + arc.nextFinalOutput().intValue(),
|
||||||
|
TokenType.USER,
|
||||||
|
false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Longest matching for user word
|
||||||
|
if (anyMatches && maxPosAhead > userWordMaxPosAhead) {
|
||||||
|
if (outputLongestUserEntryOnly) {
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(
|
||||||
|
" USER word "
|
||||||
|
+ new String(buffer.get(pos, maxPosAhead + 1))
|
||||||
|
+ " toPos="
|
||||||
|
+ (maxPosAhead + 1));
|
||||||
|
}
|
||||||
|
add(
|
||||||
|
userDictionary.getMorphAttributes(),
|
||||||
|
posData,
|
||||||
|
pos,
|
||||||
|
maxPosAhead + 1,
|
||||||
|
outputMaxPosAhead + arcFinalOutMaxPosAhead,
|
||||||
|
TokenType.USER,
|
||||||
|
false);
|
||||||
|
}
|
||||||
|
userWordMaxPosAhead = Math.max(userWordMaxPosAhead, maxPosAhead);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: we can be more aggressive about user
|
||||||
|
// matches? if we are "under" a user match then don't
|
||||||
|
// extend KNOWN/UNKNOWN paths?
|
||||||
|
|
||||||
|
if (!anyMatches) {
|
||||||
|
// Next, try known dictionary matches
|
||||||
|
fst.getFirstArc(arc);
|
||||||
|
int output = 0;
|
||||||
|
|
||||||
|
for (int posAhead = pos; ; posAhead++) {
|
||||||
|
final int ch = buffer.get(posAhead);
|
||||||
|
if (ch == -1) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// System.out.println(" match " + (char) ch + " posAhead=" + posAhead);
|
||||||
|
|
||||||
|
if (fst.findTargetArc(ch, arc, arc, posAhead == pos, fstReader) == null) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
output += arc.output().intValue();
|
||||||
|
|
||||||
|
// Optimization: for known words that are too-long
|
||||||
|
// (compound), we should pre-compute the 2nd
|
||||||
|
// best segmentation and store it in the
|
||||||
|
// dictionary instead of recomputing it each time a
|
||||||
|
// match is found.
|
||||||
|
|
||||||
|
if (arc.isFinal()) {
|
||||||
|
dictionary.lookupWordIds(output + arc.nextFinalOutput().intValue(), wordIdRef);
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(
|
||||||
|
" KNOWN word "
|
||||||
|
+ new String(buffer.get(pos, posAhead - pos + 1))
|
||||||
|
+ " toPos="
|
||||||
|
+ (posAhead + 1)
|
||||||
|
+ " "
|
||||||
|
+ wordIdRef.length
|
||||||
|
+ " wordIDs");
|
||||||
|
}
|
||||||
|
for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
|
||||||
|
add(
|
||||||
|
dictionary.getMorphAttributes(),
|
||||||
|
posData,
|
||||||
|
pos,
|
||||||
|
posAhead + 1,
|
||||||
|
wordIdRef.ints[wordIdRef.offset + ofs],
|
||||||
|
TokenType.KNOWN,
|
||||||
|
false);
|
||||||
|
anyMatches = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!shouldSkipProcessUnknownWord(unknownWordEndIndex, posData)) {
|
||||||
|
int unknownWordLength = processUnknownWord(anyMatches, posData);
|
||||||
|
unknownWordEndIndex = posData.pos + unknownWordLength;
|
||||||
|
}
|
||||||
|
pos++;
|
||||||
|
}
|
||||||
|
|
||||||
|
end = true;
|
||||||
|
|
||||||
|
if (pos > 0) {
|
||||||
|
|
||||||
|
final Position endPosData = positions.get(pos);
|
||||||
|
int leastCost = Integer.MAX_VALUE;
|
||||||
|
int leastIDX = -1;
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" end: " + endPosData.count + " nodes");
|
||||||
|
}
|
||||||
|
for (int idx = 0; idx < endPosData.count; idx++) {
|
||||||
|
// Add EOS cost:
|
||||||
|
final int cost = endPosData.costs[idx] + costs.get(endPosData.lastRightID[idx], 0);
|
||||||
|
// System.out.println(" idx=" + idx + " cost=" + cost + " (pathCost=" +
|
||||||
|
// endPosData.costs[idx] + " bgCost=" + costs.get(endPosData.lastRightID[idx], 0) + ")
|
||||||
|
// backPos=" + endPosData.backPos[idx]);
|
||||||
|
if (cost < leastCost) {
|
||||||
|
leastCost = cost;
|
||||||
|
leastIDX = idx;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (outputNBest) {
|
||||||
|
backtraceNBest(endPosData, true);
|
||||||
|
}
|
||||||
|
backtrace(endPosData, leastIDX);
|
||||||
|
if (outputNBest) {
|
||||||
|
fixupPendingList();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// No characters in the input string; return no tokens!
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected boolean shouldSkipProcessUnknownWord(int unknownWordEndIndex, Position posData) {
|
||||||
|
return unknownWordEndIndex > posData.pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Add unknown words to the position graph.
|
||||||
|
*
|
||||||
|
* @return word length
|
||||||
|
*/
|
||||||
|
protected abstract int processUnknownWord(boolean anyMatches, Position posData)
|
||||||
|
throws IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Backtrace from the provided position, back to the last time we back-traced, accumulating the
|
||||||
|
* resulting tokens to the pending list. The pending list is then in-reverse (last token should be
|
||||||
|
* returned first).
|
||||||
|
*/
|
||||||
|
protected abstract void backtrace(final Position endPosData, final int fromIDX)
|
||||||
|
throws IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Backtrace the n-best path. Subclasses that support n-best paths should implement this method.
|
||||||
|
*/
|
||||||
|
protected void backtraceNBest(final Position endPosData, final boolean useEOS)
|
||||||
|
throws IOException {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Remove duplicated tokens from the pending list; this is needed because {@link
|
||||||
|
* #backtrace(Position, int)} and {@link #backtraceNBest(Position, boolean)} can add same tokens
|
||||||
|
* to the list. Subclasses that support n-best paths should implement this method.
|
||||||
|
*/
|
||||||
|
protected void fixupPendingList() {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Add a token on the minimum cost path to the pending token list. */
|
||||||
|
protected final void add(
|
||||||
|
MorphData morphData,
|
||||||
|
Position fromPosData,
|
||||||
|
int wordPos,
|
||||||
|
int endPos,
|
||||||
|
int wordID,
|
||||||
|
TokenType type,
|
||||||
|
boolean addPenalty)
|
||||||
|
throws IOException {
|
||||||
|
final int wordCost = morphData.getWordCost(wordID);
|
||||||
|
final int leftID = morphData.getLeftId(wordID);
|
||||||
|
int leastCost = Integer.MAX_VALUE;
|
||||||
|
int leastIDX = -1;
|
||||||
|
assert fromPosData.count > 0;
|
||||||
|
for (int idx = 0; idx < fromPosData.count; idx++) {
|
||||||
|
// The number of spaces before the term
|
||||||
|
int numSpaces = wordPos - fromPosData.pos;
|
||||||
|
|
||||||
|
// Cost is path cost so far, plus word cost (added at
|
||||||
|
// end of loop), plus bigram cost and space penalty cost.
|
||||||
|
final int cost =
|
||||||
|
fromPosData.costs[idx]
|
||||||
|
+ costs.get(fromPosData.lastRightID[idx], leftID)
|
||||||
|
+ computeSpacePenalty(morphData, wordID, numSpaces);
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(
|
||||||
|
" fromIDX="
|
||||||
|
+ idx
|
||||||
|
+ ": cost="
|
||||||
|
+ cost
|
||||||
|
+ " (prevCost="
|
||||||
|
+ fromPosData.costs[idx]
|
||||||
|
+ " wordCost="
|
||||||
|
+ wordCost
|
||||||
|
+ " bgCost="
|
||||||
|
+ costs.get(fromPosData.lastRightID[idx], leftID)
|
||||||
|
+ " spacePenalty="
|
||||||
|
+ computeSpacePenalty(morphData, wordID, numSpaces)
|
||||||
|
+ ") leftID="
|
||||||
|
+ leftID
|
||||||
|
// + " leftPOS="
|
||||||
|
// + leftPOS.name()
|
||||||
|
+ ")");
|
||||||
|
}
|
||||||
|
if (cost < leastCost) {
|
||||||
|
leastCost = cost;
|
||||||
|
leastIDX = idx;
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" **");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
leastCost += wordCost;
|
||||||
|
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(
|
||||||
|
" + cost="
|
||||||
|
+ leastCost
|
||||||
|
+ " wordID="
|
||||||
|
+ wordID
|
||||||
|
+ " leftID="
|
||||||
|
+ leftID
|
||||||
|
+ " leastIDX="
|
||||||
|
+ leastIDX
|
||||||
|
+ " toPos="
|
||||||
|
+ endPos
|
||||||
|
+ " toPos.idx="
|
||||||
|
+ positions.get(endPos).count);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (addPenalty && type != TokenType.USER) {
|
||||||
|
final int penalty = computePenalty(fromPosData.pos, endPos - fromPosData.pos);
|
||||||
|
if (VERBOSE) {
|
||||||
|
if (penalty > 0) {
|
||||||
|
System.out.println(" + penalty=" + penalty + " cost=" + (leastCost + penalty));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
leastCost += penalty;
|
||||||
|
}
|
||||||
|
|
||||||
|
positions
|
||||||
|
.get(endPos)
|
||||||
|
.add(
|
||||||
|
leastCost,
|
||||||
|
morphData.getRightId(wordID),
|
||||||
|
fromPosData.pos,
|
||||||
|
wordPos,
|
||||||
|
leastIDX,
|
||||||
|
wordID,
|
||||||
|
type);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the space penalty. */
|
||||||
|
protected int computeSpacePenalty(MorphData morphData, int wordID, int numSpaces) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the penalty for a specific input region */
|
||||||
|
protected int computePenalty(int pos, int length) throws IOException {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getPos() {
|
||||||
|
return pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isEnd() {
|
||||||
|
return end;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<T> getPending() {
|
||||||
|
return pending;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isOutputNBest() {
|
||||||
|
return outputNBest;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void resetBuffer(Reader reader) {
|
||||||
|
buffer.reset(reader);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void resetState() {
|
||||||
|
positions.reset();
|
||||||
|
pos = 0;
|
||||||
|
end = false;
|
||||||
|
lastBackTracePos = 0;
|
||||||
|
pending.clear();
|
||||||
|
|
||||||
|
// Add BOS:
|
||||||
|
positions.get(0).add(0, 0, -1, -1, -1, -1, TokenType.KNOWN);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Holds all back pointers arriving to this position.
 *
 * <p>NOTE: This and subclasses must have no-arg constructor. See {@link WrappedPositionArray}.
 */
public static class Position {

  // Absolute character position in the input that this lattice node represents.
  int pos;

  // Number of valid entries currently stored in the parallel arrays below.
  int count;

  // Parallel arrays, one slot per arriving path; all are grown together in grow().
  // maybe single int array * 5?
  int[] costs = new int[8];
  int[] lastRightID = new int[8];
  int[] backPos = new int[8];
  int[] backWordPos = new int[8];
  int[] backIndex = new int[8];
  int[] backID = new int[8];
  TokenType[] backType = new TokenType[8];

  // Grows every parallel array so it can hold at least count+1 entries.
  private void grow() {
    costs = ArrayUtil.grow(costs, 1 + count);
    lastRightID = ArrayUtil.grow(lastRightID, 1 + count);
    backPos = ArrayUtil.grow(backPos, 1 + count);
    backWordPos = ArrayUtil.grow(backWordPos, 1 + count);
    backIndex = ArrayUtil.grow(backIndex, 1 + count);
    backID = ArrayUtil.grow(backID, 1 + count);

    // NOTE: sneaky: grow separately because
    // ArrayUtil.grow will otherwise pick a different
    // length than the int[]s we just grew:
    final TokenType[] newBackType = new TokenType[backID.length];
    System.arraycopy(backType, 0, newBackType, 0, backType.length);
    backType = newBackType;
  }

  /**
   * Appends one arriving path (back pointer) to this position.
   *
   * @param cost accumulated path cost for this arrival
   * @param lastRightID right context id of the last token on this path
   * @param backPos absolute position this path came from
   * @param backRPos absolute position where the arriving word itself starts (stored as
   *     backWordPos; may differ from backPos — e.g. when whitespace was skipped)
   * @param backIndex index of the chosen entry inside the source position
   * @param backID dictionary word id of the arriving token
   * @param backType dictionary type (known/unknown/user) of the arriving token
   */
  public void add(
      int cost,
      int lastRightID,
      int backPos,
      int backRPos,
      int backIndex,
      int backID,
      TokenType backType) {
    // NOTE: this isn't quite a true Viterbi search,
    // because we should check if lastRightID is
    // already present here, and only update if the new
    // cost is less than the current cost, instead of
    // simply appending. However, that will likely hurt
    // performance (usually we add a lastRightID only once),
    // and it means we actually create the full graph
    // intersection instead of a "normal" Viterbi lattice:
    if (count == costs.length) {
      grow();
    }
    this.costs[count] = cost;
    this.lastRightID[count] = lastRightID;
    this.backPos[count] = backPos;
    this.backWordPos[count] = backRPos;
    this.backIndex[count] = backIndex;
    this.backID[count] = backID;
    this.backType[count] = backType;
    count++;
  }

  /** Discards all stored back pointers; the backing arrays are kept for reuse. */
  public void reset() {
    count = 0;
  }

  /** Returns the absolute input position this node represents. */
  public int getPos() {
    return pos;
  }

  /** Returns the number of back pointers stored at this position. */
  public int getCount() {
    return count;
  }

  /** Sets the number of valid entries (used by callers that trim the arrival list). */
  public void setCount(int count) {
    this.count = count;
  }

  /** Returns the accumulated path cost of entry {@code index}. */
  public int getCost(int index) {
    return costs[index];
  }

  /** Returns the source position of entry {@code index}. */
  public int getBackPos(int index) {
    return backPos[index];
  }

  /** Returns the word start position of entry {@code index}. */
  public int getBackWordPos(int index) {
    return backWordPos[index];
  }

  /** Returns the dictionary word id of entry {@code index}. */
  public int getBackID(int index) {
    return backID[index];
  }

  /** Returns the entry index within the source position for entry {@code index}. */
  public int getBackIndex(int index) {
    return backIndex[index];
  }

  /** Returns the dictionary type of entry {@code index}. */
  public TokenType getBackType(int index) {
    return backType[index];
  }

  /** Returns the right context id of entry {@code index}. */
  public int getLastRightID(int index) {
    return lastRightID[index];
  }
}
/**
 * Holds partial graph (array of positions) for calculating the minimum cost path.
 *
 * <p>Implemented as a circular buffer of {@code U} instances keyed by absolute input position:
 * {@link #get} lazily extends the window forward and {@link #freeBefore} releases positions the
 * backtrace no longer needs, so instances are recycled rather than reallocated.
 */
public static final class WrappedPositionArray<U extends Position> {
  // Circular buffer of recycled Position instances.
  private U[] positions;
  // Concrete element type; needed to reflectively allocate arrays and instances.
  private final Class<U> clazz;

  @SuppressWarnings("unchecked")
  WrappedPositionArray(Class<U> clazz) {
    this.clazz = clazz;
    positions = (U[]) Array.newInstance(clazz, 8);
    for (int i = 0; i < positions.length; i++) {
      try {
        // Requires the no-arg constructor documented on Position.
        positions[i] = clazz.getConstructor().newInstance();
      } catch (ReflectiveOperationException e) {
        // shouldn't happen; Position class should have no-arg constructor.
        throw new IllegalStateException(e);
      }
    }
  }

  // Next array index to write to in positions:
  private int nextWrite;

  // Next position to write:
  private int nextPos;

  // How many valid Position instances are held in the
  // positions array:
  private int count;

  /** Resets every live Position and rewinds the window to absolute position 0. */
  void reset() {
    // Walk backwards from the slot before nextWrite, wrapping at the array start,
    // resetting each of the `count` live instances.
    nextWrite--;
    while (count > 0) {
      if (nextWrite == -1) {
        nextWrite = positions.length - 1;
      }
      positions[nextWrite--].reset();
      count--;
    }
    nextWrite = 0;
    nextPos = 0;
    count = 0;
  }

  /**
   * Get Position instance for this absolute position; this is allowed to be arbitrarily far "in
   * the future" but cannot be before the last freeBefore.
   */
  @SuppressWarnings("unchecked")
  public U get(int pos) {
    // Extend the window forward until `pos` is covered, growing the circular
    // buffer when all slots are in use.
    while (pos >= nextPos) {
      // System.out.println("count=" + count + " vs len=" + positions.length);
      if (count == positions.length) {
        // Position[] newPositions =
        // new Position[ArrayUtil.oversize(1 + count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
        U[] newPositions =
            (U[])
                Array.newInstance(
                    clazz, ArrayUtil.oversize(1 + count, RamUsageEstimator.NUM_BYTES_OBJECT_REF));
        // System.out.println("grow positions " + newPositions.length);
        // Unwrap the circular contents into the front of the new array:
        System.arraycopy(positions, nextWrite, newPositions, 0, positions.length - nextWrite);
        System.arraycopy(positions, 0, newPositions, positions.length - nextWrite, nextWrite);
        for (int i = positions.length; i < newPositions.length; i++) {
          try {
            newPositions[i] = clazz.getConstructor().newInstance();
          } catch (ReflectiveOperationException e) {
            // shouldn't happen
            throw new IllegalStateException(e);
          }
        }
        nextWrite = positions.length;
        positions = newPositions;
      }
      if (nextWrite == positions.length) {
        nextWrite = 0;
      }
      // Should have already been reset:
      assert positions[nextWrite].count == 0;
      positions[nextWrite++].pos = nextPos++;
      count++;
    }
    assert inBounds(pos);
    final int index = getIndex(pos);
    assert positions[index].pos == pos;
    return positions[index];
  }

  /** Returns the first absolute position not yet materialized. */
  int getNextPos() {
    return nextPos;
  }

  // For assert:
  private boolean inBounds(int pos) {
    return pos < nextPos && pos >= nextPos - count;
  }

  // Maps an absolute position to its slot in the circular buffer.
  private int getIndex(int pos) {
    int index = nextWrite - (nextPos - pos);
    if (index < 0) {
      index += positions.length;
    }
    return index;
  }

  /** Releases (resets) every held position strictly before {@code pos} for reuse. */
  public void freeBefore(int pos) {
    final int toFree = count - (nextPos - pos);
    assert toFree >= 0;
    assert toFree <= count;
    int index = nextWrite - count;
    if (index < 0) {
      index += positions.length;
    }
    for (int i = 0; i < toFree; i++) {
      if (index == positions.length) {
        index = 0;
      }
      // System.out.println(" fb idx=" + index);
      positions[index].reset();
      index++;
    }
    count -= toFree;
  }
}
|
}
|
|
@ -1,195 +0,0 @@
|
||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.analysis.ja;
|
|
||||||
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Map;
|
|
||||||
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Position;
|
|
||||||
import org.apache.lucene.analysis.ja.JapaneseTokenizer.WrappedPositionArray;
|
|
||||||
import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
|
|
||||||
import org.apache.lucene.analysis.ja.dict.JaMorphData;
|
|
||||||
import org.apache.lucene.analysis.morph.Dictionary;
|
|
||||||
|
|
||||||
// TODO: would be nice to show 2nd best path in a diff't
|
|
||||||
// color...
|
|
||||||
|
|
||||||
/** Outputs the dot (graphviz) string for the viterbi lattice. */
public class GraphvizFormatter {

  private static final String BOS_LABEL = "BOS";

  private static final String EOS_LABEL = "EOS";

  private static final String FONT_NAME = "Helvetica";

  // Connection costs used to label arcs with their bigram cost.
  private final ConnectionCosts costs;

  // Maps fromNodeID -> toNodeID for arcs on the best path, so they can be highlighted.
  private final Map<String, String> bestPathMap;

  // Accumulates the dot output across incremental backtraces.
  private final StringBuilder sb = new StringBuilder();

  public GraphvizFormatter(ConnectionCosts costs) {
    this.costs = costs;
    this.bestPathMap = new HashMap<>();
    sb.append(formatHeader());
    sb.append(" init [style=invis]\n");
    sb.append(" init -> 0.0 [label=\"" + BOS_LABEL + "\"]\n");
  }

  /** Closes the digraph and returns the complete dot string. */
  public String finish() {
    sb.append(formatTrailer());
    return sb.toString();
  }

  // Backtraces another incremental fragment:
  void onBacktrace(
      JapaneseTokenizer tok,
      WrappedPositionArray positions,
      int lastBackTracePos,
      Position endPosData,
      int fromIDX,
      char[] fragment,
      boolean isEnd) {
    setBestPathMap(positions, lastBackTracePos, endPosData, fromIDX);
    sb.append(formatNodes(tok, positions, lastBackTracePos, endPosData, fragment));
    if (isEnd) {
      sb.append(" fini [style=invis]\n");
      sb.append(" ");
      sb.append(getNodeID(endPosData.pos, fromIDX));
      sb.append(" -> fini [label=\"" + EOS_LABEL + "\"]");
    }
  }

  // Records which arcs make up the best bath:
  private void setBestPathMap(
      WrappedPositionArray positions, int startPos, Position endPosData, int fromIDX) {
    bestPathMap.clear();

    // Walk the back pointers from the end node toward startPos, recording each arc.
    int pos = endPosData.pos;
    int bestIDX = fromIDX;
    while (pos > startPos) {
      final Position posData = positions.get(pos);

      final int backPos = posData.backPos[bestIDX];
      final int backIDX = posData.backIndex[bestIDX];

      final String toNodeID = getNodeID(pos, bestIDX);
      final String fromNodeID = getNodeID(backPos, backIDX);

      // The best path is a simple chain: each node appears at most once on either side.
      assert !bestPathMap.containsKey(fromNodeID);
      assert !bestPathMap.containsValue(toNodeID);
      bestPathMap.put(fromNodeID, toNodeID);
      pos = backPos;
      bestIDX = backIDX;
    }
  }

  // Emits dot nodes and arcs for the lattice slice [startPos+1, endPosData.pos].
  private String formatNodes(
      JapaneseTokenizer tok,
      WrappedPositionArray positions,
      int startPos,
      Position endPosData,
      char[] fragment) {

    // NOTE: local sb intentionally shadows the field; this method returns its own fragment.
    StringBuilder sb = new StringBuilder();
    // Output nodes
    for (int pos = startPos + 1; pos <= endPosData.pos; pos++) {
      final Position posData = positions.get(pos);
      for (int idx = 0; idx < posData.count; idx++) {
        sb.append(" ");
        sb.append(getNodeID(pos, idx));
        sb.append(" [label=\"");
        sb.append(pos);
        sb.append(": ");
        sb.append(posData.lastRightID[idx]);
        sb.append("\"]\n");
      }
    }

    // Output arcs
    for (int pos = endPosData.pos; pos > startPos; pos--) {
      final Position posData = positions.get(pos);
      for (int idx = 0; idx < posData.count; idx++) {
        final Position backPosData = positions.get(posData.backPos[idx]);
        final String toNodeID = getNodeID(pos, idx);
        final String fromNodeID = getNodeID(posData.backPos[idx], posData.backIndex[idx]);

        sb.append(" ");
        sb.append(fromNodeID);
        sb.append(" -> ");
        sb.append(toNodeID);

        final String attrs;
        if (toNodeID.equals(bestPathMap.get(fromNodeID))) {
          // This arc is on best path
          attrs = " color=\"#40e050\" fontcolor=\"#40a050\" penwidth=3 fontsize=20";
        } else {
          attrs = "";
        }

        // Label each arc with surface form, word cost and bigram connection cost.
        final Dictionary<? extends JaMorphData> dict = tok.getDict(posData.backType[idx]);
        final int wordCost = dict.getWordCost(posData.backID[idx]);
        final int bgCost =
            costs.get(
                backPosData.lastRightID[posData.backIndex[idx]],
                dict.getLeftId(posData.backID[idx]));

        final String surfaceForm =
            new String(fragment, posData.backPos[idx] - startPos, pos - posData.backPos[idx]);

        sb.append(" [label=\"");
        sb.append(surfaceForm);
        sb.append(' ');
        sb.append(wordCost);
        if (bgCost >= 0) {
          sb.append('+');
        }
        sb.append(bgCost);
        sb.append("\"");
        sb.append(attrs);
        sb.append("]\n");
      }
    }
    return sb.toString();
  }

  // Emits the digraph preamble and default node/edge styling.
  private String formatHeader() {
    StringBuilder sb = new StringBuilder();
    sb.append("digraph viterbi {\n");
    sb.append(
        " graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\"];\n");
    // sb.append(" // A2 paper size\n");
    // sb.append(" size = \"34.4,16.5\";\n");
    // sb.append(" // try to fill paper\n");
    // sb.append(" ratio = fill;\n");
    sb.append(" edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n");
    sb.append(
        " node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\""
            + FONT_NAME
            + "\" ]\n");

    return sb.toString();
  }

  private String formatTrailer() {
    return "}";
  }

  // Dot node id is "position.entryIndex".
  private String getNodeID(int pos, int idx) {
    return pos + "." + idx;
  }
}
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -31,8 +31,10 @@ import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
|
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
|
||||||
import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
|
import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
|
||||||
|
import org.apache.lucene.analysis.ja.dict.JaMorphData;
|
||||||
import org.apache.lucene.analysis.ja.dict.UserDictionary;
|
import org.apache.lucene.analysis.ja.dict.UserDictionary;
|
||||||
import org.apache.lucene.analysis.ja.tokenattributes.*;
|
import org.apache.lucene.analysis.ja.tokenattributes.*;
|
||||||
|
import org.apache.lucene.analysis.morph.GraphvizFormatter;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.tests.analysis.MockGraphTokenFilter;
|
import org.apache.lucene.tests.analysis.MockGraphTokenFilter;
|
||||||
|
@ -518,7 +520,8 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testLatticeToDot() throws Exception {
|
public void testLatticeToDot() throws Exception {
|
||||||
final GraphvizFormatter gv2 = new GraphvizFormatter(ConnectionCosts.getInstance());
|
final GraphvizFormatter<JaMorphData> gv2 =
|
||||||
|
new GraphvizFormatter<>(ConnectionCosts.getInstance());
|
||||||
final Analyzer analyzer =
|
final Analyzer analyzer =
|
||||||
new Analyzer() {
|
new Analyzer() {
|
||||||
@Override
|
@Override
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,447 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.ko;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.EnumMap;
|
||||||
|
import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
|
||||||
|
import org.apache.lucene.analysis.ko.dict.KoMorphData;
|
||||||
|
import org.apache.lucene.analysis.ko.dict.TokenInfoDictionary;
|
||||||
|
import org.apache.lucene.analysis.ko.dict.UnknownDictionary;
|
||||||
|
import org.apache.lucene.analysis.ko.dict.UserDictionary;
|
||||||
|
import org.apache.lucene.analysis.morph.ConnectionCosts;
|
||||||
|
import org.apache.lucene.analysis.morph.Dictionary;
|
||||||
|
import org.apache.lucene.analysis.morph.GraphvizFormatter;
|
||||||
|
import org.apache.lucene.analysis.morph.MorphData;
|
||||||
|
import org.apache.lucene.analysis.morph.TokenInfoFST;
|
||||||
|
import org.apache.lucene.analysis.morph.TokenType;
|
||||||
|
import org.apache.lucene.util.fst.FST;
|
||||||
|
|
||||||
|
/** {@link org.apache.lucene.analysis.morph.Viterbi} subclass for Korean morphological analysis. */
final class Viterbi
    extends org.apache.lucene.analysis.morph.Viterbi<
        Token, org.apache.lucene.analysis.morph.Viterbi.Position> {

  // Routes a token's TokenType to the dictionary that produced it.
  private final EnumMap<TokenType, Dictionary<? extends KoMorphData>> dictionaryMap =
      new EnumMap<>(TokenType.class);

  private final UnknownDictionary unkDictionary;
  private final CharacterDefinition characterDefinition;

  // Tokenizer options captured at construction time.
  private final boolean discardPunctuation;
  private final KoreanTokenizer.DecompoundMode mode;
  private final boolean outputUnknownUnigrams;

  // Optional graphviz debug output; null unless setGraphvizFormatter was called.
  private GraphvizFormatter<KoMorphData> dotOut;

  Viterbi(
      TokenInfoFST fst,
      FST.BytesReader fstReader,
      TokenInfoDictionary dictionary,
      TokenInfoFST userFST,
      FST.BytesReader userFSTReader,
      UserDictionary userDictionary,
      ConnectionCosts costs,
      UnknownDictionary unkDictionary,
      CharacterDefinition characterDefinition,
      boolean discardPunctuation,
      KoreanTokenizer.DecompoundMode mode,
      boolean outputUnknownUnigrams) {
    super(
        fst, fstReader, dictionary, userFST, userFSTReader, userDictionary, costs, Position.class);
    this.unkDictionary = unkDictionary;
    this.characterDefinition = characterDefinition;
    this.discardPunctuation = discardPunctuation;
    this.mode = mode;
    this.outputUnknownUnigrams = outputUnknownUnigrams;
    // Korean analysis penalizes words preceded by spaces (see computeSpacePenalty)
    // and only keeps the longest user-dictionary match.
    this.enableSpacePenaltyFactor = true;
    this.outputLongestUserEntryOnly = true;
    dictionaryMap.put(TokenType.KNOWN, dictionary);
    dictionaryMap.put(TokenType.UNKNOWN, unkDictionary);
    dictionaryMap.put(TokenType.USER, userDictionary);
  }

  /**
   * Adds unknown-word candidates starting at the current position to the lattice.
   *
   * <p>Invoked by the base class during the forward pass; extends the unknown word across
   * consecutive characters of the same script (subject to punctuation/digit/group splits), then
   * adds one lattice arc per word id found for the character class.
   */
  @Override
  protected int processUnknownWord(boolean anyMatches, Position posData) throws IOException {
    final char firstCharacter = (char) buffer.get(pos);
    if (!anyMatches || characterDefinition.isInvoke(firstCharacter)) {

      // Find unknown match:
      int characterId = characterDefinition.getCharacterClass(firstCharacter);
      // NOTE: copied from UnknownDictionary.lookup:
      int unknownWordLength;
      if (!characterDefinition.isGroup(firstCharacter)) {
        unknownWordLength = 1;
      } else {
        // Extract unknown word. Characters with the same script are considered to be part of
        // unknown word
        unknownWordLength = 1;
        Character.UnicodeScript scriptCode = Character.UnicodeScript.of(firstCharacter);
        final boolean isPunct = isPunctuation(firstCharacter);
        final boolean isDigit = Character.isDigit(firstCharacter);
        for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++) {
          int next = buffer.get(posAhead);
          if (next == -1) {
            break;
          }
          char ch = (char) next;
          int chType = Character.getType(ch);
          Character.UnicodeScript sc = Character.UnicodeScript.of(next);
          boolean sameScript =
              isSameScript(scriptCode, sc)
                  // Non-spacing marks inherit the script of their base character,
                  // following recommendations from UTR #24.
                  || chType == Character.NON_SPACING_MARK;

          if (sameScript
              // split on punctuation
              && isPunctuation(ch, chType) == isPunct
              // split on digit
              && Character.isDigit(ch) == isDigit
              && characterDefinition.isGroup(ch)) {
            unknownWordLength++;
          } else {
            break;
          }
          // Update the script code and character class if the original script
          // is Inherited or Common.
          if (isCommonOrInherited(scriptCode) && isCommonOrInherited(sc) == false) {
            scriptCode = sc;
            characterId = characterDefinition.getCharacterClass(ch);
          }
        }
      }

      unkDictionary.lookupWordIds(
          characterId, wordIdRef); // characters in input text are supposed to be the same
      if (VERBOSE) {
        System.out.println(
            " UNKNOWN word len=" + unknownWordLength + " " + wordIdRef.length + " wordIDs");
      }
      for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
        add(
            unkDictionary.getMorphAttributes(),
            posData,
            pos,
            pos + unknownWordLength,
            wordIdRef.ints[wordIdRef.offset + ofs],
            TokenType.UNKNOWN,
            false);
      }
    }
    // TODO: should return meaningful value?
    return 0;
  }

  /** Enables graphviz (dot) debug output of the lattice; pass null to disable. */
  void setGraphvizFormatter(GraphvizFormatter<KoMorphData> dotOut) {
    this.dotOut = dotOut;
  }

  /**
   * Walks back pointers from {@code endPosData} to the last backtrace point, emitting tokens into
   * {@code pending} (in reverse surface order), then frees consumed buffer/lattice positions.
   *
   * <p>Depending on tokenizer options this emits unknown-word unigrams, decompounded morphemes,
   * and/or whitespace tokens in addition to plain dictionary tokens.
   */
  @Override
  protected void backtrace(Position endPosData, int fromIDX) {
    final int endPos = endPosData.getPos();

    // Nothing new since the last backtrace:
    if (endPos == lastBackTracePos) {
      return;
    }

    if (VERBOSE) {
      System.out.println(
          "\n backtrace: endPos="
              + endPos
              + " pos="
              + pos
              + "; "
              + (pos - lastBackTracePos)
              + " characters; last="
              + lastBackTracePos
              + " cost="
              + endPosData.getCost(fromIDX));
    }

    // Surface characters covered by this backtrace fragment.
    final char[] fragment = buffer.get(lastBackTracePos, endPos - lastBackTracePos);

    if (dotOut != null) {
      dotOut.onBacktrace(
          this::getDict, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
    }

    int pos = endPos;
    int bestIDX = fromIDX;

    // TODO: sort of silly to make Token instances here; the
    // back trace has all info needed to generate the
    // token. So, we could just directly set the attrs,
    // from the backtrace, in incrementToken w/o ever
    // creating Token; we'd have to defer calling freeBefore
    // until after the backtrace was fully "consumed" by
    // incrementToken.

    while (pos > lastBackTracePos) {
      // System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX);
      final Position posData = positions.get(pos);
      assert bestIDX < posData.getCount();

      int backPos = posData.getBackPos(bestIDX);
      int backWordPos = posData.getBackWordPos(bestIDX);
      assert backPos >= lastBackTracePos
          : "backPos=" + backPos + " vs lastBackTracePos=" + lastBackTracePos;
      // the length of the word without the whitespaces at the beginning.
      int length = pos - backWordPos;
      TokenType backType = posData.getBackType(bestIDX);
      int backID = posData.getBackID(bestIDX);
      int nextBestIDX = posData.getBackIndex(bestIDX);
      // the start of the word after the whitespace at the beginning.
      final int fragmentOffset = backWordPos - lastBackTracePos;
      assert fragmentOffset >= 0;

      final Dictionary<? extends KoMorphData> dict = getDict(backType);

      if (outputUnknownUnigrams && backType == TokenType.UNKNOWN) {
        // outputUnknownUnigrams converts unknown word into unigrams:
        for (int i = length - 1; i >= 0; i--) {
          int charLen = 1;
          // Keep surrogate pairs together as a single 2-char "unigram".
          if (i > 0 && Character.isLowSurrogate(fragment[fragmentOffset + i])) {
            i--;
            charLen = 2;
          }
          final DictionaryToken token =
              new DictionaryToken(
                  TokenType.UNKNOWN,
                  unkDictionary.getMorphAttributes(),
                  CharacterDefinition.NGRAM,
                  fragment,
                  fragmentOffset + i,
                  charLen,
                  backWordPos + i,
                  backWordPos + i + charLen);
          pending.add(token);
          if (VERBOSE) {
            System.out.println(" add token=" + pending.get(pending.size() - 1));
          }
        }
      } else {
        final DictionaryToken token =
            new DictionaryToken(
                backType,
                dict.getMorphAttributes(),
                backID,
                fragment,
                fragmentOffset,
                length,
                backWordPos,
                backWordPos + length);
        if (token.getPOSType() == POS.Type.MORPHEME
            || mode == KoreanTokenizer.DecompoundMode.NONE) {
          // Single-morpheme token (or decompounding disabled): emit as-is unless filtered.
          if (shouldFilterToken(token) == false) {
            pending.add(token);
            if (VERBOSE) {
              System.out.println(" add token=" + pending.get(pending.size() - 1));
            }
          }
        } else {
          KoMorphData.Morpheme[] morphemes = token.getMorphemes();
          if (morphemes == null) {
            pending.add(token);
            if (VERBOSE) {
              System.out.println(" add token=" + pending.get(pending.size() - 1));
            }
          } else {
            int endOffset = backWordPos + length;
            int posLen = 0;
            // decompose the compound
            for (int i = morphemes.length - 1; i >= 0; i--) {
              final KoMorphData.Morpheme morpheme = morphemes[i];
              final Token compoundToken;
              if (token.getPOSType() == POS.Type.COMPOUND) {
                // COMPOUND morphemes tile the surface form back-to-front.
                assert endOffset - morpheme.surfaceForm.length() >= 0;
                compoundToken =
                    new DecompoundToken(
                        morpheme.posTag,
                        morpheme.surfaceForm,
                        endOffset - morpheme.surfaceForm.length(),
                        endOffset,
                        backType);
              } else {
                // Inflect/pre-analysis morphemes all share the original token's offsets.
                compoundToken =
                    new DecompoundToken(
                        morpheme.posTag,
                        morpheme.surfaceForm,
                        token.getStartOffset(),
                        token.getEndOffset(),
                        backType);
              }
              if (i == 0 && mode == KoreanTokenizer.DecompoundMode.MIXED) {
                compoundToken.setPositionIncrement(0);
              }
              ++posLen;
              endOffset -= morpheme.surfaceForm.length();
              pending.add(compoundToken);
              if (VERBOSE) {
                System.out.println(" add token=" + pending.get(pending.size() - 1));
              }
            }
            if (mode == KoreanTokenizer.DecompoundMode.MIXED) {
              // MIXED also emits the original compound spanning all its morphemes.
              token.setPositionLength(Math.max(1, posLen));
              pending.add(token);
              if (VERBOSE) {
                System.out.println(" add token=" + pending.get(pending.size() - 1));
              }
            }
          }
        }
      }
      if (discardPunctuation == false && backWordPos != backPos) {
        // Add a token for whitespaces between terms
        int offset = backPos - lastBackTracePos;
        int len = backWordPos - backPos;
        // System.out.println(offset + " " + fragmentOffset + " " + len + " " + backWordPos + " " +
        // backPos);
        unkDictionary.lookupWordIds(characterDefinition.getCharacterClass(' '), wordIdRef);
        DictionaryToken spaceToken =
            new DictionaryToken(
                TokenType.UNKNOWN,
                unkDictionary.getMorphAttributes(),
                wordIdRef.ints[wordIdRef.offset],
                fragment,
                offset,
                len,
                backPos,
                backPos + len);
        pending.add(spaceToken);
      }

      pos = backPos;
      bestIDX = nextBestIDX;
    }

    lastBackTracePos = endPos;

    if (VERBOSE) {
      System.out.println(" freeBefore pos=" + endPos);
    }
    // Notify the circular buffers that we are done with
    // these positions:
    buffer.freeBefore(endPos);
    positions.freeBefore(endPos);
  }

  /** Returns the space penalty associated with the provided {@link POS.Tag}. */
  @Override
  protected int computeSpacePenalty(MorphData morphData, int wordID, int numSpaces) {
    final POS.Tag leftPOS = ((KoMorphData) morphData).getLeftPOS(wordID);
    int spacePenalty = 0;
    if (numSpaces > 0) {
      // TODO we should extract the penalty (left-space-penalty-factor) from the dicrc file.
      switch (leftPOS) {
          // Particles/endings/suffixes should not follow a space; penalize heavily.
        case E:
        case J:
        case VCP:
        case XSA:
        case XSN:
        case XSV:
          spacePenalty = 3000;
          break;
          // All other POS tags: no penalty.
        case IC:
        case MAG:
        case MAJ:
        case MM:
        case NA:
        case NNB:
        case NNBC:
        case NNG:
        case NNP:
        case NP:
        case NR:
        case SC:
        case SE:
        case SF:
        case SH:
        case SL:
        case SN:
        case SP:
        case SSC:
        case SSO:
        case SY:
        case UNA:
        case UNKNOWN:
        case VA:
        case VCN:
        case VSV:
        case VV:
        case VX:
        case XPN:
        case XR:
        default:
          break;
      }
    }
    return spacePenalty;
  }

  /** Returns the dictionary that owns tokens of the given type. */
  Dictionary<? extends KoMorphData> getDict(TokenType type) {
    return dictionaryMap.get(type);
  }

  // True when punctuation tokens should be dropped and this token starts with punctuation.
  private boolean shouldFilterToken(Token token) {
    return discardPunctuation && isPunctuation(token.getSurfaceForm()[token.getOffset()]);
  }

  private static boolean isPunctuation(char ch) {
    return isPunctuation(ch, Character.getType(ch));
  }

  // Classifies a character as punctuation based on its Unicode general category.
  private static boolean isPunctuation(char ch, int cid) {
    // special case for Hangul Letter Araea (interpunct)
    if (ch == 0x318D) {
      return true;
    }
    switch (cid) {
      case Character.SPACE_SEPARATOR:
      case Character.LINE_SEPARATOR:
      case Character.PARAGRAPH_SEPARATOR:
      case Character.CONTROL:
      case Character.FORMAT:
      case Character.DASH_PUNCTUATION:
      case Character.START_PUNCTUATION:
      case Character.END_PUNCTUATION:
      case Character.CONNECTOR_PUNCTUATION:
      case Character.OTHER_PUNCTUATION:
      case Character.MATH_SYMBOL:
      case Character.CURRENCY_SYMBOL:
      case Character.MODIFIER_SYMBOL:
      case Character.OTHER_SYMBOL:
      case Character.INITIAL_QUOTE_PUNCTUATION:
      case Character.FINAL_QUOTE_PUNCTUATION:
        return true;
      default:
        return false;
    }
  }

  private static boolean isCommonOrInherited(Character.UnicodeScript script) {
    return script == Character.UnicodeScript.INHERITED || script == Character.UnicodeScript.COMMON;
  }

  /** Determine if two scripts are compatible. */
  private static boolean isSameScript(
      Character.UnicodeScript scriptOne, Character.UnicodeScript scriptTwo) {
    return scriptOne == scriptTwo
        || isCommonOrInherited(scriptOne)
        || isCommonOrInherited(scriptTwo);
  }
}
|
Loading…
Reference in New Issue