LUCENE-10493: factor out Viterbi algorithm and share it between kuromoji and nori (#805)

This commit is contained in:
Tomoko Uchida 2022-04-25 20:09:46 +09:00 committed by GitHub
parent 2a4c21bb58
commit c89f8a7ea1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 2640 additions and 2999 deletions

View File

@ -60,6 +60,8 @@ Other
All classes in `org.apache.lucene.analysis.[ja|ko].util` were moved to `org.apache.lucene.analysis.[ja|ko].dict`. All classes in `org.apache.lucene.analysis.[ja|ko].util` were moved to `org.apache.lucene.analysis.[ja|ko].dict`.
(Tomoko Uchida) (Tomoko Uchida)
* LUCENE-10493: Factor out Viterbi algorithm in Kuromoji and Nori to analysis-common. (Tomoko Uchida)
======================= Lucene 9.2.0 ======================= ======================= Lucene 9.2.0 =======================
API Changes API Changes

View File

@ -14,22 +14,16 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.apache.lucene.analysis.ko; package org.apache.lucene.analysis.morph;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import org.apache.lucene.analysis.ko.KoreanTokenizer.Position;
import org.apache.lucene.analysis.ko.KoreanTokenizer.WrappedPositionArray;
import org.apache.lucene.analysis.ko.dict.ConnectionCosts;
import org.apache.lucene.analysis.ko.dict.KoMorphData;
import org.apache.lucene.analysis.morph.Dictionary;
// TODO: would be nice to show 2nd best path in a diff't // TODO: would be nice to show 2nd best path in a diff't
// color... // color...
/** Outputs the dot (graphviz) string for the viterbi lattice. */ /** Outputs the dot (graphviz) string for the viterbi lattice. */
public class GraphvizFormatter { public class GraphvizFormatter<T extends MorphData> {
private static final String BOS_LABEL = "BOS"; private static final String BOS_LABEL = "BOS";
private static final String EOS_LABEL = "EOS"; private static final String EOS_LABEL = "EOS";
@ -56,36 +50,39 @@ public class GraphvizFormatter {
} }
// Backtraces another incremental fragment: // Backtraces another incremental fragment:
void onBacktrace( public void onBacktrace(
KoreanTokenizer tok, DictionaryProvider<T> dictProvider,
WrappedPositionArray positions, Viterbi.WrappedPositionArray<? extends Viterbi.Position> positions,
int lastBackTracePos, int lastBackTracePos,
Position endPosData, Viterbi.Position endPosData,
int fromIDX, int fromIDX,
char[] fragment, char[] fragment,
boolean isEnd) { boolean isEnd) {
setBestPathMap(positions, lastBackTracePos, endPosData, fromIDX); setBestPathMap(positions, lastBackTracePos, endPosData, fromIDX);
sb.append(formatNodes(tok, positions, lastBackTracePos, endPosData, fragment)); sb.append(formatNodes(dictProvider, positions, lastBackTracePos, endPosData, fragment));
if (isEnd) { if (isEnd) {
sb.append(" fini [style=invis]\n"); sb.append(" fini [style=invis]\n");
sb.append(" "); sb.append(" ");
sb.append(getNodeID(endPosData.pos, fromIDX)); sb.append(getNodeID(endPosData.getPos(), fromIDX));
sb.append(" -> fini [label=\"" + EOS_LABEL + "\"]"); sb.append(" -> fini [label=\"" + EOS_LABEL + "\"]");
} }
} }
// Records which arcs make up the best path: // Records which arcs make up the best path:
private void setBestPathMap( private void setBestPathMap(
WrappedPositionArray positions, int startPos, Position endPosData, int fromIDX) { Viterbi.WrappedPositionArray<? extends Viterbi.Position> positions,
int startPos,
Viterbi.Position endPosData,
int fromIDX) {
bestPathMap.clear(); bestPathMap.clear();
int pos = endPosData.pos; int pos = endPosData.getPos();
int bestIDX = fromIDX; int bestIDX = fromIDX;
while (pos > startPos) { while (pos > startPos) {
final Position posData = positions.get(pos); final Viterbi.Position posData = positions.get(pos);
final int backPos = posData.backPos[bestIDX]; final int backPos = posData.getBackPos(bestIDX);
final int backIDX = posData.backIndex[bestIDX]; final int backIDX = posData.getBackIndex(bestIDX);
final String toNodeID = getNodeID(pos, bestIDX); final String toNodeID = getNodeID(pos, bestIDX);
final String fromNodeID = getNodeID(backPos, backIDX); final String fromNodeID = getNodeID(backPos, backIDX);
@ -99,34 +96,34 @@ public class GraphvizFormatter {
} }
private String formatNodes( private String formatNodes(
KoreanTokenizer tok, DictionaryProvider<T> dictProvider,
WrappedPositionArray positions, Viterbi.WrappedPositionArray<? extends Viterbi.Position> positions,
int startPos, int startPos,
Position endPosData, Viterbi.Position endPosData,
char[] fragment) { char[] fragment) {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
// Output nodes // Output nodes
for (int pos = startPos + 1; pos <= endPosData.pos; pos++) { for (int pos = startPos + 1; pos <= endPosData.getPos(); pos++) {
final Position posData = positions.get(pos); final Viterbi.Position posData = positions.get(pos);
for (int idx = 0; idx < posData.count; idx++) { for (int idx = 0; idx < posData.getCount(); idx++) {
sb.append(" "); sb.append(" ");
sb.append(getNodeID(pos, idx)); sb.append(getNodeID(pos, idx));
sb.append(" [label=\""); sb.append(" [label=\"");
sb.append(pos); sb.append(pos);
sb.append(": "); sb.append(": ");
sb.append(posData.lastRightID[idx]); sb.append(posData.getLastRightID(idx));
sb.append("\"]\n"); sb.append("\"]\n");
} }
} }
// Output arcs // Output arcs
for (int pos = endPosData.pos; pos > startPos; pos--) { for (int pos = endPosData.getPos(); pos > startPos; pos--) {
final Position posData = positions.get(pos); final Viterbi.Position posData = positions.get(pos);
for (int idx = 0; idx < posData.count; idx++) { for (int idx = 0; idx < posData.getCount(); idx++) {
final Position backPosData = positions.get(posData.backPos[idx]); final Viterbi.Position backPosData = positions.get(posData.getBackPos(idx));
final String toNodeID = getNodeID(pos, idx); final String toNodeID = getNodeID(pos, idx);
final String fromNodeID = getNodeID(posData.backPos[idx], posData.backIndex[idx]); final String fromNodeID = getNodeID(posData.getBackPos(idx), posData.getBackIndex(idx));
sb.append(" "); sb.append(" ");
sb.append(fromNodeID); sb.append(fromNodeID);
@ -141,15 +138,15 @@ public class GraphvizFormatter {
attrs = ""; attrs = "";
} }
final Dictionary<? extends KoMorphData> dict = tok.getDict(posData.backType[idx]); final Dictionary<? extends T> dict = dictProvider.get(posData.getBackType(idx));
final int wordCost = dict.getWordCost(posData.backID[idx]); final int wordCost = dict.getWordCost(posData.getBackID(idx));
final int bgCost = final int bgCost =
costs.get( costs.get(
backPosData.lastRightID[posData.backIndex[idx]], backPosData.getLastRightID(posData.getBackIndex(idx)),
dict.getLeftId(posData.backID[idx])); dict.getLeftId(posData.getBackID(idx)));
final String surfaceForm = final String surfaceForm =
new String(fragment, posData.backPos[idx] - startPos, pos - posData.backPos[idx]); new String(fragment, posData.getBackPos(idx) - startPos, pos - posData.getBackPos(idx));
sb.append(" [label=\""); sb.append(" [label=\"");
sb.append(surfaceForm); sb.append(surfaceForm);
@ -190,4 +187,10 @@ public class GraphvizFormatter {
private String getNodeID(int pos, int idx) { private String getNodeID(int pos, int idx) {
return pos + "." + idx; return pos + "." + idx;
} }
/** {@link Dictionary} provider */
@FunctionalInterface
public interface DictionaryProvider<T extends MorphData> {
Dictionary<? extends T> get(TokenType type);
}
} }

View File

@ -0,0 +1,815 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.morph;
import java.io.IOException;
import java.io.Reader;
import java.lang.reflect.Array;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.analysis.util.RollingCharBuffer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.FST;
/**
* Performs <a href="https://en.wikipedia.org/wiki/Viterbi_algorithm">Viterbi algorithm</a> for
* morphological Tokenizers, which split texts by Hidden Markov Model or Conditional Random Fields.
*
* @param <T> output token class
* @param <U> position class
*/
public abstract class Viterbi<T extends Token, U extends Viterbi.Position> {
// When true, forward() and add() print a trace of the search to System.out.
protected static final boolean VERBOSE = false;
// For safety:
// NOTE(review): not referenced in this class; presumably a cap applied by subclasses when
// collecting unknown words -- confirm against the tokenizer implementations.
protected static final int MAX_UNKNOWN_WORD_LENGTH = 1024;
// forward() forces a backtrace once (pos - lastBackTracePos) reaches this value.
private static final int MAX_BACKTRACE_GAP = 1024;
// FST walked by forward() to find system dictionary (KNOWN) matches:
private final TokenInfoFST fst;
// System dictionary; supplies word ids and morphological attributes for KNOWN matches:
private final BinaryDictionary<? extends MorphData> dictionary;
// User dictionary; supplies morphological attributes for USER matches:
private final Dictionary<? extends MorphData> userDictionary;
// Connection costs, looked up as costs.get(previous right id, left id):
protected final ConnectionCosts costs;
// Scratch arc reused for both the user FST and the system FST traversals:
private final FST.Arc<Long> arc = new FST.Arc<>();
// Reader passed to fst.findTargetArc:
private final FST.BytesReader fstReader;
// Scratch buffer filled by dictionary.lookupWordIds:
protected final IntsRef wordIdRef = new IntsRef();
// Reader passed to userFST.findTargetArc:
private final FST.BytesReader userFSTReader;
// FST for user dictionary entries; forward() skips user matching when this is null:
private final TokenInfoFST userFST;
// Input characters, addressed by absolute position; re-targeted by resetBuffer(Reader):
protected final RollingCharBuffer buffer = new RollingCharBuffer();
// Lattice positions, addressed by absolute position:
protected final WrappedPositionArray<U> positions;
// True once we've hit the EOF from the input reader:
protected boolean end;
// Last absolute position we backtraced from:
protected int lastBackTracePos;
// Next absolute position to process:
protected int pos;
// Already parsed, but not yet passed to caller, tokens:
protected final List<T> pending = new ArrayList<>();
// When true, forward() also calls backtraceNBest() and fixupPendingList() around backtrace():
protected boolean outputNBest = false;
// When true, forward() steps over one SPACE_SEPARATOR character before matching a word:
protected boolean enableSpacePenaltyFactor = false;
// When true, only the longest user dictionary match is added; otherwise every match is added:
protected boolean outputLongestUserEntryOnly = false;
/**
 * Stores the dictionaries, FSTs and connection costs consulted by the search and allocates the
 * position lattice.
 *
 * @param fst FST used to match system dictionary words
 * @param fstReader reader used when walking {@code fst}
 * @param dictionary system dictionary
 * @param userFST FST used to match user dictionary words; user matching is skipped when null
 * @param userFSTReader reader used when walking {@code userFST}
 * @param userDictionary user dictionary
 * @param costs connection costs
 * @param positionImpl concrete position class held by the lattice
 */
protected Viterbi(
    TokenInfoFST fst,
    FST.BytesReader fstReader,
    BinaryDictionary<? extends MorphData> dictionary,
    TokenInfoFST userFST,
    FST.BytesReader userFSTReader,
    Dictionary<? extends MorphData> userDictionary,
    ConnectionCosts costs,
    Class<U> positionImpl) {
  // System dictionary side:
  this.dictionary = dictionary;
  this.fst = fst;
  this.fstReader = fstReader;

  // User dictionary side:
  this.userDictionary = userDictionary;
  this.userFST = userFST;
  this.userFSTReader = userFSTReader;

  this.costs = costs;
  this.positions = new WrappedPositionArray<>(positionImpl);
}
/**
 * Incrementally parse some more characters. This runs the viterbi search forwards "enough" so
 * that we generate some more tokens. How much forward depends on the chars coming in, since some
 * chars could cause longer-lasting ambiguity in the parsing. Once the ambiguity is resolved, then
 * we back trace, produce the pending tokens, and return.
 *
 * <p>The method returns as soon as a backtrace leaves at least one token in {@link #pending}.
 * Otherwise it keeps going until the buffer reports end of input; then {@link #end} is set and a
 * final backtrace is run from the last position.
 *
 * @throws IOException propagated from the character buffer or from the subclass callbacks
 */
public final void forward() throws IOException {
  if (VERBOSE) {
    System.out.println("\nPARSE");
  }

  // Index of the last character of unknown word:
  int unknownWordEndIndex = -1;

  // Maximum posAhead of user word in the entire input
  int userWordMaxPosAhead = -1;

  // Advances over each position (character):
  while (buffer.get(pos) != -1) {
    final Position posData = positions.get(pos);
    final boolean isFrontier = positions.getNextPos() == pos + 1;

    if (posData.count == 0) {
      // No arcs arrive here; move to next position:
      if (VERBOSE) {
        System.out.println(" no arcs in; skip pos=" + pos);
      }
      pos++;
      continue;
    }

    if (pos > lastBackTracePos && posData.count == 1 && isFrontier) {
      // We are at a "frontier", and only one node is
      // alive, so whatever the eventual best path is must
      // come through this node. So we can safely commit
      // to the prefix of the best path at this point:
      if (outputNBest) {
        backtraceNBest(posData, false);
      }
      backtrace(posData, 0);
      if (outputNBest) {
        fixupPendingList();
      }

      // Re-base cost so we don't risk int overflow:
      posData.costs[0] = 0;
      if (pending.size() > 0) {
        return;
      } else {
        // This means the backtrace only produced
        // punctuation tokens, so we must keep parsing.
      }
    }

    if (pos - lastBackTracePos >= MAX_BACKTRACE_GAP) {
      // Safety: if we've buffered too much, force a
      // backtrace now. We find the least-cost partial
      // path, across all paths, backtrace from it, and
      // then prune all others. Note that this, in
      // general, can produce the wrong result, if the
      // total best path did not in fact back trace
      // through this partial best path. But it's the
      // best we can do... (short of not having a
      // safety!).

      // First pass: find least cost partial path so far,
      // including ending at future positions:
      int leastIDX = -1;
      int leastCost = Integer.MAX_VALUE;
      Position leastPosData = null;
      for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
        final Position posData2 = positions.get(pos2);
        for (int idx = 0; idx < posData2.count; idx++) {
          final int cost = posData2.costs[idx];
          if (cost < leastCost) {
            leastCost = cost;
            leastIDX = idx;
            leastPosData = posData2;
          }
        }
      }

      // We will always have at least one live path:
      assert leastIDX != -1;

      if (outputNBest) {
        backtraceNBest(leastPosData, false);
      }

      // Second pass: prune all but the best path:
      for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
        final Position posData2 = positions.get(pos2);
        if (posData2 != leastPosData) {
          posData2.reset();
        } else {
          if (leastIDX != 0) {
            // Move the surviving arc into slot 0 so that it is the only one left:
            posData2.costs[0] = posData2.costs[leastIDX];
            posData2.lastRightID[0] = posData2.lastRightID[leastIDX];
            posData2.backPos[0] = posData2.backPos[leastIDX];
            posData2.backWordPos[0] = posData2.backWordPos[leastIDX];
            posData2.backIndex[0] = posData2.backIndex[leastIDX];
            posData2.backID[0] = posData2.backID[leastIDX];
            posData2.backType[0] = posData2.backType[leastIDX];
          }
          posData2.count = 1;
        }
      }

      backtrace(leastPosData, 0);
      if (outputNBest) {
        fixupPendingList();
      }

      // Re-base cost so we don't risk int overflow:
      Arrays.fill(leastPosData.costs, 0, leastPosData.count, 0);

      if (pos != leastPosData.pos) {
        // We jumped into a future position:
        assert pos < leastPosData.pos;
        pos = leastPosData.pos;
      }
      if (pending.size() > 0) {
        return;
      } else {
        // This means the backtrace only produced
        // punctuation tokens, so we must keep parsing.
        continue;
      }
    }

    if (VERBOSE) {
      System.out.println(
          "\n extend @ pos="
              + pos
              + " char="
              + (char) buffer.get(pos)
              + " hex="
              + Integer.toHexString(buffer.get(pos)));
    }

    if (VERBOSE) {
      System.out.println(" " + posData.count + " arcs in");
    }

    if (enableSpacePenaltyFactor
        && Character.getType(buffer.get(pos)) == Character.SPACE_SEPARATOR) {
      // We add single space separator as prefixes of the terms that we extract.
      // This information is needed to compute the space penalty factor of each term.
      // These whitespace prefixes are removed when the final tokens are generated, or
      // added as separated tokens when discardPunctuation is unset.
      if (buffer.get(++pos) == -1) {
        // The space was the last character of the input; step back onto it:
        pos = posData.pos;
      }
    }

    boolean anyMatches = false;

    // First try user dict:
    if (userFST != null) {
      userFST.getFirstArc(arc);
      int output = 0;
      int maxPosAhead = 0;
      int outputMaxPosAhead = 0;
      int arcFinalOutMaxPosAhead = 0;

      for (int posAhead = pos; ; posAhead++) {
        final int ch = buffer.get(posAhead);
        if (ch == -1) {
          break;
        }
        if (userFST.findTargetArc(ch, arc, arc, posAhead == pos, userFSTReader) == null) {
          break;
        }
        output += arc.output().intValue();
        if (arc.isFinal()) {
          maxPosAhead = posAhead;
          outputMaxPosAhead = output;
          arcFinalOutMaxPosAhead = arc.nextFinalOutput().intValue();
          anyMatches = true;
          if (!outputLongestUserEntryOnly) {
            // add all matched user entries.
            add(
                userDictionary.getMorphAttributes(),
                posData,
                pos,
                posAhead + 1,
                output + arc.nextFinalOutput().intValue(),
                TokenType.USER,
                false);
          }
        }
      }

      // Longest matching for user word
      if (anyMatches && maxPosAhead > userWordMaxPosAhead) {
        if (outputLongestUserEntryOnly) {
          if (VERBOSE) {
            // The second argument of buffer.get is a length, not an end position:
            System.out.println(
                " USER word "
                    + new String(buffer.get(pos, maxPosAhead - pos + 1))
                    + " toPos="
                    + (maxPosAhead + 1));
          }
          add(
              userDictionary.getMorphAttributes(),
              posData,
              pos,
              maxPosAhead + 1,
              outputMaxPosAhead + arcFinalOutMaxPosAhead,
              TokenType.USER,
              false);
        }
        userWordMaxPosAhead = Math.max(userWordMaxPosAhead, maxPosAhead);
      }
    }

    // TODO: we can be more aggressive about user
    // matches? if we are "under" a user match then don't
    // extend KNOWN/UNKNOWN paths?

    if (!anyMatches) {
      // Next, try known dictionary matches
      fst.getFirstArc(arc);
      int output = 0;

      for (int posAhead = pos; ; posAhead++) {
        final int ch = buffer.get(posAhead);
        if (ch == -1) {
          break;
        }

        if (fst.findTargetArc(ch, arc, arc, posAhead == pos, fstReader) == null) {
          break;
        }

        output += arc.output().intValue();

        // Optimization: for known words that are too-long
        // (compound), we should pre-compute the 2nd
        // best segmentation and store it in the
        // dictionary instead of recomputing it each time a
        // match is found.

        if (arc.isFinal()) {
          dictionary.lookupWordIds(output + arc.nextFinalOutput().intValue(), wordIdRef);
          if (VERBOSE) {
            System.out.println(
                " KNOWN word "
                    + new String(buffer.get(pos, posAhead - pos + 1))
                    + " toPos="
                    + (posAhead + 1)
                    + " "
                    + wordIdRef.length
                    + " wordIDs");
          }
          for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
            add(
                dictionary.getMorphAttributes(),
                posData,
                pos,
                posAhead + 1,
                wordIdRef.ints[wordIdRef.offset + ofs],
                TokenType.KNOWN,
                false);
            anyMatches = true;
          }
        }
      }
    }

    if (!shouldSkipProcessUnknownWord(unknownWordEndIndex, posData)) {
      int unknownWordLength = processUnknownWord(anyMatches, posData);
      unknownWordEndIndex = posData.pos + unknownWordLength;
    }

    pos++;
  }

  end = true;

  if (pos > 0) {
    final Position endPosData = positions.get(pos);
    int leastCost = Integer.MAX_VALUE;
    int leastIDX = -1;
    if (VERBOSE) {
      System.out.println(" end: " + endPosData.count + " nodes");
    }
    for (int idx = 0; idx < endPosData.count; idx++) {
      // Add EOS cost:
      final int cost = endPosData.costs[idx] + costs.get(endPosData.lastRightID[idx], 0);
      if (cost < leastCost) {
        leastCost = cost;
        leastIDX = idx;
      }
    }

    if (outputNBest) {
      backtraceNBest(endPosData, true);
    }
    backtrace(endPosData, leastIDX);
    if (outputNBest) {
      fixupPendingList();
    }
  } else {
    // No characters in the input string; return no tokens!
  }
}
/**
 * Decides whether {@link #forward()} should skip unknown word processing at a position.
 *
 * @param unknownWordEndIndex start position plus length of the most recently processed unknown
 *     word, or -1 when there has been none yet
 * @param posData the position being processed
 * @return {@code true} when the position lies before {@code unknownWordEndIndex}
 */
protected boolean shouldSkipProcessUnknownWord(int unknownWordEndIndex, Position posData) {
  final boolean beforeUnknownWordEnd = posData.pos < unknownWordEndIndex;
  return beforeUnknownWordEnd;
}
/**
* Add unknown words to the position graph.
*
* <p>Called by {@link #forward()} for every position at which {@link
* #shouldSkipProcessUnknownWord(int, Position)} returns false.
*
* @param anyMatches whether a user or system dictionary word was matched at this position
* @param posData the position being processed
* @return word length; {@link #forward()} adds it to the position to remember where the unknown
*     word ends
* @throws IOException declared so that implementations may propagate I/O failures
*/
protected abstract int processUnknownWord(boolean anyMatches, Position posData)
throws IOException;
/**
* Backtrace from the provided position, back to the last time we back-traced, accumulating the
* resulting tokens to the pending list. The pending list is then in-reverse (last token should be
* returned first).
*
* @param endPosData position to start the backtrace from
* @param fromIDX index of the arc, among those arriving at {@code endPosData}, to follow back
* @throws IOException declared so that implementations may propagate I/O failures
*/
protected abstract void backtrace(final Position endPosData, final int fromIDX)
throws IOException;
/**
 * Backtrace the n-best path. Subclasses that support n-best paths should implement this method.
 *
 * <p>{@link #forward()} calls this right before {@link #backtrace(Position, int)} whenever {@link
 * #outputNBest} is set, so a subclass that turns that flag on must override it.
 *
 * @param endPosData position to start the backtrace from
 * @param useEOS {@code true} only for the final backtrace at the end of the input
 * @throws UnsupportedOperationException always, in this default implementation
 */
protected void backtraceNBest(final Position endPosData, final boolean useEOS)
    throws IOException {
  // Name the concrete class so a missing override is easy to spot in a stack trace:
  throw new UnsupportedOperationException(
      getClass().getName() + " does not support n-best backtrace");
}
/**
 * Remove duplicated tokens from the pending list; this is needed because {@link
 * #backtrace(Position, int)} and {@link #backtraceNBest(Position, boolean)} can add same tokens
 * to the list. Subclasses that support n-best paths should implement this method.
 *
 * <p>{@link #forward()} calls this right after {@link #backtrace(Position, int)} whenever {@link
 * #outputNBest} is set.
 *
 * @throws UnsupportedOperationException always, in this default implementation
 */
protected void fixupPendingList() {
  // Name the concrete class so a missing override is easy to spot in a stack trace:
  throw new UnsupportedOperationException(
      getClass().getName() + " does not support fixing up the n-best pending list");
}
/**
 * Adds an arc for one word to the lattice. Among the paths arriving at {@code fromPosData}, the
 * one with the lowest cost (path cost + connection cost + space penalty) is chosen; the word cost
 * and, if requested, a region penalty are added on top, and the result is recorded at the
 * position {@code endPos}.
 *
 * @param morphData source of the word cost and the left/right ids of {@code wordID}
 * @param fromPosData position the arc leaves from
 * @param wordPos position at which the word itself starts
 * @param endPos position at which the arc arrives
 * @param wordID id of the word
 * @param type type recorded for the arc
 * @param addPenalty whether to add {@link #computePenalty(int, int)}; never applied to {@link
 *     TokenType#USER} arcs
 */
protected final void add(
    MorphData morphData,
    Position fromPosData,
    int wordPos,
    int endPos,
    int wordID,
    TokenType type,
    boolean addPenalty)
    throws IOException {
  final int wordCost = morphData.getWordCost(wordID);
  final int leftID = morphData.getLeftId(wordID);
  // The number of spaces before the term
  final int numSpaces = wordPos - fromPosData.pos;

  assert fromPosData.count > 0;

  int bestCost = Integer.MAX_VALUE;
  int bestIndex = -1;
  for (int fromIndex = 0; fromIndex < fromPosData.count; fromIndex++) {
    // Path cost so far, plus bigram cost and space penalty cost; the word cost is the same for
    // every candidate, so it is added once after the loop.
    final int prevCost = fromPosData.costs[fromIndex];
    final int bigramCost = costs.get(fromPosData.lastRightID[fromIndex], leftID);
    final int spacePenalty = computeSpacePenalty(morphData, wordID, numSpaces);
    final int cost = prevCost + bigramCost + spacePenalty;
    if (VERBOSE) {
      System.out.println(
          " fromIDX="
              + fromIndex
              + ": cost="
              + cost
              + " (prevCost="
              + prevCost
              + " wordCost="
              + wordCost
              + " bgCost="
              + bigramCost
              + " spacePenalty="
              + spacePenalty
              + ") leftID="
              + leftID
              + ")");
    }
    if (cost < bestCost) {
      bestCost = cost;
      bestIndex = fromIndex;
      if (VERBOSE) {
        System.out.println(" **");
      }
    }
  }

  bestCost += wordCost;

  if (VERBOSE) {
    System.out.println(
        " + cost="
            + bestCost
            + " wordID="
            + wordID
            + " leftID="
            + leftID
            + " leastIDX="
            + bestIndex
            + " toPos="
            + endPos
            + " toPos.idx="
            + positions.get(endPos).count);
  }

  if (addPenalty && type != TokenType.USER) {
    final int penalty = computePenalty(fromPosData.pos, endPos - fromPosData.pos);
    if (VERBOSE && penalty > 0) {
      System.out.println(" + penalty=" + penalty + " cost=" + (bestCost + penalty));
    }
    bestCost += penalty;
  }

  final Position target = positions.get(endPos);
  target.add(
      bestCost, morphData.getRightId(wordID), fromPosData.pos, wordPos, bestIndex, wordID, type);
}
/**
* Returns the space penalty that {@link #add} includes in the cost of a candidate arc. This
* default implementation applies no penalty.
*
* @param morphData morphological data the word belongs to
* @param wordID id of the word
* @param numSpaces difference between the word start and the position the arc leaves from
* @return 0
*/
protected int computeSpacePenalty(MorphData morphData, int wordID, int numSpaces) {
return 0;
}
/**
* Returns the penalty for a specific input region. {@link #add} adds it to the cost of non-user
* arcs when its {@code addPenalty} argument is true. This default implementation applies no
* penalty.
*
* @param pos start position of the region
* @param length length of the region
* @return 0
*/
protected int computePenalty(int pos, int length) throws IOException {
return 0;
}
/** Returns the next absolute position to process. */
public int getPos() {
return pos;
}
/** Returns true once {@link #forward()} has hit the end of the input. */
public boolean isEnd() {
return end;
}
/**
* Returns the tokens that were already parsed but not yet passed to the caller. This is the live
* internal list, not a copy.
*/
public List<T> getPending() {
return pending;
}
/** Returns whether {@link #forward()} also runs the n-best backtrace. */
public boolean isOutputNBest() {
return outputNBest;
}
/**
* Points the character buffer at a new input. The search state itself is not touched; see {@link
* #resetState()}.
*
* @param reader the input to read characters from
*/
public void resetBuffer(Reader reader) {
buffer.reset(reader);
}
/**
 * Puts the search back into its initial state: no pending tokens, position 0, end-of-input flag
 * cleared, and a lattice that holds only the BOS entry at position 0.
 */
public void resetState() {
  pending.clear();
  end = false;
  pos = 0;
  lastBackTracePos = 0;

  positions.reset();
  // Add BOS:
  final Position bos = positions.get(0);
  bos.add(0, 0, -1, -1, -1, -1, TokenType.KNOWN);
}
/**
 * Holds all back pointers arriving to this position.
 *
 * <p>Each arriving arc occupies one slot (0 to {@code count - 1}) across the parallel arrays
 * below.
 *
 * <p>NOTE: This and subclasses must have no-arg constructor. See {@link WrappedPositionArray}.
 */
public static class Position {

  // Absolute position in the input:
  int pos;

  // Number of slots in use:
  int count;

  // maybe single int array * 5?
  int[] costs = new int[8];
  int[] lastRightID = new int[8];
  int[] backPos = new int[8];
  int[] backWordPos = new int[8];
  int[] backIndex = new int[8];
  int[] backID = new int[8];
  TokenType[] backType = new TokenType[8];

  /** Enlarges every parallel array so that at least one more slot is available. */
  private void grow() {
    final int minSize = 1 + count;
    costs = ArrayUtil.grow(costs, minSize);
    lastRightID = ArrayUtil.grow(lastRightID, minSize);
    backPos = ArrayUtil.grow(backPos, minSize);
    backWordPos = ArrayUtil.grow(backWordPos, minSize);
    backIndex = ArrayUtil.grow(backIndex, minSize);
    backID = ArrayUtil.grow(backID, minSize);

    // NOTE: sneaky: sized from the int[]s we just grew, because ArrayUtil.grow would otherwise
    // pick a different length for an object array:
    backType = Arrays.copyOf(backType, backID.length);
  }

  /**
   * Appends one arriving arc.
   *
   * @param cost cost of the path ending with this arc
   * @param lastRightID right id of the arc's word
   * @param backPos position the arc leaves from
   * @param backRPos position at which the arc's word starts
   * @param backIndex slot, at {@code backPos}, of the arc this one continues
   * @param backID id of the arc's word
   * @param backType type of the arc's word
   */
  public void add(
      int cost,
      int lastRightID,
      int backPos,
      int backRPos,
      int backIndex,
      int backID,
      TokenType backType) {
    // NOTE: this isn't quite a true Viterbi search,
    // because we should check if lastRightID is
    // already present here, and only update if the new
    // cost is less than the current cost, instead of
    // simply appending. However, that will likely hurt
    // performance (usually we add a lastRightID only once),
    // and it means we actually create the full graph
    // intersection instead of a "normal" Viterbi lattice:
    if (count == costs.length) {
      grow();
    }
    final int slot = count;
    this.costs[slot] = cost;
    this.lastRightID[slot] = lastRightID;
    this.backPos[slot] = backPos;
    this.backWordPos[slot] = backRPos;
    this.backIndex[slot] = backIndex;
    this.backID[slot] = backID;
    this.backType[slot] = backType;
    count = slot + 1;
  }

  /** Forgets all arriving arcs; the arrays are kept for reuse. */
  public void reset() {
    count = 0;
  }

  /** Returns the absolute position in the input. */
  public int getPos() {
    return pos;
  }

  /** Returns the number of arriving arcs. */
  public int getCount() {
    return count;
  }

  /** Sets the number of arriving arcs. */
  public void setCount(int count) {
    this.count = count;
  }

  /** Returns the path cost stored in the given slot. */
  public int getCost(int index) {
    return costs[index];
  }

  /** Returns the position the arc in the given slot leaves from. */
  public int getBackPos(int index) {
    return backPos[index];
  }

  /** Returns the position at which the word of the arc in the given slot starts. */
  public int getBackWordPos(int index) {
    return backWordPos[index];
  }

  /** Returns the word id of the arc in the given slot. */
  public int getBackID(int index) {
    return backID[index];
  }

  /** Returns the slot, at the previous position, of the arc that the given slot continues. */
  public int getBackIndex(int index) {
    return backIndex[index];
  }

  /** Returns the type of the arc in the given slot. */
  public TokenType getBackType(int index) {
    return backType[index];
  }

  /** Returns the right id of the word of the arc in the given slot. */
  public int getLastRightID(int index) {
    return lastRightID[index];
  }
}
/**
 * Holds partial graph (array of positions) for calculating the minimum cost path.
 *
 * <p>The positions live in a circular array that is enlarged on demand. Instances are created
 * reflectively through the no-arg constructor of the position class and are reused after {@link
 * Position#reset()}.
 */
public static final class WrappedPositionArray<U extends Position> {
  private final Class<U> clazz;
  private U[] positions;

  // Next array index to write to in positions:
  private int nextWrite;

  // Next position to write:
  private int nextPos;

  // How many valid Position instances are held in the
  // positions array:
  private int count;

  @SuppressWarnings("unchecked")
  WrappedPositionArray(Class<U> clazz) {
    this.clazz = clazz;
    positions = (U[]) Array.newInstance(clazz, 8);
    populate(positions, 0);
  }

  /** Fills {@code array[from..]} with freshly created position instances. */
  private void populate(U[] array, int from) {
    for (int i = from; i < array.length; i++) {
      try {
        array[i] = clazz.getConstructor().newInstance();
      } catch (ReflectiveOperationException e) {
        // shouldn't happen; Position class should have no-arg constructor.
        throw new IllegalStateException(e);
      }
    }
  }

  /** Resets every held position and rewinds the array to position 0. */
  void reset() {
    nextWrite--;
    while (count > 0) {
      if (nextWrite == -1) {
        nextWrite = positions.length - 1;
      }
      final U held = positions[nextWrite--];
      held.reset();
      count--;
    }
    nextWrite = 0;
    nextPos = 0;
    count = 0;
  }

  /**
   * Get Position instance for this absolute position; this is allowed to be arbitrarily far "in
   * the future" but cannot be before the last freeBefore.
   */
  @SuppressWarnings("unchecked")
  public U get(int pos) {
    while (pos >= nextPos) {
      if (count == positions.length) {
        // Full: move to a larger array, unrolling the ring so the oldest entry lands at index 0.
        final int oldLength = positions.length;
        final int tailLength = oldLength - nextWrite;
        final U[] grown =
            (U[])
                Array.newInstance(
                    clazz, ArrayUtil.oversize(1 + count, RamUsageEstimator.NUM_BYTES_OBJECT_REF));
        System.arraycopy(positions, nextWrite, grown, 0, tailLength);
        System.arraycopy(positions, 0, grown, tailLength, nextWrite);
        populate(grown, oldLength);
        nextWrite = oldLength;
        positions = grown;
      }
      if (nextWrite == positions.length) {
        nextWrite = 0;
      }
      // Should have already been reset:
      assert positions[nextWrite].count == 0;
      positions[nextWrite++].pos = nextPos++;
      count++;
    }
    assert inBounds(pos);
    final int slot = getIndex(pos);
    assert positions[slot].pos == pos;
    return positions[slot];
  }

  /** Returns the first absolute position that has not been handed out yet. */
  int getNextPos() {
    return nextPos;
  }

  // For assert:
  private boolean inBounds(int pos) {
    return nextPos - count <= pos && pos < nextPos;
  }

  /** Maps an absolute position onto its index in the circular array. */
  private int getIndex(int pos) {
    final int slot = nextWrite - (nextPos - pos);
    return slot < 0 ? slot + positions.length : slot;
  }

  /** Resets, and thereby releases for reuse, every held position before {@code pos}. */
  public void freeBefore(int pos) {
    final int toFree = count - (nextPos - pos);
    assert toFree >= 0;
    assert toFree <= count;

    // Index of the oldest held position:
    int slot = nextWrite - count;
    if (slot < 0) {
      slot += positions.length;
    }
    for (int freed = 0; freed < toFree; freed++) {
      if (slot == positions.length) {
        slot = 0;
      }
      positions[slot].reset();
      slot++;
    }
    count -= toFree;
  }
}
}

View File

@ -1,195 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Position;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.WrappedPositionArray;
import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
import org.apache.lucene.analysis.ja.dict.JaMorphData;
import org.apache.lucene.analysis.morph.Dictionary;
// TODO: would be nice to show 2nd best path in a diff't
// color...
/** Outputs the dot (graphviz) string for the viterbi lattice. */
public class GraphvizFormatter {
// Label of the arc from the invisible "init" node to node 0.0:
private static final String BOS_LABEL = "BOS";
// Label of the arc from the last node to the invisible "fini" node:
private static final String EOS_LABEL = "EOS";
// Font used for nodes and edges:
private static final String FONT_NAME = "Helvetica";
// Connection costs shown on the arcs:
private final ConnectionCosts costs;
// Arcs on the best path of the current fragment, keyed by source node id:
private final Map<String, String> bestPathMap;
// Accumulates the dot output across onBacktrace() calls:
private final StringBuilder sb = new StringBuilder();
/**
 * Starts a new graph: writes the dot header, an invisible "init" node and the BOS arc leading to
 * node 0.0.
 *
 * @param costs connection costs shown on the arcs
 */
public GraphvizFormatter(ConnectionCosts costs) {
  this.costs = costs;
  this.bestPathMap = new HashMap<>();
  sb.append(formatHeader())
      .append(" init [style=invis]\n")
      .append(" init -> 0.0 [label=\"")
      .append(BOS_LABEL)
      .append("\"]\n");
}
/** Appends the dot trailer and returns everything written so far. */
public String finish() {
  return sb.append(formatTrailer()).toString();
}
// Backtraces another incremental fragment: records its best path, appends its nodes and arcs
// and, for the last fragment, the arc to the invisible "fini" node.
void onBacktrace(
    JapaneseTokenizer tok,
    WrappedPositionArray positions,
    int lastBackTracePos,
    Position endPosData,
    int fromIDX,
    char[] fragment,
    boolean isEnd) {
  setBestPathMap(positions, lastBackTracePos, endPosData, fromIDX);
  sb.append(formatNodes(tok, positions, lastBackTracePos, endPosData, fragment));
  if (!isEnd) {
    return;
  }
  sb.append(" fini [style=invis]\n")
      .append(" ")
      .append(getNodeID(endPosData.pos, fromIDX))
      .append(" -> fini [label=\"")
      .append(EOS_LABEL)
      .append("\"]");
}
// Records which arcs make up the best path, walking the back pointers from the end position
// down to startPos:
private void setBestPathMap(
    WrappedPositionArray positions, int startPos, Position endPosData, int fromIDX) {
  bestPathMap.clear();

  int curPos = endPosData.pos;
  int curIdx = fromIDX;
  while (curPos > startPos) {
    final Position node = positions.get(curPos);
    final int prevPos = node.backPos[curIdx];
    final int prevIdx = node.backIndex[curIdx];

    final String arcTarget = getNodeID(curPos, curIdx);
    final String arcSource = getNodeID(prevPos, prevIdx);

    assert !bestPathMap.containsKey(arcSource);
    assert !bestPathMap.containsValue(arcTarget);
    bestPathMap.put(arcSource, arcTarget);

    curPos = prevPos;
    curIdx = prevIdx;
  }
}
// Renders the nodes and arcs of one fragment (positions startPos+1 .. endPosData.pos) as dot
// statements. Arcs on the best path recorded by setBestPathMap are highlighted.
private String formatNodes(
    JapaneseTokenizer tok,
    WrappedPositionArray positions,
    int startPos,
    Position endPosData,
    char[] fragment) {
  final StringBuilder out = new StringBuilder();

  // Output nodes
  for (int nodePos = startPos + 1; nodePos <= endPosData.pos; nodePos++) {
    final Position node = positions.get(nodePos);
    for (int slot = 0; slot < node.count; slot++) {
      out.append(" ")
          .append(getNodeID(nodePos, slot))
          .append(" [label=\"")
          .append(nodePos)
          .append(": ")
          .append(node.lastRightID[slot])
          .append("\"]\n");
    }
  }

  // Output arcs
  for (int toPos = endPosData.pos; toPos > startPos; toPos--) {
    final Position toNode = positions.get(toPos);
    for (int slot = 0; slot < toNode.count; slot++) {
      final int fromPos = toNode.backPos[slot];
      final Position fromNode = positions.get(fromPos);
      final String toNodeID = getNodeID(toPos, slot);
      final String fromNodeID = getNodeID(fromPos, toNode.backIndex[slot]);

      out.append(" ").append(fromNodeID).append(" -> ").append(toNodeID);

      // An arc is on the best path when the map sends its source to its target:
      final String attrs =
          toNodeID.equals(bestPathMap.get(fromNodeID))
              ? " color=\"#40e050\" fontcolor=\"#40a050\" penwidth=3 fontsize=20"
              : "";

      final Dictionary<? extends JaMorphData> dict = tok.getDict(toNode.backType[slot]);
      final int wordCost = dict.getWordCost(toNode.backID[slot]);
      final int bgCost =
          costs.get(
              fromNode.lastRightID[toNode.backIndex[slot]], dict.getLeftId(toNode.backID[slot]));

      // The fragment starts at startPos, so shift absolute positions accordingly:
      final String surfaceForm = new String(fragment, fromPos - startPos, toPos - fromPos);

      out.append(" [label=\"").append(surfaceForm).append(' ').append(wordCost);
      if (bgCost >= 0) {
        out.append('+');
      }
      out.append(bgCost).append("\"").append(attrs).append("]\n");
    }
  }
  return out.toString();
}
/**
 * Builds the opening of the dot document: the {@code digraph} declaration followed by the
 * default graph, edge and node attributes. Edge and node fonts are taken from {@code FONT_NAME}.
 *
 * @return the header text, ending with a newline
 */
private String formatHeader() {
  // Optional page settings, kept for reference:
  // sb.append(" // A2 paper size\n");
  // sb.append(" size = \"34.4,16.5\";\n");
  // sb.append(" // try to fill paper\n");
  // sb.append(" ratio = fill;\n");
  return "digraph viterbi {\n"
      + " graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\"];\n"
      + " edge [ fontname=\""
      + FONT_NAME
      + "\" fontcolor=\"red\" color=\"#606060\" ]\n"
      + " node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\""
      + FONT_NAME
      + "\" ]\n";
}
/** Returns the text that closes the dot document: a single closing brace. */
private String formatTrailer() {
  final String closingBrace = "}";
  return closingBrace;
}
/**
 * Builds the dot node identifier for a lattice node.
 *
 * @param pos position of the node
 * @param idx index of the path at that position
 * @return {@code pos} and {@code idx} joined by a dot, e.g. {@code 3.1}
 */
private String getNodeID(int pos, int idx) {
  final StringBuilder id = new StringBuilder();
  id.append(pos);
  id.append(".");
  id.append(idx);
  return id.toString();
}
}

File diff suppressed because it is too large Load Diff

View File

@ -31,8 +31,10 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode; import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
import org.apache.lucene.analysis.ja.dict.ConnectionCosts; import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
import org.apache.lucene.analysis.ja.dict.JaMorphData;
import org.apache.lucene.analysis.ja.dict.UserDictionary; import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.apache.lucene.analysis.ja.tokenattributes.*; import org.apache.lucene.analysis.ja.tokenattributes.*;
import org.apache.lucene.analysis.morph.GraphvizFormatter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.tests.analysis.MockGraphTokenFilter; import org.apache.lucene.tests.analysis.MockGraphTokenFilter;
@ -518,7 +520,8 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
} }
public void testLatticeToDot() throws Exception { public void testLatticeToDot() throws Exception {
final GraphvizFormatter gv2 = new GraphvizFormatter(ConnectionCosts.getInstance()); final GraphvizFormatter<JaMorphData> gv2 =
new GraphvizFormatter<>(ConnectionCosts.getInstance());
final Analyzer analyzer = final Analyzer analyzer =
new Analyzer() { new Analyzer() {
@Override @Override

View File

@ -0,0 +1,447 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ko;
import java.io.IOException;
import java.util.EnumMap;
import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
import org.apache.lucene.analysis.ko.dict.KoMorphData;
import org.apache.lucene.analysis.ko.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.ko.dict.UnknownDictionary;
import org.apache.lucene.analysis.ko.dict.UserDictionary;
import org.apache.lucene.analysis.morph.ConnectionCosts;
import org.apache.lucene.analysis.morph.Dictionary;
import org.apache.lucene.analysis.morph.GraphvizFormatter;
import org.apache.lucene.analysis.morph.MorphData;
import org.apache.lucene.analysis.morph.TokenInfoFST;
import org.apache.lucene.analysis.morph.TokenType;
import org.apache.lucene.util.fst.FST;
/** {@link org.apache.lucene.analysis.morph.Viterbi} subclass for Korean morphological analysis. */
final class Viterbi
extends org.apache.lucene.analysis.morph.Viterbi<
Token, org.apache.lucene.analysis.morph.Viterbi.Position> {
private final EnumMap<TokenType, Dictionary<? extends KoMorphData>> dictionaryMap =
new EnumMap<>(TokenType.class);
private final UnknownDictionary unkDictionary;
private final CharacterDefinition characterDefinition;
private final boolean discardPunctuation;
private final KoreanTokenizer.DecompoundMode mode;
private final boolean outputUnknownUnigrams;
private GraphvizFormatter<KoMorphData> dotOut;
/**
 * Creates the Korean Viterbi decoder.
 *
 * <p>The FSTs, dictionaries and connection costs are handed to the superclass constructor; the
 * remaining arguments are kept locally. The known, unknown and user dictionaries are also
 * registered in {@code dictionaryMap} under their {@link TokenType}. The flags
 * {@code enableSpacePenaltyFactor} and {@code outputLongestUserEntryOnly} (not declared in this
 * class; presumably inherited from the base Viterbi) are both switched on.
 */
Viterbi(
    TokenInfoFST fst,
    FST.BytesReader fstReader,
    TokenInfoDictionary dictionary,
    TokenInfoFST userFST,
    FST.BytesReader userFSTReader,
    UserDictionary userDictionary,
    ConnectionCosts costs,
    UnknownDictionary unkDictionary,
    CharacterDefinition characterDefinition,
    boolean discardPunctuation,
    KoreanTokenizer.DecompoundMode mode,
    boolean outputUnknownUnigrams) {
  super(
      fst, fstReader, dictionary, userFST, userFSTReader, userDictionary, costs, Position.class);

  // Tokenizer options.
  this.mode = mode;
  this.discardPunctuation = discardPunctuation;
  this.outputUnknownUnigrams = outputUnknownUnigrams;

  // Resources for unknown-word handling.
  this.characterDefinition = characterDefinition;
  this.unkDictionary = unkDictionary;

  // Korean-specific behaviour switches.
  this.outputLongestUserEntryOnly = true;
  this.enableSpacePenaltyFactor = true;

  // Dictionary lookup by token type.
  dictionaryMap.put(TokenType.USER, userDictionary);
  dictionaryMap.put(TokenType.UNKNOWN, unkDictionary);
  dictionaryMap.put(TokenType.KNOWN, dictionary);
}
/**
 * Adds unknown-word candidates starting at the current position.
 *
 * <p>Nothing is added when the dictionaries already matched ({@code anyMatches}) and the
 * character definition does not request unknown-word processing for the first character.
 * Otherwise the candidate is one character long, or, for characters whose class is grouped,
 * extends over following characters that share a compatible script and the same
 * punctuation/digit status, up to {@code MAX_UNKNOWN_WORD_LENGTH}.
 *
 * @param anyMatches whether a dictionary match was already found at this position
 * @param posData lattice position the candidates are added to
 * @return always 0
 */
@Override
protected int processUnknownWord(boolean anyMatches, Position posData) throws IOException {
  final char firstCharacter = (char) buffer.get(pos);
  if (anyMatches && !characterDefinition.isInvoke(firstCharacter)) {
    // TODO: should return meaningful value?
    return 0;
  }

  // Find unknown match:
  int characterId = characterDefinition.getCharacterClass(firstCharacter);

  // NOTE: copied from UnknownDictionary.lookup:
  int unknownWordLength = 1;
  if (characterDefinition.isGroup(firstCharacter)) {
    // Extract unknown word. Characters with the same script are considered to be part of
    // unknown word
    Character.UnicodeScript scriptCode = Character.UnicodeScript.of(firstCharacter);
    final boolean isPunct = isPunctuation(firstCharacter);
    final boolean isDigit = Character.isDigit(firstCharacter);

    int posAhead = pos + 1;
    while (unknownWordLength < MAX_UNKNOWN_WORD_LENGTH) {
      final int next = buffer.get(posAhead);
      if (next == -1) {
        break;
      }
      final char ch = (char) next;
      final int chType = Character.getType(ch);
      final Character.UnicodeScript sc = Character.UnicodeScript.of(next);

      // Non-spacing marks inherit the script of their base character,
      // following recommendations from UTR #24.
      final boolean sameScript =
          isSameScript(scriptCode, sc) || chType == Character.NON_SPACING_MARK;

      // The word continues only while script, punctuation status, digit status and grouping
      // all stay compatible.
      final boolean extendsWord =
          sameScript
              && isPunctuation(ch, chType) == isPunct
              && Character.isDigit(ch) == isDigit
              && characterDefinition.isGroup(ch);
      if (!extendsWord) {
        break;
      }
      unknownWordLength++;

      // Update the script code and character class if the original script
      // is Inherited or Common.
      if (isCommonOrInherited(scriptCode) && isCommonOrInherited(sc) == false) {
        scriptCode = sc;
        characterId = characterDefinition.getCharacterClass(ch);
      }
      posAhead++;
    }
  }

  // characters in input text are supposed to be the same
  unkDictionary.lookupWordIds(characterId, wordIdRef);
  if (VERBOSE) {
    System.out.println(
        " UNKNOWN word len=" + unknownWordLength + " " + wordIdRef.length + " wordIDs");
  }
  for (int i = 0; i < wordIdRef.length; i++) {
    add(
        unkDictionary.getMorphAttributes(),
        posData,
        pos,
        pos + unknownWordLength,
        wordIdRef.ints[wordIdRef.offset + i],
        TokenType.UNKNOWN,
        false);
  }

  // TODO: should return meaningful value?
  return 0;
}
/**
 * Sets the formatter that is notified on every backtrace; {@code backtrace} skips the
 * notification while this is {@code null}.
 *
 * @param dotOut formatter to notify, or {@code null} for none
 */
void setGraphvizFormatter(GraphvizFormatter<KoMorphData> dotOut) {
this.dotOut = dotOut;
}
/**
 * Follows the best path backwards from {@code endPosData}/{@code fromIDX} down to
 * {@code lastBackTracePos} and appends the resulting tokens to {@code pending}.
 *
 * <p>Because the path is walked from its end, tokens are added last-to-first. For each step:
 *
 * <ul>
 *   <li>with {@code outputUnknownUnigrams}, an unknown word is emitted as one token per
 *       character (two chars when a low surrogate is found);
 *   <li>a token whose POS type is MORPHEME, or any token when the mode is NONE, is added as is
 *       unless {@code shouldFilterToken} rejects it;
 *   <li>otherwise the token is split into its morphemes (if it has any); in MIXED mode the
 *       original token is added as well;
 *   <li>when punctuation is kept and whitespace precedes the word, a token covering that
 *       whitespace is added.
 * </ul>
 *
 * <p>Finally {@code lastBackTracePos} is moved to the end position, and the character buffer and
 * the lattice positions before it are released.
 *
 * @param endPosData lattice position at which the backtrace starts
 * @param fromIDX index of the path to follow within {@code endPosData}
 */
@Override
protected void backtrace(Position endPosData, int fromIDX) {
final int endPos = endPosData.getPos();
// Nothing new to emit since the previous backtrace.
if (endPos == lastBackTracePos) {
return;
}
if (VERBOSE) {
System.out.println(
"\n backtrace: endPos="
+ endPos
+ " pos="
+ pos
+ "; "
+ (pos - lastBackTracePos)
+ " characters; last="
+ lastBackTracePos
+ " cost="
+ endPosData.getCost(fromIDX));
}
// Characters of this fragment; offsets into it are relative to lastBackTracePos
// (presumably buffer.get takes a start position and a length -- confirm against the buffer API).
final char[] fragment = buffer.get(lastBackTracePos, endPos - lastBackTracePos);
// Let the optional graphviz formatter record this fragment.
if (dotOut != null) {
dotOut.onBacktrace(
this::getDict, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
}
int pos = endPos;
int bestIDX = fromIDX;
// TODO: sort of silly to make Token instances here; the
// back trace has all info needed to generate the
// token. So, we could just directly set the attrs,
// from the backtrace, in incrementToken w/o ever
// creating Token; we'd have to defer calling freeBefore
// until after the backtrace was fully "consumed" by
// incrementToken.
while (pos > lastBackTracePos) {
// System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX);
final Position posData = positions.get(pos);
assert bestIDX < posData.getCount();
int backPos = posData.getBackPos(bestIDX);
int backWordPos = posData.getBackWordPos(bestIDX);
assert backPos >= lastBackTracePos
: "backPos=" + backPos + " vs lastBackTracePos=" + lastBackTracePos;
// the length of the word without the whitespaces at the beginning.
int length = pos - backWordPos;
TokenType backType = posData.getBackType(bestIDX);
int backID = posData.getBackID(bestIDX);
int nextBestIDX = posData.getBackIndex(bestIDX);
// the start of the word after the whitespace at the beginning.
final int fragmentOffset = backWordPos - lastBackTracePos;
assert fragmentOffset >= 0;
final Dictionary<? extends KoMorphData> dict = getDict(backType);
if (outputUnknownUnigrams && backType == TokenType.UNKNOWN) {
// outputUnknownUnigrams converts unknown word into unigrams:
// Walk the word from its last char to its first so tokens are added in reverse order.
for (int i = length - 1; i >= 0; i--) {
int charLen = 1;
// A low surrogate that is not the first char is emitted together with the char before it.
if (i > 0 && Character.isLowSurrogate(fragment[fragmentOffset + i])) {
i--;
charLen = 2;
}
final DictionaryToken token =
new DictionaryToken(
TokenType.UNKNOWN,
unkDictionary.getMorphAttributes(),
CharacterDefinition.NGRAM,
fragment,
fragmentOffset + i,
charLen,
backWordPos + i,
backWordPos + i + charLen);
pending.add(token);
if (VERBOSE) {
System.out.println(" add token=" + pending.get(pending.size() - 1));
}
}
} else {
final DictionaryToken token =
new DictionaryToken(
backType,
dict.getMorphAttributes(),
backID,
fragment,
fragmentOffset,
length,
backWordPos,
backWordPos + length);
// Plain morphemes, or any token when decompounding is off, are emitted unchanged.
if (token.getPOSType() == POS.Type.MORPHEME
|| mode == KoreanTokenizer.DecompoundMode.NONE) {
if (shouldFilterToken(token) == false) {
pending.add(token);
if (VERBOSE) {
System.out.println(" add token=" + pending.get(pending.size() - 1));
}
}
} else {
KoMorphData.Morpheme[] morphemes = token.getMorphemes();
// No decomposition available: keep the token as is.
if (morphemes == null) {
pending.add(token);
if (VERBOSE) {
System.out.println(" add token=" + pending.get(pending.size() - 1));
}
} else {
int endOffset = backWordPos + length;
int posLen = 0;
// decompose the compound
for (int i = morphemes.length - 1; i >= 0; i--) {
final KoMorphData.Morpheme morpheme = morphemes[i];
final Token compoundToken;
// For COMPOUND, each part gets offsets derived from its surface form length;
// for other types every part spans the offsets of the whole token.
if (token.getPOSType() == POS.Type.COMPOUND) {
assert endOffset - morpheme.surfaceForm.length() >= 0;
compoundToken =
new DecompoundToken(
morpheme.posTag,
morpheme.surfaceForm,
endOffset - morpheme.surfaceForm.length(),
endOffset,
backType);
} else {
compoundToken =
new DecompoundToken(
morpheme.posTag,
morpheme.surfaceForm,
token.getStartOffset(),
token.getEndOffset(),
backType);
}
// In MIXED mode the first part gets a position increment of 0; the original token is
// added right after this loop.
if (i == 0 && mode == KoreanTokenizer.DecompoundMode.MIXED) {
compoundToken.setPositionIncrement(0);
}
++posLen;
endOffset -= morpheme.surfaceForm.length();
pending.add(compoundToken);
if (VERBOSE) {
System.out.println(" add token=" + pending.get(pending.size() - 1));
}
}
// MIXED keeps the original token too, spanning as many positions as it has parts.
if (mode == KoreanTokenizer.DecompoundMode.MIXED) {
token.setPositionLength(Math.max(1, posLen));
pending.add(token);
if (VERBOSE) {
System.out.println(" add token=" + pending.get(pending.size() - 1));
}
}
}
}
}
if (discardPunctuation == false && backWordPos != backPos) {
// Add a token for whitespaces between terms
int offset = backPos - lastBackTracePos;
int len = backWordPos - backPos;
// System.out.println(offset + " " + fragmentOffset + " " + len + " " + backWordPos + " " +
// backPos);
// The whitespace token uses the first word id returned for the character class of ' '.
unkDictionary.lookupWordIds(characterDefinition.getCharacterClass(' '), wordIdRef);
DictionaryToken spaceToken =
new DictionaryToken(
TokenType.UNKNOWN,
unkDictionary.getMorphAttributes(),
wordIdRef.ints[wordIdRef.offset],
fragment,
offset,
len,
backPos,
backPos + len);
pending.add(spaceToken);
}
// Continue with the previous arc on the best path.
pos = backPos;
bestIDX = nextBestIDX;
}
lastBackTracePos = endPos;
if (VERBOSE) {
System.out.println(" freeBefore pos=" + endPos);
}
// Notify the circular buffers that we are done with
// these positions:
buffer.freeBefore(endPos);
positions.freeBefore(endPos);
}
/**
 * Returns the space penalty for a word, chosen by the word's left {@link POS.Tag}.
 *
 * <p>The result is 3000 when {@code numSpaces} is positive and the left POS is one of
 * {@code E}, {@code J}, {@code VCP}, {@code XSA}, {@code XSN} or {@code XSV}; in every other
 * case it is 0.
 *
 * @param morphData morphological data of the word; must be a {@link KoMorphData}
 * @param wordID id of the word within {@code morphData}
 * @param numSpaces number of spaces; no penalty applies unless this is positive
 * @return the penalty
 */
@Override
protected int computeSpacePenalty(MorphData morphData, int wordID, int numSpaces) {
  // Looked up before the numSpaces check, as in the original statement order.
  final POS.Tag leftPOS = ((KoMorphData) morphData).getLeftPOS(wordID);
  if (numSpaces <= 0) {
    return 0;
  }
  // TODO we should extract the penalty (left-space-penalty-factor) from the dicrc file.
  switch (leftPOS) {
    case E:
    case J:
    case VCP:
    case XSA:
    case XSN:
    case XSV:
      return 3000;
    case IC:
    case MAG:
    case MAJ:
    case MM:
    case NA:
    case NNB:
    case NNBC:
    case NNG:
    case NNP:
    case NP:
    case NR:
    case SC:
    case SE:
    case SF:
    case SH:
    case SL:
    case SN:
    case SP:
    case SSC:
    case SSO:
    case SY:
    case UNA:
    case UNKNOWN:
    case VA:
    case VCN:
    case VSV:
    case VV:
    case VX:
    case XPN:
    case XR:
    default:
      return 0;
  }
}
/**
 * Looks up the dictionary registered for a token type.
 *
 * @param type token type
 * @return the dictionary stored in {@code dictionaryMap} for {@code type}; {@code null} when the
 *     map holds no value for it
 */
Dictionary<? extends KoMorphData> getDict(TokenType type) {
  final Dictionary<? extends KoMorphData> dictionary = dictionaryMap.get(type);
  return dictionary;
}
/**
 * Tells whether a token must be dropped: only when punctuation is being discarded and the
 * character at the token's offset in its surface form is punctuation.
 */
private boolean shouldFilterToken(Token token) {
  if (!discardPunctuation) {
    return false;
  }
  final char firstChar = token.getSurfaceForm()[token.getOffset()];
  return isPunctuation(firstChar);
}
/** Tells whether {@code ch} is punctuation, using its general category from {@link Character#getType(char)}. */
private static boolean isPunctuation(char ch) {
  final int category = Character.getType(ch);
  return isPunctuation(ch, category);
}
/**
 * Tells whether a character counts as punctuation.
 *
 * <p>U+318D is always punctuation. Otherwise the decision depends on the general category: the
 * separator, control, format, punctuation and symbol categories listed below count as
 * punctuation; everything else does not.
 *
 * @param ch the character
 * @param cid general category of {@code ch}, as one of the {@link Character} category constants
 */
private static boolean isPunctuation(char ch, int cid) {
  // special case for Hangul Letter Araea (interpunct)
  if (ch == 0x318D) {
    return true;
  }

  final boolean punctuationCategory;
  switch (cid) {
    case Character.SPACE_SEPARATOR:
    case Character.LINE_SEPARATOR:
    case Character.PARAGRAPH_SEPARATOR:
    case Character.CONTROL:
    case Character.FORMAT:
    case Character.DASH_PUNCTUATION:
    case Character.START_PUNCTUATION:
    case Character.END_PUNCTUATION:
    case Character.CONNECTOR_PUNCTUATION:
    case Character.OTHER_PUNCTUATION:
    case Character.MATH_SYMBOL:
    case Character.CURRENCY_SYMBOL:
    case Character.MODIFIER_SYMBOL:
    case Character.OTHER_SYMBOL:
    case Character.INITIAL_QUOTE_PUNCTUATION:
    case Character.FINAL_QUOTE_PUNCTUATION:
      punctuationCategory = true;
      break;
    default:
      punctuationCategory = false;
      break;
  }
  return punctuationCategory;
}
/** Tells whether {@code script} is {@code COMMON} or {@code INHERITED}. */
private static boolean isCommonOrInherited(Character.UnicodeScript script) {
  if (script == Character.UnicodeScript.COMMON) {
    return true;
  }
  return script == Character.UnicodeScript.INHERITED;
}
/**
 * Determines whether two scripts are compatible: they are when they are identical, or when at
 * least one of them is {@code COMMON} or {@code INHERITED}.
 */
private static boolean isSameScript(
    Character.UnicodeScript scriptOne, Character.UnicodeScript scriptTwo) {
  if (scriptOne == scriptTwo) {
    return true;
  }
  if (isCommonOrInherited(scriptOne)) {
    return true;
  }
  return isCommonOrInherited(scriptTwo);
}
}