LUCENE-10493: factor out Viterbi algorithm and share it between kuromoji and nori (#805)

2022-04-25 20:09:46 +09:00 · 2022-04-25 20:09:46 +09:00 · c89f8a7ea1
parent 2a4c21bb58
commit c89f8a7ea1
9 changed files with 2640 additions and 2999 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -60,6 +60,8 @@ Other
  All classes in `org.apache.lucene.analysis.[ja|ko].util` was moved to `org.apache.lucene.analysis.[ja|ko].dict`.
  (Tomoko Uchida)

+* LUCENE-10493: Factor out Viterbi algorithm in Kuromoji and Nori to analysis-common. (Tomoko Uchida)
+
 ======================= Lucene 9.2.0 =======================

 API Changes
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/GraphvizFormatter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/GraphvizFormatter.java
@ -14,22 +14,16 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-package org.apache.lucene.analysis.ko;
+package org.apache.lucene.analysis.morph;

 import java.util.HashMap;
 import java.util.Map;
-import org.apache.lucene.analysis.ko.KoreanTokenizer.Position;
-import org.apache.lucene.analysis.ko.KoreanTokenizer.WrappedPositionArray;
-import org.apache.lucene.analysis.ko.dict.ConnectionCosts;
-import org.apache.lucene.analysis.ko.dict.KoMorphData;
-import org.apache.lucene.analysis.morph.Dictionary;

 // TODO: would be nice to show 2nd best path in a diff't
 // color...

 /** Outputs the dot (graphviz) string for the viterbi lattice. */
-public class GraphvizFormatter {
-
+public class GraphvizFormatter<T extends MorphData> {
  private static final String BOS_LABEL = "BOS";

  private static final String EOS_LABEL = "EOS";
@ -56,36 +50,39 @@ public class GraphvizFormatter {
  }

  // Backtraces another incremental fragment:
-  void onBacktrace(
-      KoreanTokenizer tok,
-      WrappedPositionArray positions,
+  public void onBacktrace(
+      DictionaryProvider<T> dictProvider,
+      Viterbi.WrappedPositionArray<? extends Viterbi.Position> positions,
      int lastBackTracePos,
-      Position endPosData,
+      Viterbi.Position endPosData,
      int fromIDX,
      char[] fragment,
      boolean isEnd) {
    setBestPathMap(positions, lastBackTracePos, endPosData, fromIDX);
-    sb.append(formatNodes(tok, positions, lastBackTracePos, endPosData, fragment));
+    sb.append(formatNodes(dictProvider, positions, lastBackTracePos, endPosData, fragment));
    if (isEnd) {
      sb.append("  fini [style=invis]\n");
      sb.append("  ");
-      sb.append(getNodeID(endPosData.pos, fromIDX));
+      sb.append(getNodeID(endPosData.getPos(), fromIDX));
      sb.append(" -> fini [label=\"" + EOS_LABEL + "\"]");
    }
  }

  // Records which arcs make up the best bath:
  private void setBestPathMap(
-      WrappedPositionArray positions, int startPos, Position endPosData, int fromIDX) {
+      Viterbi.WrappedPositionArray<? extends Viterbi.Position> positions,
+      int startPos,
+      Viterbi.Position endPosData,
+      int fromIDX) {
    bestPathMap.clear();

-    int pos = endPosData.pos;
+    int pos = endPosData.getPos();
    int bestIDX = fromIDX;
    while (pos > startPos) {
-      final Position posData = positions.get(pos);
+      final Viterbi.Position posData = positions.get(pos);

-      final int backPos = posData.backPos[bestIDX];
-      final int backIDX = posData.backIndex[bestIDX];
+      final int backPos = posData.getBackPos(bestIDX);
+      final int backIDX = posData.getBackIndex(bestIDX);

      final String toNodeID = getNodeID(pos, bestIDX);
      final String fromNodeID = getNodeID(backPos, backIDX);
@ -99,34 +96,34 @@ public class GraphvizFormatter {
  }

  private String formatNodes(
-      KoreanTokenizer tok,
-      WrappedPositionArray positions,
+      DictionaryProvider<T> dictProvider,
+      Viterbi.WrappedPositionArray<? extends Viterbi.Position> positions,
      int startPos,
-      Position endPosData,
+      Viterbi.Position endPosData,
      char[] fragment) {

    StringBuilder sb = new StringBuilder();
    // Output nodes
-    for (int pos = startPos + 1; pos <= endPosData.pos; pos++) {
-      final Position posData = positions.get(pos);
-      for (int idx = 0; idx < posData.count; idx++) {
+    for (int pos = startPos + 1; pos <= endPosData.getPos(); pos++) {
+      final Viterbi.Position posData = positions.get(pos);
+      for (int idx = 0; idx < posData.getCount(); idx++) {
        sb.append("  ");
        sb.append(getNodeID(pos, idx));
        sb.append(" [label=\"");
        sb.append(pos);
        sb.append(": ");
-        sb.append(posData.lastRightID[idx]);
+        sb.append(posData.getLastRightID(idx));
        sb.append("\"]\n");
      }
    }

    // Output arcs
-    for (int pos = endPosData.pos; pos > startPos; pos--) {
-      final Position posData = positions.get(pos);
-      for (int idx = 0; idx < posData.count; idx++) {
-        final Position backPosData = positions.get(posData.backPos[idx]);
+    for (int pos = endPosData.getPos(); pos > startPos; pos--) {
+      final Viterbi.Position posData = positions.get(pos);
+      for (int idx = 0; idx < posData.getCount(); idx++) {
+        final Viterbi.Position backPosData = positions.get(posData.getBackPos(idx));
        final String toNodeID = getNodeID(pos, idx);
-        final String fromNodeID = getNodeID(posData.backPos[idx], posData.backIndex[idx]);
+        final String fromNodeID = getNodeID(posData.getBackPos(idx), posData.getBackIndex(idx));

        sb.append("  ");
        sb.append(fromNodeID);
@ -141,15 +138,15 @@ public class GraphvizFormatter {
          attrs = "";
        }

-        final Dictionary<? extends KoMorphData> dict = tok.getDict(posData.backType[idx]);
-        final int wordCost = dict.getWordCost(posData.backID[idx]);
+        final Dictionary<? extends T> dict = dictProvider.get(posData.getBackType(idx));
+        final int wordCost = dict.getWordCost(posData.getBackID(idx));
        final int bgCost =
            costs.get(
-                backPosData.lastRightID[posData.backIndex[idx]],
-                dict.getLeftId(posData.backID[idx]));
+                backPosData.getLastRightID(posData.getBackIndex(idx)),
+                dict.getLeftId(posData.getBackID(idx)));

        final String surfaceForm =
-            new String(fragment, posData.backPos[idx] - startPos, pos - posData.backPos[idx]);
+            new String(fragment, posData.getBackPos(idx) - startPos, pos - posData.getBackPos(idx));

        sb.append(" [label=\"");
        sb.append(surfaceForm);
@ -190,4 +187,10 @@ public class GraphvizFormatter {
  private String getNodeID(int pos, int idx) {
    return pos + "." + idx;
  }
+
+  /** {@link Dictionary} provider */
+  @FunctionalInterface
+  public interface DictionaryProvider<T extends MorphData> {
+    Dictionary<? extends T> get(TokenType type);
+  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/Viterbi.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/Viterbi.java
@ -0,0 +1,815 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.morph;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.lang.reflect.Array;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import org.apache.lucene.analysis.util.RollingCharBuffer;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.fst.FST;
+
+/**
+ * Performs <a href="https://en.wikipedia.org/wiki/Viterbi_algorithm">Viterbi algorithm</a> for
+ * morphological Tokenizers, which split texts by Hidden Markov Model or Conditional Random Fields.
+ *
+ * @param <T> output token class
+ * @param <U> position class
+ */
+public abstract class Viterbi<T extends Token, U extends Viterbi.Position> {
+  protected static final boolean VERBOSE = false;
+
+  // For safety:
+  protected static final int MAX_UNKNOWN_WORD_LENGTH = 1024;
+  private static final int MAX_BACKTRACE_GAP = 1024;
+
+  private final TokenInfoFST fst;
+  private final BinaryDictionary<? extends MorphData> dictionary;
+  private final Dictionary<? extends MorphData> userDictionary;
+  protected final ConnectionCosts costs;
+
+  private final FST.Arc<Long> arc = new FST.Arc<>();
+  private final FST.BytesReader fstReader;
+  protected final IntsRef wordIdRef = new IntsRef();
+
+  private final FST.BytesReader userFSTReader;
+  private final TokenInfoFST userFST;
+
+  protected final RollingCharBuffer buffer = new RollingCharBuffer();
+
+  protected final WrappedPositionArray<U> positions;
+
+  // True once we've hit the EOF from the input reader:
+  protected boolean end;
+
+  // Last absolute position we backtraced from:
+  protected int lastBackTracePos;
+
+  // Next absolute position to process:
+  protected int pos;
+
+  // Already parsed, but not yet passed to caller, tokens:
+  protected final List<T> pending = new ArrayList<>();
+
+  protected boolean outputNBest = false;
+
+  protected boolean enableSpacePenaltyFactor = false;
+
+  protected boolean outputLongestUserEntryOnly = false;
+
+  protected Viterbi(
+      TokenInfoFST fst,
+      FST.BytesReader fstReader,
+      BinaryDictionary<? extends MorphData> dictionary,
+      TokenInfoFST userFST,
+      FST.BytesReader userFSTReader,
+      Dictionary<? extends MorphData> userDictionary,
+      ConnectionCosts costs,
+      Class<U> positionImpl) {
+    this.fst = fst;
+    this.fstReader = fstReader;
+    this.dictionary = dictionary;
+    this.userFST = userFST;
+    this.userFSTReader = userFSTReader;
+    this.userDictionary = userDictionary;
+    this.costs = costs;
+    this.positions = new WrappedPositionArray<>(positionImpl);
+  }
+
+  /**
+   * Incrementally parse some more characters. This runs the viterbi search forwards "enough" so
+   * that we generate some more tokens. How much forward depends on the chars coming in, since some
+   * chars could cause longer-lasting ambiguity in the parsing. Once the ambiguity is resolved, then
+   * we back trace, produce the pending tokens, and return.
+   */
+  public final void forward() throws IOException {
+    if (VERBOSE) {
+      System.out.println("\nPARSE");
+    }
+
+    // Index of the last character of unknown word:
+    int unknownWordEndIndex = -1;
+
+    // Maximum posAhead of user word in the entire input
+    int userWordMaxPosAhead = -1;
+
+    // Advances over each position (character):
+    while (buffer.get(pos) != -1) {
+      final Position posData = positions.get(pos);
+      final boolean isFrontier = positions.getNextPos() == pos + 1;
+
+      if (posData.count == 0) {
+        // No arcs arrive here; move to next position:
+        if (VERBOSE) {
+          System.out.println("    no arcs in; skip pos=" + pos);
+        }
+        pos++;
+        continue;
+      }
+
+      if (pos > lastBackTracePos && posData.count == 1 && isFrontier) {
+        // We are at a "frontier", and only one node is
+        // alive, so whatever the eventual best path is must
+        // come through this node.  So we can safely commit
+        // to the prefix of the best path at this point:
+        if (outputNBest) {
+          backtraceNBest(posData, false);
+        }
+        backtrace(posData, 0);
+        if (outputNBest) {
+          fixupPendingList();
+        }
+
+        // Re-base cost so we don't risk int overflow:
+        posData.costs[0] = 0;
+        if (pending.size() > 0) {
+          return;
+        } else {
+          // This means the backtrace only produced
+          // punctuation tokens, so we must keep parsing.
+        }
+      }
+
+      if (pos - lastBackTracePos >= MAX_BACKTRACE_GAP) {
+        // Safety: if we've buffered too much, force a
+        // backtrace now.  We find the least-cost partial
+        // path, across all paths, backtrace from it, and
+        // then prune all others.  Note that this, in
+        // general, can produce the wrong result, if the
+        // total best path did not in fact back trace
+        // through this partial best path.  But it's the
+        // best we can do... (short of not having a
+        // safety!).
+
+        // First pass: find least cost partial path so far,
+        // including ending at future positions:
+        int leastIDX = -1;
+        int leastCost = Integer.MAX_VALUE;
+        Position leastPosData = null;
+        for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
+          final Position posData2 = positions.get(pos2);
+          for (int idx = 0; idx < posData2.count; idx++) {
+            // System.out.println("    idx=" + idx + " cost=" + cost);
+            final int cost = posData2.costs[idx];
+            if (cost < leastCost) {
+              leastCost = cost;
+              leastIDX = idx;
+              leastPosData = posData2;
+            }
+          }
+        }
+
+        // We will always have at least one live path:
+        assert leastIDX != -1;
+
+        if (outputNBest) {
+          backtraceNBest(leastPosData, false);
+        }
+
+        // Second pass: prune all but the best path:
+        for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
+          final Position posData2 = positions.get(pos2);
+          if (posData2 != leastPosData) {
+            posData2.reset();
+          } else {
+            if (leastIDX != 0) {
+              posData2.costs[0] = posData2.costs[leastIDX];
+              posData2.lastRightID[0] = posData2.lastRightID[leastIDX];
+              posData2.backPos[0] = posData2.backPos[leastIDX];
+              posData2.backWordPos[0] = posData2.backWordPos[leastIDX];
+              posData2.backIndex[0] = posData2.backIndex[leastIDX];
+              posData2.backID[0] = posData2.backID[leastIDX];
+              posData2.backType[0] = posData2.backType[leastIDX];
+            }
+            posData2.count = 1;
+          }
+        }
+
+        backtrace(leastPosData, 0);
+        if (outputNBest) {
+          fixupPendingList();
+        }
+
+        // Re-base cost so we don't risk int overflow:
+        Arrays.fill(leastPosData.costs, 0, leastPosData.count, 0);
+
+        if (pos != leastPosData.pos) {
+          // We jumped into a future position:
+          assert pos < leastPosData.pos;
+          pos = leastPosData.pos;
+        }
+        if (pending.size() > 0) {
+          return;
+        } else {
+          // This means the backtrace only produced
+          // punctuation tokens, so we must keep parsing.
+          continue;
+        }
+      }
+
+      if (VERBOSE) {
+        System.out.println(
+            "\n  extend @ pos="
+                + pos
+                + " char="
+                + (char) buffer.get(pos)
+                + " hex="
+                + Integer.toHexString(buffer.get(pos)));
+      }
+
+      if (VERBOSE) {
+        System.out.println("    " + posData.count + " arcs in");
+      }
+
+      if (enableSpacePenaltyFactor
+          && Character.getType(buffer.get(pos)) == Character.SPACE_SEPARATOR) {
+        // We add a single space separator as a prefix of the terms that we extract.
+        // This information is needed to compute the space penalty factor of each term.
+        // These whitespace prefixes are removed when the final tokens are generated, or
+        // added as separate tokens when discardPunctuation is unset.
+        if (buffer.get(++pos) == -1) {
+          pos = posData.pos;
+        }
+      }
+
+      boolean anyMatches = false;
+
+      // First try user dict:
+      if (userFST != null) {
+        userFST.getFirstArc(arc);
+        int output = 0;
+        int maxPosAhead = 0;
+        int outputMaxPosAhead = 0;
+        int arcFinalOutMaxPosAhead = 0;
+
+        for (int posAhead = pos; ; posAhead++) {
+          final int ch = buffer.get(posAhead);
+          if (ch == -1) {
+            break;
+          }
+          if (userFST.findTargetArc(ch, arc, arc, posAhead == pos, userFSTReader) == null) {
+            break;
+          }
+          output += arc.output().intValue();
+          if (arc.isFinal()) {
+            maxPosAhead = posAhead;
+            outputMaxPosAhead = output;
+            arcFinalOutMaxPosAhead = arc.nextFinalOutput().intValue();
+            anyMatches = true;
+            if (!outputLongestUserEntryOnly) {
+              // add all matched user entries.
+              add(
+                  userDictionary.getMorphAttributes(),
+                  posData,
+                  pos,
+                  posAhead + 1,
+                  output + arc.nextFinalOutput().intValue(),
+                  TokenType.USER,
+                  false);
+            }
+          }
+        }
+
+        // Longest matching for user word
+        if (anyMatches && maxPosAhead > userWordMaxPosAhead) {
+          if (outputLongestUserEntryOnly) {
+            if (VERBOSE) {
+              System.out.println(
+                  "    USER word "
+                      + new String(buffer.get(pos, maxPosAhead + 1))
+                      + " toPos="
+                      + (maxPosAhead + 1));
+            }
+            add(
+                userDictionary.getMorphAttributes(),
+                posData,
+                pos,
+                maxPosAhead + 1,
+                outputMaxPosAhead + arcFinalOutMaxPosAhead,
+                TokenType.USER,
+                false);
+          }
+          userWordMaxPosAhead = Math.max(userWordMaxPosAhead, maxPosAhead);
+        }
+      }
+
+      // TODO: we can be more aggressive about user
+      // matches?  if we are "under" a user match then don't
+      // extend KNOWN/UNKNOWN paths?
+
+      if (!anyMatches) {
+        // Next, try known dictionary matches
+        fst.getFirstArc(arc);
+        int output = 0;
+
+        for (int posAhead = pos; ; posAhead++) {
+          final int ch = buffer.get(posAhead);
+          if (ch == -1) {
+            break;
+          }
+          // System.out.println("    match " + (char) ch + " posAhead=" + posAhead);
+
+          if (fst.findTargetArc(ch, arc, arc, posAhead == pos, fstReader) == null) {
+            break;
+          }
+
+          output += arc.output().intValue();
+
+          // Optimization: for known words that are too-long
+          // (compound), we should pre-compute the 2nd
+          // best segmentation and store it in the
+          // dictionary instead of recomputing it each time a
+          // match is found.
+
+          if (arc.isFinal()) {
+            dictionary.lookupWordIds(output + arc.nextFinalOutput().intValue(), wordIdRef);
+            if (VERBOSE) {
+              System.out.println(
+                  "    KNOWN word "
+                      + new String(buffer.get(pos, posAhead - pos + 1))
+                      + " toPos="
+                      + (posAhead + 1)
+                      + " "
+                      + wordIdRef.length
+                      + " wordIDs");
+            }
+            for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
+              add(
+                  dictionary.getMorphAttributes(),
+                  posData,
+                  pos,
+                  posAhead + 1,
+                  wordIdRef.ints[wordIdRef.offset + ofs],
+                  TokenType.KNOWN,
+                  false);
+              anyMatches = true;
+            }
+          }
+        }
+      }
+
+      if (!shouldSkipProcessUnknownWord(unknownWordEndIndex, posData)) {
+        int unknownWordLength = processUnknownWord(anyMatches, posData);
+        unknownWordEndIndex = posData.pos + unknownWordLength;
+      }
+      pos++;
+    }
+
+    end = true;
+
+    if (pos > 0) {
+
+      final Position endPosData = positions.get(pos);
+      int leastCost = Integer.MAX_VALUE;
+      int leastIDX = -1;
+      if (VERBOSE) {
+        System.out.println("  end: " + endPosData.count + " nodes");
+      }
+      for (int idx = 0; idx < endPosData.count; idx++) {
+        // Add EOS cost:
+        final int cost = endPosData.costs[idx] + costs.get(endPosData.lastRightID[idx], 0);
+        // System.out.println("    idx=" + idx + " cost=" + cost + " (pathCost=" +
+        // endPosData.costs[idx] + " bgCost=" + costs.get(endPosData.lastRightID[idx], 0) + ")
+        // backPos=" + endPosData.backPos[idx]);
+        if (cost < leastCost) {
+          leastCost = cost;
+          leastIDX = idx;
+        }
+      }
+
+      if (outputNBest) {
+        backtraceNBest(endPosData, true);
+      }
+      backtrace(endPosData, leastIDX);
+      if (outputNBest) {
+        fixupPendingList();
+      }
+    } else {
+      // No characters in the input string; return no tokens!
+    }
+  }
+
+  protected boolean shouldSkipProcessUnknownWord(int unknownWordEndIndex, Position posData) {
+    return unknownWordEndIndex > posData.pos;
+  }
+
+  /**
+   * Add unknown words to the position graph.
+   *
+   * @return word length
+   */
+  protected abstract int processUnknownWord(boolean anyMatches, Position posData)
+      throws IOException;
+
+  /**
+   * Backtrace from the provided position, back to the last time we back-traced, accumulating the
+   * resulting tokens to the pending list. The pending list is then in-reverse (last token should be
+   * returned first).
+   */
+  protected abstract void backtrace(final Position endPosData, final int fromIDX)
+      throws IOException;
+
+  /**
+   * Backtrace the n-best path. Subclasses that support n-best paths should implement this method.
+   */
+  protected void backtraceNBest(final Position endPosData, final boolean useEOS)
+      throws IOException {
+    throw new UnsupportedOperationException();
+  }
+
+  /**
+   * Remove duplicated tokens from the pending list; this is needed because {@link
+   * #backtrace(Position, int)} and {@link #backtraceNBest(Position, boolean)} can add the same
+   * tokens to the list. Subclasses that support n-best paths should implement this method.
+   */
+  protected void fixupPendingList() {
+    throw new UnsupportedOperationException();
+  }
+
+  /** Add a word arc to the position at endPos, linked to the least-cost path into fromPosData. */
+  protected final void add(
+      MorphData morphData,
+      Position fromPosData,
+      int wordPos,
+      int endPos,
+      int wordID,
+      TokenType type,
+      boolean addPenalty)
+      throws IOException {
+    final int wordCost = morphData.getWordCost(wordID);
+    final int leftID = morphData.getLeftId(wordID);
+    int leastCost = Integer.MAX_VALUE;
+    int leastIDX = -1;
+    assert fromPosData.count > 0;
+    for (int idx = 0; idx < fromPosData.count; idx++) {
+      // The number of spaces before the term
+      int numSpaces = wordPos - fromPosData.pos;
+
+      // Cost is path cost so far, plus word cost (added at
+      // end of loop), plus bigram cost and space penalty cost.
+      final int cost =
+          fromPosData.costs[idx]
+              + costs.get(fromPosData.lastRightID[idx], leftID)
+              + computeSpacePenalty(morphData, wordID, numSpaces);
+      if (VERBOSE) {
+        System.out.println(
+            "      fromIDX="
+                + idx
+                + ": cost="
+                + cost
+                + " (prevCost="
+                + fromPosData.costs[idx]
+                + " wordCost="
+                + wordCost
+                + " bgCost="
+                + costs.get(fromPosData.lastRightID[idx], leftID)
+                + " spacePenalty="
+                + computeSpacePenalty(morphData, wordID, numSpaces)
+                + ") leftID="
+                + leftID
+                // + " leftPOS="
+                // + leftPOS.name()
+                + ")");
+      }
+      if (cost < leastCost) {
+        leastCost = cost;
+        leastIDX = idx;
+        if (VERBOSE) {
+          System.out.println("        **");
+        }
+      }
+    }
+
+    leastCost += wordCost;
+
+    if (VERBOSE) {
+      System.out.println(
+          "      + cost="
+              + leastCost
+              + " wordID="
+              + wordID
+              + " leftID="
+              + leftID
+              + " leastIDX="
+              + leastIDX
+              + " toPos="
+              + endPos
+              + " toPos.idx="
+              + positions.get(endPos).count);
+    }
+
+    if (addPenalty && type != TokenType.USER) {
+      final int penalty = computePenalty(fromPosData.pos, endPos - fromPosData.pos);
+      if (VERBOSE) {
+        if (penalty > 0) {
+          System.out.println("        + penalty=" + penalty + " cost=" + (leastCost + penalty));
+        }
+      }
+      leastCost += penalty;
+    }
+
+    positions
+        .get(endPos)
+        .add(
+            leastCost,
+            morphData.getRightId(wordID),
+            fromPosData.pos,
+            wordPos,
+            leastIDX,
+            wordID,
+            type);
+  }
+
+  /** Returns the space penalty. */
+  protected int computeSpacePenalty(MorphData morphData, int wordID, int numSpaces) {
+    return 0;
+  }
+
+  /** Returns the penalty for a specific input region */
+  protected int computePenalty(int pos, int length) throws IOException {
+    return 0;
+  }
+
+  public int getPos() {
+    return pos;
+  }
+
+  public boolean isEnd() {
+    return end;
+  }
+
+  public List<T> getPending() {
+    return pending;
+  }
+
+  public boolean isOutputNBest() {
+    return outputNBest;
+  }
+
+  public void resetBuffer(Reader reader) {
+    buffer.reset(reader);
+  }
+
+  public void resetState() {
+    positions.reset();
+    pos = 0;
+    end = false;
+    lastBackTracePos = 0;
+    pending.clear();
+
+    // Add BOS:
+    positions.get(0).add(0, 0, -1, -1, -1, -1, TokenType.KNOWN);
+  }
+
+  /**
+   * Holds all back pointers arriving to this position.
+   *
+   * <p>NOTE: Subclasses must keep a public no-arg constructor. See {@link WrappedPositionArray}.
+   */
+  public static class Position {
+
+    int pos;
+
+    int count;
+
+    // maybe single int array * 5?
+    int[] costs = new int[8];
+    int[] lastRightID = new int[8];
+    int[] backPos = new int[8];
+    int[] backWordPos = new int[8];
+    int[] backIndex = new int[8];
+    int[] backID = new int[8];
+    TokenType[] backType = new TokenType[8];
+
+    private void grow() {
+      costs = ArrayUtil.grow(costs, 1 + count);
+      lastRightID = ArrayUtil.grow(lastRightID, 1 + count);
+      backPos = ArrayUtil.grow(backPos, 1 + count);
+      backWordPos = ArrayUtil.grow(backWordPos, 1 + count);
+      backIndex = ArrayUtil.grow(backIndex, 1 + count);
+      backID = ArrayUtil.grow(backID, 1 + count);
+
+      // NOTE: sneaky: grow separately because
+      // ArrayUtil.grow will otherwise pick a different
+      // length than the int[]s we just grew:
+      final TokenType[] newBackType = new TokenType[backID.length];
+      System.arraycopy(backType, 0, newBackType, 0, backType.length);
+      backType = newBackType;
+    }
+
+    public void add(
+        int cost,
+        int lastRightID,
+        int backPos,
+        int backRPos,
+        int backIndex,
+        int backID,
+        TokenType backType) {
+      // NOTE: this isn't quite a true Viterbi search,
+      // because we should check if lastRightID is
+      // already present here, and only update if the new
+      // cost is less than the current cost, instead of
+      // simply appending.  However, that will likely hurt
+      // performance (usually we add a lastRightID only once),
+      // and it means we actually create the full graph
+      // intersection instead of a "normal" Viterbi lattice:
+      if (count == costs.length) {
+        grow();
+      }
+      this.costs[count] = cost;
+      this.lastRightID[count] = lastRightID;
+      this.backPos[count] = backPos;
+      this.backWordPos[count] = backRPos;
+      this.backIndex[count] = backIndex;
+      this.backID[count] = backID;
+      this.backType[count] = backType;
+      count++;
+    }
+
+    public void reset() {
+      count = 0;
+    }
+
+    public int getPos() {
+      return pos;
+    }
+
+    public int getCount() {
+      return count;
+    }
+
+    public void setCount(int count) {
+      this.count = count;
+    }
+
+    public int getCost(int index) {
+      return costs[index];
+    }
+
+    public int getBackPos(int index) {
+      return backPos[index];
+    }
+
+    public int getBackWordPos(int index) {
+      return backWordPos[index];
+    }
+
+    public int getBackID(int index) {
+      return backID[index];
+    }
+
+    public int getBackIndex(int index) {
+      return backIndex[index];
+    }
+
+    public TokenType getBackType(int index) {
+      return backType[index];
+    }
+
+    public int getLastRightID(int index) {
+      return lastRightID[index];
+    }
+  }
+
+  /** Holds partial graph (array of positions) for calculating the minimum cost path */
+  public static final class WrappedPositionArray<U extends Position> {
+    private U[] positions;
+    private final Class<U> clazz;
+
+    @SuppressWarnings("unchecked")
+    WrappedPositionArray(Class<U> clazz) {
+      this.clazz = clazz;
+      positions = (U[]) Array.newInstance(clazz, 8);
+      for (int i = 0; i < positions.length; i++) {
+        try {
+          positions[i] = clazz.getConstructor().newInstance();
+        } catch (ReflectiveOperationException e) {
+          // shouldn't happen; Position class should have no-arg constructor.
+          throw new IllegalStateException(e);
+        }
+      }
+    }
+
+    // Next array index to write to in positions:
+    private int nextWrite;
+
+    // Next position to write:
+    private int nextPos;
+
+    // How many valid Position instances are held in the
+    // positions array:
+    private int count;
+
+    void reset() {
+      nextWrite--;
+      while (count > 0) {
+        if (nextWrite == -1) {
+          nextWrite = positions.length - 1;
+        }
+        positions[nextWrite--].reset();
+        count--;
+      }
+      nextWrite = 0;
+      nextPos = 0;
+      count = 0;
+    }
+
+    /**
+     * Get Position instance for this absolute position; this is allowed to be arbitrarily far "in
+     * the future" but cannot be before the last freeBefore.
+     */
+    @SuppressWarnings("unchecked")
+    public U get(int pos) {
+      while (pos >= nextPos) {
+        // System.out.println("count=" + count + " vs len=" + positions.length);
+        if (count == positions.length) {
+          // Position[] newPositions =
+          //    new Position[ArrayUtil.oversize(1 + count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+          U[] newPositions =
+              (U[])
+                  Array.newInstance(
+                      clazz, ArrayUtil.oversize(1 + count, RamUsageEstimator.NUM_BYTES_OBJECT_REF));
+          // System.out.println("grow positions " + newPositions.length);
+          System.arraycopy(positions, nextWrite, newPositions, 0, positions.length - nextWrite);
+          System.arraycopy(positions, 0, newPositions, positions.length - nextWrite, nextWrite);
+          for (int i = positions.length; i < newPositions.length; i++) {
+            try {
+              newPositions[i] = clazz.getConstructor().newInstance();
+            } catch (ReflectiveOperationException e) {
+              // shouldn't happen
+              throw new IllegalStateException(e);
+            }
+          }
+          nextWrite = positions.length;
+          positions = newPositions;
+        }
+        if (nextWrite == positions.length) {
+          nextWrite = 0;
+        }
+        // Should have already been reset:
+        assert positions[nextWrite].count == 0;
+        positions[nextWrite++].pos = nextPos++;
+        count++;
+      }
+      assert inBounds(pos);
+      final int index = getIndex(pos);
+      assert positions[index].pos == pos;
+      return positions[index];
+    }
+
+    int getNextPos() {
+      return nextPos;
+    }
+
+    // For assert:
+    private boolean inBounds(int pos) {
+      return pos < nextPos && pos >= nextPos - count;
+    }
+
+    private int getIndex(int pos) {
+      int index = nextWrite - (nextPos - pos);
+      if (index < 0) {
+        index += positions.length;
+      }
+      return index;
+    }
+
+    public void freeBefore(int pos) {
+      final int toFree = count - (nextPos - pos);
+      assert toFree >= 0;
+      assert toFree <= count;
+      int index = nextWrite - count;
+      if (index < 0) {
+        index += positions.length;
+      }
+      for (int i = 0; i < toFree; i++) {
+        if (index == positions.length) {
+          index = 0;
+        }
+        // System.out.println("  fb idx=" + index);
+        positions[index].reset();
+        index++;
+      }
+      count -= toFree;
+    }
+  }
+}
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/GraphvizFormatter.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/GraphvizFormatter.java
@ -1,195 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.ja;
-
-import java.util.HashMap;
-import java.util.Map;
-import org.apache.lucene.analysis.ja.JapaneseTokenizer.Position;
-import org.apache.lucene.analysis.ja.JapaneseTokenizer.WrappedPositionArray;
-import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
-import org.apache.lucene.analysis.ja.dict.JaMorphData;
-import org.apache.lucene.analysis.morph.Dictionary;
-
-// TODO: would be nice to show 2nd best path in a diff't
-// color...
-
-/** Outputs the dot (graphviz) string for the viterbi lattice. */
-public class GraphvizFormatter {
-
-  private static final String BOS_LABEL = "BOS";
-
-  private static final String EOS_LABEL = "EOS";
-
-  private static final String FONT_NAME = "Helvetica";
-
-  private final ConnectionCosts costs;
-
-  private final Map<String, String> bestPathMap;
-
-  private final StringBuilder sb = new StringBuilder();
-
-  public GraphvizFormatter(ConnectionCosts costs) {
-    this.costs = costs;
-    this.bestPathMap = new HashMap<>();
-    sb.append(formatHeader());
-    sb.append("  init [style=invis]\n");
-    sb.append("  init -> 0.0 [label=\"" + BOS_LABEL + "\"]\n");
-  }
-
-  public String finish() {
-    sb.append(formatTrailer());
-    return sb.toString();
-  }
-
-  // Backtraces another incremental fragment:
-  void onBacktrace(
-      JapaneseTokenizer tok,
-      WrappedPositionArray positions,
-      int lastBackTracePos,
-      Position endPosData,
-      int fromIDX,
-      char[] fragment,
-      boolean isEnd) {
-    setBestPathMap(positions, lastBackTracePos, endPosData, fromIDX);
-    sb.append(formatNodes(tok, positions, lastBackTracePos, endPosData, fragment));
-    if (isEnd) {
-      sb.append("  fini [style=invis]\n");
-      sb.append("  ");
-      sb.append(getNodeID(endPosData.pos, fromIDX));
-      sb.append(" -> fini [label=\"" + EOS_LABEL + "\"]");
-    }
-  }
-
-  // Records which arcs make up the best bath:
-  private void setBestPathMap(
-      WrappedPositionArray positions, int startPos, Position endPosData, int fromIDX) {
-    bestPathMap.clear();
-
-    int pos = endPosData.pos;
-    int bestIDX = fromIDX;
-    while (pos > startPos) {
-      final Position posData = positions.get(pos);
-
-      final int backPos = posData.backPos[bestIDX];
-      final int backIDX = posData.backIndex[bestIDX];
-
-      final String toNodeID = getNodeID(pos, bestIDX);
-      final String fromNodeID = getNodeID(backPos, backIDX);
-
-      assert !bestPathMap.containsKey(fromNodeID);
-      assert !bestPathMap.containsValue(toNodeID);
-      bestPathMap.put(fromNodeID, toNodeID);
-      pos = backPos;
-      bestIDX = backIDX;
-    }
-  }
-
-  private String formatNodes(
-      JapaneseTokenizer tok,
-      WrappedPositionArray positions,
-      int startPos,
-      Position endPosData,
-      char[] fragment) {
-
-    StringBuilder sb = new StringBuilder();
-    // Output nodes
-    for (int pos = startPos + 1; pos <= endPosData.pos; pos++) {
-      final Position posData = positions.get(pos);
-      for (int idx = 0; idx < posData.count; idx++) {
-        sb.append("  ");
-        sb.append(getNodeID(pos, idx));
-        sb.append(" [label=\"");
-        sb.append(pos);
-        sb.append(": ");
-        sb.append(posData.lastRightID[idx]);
-        sb.append("\"]\n");
-      }
-    }
-
-    // Output arcs
-    for (int pos = endPosData.pos; pos > startPos; pos--) {
-      final Position posData = positions.get(pos);
-      for (int idx = 0; idx < posData.count; idx++) {
-        final Position backPosData = positions.get(posData.backPos[idx]);
-        final String toNodeID = getNodeID(pos, idx);
-        final String fromNodeID = getNodeID(posData.backPos[idx], posData.backIndex[idx]);
-
-        sb.append("  ");
-        sb.append(fromNodeID);
-        sb.append(" -> ");
-        sb.append(toNodeID);
-
-        final String attrs;
-        if (toNodeID.equals(bestPathMap.get(fromNodeID))) {
-          // This arc is on best path
-          attrs = " color=\"#40e050\" fontcolor=\"#40a050\" penwidth=3 fontsize=20";
-        } else {
-          attrs = "";
-        }
-
-        final Dictionary<? extends JaMorphData> dict = tok.getDict(posData.backType[idx]);
-        final int wordCost = dict.getWordCost(posData.backID[idx]);
-        final int bgCost =
-            costs.get(
-                backPosData.lastRightID[posData.backIndex[idx]],
-                dict.getLeftId(posData.backID[idx]));
-
-        final String surfaceForm =
-            new String(fragment, posData.backPos[idx] - startPos, pos - posData.backPos[idx]);
-
-        sb.append(" [label=\"");
-        sb.append(surfaceForm);
-        sb.append(' ');
-        sb.append(wordCost);
-        if (bgCost >= 0) {
-          sb.append('+');
-        }
-        sb.append(bgCost);
-        sb.append("\"");
-        sb.append(attrs);
-        sb.append("]\n");
-      }
-    }
-    return sb.toString();
-  }
-
-  private String formatHeader() {
-    StringBuilder sb = new StringBuilder();
-    sb.append("digraph viterbi {\n");
-    sb.append(
-        "  graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\"];\n");
-    // sb.append("  // A2 paper size\n");
-    // sb.append("  size = \"34.4,16.5\";\n");
-    // sb.append("  // try to fill paper\n");
-    // sb.append("  ratio = fill;\n");
-    sb.append("  edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n");
-    sb.append(
-        "  node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\""
-            + FONT_NAME
-            + "\" ]\n");
-
-    return sb.toString();
-  }
-
-  private String formatTrailer() {
-    return "}";
-  }
-
-  private String getNodeID(int pos, int idx) {
-    return pos + "." + idx;
-  }
-}
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/ViterbiNBest.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/ViterbiNBest.java
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
@ -31,8 +31,10 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
 import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
+import org.apache.lucene.analysis.ja.dict.JaMorphData;
 import org.apache.lucene.analysis.ja.dict.UserDictionary;
 import org.apache.lucene.analysis.ja.tokenattributes.*;
+import org.apache.lucene.analysis.morph.GraphvizFormatter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.tests.analysis.MockGraphTokenFilter;
@ -518,7 +520,8 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
  }

  public void testLatticeToDot() throws Exception {
-    final GraphvizFormatter gv2 = new GraphvizFormatter(ConnectionCosts.getInstance());
+    final GraphvizFormatter<JaMorphData> gv2 =
+        new GraphvizFormatter<>(ConnectionCosts.getInstance());
    final Analyzer analyzer =
        new Analyzer() {
          @Override
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/Viterbi.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/Viterbi.java
@ -0,0 +1,447 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ko;
+
+import java.io.IOException;
+import java.util.EnumMap;
+import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
+import org.apache.lucene.analysis.ko.dict.KoMorphData;
+import org.apache.lucene.analysis.ko.dict.TokenInfoDictionary;
+import org.apache.lucene.analysis.ko.dict.UnknownDictionary;
+import org.apache.lucene.analysis.ko.dict.UserDictionary;
+import org.apache.lucene.analysis.morph.ConnectionCosts;
+import org.apache.lucene.analysis.morph.Dictionary;
+import org.apache.lucene.analysis.morph.GraphvizFormatter;
+import org.apache.lucene.analysis.morph.MorphData;
+import org.apache.lucene.analysis.morph.TokenInfoFST;
+import org.apache.lucene.analysis.morph.TokenType;
+import org.apache.lucene.util.fst.FST;
+
+/** {@link org.apache.lucene.analysis.morph.Viterbi} subclass for Korean morphological analysis. */
+final class Viterbi
+    extends org.apache.lucene.analysis.morph.Viterbi<
+        Token, org.apache.lucene.analysis.morph.Viterbi.Position> {
+
+  private final EnumMap<TokenType, Dictionary<? extends KoMorphData>> dictionaryMap =
+      new EnumMap<>(TokenType.class);
+
+  private final UnknownDictionary unkDictionary;
+  private final CharacterDefinition characterDefinition;
+
+  private final boolean discardPunctuation;
+  private final KoreanTokenizer.DecompoundMode mode;
+  private final boolean outputUnknownUnigrams;
+
+  private GraphvizFormatter<KoMorphData> dotOut;
+
+  /**
+   * Creates the Korean Viterbi instance.
+   *
+   * <p>The FSTs, their readers, the system and user dictionaries and the connection costs are
+   * handed to the superclass together with {@code Position.class}; the remaining arguments are
+   * kept in this class. The system, unknown-word and user dictionaries are also registered under
+   * {@link TokenType#KNOWN}, {@link TokenType#UNKNOWN} and {@link TokenType#USER} so that {@code
+   * getDict} can resolve them.
+   */
+  Viterbi(
+      TokenInfoFST fst,
+      FST.BytesReader fstReader,
+      TokenInfoDictionary dictionary,
+      TokenInfoFST userFST,
+      FST.BytesReader userFSTReader,
+      UserDictionary userDictionary,
+      ConnectionCosts costs,
+      UnknownDictionary unkDictionary,
+      CharacterDefinition characterDefinition,
+      boolean discardPunctuation,
+      KoreanTokenizer.DecompoundMode mode,
+      boolean outputUnknownUnigrams) {
+    super(
+        fst, fstReader, dictionary, userFST, userFSTReader, userDictionary, costs, Position.class);
+
+    // Dictionary lookup table, keyed by token type.
+    dictionaryMap.put(TokenType.USER, userDictionary);
+    dictionaryMap.put(TokenType.UNKNOWN, unkDictionary);
+    dictionaryMap.put(TokenType.KNOWN, dictionary);
+
+    // Switches inherited from the superclass; both are always turned on here.
+    this.outputLongestUserEntryOnly = true;
+    this.enableSpacePenaltyFactor = true;
+
+    // Tokenizer options and unknown-word resources used by this class.
+    this.outputUnknownUnigrams = outputUnknownUnigrams;
+    this.mode = mode;
+    this.discardPunctuation = discardPunctuation;
+    this.characterDefinition = characterDefinition;
+    this.unkDictionary = unkDictionary;
+  }
+
+  /**
+   * Adds unknown-word candidates starting at the current {@code pos}.
+   *
+   * <p>Nothing is added when a dictionary match already exists ({@code anyMatches}) and the
+   * character class of the first character is not flagged as "invoke". Otherwise the unknown word
+   * is either the single first character or, when its class is flagged as "group", a run of
+   * following characters (capped by {@code MAX_UNKNOWN_WORD_LENGTH}) that are script-compatible
+   * with the run, agree with the first character on being punctuation and on being a digit, and
+   * are themselves "group" characters.
+   *
+   * @param anyMatches whether the caller reports an existing match at this position
+   * @param posData lattice position the candidates are added to
+   * @return always 0
+   * @throws IOException declared for reads from the character buffer
+   */
+  @Override
+  protected int processUnknownWord(boolean anyMatches, Position posData) throws IOException {
+    final char head = (char) buffer.get(pos);
+    if (anyMatches && characterDefinition.isInvoke(head) == false) {
+      // TODO: should return meaningful value?
+      return 0;
+    }
+
+    // Find unknown match:
+    int classId = characterDefinition.getCharacterClass(head);
+    // NOTE: copied from UnknownDictionary.lookup:
+    int runLength = 1;
+    if (characterDefinition.isGroup(head)) {
+      // Extract unknown word. Characters with the same script are considered to be part of
+      // unknown word
+      Character.UnicodeScript runScript = Character.UnicodeScript.of(head);
+      final boolean headIsPunct = isPunctuation(head);
+      final boolean headIsDigit = Character.isDigit(head);
+      int lookahead = pos + 1;
+      while (runLength < MAX_UNKNOWN_WORD_LENGTH) {
+        final int codeUnit = buffer.get(lookahead++);
+        if (codeUnit == -1) {
+          break;
+        }
+        final char ch = (char) codeUnit;
+        final int generalCategory = Character.getType(ch);
+        final Character.UnicodeScript chScript = Character.UnicodeScript.of(codeUnit);
+        // Non-spacing marks inherit the script of their base character,
+        // following recommendations from UTR #24.
+        final boolean compatibleScript =
+            isSameScript(runScript, chScript) || generalCategory == Character.NON_SPACING_MARK;
+
+        final boolean extendsRun =
+            compatibleScript
+                // split on punctuation
+                && isPunctuation(ch, generalCategory) == headIsPunct
+                // split on digit
+                && Character.isDigit(ch) == headIsDigit
+                && characterDefinition.isGroup(ch);
+        if (extendsRun == false) {
+          break;
+        }
+        runLength++;
+
+        // Update the script code and character class if the original script
+        // is Inherited or Common.
+        if (isCommonOrInherited(runScript) && isCommonOrInherited(chScript) == false) {
+          runScript = chScript;
+          classId = characterDefinition.getCharacterClass(ch);
+        }
+      }
+    }
+
+    // characters in input text are supposed to be the same
+    unkDictionary.lookupWordIds(classId, wordIdRef);
+    if (VERBOSE) {
+      System.out.println(
+          "    UNKNOWN word len=" + runLength + " " + wordIdRef.length + " wordIDs");
+    }
+    final int wordEnd = pos + runLength;
+    for (int i = 0; i < wordIdRef.length; i++) {
+      final int wordId = wordIdRef.ints[wordIdRef.offset + i];
+      add(unkDictionary.getMorphAttributes(), posData, pos, wordEnd, wordId, TokenType.UNKNOWN, false);
+    }
+    // TODO: should return meaningful value?
+    return 0;
+  }
+
+  /**
+   * Sets the formatter that {@code backtrace} notifies on every backtrace; while it is {@code
+   * null}, no graphviz output is produced.
+   */
+  void setGraphvizFormatter(GraphvizFormatter<KoMorphData> dotOut) {
+    this.dotOut = dotOut;
+  }
+
+  /**
+   * Walks the back pointers from {@code endPosData} down to {@code lastBackTracePos} and queues the
+   * tokens found on that path in {@code pending}.
+   *
+   * <p>Because the walk runs from the end towards the start, tokens are added to {@code pending}
+   * last-token-first. Depending on the options, an unknown word may be split into unigrams, a
+   * compound may be replaced or accompanied by its morphemes, and whitespace skipped in front of a
+   * word may become a token of its own. Afterwards {@code lastBackTracePos} is advanced to the end
+   * position and the circular buffers are told to release everything before it.
+   *
+   * @param endPosData lattice position where the backtrace starts
+   * @param fromIDX index of the path to follow within {@code endPosData}
+   */
+  @Override
+  protected void backtrace(Position endPosData, int fromIDX) {
+    final int endPos = endPosData.getPos();
+
+    // Nothing new since the previous backtrace.
+    if (endPos == lastBackTracePos) {
+      return;
+    }
+
+    if (VERBOSE) {
+      System.out.println(
+          "\n  backtrace: endPos="
+              + endPos
+              + " pos="
+              + pos
+              + "; "
+              + (pos - lastBackTracePos)
+              + " characters; last="
+              + lastBackTracePos
+              + " cost="
+              + endPosData.getCost(fromIDX));
+    }
+
+    // Characters covered by this backtrace; fragment[0] corresponds to lastBackTracePos.
+    final char[] fragment = buffer.get(lastBackTracePos, endPos - lastBackTracePos);
+
+    if (dotOut != null) {
+      dotOut.onBacktrace(
+          this::getDict, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
+    }
+
+    // NOTE: this local deliberately shadows the inherited pos field for the rest of the method.
+    int pos = endPos;
+    int bestIDX = fromIDX;
+
+    // TODO: sort of silly to make Token instances here; the
+    // back trace has all info needed to generate the
+    // token.  So, we could just directly set the attrs,
+    // from the backtrace, in incrementToken w/o ever
+    // creating Token; we'd have to defer calling freeBefore
+    // until after the backtrace was fully "consumed" by
+    // incrementToken.
+
+    while (pos > lastBackTracePos) {
+      // System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX);
+      final Position posData = positions.get(pos);
+      assert bestIDX < posData.getCount();
+
+      int backPos = posData.getBackPos(bestIDX);
+      int backWordPos = posData.getBackWordPos(bestIDX);
+      assert backPos >= lastBackTracePos
+          : "backPos=" + backPos + " vs lastBackTracePos=" + lastBackTracePos;
+      // the length of the word without the whitespaces at the beginning.
+      int length = pos - backWordPos;
+      TokenType backType = posData.getBackType(bestIDX);
+      int backID = posData.getBackID(bestIDX);
+      int nextBestIDX = posData.getBackIndex(bestIDX);
+      // the start of the word after the whitespace at the beginning.
+      final int fragmentOffset = backWordPos - lastBackTracePos;
+      assert fragmentOffset >= 0;
+
+      final Dictionary<? extends KoMorphData> dict = getDict(backType);
+
+      if (outputUnknownUnigrams && backType == TokenType.UNKNOWN) {
+        // outputUnknownUnigrams converts unknown word into unigrams:
+        // (iterating from the last character to the first)
+        for (int i = length - 1; i >= 0; i--) {
+          int charLen = 1;
+          // Keep a surrogate pair together: step back onto the preceding char and emit both.
+          if (i > 0 && Character.isLowSurrogate(fragment[fragmentOffset + i])) {
+            i--;
+            charLen = 2;
+          }
+          final DictionaryToken token =
+              new DictionaryToken(
+                  TokenType.UNKNOWN,
+                  unkDictionary.getMorphAttributes(),
+                  CharacterDefinition.NGRAM,
+                  fragment,
+                  fragmentOffset + i,
+                  charLen,
+                  backWordPos + i,
+                  backWordPos + i + charLen);
+          pending.add(token);
+          if (VERBOSE) {
+            System.out.println("    add token=" + pending.get(pending.size() - 1));
+          }
+        }
+      } else {
+        final DictionaryToken token =
+            new DictionaryToken(
+                backType,
+                dict.getMorphAttributes(),
+                backID,
+                fragment,
+                fragmentOffset,
+                length,
+                backWordPos,
+                backWordPos + length);
+        if (token.getPOSType() == POS.Type.MORPHEME
+            || mode == KoreanTokenizer.DecompoundMode.NONE) {
+          // Plain morpheme, or decompounding disabled: emit as-is unless it is filtered out.
+          if (shouldFilterToken(token) == false) {
+            pending.add(token);
+            if (VERBOSE) {
+              System.out.println("    add token=" + pending.get(pending.size() - 1));
+            }
+          }
+        } else {
+          KoMorphData.Morpheme[] morphemes = token.getMorphemes();
+          if (morphemes == null) {
+            // No morpheme breakdown available: keep the token whole.
+            pending.add(token);
+            if (VERBOSE) {
+              System.out.println("    add token=" + pending.get(pending.size() - 1));
+            }
+          } else {
+            int endOffset = backWordPos + length;
+            int posLen = 0;
+            // decompose the compound
+            // (last morpheme first, consistent with the reverse order of pending)
+            for (int i = morphemes.length - 1; i >= 0; i--) {
+              final KoMorphData.Morpheme morpheme = morphemes[i];
+              final Token compoundToken;
+              if (token.getPOSType() == POS.Type.COMPOUND) {
+                // COMPOUND: each part gets offsets derived from its surface form length,
+                // counted backwards from the end of the word.
+                assert endOffset - morpheme.surfaceForm.length() >= 0;
+                compoundToken =
+                    new DecompoundToken(
+                        morpheme.posTag,
+                        morpheme.surfaceForm,
+                        endOffset - morpheme.surfaceForm.length(),
+                        endOffset,
+                        backType);
+              } else {
+                // Any other POS type: every part reuses the offsets of the whole token.
+                compoundToken =
+                    new DecompoundToken(
+                        morpheme.posTag,
+                        morpheme.surfaceForm,
+                        token.getStartOffset(),
+                        token.getEndOffset(),
+                        backType);
+              }
+              // In MIXED mode the first morpheme gets position increment 0; the whole token is
+              // also added below.
+              if (i == 0 && mode == KoreanTokenizer.DecompoundMode.MIXED) {
+                compoundToken.setPositionIncrement(0);
+              }
+              ++posLen;
+              endOffset -= morpheme.surfaceForm.length();
+              pending.add(compoundToken);
+              if (VERBOSE) {
+                System.out.println("    add token=" + pending.get(pending.size() - 1));
+              }
+            }
+            if (mode == KoreanTokenizer.DecompoundMode.MIXED) {
+              // MIXED keeps the original token too; its position length is the number of parts
+              // (at least 1).
+              token.setPositionLength(Math.max(1, posLen));
+              pending.add(token);
+              if (VERBOSE) {
+                System.out.println("    add token=" + pending.get(pending.size() - 1));
+              }
+            }
+          }
+        }
+      }
+      if (discardPunctuation == false && backWordPos != backPos) {
+        // Add a token for whitespaces between terms
+        int offset = backPos - lastBackTracePos;
+        int len = backWordPos - backPos;
+        // System.out.println(offset + " " + fragmentOffset + " " + len + " " + backWordPos + " " +
+        // backPos);
+        // The token uses the first word id that the unknown dictionary returns for the class of ' '.
+        unkDictionary.lookupWordIds(characterDefinition.getCharacterClass(' '), wordIdRef);
+        DictionaryToken spaceToken =
+            new DictionaryToken(
+                TokenType.UNKNOWN,
+                unkDictionary.getMorphAttributes(),
+                wordIdRef.ints[wordIdRef.offset],
+                fragment,
+                offset,
+                len,
+                backPos,
+                backPos + len);
+        pending.add(spaceToken);
+      }
+
+      // Step to the previous node on the path.
+      pos = backPos;
+      bestIDX = nextBestIDX;
+    }
+
+    lastBackTracePos = endPos;
+
+    if (VERBOSE) {
+      System.out.println("  freeBefore pos=" + endPos);
+    }
+    // Notify the circular buffers that we are done with
+    // these positions:
+    buffer.freeBefore(endPos);
+    positions.freeBefore(endPos);
+  }
+
+  /**
+   * Returns the space penalty for a word, based on the word's left {@link POS.Tag}.
+   *
+   * <p>The penalty is 3000 when {@code numSpaces} is positive and the left POS tag is one of E, J,
+   * VCP, XSA, XSN or XSV; in every other case it is 0.
+   *
+   * @param morphData morphological data used for the lookup; cast to {@link KoMorphData}
+   * @param wordID id of the word whose left POS tag is inspected
+   * @param numSpaces space count supplied by the caller; no penalty applies unless it is positive
+   * @return 3000 or 0
+   */
+  @Override
+  protected int computeSpacePenalty(MorphData morphData, int wordID, int numSpaces) {
+    final KoMorphData koMorphData = (KoMorphData) morphData;
+    final POS.Tag leftPOS = koMorphData.getLeftPOS(wordID);
+    if (numSpaces <= 0) {
+      return 0;
+    }
+    // TODO we should extract the penalty (left-space-penalty-factor) from the dicrc file.
+    switch (leftPOS) {
+      case E:
+      case J:
+      case VCP:
+      case XSA:
+      case XSN:
+      case XSV:
+        return 3000;
+      case IC:
+      case MAG:
+      case MAJ:
+      case MM:
+      case NA:
+      case NNB:
+      case NNBC:
+      case NNG:
+      case NNP:
+      case NP:
+      case NR:
+      case SC:
+      case SE:
+      case SF:
+      case SH:
+      case SL:
+      case SN:
+      case SP:
+      case SSC:
+      case SSO:
+      case SY:
+      case UNA:
+      case UNKNOWN:
+      case VA:
+      case VCN:
+      case VSV:
+      case VV:
+      case VX:
+      case XPN:
+      case XR:
+      default:
+        return 0;
+    }
+  }
+
+  /**
+   * Returns the dictionary registered for the given token type in the constructor (KNOWN, UNKNOWN
+   * or USER), or whatever {@code dictionaryMap} holds for {@code type} otherwise.
+   */
+  Dictionary<? extends KoMorphData> getDict(TokenType type) {
+    return dictionaryMap.get(type);
+  }
+
+  /**
+   * Returns true when punctuation is being discarded and the character at the token's offset in
+   * its surface form is punctuation.
+   */
+  private boolean shouldFilterToken(Token token) {
+    if (discardPunctuation == false) {
+      return false;
+    }
+    final char firstChar = token.getSurfaceForm()[token.getOffset()];
+    return isPunctuation(firstChar);
+  }
+
+  /** Punctuation check for a single char, using its {@link Character#getType(char)} category. */
+  private static boolean isPunctuation(char ch) {
+    final int generalCategory = Character.getType(ch);
+    return isPunctuation(ch, generalCategory);
+  }
+
+  /**
+   * Returns true when {@code ch} is U+318D, or when the general category {@code cid} is one of the
+   * separator, control, format, punctuation or symbol categories listed below.
+   *
+   * @param ch the character to test
+   * @param cid general category of {@code ch}, as returned by {@link Character#getType(char)}
+   */
+  private static boolean isPunctuation(char ch, int cid) {
+    // special case for Hangul Letter Araea (interpunct)
+    boolean punctuation = ch == 0x318D;
+    if (punctuation == false) {
+      switch (cid) {
+        case Character.SPACE_SEPARATOR:
+        case Character.LINE_SEPARATOR:
+        case Character.PARAGRAPH_SEPARATOR:
+        case Character.CONTROL:
+        case Character.FORMAT:
+        case Character.DASH_PUNCTUATION:
+        case Character.START_PUNCTUATION:
+        case Character.END_PUNCTUATION:
+        case Character.CONNECTOR_PUNCTUATION:
+        case Character.OTHER_PUNCTUATION:
+        case Character.MATH_SYMBOL:
+        case Character.CURRENCY_SYMBOL:
+        case Character.MODIFIER_SYMBOL:
+        case Character.OTHER_SYMBOL:
+        case Character.INITIAL_QUOTE_PUNCTUATION:
+        case Character.FINAL_QUOTE_PUNCTUATION:
+          punctuation = true;
+          break;
+        default:
+          break;
+      }
+    }
+    return punctuation;
+  }
+
+  /** Returns true when {@code script} is the COMMON or the INHERITED Unicode script. */
+  private static boolean isCommonOrInherited(Character.UnicodeScript script) {
+    if (script == Character.UnicodeScript.COMMON) {
+      return true;
+    }
+    return script == Character.UnicodeScript.INHERITED;
+  }
+
+  /**
+   * Determines whether two scripts are compatible: they are identical, or at least one of them is
+   * COMMON or INHERITED.
+   */
+  private static boolean isSameScript(
+      Character.UnicodeScript scriptOne, Character.UnicodeScript scriptTwo) {
+    if (scriptOne == scriptTwo) {
+      return true;
+    }
+    return isCommonOrInherited(scriptOne) || isCommonOrInherited(scriptTwo);
+  }
+}