mirror of https://github.com/apache/lucene.git
LUCENE-10493: factor out Viterbi algorithm and share it between kuromoji and nori (#805)
This commit is contained in:
parent
2a4c21bb58
commit
c89f8a7ea1
|
@ -60,6 +60,8 @@ Other
|
||||||
All classes in `org.apache.lucene.analysis.[ja|ko].util` was moved to `org.apache.lucene.analysis.[ja|ko].dict`.
|
All classes in `org.apache.lucene.analysis.[ja|ko].util` was moved to `org.apache.lucene.analysis.[ja|ko].dict`.
|
||||||
(Tomoko Uchida)
|
(Tomoko Uchida)
|
||||||
|
|
||||||
|
* LUCENE-10493: Factor out Viterbi algorithm in Kuromoji and Nori to analysis-common. (Tomoko Uchida)
|
||||||
|
|
||||||
======================= Lucene 9.2.0 =======================
|
======================= Lucene 9.2.0 =======================
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
|
|
@ -14,22 +14,16 @@
|
||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.analysis.ko;
|
package org.apache.lucene.analysis.morph;
|
||||||
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import org.apache.lucene.analysis.ko.KoreanTokenizer.Position;
|
|
||||||
import org.apache.lucene.analysis.ko.KoreanTokenizer.WrappedPositionArray;
|
|
||||||
import org.apache.lucene.analysis.ko.dict.ConnectionCosts;
|
|
||||||
import org.apache.lucene.analysis.ko.dict.KoMorphData;
|
|
||||||
import org.apache.lucene.analysis.morph.Dictionary;
|
|
||||||
|
|
||||||
// TODO: would be nice to show 2nd best path in a diff't
|
// TODO: would be nice to show 2nd best path in a diff't
|
||||||
// color...
|
// color...
|
||||||
|
|
||||||
/** Outputs the dot (graphviz) string for the viterbi lattice. */
|
/** Outputs the dot (graphviz) string for the viterbi lattice. */
|
||||||
public class GraphvizFormatter {
|
public class GraphvizFormatter<T extends MorphData> {
|
||||||
|
|
||||||
private static final String BOS_LABEL = "BOS";
|
private static final String BOS_LABEL = "BOS";
|
||||||
|
|
||||||
private static final String EOS_LABEL = "EOS";
|
private static final String EOS_LABEL = "EOS";
|
||||||
|
@ -56,36 +50,39 @@ public class GraphvizFormatter {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Backtraces another incremental fragment:
|
// Backtraces another incremental fragment:
|
||||||
void onBacktrace(
|
public void onBacktrace(
|
||||||
KoreanTokenizer tok,
|
DictionaryProvider<T> dictProvider,
|
||||||
WrappedPositionArray positions,
|
Viterbi.WrappedPositionArray<? extends Viterbi.Position> positions,
|
||||||
int lastBackTracePos,
|
int lastBackTracePos,
|
||||||
Position endPosData,
|
Viterbi.Position endPosData,
|
||||||
int fromIDX,
|
int fromIDX,
|
||||||
char[] fragment,
|
char[] fragment,
|
||||||
boolean isEnd) {
|
boolean isEnd) {
|
||||||
setBestPathMap(positions, lastBackTracePos, endPosData, fromIDX);
|
setBestPathMap(positions, lastBackTracePos, endPosData, fromIDX);
|
||||||
sb.append(formatNodes(tok, positions, lastBackTracePos, endPosData, fragment));
|
sb.append(formatNodes(dictProvider, positions, lastBackTracePos, endPosData, fragment));
|
||||||
if (isEnd) {
|
if (isEnd) {
|
||||||
sb.append(" fini [style=invis]\n");
|
sb.append(" fini [style=invis]\n");
|
||||||
sb.append(" ");
|
sb.append(" ");
|
||||||
sb.append(getNodeID(endPosData.pos, fromIDX));
|
sb.append(getNodeID(endPosData.getPos(), fromIDX));
|
||||||
sb.append(" -> fini [label=\"" + EOS_LABEL + "\"]");
|
sb.append(" -> fini [label=\"" + EOS_LABEL + "\"]");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Records which arcs make up the best bath:
|
// Records which arcs make up the best bath:
|
||||||
private void setBestPathMap(
|
private void setBestPathMap(
|
||||||
WrappedPositionArray positions, int startPos, Position endPosData, int fromIDX) {
|
Viterbi.WrappedPositionArray<? extends Viterbi.Position> positions,
|
||||||
|
int startPos,
|
||||||
|
Viterbi.Position endPosData,
|
||||||
|
int fromIDX) {
|
||||||
bestPathMap.clear();
|
bestPathMap.clear();
|
||||||
|
|
||||||
int pos = endPosData.pos;
|
int pos = endPosData.getPos();
|
||||||
int bestIDX = fromIDX;
|
int bestIDX = fromIDX;
|
||||||
while (pos > startPos) {
|
while (pos > startPos) {
|
||||||
final Position posData = positions.get(pos);
|
final Viterbi.Position posData = positions.get(pos);
|
||||||
|
|
||||||
final int backPos = posData.backPos[bestIDX];
|
final int backPos = posData.getBackPos(bestIDX);
|
||||||
final int backIDX = posData.backIndex[bestIDX];
|
final int backIDX = posData.getBackIndex(bestIDX);
|
||||||
|
|
||||||
final String toNodeID = getNodeID(pos, bestIDX);
|
final String toNodeID = getNodeID(pos, bestIDX);
|
||||||
final String fromNodeID = getNodeID(backPos, backIDX);
|
final String fromNodeID = getNodeID(backPos, backIDX);
|
||||||
|
@ -99,34 +96,34 @@ public class GraphvizFormatter {
|
||||||
}
|
}
|
||||||
|
|
||||||
private String formatNodes(
|
private String formatNodes(
|
||||||
KoreanTokenizer tok,
|
DictionaryProvider<T> dictProvider,
|
||||||
WrappedPositionArray positions,
|
Viterbi.WrappedPositionArray<? extends Viterbi.Position> positions,
|
||||||
int startPos,
|
int startPos,
|
||||||
Position endPosData,
|
Viterbi.Position endPosData,
|
||||||
char[] fragment) {
|
char[] fragment) {
|
||||||
|
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
// Output nodes
|
// Output nodes
|
||||||
for (int pos = startPos + 1; pos <= endPosData.pos; pos++) {
|
for (int pos = startPos + 1; pos <= endPosData.getPos(); pos++) {
|
||||||
final Position posData = positions.get(pos);
|
final Viterbi.Position posData = positions.get(pos);
|
||||||
for (int idx = 0; idx < posData.count; idx++) {
|
for (int idx = 0; idx < posData.getCount(); idx++) {
|
||||||
sb.append(" ");
|
sb.append(" ");
|
||||||
sb.append(getNodeID(pos, idx));
|
sb.append(getNodeID(pos, idx));
|
||||||
sb.append(" [label=\"");
|
sb.append(" [label=\"");
|
||||||
sb.append(pos);
|
sb.append(pos);
|
||||||
sb.append(": ");
|
sb.append(": ");
|
||||||
sb.append(posData.lastRightID[idx]);
|
sb.append(posData.getLastRightID(idx));
|
||||||
sb.append("\"]\n");
|
sb.append("\"]\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Output arcs
|
// Output arcs
|
||||||
for (int pos = endPosData.pos; pos > startPos; pos--) {
|
for (int pos = endPosData.getPos(); pos > startPos; pos--) {
|
||||||
final Position posData = positions.get(pos);
|
final Viterbi.Position posData = positions.get(pos);
|
||||||
for (int idx = 0; idx < posData.count; idx++) {
|
for (int idx = 0; idx < posData.getCount(); idx++) {
|
||||||
final Position backPosData = positions.get(posData.backPos[idx]);
|
final Viterbi.Position backPosData = positions.get(posData.getBackPos(idx));
|
||||||
final String toNodeID = getNodeID(pos, idx);
|
final String toNodeID = getNodeID(pos, idx);
|
||||||
final String fromNodeID = getNodeID(posData.backPos[idx], posData.backIndex[idx]);
|
final String fromNodeID = getNodeID(posData.getBackPos(idx), posData.getBackIndex(idx));
|
||||||
|
|
||||||
sb.append(" ");
|
sb.append(" ");
|
||||||
sb.append(fromNodeID);
|
sb.append(fromNodeID);
|
||||||
|
@ -141,15 +138,15 @@ public class GraphvizFormatter {
|
||||||
attrs = "";
|
attrs = "";
|
||||||
}
|
}
|
||||||
|
|
||||||
final Dictionary<? extends KoMorphData> dict = tok.getDict(posData.backType[idx]);
|
final Dictionary<? extends T> dict = dictProvider.get(posData.getBackType(idx));
|
||||||
final int wordCost = dict.getWordCost(posData.backID[idx]);
|
final int wordCost = dict.getWordCost(posData.getBackID(idx));
|
||||||
final int bgCost =
|
final int bgCost =
|
||||||
costs.get(
|
costs.get(
|
||||||
backPosData.lastRightID[posData.backIndex[idx]],
|
backPosData.getLastRightID(posData.getBackIndex(idx)),
|
||||||
dict.getLeftId(posData.backID[idx]));
|
dict.getLeftId(posData.getBackID(idx)));
|
||||||
|
|
||||||
final String surfaceForm =
|
final String surfaceForm =
|
||||||
new String(fragment, posData.backPos[idx] - startPos, pos - posData.backPos[idx]);
|
new String(fragment, posData.getBackPos(idx) - startPos, pos - posData.getBackPos(idx));
|
||||||
|
|
||||||
sb.append(" [label=\"");
|
sb.append(" [label=\"");
|
||||||
sb.append(surfaceForm);
|
sb.append(surfaceForm);
|
||||||
|
@ -190,4 +187,10 @@ public class GraphvizFormatter {
|
||||||
private String getNodeID(int pos, int idx) {
|
private String getNodeID(int pos, int idx) {
|
||||||
return pos + "." + idx;
|
return pos + "." + idx;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** {@link Dictionary} provider */
|
||||||
|
@FunctionalInterface
|
||||||
|
public interface DictionaryProvider<T extends MorphData> {
|
||||||
|
Dictionary<? extends T> get(TokenType type);
|
||||||
|
}
|
||||||
}
|
}
|
|
@ -0,0 +1,815 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.morph;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.lang.reflect.Array;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import org.apache.lucene.analysis.util.RollingCharBuffer;
|
||||||
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
|
import org.apache.lucene.util.IntsRef;
|
||||||
|
import org.apache.lucene.util.RamUsageEstimator;
|
||||||
|
import org.apache.lucene.util.fst.FST;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Performs <a href="https://en.wikipedia.org/wiki/Viterbi_algorithm">Viterbi algorithm</a> for
|
||||||
|
* morphological Tokenizers, which split texts by Hidden Markov Model or Conditional Random Fields.
|
||||||
|
*
|
||||||
|
* @param <T> output token class
|
||||||
|
* @param <U> position class
|
||||||
|
*/
|
||||||
|
public abstract class Viterbi<T extends Token, U extends Viterbi.Position> {
|
||||||
|
protected static final boolean VERBOSE = false;
|
||||||
|
|
||||||
|
// For safety:
|
||||||
|
protected static final int MAX_UNKNOWN_WORD_LENGTH = 1024;
|
||||||
|
private static final int MAX_BACKTRACE_GAP = 1024;
|
||||||
|
|
||||||
|
private final TokenInfoFST fst;
|
||||||
|
private final BinaryDictionary<? extends MorphData> dictionary;
|
||||||
|
private final Dictionary<? extends MorphData> userDictionary;
|
||||||
|
protected final ConnectionCosts costs;
|
||||||
|
|
||||||
|
private final FST.Arc<Long> arc = new FST.Arc<>();
|
||||||
|
private final FST.BytesReader fstReader;
|
||||||
|
protected final IntsRef wordIdRef = new IntsRef();
|
||||||
|
|
||||||
|
private final FST.BytesReader userFSTReader;
|
||||||
|
private final TokenInfoFST userFST;
|
||||||
|
|
||||||
|
protected final RollingCharBuffer buffer = new RollingCharBuffer();
|
||||||
|
|
||||||
|
protected final WrappedPositionArray<U> positions;
|
||||||
|
|
||||||
|
// True once we've hit the EOF from the input reader:
|
||||||
|
protected boolean end;
|
||||||
|
|
||||||
|
// Last absolute position we backtraced from:
|
||||||
|
protected int lastBackTracePos;
|
||||||
|
|
||||||
|
// Next absolute position to process:
|
||||||
|
protected int pos;
|
||||||
|
|
||||||
|
// Already parsed, but not yet passed to caller, tokens:
|
||||||
|
protected final List<T> pending = new ArrayList<>();
|
||||||
|
|
||||||
|
protected boolean outputNBest = false;
|
||||||
|
|
||||||
|
protected boolean enableSpacePenaltyFactor = false;
|
||||||
|
|
||||||
|
protected boolean outputLongestUserEntryOnly = false;
|
||||||
|
|
||||||
|
protected Viterbi(
|
||||||
|
TokenInfoFST fst,
|
||||||
|
FST.BytesReader fstReader,
|
||||||
|
BinaryDictionary<? extends MorphData> dictionary,
|
||||||
|
TokenInfoFST userFST,
|
||||||
|
FST.BytesReader userFSTReader,
|
||||||
|
Dictionary<? extends MorphData> userDictionary,
|
||||||
|
ConnectionCosts costs,
|
||||||
|
Class<U> positionImpl) {
|
||||||
|
this.fst = fst;
|
||||||
|
this.fstReader = fstReader;
|
||||||
|
this.dictionary = dictionary;
|
||||||
|
this.userFST = userFST;
|
||||||
|
this.userFSTReader = userFSTReader;
|
||||||
|
this.userDictionary = userDictionary;
|
||||||
|
this.costs = costs;
|
||||||
|
this.positions = new WrappedPositionArray<>(positionImpl);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Incrementally parse some more characters. This runs the viterbi search forwards "enough" so
|
||||||
|
* that we generate some more tokens. How much forward depends on the chars coming in, since some
|
||||||
|
* chars could cause longer-lasting ambiguity in the parsing. Once the ambiguity is resolved, then
|
||||||
|
* we back trace, produce the pending tokens, and return.
|
||||||
|
*/
|
||||||
|
public final void forward() throws IOException {
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println("\nPARSE");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Index of the last character of unknown word:
|
||||||
|
int unknownWordEndIndex = -1;
|
||||||
|
|
||||||
|
// Maximum posAhead of user word in the entire input
|
||||||
|
int userWordMaxPosAhead = -1;
|
||||||
|
|
||||||
|
// Advances over each position (character):
|
||||||
|
while (buffer.get(pos) != -1) {
|
||||||
|
final Position posData = positions.get(pos);
|
||||||
|
final boolean isFrontier = positions.getNextPos() == pos + 1;
|
||||||
|
|
||||||
|
if (posData.count == 0) {
|
||||||
|
// No arcs arrive here; move to next position:
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" no arcs in; skip pos=" + pos);
|
||||||
|
}
|
||||||
|
pos++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (pos > lastBackTracePos && posData.count == 1 && isFrontier) {
|
||||||
|
// We are at a "frontier", and only one node is
|
||||||
|
// alive, so whatever the eventual best path is must
|
||||||
|
// come through this node. So we can safely commit
|
||||||
|
// to the prefix of the best path at this point:
|
||||||
|
if (outputNBest) {
|
||||||
|
backtraceNBest(posData, false);
|
||||||
|
}
|
||||||
|
backtrace(posData, 0);
|
||||||
|
if (outputNBest) {
|
||||||
|
fixupPendingList();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Re-base cost so we don't risk int overflow:
|
||||||
|
posData.costs[0] = 0;
|
||||||
|
if (pending.size() > 0) {
|
||||||
|
return;
|
||||||
|
} else {
|
||||||
|
// This means the backtrace only produced
|
||||||
|
// punctuation tokens, so we must keep parsing.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (pos - lastBackTracePos >= MAX_BACKTRACE_GAP) {
|
||||||
|
// Safety: if we've buffered too much, force a
|
||||||
|
// backtrace now. We find the least-cost partial
|
||||||
|
// path, across all paths, backtrace from it, and
|
||||||
|
// then prune all others. Note that this, in
|
||||||
|
// general, can produce the wrong result, if the
|
||||||
|
// total best path did not in fact back trace
|
||||||
|
// through this partial best path. But it's the
|
||||||
|
// best we can do... (short of not having a
|
||||||
|
// safety!).
|
||||||
|
|
||||||
|
// First pass: find least cost partial path so far,
|
||||||
|
// including ending at future positions:
|
||||||
|
int leastIDX = -1;
|
||||||
|
int leastCost = Integer.MAX_VALUE;
|
||||||
|
Position leastPosData = null;
|
||||||
|
for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
|
||||||
|
final Position posData2 = positions.get(pos2);
|
||||||
|
for (int idx = 0; idx < posData2.count; idx++) {
|
||||||
|
// System.out.println(" idx=" + idx + " cost=" + cost);
|
||||||
|
final int cost = posData2.costs[idx];
|
||||||
|
if (cost < leastCost) {
|
||||||
|
leastCost = cost;
|
||||||
|
leastIDX = idx;
|
||||||
|
leastPosData = posData2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// We will always have at least one live path:
|
||||||
|
assert leastIDX != -1;
|
||||||
|
|
||||||
|
if (outputNBest) {
|
||||||
|
backtraceNBest(leastPosData, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Second pass: prune all but the best path:
|
||||||
|
for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
|
||||||
|
final Position posData2 = positions.get(pos2);
|
||||||
|
if (posData2 != leastPosData) {
|
||||||
|
posData2.reset();
|
||||||
|
} else {
|
||||||
|
if (leastIDX != 0) {
|
||||||
|
posData2.costs[0] = posData2.costs[leastIDX];
|
||||||
|
posData2.lastRightID[0] = posData2.lastRightID[leastIDX];
|
||||||
|
posData2.backPos[0] = posData2.backPos[leastIDX];
|
||||||
|
posData2.backWordPos[0] = posData2.backWordPos[leastIDX];
|
||||||
|
posData2.backIndex[0] = posData2.backIndex[leastIDX];
|
||||||
|
posData2.backID[0] = posData2.backID[leastIDX];
|
||||||
|
posData2.backType[0] = posData2.backType[leastIDX];
|
||||||
|
}
|
||||||
|
posData2.count = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
backtrace(leastPosData, 0);
|
||||||
|
if (outputNBest) {
|
||||||
|
fixupPendingList();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Re-base cost so we don't risk int overflow:
|
||||||
|
Arrays.fill(leastPosData.costs, 0, leastPosData.count, 0);
|
||||||
|
|
||||||
|
if (pos != leastPosData.pos) {
|
||||||
|
// We jumped into a future position:
|
||||||
|
assert pos < leastPosData.pos;
|
||||||
|
pos = leastPosData.pos;
|
||||||
|
}
|
||||||
|
if (pending.size() > 0) {
|
||||||
|
return;
|
||||||
|
} else {
|
||||||
|
// This means the backtrace only produced
|
||||||
|
// punctuation tokens, so we must keep parsing.
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(
|
||||||
|
"\n extend @ pos="
|
||||||
|
+ pos
|
||||||
|
+ " char="
|
||||||
|
+ (char) buffer.get(pos)
|
||||||
|
+ " hex="
|
||||||
|
+ Integer.toHexString(buffer.get(pos)));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" " + posData.count + " arcs in");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (enableSpacePenaltyFactor
|
||||||
|
&& Character.getType(buffer.get(pos)) == Character.SPACE_SEPARATOR) {
|
||||||
|
// We add single space separator as prefixes of the terms that we extract.
|
||||||
|
// This information is needed to compute the space penalty factor of each term.
|
||||||
|
// These whitespace prefixes are removed when the final tokens are generated, or
|
||||||
|
// added as separated tokens when discardPunctuation is unset.
|
||||||
|
if (buffer.get(++pos) == -1) {
|
||||||
|
pos = posData.pos;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean anyMatches = false;
|
||||||
|
|
||||||
|
// First try user dict:
|
||||||
|
if (userFST != null) {
|
||||||
|
userFST.getFirstArc(arc);
|
||||||
|
int output = 0;
|
||||||
|
int maxPosAhead = 0;
|
||||||
|
int outputMaxPosAhead = 0;
|
||||||
|
int arcFinalOutMaxPosAhead = 0;
|
||||||
|
|
||||||
|
for (int posAhead = pos; ; posAhead++) {
|
||||||
|
final int ch = buffer.get(posAhead);
|
||||||
|
if (ch == -1) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (userFST.findTargetArc(ch, arc, arc, posAhead == pos, userFSTReader) == null) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
output += arc.output().intValue();
|
||||||
|
if (arc.isFinal()) {
|
||||||
|
maxPosAhead = posAhead;
|
||||||
|
outputMaxPosAhead = output;
|
||||||
|
arcFinalOutMaxPosAhead = arc.nextFinalOutput().intValue();
|
||||||
|
anyMatches = true;
|
||||||
|
if (!outputLongestUserEntryOnly) {
|
||||||
|
// add all matched user entries.
|
||||||
|
add(
|
||||||
|
userDictionary.getMorphAttributes(),
|
||||||
|
posData,
|
||||||
|
pos,
|
||||||
|
posAhead + 1,
|
||||||
|
output + arc.nextFinalOutput().intValue(),
|
||||||
|
TokenType.USER,
|
||||||
|
false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Longest matching for user word
|
||||||
|
if (anyMatches && maxPosAhead > userWordMaxPosAhead) {
|
||||||
|
if (outputLongestUserEntryOnly) {
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(
|
||||||
|
" USER word "
|
||||||
|
+ new String(buffer.get(pos, maxPosAhead + 1))
|
||||||
|
+ " toPos="
|
||||||
|
+ (maxPosAhead + 1));
|
||||||
|
}
|
||||||
|
add(
|
||||||
|
userDictionary.getMorphAttributes(),
|
||||||
|
posData,
|
||||||
|
pos,
|
||||||
|
maxPosAhead + 1,
|
||||||
|
outputMaxPosAhead + arcFinalOutMaxPosAhead,
|
||||||
|
TokenType.USER,
|
||||||
|
false);
|
||||||
|
}
|
||||||
|
userWordMaxPosAhead = Math.max(userWordMaxPosAhead, maxPosAhead);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: we can be more aggressive about user
|
||||||
|
// matches? if we are "under" a user match then don't
|
||||||
|
// extend KNOWN/UNKNOWN paths?
|
||||||
|
|
||||||
|
if (!anyMatches) {
|
||||||
|
// Next, try known dictionary matches
|
||||||
|
fst.getFirstArc(arc);
|
||||||
|
int output = 0;
|
||||||
|
|
||||||
|
for (int posAhead = pos; ; posAhead++) {
|
||||||
|
final int ch = buffer.get(posAhead);
|
||||||
|
if (ch == -1) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// System.out.println(" match " + (char) ch + " posAhead=" + posAhead);
|
||||||
|
|
||||||
|
if (fst.findTargetArc(ch, arc, arc, posAhead == pos, fstReader) == null) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
output += arc.output().intValue();
|
||||||
|
|
||||||
|
// Optimization: for known words that are too-long
|
||||||
|
// (compound), we should pre-compute the 2nd
|
||||||
|
// best segmentation and store it in the
|
||||||
|
// dictionary instead of recomputing it each time a
|
||||||
|
// match is found.
|
||||||
|
|
||||||
|
if (arc.isFinal()) {
|
||||||
|
dictionary.lookupWordIds(output + arc.nextFinalOutput().intValue(), wordIdRef);
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(
|
||||||
|
" KNOWN word "
|
||||||
|
+ new String(buffer.get(pos, posAhead - pos + 1))
|
||||||
|
+ " toPos="
|
||||||
|
+ (posAhead + 1)
|
||||||
|
+ " "
|
||||||
|
+ wordIdRef.length
|
||||||
|
+ " wordIDs");
|
||||||
|
}
|
||||||
|
for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
|
||||||
|
add(
|
||||||
|
dictionary.getMorphAttributes(),
|
||||||
|
posData,
|
||||||
|
pos,
|
||||||
|
posAhead + 1,
|
||||||
|
wordIdRef.ints[wordIdRef.offset + ofs],
|
||||||
|
TokenType.KNOWN,
|
||||||
|
false);
|
||||||
|
anyMatches = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!shouldSkipProcessUnknownWord(unknownWordEndIndex, posData)) {
|
||||||
|
int unknownWordLength = processUnknownWord(anyMatches, posData);
|
||||||
|
unknownWordEndIndex = posData.pos + unknownWordLength;
|
||||||
|
}
|
||||||
|
pos++;
|
||||||
|
}
|
||||||
|
|
||||||
|
end = true;
|
||||||
|
|
||||||
|
if (pos > 0) {
|
||||||
|
|
||||||
|
final Position endPosData = positions.get(pos);
|
||||||
|
int leastCost = Integer.MAX_VALUE;
|
||||||
|
int leastIDX = -1;
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" end: " + endPosData.count + " nodes");
|
||||||
|
}
|
||||||
|
for (int idx = 0; idx < endPosData.count; idx++) {
|
||||||
|
// Add EOS cost:
|
||||||
|
final int cost = endPosData.costs[idx] + costs.get(endPosData.lastRightID[idx], 0);
|
||||||
|
// System.out.println(" idx=" + idx + " cost=" + cost + " (pathCost=" +
|
||||||
|
// endPosData.costs[idx] + " bgCost=" + costs.get(endPosData.lastRightID[idx], 0) + ")
|
||||||
|
// backPos=" + endPosData.backPos[idx]);
|
||||||
|
if (cost < leastCost) {
|
||||||
|
leastCost = cost;
|
||||||
|
leastIDX = idx;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (outputNBest) {
|
||||||
|
backtraceNBest(endPosData, true);
|
||||||
|
}
|
||||||
|
backtrace(endPosData, leastIDX);
|
||||||
|
if (outputNBest) {
|
||||||
|
fixupPendingList();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// No characters in the input string; return no tokens!
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected boolean shouldSkipProcessUnknownWord(int unknownWordEndIndex, Position posData) {
|
||||||
|
return unknownWordEndIndex > posData.pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Add unknown words to the position graph.
|
||||||
|
*
|
||||||
|
* @return word length
|
||||||
|
*/
|
||||||
|
protected abstract int processUnknownWord(boolean anyMatches, Position posData)
|
||||||
|
throws IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Backtrace from the provided position, back to the last time we back-traced, accumulating the
|
||||||
|
* resulting tokens to the pending list. The pending list is then in-reverse (last token should be
|
||||||
|
* returned first).
|
||||||
|
*/
|
||||||
|
protected abstract void backtrace(final Position endPosData, final int fromIDX)
|
||||||
|
throws IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Backtrace the n-best path. Subclasses that support n-best paths should implement this method.
|
||||||
|
*/
|
||||||
|
protected void backtraceNBest(final Position endPosData, final boolean useEOS)
|
||||||
|
throws IOException {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Remove duplicated tokens from the pending list; this is needed because {@link
|
||||||
|
* #backtrace(Position, int)} and {@link #backtraceNBest(Position, boolean)} can add same tokens
|
||||||
|
* to the list. Subclasses that support n-best paths should implement this method.
|
||||||
|
*/
|
||||||
|
protected void fixupPendingList() {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Add a token on the minimum cost path to the pending token list. */
|
||||||
|
protected final void add(
|
||||||
|
MorphData morphData,
|
||||||
|
Position fromPosData,
|
||||||
|
int wordPos,
|
||||||
|
int endPos,
|
||||||
|
int wordID,
|
||||||
|
TokenType type,
|
||||||
|
boolean addPenalty)
|
||||||
|
throws IOException {
|
||||||
|
final int wordCost = morphData.getWordCost(wordID);
|
||||||
|
final int leftID = morphData.getLeftId(wordID);
|
||||||
|
int leastCost = Integer.MAX_VALUE;
|
||||||
|
int leastIDX = -1;
|
||||||
|
assert fromPosData.count > 0;
|
||||||
|
for (int idx = 0; idx < fromPosData.count; idx++) {
|
||||||
|
// The number of spaces before the term
|
||||||
|
int numSpaces = wordPos - fromPosData.pos;
|
||||||
|
|
||||||
|
// Cost is path cost so far, plus word cost (added at
|
||||||
|
// end of loop), plus bigram cost and space penalty cost.
|
||||||
|
final int cost =
|
||||||
|
fromPosData.costs[idx]
|
||||||
|
+ costs.get(fromPosData.lastRightID[idx], leftID)
|
||||||
|
+ computeSpacePenalty(morphData, wordID, numSpaces);
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(
|
||||||
|
" fromIDX="
|
||||||
|
+ idx
|
||||||
|
+ ": cost="
|
||||||
|
+ cost
|
||||||
|
+ " (prevCost="
|
||||||
|
+ fromPosData.costs[idx]
|
||||||
|
+ " wordCost="
|
||||||
|
+ wordCost
|
||||||
|
+ " bgCost="
|
||||||
|
+ costs.get(fromPosData.lastRightID[idx], leftID)
|
||||||
|
+ " spacePenalty="
|
||||||
|
+ computeSpacePenalty(morphData, wordID, numSpaces)
|
||||||
|
+ ") leftID="
|
||||||
|
+ leftID
|
||||||
|
// + " leftPOS="
|
||||||
|
// + leftPOS.name()
|
||||||
|
+ ")");
|
||||||
|
}
|
||||||
|
if (cost < leastCost) {
|
||||||
|
leastCost = cost;
|
||||||
|
leastIDX = idx;
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" **");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
leastCost += wordCost;
|
||||||
|
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(
|
||||||
|
" + cost="
|
||||||
|
+ leastCost
|
||||||
|
+ " wordID="
|
||||||
|
+ wordID
|
||||||
|
+ " leftID="
|
||||||
|
+ leftID
|
||||||
|
+ " leastIDX="
|
||||||
|
+ leastIDX
|
||||||
|
+ " toPos="
|
||||||
|
+ endPos
|
||||||
|
+ " toPos.idx="
|
||||||
|
+ positions.get(endPos).count);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (addPenalty && type != TokenType.USER) {
|
||||||
|
final int penalty = computePenalty(fromPosData.pos, endPos - fromPosData.pos);
|
||||||
|
if (VERBOSE) {
|
||||||
|
if (penalty > 0) {
|
||||||
|
System.out.println(" + penalty=" + penalty + " cost=" + (leastCost + penalty));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
leastCost += penalty;
|
||||||
|
}
|
||||||
|
|
||||||
|
positions
|
||||||
|
.get(endPos)
|
||||||
|
.add(
|
||||||
|
leastCost,
|
||||||
|
morphData.getRightId(wordID),
|
||||||
|
fromPosData.pos,
|
||||||
|
wordPos,
|
||||||
|
leastIDX,
|
||||||
|
wordID,
|
||||||
|
type);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the space penalty. */
|
||||||
|
protected int computeSpacePenalty(MorphData morphData, int wordID, int numSpaces) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the penalty for a specific input region */
|
||||||
|
protected int computePenalty(int pos, int length) throws IOException {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getPos() {
|
||||||
|
return pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isEnd() {
|
||||||
|
return end;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<T> getPending() {
|
||||||
|
return pending;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isOutputNBest() {
|
||||||
|
return outputNBest;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void resetBuffer(Reader reader) {
|
||||||
|
buffer.reset(reader);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void resetState() {
|
||||||
|
positions.reset();
|
||||||
|
pos = 0;
|
||||||
|
end = false;
|
||||||
|
lastBackTracePos = 0;
|
||||||
|
pending.clear();
|
||||||
|
|
||||||
|
// Add BOS:
|
||||||
|
positions.get(0).add(0, 0, -1, -1, -1, -1, TokenType.KNOWN);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Holds all back pointers arriving to this position.
 *
 * <p>NOTE: This and subclasses must have no-arg constructor. See {@link WrappedPositionArray}.
 */
public static class Position {

  // Absolute character position in the input that this lattice node represents.
  int pos;

  // Number of valid entries currently stored in the parallel arrays below.
  int count;

  // Parallel arrays, one slot per arriving path; all are grown together in grow().
  // maybe single int array * 5?
  int[] costs = new int[8];
  int[] lastRightID = new int[8];
  int[] backPos = new int[8];
  int[] backWordPos = new int[8];
  int[] backIndex = new int[8];
  int[] backID = new int[8];
  TokenType[] backType = new TokenType[8];

  // Grows every parallel array so it can hold at least count+1 entries.
  private void grow() {
    costs = ArrayUtil.grow(costs, 1 + count);
    lastRightID = ArrayUtil.grow(lastRightID, 1 + count);
    backPos = ArrayUtil.grow(backPos, 1 + count);
    backWordPos = ArrayUtil.grow(backWordPos, 1 + count);
    backIndex = ArrayUtil.grow(backIndex, 1 + count);
    backID = ArrayUtil.grow(backID, 1 + count);

    // NOTE: sneaky: grow separately because
    // ArrayUtil.grow will otherwise pick a different
    // length than the int[]s we just grew:
    final TokenType[] newBackType = new TokenType[backID.length];
    System.arraycopy(backType, 0, newBackType, 0, backType.length);
    backType = newBackType;
  }

  /**
   * Appends one arriving path (back pointer) to this position.
   *
   * @param cost accumulated path cost for this arrival
   * @param lastRightID right context id of the last token on this path
   * @param backPos absolute position this path came from
   * @param backRPos absolute position where the arriving word itself starts (stored as
   *     backWordPos; may differ from backPos — e.g. when whitespace was skipped)
   * @param backIndex index of the chosen entry inside the source position
   * @param backID dictionary word id of the arriving token
   * @param backType dictionary type (known/unknown/user) of the arriving token
   */
  public void add(
      int cost,
      int lastRightID,
      int backPos,
      int backRPos,
      int backIndex,
      int backID,
      TokenType backType) {
    // NOTE: this isn't quite a true Viterbi search,
    // because we should check if lastRightID is
    // already present here, and only update if the new
    // cost is less than the current cost, instead of
    // simply appending. However, that will likely hurt
    // performance (usually we add a lastRightID only once),
    // and it means we actually create the full graph
    // intersection instead of a "normal" Viterbi lattice:
    if (count == costs.length) {
      grow();
    }
    this.costs[count] = cost;
    this.lastRightID[count] = lastRightID;
    this.backPos[count] = backPos;
    this.backWordPos[count] = backRPos;
    this.backIndex[count] = backIndex;
    this.backID[count] = backID;
    this.backType[count] = backType;
    count++;
  }

  /** Discards all stored back pointers; the backing arrays are kept for reuse. */
  public void reset() {
    count = 0;
  }

  /** Returns the absolute input position this node represents. */
  public int getPos() {
    return pos;
  }

  /** Returns the number of back pointers stored at this position. */
  public int getCount() {
    return count;
  }

  /** Sets the number of valid entries (used by callers that trim the arrival list). */
  public void setCount(int count) {
    this.count = count;
  }

  /** Returns the accumulated path cost of entry {@code index}. */
  public int getCost(int index) {
    return costs[index];
  }

  /** Returns the source position of entry {@code index}. */
  public int getBackPos(int index) {
    return backPos[index];
  }

  /** Returns the word start position of entry {@code index}. */
  public int getBackWordPos(int index) {
    return backWordPos[index];
  }

  /** Returns the dictionary word id of entry {@code index}. */
  public int getBackID(int index) {
    return backID[index];
  }

  /** Returns the entry index within the source position for entry {@code index}. */
  public int getBackIndex(int index) {
    return backIndex[index];
  }

  /** Returns the dictionary type of entry {@code index}. */
  public TokenType getBackType(int index) {
    return backType[index];
  }

  /** Returns the right context id of entry {@code index}. */
  public int getLastRightID(int index) {
    return lastRightID[index];
  }
}
/**
 * Holds partial graph (array of positions) for calculating the minimum cost path.
 *
 * <p>Implemented as a circular buffer of {@code U} instances keyed by absolute input position:
 * {@link #get} lazily extends the window forward and {@link #freeBefore} releases positions the
 * backtrace no longer needs, so instances are recycled rather than reallocated.
 */
public static final class WrappedPositionArray<U extends Position> {
  // Circular buffer of recycled Position instances.
  private U[] positions;
  // Concrete element type; needed to reflectively allocate arrays and instances.
  private final Class<U> clazz;

  @SuppressWarnings("unchecked")
  WrappedPositionArray(Class<U> clazz) {
    this.clazz = clazz;
    positions = (U[]) Array.newInstance(clazz, 8);
    for (int i = 0; i < positions.length; i++) {
      try {
        // Requires the no-arg constructor documented on Position.
        positions[i] = clazz.getConstructor().newInstance();
      } catch (ReflectiveOperationException e) {
        // shouldn't happen; Position class should have no-arg constructor.
        throw new IllegalStateException(e);
      }
    }
  }

  // Next array index to write to in positions:
  private int nextWrite;

  // Next position to write:
  private int nextPos;

  // How many valid Position instances are held in the
  // positions array:
  private int count;

  /** Resets every live Position and rewinds the window to absolute position 0. */
  void reset() {
    // Walk backwards from the slot before nextWrite, wrapping at the array start,
    // resetting each of the `count` live instances.
    nextWrite--;
    while (count > 0) {
      if (nextWrite == -1) {
        nextWrite = positions.length - 1;
      }
      positions[nextWrite--].reset();
      count--;
    }
    nextWrite = 0;
    nextPos = 0;
    count = 0;
  }

  /**
   * Get Position instance for this absolute position; this is allowed to be arbitrarily far "in
   * the future" but cannot be before the last freeBefore.
   */
  @SuppressWarnings("unchecked")
  public U get(int pos) {
    // Extend the window forward until `pos` is covered, growing the circular
    // buffer when all slots are in use.
    while (pos >= nextPos) {
      // System.out.println("count=" + count + " vs len=" + positions.length);
      if (count == positions.length) {
        // Position[] newPositions =
        // new Position[ArrayUtil.oversize(1 + count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
        U[] newPositions =
            (U[])
                Array.newInstance(
                    clazz, ArrayUtil.oversize(1 + count, RamUsageEstimator.NUM_BYTES_OBJECT_REF));
        // System.out.println("grow positions " + newPositions.length);
        // Unwrap the circular contents into the front of the new array:
        System.arraycopy(positions, nextWrite, newPositions, 0, positions.length - nextWrite);
        System.arraycopy(positions, 0, newPositions, positions.length - nextWrite, nextWrite);
        for (int i = positions.length; i < newPositions.length; i++) {
          try {
            newPositions[i] = clazz.getConstructor().newInstance();
          } catch (ReflectiveOperationException e) {
            // shouldn't happen
            throw new IllegalStateException(e);
          }
        }
        nextWrite = positions.length;
        positions = newPositions;
      }
      if (nextWrite == positions.length) {
        nextWrite = 0;
      }
      // Should have already been reset:
      assert positions[nextWrite].count == 0;
      positions[nextWrite++].pos = nextPos++;
      count++;
    }
    assert inBounds(pos);
    final int index = getIndex(pos);
    assert positions[index].pos == pos;
    return positions[index];
  }

  /** Returns the first absolute position not yet materialized. */
  int getNextPos() {
    return nextPos;
  }

  // For assert:
  private boolean inBounds(int pos) {
    return pos < nextPos && pos >= nextPos - count;
  }

  // Maps an absolute position to its slot in the circular buffer.
  private int getIndex(int pos) {
    int index = nextWrite - (nextPos - pos);
    if (index < 0) {
      index += positions.length;
    }
    return index;
  }

  /** Releases (resets) every held position strictly before {@code pos} for reuse. */
  public void freeBefore(int pos) {
    final int toFree = count - (nextPos - pos);
    assert toFree >= 0;
    assert toFree <= count;
    int index = nextWrite - count;
    if (index < 0) {
      index += positions.length;
    }
    for (int i = 0; i < toFree; i++) {
      if (index == positions.length) {
        index = 0;
      }
      // System.out.println(" fb idx=" + index);
      positions[index].reset();
      index++;
    }
    count -= toFree;
  }
}
|
}
|
|
@ -1,195 +0,0 @@
|
||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.analysis.ja;
|
|
||||||
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Map;
|
|
||||||
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Position;
|
|
||||||
import org.apache.lucene.analysis.ja.JapaneseTokenizer.WrappedPositionArray;
|
|
||||||
import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
|
|
||||||
import org.apache.lucene.analysis.ja.dict.JaMorphData;
|
|
||||||
import org.apache.lucene.analysis.morph.Dictionary;
|
|
||||||
|
|
||||||
// TODO: would be nice to show 2nd best path in a diff't
|
|
||||||
// color...
|
|
||||||
|
|
||||||
/** Outputs the dot (graphviz) string for the viterbi lattice. */
public class GraphvizFormatter {

  private static final String BOS_LABEL = "BOS";

  private static final String EOS_LABEL = "EOS";

  private static final String FONT_NAME = "Helvetica";

  // Connection costs used to label arcs with their bigram cost.
  private final ConnectionCosts costs;

  // Maps fromNodeID -> toNodeID for arcs on the best path, so they can be highlighted.
  private final Map<String, String> bestPathMap;

  // Accumulates the dot output across incremental backtraces.
  private final StringBuilder sb = new StringBuilder();

  public GraphvizFormatter(ConnectionCosts costs) {
    this.costs = costs;
    this.bestPathMap = new HashMap<>();
    sb.append(formatHeader());
    sb.append(" init [style=invis]\n");
    sb.append(" init -> 0.0 [label=\"" + BOS_LABEL + "\"]\n");
  }

  /** Closes the digraph and returns the complete dot string. */
  public String finish() {
    sb.append(formatTrailer());
    return sb.toString();
  }

  // Backtraces another incremental fragment:
  void onBacktrace(
      JapaneseTokenizer tok,
      WrappedPositionArray positions,
      int lastBackTracePos,
      Position endPosData,
      int fromIDX,
      char[] fragment,
      boolean isEnd) {
    setBestPathMap(positions, lastBackTracePos, endPosData, fromIDX);
    sb.append(formatNodes(tok, positions, lastBackTracePos, endPosData, fragment));
    if (isEnd) {
      sb.append(" fini [style=invis]\n");
      sb.append(" ");
      sb.append(getNodeID(endPosData.pos, fromIDX));
      sb.append(" -> fini [label=\"" + EOS_LABEL + "\"]");
    }
  }

  // Records which arcs make up the best bath:
  private void setBestPathMap(
      WrappedPositionArray positions, int startPos, Position endPosData, int fromIDX) {
    bestPathMap.clear();

    // Walk the back pointers from the end node toward startPos, recording each arc.
    int pos = endPosData.pos;
    int bestIDX = fromIDX;
    while (pos > startPos) {
      final Position posData = positions.get(pos);

      final int backPos = posData.backPos[bestIDX];
      final int backIDX = posData.backIndex[bestIDX];

      final String toNodeID = getNodeID(pos, bestIDX);
      final String fromNodeID = getNodeID(backPos, backIDX);

      // The best path is a simple chain: each node appears at most once on either side.
      assert !bestPathMap.containsKey(fromNodeID);
      assert !bestPathMap.containsValue(toNodeID);
      bestPathMap.put(fromNodeID, toNodeID);
      pos = backPos;
      bestIDX = backIDX;
    }
  }

  // Emits dot nodes and arcs for the lattice slice [startPos+1, endPosData.pos].
  private String formatNodes(
      JapaneseTokenizer tok,
      WrappedPositionArray positions,
      int startPos,
      Position endPosData,
      char[] fragment) {

    // NOTE: local sb intentionally shadows the field; this method returns its own fragment.
    StringBuilder sb = new StringBuilder();
    // Output nodes
    for (int pos = startPos + 1; pos <= endPosData.pos; pos++) {
      final Position posData = positions.get(pos);
      for (int idx = 0; idx < posData.count; idx++) {
        sb.append(" ");
        sb.append(getNodeID(pos, idx));
        sb.append(" [label=\"");
        sb.append(pos);
        sb.append(": ");
        sb.append(posData.lastRightID[idx]);
        sb.append("\"]\n");
      }
    }

    // Output arcs
    for (int pos = endPosData.pos; pos > startPos; pos--) {
      final Position posData = positions.get(pos);
      for (int idx = 0; idx < posData.count; idx++) {
        final Position backPosData = positions.get(posData.backPos[idx]);
        final String toNodeID = getNodeID(pos, idx);
        final String fromNodeID = getNodeID(posData.backPos[idx], posData.backIndex[idx]);

        sb.append(" ");
        sb.append(fromNodeID);
        sb.append(" -> ");
        sb.append(toNodeID);

        final String attrs;
        if (toNodeID.equals(bestPathMap.get(fromNodeID))) {
          // This arc is on best path
          attrs = " color=\"#40e050\" fontcolor=\"#40a050\" penwidth=3 fontsize=20";
        } else {
          attrs = "";
        }

        // Label each arc with surface form, word cost and bigram connection cost.
        final Dictionary<? extends JaMorphData> dict = tok.getDict(posData.backType[idx]);
        final int wordCost = dict.getWordCost(posData.backID[idx]);
        final int bgCost =
            costs.get(
                backPosData.lastRightID[posData.backIndex[idx]],
                dict.getLeftId(posData.backID[idx]));

        final String surfaceForm =
            new String(fragment, posData.backPos[idx] - startPos, pos - posData.backPos[idx]);

        sb.append(" [label=\"");
        sb.append(surfaceForm);
        sb.append(' ');
        sb.append(wordCost);
        if (bgCost >= 0) {
          sb.append('+');
        }
        sb.append(bgCost);
        sb.append("\"");
        sb.append(attrs);
        sb.append("]\n");
      }
    }
    return sb.toString();
  }

  // Emits the digraph preamble and default node/edge styling.
  private String formatHeader() {
    StringBuilder sb = new StringBuilder();
    sb.append("digraph viterbi {\n");
    sb.append(
        " graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\"];\n");
    // sb.append(" // A2 paper size\n");
    // sb.append(" size = \"34.4,16.5\";\n");
    // sb.append(" // try to fill paper\n");
    // sb.append(" ratio = fill;\n");
    sb.append(" edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n");
    sb.append(
        " node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\""
            + FONT_NAME
            + "\" ]\n");

    return sb.toString();
  }

  private String formatTrailer() {
    return "}";
  }

  // Dot node id is "position.entryIndex".
  private String getNodeID(int pos, int idx) {
    return pos + "." + idx;
  }
}
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -31,8 +31,10 @@ import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
|
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
|
||||||
import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
|
import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
|
||||||
|
import org.apache.lucene.analysis.ja.dict.JaMorphData;
|
||||||
import org.apache.lucene.analysis.ja.dict.UserDictionary;
|
import org.apache.lucene.analysis.ja.dict.UserDictionary;
|
||||||
import org.apache.lucene.analysis.ja.tokenattributes.*;
|
import org.apache.lucene.analysis.ja.tokenattributes.*;
|
||||||
|
import org.apache.lucene.analysis.morph.GraphvizFormatter;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.tests.analysis.MockGraphTokenFilter;
|
import org.apache.lucene.tests.analysis.MockGraphTokenFilter;
|
||||||
|
@ -518,7 +520,8 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testLatticeToDot() throws Exception {
|
public void testLatticeToDot() throws Exception {
|
||||||
final GraphvizFormatter gv2 = new GraphvizFormatter(ConnectionCosts.getInstance());
|
final GraphvizFormatter<JaMorphData> gv2 =
|
||||||
|
new GraphvizFormatter<>(ConnectionCosts.getInstance());
|
||||||
final Analyzer analyzer =
|
final Analyzer analyzer =
|
||||||
new Analyzer() {
|
new Analyzer() {
|
||||||
@Override
|
@Override
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,447 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.ko;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.EnumMap;
|
||||||
|
import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
|
||||||
|
import org.apache.lucene.analysis.ko.dict.KoMorphData;
|
||||||
|
import org.apache.lucene.analysis.ko.dict.TokenInfoDictionary;
|
||||||
|
import org.apache.lucene.analysis.ko.dict.UnknownDictionary;
|
||||||
|
import org.apache.lucene.analysis.ko.dict.UserDictionary;
|
||||||
|
import org.apache.lucene.analysis.morph.ConnectionCosts;
|
||||||
|
import org.apache.lucene.analysis.morph.Dictionary;
|
||||||
|
import org.apache.lucene.analysis.morph.GraphvizFormatter;
|
||||||
|
import org.apache.lucene.analysis.morph.MorphData;
|
||||||
|
import org.apache.lucene.analysis.morph.TokenInfoFST;
|
||||||
|
import org.apache.lucene.analysis.morph.TokenType;
|
||||||
|
import org.apache.lucene.util.fst.FST;
|
||||||
|
|
||||||
|
/** {@link org.apache.lucene.analysis.morph.Viterbi} subclass for Korean morphological analysis. */
final class Viterbi
    extends org.apache.lucene.analysis.morph.Viterbi<
        Token, org.apache.lucene.analysis.morph.Viterbi.Position> {

  // Routes a token's TokenType to the dictionary that produced it.
  private final EnumMap<TokenType, Dictionary<? extends KoMorphData>> dictionaryMap =
      new EnumMap<>(TokenType.class);

  private final UnknownDictionary unkDictionary;
  private final CharacterDefinition characterDefinition;

  // Tokenizer options captured at construction time.
  private final boolean discardPunctuation;
  private final KoreanTokenizer.DecompoundMode mode;
  private final boolean outputUnknownUnigrams;

  // Optional graphviz debug output; null unless setGraphvizFormatter was called.
  private GraphvizFormatter<KoMorphData> dotOut;

  Viterbi(
      TokenInfoFST fst,
      FST.BytesReader fstReader,
      TokenInfoDictionary dictionary,
      TokenInfoFST userFST,
      FST.BytesReader userFSTReader,
      UserDictionary userDictionary,
      ConnectionCosts costs,
      UnknownDictionary unkDictionary,
      CharacterDefinition characterDefinition,
      boolean discardPunctuation,
      KoreanTokenizer.DecompoundMode mode,
      boolean outputUnknownUnigrams) {
    super(
        fst, fstReader, dictionary, userFST, userFSTReader, userDictionary, costs, Position.class);
    this.unkDictionary = unkDictionary;
    this.characterDefinition = characterDefinition;
    this.discardPunctuation = discardPunctuation;
    this.mode = mode;
    this.outputUnknownUnigrams = outputUnknownUnigrams;
    // Korean analysis penalizes words preceded by spaces (see computeSpacePenalty)
    // and only keeps the longest user-dictionary match.
    this.enableSpacePenaltyFactor = true;
    this.outputLongestUserEntryOnly = true;
    dictionaryMap.put(TokenType.KNOWN, dictionary);
    dictionaryMap.put(TokenType.UNKNOWN, unkDictionary);
    dictionaryMap.put(TokenType.USER, userDictionary);
  }

  /**
   * Adds unknown-word candidates starting at the current position to the lattice.
   *
   * <p>Invoked by the base class during the forward pass; extends the unknown word across
   * consecutive characters of the same script (subject to punctuation/digit/group splits), then
   * adds one lattice arc per word id found for the character class.
   */
  @Override
  protected int processUnknownWord(boolean anyMatches, Position posData) throws IOException {
    final char firstCharacter = (char) buffer.get(pos);
    if (!anyMatches || characterDefinition.isInvoke(firstCharacter)) {

      // Find unknown match:
      int characterId = characterDefinition.getCharacterClass(firstCharacter);
      // NOTE: copied from UnknownDictionary.lookup:
      int unknownWordLength;
      if (!characterDefinition.isGroup(firstCharacter)) {
        unknownWordLength = 1;
      } else {
        // Extract unknown word. Characters with the same script are considered to be part of
        // unknown word
        unknownWordLength = 1;
        Character.UnicodeScript scriptCode = Character.UnicodeScript.of(firstCharacter);
        final boolean isPunct = isPunctuation(firstCharacter);
        final boolean isDigit = Character.isDigit(firstCharacter);
        for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++) {
          int next = buffer.get(posAhead);
          if (next == -1) {
            break;
          }
          char ch = (char) next;
          int chType = Character.getType(ch);
          Character.UnicodeScript sc = Character.UnicodeScript.of(next);
          boolean sameScript =
              isSameScript(scriptCode, sc)
                  // Non-spacing marks inherit the script of their base character,
                  // following recommendations from UTR #24.
                  || chType == Character.NON_SPACING_MARK;

          if (sameScript
              // split on punctuation
              && isPunctuation(ch, chType) == isPunct
              // split on digit
              && Character.isDigit(ch) == isDigit
              && characterDefinition.isGroup(ch)) {
            unknownWordLength++;
          } else {
            break;
          }
          // Update the script code and character class if the original script
          // is Inherited or Common.
          if (isCommonOrInherited(scriptCode) && isCommonOrInherited(sc) == false) {
            scriptCode = sc;
            characterId = characterDefinition.getCharacterClass(ch);
          }
        }
      }

      unkDictionary.lookupWordIds(
          characterId, wordIdRef); // characters in input text are supposed to be the same
      if (VERBOSE) {
        System.out.println(
            " UNKNOWN word len=" + unknownWordLength + " " + wordIdRef.length + " wordIDs");
      }
      for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
        add(
            unkDictionary.getMorphAttributes(),
            posData,
            pos,
            pos + unknownWordLength,
            wordIdRef.ints[wordIdRef.offset + ofs],
            TokenType.UNKNOWN,
            false);
      }
    }
    // TODO: should return meaningful value?
    return 0;
  }

  /** Enables graphviz (dot) debug output of the lattice; pass null to disable. */
  void setGraphvizFormatter(GraphvizFormatter<KoMorphData> dotOut) {
    this.dotOut = dotOut;
  }

  /**
   * Walks back pointers from {@code endPosData} to the last backtrace point, emitting tokens into
   * {@code pending} (in reverse surface order), then frees consumed buffer/lattice positions.
   *
   * <p>Depending on tokenizer options this emits unknown-word unigrams, decompounded morphemes,
   * and/or whitespace tokens in addition to plain dictionary tokens.
   */
  @Override
  protected void backtrace(Position endPosData, int fromIDX) {
    final int endPos = endPosData.getPos();

    // Nothing new since the last backtrace:
    if (endPos == lastBackTracePos) {
      return;
    }

    if (VERBOSE) {
      System.out.println(
          "\n backtrace: endPos="
              + endPos
              + " pos="
              + pos
              + "; "
              + (pos - lastBackTracePos)
              + " characters; last="
              + lastBackTracePos
              + " cost="
              + endPosData.getCost(fromIDX));
    }

    // Surface characters covered by this backtrace fragment.
    final char[] fragment = buffer.get(lastBackTracePos, endPos - lastBackTracePos);

    if (dotOut != null) {
      dotOut.onBacktrace(
          this::getDict, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
    }

    int pos = endPos;
    int bestIDX = fromIDX;

    // TODO: sort of silly to make Token instances here; the
    // back trace has all info needed to generate the
    // token. So, we could just directly set the attrs,
    // from the backtrace, in incrementToken w/o ever
    // creating Token; we'd have to defer calling freeBefore
    // until after the backtrace was fully "consumed" by
    // incrementToken.

    while (pos > lastBackTracePos) {
      // System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX);
      final Position posData = positions.get(pos);
      assert bestIDX < posData.getCount();

      int backPos = posData.getBackPos(bestIDX);
      int backWordPos = posData.getBackWordPos(bestIDX);
      assert backPos >= lastBackTracePos
          : "backPos=" + backPos + " vs lastBackTracePos=" + lastBackTracePos;
      // the length of the word without the whitespaces at the beginning.
      int length = pos - backWordPos;
      TokenType backType = posData.getBackType(bestIDX);
      int backID = posData.getBackID(bestIDX);
      int nextBestIDX = posData.getBackIndex(bestIDX);
      // the start of the word after the whitespace at the beginning.
      final int fragmentOffset = backWordPos - lastBackTracePos;
      assert fragmentOffset >= 0;

      final Dictionary<? extends KoMorphData> dict = getDict(backType);

      if (outputUnknownUnigrams && backType == TokenType.UNKNOWN) {
        // outputUnknownUnigrams converts unknown word into unigrams:
        for (int i = length - 1; i >= 0; i--) {
          int charLen = 1;
          // Keep surrogate pairs together as a single 2-char "unigram".
          if (i > 0 && Character.isLowSurrogate(fragment[fragmentOffset + i])) {
            i--;
            charLen = 2;
          }
          final DictionaryToken token =
              new DictionaryToken(
                  TokenType.UNKNOWN,
                  unkDictionary.getMorphAttributes(),
                  CharacterDefinition.NGRAM,
                  fragment,
                  fragmentOffset + i,
                  charLen,
                  backWordPos + i,
                  backWordPos + i + charLen);
          pending.add(token);
          if (VERBOSE) {
            System.out.println(" add token=" + pending.get(pending.size() - 1));
          }
        }
      } else {
        final DictionaryToken token =
            new DictionaryToken(
                backType,
                dict.getMorphAttributes(),
                backID,
                fragment,
                fragmentOffset,
                length,
                backWordPos,
                backWordPos + length);
        if (token.getPOSType() == POS.Type.MORPHEME
            || mode == KoreanTokenizer.DecompoundMode.NONE) {
          // Single-morpheme token (or decompounding disabled): emit as-is unless filtered.
          if (shouldFilterToken(token) == false) {
            pending.add(token);
            if (VERBOSE) {
              System.out.println(" add token=" + pending.get(pending.size() - 1));
            }
          }
        } else {
          KoMorphData.Morpheme[] morphemes = token.getMorphemes();
          if (morphemes == null) {
            pending.add(token);
            if (VERBOSE) {
              System.out.println(" add token=" + pending.get(pending.size() - 1));
            }
          } else {
            int endOffset = backWordPos + length;
            int posLen = 0;
            // decompose the compound
            for (int i = morphemes.length - 1; i >= 0; i--) {
              final KoMorphData.Morpheme morpheme = morphemes[i];
              final Token compoundToken;
              if (token.getPOSType() == POS.Type.COMPOUND) {
                // COMPOUND morphemes tile the surface form back-to-front.
                assert endOffset - morpheme.surfaceForm.length() >= 0;
                compoundToken =
                    new DecompoundToken(
                        morpheme.posTag,
                        morpheme.surfaceForm,
                        endOffset - morpheme.surfaceForm.length(),
                        endOffset,
                        backType);
              } else {
                // Inflect/pre-analysis morphemes all share the original token's offsets.
                compoundToken =
                    new DecompoundToken(
                        morpheme.posTag,
                        morpheme.surfaceForm,
                        token.getStartOffset(),
                        token.getEndOffset(),
                        backType);
              }
              if (i == 0 && mode == KoreanTokenizer.DecompoundMode.MIXED) {
                compoundToken.setPositionIncrement(0);
              }
              ++posLen;
              endOffset -= morpheme.surfaceForm.length();
              pending.add(compoundToken);
              if (VERBOSE) {
                System.out.println(" add token=" + pending.get(pending.size() - 1));
              }
            }
            if (mode == KoreanTokenizer.DecompoundMode.MIXED) {
              // MIXED also emits the original compound spanning all its morphemes.
              token.setPositionLength(Math.max(1, posLen));
              pending.add(token);
              if (VERBOSE) {
                System.out.println(" add token=" + pending.get(pending.size() - 1));
              }
            }
          }
        }
      }
      if (discardPunctuation == false && backWordPos != backPos) {
        // Add a token for whitespaces between terms
        int offset = backPos - lastBackTracePos;
        int len = backWordPos - backPos;
        // System.out.println(offset + " " + fragmentOffset + " " + len + " " + backWordPos + " " +
        // backPos);
        unkDictionary.lookupWordIds(characterDefinition.getCharacterClass(' '), wordIdRef);
        DictionaryToken spaceToken =
            new DictionaryToken(
                TokenType.UNKNOWN,
                unkDictionary.getMorphAttributes(),
                wordIdRef.ints[wordIdRef.offset],
                fragment,
                offset,
                len,
                backPos,
                backPos + len);
        pending.add(spaceToken);
      }

      pos = backPos;
      bestIDX = nextBestIDX;
    }

    lastBackTracePos = endPos;

    if (VERBOSE) {
      System.out.println(" freeBefore pos=" + endPos);
    }
    // Notify the circular buffers that we are done with
    // these positions:
    buffer.freeBefore(endPos);
    positions.freeBefore(endPos);
  }

  /** Returns the space penalty associated with the provided {@link POS.Tag}. */
  @Override
  protected int computeSpacePenalty(MorphData morphData, int wordID, int numSpaces) {
    final POS.Tag leftPOS = ((KoMorphData) morphData).getLeftPOS(wordID);
    int spacePenalty = 0;
    if (numSpaces > 0) {
      // TODO we should extract the penalty (left-space-penalty-factor) from the dicrc file.
      switch (leftPOS) {
          // Particles/endings/suffixes should not follow a space; penalize heavily.
        case E:
        case J:
        case VCP:
        case XSA:
        case XSN:
        case XSV:
          spacePenalty = 3000;
          break;
          // All other POS tags: no penalty.
        case IC:
        case MAG:
        case MAJ:
        case MM:
        case NA:
        case NNB:
        case NNBC:
        case NNG:
        case NNP:
        case NP:
        case NR:
        case SC:
        case SE:
        case SF:
        case SH:
        case SL:
        case SN:
        case SP:
        case SSC:
        case SSO:
        case SY:
        case UNA:
        case UNKNOWN:
        case VA:
        case VCN:
        case VSV:
        case VV:
        case VX:
        case XPN:
        case XR:
        default:
          break;
      }
    }
    return spacePenalty;
  }

  /** Returns the dictionary that owns tokens of the given type. */
  Dictionary<? extends KoMorphData> getDict(TokenType type) {
    return dictionaryMap.get(type);
  }

  // True when punctuation tokens should be dropped and this token starts with punctuation.
  private boolean shouldFilterToken(Token token) {
    return discardPunctuation && isPunctuation(token.getSurfaceForm()[token.getOffset()]);
  }

  private static boolean isPunctuation(char ch) {
    return isPunctuation(ch, Character.getType(ch));
  }

  // Classifies a character as punctuation based on its Unicode general category.
  private static boolean isPunctuation(char ch, int cid) {
    // special case for Hangul Letter Araea (interpunct)
    if (ch == 0x318D) {
      return true;
    }
    switch (cid) {
      case Character.SPACE_SEPARATOR:
      case Character.LINE_SEPARATOR:
      case Character.PARAGRAPH_SEPARATOR:
      case Character.CONTROL:
      case Character.FORMAT:
      case Character.DASH_PUNCTUATION:
      case Character.START_PUNCTUATION:
      case Character.END_PUNCTUATION:
      case Character.CONNECTOR_PUNCTUATION:
      case Character.OTHER_PUNCTUATION:
      case Character.MATH_SYMBOL:
      case Character.CURRENCY_SYMBOL:
      case Character.MODIFIER_SYMBOL:
      case Character.OTHER_SYMBOL:
      case Character.INITIAL_QUOTE_PUNCTUATION:
      case Character.FINAL_QUOTE_PUNCTUATION:
        return true;
      default:
        return false;
    }
  }

  private static boolean isCommonOrInherited(Character.UnicodeScript script) {
    return script == Character.UnicodeScript.INHERITED || script == Character.UnicodeScript.COMMON;
  }

  /** Determine if two scripts are compatible. */
  private static boolean isSameScript(
      Character.UnicodeScript scriptOne, Character.UnicodeScript scriptTwo) {
    return scriptOne == scriptTwo
        || isCommonOrInherited(scriptOne)
        || isCommonOrInherited(scriptTwo);
  }
}
|
Loading…
Reference in New Issue