mirror of https://github.com/apache/lucene.git
LUCENE-10102: Add JapaneseCompletionFilter for Input Method-aware auto-completion (#297)
Co-authored-by: Robert Muir <rmuir@apache.org>
parent de45b68c90, commit 4e86df96c0

@@ -21,6 +21,8 @@ New Features

* LUCENE-10096: Add TamilAnalyzer based on the snowball stemmer. (Robert Muir)

* LUCENE-10102: Add JapaneseCompletionFilter for Input Method-aware auto-completion (Tomoko Uchida, Robert Muir, Jun Ohtani)

System Requirements

* LUCENE-8738: Move to Java 11 as minimum Java version.
@@ -0,0 +1,65 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ja;
|
||||
|
||||
import java.io.Reader;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.cjk.CJKWidthCharFilter;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.ja.dict.UserDictionary;
|
||||
|
||||
/**
|
||||
* Analyzer for Japanese completion suggester.
|
||||
*
|
||||
* @see JapaneseCompletionFilter
|
||||
*/
|
||||
public class JapaneseCompletionAnalyzer extends Analyzer {
|
||||
private final JapaneseCompletionFilter.Mode mode;
|
||||
private final UserDictionary userDict;
|
||||
|
||||
/** Creates a new {@code JapaneseCompletionAnalyzer} with default configurations */
|
||||
public JapaneseCompletionAnalyzer() {
|
||||
this(null, JapaneseCompletionFilter.Mode.INDEX);
|
||||
}
|
||||
|
||||
/** Creates a new {@code JapaneseCompletionAnalyzer} */
|
||||
public JapaneseCompletionAnalyzer(UserDictionary userDict, JapaneseCompletionFilter.Mode mode) {
|
||||
this.userDict = userDict;
|
||||
this.mode = mode;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer =
|
||||
new JapaneseTokenizer(userDict, true, true, JapaneseTokenizer.Mode.NORMAL);
|
||||
TokenStream stream = new JapaneseCompletionFilter(tokenizer, mode);
|
||||
stream = new LowerCaseFilter(stream);
|
||||
return new TokenStreamComponents(tokenizer, stream);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReader(String fieldName, Reader reader) {
|
||||
return new CJKWidthCharFilter(reader);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReaderForNormalization(String fieldName, Reader reader) {
|
||||
return new CJKWidthCharFilter(reader);
|
||||
}
|
||||
}
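A minimal usage sketch (not part of this commit) showing how the analyzer's output could be consumed; the field name "suggest" and the sample text are illustrative assumptions, and the expected tokens follow the tests added in this change.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseCompletionAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CompletionAnalyzerExample {
  public static void main(String[] args) throws IOException {
    try (Analyzer analyzer = new JapaneseCompletionAnalyzer();
        TokenStream ts = analyzer.tokenStream("suggest", "東京")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // prints "東京" and then "toukyou" (surface form plus romanized completion token)
        System.out.println(term.toString());
      }
      ts.end();
    }
  }
}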
|
|
@@ -0,0 +1,267 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ja;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.ja.completion.CharSequenceUtils;
|
||||
import org.apache.lucene.analysis.ja.completion.KatakanaRomanizer;
|
||||
import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.CharsRefBuilder;
|
||||
|
||||
/**
 * A {@link org.apache.lucene.analysis.TokenFilter} that adds Japanese romanized tokens to the term
 * attribute. Also, this keeps original tokens (surface forms). The main use of this filter is query
 * auto-completion.
 *
 * <p>Supported romanization forms: (modified) Hepburn-shiki, Kunrei-shiki (Nihon-shiki) and Wāpuro
 * shiki.
 *
 * <p>This does NOT support some romaji forms which are official but not commonly used with
 * Japanese <a href="https://en.wikipedia.org/wiki/Input_method">Input Methods</a>. e.g.: circumflex
 * or macron representing <a href="https://en.wikipedia.org/wiki/Ch%C5%8Donpu">Chōonpu (長音符)</a> are
 * not supported.
 *
 * <p>The romanization behaviour changes according to its {@link Mode}. The default mode is {@link
 * Mode#INDEX}.
 *
 * <p>Note: This filter must be applied AFTER half-width and full-width normalization. Please ensure
 * that a width normalizer such as {@link org.apache.lucene.analysis.cjk.CJKWidthCharFilter} or
 * {@link org.apache.lucene.analysis.cjk.CJKWidthFilter} is included in your analysis chain. IF THE
 * WIDTH NORMALIZATION IS NOT PERFORMED, THIS DOES NOT WORK AS EXPECTED. See also: {@link
 * JapaneseCompletionAnalyzer}.
 */
|
||||
public final class JapaneseCompletionFilter extends TokenFilter {
|
||||
public static final Mode DEFAULT_MODE = Mode.INDEX;
|
||||
|
||||
private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
|
||||
private final ReadingAttribute readingAttr = addAttribute(ReadingAttribute.class);
|
||||
private final PositionIncrementAttribute posIncAtt =
|
||||
addAttribute(PositionIncrementAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
||||
private final CompletionTokenGenerator tokenGenerator;
|
||||
|
||||
/** Completion mode */
|
||||
public enum Mode {
|
||||
/** Simple romanization. Expected to be used when indexing. */
|
||||
INDEX,
|
||||
/** Input Method aware romanization. Expected to be used when querying. */
|
||||
QUERY
|
||||
}
|
||||
|
||||
/** Creates a new {@code JapaneseCompletionFilter} with default configurations */
|
||||
public JapaneseCompletionFilter(TokenStream input) {
|
||||
this(input, DEFAULT_MODE);
|
||||
}
|
||||
|
||||
/** Creates a new {@code JapaneseCompletionFilter} */
|
||||
public JapaneseCompletionFilter(TokenStream input, Mode mode) {
|
||||
super(input);
|
||||
this.tokenGenerator = new CompletionTokenGenerator(mode);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
tokenGenerator.reset();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
mayIncrementToken();
|
||||
if (tokenGenerator.hasNext()) {
|
||||
clearAttributes();
|
||||
CompletionToken token = tokenGenerator.next();
|
||||
termAttr.setEmpty().append(token.term);
|
||||
if (token.isFirst) {
|
||||
posIncAtt.setPositionIncrement(1);
|
||||
} else {
|
||||
posIncAtt.setPositionIncrement(0);
|
||||
}
|
||||
offsetAtt.setOffset(token.startOffset, token.endOffset);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
private void mayIncrementToken() throws IOException {
|
||||
while (!tokenGenerator.hasNext()) {
|
||||
if (input.incrementToken()) {
|
||||
String surface = termAttr.toString();
|
||||
String reading = readingAttr.getReading();
|
||||
int startOffset = offsetAtt.startOffset();
|
||||
int endOffset = offsetAtt.endOffset();
|
||||
if (reading == null && CharSequenceUtils.isKana(surface)) {
|
||||
// use the surface form as reading when possible.
|
||||
reading = CharSequenceUtils.toKatakana(surface);
|
||||
}
|
||||
tokenGenerator.addToken(surface, reading, startOffset, endOffset);
|
||||
} else {
|
||||
if (tokenGenerator.hasPendingToken()) {
|
||||
// a pending token remains.
|
||||
tokenGenerator.finish();
|
||||
} else {
|
||||
// already consumed all tokens. there's no next token to output.
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static class CompletionToken {
|
||||
final String term;
|
||||
final boolean isFirst;
|
||||
final int startOffset;
|
||||
final int endOffset;
|
||||
|
||||
CompletionToken(String term, boolean isFirst, int startOffset, int endOffset) {
|
||||
this.term = term;
|
||||
this.isFirst = isFirst;
|
||||
this.startOffset = startOffset;
|
||||
this.endOffset = endOffset;
|
||||
}
|
||||
}
|
||||
|
||||
private static class CompletionTokenGenerator implements Iterator<CompletionToken> {
|
||||
|
||||
private final Mode mode;
|
||||
|
||||
private List<CompletionToken> outputs;
|
||||
|
||||
private CharsRefBuilder pdgSurface;
|
||||
private CharsRefBuilder pdgReading;
|
||||
private int pdgStartOffset;
|
||||
private int pdgEndOffset;
|
||||
|
||||
CompletionTokenGenerator(Mode mode) {
|
||||
this.mode = mode;
|
||||
outputs = new ArrayList<>();
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
clearPendingToken();
|
||||
outputs.clear();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return outputs.size() > 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public CompletionToken next() {
|
||||
return outputs.remove(0);
|
||||
}
|
||||
|
||||
void addToken(String surface, String reading, int startOffset, int endOffset) {
|
||||
assert surface != null : "surface must not be null.";
|
||||
|
||||
if (hasPendingToken()) {
|
||||
if (mode == Mode.QUERY
|
||||
&& pdgReading != null
|
||||
&& !CharSequenceUtils.isLowercaseAlphabets(pdgSurface.get())
|
||||
&& CharSequenceUtils.isLowercaseAlphabets(surface)) {
|
||||
// words that are in mid-IME composition are split into two tokens by JapaneseTokenizer;
|
||||
// should be recovered when querying.
|
||||
// Note: in this case, the reading attribute is null; use the surface form in place of the
|
||||
// reading.
|
||||
// e.g.: "サッ" + "k" => "サッk", "反" + "sy" => "反sy"
|
||||
pdgSurface.append(surface);
|
||||
pdgReading.append(surface);
|
||||
pdgEndOffset = endOffset;
|
||||
generateOutputs();
|
||||
clearPendingToken();
|
||||
} else if (mode == Mode.QUERY
|
||||
&& CharSequenceUtils.isKana(pdgSurface.get())
|
||||
&& CharSequenceUtils.isKana(surface)) {
|
||||
// words that are composed only of Katakana or Hiragana should be concatenated when
|
||||
// querying.
|
||||
// e.g.: "こい" + "ぬ" => "こいぬ"
|
||||
pdgSurface.append(surface);
|
||||
pdgReading.append(reading);
|
||||
pdgEndOffset = endOffset;
|
||||
} else {
|
||||
generateOutputs();
|
||||
resetPendingToken(surface, reading, startOffset, endOffset);
|
||||
}
|
||||
} else {
|
||||
resetPendingToken(surface, reading, startOffset, endOffset);
|
||||
}
|
||||
}
|
||||
|
||||
void finish() {
|
||||
generateOutputs();
|
||||
clearPendingToken();
|
||||
}
|
||||
|
||||
private void generateOutputs() {
|
||||
// preserve original surface form as an output.
|
||||
outputs.add(new CompletionToken(pdgSurface.toString(), true, pdgStartOffset, pdgEndOffset));
|
||||
// skip readings that cannot be translated to romaji.
|
||||
if (pdgReading == null
|
||||
|| pdgReading.length() == 0
|
||||
|| !CharSequenceUtils.isKatakanaOrHWAlphabets(pdgReading.get())) {
|
||||
return;
|
||||
}
|
||||
// translate the reading to romaji.
|
||||
List<CharsRef> romaji = KatakanaRomanizer.getInstance().romanize(pdgReading.get());
|
||||
for (CharsRef ref : romaji) {
|
||||
// set the same start/end offset as the original surface form for romanized tokens.
|
||||
outputs.add(new CompletionToken(ref.toString(), false, pdgStartOffset, pdgEndOffset));
|
||||
}
|
||||
}
|
||||
|
||||
boolean hasPendingToken() {
|
||||
return pdgSurface != null;
|
||||
}
|
||||
|
||||
void resetPendingToken(
|
||||
CharSequence surface, CharSequence reading, int startOffset, int endOffset) {
|
||||
if (this.pdgSurface == null) {
|
||||
this.pdgSurface = new CharsRefBuilder();
|
||||
} else {
|
||||
this.pdgSurface.clear();
|
||||
}
|
||||
this.pdgSurface.append(surface);
|
||||
if (this.pdgReading == null) {
|
||||
this.pdgReading = new CharsRefBuilder();
|
||||
} else {
|
||||
this.pdgReading.clear();
|
||||
}
|
||||
this.pdgReading.append(reading);
|
||||
this.pdgStartOffset = startOffset;
|
||||
this.pdgEndOffset = endOffset;
|
||||
}
|
||||
|
||||
void clearPendingToken() {
|
||||
this.pdgSurface = null;
|
||||
this.pdgReading = null;
|
||||
this.pdgStartOffset = 0;
|
||||
this.pdgEndOffset = 0;
|
||||
}
|
||||
}
|
||||
}
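As a complement to the javadoc note on width normalization, here is a hedged sketch (not part of this commit) of a query-side analysis chain wired by hand; it mirrors JapaneseCompletionAnalyzer and the test fixtures in this change, with the width normalization applied in initReader before the completion filter sees the text.

import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKWidthCharFilter;
import org.apache.lucene.analysis.ja.JapaneseCompletionFilter;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;

public class QueryCompletionAnalyzerSketch extends Analyzer {
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    // NORMAL segmentation, as used by JapaneseCompletionAnalyzer and the tests in this commit
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.NORMAL);
    TokenStream stream =
        new JapaneseCompletionFilter(tokenizer, JapaneseCompletionFilter.Mode.QUERY);
    return new TokenStreamComponents(tokenizer, stream);
  }

  @Override
  protected Reader initReader(String fieldName, Reader reader) {
    // half-width/full-width normalization must run before romanization
    return new CJKWidthCharFilter(reader);
  }
}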
|
|
@@ -0,0 +1,66 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ja;
|
||||
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import org.apache.lucene.analysis.TokenFilterFactory;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
* Factory for {@link JapaneseCompletionFilter}.
|
||||
*
|
||||
* <p>Supported attributes:
|
||||
*
|
||||
* <ul>
|
||||
* <li>mode: Completion mode. See {@link JapaneseCompletionFilter.Mode}.
|
||||
* </ul>
|
||||
*
|
||||
* @lucene.spi {@value #NAME}
|
||||
*/
|
||||
public class JapaneseCompletionFilterFactory extends TokenFilterFactory {
|
||||
|
||||
/** SPI name */
|
||||
public static final String NAME = "japaneseCompletion";
|
||||
|
||||
private static final String MODE_PARAM = "mode";
|
||||
private final JapaneseCompletionFilter.Mode mode;
|
||||
|
||||
/** Creates a new {@code JapaneseCompletionFilterFactory} */
|
||||
public JapaneseCompletionFilterFactory(Map<String, String> args) {
|
||||
super(args);
|
||||
mode =
|
||||
JapaneseCompletionFilter.Mode.valueOf(
|
||||
get(
|
||||
args,
|
||||
MODE_PARAM,
|
||||
JapaneseCompletionFilter.DEFAULT_MODE.name().toUpperCase(Locale.ROOT)));
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
}
|
||||
|
||||
/** Default ctor for compatibility with SPI */
|
||||
public JapaneseCompletionFilterFactory() {
|
||||
throw defaultCtorException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new JapaneseCompletionFilter(input, mode);
|
||||
}
|
||||
}
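A hedged sketch (not part of this commit) of loading the filter by its SPI name through CustomAnalyzer. "japaneseCompletion" is the NAME defined above; the "japanese" tokenizer and "cjkWidth" filter names are assumptions based on the existing kuromoji and CJK factories.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;

public class CompletionFactoryExample {
  public static Analyzer buildQueryAnalyzer() throws IOException {
    return CustomAnalyzer.builder()
        .withTokenizer("japanese") // assumed SPI name of JapaneseTokenizerFactory
        .addTokenFilter("cjkWidth") // assumed SPI name; width normalization before romanization
        .addTokenFilter("japaneseCompletion", "mode", "QUERY")
        .build();
  }
}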
|
|
@@ -0,0 +1,91 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ja.completion;
|
||||
|
||||
/** Utility functions for {@link org.apache.lucene.analysis.ja.JapaneseCompletionFilter} */
|
||||
public class CharSequenceUtils {
|
||||
|
||||
/** Checks if a char sequence is composed only of lowercase alphabets */
|
||||
public static boolean isLowercaseAlphabets(CharSequence s) {
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
char ch = s.charAt(i);
|
||||
if (!(isHalfWidthLowercaseAlphabet(ch) || isFullWidthLowercaseAlphabet(ch))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/** Checks if a char sequence is composed only of Katakana or hiragana */
|
||||
public static boolean isKana(CharSequence s) {
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
char ch = s.charAt(i);
|
||||
if (!(isHiragana(ch) || isKatakana(ch))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/** Checks if a char sequence is composed only of Katakana or half-width lowercase alphabets */
|
||||
public static boolean isKatakanaOrHWAlphabets(CharSequence ref) {
|
||||
for (int i = 0; i < ref.length(); i++) {
|
||||
char ch = ref.charAt(i);
|
||||
if (!isKatakana(ch) && !isHalfWidthLowercaseAlphabet(ch)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/** Checks if a char is a Hiragana */
|
||||
private static boolean isHiragana(char ch) {
|
||||
return ch >= 0x3040 && ch <= 0x309f;
|
||||
}
|
||||
|
||||
/** Checks if a char is a Katakana */
|
||||
private static boolean isKatakana(char ch) {
|
||||
return ch >= 0x30a0 && ch <= 0x30ff;
|
||||
}
|
||||
|
||||
/** Checks if a char is a half-width lowercase alphabet */
|
||||
private static boolean isHalfWidthLowercaseAlphabet(char ch) {
|
||||
return ch >= 0x61 && ch <= 0x7a;
|
||||
}
|
||||
|
||||
/** Checks if a char is a full-width lowercase alphabet */
|
||||
public static boolean isFullWidthLowercaseAlphabet(char ch) {
|
||||
return ch >= 0xff41 && ch <= 0xff5a;
|
||||
}
|
||||
|
||||
/** Converts all Hiragana in a string into Katakana */
|
||||
public static String toKatakana(CharSequence s) {
|
||||
char[] chars = new char[s.length()];
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
char ch = s.charAt(i);
|
||||
// if the character is from 'ぁ' to 'ゖ', or is 'ゝ' or 'ゞ', it can be converted to katakana.
|
||||
if (ch >= 0x3041 && ch <= 0x3096 || ch == 0x309d || ch == 0x309e) {
|
||||
chars[i] = (char) (ch + 0x60);
|
||||
} else {
|
||||
chars[i] = ch;
|
||||
}
|
||||
}
|
||||
return new String(chars);
|
||||
}
|
||||
|
||||
private CharSequenceUtils() {}
|
||||
}
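A small illustrative sketch (not part of this commit) of the expected behaviour of these utilities; the sample strings are arbitrary.

import org.apache.lucene.analysis.ja.completion.CharSequenceUtils;

public class CharSequenceUtilsExample {
  public static void main(String[] args) {
    System.out.println(CharSequenceUtils.isKana("すし")); // true: all Hiragana
    System.out.println(CharSequenceUtils.isKana("寿司")); // false: contains Kanji
    System.out.println(CharSequenceUtils.toKatakana("すし")); // スシ: each Hiragana code point shifted by 0x60
  }
}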
|
|
@@ -0,0 +1,193 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ja.completion;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.CharsRefBuilder;
|
||||
|
||||
/**
 * Converts a Katakana string to <a
 * href="https://en.wikipedia.org/wiki/Romanization_of_Japanese">Romaji</a> using the pre-defined
 * Katakana-Romaji mapping rules. Internally, this repeatedly performs a prefix match of the given
 * char sequence against the pre-built keystroke arrays until it reaches the end of the sequence,
 * or there are no matched keystrokes.
 */
|
||||
public class KatakanaRomanizer {
|
||||
private static final String ROMAJI_MAP_FILE = "romaji_map.txt";
|
||||
|
||||
private static KatakanaRomanizer INSTANCE;
|
||||
|
||||
static {
|
||||
// Build romaji-map and keystroke arrays from the pre-defined Katakana-Romaji mapping file.
|
||||
try (InputStreamReader is =
|
||||
new InputStreamReader(
|
||||
KatakanaRomanizer.class.getResourceAsStream(ROMAJI_MAP_FILE),
|
||||
Charset.forName("UTF-8"));
|
||||
BufferedReader ir = new BufferedReader(is)) {
|
||||
Map<CharsRef, List<CharsRef>> romajiMap = new HashMap<>();
|
||||
String line;
|
||||
while ((line = ir.readLine()) != null) {
|
||||
if (line.startsWith("#")) {
|
||||
continue;
|
||||
}
|
||||
String[] cols = line.trim().split(",");
|
||||
if (cols.length < 2) {
|
||||
continue;
|
||||
}
|
||||
CharsRef prefix = new CharsRef(cols[0]);
|
||||
romajiMap.put(prefix, new ArrayList<>());
|
||||
for (int i = 1; i < cols.length; i++) {
|
||||
romajiMap.get(prefix).add(new CharsRef(cols[i]));
|
||||
}
|
||||
}
|
||||
|
||||
Set<CharsRef> keystrokeSet = romajiMap.keySet();
|
||||
int maxKeystrokeLength = keystrokeSet.stream().mapToInt(CharsRef::length).max().getAsInt();
|
||||
CharsRef[][] keystrokes = new CharsRef[maxKeystrokeLength][];
|
||||
for (int len = 0; len < maxKeystrokeLength; len++) {
|
||||
final int l = len;
|
||||
keystrokes[l] =
|
||||
keystrokeSet.stream().filter(k -> k.length - 1 == l).toArray(CharsRef[]::new);
|
||||
}
|
||||
for (CharsRef[] ks : keystrokes) {
|
||||
// keystroke array must be sorted in ascending order for binary search.
|
||||
Arrays.sort(ks);
|
||||
}
|
||||
|
||||
INSTANCE = new KatakanaRomanizer(keystrokes, romajiMap);
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private final CharsRef[][] keystrokes;
|
||||
private final Map<CharsRef, List<CharsRef>> romajiMap;
|
||||
|
||||
/** Returns the singleton instance of {@code KatakanaRomanizer} */
|
||||
public static KatakanaRomanizer getInstance() {
|
||||
return INSTANCE;
|
||||
}
|
||||
|
||||
private KatakanaRomanizer(CharsRef[][] keystrokes, Map<CharsRef, List<CharsRef>> romajiMap) {
|
||||
this.keystrokes = keystrokes;
|
||||
this.romajiMap = romajiMap;
|
||||
}
|
||||
|
||||
/**
|
||||
* Translates a sequence of katakana to romaji. An input can produce multiple outputs because a
|
||||
* keystroke can be mapped to multiple romaji forms.
|
||||
*/
|
||||
public List<CharsRef> romanize(CharsRef input) {
|
||||
assert CharSequenceUtils.isKatakanaOrHWAlphabets(input);
|
||||
|
||||
List<CharsRefBuilder> pendingOutputs = new ArrayList<>();
|
||||
int pos = 0;
|
||||
while (pos < input.length) {
|
||||
// Greedily looks up the longest matched keystroke.
|
||||
// e.g.: Consider input="キョウ", then there are two matched keystrokes (romaji mapping rules)
|
||||
// "キ" -> "ki" and "キョ" -> "kyo". Only the longest one "キョ" will be selected.
|
||||
MatchedKeystroke matched = longestKeystrokeMatch(input, pos);
|
||||
if (matched == null) {
|
||||
break;
|
||||
}
|
||||
|
||||
List<CharsRef> candidates =
|
||||
romajiMap.get(keystrokes[matched.keystrokeLen - 1][matched.keystrokeIndex]);
|
||||
|
||||
if (pendingOutputs.size() == 0) {
|
||||
// There is no pending output.
|
||||
// Add the matched keystrokes to pending outputs list.
|
||||
for (CharsRef cref : candidates) {
|
||||
CharsRefBuilder output = new CharsRefBuilder();
|
||||
output.copyChars(cref);
|
||||
pendingOutputs.add(output);
|
||||
}
|
||||
} else if (candidates.size() == 1) {
|
||||
// There are one or more pending output(s) and one matched keystroke.
|
||||
// Append the matched keystroke to all pending outputs.
|
||||
// e.g.: Consider we already have two pending outputs "shi" and "si" and the matched
|
||||
// keystroke "ka";
|
||||
// then results are "shika" and "sika".
|
||||
CharsRef cref = candidates.get(0);
|
||||
for (CharsRefBuilder pdgOutput : pendingOutputs) {
|
||||
pdgOutput.append(cref.chars, 0, cref.length);
|
||||
}
|
||||
} else {
|
||||
// There are one or more pending output(s) and multiple matched keystrokes.
|
||||
// Combine the matched keystrokes to all pending outputs.
|
||||
// e.g.: Consider we already have two pending outputs "shi" and "si" and the matched
|
||||
// keystroke "n" and "nn".
|
||||
// To produce all possible keystroke patterns, result outputs should be "shin", "shinn",
|
||||
// "sin" and "sinn".
|
||||
List<CharsRefBuilder> outputs = new ArrayList<>();
|
||||
for (CharsRef cref : candidates) {
|
||||
for (CharsRefBuilder pdgOutput : pendingOutputs) {
|
||||
CharsRefBuilder buffer = new CharsRefBuilder();
|
||||
buffer.copyChars(pdgOutput.chars(), 0, pdgOutput.length());
|
||||
buffer.append(cref.chars, cref.offset, cref.length);
|
||||
outputs.add(buffer);
|
||||
}
|
||||
}
|
||||
// update the pending outputs
|
||||
pendingOutputs = outputs;
|
||||
}
|
||||
|
||||
// proceed to the next input position
|
||||
pos += matched.keystrokeLen;
|
||||
}
|
||||
|
||||
if (pos < input.length) {
|
||||
// add the remnants (that cannot be mapped to any romaji) as suffix
|
||||
for (CharsRefBuilder output : pendingOutputs) {
|
||||
output.append(input.chars, pos, input.length - pos);
|
||||
}
|
||||
}
|
||||
return pendingOutputs.stream().map(CharsRefBuilder::get).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private MatchedKeystroke longestKeystrokeMatch(CharsRef input, int inputOffset) {
|
||||
for (int len = Math.min(input.length - inputOffset, keystrokes.length); len > 0; len--) {
|
||||
CharsRef ref = new CharsRef(input.chars, inputOffset, len);
|
||||
int index = Arrays.binarySearch(keystrokes[len - 1], ref);
|
||||
if (index >= 0) {
|
||||
return new MatchedKeystroke(len, index);
|
||||
}
|
||||
}
|
||||
// there's no matched keystroke
|
||||
return null;
|
||||
}
|
||||
|
||||
private static class MatchedKeystroke {
|
||||
final int keystrokeLen;
|
||||
final int keystrokeIndex;
|
||||
|
||||
MatchedKeystroke(int keystrokeLen, int keystrokeIndex) {
|
||||
this.keystrokeLen = keystrokeLen;
|
||||
this.keystrokeIndex = keystrokeIndex;
|
||||
}
|
||||
}
|
||||
}
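A hedged sketch (not part of this commit) of how the romanizer is used; the expected outputs follow TestKatakanaRomanizer in this change.

import java.util.List;
import org.apache.lucene.analysis.ja.completion.KatakanaRomanizer;
import org.apache.lucene.util.CharsRef;

public class KatakanaRomanizerExample {
  public static void main(String[] args) {
    KatakanaRomanizer romanizer = KatakanaRomanizer.getInstance();
    // シ maps to both "si" and "shi", so two keystroke candidates come back.
    List<CharsRef> out = romanizer.romanize(new CharsRef("ハシ"));
    System.out.println(out); // contains hasi and hashi
    // longest match: キョ -> kyo is preferred over キ -> ki.
    System.out.println(romanizer.romanize(new CharsRef("トウキョウ"))); // contains toukyou
  }
}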
|
|
@@ -0,0 +1,19 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/** Utilities for {@link org.apache.lucene.analysis.ja.JapaneseCompletionFilter} */
|
||||
package org.apache.lucene.analysis.ja.completion;
|
|
@@ -14,6 +14,7 @@
|
|||
# limitations under the License.
|
||||
|
||||
org.apache.lucene.analysis.ja.JapaneseBaseFormFilterFactory
|
||||
org.apache.lucene.analysis.ja.JapaneseCompletionFilterFactory
|
||||
org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilterFactory
|
||||
org.apache.lucene.analysis.ja.JapaneseNumberFilterFactory
|
||||
org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilterFactory
|
||||
|
|
|
@@ -0,0 +1,344 @@
|
|||
# mapping rules of katakana (a unit of keystroke) to a list of acceptable romanizations.
# longest-match is used to find entries in this list.
# covers romanization systems: modified Hepburn-shiki, Kunrei-shiki (Nihon-shiki), and Wāpuro shiki.
# note: this does not strictly comply with the romanization systems listed above,
# but tries to cover possible keystrokes supported by various Input Methods.
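# example (illustrative, not an additional rule): for the input キョウ the two-character entry
# キョ,kyo is matched before キ,ki, so the romanization is "kyou"; entries with several columns,
# such as シ,si,shi, yield multiple keystroke candidates for the same katakana unit.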
|
||||
|
||||
ア,a
|
||||
イ,i
|
||||
ウ,u
|
||||
エ,e
|
||||
オ,o
|
||||
カ,ka
|
||||
キ,ki
|
||||
ク,ku
|
||||
ケ,ke
|
||||
コ,ko
|
||||
キャ,kya
|
||||
キュ,kyu
|
||||
キョ,kyo
|
||||
ガ,ga
|
||||
ギ,gi
|
||||
グ,gu
|
||||
ゲ,ge
|
||||
ゴ,go
|
||||
ギャ,gya
|
||||
ギュ,gyu
|
||||
ギョ,gyo
|
||||
サ,sa
|
||||
シ,si,shi
|
||||
ス,su
|
||||
セ,se
|
||||
ソ,so
|
||||
シャ,sya,sha
|
||||
シュ,syu,shu
|
||||
シェ,sye,she
|
||||
ショ,syo,sho
|
||||
ザ,za
|
||||
ジ,zi,ji
|
||||
ズ,zu
|
||||
ゼ,ze
|
||||
ゾ,zo
|
||||
ジャ,zya,ja
|
||||
ジュ,zyu,ju
|
||||
ジェ,zye,je
|
||||
ジョ,zyo,jo
|
||||
タ,ta
|
||||
チ,ti,chi
|
||||
ツ,tu,tsu
|
||||
テ,te
|
||||
ト,to
|
||||
チャ,tya,cha,cya
|
||||
チュ,tyu,chu,cyu
|
||||
チョ,tyo,cho,cyo
|
||||
ダ,da
|
||||
ヂ,di,zi,ji
|
||||
ヅ,du,zu
|
||||
デ,de
|
||||
ド,do
|
||||
ヂャ,dya,zya,ja
|
||||
ヂュ,dyu,zyu,ju
|
||||
ヂョ,dyo,zyo,jo
|
||||
ナ,na
|
||||
ニ,ni
|
||||
ヌ,nu
|
||||
ネ,ne
|
||||
ノ,no
|
||||
ニャ,nya
|
||||
ニュ,nyu
|
||||
ニョ,nyo
|
||||
ハ,ha
|
||||
ヒ,hi
|
||||
フ,hu,fu
|
||||
ヘ,he
|
||||
ホ,ho
|
||||
ヒャ,hya
|
||||
ヒュ,hyu
|
||||
ヒョ,hyo
|
||||
バ,ba
|
||||
ビ,bi
|
||||
ブ,bu
|
||||
ベ,be
|
||||
ボ,bo
|
||||
ビャ,bya
|
||||
ビュ,byu
|
||||
ビョ,byo
|
||||
パ,pa
|
||||
ピ,pi
|
||||
プ,pu
|
||||
ペ,pe
|
||||
ポ,po
|
||||
ピャ,pya
|
||||
ピュ,pyu
|
||||
ピョ,pyo
|
||||
マ,ma
|
||||
ミ,mi
|
||||
ム,mu
|
||||
メ,me
|
||||
モ,mo
|
||||
ミャ,mya
|
||||
ミュ,myu
|
||||
ミョ,myo
|
||||
ヤ,ya
|
||||
ユ,yu
|
||||
ヨ,yo
|
||||
ラ,ra
|
||||
リ,ri
|
||||
ル,ru
|
||||
レ,re
|
||||
ロ,ro
|
||||
リャ,rya
|
||||
リュ,ryu
|
||||
リョ,ryo
|
||||
ワ,wa
|
||||
ウィ,wi
|
||||
ヰ,wi
|
||||
ウェ,we
|
||||
ヱ,we
|
||||
ヲ,wo,o
|
||||
ン,n,nn
|
||||
|
||||
クァ,kwa,kuxa
|
||||
クィ,kwi,kuxi
|
||||
クゥ,kwu,kuxu
|
||||
クェ,kwe,kuxe
|
||||
クォ,kwo,kuxo
|
||||
グァ,gwa,guxa
|
||||
グィ,gwi,guxi
|
||||
グゥ,gwu,guxu
|
||||
グェ,gwe,guxe
|
||||
グォ,gwo,guxo
|
||||
スァ,swa,suxa
|
||||
スィ,swi,suxi
|
||||
スゥ,swu,suxu
|
||||
スェ,swe,suxe
|
||||
スォ,swo,suxo
|
||||
トァ,twa,toxa
|
||||
トィ,twi,toxi
|
||||
トゥ,twu,toxu
|
||||
トェ,twe,toxe
|
||||
トォ,two,toxo
|
||||
ドァ,dwa,doxa
|
||||
ドィ,dwi,
|
||||
ドゥ,dwu,doxu
|
||||
ドェ,dwe,doxe
|
||||
ドォ,dwo,doxo
|
||||
ファ,hwa,fa,huxa
|
||||
フィ,hwi,fi,huxi
|
||||
フェ,hwe,fe,huxe
|
||||
フォ,hwo,fo,huxo
|
||||
ヴァ,va,vuxa
|
||||
ヴィ,vi,vuxi
|
||||
ヴ,vu
|
||||
ヴェ,ve,vuxe
|
||||
ヴォ,vo,vuxo
|
||||
テァ,tha,texa
|
||||
ティ,thi,texi
|
||||
テェ,the,texe
|
||||
テャ,tha,texya
|
||||
テュ,thu,texyu
|
||||
テョ,tho,texyo
|
||||
フャ,fya,huxya,fuxya
|
||||
フュ,fyu,huxyu,fuxyu
|
||||
フョ,fyo,huxyo,fuxyo
|
||||
ヴャ,vya,vuxya
|
||||
ヴュ,vyu,vuxyu
|
||||
ヴョ,vyo,vuxyo
|
||||
|
||||
ッカ,kka
|
||||
ッキ,kki
|
||||
ック,kku
|
||||
ッケ,kke
|
||||
ッコ,kko
|
||||
ッキャ,kkya
|
||||
ッキュ,kkyu
|
||||
ッキョ,kkyo
|
||||
ッガ,gga
|
||||
ッギ,ggi
|
||||
ッグ,ggu
|
||||
ッゲ,gge
|
||||
ッゴ,ggo
|
||||
ッギャ,ggya
|
||||
ッギュ,ggyu
|
||||
ッギョ,ggyo
|
||||
ッサ,ssa
|
||||
ッシ,ssi
|
||||
ッス,ssu
|
||||
ッセ,sse
|
||||
ッソ,sso
|
||||
ッシャ,ssya,ssha
|
||||
ッシュ,ssyu,sshu
|
||||
ッショ,ssyo,ssho
|
||||
ッザ,zza
|
||||
ッジ,zzi,jji
|
||||
ッズ,zzu
|
||||
ッゼ,zze
|
||||
ッゾ,zzo
|
||||
ッジャ,zzya,jja
|
||||
ッジュ,zzyu,jju
|
||||
ッジョ,zzyo,jjo
|
||||
ッタ,tta
|
||||
ッチ,tti
|
||||
ッツ,ttu
|
||||
ッテ,tte
|
||||
ット,tto
|
||||
ッチャ,ttya,ccha,ccya
|
||||
ッチュ,ttyu,cchu,ccyu
|
||||
ッチョ,ttyo,ccho,ccyo
|
||||
ッダ,dda
|
||||
ッヂ,ddi,
|
||||
ッヅ,ddu
|
||||
ッデ,dde
|
||||
ッド,ddo
|
||||
ッヂャ,ddya
|
||||
ッヂュ,ddyu
|
||||
ッヂョ,ddyo
|
||||
ッハ,hha
|
||||
ッヒ,hhi
|
||||
ッフ,hhu,ffu
|
||||
ッへ,hhe
|
||||
ッホ,hho
|
||||
ッヒャ,hhya
|
||||
ッヒュ,hhyu
|
||||
ッヒョ,hhyo
|
||||
ッバ,bba
|
||||
ッビ,bbi
|
||||
ッブ,bbu
|
||||
ッベ,bbe
|
||||
ッボ,bbo
|
||||
ッビャ,bbya
|
||||
ッビュ,bbyu
|
||||
ッビョ,bbyo
|
||||
ッパ,ppa
|
||||
ッピ,ppi
|
||||
ップ,ppu
|
||||
ッペ,ppe
|
||||
ッポ,ppo
|
||||
ッピャ,ppya
|
||||
ッピュ,ppyu
|
||||
ッピョ,ppyo
|
||||
ッマ,mma
|
||||
ッミ,mmi
|
||||
ッム,mmu
|
||||
ッメ,mme
|
||||
ッモ,mmo
|
||||
ッミャ,mmya
|
||||
ッミュ,mmyu
|
||||
ッミョ,mmyo
|
||||
ッヤ,yya
|
||||
ッイ,yyi
|
||||
ッユ,yyu
|
||||
ッイェ,yye
|
||||
ッヨ,yyo
|
||||
ッラ,rra
|
||||
ッリ,rri
|
||||
ッル,rru
|
||||
ッレ,rre
|
||||
ッロ,rro
|
||||
ッリャ,rrya
|
||||
ッリュ,rryu
|
||||
ッリョ,rryo
|
||||
ッワ,wwa
|
||||
ッウィ,wwi
|
||||
ッウ,wwu
|
||||
ッウェ,wwe
|
||||
ッヲ,wwo
|
||||
|
||||
ックァ,kkwa,kkuxa
|
||||
ックィ,kkwi,kkuxi
|
||||
ックゥ,kkwu,kkuxu
|
||||
ックェ,kkwe,kkuxe
|
||||
ックォ,kkwo,kkuxo
|
||||
ッグァ,ggwa,gguxa
|
||||
ッグィ,ggwi,gguxi
|
||||
ッグゥ,ggwu,gguxu
|
||||
ッグェ,ggwe,gguxe
|
||||
ッグォ,ggwo,gguxo
|
||||
ッスァ,sswa,ssuxa
|
||||
ッスィ,sswi,ssuxi
|
||||
ッスゥ,sswu,ssuxu
|
||||
ッスェ,sswe,ssuxe
|
||||
ッスォ,sswo,suxo
|
||||
ットァ,ttwa,ttoxa
|
||||
ットィ,ttwi,ttoxi
|
||||
ットゥ,ttwu,ttoxu
|
||||
ットェ,ttwe,ttoxe
|
||||
ットォ,ttwo,ttoxo
|
||||
ッドァ,ddwa,ddoxa
|
||||
ッドィ,ddwi,ddoxi
|
||||
ッドゥ,ddwu,ddoxi
|
||||
ッドェ,ddwe,ddoxe
|
||||
ッドォ,ddwo,ddoxo
|
||||
ッファ,hhwa,ffa,hhuxa,ffuxa
|
||||
ッフィ,hhwi,ffi,hhuxi,ffuxi
|
||||
ッフェ,hhwe,ffe,hhuxe,ffuxe
|
||||
ッフォ,hhwo,ffo,hhuxo,ffuxo
|
||||
ッヴァ,vva,vvuxa
|
||||
ッヴィ,vvi,vvuxi
|
||||
ッヴ,vvu
|
||||
ッヴェ,vve,vvuxe
|
||||
ッヴォ,vvo,vvuxo
|
||||
ッテァ,ttha,ttexa
|
||||
ッティ,tthi,ttexi
|
||||
ッテェ,tthe,ttexe
|
||||
ッテャ,ttha,ttexya
|
||||
ッテュ,tthu,ttexyu
|
||||
ッテョ,ttho,ttexyo
|
||||
ッフャ,ffya,hhuxya,ffuxya
|
||||
ッフュ,ffyu,hhuxyu,ffuxyu
|
||||
ッフョ,ffyo,hhuxyo,ffuxyo
|
||||
ッヴャ,vvya,vvuxya
|
||||
ッヴュ,vvyu,vvuxyu
|
||||
ッヴョ,vvyo,vvuxyo
|
||||
|
||||
ァ,xa
|
||||
ィ,xi
|
||||
ゥ,xu
|
||||
ェ,xe
|
||||
ォ,xo
|
||||
ヵ,xka
|
||||
ヶ,xke
|
||||
ッ,xtu
|
||||
ャ,xya
|
||||
ュ,xyu
|
||||
ョ,xyo
|
||||
|
||||
ッk,kk
|
||||
ッg,gg
|
||||
ッs,ss
|
||||
ッz,zz
|
||||
ッt,tt
|
||||
ッd,dd
|
||||
ッh,hh
|
||||
ッb,bb
|
||||
ッp,pp
|
||||
ッm,mm
|
||||
ッy,yy
|
||||
ッr,rr
|
||||
ッw,ww
|
||||
|
||||
# below are characters that should be kept but have no explicit romanization rules.
|
||||
# Chōonpu (Katakana-Hiragana Prolonged Sound Mark)
|
||||
ー,ー
|
||||
# Interpunct (Middle Dot)
|
||||
・,・
|
|
@@ -0,0 +1,72 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ja;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Random;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.junit.Test;
|
||||
|
||||
public class TestJapaneseCompletionAnalyzer extends BaseTokenStreamTestCase {
|
||||
|
||||
@Test
|
||||
public void testCompletionDefault() throws IOException {
|
||||
// mode=INDEX (default)
|
||||
Analyzer analyzer = new JapaneseCompletionAnalyzer();
|
||||
assertAnalyzesTo(
|
||||
analyzer,
|
||||
"東京",
|
||||
new String[] {"東京", "toukyou"},
|
||||
new int[] {0, 0},
|
||||
new int[] {2, 2},
|
||||
new int[] {1, 0});
|
||||
analyzer.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCompletionQuery() throws IOException {
|
||||
// mode=QUERY
|
||||
Analyzer analyzer = new JapaneseCompletionAnalyzer(null, JapaneseCompletionFilter.Mode.QUERY);
|
||||
assertAnalyzesTo(
|
||||
analyzer,
|
||||
"東京t",
|
||||
new String[] {"東京t", "toukyout"},
|
||||
new int[] {0, 0},
|
||||
new int[] {3, 3},
|
||||
new int[] {1, 0});
|
||||
analyzer.close();
|
||||
}
|
||||
|
||||
/** blast random strings against the analyzer */
|
||||
@Test
|
||||
public void testRandom() throws IOException {
|
||||
Random random = random();
|
||||
final Analyzer a = new JapaneseCompletionAnalyzer();
|
||||
checkRandomData(random, a, atLeast(100));
|
||||
a.close();
|
||||
}
|
||||
|
||||
/** blast some random large strings through the analyzer */
|
||||
@Test
|
||||
public void testRandomHugeStrings() throws Exception {
|
||||
Random random = random();
|
||||
final Analyzer a = new JapaneseCompletionAnalyzer();
|
||||
checkRandomData(random, a, 2 * RANDOM_MULTIPLIER, 8192);
|
||||
a.close();
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,271 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ja;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.cjk.CJKWidthCharFilter;
|
||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.junit.Test;
|
||||
|
||||
public class TestJapaneseCompletionFilter extends BaseTokenStreamTestCase {
|
||||
private Analyzer indexAnalyzer;
|
||||
private Analyzer queryAnalyzer;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
indexAnalyzer =
|
||||
new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.NORMAL);
|
||||
return new TokenStreamComponents(
|
||||
tokenizer,
|
||||
new JapaneseCompletionFilter(tokenizer, JapaneseCompletionFilter.Mode.INDEX));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReader(String fieldName, Reader reader) {
|
||||
return new CJKWidthCharFilter(reader);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReaderForNormalization(String fieldName, Reader reader) {
|
||||
return new CJKWidthCharFilter(reader);
|
||||
}
|
||||
};
|
||||
queryAnalyzer =
|
||||
new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.NORMAL);
|
||||
return new TokenStreamComponents(
|
||||
tokenizer,
|
||||
new JapaneseCompletionFilter(tokenizer, JapaneseCompletionFilter.Mode.QUERY));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReader(String fieldName, Reader reader) {
|
||||
return new CJKWidthCharFilter(reader);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReaderForNormalization(String fieldName, Reader reader) {
|
||||
return new CJKWidthCharFilter(reader);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public void tearDown() throws Exception {
|
||||
IOUtils.close(indexAnalyzer);
|
||||
IOUtils.close(queryAnalyzer);
|
||||
super.tearDown();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCompletionIndex() throws IOException {
|
||||
assertAnalyzesTo(
|
||||
indexAnalyzer,
|
||||
"東京",
|
||||
new String[] {"東京", "toukyou"},
|
||||
new int[] {0, 0},
|
||||
new int[] {2, 2},
|
||||
new int[] {1, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
indexAnalyzer,
|
||||
"東京都",
|
||||
new String[] {"東京", "toukyou", "都", "to"},
|
||||
new int[] {0, 0, 2, 2},
|
||||
new int[] {2, 2, 3, 3},
|
||||
new int[] {1, 0, 1, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
indexAnalyzer,
|
||||
"ドラえもん",
|
||||
new String[] {"ドラえもん", "doraemon", "doraemonn"},
|
||||
new int[] {0, 0, 0},
|
||||
new int[] {5, 5, 5},
|
||||
new int[] {1, 0, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
indexAnalyzer,
|
||||
"ソースコード",
|
||||
new String[] {"ソース", "soーsu", "コード", "koーdo"},
|
||||
new int[] {0, 0, 3, 3},
|
||||
new int[] {3, 3, 6, 6},
|
||||
new int[] {1, 0, 1, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
indexAnalyzer,
|
||||
"反社会的勢力",
|
||||
new String[] {"反", "han", "hann", "社会", "syakai", "shakai", "的", "teki", "勢力", "seiryoku"},
|
||||
new int[] {0, 0, 0, 1, 1, 1, 3, 3, 4, 4},
|
||||
new int[] {1, 1, 1, 3, 3, 3, 4, 4, 6, 6},
|
||||
new int[] {1, 0, 0, 1, 0, 0, 1, 0, 1, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
indexAnalyzer, "々", new String[] {"々"}, new int[] {0}, new int[] {1}, new int[] {1});
|
||||
|
||||
assertAnalyzesTo(
|
||||
indexAnalyzer,
|
||||
"是々",
|
||||
new String[] {"是", "ze", "々"},
|
||||
new int[] {0, 0, 1},
|
||||
new int[] {1, 1, 2},
|
||||
new int[] {1, 0, 1});
|
||||
|
||||
assertAnalyzesTo(
|
||||
indexAnalyzer,
|
||||
"是々の",
|
||||
new String[] {"是", "ze", "々", "の", "no"},
|
||||
new int[] {0, 0, 1, 2, 2},
|
||||
new int[] {1, 1, 2, 3, 3},
|
||||
new int[] {1, 0, 1, 1, 0});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCompletionQuery() throws IOException {
|
||||
assertAnalyzesTo(
|
||||
queryAnalyzer,
|
||||
"東京",
|
||||
new String[] {"東京", "toukyou"},
|
||||
new int[] {0, 0},
|
||||
new int[] {2, 2},
|
||||
new int[] {1, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
queryAnalyzer,
|
||||
"東京都",
|
||||
new String[] {"東京", "toukyou", "都", "to"},
|
||||
new int[] {0, 0, 2, 2},
|
||||
new int[] {2, 2, 3, 3},
|
||||
new int[] {1, 0, 1, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
queryAnalyzer,
|
||||
"ドラえもん",
|
||||
new String[] {"ドラえもん", "doraemon", "doraemonn"},
|
||||
new int[] {0, 0, 0},
|
||||
new int[] {5, 5, 5},
|
||||
new int[] {1, 0, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
queryAnalyzer,
|
||||
"ソースコード",
|
||||
new String[] {"ソースコード", "soーsukoーdo"},
|
||||
new int[] {0, 0},
|
||||
new int[] {6, 6},
|
||||
new int[] {1, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
queryAnalyzer,
|
||||
"反社会的勢力",
|
||||
new String[] {"反", "han", "hann", "社会", "syakai", "shakai", "的", "teki", "勢力", "seiryoku"},
|
||||
new int[] {0, 0, 0, 1, 1, 1, 3, 3, 4, 4},
|
||||
new int[] {1, 1, 1, 3, 3, 3, 4, 4, 6, 6},
|
||||
new int[] {1, 0, 0, 1, 0, 0, 1, 0, 1, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
queryAnalyzer, "々", new String[] {"々"}, new int[] {0}, new int[] {1}, new int[] {1});
|
||||
|
||||
assertAnalyzesTo(
|
||||
queryAnalyzer,
|
||||
"是々",
|
||||
new String[] {"是", "ze", "々"},
|
||||
new int[] {0, 0, 1},
|
||||
new int[] {1, 1, 2},
|
||||
new int[] {1, 0, 1});
|
||||
|
||||
assertAnalyzesTo(
|
||||
queryAnalyzer,
|
||||
"是々の",
|
||||
new String[] {"是", "ze", "々", "の", "no"},
|
||||
new int[] {0, 0, 1, 2, 2},
|
||||
new int[] {1, 1, 2, 3, 3},
|
||||
new int[] {1, 0, 1, 1, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
queryAnalyzer,
|
||||
"東京t",
|
||||
new String[] {"東京t", "toukyout"},
|
||||
new int[] {0, 0},
|
||||
new int[] {3, 3},
|
||||
new int[] {1, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
queryAnalyzer,
|
||||
"サッk",
|
||||
new String[] {"サッk", "sakk"},
|
||||
new int[] {0, 0},
|
||||
new int[] {3, 3},
|
||||
new int[] {1, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
queryAnalyzer,
|
||||
"反sy",
|
||||
new String[] {"反sy", "hansy", "hannsy"},
|
||||
new int[] {0, 0, 0},
|
||||
new int[] {3, 3, 3},
|
||||
new int[] {1, 0, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
queryAnalyzer,
|
||||
"さーきゅr",
|
||||
new String[] {"さーきゅr", "saーkyur"},
|
||||
new int[] {0, 0},
|
||||
new int[] {5, 5},
|
||||
new int[] {1, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
queryAnalyzer,
|
||||
"是々h",
|
||||
new String[] {"是", "ze", "々h"},
|
||||
new int[] {0, 0, 1},
|
||||
new int[] {1, 1, 3},
|
||||
new int[] {1, 0, 1});
|
||||
}
|
||||
|
||||
public void testEnglish() throws IOException {
|
||||
assertAnalyzesTo(indexAnalyzer, "this atest", new String[] {"this", "atest"});
|
||||
assertAnalyzesTo(queryAnalyzer, "this atest", new String[] {"this", "atest"});
|
||||
}
|
||||
|
||||
public void testRandomStrings() throws IOException {
|
||||
checkRandomData(random(), indexAnalyzer, atLeast(200));
|
||||
checkRandomData(random(), queryAnalyzer, atLeast(200));
|
||||
}
|
||||
|
||||
public void testEmptyTerm() throws IOException {
|
||||
Analyzer a =
|
||||
new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new KeywordTokenizer();
|
||||
return new TokenStreamComponents(tokenizer, new JapaneseCompletionFilter(tokenizer));
|
||||
}
|
||||
};
|
||||
checkOneTerm(a, "", "");
|
||||
a.close();
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,59 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ja;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.cjk.CJKWidthFilterFactory;
|
||||
import org.junit.Test;
|
||||
|
||||
public class TestJapaneseCompletionFilterFactory extends BaseTokenStreamFactoryTestCase {
|
||||
@Test
|
||||
public void testCompletion() throws IOException {
|
||||
JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap<>());
|
||||
TokenStream tokenStream = tokenizerFactory.create();
|
||||
((Tokenizer) tokenStream).setReader(new StringReader("東京t"));
|
||||
CJKWidthFilterFactory cjkWidthFactory = new CJKWidthFilterFactory(new HashMap<>());
|
||||
tokenStream = cjkWidthFactory.create(tokenStream);
|
||||
Map<String, String> map = new HashMap<>();
|
||||
map.put("mode", "QUERY");
|
||||
JapaneseCompletionFilterFactory filterFactory = new JapaneseCompletionFilterFactory(map);
|
||||
assertTokenStreamContents(filterFactory.create(tokenStream), new String[] {"東京t", "toukyout"});
|
||||
}
|
||||
|
||||
/** Test that bogus arguments result in exception */
|
||||
@Test
|
||||
public void testBogusArguments() throws Exception {
|
||||
IllegalArgumentException expected =
|
||||
expectThrows(
|
||||
IllegalArgumentException.class,
|
||||
() -> {
|
||||
new JapaneseCompletionFilterFactory(
|
||||
new HashMap<String, String>() {
|
||||
{
|
||||
put("bogusArg", "bogusValue");
|
||||
}
|
||||
});
|
||||
});
|
||||
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,68 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ja.completion;
|
||||
|
||||
import java.util.List;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.junit.Test;
|
||||
|
||||
public class TestKatakanaRomanizer extends LuceneTestCase {
|
||||
private final KatakanaRomanizer romanizer = KatakanaRomanizer.getInstance();
|
||||
|
||||
@Test
|
||||
public void testRomanize() {
|
||||
assertCharsRefListEqualsUnordered(
|
||||
List.of(new CharsRef("hasi"), new CharsRef("hashi")),
|
||||
romanizer.romanize(new CharsRef("ハシ")));
|
||||
assertCharsRefListEqualsUnordered(
|
||||
List.of(new CharsRef("yuukyuu")), romanizer.romanize(new CharsRef("ユウキュウ")));
|
||||
assertCharsRefListEqualsUnordered(
|
||||
List.of(new CharsRef("yakyuu")), romanizer.romanize(new CharsRef("ヤキュウ")));
|
||||
assertCharsRefListEqualsUnordered(
|
||||
List.of(new CharsRef("toukyou")), romanizer.romanize(new CharsRef("トウキョウ")));
|
||||
assertCharsRefListEqualsUnordered(
|
||||
List.of(new CharsRef("toーkyoー")), romanizer.romanize(new CharsRef("トーキョー")));
|
||||
assertCharsRefListEqualsUnordered(
|
||||
List.of(new CharsRef("sakka")), romanizer.romanize(new CharsRef("サッカ")));
|
||||
assertCharsRefListEqualsUnordered(
|
||||
List.of(new CharsRef("hyakkaten"), new CharsRef("hyakkatenn")),
|
||||
romanizer.romanize(new CharsRef("ヒャッカテン")));
|
||||
assertCharsRefListEqualsUnordered(
|
||||
List.of(new CharsRef("voruteーru"), new CharsRef("vuxoruteーru")),
|
||||
romanizer.romanize(new CharsRef("ヴォルテール")));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRomanizeWithAlphabets() {
|
||||
assertCharsRefListEqualsUnordered(
|
||||
List.of(new CharsRef("toukyout")), romanizer.romanize(new CharsRef("トウキョウt")));
|
||||
assertCharsRefListEqualsUnordered(
|
||||
List.of(new CharsRef("kodakk")), romanizer.romanize(new CharsRef("コダッk")));
|
||||
assertCharsRefListEqualsUnordered(
|
||||
List.of(new CharsRef("syousy"), new CharsRef("shousy")),
|
||||
romanizer.romanize(new CharsRef("ショウsy")));
|
||||
}
|
||||
|
||||
private static void assertCharsRefListEqualsUnordered(
|
||||
List<CharsRef> expected, List<CharsRef> actual) {
|
||||
assertEquals(expected.size(), actual.size());
|
||||
for (CharsRef ref : expected) {
|
||||
assertTrue(ref.toString() + " is not contained in " + actual, actual.contains(ref));
|
||||
}
|
||||
}
|
||||
}