LUCENE-10102: Add JapaneseCompletionFilter for Input Method-aware auto-completion (#297)

Co-authored-by: Robert Muir <rmuir@apache.org>
Tomoko Uchida 2021-09-17 22:37:12 +09:00 committed by GitHub
parent de45b68c90
commit 4e86df96c0
13 changed files with 1518 additions and 0 deletions


@@ -21,6 +21,8 @@ New Features
* LUCENE-10096: Add TamilAnalyzer based on the snowball stemmer. (Robert Muir)
* LUCENE-10102: Add JapaneseCompletionFilter for Input Method-aware auto-completion (Tomoko Uchida, Robert Muir, Jun Ohtani)
System Requirements
* LUCENE-8738: Move to Java 11 as minimum Java version.


@@ -0,0 +1,65 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKWidthCharFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
/**
* Analyzer for Japanese completion suggester.
*
* @see JapaneseCompletionFilter
*/
public class JapaneseCompletionAnalyzer extends Analyzer {
private final JapaneseCompletionFilter.Mode mode;
private final UserDictionary userDict;
/** Creates a new {@code JapaneseCompletionAnalyzer} with default configurations */
public JapaneseCompletionAnalyzer() {
this(null, JapaneseCompletionFilter.Mode.INDEX);
}
/** Creates a new {@code JapaneseCompletionAnalyzer} */
public JapaneseCompletionAnalyzer(UserDictionary userDict, JapaneseCompletionFilter.Mode mode) {
this.userDict = userDict;
this.mode = mode;
}
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer =
new JapaneseTokenizer(userDict, true, true, JapaneseTokenizer.Mode.NORMAL);
TokenStream stream = new JapaneseCompletionFilter(tokenizer, mode);
stream = new LowerCaseFilter(stream);
return new TokenStreamComponents(tokenizer, stream);
}
@Override
protected Reader initReader(String fieldName, Reader reader) {
return new CJKWidthCharFilter(reader);
}
@Override
protected Reader initReaderForNormalization(String fieldName, Reader reader) {
return new CJKWidthCharFilter(reader);
}
}
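For illustration, a minimal usage sketch of the analyzer above (the demo class and field name are hypothetical, not part of this change): in QUERY mode, a trailing IME keystroke such as "t" is concatenated with the preceding token and romanized together with it.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseCompletionAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseCompletionFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Hypothetical demo, not part of this commit.
public class CompletionAnalyzerDemo {
  public static void main(String[] args) throws IOException {
    try (Analyzer analyzer =
            new JapaneseCompletionAnalyzer(null, JapaneseCompletionFilter.Mode.QUERY);
        TokenStream ts = analyzer.tokenStream("suggest", "東京t")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term); // prints "東京t", then "toukyout"
      }
      ts.end();
    }
  }
}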


@@ -0,0 +1,267 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.completion.CharSequenceUtils;
import org.apache.lucene.analysis.ja.completion.KatakanaRomanizer;
import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
/**
* A {@link org.apache.lucene.analysis.TokenFilter} that adds Japanese romanized tokens to the term
* attribute, while also keeping the original tokens (surface forms). The main use of this filter is
* query auto-completion.
*
* <p>Supported romanization forms: (modified) Hepburn-shiki, Kunrei-shiki (Nihon-shiki), and
* Wāpuro-shiki.
*
* <p>This does NOT support some romaji forms that are official but not commonly used with Japanese
* <a href="https://en.wikipedia.org/wiki/Input_method">Input Methods</a>; e.g., a circumflex or
* macron representing <a href="https://en.wikipedia.org/wiki/Ch%C5%8Donpu">Chōonpu (長音符)</a> is
* not supported.
*
* <p>The romanization behaviour changes according to its {@link Mode}. The default mode is {@link
* Mode#INDEX}.
*
* <p>Note: This filter must be applied AFTER half-width and full-width normalization. Please ensure
* that a width normalizer such as {@link org.apache.lucene.analysis.cjk.CJKWidthCharFilter} or
* {@link org.apache.lucene.analysis.cjk.CJKWidthFilter} is included in your analysis chain. IF THE
* WIDTH NORMALIZATION IS NOT PERFORMED, THIS DOES NOT WORK AS EXPECTED. See also: {@link
* JapaneseCompletionAnalyzer}.
*/
public final class JapaneseCompletionFilter extends TokenFilter {
public static final Mode DEFAULT_MODE = Mode.INDEX;
private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
private final ReadingAttribute readingAttr = addAttribute(ReadingAttribute.class);
private final PositionIncrementAttribute posIncAtt =
addAttribute(PositionIncrementAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final CompletionTokenGenerator tokenGenerator;
/** Completion mode */
public enum Mode {
/** Simple romanization. Expected to be used when indexing. */
INDEX,
/** Input Method aware romanization. Expected to be used when querying. */
QUERY
}
/** Creates a new {@code JapaneseCompletionFilter} with default configurations */
public JapaneseCompletionFilter(TokenStream input) {
this(input, DEFAULT_MODE);
}
/** Creates a new {@code JapaneseCompletionFilter} */
public JapaneseCompletionFilter(TokenStream input, Mode mode) {
super(input);
this.tokenGenerator = new CompletionTokenGenerator(mode);
}
@Override
public void reset() throws IOException {
super.reset();
tokenGenerator.reset();
}
@Override
public boolean incrementToken() throws IOException {
mayIncrementToken();
if (tokenGenerator.hasNext()) {
clearAttributes();
CompletionToken token = tokenGenerator.next();
termAttr.setEmpty().append(token.term);
if (token.isFirst) {
posIncAtt.setPositionIncrement(1);
} else {
posIncAtt.setPositionIncrement(0);
}
offsetAtt.setOffset(token.startOffset, token.endOffset);
return true;
} else {
return false;
}
}
private void mayIncrementToken() throws IOException {
while (!tokenGenerator.hasNext()) {
if (input.incrementToken()) {
String surface = termAttr.toString();
String reading = readingAttr.getReading();
int startOffset = offsetAtt.startOffset();
int endOffset = offsetAtt.endOffset();
if (reading == null && CharSequenceUtils.isKana(surface)) {
// use the surface form as reading when possible.
reading = CharSequenceUtils.toKatakana(surface);
}
tokenGenerator.addToken(surface, reading, startOffset, endOffset);
} else {
if (tokenGenerator.hasPendingToken()) {
// a pending token remains.
tokenGenerator.finish();
} else {
// already consumed all tokens. there's no next token to output.
break;
}
}
}
}
private static class CompletionToken {
final String term;
final boolean isFirst;
final int startOffset;
final int endOffset;
CompletionToken(String term, boolean isFirst, int startOffset, int endOffset) {
this.term = term;
this.isFirst = isFirst;
this.startOffset = startOffset;
this.endOffset = endOffset;
}
}
private static class CompletionTokenGenerator implements Iterator<CompletionToken> {
private final Mode mode;
private List<CompletionToken> outputs;
private CharsRefBuilder pdgSurface;
private CharsRefBuilder pdgReading;
private int pdgStartOffset;
private int pdgEndOffset;
CompletionTokenGenerator(Mode mode) {
this.mode = mode;
outputs = new ArrayList<>();
}
public void reset() {
clearPendingToken();
outputs.clear();
}
@Override
public boolean hasNext() {
return outputs.size() > 0;
}
@Override
public CompletionToken next() {
return outputs.remove(0);
}
void addToken(String surface, String reading, int startOffset, int endOffset) {
assert surface != null : "surface must not be null.";
if (hasPendingToken()) {
if (mode == Mode.QUERY
&& pdgReading != null
&& !CharSequenceUtils.isLowercaseAlphabets(pdgSurface.get())
&& CharSequenceUtils.isLowercaseAlphabets(surface)) {
// words that are in mid-IME composition are split into two tokens by JapaneseTokenizer;
// they should be recovered when querying.
// Note: in this case, the reading attribute is null; use the surface form in place of the
// reading.
// e.g.: "サッ" + "k" => "サッk", "反" + "sy" => "反sy"
pdgSurface.append(surface);
pdgReading.append(surface);
pdgEndOffset = endOffset;
generateOutputs();
clearPendingToken();
} else if (mode == Mode.QUERY
&& CharSequenceUtils.isKana(pdgSurface.get())
&& CharSequenceUtils.isKana(surface)) {
// words that are composed only of Katakana or Hiragana should be concatenated when
// querying.
// e.g.: "こい" + "ぬ" => "こいぬ"
pdgSurface.append(surface);
pdgReading.append(reading);
pdgEndOffset = endOffset;
} else {
generateOutputs();
resetPendingToken(surface, reading, startOffset, endOffset);
}
} else {
resetPendingToken(surface, reading, startOffset, endOffset);
}
}
void finish() {
generateOutputs();
clearPendingToken();
}
private void generateOutputs() {
// preserve original surface form as an output.
outputs.add(new CompletionToken(pdgSurface.toString(), true, pdgStartOffset, pdgEndOffset));
// skip readings that cannot be translated to romaji.
if (pdgReading == null
|| pdgReading.length() == 0
|| !CharSequenceUtils.isKatakanaOrHWAlphabets(pdgReading.get())) {
return;
}
// translate the reading to romaji.
List<CharsRef> romaji = KatakanaRomanizer.getInstance().romanize(pdgReading.get());
for (CharsRef ref : romaji) {
// set the same start/end offset as the original surface form for romanized tokens.
outputs.add(new CompletionToken(ref.toString(), false, pdgStartOffset, pdgEndOffset));
}
}
boolean hasPendingToken() {
return pdgSurface != null;
}
void resetPendingToken(
CharSequence surface, CharSequence reading, int startOffset, int endOffset) {
if (this.pdgSurface == null) {
this.pdgSurface = new CharsRefBuilder();
} else {
this.pdgSurface.clear();
}
this.pdgSurface.append(surface);
if (this.pdgReading == null) {
this.pdgReading = new CharsRefBuilder();
} else {
this.pdgReading.clear();
}
this.pdgReading.append(reading);
this.pdgStartOffset = startOffset;
this.pdgEndOffset = endOffset;
}
void clearPendingToken() {
this.pdgSurface = null;
this.pdgReading = null;
this.pdgStartOffset = 0;
this.pdgEndOffset = 0;
}
}
}
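To make the two modes concrete, here is a sketch (the dump helper and demo class are hypothetical) contrasting INDEX- and QUERY-mode analysis of the same text; the expected terms in the comments mirror this commit's tests.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseCompletionAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseCompletionFilter.Mode;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

// Hypothetical demo, not part of this commit.
public class ModeComparison {
  static void dump(Mode mode, String text) throws IOException {
    try (Analyzer a = new JapaneseCompletionAnalyzer(null, mode);
        TokenStream ts = a.tokenStream("f", text)) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // posInc == 0 marks a romanized variant stacked on the preceding surface form
        System.out.println(mode + ": " + term + " (posInc=" + posInc.getPositionIncrement() + ")");
      }
      ts.end();
    }
  }

  public static void main(String[] args) throws IOException {
    dump(Mode.INDEX, "ソースコード"); // ソース, soーsu, コード, koーdo
    dump(Mode.QUERY, "ソースコード"); // ソースコード, soーsukoーdo
  }
}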


@@ -0,0 +1,66 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja;
import java.util.Locale;
import java.util.Map;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
/**
* Factory for {@link JapaneseCompletionFilter}.
*
* <p>Supported attributes:
*
* <ul>
* <li>mode: Completion mode. see {@link JapaneseCompletionFilter.Mode}
* </ul>
*
* @lucene.spi {@value #NAME}
*/
public class JapaneseCompletionFilterFactory extends TokenFilterFactory {
/** SPI name */
public static final String NAME = "japaneseCompletion";
private static final String MODE_PARAM = "mode";
private final JapaneseCompletionFilter.Mode mode;
/** Creates a new {@code JapaneseCompletionFilterFactory} */
public JapaneseCompletionFilterFactory(Map<String, String> args) {
super(args);
mode =
JapaneseCompletionFilter.Mode.valueOf(
get(
args,
MODE_PARAM,
JapaneseCompletionFilter.DEFAULT_MODE.name().toUpperCase(Locale.ROOT)));
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
/** Default ctor for compatibility with SPI */
public JapaneseCompletionFilterFactory() {
throw defaultCtorException();
}
@Override
public TokenStream create(TokenStream input) {
return new JapaneseCompletionFilter(input, mode);
}
}
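Because the factory registers under the SPI name "japaneseCompletion", the whole chain can also be assembled by name. A sketch, assuming the "japanese" tokenizer, "cjkWidth" char filter, and "lowercase" filter SPI names provided by the analysis modules:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;

// Hypothetical helper, not part of this commit.
public class CompletionAnalyzerFromSpi {
  public static Analyzer build() throws IOException {
    return CustomAnalyzer.builder()
        .withTokenizer("japanese", "mode", "normal")
        // char filters always run before the tokenizer, regardless of builder call order
        .addCharFilter("cjkWidth")
        .addTokenFilter("japaneseCompletion", "mode", "QUERY")
        .addTokenFilter("lowercase")
        .build();
  }
}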


@@ -0,0 +1,91 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja.completion;
/** Utility functions for {@link org.apache.lucene.analysis.ja.JapaneseCompletionFilter} */
public class CharSequenceUtils {
/** Checks if a char sequence is composed only of lowercase alphabet characters (half- or full-width) */
public static boolean isLowercaseAlphabets(CharSequence s) {
for (int i = 0; i < s.length(); i++) {
char ch = s.charAt(i);
if (!(isHalfWidthLowercaseAlphabet(ch) || isFullWidthLowercaseAlphabet(ch))) {
return false;
}
}
return true;
}
/** Checks if a char sequence is composed only of Katakana or Hiragana */
public static boolean isKana(CharSequence s) {
for (int i = 0; i < s.length(); i++) {
char ch = s.charAt(i);
if (!(isHiragana(ch) || isKatakana(ch))) {
return false;
}
}
return true;
}
/** Checks if a char sequence is composed only of Katakana or half-width lowercase alphabet characters */
public static boolean isKatakanaOrHWAlphabets(CharSequence ref) {
for (int i = 0; i < ref.length(); i++) {
char ch = ref.charAt(i);
if (!isKatakana(ch) && !isHalfWidthLowercaseAlphabet(ch)) {
return false;
}
}
return true;
}
/** Checks if a char is a Hiragana */
private static boolean isHiragana(char ch) {
return ch >= 0x3040 && ch <= 0x309f;
}
/** Checks if a char is a Katakana */
private static boolean isKatakana(char ch) {
return ch >= 0x30a0 && ch <= 0x30ff;
}
/** Checks if a char is a half-width lowercase alphabet */
private static boolean isHalfWidthLowercaseAlphabet(char ch) {
return ch >= 0x61 && ch <= 0x7a;
}
/** Checks if a char is a full-width lowercase alphabet */
public static boolean isFullWidthLowercaseAlphabet(char ch) {
return ch >= 0xff41 && ch <= 0xff5a;
}
/** Converts all Hiragana in a string into Katakana */
public static String toKatakana(CharSequence s) {
char[] chars = new char[s.length()];
for (int i = 0; i < s.length(); i++) {
char ch = s.charAt(i);
// if the character is from 'ぁ' to 'ゖ', or is 'ゝ' or 'ゞ', it can be converted to Katakana.
if (ch >= 0x3041 && ch <= 0x3096 || ch == 0x309d || ch == 0x309e) {
chars[i] = (char) (ch + 0x60);
} else {
chars[i] = ch;
}
}
return new String(chars);
}
private CharSequenceUtils() {}
}
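The kana conversion above relies on the Hiragana and Katakana Unicode blocks being parallel, exactly 0x60 code points apart, so a single offset addition suffices. A sketch of the observable behavior (the demo class is hypothetical):

import org.apache.lucene.analysis.ja.completion.CharSequenceUtils;

// Hypothetical demo, not part of this commit.
public class KanaConversionDemo {
  public static void main(String[] args) {
    // Convertible Hiragana shift by 0x60: 'す' (U+3059) becomes 'ス' (U+30B9).
    System.out.println(CharSequenceUtils.toKatakana("すし")); // スシ
    // Characters outside the convertible range, such as 'ー' (U+30FC), pass through unchanged.
    System.out.println(CharSequenceUtils.toKatakana("らーめん")); // ラーメン
  }
}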


@@ -0,0 +1,193 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja.completion;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
/**
* Converts a Katakana string to <a
* href="https://en.wikipedia.org/wiki/Romanization_of_Japanese">Romaji</a> using the pre-defined
* Katakana-Romaji mapping rules. Internally, this repeatedly performs a longest-prefix match of
* the given char sequence against the pre-built keystroke arrays until it reaches the end of the
* sequence or no keystroke matches.
*/
public class KatakanaRomanizer {
private static final String ROMAJI_MAP_FILE = "romaji_map.txt";
private static KatakanaRomanizer INSTANCE;
static {
// Build romaji-map and keystroke arrays from the pre-defined Katakana-Romaji mapping file.
try (InputStreamReader is =
new InputStreamReader(
KatakanaRomanizer.class.getResourceAsStream(ROMAJI_MAP_FILE),
Charset.forName("UTF-8"));
BufferedReader ir = new BufferedReader(is)) {
Map<CharsRef, List<CharsRef>> romajiMap = new HashMap<>();
String line;
while ((line = ir.readLine()) != null) {
if (line.startsWith("#")) {
continue;
}
String[] cols = line.trim().split(",");
if (cols.length < 2) {
continue;
}
CharsRef prefix = new CharsRef(cols[0]);
romajiMap.put(prefix, new ArrayList<>());
for (int i = 1; i < cols.length; i++) {
romajiMap.get(prefix).add(new CharsRef(cols[i]));
}
}
Set<CharsRef> keystrokeSet = romajiMap.keySet();
int maxKeystrokeLength = keystrokeSet.stream().mapToInt(CharsRef::length).max().getAsInt();
CharsRef[][] keystrokes = new CharsRef[maxKeystrokeLength][];
for (int len = 0; len < maxKeystrokeLength; len++) {
final int l = len;
keystrokes[l] =
keystrokeSet.stream().filter(k -> k.length - 1 == l).toArray(CharsRef[]::new);
}
for (CharsRef[] ks : keystrokes) {
// keystroke array must be sorted in ascending order for binary search.
Arrays.sort(ks);
}
INSTANCE = new KatakanaRomanizer(keystrokes, romajiMap);
} catch (Exception e) {
throw new RuntimeException(e);
}
}
private final CharsRef[][] keystrokes;
private final Map<CharsRef, List<CharsRef>> romajiMap;
/** Returns the singleton instance of {@code KatakanaRomanizer} */
public static KatakanaRomanizer getInstance() {
return INSTANCE;
}
private KatakanaRomanizer(CharsRef[][] keystrokes, Map<CharsRef, List<CharsRef>> romajiMap) {
this.keystrokes = keystrokes;
this.romajiMap = romajiMap;
}
/**
* Translates a sequence of katakana to romaji. An input can produce multiple outputs because a
* keystroke can be mapped to multiple romajis.
*/
public List<CharsRef> romanize(CharsRef input) {
assert CharSequenceUtils.isKatakanaOrHWAlphabets(input);
List<CharsRefBuilder> pendingOutputs = new ArrayList<>();
int pos = 0;
while (pos < input.length) {
// Greedily looks up the longest matched keystroke.
// e.g.: Consider input="キョウ", then there are two matched keystrokes (romaji mapping rules)
// "" -> "ki" and "キョ" -> "kyo". Only the longest one "キョ" will be selected.
MatchedKeystroke matched = longestKeystrokeMatch(input, pos);
if (matched == null) {
break;
}
List<CharsRef> candidates =
romajiMap.get(keystrokes[matched.keystrokeLen - 1][matched.keystrokeIndex]);
if (pendingOutputs.size() == 0) {
// There is no pending output.
// Add the matched keystrokes to pending outputs list.
for (CharsRef cref : candidates) {
CharsRefBuilder output = new CharsRefBuilder();
output.copyChars(cref);
pendingOutputs.add(output);
}
} else if (candidates.size() == 1) {
// There are one or more pending outputs and the matched keystroke has a single romaji
// candidate. Append that candidate to every pending output.
// e.g.: Consider we already have two pending outputs "shi" and "si", and the matched
// keystroke maps only to "ka";
// then the results are "shika" and "sika".
CharsRef cref = candidates.get(0);
for (CharsRefBuilder pdgOutput : pendingOutputs) {
pdgOutput.append(cref.chars, 0, cref.length);
}
} else {
// There are one or more pending outputs and the matched keystroke has multiple romaji
// candidates. Combine every candidate with every pending output.
// e.g.: Consider we already have two pending outputs "shi" and "si", and the matched
// keystroke maps to both "n" and "nn".
// To produce all possible keystroke patterns, the resulting outputs should be "shin",
// "shinn", "sin" and "sinn".
List<CharsRefBuilder> outputs = new ArrayList<>();
for (CharsRef cref : candidates) {
for (CharsRefBuilder pdgOutput : pendingOutputs) {
CharsRefBuilder buffer = new CharsRefBuilder();
buffer.copyChars(pdgOutput.chars(), 0, pdgOutput.length());
buffer.append(cref.chars, cref.offset, cref.length);
outputs.add(buffer);
}
}
// update the pending outputs
pendingOutputs = outputs;
}
// proceed to the next input position
pos += matched.keystrokeLen;
}
if (pos < input.length) {
// add the remnants (that cannot be mapped to any romaji) as suffix
for (CharsRefBuilder output : pendingOutputs) {
output.append(input.chars, pos, input.length - pos);
}
}
return pendingOutputs.stream().map(CharsRefBuilder::get).collect(Collectors.toList());
}
private MatchedKeystroke longestKeystrokeMatch(CharsRef input, int inputOffset) {
for (int len = Math.min(input.length - inputOffset, keystrokes.length); len > 0; len--) {
CharsRef ref = new CharsRef(input.chars, inputOffset, len);
int index = Arrays.binarySearch(keystrokes[len - 1], ref);
if (index >= 0) {
return new MatchedKeystroke(len, index);
}
}
// there's no matched keystroke
return null;
}
private static class MatchedKeystroke {
final int keystrokeLen;
final int keystrokeIndex;
MatchedKeystroke(int keystrokeLen, int keystrokeIndex) {
this.keystrokeLen = keystrokeLen;
this.keystrokeIndex = keystrokeIndex;
}
}
}
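To make the candidate expansion concrete: each matched keystroke that maps to several romaji multiplies the pending outputs, so the final result is a cross product. A sketch (the demo class is hypothetical):

import java.util.List;
import org.apache.lucene.analysis.ja.completion.KatakanaRomanizer;
import org.apache.lucene.util.CharsRef;

// Hypothetical demo, not part of this commit.
public class RomanizerDemo {
  public static void main(String[] args) {
    // "シ" maps to {si, shi} and "ン" maps to {n, nn}, so "シンシ"
    // expands to 2 x 2 x 2 = 8 romanizations: sinsi, shinsi, ..., shinnshi.
    List<CharsRef> results = KatakanaRomanizer.getInstance().romanize(new CharsRef("シンシ"));
    results.forEach(System.out::println);
  }
}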


@@ -0,0 +1,19 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** Utilities for {@link org.apache.lucene.analysis.ja.JapaneseCompletionFilter} */
package org.apache.lucene.analysis.ja.completion;


@@ -14,6 +14,7 @@
# limitations under the License.
org.apache.lucene.analysis.ja.JapaneseBaseFormFilterFactory
org.apache.lucene.analysis.ja.JapaneseCompletionFilterFactory
org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilterFactory
org.apache.lucene.analysis.ja.JapaneseNumberFilterFactory
org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilterFactory


@@ -0,0 +1,344 @@
# mapping rules from katakana (a unit of keystrokes) to a list of acceptable romanizations.
# longest-match is used to find entries in this list.
# covers these romanization systems: modified Hepburn-shiki, Kunrei-shiki (Nihon-shiki), and Wāpuro-shiki.
# note: this does not strictly comply with the romanization systems listed above,
# but tries to cover possible keystrokes supported by various Input Methods.
ア,a
イ,i
ウ,u
エ,e
オ,o
カ,ka
キ,ki
ク,ku
ケ,ke
コ,ko
キャ,kya
キュ,kyu
キョ,kyo
ガ,ga
ギ,gi
グ,gu
ゲ,ge
ゴ,go
ギャ,gya
ギュ,gyu
ギョ,gyo
サ,sa
シ,si,shi
ス,su
セ,se
ソ,so
シャ,sya,sha
シュ,syu,shu
シェ,sye,she
ショ,syo,sho
ザ,za
ジ,zi,ji
ズ,zu
ゼ,ze
ゾ,zo
ジャ,zya,ja
ジュ,zyu,ju
ジェ,zye,je
ジョ,zyo,jo
タ,ta
チ,ti,chi
ツ,tu,tsu
テ,te
ト,to
チャ,tya,cha,cya
チュ,tyu,chu,cyu
チョ,tyo,cho,cyo
ダ,da
ヂ,di,zi,ji
ヅ,du,zu
デ,de
ド,do
ヂャ,dya,zya,ja
ヂュ,dyu,zyu,ju
ヂョ,dyo,zyo,jo
ナ,na
ニ,ni
ヌ,nu
ネ,ne
ノ,no
ニャ,nya
ニュ,nyu
ニョ,nyo
ハ,ha
ヒ,hi
フ,hu,fu
ヘ,he
ホ,ho
ヒャ,hya
ヒュ,hyu
ヒョ,hyo
バ,ba
ビ,bi
ブ,bu
ベ,be
ボ,bo
ビャ,bya
ビュ,byu
ビョ,byo
パ,pa
ピ,pi
プ,pu
ペ,pe
ポ,po
ピャ,pya
ピュ,pyu
ピョ,pyo
マ,ma
ミ,mi
ム,mu
メ,me
モ,mo
ミャ,mya
ミュ,myu
ミョ,myo
ヤ,ya
ユ,yu
ヨ,yo
ラ,ra
リ,ri
ル,ru
レ,re
ロ,ro
リャ,rya
リュ,ryu
リョ,ryo
ワ,wa
ウィ,wi
ヰ,wi
ウェ,we
ヱ,we
ヲ,wo,o
ン,n,nn
クァ,kwa,kuxa
クィ,kwi,kuxi
クゥ,kwu,kuxu
クェ,kwe,kuxe
クォ,kwo,kuxo
グァ,gwa,guxa
グィ,gwi,guxi
グゥ,gwu,guxu
グェ,gwe,guxe
グォ,gwo,guxo
スァ,swa,suxa
スィ,swi,suxi
スゥ,swu,suxu
スェ,swe,suxe
スォ,swo,suxo
トァ,twa,toxa
トィ,twi,toxi
トゥ,twu,toxu
トェ,twe,toxe
トォ,two,toxo
ドァ,dwa,doxa
ドィ,dwi,doxi
ドゥ,dwu,doxu
ドェ,dwe,doxe
ドォ,dwo,doxo
ファ,hwa,fa,huxa
フィ,hwi,fi,huxi
フェ,hwe,fe,huxe
フォ,hwo,fo,huxo
ヴァ,va,vuxa
ヴィ,vi,vuxi
ヴ,vu
ヴェ,ve,vuxe
ヴォ,vo,vuxo
テァ,tha,texa
ティ,thi,texi
テェ,the,texe
テャ,tha,texya
テュ,thu,texyu
テョ,tho,texyo
フャ,fya,huxya,fuxya
フュ,fyu,huxyu,fuxyu
フョ,fyo,huxyo,fuxyo
ヴャ,vya,vuxya
ヴュ,vyu,vuxyu
ヴョ,vyo,vuxyo
ッカ,kka
ッキ,kki
ック,kku
ッケ,kke
ッコ,kko
ッキャ,kkya
ッキュ,kkyu
ッキョ,kkyo
ッガ,gga
ッギ,ggi
ッグ,ggu
ッゲ,gge
ッゴ,ggo
ッギャ,ggya
ッギュ,ggyu
ッギョ,ggyo
ッサ,ssa
ッシ,ssi
ッス,ssu
ッセ,sse
ッソ,sso
ッシャ,ssya,ssha
ッシュ,ssyu,sshu
ッショ,ssyo,ssho
ッザ,zza
ッジ,zzi,jji
ッズ,zzu
ッゼ,zze
ッゾ,zzo
ッジャ,zzya,jja
ッジュ,zzyu,jju
ッジョ,zzyo,jjo
ッタ,tta
ッチ,tti
ッツ,ttu
ッテ,tte
ット,tto
ッチャ,ttya,ccha,ccya
ッチュ,ttyu,cchu,ccyu
ッチョ,ttyo,ccho,ccyo
ッダ,dda
ッヂ,ddi
ッヅ,ddu
ッデ,dde
ッド,ddo
ッヂャ,ddya
ッヂュ,ddyu
ッヂョ,ddyo
ッハ,hha
ッヒ,hhi
ッフ,hhu,ffu
ッへ,hhe
ッホ,hho
ッヒャ,hhya
ッヒュ,hhyu
ッヒョ,hhyo
ッバ,bba
ッビ,bbi
ッブ,bbu
ッベ,bbe
ッボ,bbo
ッビャ,bbya
ッビュ,bbyu
ッビョ,bbyo
ッパ,ppa
ッピ,ppi
ップ,ppu
ッペ,ppe
ッポ,ppo
ッピャ,ppya
ッピュ,ppyu
ッピョ,ppyo
ッマ,mma
ッミ,mmi
ッム,mmu
ッメ,mme
ッモ,mmo
ッミャ,mmya
ッミュ,mmyu
ッミョ,mmyo
ッヤ,yya
ッイ,yyi
ッユ,yyu
ッイェ,yye
ッヨ,yyo
ッラ,rra
ッリ,rri
ッル,rru
ッレ,rre
ッロ,rro
ッリャ,rrya
ッリュ,rryu
ッリョ,rryo
ッワ,wwa
ッウィ,wwi
ッウ,wwu
ッウェ,wwe
ッヲ,wwo
ックァ,kkwa,kkuxa
ックィ,kkwi,kkuxi
ックゥ,kkwu,kkuxu
ックェ,kkwe,kkuxe
ックォ,kkwo,kkuxo
ッグァ,ggwa,gguxa
ッグィ,ggwi,gguxi
ッグゥ,ggwu,gguxu
ッグェ,ggwe,gguxe
ッグォ,ggwo,gguxo
ッスァ,sswa,ssuxa
ッスィ,sswi,ssuxi
ッスゥ,sswu,ssuxu
ッスェ,sswe,ssuxe
ッスォ,sswo,ssuxo
ットァ,ttwa,ttoxa
ットィ,ttwi,ttoxi
ットゥ,ttwu,ttoxu
ットェ,ttwe,ttoxe
ットォ,ttwo,ttoxo
ッドァ,ddwa,ddoxa
ッドィ,ddwi,ddoxi
ッドゥ,ddwu,ddoxu
ッドェ,ddwe,ddoxe
ッドォ,ddwo,ddoxo
ッファ,hhwa,ffa,hhuxa,ffuxa
ッフィ,hhwi,ffi,hhuxi,ffuxi
ッフェ,hhwe,ffe,hhuxe,ffuxe
ッフォ,hhwo,ffo,hhuxo,ffuxo
ッヴァ,vva,vvuxa
ッヴィ,vvi,vvuxi
ッヴ,vvu
ッヴェ,vve,vvuxe
ッヴォ,vvo,vvuxo
ッテァ,ttha,ttexa
ッティ,tthi,ttexi
ッテェ,tthe,ttexe
ッテャ,ttha,ttexya
ッテュ,tthu,ttexyu
ッテョ,ttho,ttexyo
ッフャ,ffya,hhuxya,ffuxya
ッフュ,ffyu,hhuxyu,ffuxyu
ッフョ,ffyo,hhuxyo,ffuxyo
ッヴャ,vvya,vvuxya
ッヴュ,vvyu,vvuxyu
ッヴョ,vvyo,vvuxyo
ァ,xa
ィ,xi
ゥ,xu
ェ,xe
ォ,xo
ヵ,xka
ヶ,xke
ッ,xtu
ャ,xya
ュ,xyu
ョ,xyo
ッk,kk
ッg,gg
ッs,ss
ッz,zz
ッt,tt
ッd,dd
ッh,hh
ッb,bb
ッp,pp
ッm,mm
ッy,yy
ッr,rr
ッw,ww
# below are characters that should be kept but have no explicit romanization rules.
# Chōonpu (Katakana-Hiragana Prolonged Sound Mark)
ー,ー
# Interpunct (Middle Dot)
・,・
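The sokuon (ッ) entries above are what make geminate consonants come out naturally: for "サッカ", after "サ" -> "sa" the longest-match lookup prefers the two-character keystroke "ッカ" -> "kka" over the single "ッ" -> "xtu", yielding "sakka". A sketch mirroring this commit's tests (the demo class is hypothetical):

import org.apache.lucene.analysis.ja.completion.KatakanaRomanizer;
import org.apache.lucene.util.CharsRef;

// Hypothetical demo, not part of this commit.
public class LongestMatchDemo {
  public static void main(String[] args) {
    // Single candidate: the "ッカ" -> kka rule wins over "ッ" -> xtu followed by "カ" -> ka.
    System.out.println(KatakanaRomanizer.getInstance().romanize(new CharsRef("サッカ"))); // [sakka]
  }
}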


@@ -0,0 +1,72 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja;
import java.io.IOException;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.junit.Test;
public class TestJapaneseCompletionAnalyzer extends BaseTokenStreamTestCase {
@Test
public void testCompletionDefault() throws IOException {
// mode=INDEX (default)
Analyzer analyzer = new JapaneseCompletionAnalyzer();
assertAnalyzesTo(
analyzer,
"東京",
new String[] {"東京", "toukyou"},
new int[] {0, 0},
new int[] {2, 2},
new int[] {1, 0});
analyzer.close();
}
@Test
public void testCompletionQuery() throws IOException {
// mode=QUERY
Analyzer analyzer = new JapaneseCompletionAnalyzer(null, JapaneseCompletionFilter.Mode.QUERY);
assertAnalyzesTo(
analyzer,
"東京t",
new String[] {"東京t", "toukyout"},
new int[] {0, 0},
new int[] {3, 3},
new int[] {1, 0});
analyzer.close();
}
/** blast random strings against the analyzer */
@Test
public void testRandom() throws IOException {
Random random = random();
final Analyzer a = new JapaneseCompletionAnalyzer();
checkRandomData(random, a, atLeast(100));
a.close();
}
/** blast some random large strings through the analyzer */
@Test
public void testRandomHugeStrings() throws Exception {
Random random = random();
final Analyzer a = new JapaneseCompletionAnalyzer();
checkRandomData(random, a, 2 * RANDOM_MULTIPLIER, 8192);
a.close();
}
}


@@ -0,0 +1,271 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKWidthCharFilter;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.util.IOUtils;
import org.junit.Test;
public class TestJapaneseCompletionFilter extends BaseTokenStreamTestCase {
private Analyzer indexAnalyzer;
private Analyzer queryAnalyzer;
@Override
public void setUp() throws Exception {
super.setUp();
indexAnalyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.NORMAL);
return new TokenStreamComponents(
tokenizer,
new JapaneseCompletionFilter(tokenizer, JapaneseCompletionFilter.Mode.INDEX));
}
@Override
protected Reader initReader(String fieldName, Reader reader) {
return new CJKWidthCharFilter(reader);
}
@Override
protected Reader initReaderForNormalization(String fieldName, Reader reader) {
return new CJKWidthCharFilter(reader);
}
};
queryAnalyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.NORMAL);
return new TokenStreamComponents(
tokenizer,
new JapaneseCompletionFilter(tokenizer, JapaneseCompletionFilter.Mode.QUERY));
}
@Override
protected Reader initReader(String fieldName, Reader reader) {
return new CJKWidthCharFilter(reader);
}
@Override
protected Reader initReaderForNormalization(String fieldName, Reader reader) {
return new CJKWidthCharFilter(reader);
}
};
}
@Override
public void tearDown() throws Exception {
IOUtils.close(indexAnalyzer);
IOUtils.close(queryAnalyzer);
super.tearDown();
}
@Test
public void testCompletionIndex() throws IOException {
assertAnalyzesTo(
indexAnalyzer,
"東京",
new String[] {"東京", "toukyou"},
new int[] {0, 0},
new int[] {2, 2},
new int[] {1, 0});
assertAnalyzesTo(
indexAnalyzer,
"東京都",
new String[] {"東京", "toukyou", "", "to"},
new int[] {0, 0, 2, 2},
new int[] {2, 2, 3, 3},
new int[] {1, 0, 1, 0});
assertAnalyzesTo(
indexAnalyzer,
"ドラえもん",
new String[] {"ドラえもん", "doraemon", "doraemonn"},
new int[] {0, 0, 0},
new int[] {5, 5, 5},
new int[] {1, 0, 0});
assertAnalyzesTo(
indexAnalyzer,
"ソースコード",
new String[] {"ソース", "soーsu", "コード", "koーdo"},
new int[] {0, 0, 3, 3},
new int[] {3, 3, 6, 6},
new int[] {1, 0, 1, 0});
assertAnalyzesTo(
indexAnalyzer,
"反社会的勢力",
new String[] {"", "han", "hann", "社会", "syakai", "shakai", "", "teki", "勢力", "seiryoku"},
new int[] {0, 0, 0, 1, 1, 1, 3, 3, 4, 4},
new int[] {1, 1, 1, 3, 3, 3, 4, 4, 6, 6},
new int[] {1, 0, 0, 1, 0, 0, 1, 0, 1, 0});
assertAnalyzesTo(
indexAnalyzer, "", new String[] {""}, new int[] {0}, new int[] {1}, new int[] {1});
assertAnalyzesTo(
indexAnalyzer,
"是々",
new String[] {"", "ze", ""},
new int[] {0, 0, 1},
new int[] {1, 1, 2},
new int[] {1, 0, 1});
assertAnalyzesTo(
indexAnalyzer,
"是々の",
new String[] {"", "ze", "", "", "no"},
new int[] {0, 0, 1, 2, 2},
new int[] {1, 1, 2, 3, 3},
new int[] {1, 0, 1, 1, 0});
}
@Test
public void testCompletionQuery() throws IOException {
assertAnalyzesTo(
queryAnalyzer,
"東京",
new String[] {"東京", "toukyou"},
new int[] {0, 0},
new int[] {2, 2},
new int[] {1, 0});
assertAnalyzesTo(
queryAnalyzer,
"東京都",
new String[] {"東京", "toukyou", "", "to"},
new int[] {0, 0, 2, 2},
new int[] {2, 2, 3, 3},
new int[] {1, 0, 1, 0});
assertAnalyzesTo(
queryAnalyzer,
"ドラえもん",
new String[] {"ドラえもん", "doraemon", "doraemonn"},
new int[] {0, 0, 0},
new int[] {5, 5, 5},
new int[] {1, 0, 0});
assertAnalyzesTo(
queryAnalyzer,
"ソースコード",
new String[] {"ソースコード", "soーsukoーdo"},
new int[] {0, 0},
new int[] {6, 6},
new int[] {1, 0});
assertAnalyzesTo(
queryAnalyzer,
"反社会的勢力",
new String[] {"", "han", "hann", "社会", "syakai", "shakai", "", "teki", "勢力", "seiryoku"},
new int[] {0, 0, 0, 1, 1, 1, 3, 3, 4, 4},
new int[] {1, 1, 1, 3, 3, 3, 4, 4, 6, 6},
new int[] {1, 0, 0, 1, 0, 0, 1, 0, 1, 0});
assertAnalyzesTo(
queryAnalyzer, "", new String[] {""}, new int[] {0}, new int[] {1}, new int[] {1});
assertAnalyzesTo(
queryAnalyzer,
"是々",
new String[] {"", "ze", ""},
new int[] {0, 0, 1},
new int[] {1, 1, 2},
new int[] {1, 0, 1});
assertAnalyzesTo(
queryAnalyzer,
"是々の",
new String[] {"", "ze", "", "", "no"},
new int[] {0, 0, 1, 2, 2},
new int[] {1, 1, 2, 3, 3},
new int[] {1, 0, 1, 1, 0});
assertAnalyzesTo(
queryAnalyzer,
"東京t",
new String[] {"東京t", "toukyout"},
new int[] {0, 0},
new int[] {3, 3},
new int[] {1, 0});
assertAnalyzesTo(
queryAnalyzer,
"サッk",
new String[] {"サッk", "sakk"},
new int[] {0, 0},
new int[] {3, 3},
new int[] {1, 0});
assertAnalyzesTo(
queryAnalyzer,
"反sy",
new String[] {"反sy", "hansy", "hannsy"},
new int[] {0, 0, 0},
new int[] {3, 3, 3},
new int[] {1, 0, 0});
assertAnalyzesTo(
queryAnalyzer,
"さーきゅr",
new String[] {"さーきゅr", "saーkyur"},
new int[] {0, 0},
new int[] {5, 5},
new int[] {1, 0});
assertAnalyzesTo(
queryAnalyzer,
"是々h",
new String[] {"", "ze", "々h"},
new int[] {0, 0, 1},
new int[] {1, 1, 3},
new int[] {1, 0, 1});
}
public void testEnglish() throws IOException {
assertAnalyzesTo(indexAnalyzer, "this atest", new String[] {"this", "atest"});
assertAnalyzesTo(queryAnalyzer, "this atest", new String[] {"this", "atest"});
}
public void testRandomStrings() throws IOException {
checkRandomData(random(), indexAnalyzer, atLeast(200));
checkRandomData(random(), queryAnalyzer, atLeast(200));
}
public void testEmptyTerm() throws IOException {
Analyzer a =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
return new TokenStreamComponents(tokenizer, new JapaneseCompletionFilter(tokenizer));
}
};
checkOneTerm(a, "", "");
a.close();
}
}


@@ -0,0 +1,59 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKWidthFilterFactory;
import org.junit.Test;
public class TestJapaneseCompletionFilterFactory extends BaseTokenStreamFactoryTestCase {
@Test
public void testCompletion() throws IOException {
JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap<>());
TokenStream tokenStream = tokenizerFactory.create();
((Tokenizer) tokenStream).setReader(new StringReader("東京t"));
CJKWidthFilterFactory cjkWidthFactory = new CJKWidthFilterFactory(new HashMap<>());
tokenStream = cjkWidthFactory.create(tokenStream);
Map<String, String> map = new HashMap<>();
map.put("mode", "QUERY");
JapaneseCompletionFilterFactory filterFactory = new JapaneseCompletionFilterFactory(map);
assertTokenStreamContents(filterFactory.create(tokenStream), new String[] {"東京t", "toukyout"});
}
/** Test that bogus arguments result in exception */
@Test
public void testBogusArguments() throws Exception {
IllegalArgumentException expected =
expectThrows(
IllegalArgumentException.class,
() -> {
new JapaneseCompletionFilterFactory(
new HashMap<String, String>() {
{
put("bogusArg", "bogusValue");
}
});
});
assertTrue(expected.getMessage().contains("Unknown parameters"));
}
}


@@ -0,0 +1,68 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja.completion;
import java.util.List;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
public class TestKatakanaRomanizer extends LuceneTestCase {
private final KatakanaRomanizer romanizer = KatakanaRomanizer.getInstance();
@Test
public void testRomanize() {
assertCharsRefListEqualsUnordered(
List.of(new CharsRef("hasi"), new CharsRef("hashi")),
romanizer.romanize(new CharsRef("ハシ")));
assertCharsRefListEqualsUnordered(
List.of(new CharsRef("yuukyuu")), romanizer.romanize(new CharsRef("ユウキュウ")));
assertCharsRefListEqualsUnordered(
List.of(new CharsRef("yakyuu")), romanizer.romanize(new CharsRef("ヤキュウ")));
assertCharsRefListEqualsUnordered(
List.of(new CharsRef("toukyou")), romanizer.romanize(new CharsRef("トウキョウ")));
assertCharsRefListEqualsUnordered(
List.of(new CharsRef("toーkyoー")), romanizer.romanize(new CharsRef("トーキョー")));
assertCharsRefListEqualsUnordered(
List.of(new CharsRef("sakka")), romanizer.romanize(new CharsRef("サッカ")));
assertCharsRefListEqualsUnordered(
List.of(new CharsRef("hyakkaten"), new CharsRef("hyakkatenn")),
romanizer.romanize(new CharsRef("ヒャッカテン")));
assertCharsRefListEqualsUnordered(
List.of(new CharsRef("voruteーru"), new CharsRef("vuxoruteーru")),
romanizer.romanize(new CharsRef("ヴォルテール")));
}
@Test
public void testRomanizeWithAlphabets() {
assertCharsRefListEqualsUnordered(
List.of(new CharsRef("toukyout")), romanizer.romanize(new CharsRef("トウキョウt")));
assertCharsRefListEqualsUnordered(
List.of(new CharsRef("kodakk")), romanizer.romanize(new CharsRef("コダッk")));
assertCharsRefListEqualsUnordered(
List.of(new CharsRef("syousy"), new CharsRef("shousy")),
romanizer.romanize(new CharsRef("ショウsy")));
}
private static void assertCharsRefListEqualsUnordered(
List<CharsRef> expected, List<CharsRef> actual) {
assertEquals(expected.size(), actual.size());
for (CharsRef ref : expected) {
assertTrue(ref.toString() + " is not contained in " + actual, actual.contains(ref));
}
}
}