mirror of https://github.com/apache/lucene.git
LUCENE-10102: Add JapaneseCompletionFilter for Input Method-aware auto-completion (#297)
Co-authored-by: Robert Muir <rmuir@apache.org>
parent de45b68c90
commit 4e86df96c0
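For a sense of what this commit adds, here is a minimal usage sketch (not part of the commit; the CompletionDemo class and the "suggest" field name are made up for illustration, and the expected tokens follow the tests added below):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseCompletionAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseCompletionFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CompletionDemo {
  public static void main(String[] args) throws Exception {
    // QUERY mode keeps the surface form and also emits a romanized form, so a prefix a user
    // has typed mid-IME-composition ("東京t") can still match suggestions indexed in INDEX mode.
    try (Analyzer analyzer =
        new JapaneseCompletionAnalyzer(null, JapaneseCompletionFilter.Mode.QUERY)) {
      try (TokenStream ts = analyzer.tokenStream("suggest", "東京t")) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          System.out.println(term.toString()); // prints: 東京t, toukyout
        }
        ts.end();
      }
    }
  }
}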
@@ -21,6 +21,8 @@ New Features

* LUCENE-10096: Add TamilAnalyzer based on the snowball stemmer. (Robert Muir)

* LUCENE-10102: Add JapaneseCompletionFilter for Input Method-aware auto-completion (Tomoko Uchida, Robert Muir, Jun Ohtani)

System Requirements

* LUCENE-8738: Move to Java 11 as minimum Java version.

@@ -0,0 +1,65 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ja;

import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKWidthCharFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.ja.dict.UserDictionary;

/**
 * Analyzer for Japanese completion suggester.
 *
 * @see JapaneseCompletionFilter
 */
public class JapaneseCompletionAnalyzer extends Analyzer {
  private final JapaneseCompletionFilter.Mode mode;
  private final UserDictionary userDict;

  /** Creates a new {@code JapaneseCompletionAnalyzer} with default configurations */
  public JapaneseCompletionAnalyzer() {
    this(null, JapaneseCompletionFilter.Mode.INDEX);
  }

  /** Creates a new {@code JapaneseCompletionAnalyzer} */
  public JapaneseCompletionAnalyzer(UserDictionary userDict, JapaneseCompletionFilter.Mode mode) {
    this.userDict = userDict;
    this.mode = mode;
  }

  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer =
        new JapaneseTokenizer(userDict, true, true, JapaneseTokenizer.Mode.NORMAL);
    TokenStream stream = new JapaneseCompletionFilter(tokenizer, mode);
    stream = new LowerCaseFilter(stream);
    return new TokenStreamComponents(tokenizer, stream);
  }

  @Override
  protected Reader initReader(String fieldName, Reader reader) {
    return new CJKWidthCharFilter(reader);
  }

  @Override
  protected Reader initReaderForNormalization(String fieldName, Reader reader) {
    return new CJKWidthCharFilter(reader);
  }
}

@@ -0,0 +1,267 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ja;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.completion.CharSequenceUtils;
import org.apache.lucene.analysis.ja.completion.KatakanaRomanizer;
import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;

/**
 * A {@link org.apache.lucene.analysis.TokenFilter} that adds Japanese romanized tokens to the term
 * attribute. Also, this keeps original tokens (surface forms). Main usage of this filter is Query
 * Auto-Completion.
 *
 * <p>Supported romanization forms: (modified) Hepburn-shiki, Kunrei-shiki (Nihon-shiki) and Wāpuro
 * shiki.
 *
 * <p>This does NOT support some romaji forms which are official but not commonly used with
 * Japanese <a href="https://en.wikipedia.org/wiki/Input_method">Input Methods</a>. e.g.: circumflex
 * or macron representing <a href="https://en.wikipedia.org/wiki/Ch%C5%8Donpu">Chōonpu (長音符)</a> are
 * not supported.
 *
 * <p>The romanization behaviour changes according to its {@link Mode}. The default mode is {@link
 * Mode#INDEX}.
 *
 * <p>Note: This filter must be applied AFTER half-width and full-width normalization. Please ensure
 * that a width normalizer such as {@link org.apache.lucene.analysis.cjk.CJKWidthCharFilter} or
 * {@link org.apache.lucene.analysis.cjk.CJKWidthFilter} is included in your analysis chain. IF THE
 * WIDTH NORMALIZATION IS NOT PERFORMED, THIS DOES NOT WORK AS EXPECTED. See also: {@link
 * JapaneseCompletionAnalyzer}.
 */
public final class JapaneseCompletionFilter extends TokenFilter {
  public static final Mode DEFAULT_MODE = Mode.INDEX;

  private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
  private final ReadingAttribute readingAttr = addAttribute(ReadingAttribute.class);
  private final PositionIncrementAttribute posIncAtt =
      addAttribute(PositionIncrementAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  private final CompletionTokenGenerator tokenGenerator;

  /** Completion mode */
  public enum Mode {
    /** Simple romanization. Expected to be used when indexing. */
    INDEX,
    /** Input Method aware romanization. Expected to be used when querying. */
    QUERY
  }

  /** Creates a new {@code JapaneseCompletionFilter} with default configurations */
  public JapaneseCompletionFilter(TokenStream input) {
    this(input, DEFAULT_MODE);
  }

  /** Creates a new {@code JapaneseCompletionFilter} */
  public JapaneseCompletionFilter(TokenStream input, Mode mode) {
    super(input);
    this.tokenGenerator = new CompletionTokenGenerator(mode);
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    tokenGenerator.reset();
  }

  @Override
  public boolean incrementToken() throws IOException {
    mayIncrementToken();
    if (tokenGenerator.hasNext()) {
      clearAttributes();
      CompletionToken token = tokenGenerator.next();
      termAttr.setEmpty().append(token.term);
      if (token.isFirst) {
        posIncAtt.setPositionIncrement(1);
      } else {
        posIncAtt.setPositionIncrement(0);
      }
      offsetAtt.setOffset(token.startOffset, token.endOffset);
      return true;
    } else {
      return false;
    }
  }

  private void mayIncrementToken() throws IOException {
    while (!tokenGenerator.hasNext()) {
      if (input.incrementToken()) {
        String surface = termAttr.toString();
        String reading = readingAttr.getReading();
        int startOffset = offsetAtt.startOffset();
        int endOffset = offsetAtt.endOffset();
        if (reading == null && CharSequenceUtils.isKana(surface)) {
          // use the surface form as reading when possible.
          reading = CharSequenceUtils.toKatakana(surface);
        }
        tokenGenerator.addToken(surface, reading, startOffset, endOffset);
      } else {
        if (tokenGenerator.hasPendingToken()) {
          // a pending token remains.
          tokenGenerator.finish();
        } else {
          // already consumed all tokens. there's no next token to output.
          break;
        }
      }
    }
  }

  private static class CompletionToken {
    final String term;
    final boolean isFirst;
    final int startOffset;
    final int endOffset;

    CompletionToken(String term, boolean isFirst, int startOffset, int endOffset) {
      this.term = term;
      this.isFirst = isFirst;
      this.startOffset = startOffset;
      this.endOffset = endOffset;
    }
  }

  private static class CompletionTokenGenerator implements Iterator<CompletionToken> {

    private final Mode mode;

    private List<CompletionToken> outputs;

    private CharsRefBuilder pdgSurface;
    private CharsRefBuilder pdgReading;
    private int pdgStartOffset;
    private int pdgEndOffset;

    CompletionTokenGenerator(Mode mode) {
      this.mode = mode;
      outputs = new ArrayList<>();
    }

    public void reset() {
      clearPendingToken();
      outputs.clear();
    }

    @Override
    public boolean hasNext() {
      return outputs.size() > 0;
    }

    @Override
    public CompletionToken next() {
      return outputs.remove(0);
    }

    void addToken(String surface, String reading, int startOffset, int endOffset) {
      assert surface != null : "surface must not be null.";

      if (hasPendingToken()) {
        if (mode == Mode.QUERY
            && pdgReading != null
            && !CharSequenceUtils.isLowercaseAlphabets(pdgSurface.get())
            && CharSequenceUtils.isLowercaseAlphabets(surface)) {
          // words that are in mid-IME composition are split into two tokens by JapaneseTokenizer;
          // should be recovered when querying.
          // Note: in this case, the reading attribute is null; use the surface form in place of the
          // reading.
          // e.g.: "サッ" + "k" => "サッk", "反" + "sy" => "反sy"
          pdgSurface.append(surface);
          pdgReading.append(surface);
          pdgEndOffset = endOffset;
          generateOutputs();
          clearPendingToken();
        } else if (mode == Mode.QUERY
            && CharSequenceUtils.isKana(pdgSurface.get())
            && CharSequenceUtils.isKana(surface)) {
          // words that are all composed only of Katakana or Hiragana should be concatenated when
          // querying.
          // e.g.: "こい" + "ぬ" => "こいぬ"
          pdgSurface.append(surface);
          pdgReading.append(reading);
          pdgEndOffset = endOffset;
        } else {
          generateOutputs();
          resetPendingToken(surface, reading, startOffset, endOffset);
        }
      } else {
        resetPendingToken(surface, reading, startOffset, endOffset);
      }
    }

    void finish() {
      generateOutputs();
      clearPendingToken();
    }

    private void generateOutputs() {
      // preserve original surface form as an output.
      outputs.add(new CompletionToken(pdgSurface.toString(), true, pdgStartOffset, pdgEndOffset));
      // skip readings that cannot be translated to romaji.
      if (pdgReading == null
          || pdgReading.length() == 0
          || !CharSequenceUtils.isKatakanaOrHWAlphabets(pdgReading.get())) {
        return;
      }
      // translate the reading to romaji.
      List<CharsRef> romaji = KatakanaRomanizer.getInstance().romanize(pdgReading.get());
      for (CharsRef ref : romaji) {
        // set the same start/end offset as the original surface form for romanized tokens.
        outputs.add(new CompletionToken(ref.toString(), false, pdgStartOffset, pdgEndOffset));
      }
    }

    boolean hasPendingToken() {
      return pdgSurface != null;
    }

    void resetPendingToken(
        CharSequence surface, CharSequence reading, int startOffset, int endOffset) {
      if (this.pdgSurface == null) {
        this.pdgSurface = new CharsRefBuilder();
      } else {
        this.pdgSurface.clear();
      }
      this.pdgSurface.append(surface);
      if (this.pdgReading == null) {
        this.pdgReading = new CharsRefBuilder();
      } else {
        this.pdgReading.clear();
      }
      this.pdgReading.append(reading);
      this.pdgStartOffset = startOffset;
      this.pdgEndOffset = endOffset;
    }

    void clearPendingToken() {
      this.pdgSurface = null;
      this.pdgReading = null;
      this.pdgStartOffset = 0;
      this.pdgEndOffset = 0;
    }
  }
}

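To illustrate the mode difference described in the class javadoc, a short sketch (a fragment, not part of the commit; the expected tokens mirror the tests added later in this commit):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ja.JapaneseCompletionAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseCompletionFilter;

// INDEX mode romanizes token by token:
//   "ソースコード" -> ソース, soーsu, コード, koーdo
// QUERY mode concatenates runs of kana (and trailing alphabet fragments typed mid-IME) before
// romanizing, which matches what a user has typed so far:
//   "ソースコード" -> ソースコード, soーsukoーdo
Analyzer indexAnalyzer = new JapaneseCompletionAnalyzer(null, JapaneseCompletionFilter.Mode.INDEX);
Analyzer queryAnalyzer = new JapaneseCompletionAnalyzer(null, JapaneseCompletionFilter.Mode.QUERY);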
@@ -0,0 +1,66 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ja;

import java.util.Locale;
import java.util.Map;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;

/**
 * Factory for {@link JapaneseCompletionFilter}.
 *
 * <p>Supported attributes:
 *
 * <ul>
 *   <li>mode: Completion mode. see {@link JapaneseCompletionFilter.Mode}
 * </ul>
 *
 * @lucene.spi {@value #NAME}
 */
public class JapaneseCompletionFilterFactory extends TokenFilterFactory {

  /** SPI name */
  public static final String NAME = "japaneseCompletion";

  private static final String MODE_PARAM = "mode";
  private final JapaneseCompletionFilter.Mode mode;

  /** Creates a new {@code JapaneseCompletionFilterFactory} */
  public JapaneseCompletionFilterFactory(Map<String, String> args) {
    super(args);
    mode =
        JapaneseCompletionFilter.Mode.valueOf(
            get(
                args,
                MODE_PARAM,
                JapaneseCompletionFilter.DEFAULT_MODE.name().toUpperCase(Locale.ROOT)));
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  /** Default ctor for compatibility with SPI */
  public JapaneseCompletionFilterFactory() {
    throw defaultCtorException();
  }

  @Override
  public TokenStream create(TokenStream input) {
    return new JapaneseCompletionFilter(input, mode);
  }
}

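Because the factory registers under the SPI name "japaneseCompletion" (see the services file below), the filter can also be wired up by name. A hedged sketch using CustomAnalyzer; it assumes the existing SPI names "japanese" (JapaneseTokenizerFactory) and "cjkWidth" (CJKWidthFilterFactory), and the chain throws IOException:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;

Analyzer analyzer =
    CustomAnalyzer.builder()
        .withTokenizer("japanese")
        // width normalization must run before the completion filter, per the javadoc above
        .addTokenFilter("cjkWidth")
        .addTokenFilter("japaneseCompletion", "mode", "QUERY")
        .build();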
@@ -0,0 +1,91 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ja.completion;

/** Utility functions for {@link org.apache.lucene.analysis.ja.JapaneseCompletionFilter} */
public class CharSequenceUtils {

  /** Checks if a char sequence is composed only of lowercase alphabets */
  public static boolean isLowercaseAlphabets(CharSequence s) {
    for (int i = 0; i < s.length(); i++) {
      char ch = s.charAt(i);
      if (!(isHalfWidthLowercaseAlphabet(ch) || isFullWidthLowercaseAlphabet(ch))) {
        return false;
      }
    }
    return true;
  }

  /** Checks if a char sequence is composed only of Katakana or hiragana */
  public static boolean isKana(CharSequence s) {
    for (int i = 0; i < s.length(); i++) {
      char ch = s.charAt(i);
      if (!(isHiragana(ch) || isKatakana(ch))) {
        return false;
      }
    }
    return true;
  }

  /** Checks if a char sequence is composed only of Katakana or lowercase alphabets */
  public static boolean isKatakanaOrHWAlphabets(CharSequence ref) {
    for (int i = 0; i < ref.length(); i++) {
      char ch = ref.charAt(i);
      if (!isKatakana(ch) && !isHalfWidthLowercaseAlphabet(ch)) {
        return false;
      }
    }
    return true;
  }

  /** Checks if a char is a Hiragana */
  private static boolean isHiragana(char ch) {
    return ch >= 0x3040 && ch <= 0x309f;
  }

  /** Checks if a char is a Katakana */
  private static boolean isKatakana(char ch) {
    return ch >= 0x30a0 && ch <= 0x30ff;
  }

  /** Checks if a char is a half-width lowercase alphabet */
  private static boolean isHalfWidthLowercaseAlphabet(char ch) {
    return ch >= 0x61 && ch <= 0x7a;
  }

  /** Checks if a char is a full-width lowercase alphabet */
  public static boolean isFullWidthLowercaseAlphabet(char ch) {
    return ch >= 0xff41 && ch <= 0xff5a;
  }

  /** Convert all hiragana in a string into katakana */
  public static String toKatakana(CharSequence s) {
    char[] chars = new char[s.length()];
    for (int i = 0; i < s.length(); i++) {
      char ch = s.charAt(i);
      // if the character is from 'ぁ' to 'ゖ' or 'ゝ' or 'ゞ', it can be converted to katakana.
      if (ch >= 0x3041 && ch <= 0x3096 || ch == 0x309d || ch == 0x309e) {
        chars[i] = (char) (ch + 0x60);
      } else {
        chars[i] = ch;
      }
    }
    return new String(chars);
  }

  private CharSequenceUtils() {}
}

@@ -0,0 +1,193 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ja.completion;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;

/**
 * Converts a Katakana string to <a
 * href="https://en.wikipedia.org/wiki/Romanization_of_Japanese">Romaji</a> using the pre-defined
 * Katakana-Romaji mapping rules. Internally, this repeatedly performs prefix match on the given
 * char sequence to the pre-built keystroke array until it reaches the end of the sequence, or there
 * are no matched keystrokes.
 */
public class KatakanaRomanizer {
  private static final String ROMAJI_MAP_FILE = "romaji_map.txt";

  private static KatakanaRomanizer INSTANCE;

  static {
    // Build romaji-map and keystroke arrays from the pre-defined Katakana-Romaji mapping file.
    try (InputStreamReader is =
            new InputStreamReader(
                KatakanaRomanizer.class.getResourceAsStream(ROMAJI_MAP_FILE),
                Charset.forName("UTF-8"));
        BufferedReader ir = new BufferedReader(is)) {
      Map<CharsRef, List<CharsRef>> romajiMap = new HashMap<>();
      String line;
      while ((line = ir.readLine()) != null) {
        if (line.startsWith("#")) {
          continue;
        }
        String[] cols = line.trim().split(",");
        if (cols.length < 2) {
          continue;
        }
        CharsRef prefix = new CharsRef(cols[0]);
        romajiMap.put(prefix, new ArrayList<>());
        for (int i = 1; i < cols.length; i++) {
          romajiMap.get(prefix).add(new CharsRef(cols[i]));
        }
      }

      Set<CharsRef> keystrokeSet = romajiMap.keySet();
      int maxKeystrokeLength = keystrokeSet.stream().mapToInt(CharsRef::length).max().getAsInt();
      CharsRef[][] keystrokes = new CharsRef[maxKeystrokeLength][];
      for (int len = 0; len < maxKeystrokeLength; len++) {
        final int l = len;
        keystrokes[l] =
            keystrokeSet.stream().filter(k -> k.length - 1 == l).toArray(CharsRef[]::new);
      }
      for (CharsRef[] ks : keystrokes) {
        // keystroke array must be sorted in ascending order for binary search.
        Arrays.sort(ks);
      }

      INSTANCE = new KatakanaRomanizer(keystrokes, romajiMap);
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  private final CharsRef[][] keystrokes;
  private final Map<CharsRef, List<CharsRef>> romajiMap;

  /** Returns the singleton instance of {@code KatakanaRomanizer} */
  public static KatakanaRomanizer getInstance() {
    return INSTANCE;
  }

  private KatakanaRomanizer(CharsRef[][] keystrokes, Map<CharsRef, List<CharsRef>> romajiMap) {
    this.keystrokes = keystrokes;
    this.romajiMap = romajiMap;
  }

  /**
   * Translates a sequence of katakana to romaji. An input can produce multiple outputs because a
   * keystroke can be mapped to multiple romajis.
   */
  public List<CharsRef> romanize(CharsRef input) {
    assert CharSequenceUtils.isKatakanaOrHWAlphabets(input);

    List<CharsRefBuilder> pendingOutputs = new ArrayList<>();
    int pos = 0;
    while (pos < input.length) {
      // Greedily looks up the longest matched keystroke.
      // e.g.: Consider input="キョウ", then there are two matched keystrokes (romaji mapping rules)
      // "キ" -> "ki" and "キョ" -> "kyo". Only the longest one "キョ" will be selected.
      MatchedKeystroke matched = longestKeystrokeMatch(input, pos);
      if (matched == null) {
        break;
      }

      List<CharsRef> candidates =
          romajiMap.get(keystrokes[matched.keystrokeLen - 1][matched.keystrokeIndex]);

      if (pendingOutputs.size() == 0) {
        // There is no pending output.
        // Add the matched keystrokes to pending outputs list.
        for (CharsRef cref : candidates) {
          CharsRefBuilder output = new CharsRefBuilder();
          output.copyChars(cref);
          pendingOutputs.add(output);
        }
      } else if (candidates.size() == 1) {
        // There are one or more pending output(s) and one matched keystroke.
        // Append the matched keystroke to all pending outputs.
        // e.g.: Consider we already have two pending outputs "shi" and "si" and the matched
        // keystroke "ka";
        // then results are "shika" and "sika".
        CharsRef cref = candidates.get(0);
        for (CharsRefBuilder pdgOutput : pendingOutputs) {
          pdgOutput.append(cref.chars, 0, cref.length);
        }
      } else {
        // There are one or more pending output(s) and multiple matched keystrokes.
        // Combine the matched keystrokes to all pending outputs.
        // e.g.: Consider we already have two pending outputs "shi" and "si" and the matched
        // keystroke "n" and "nn".
        // To produce all possible keystroke patterns, result outputs should be "shin", "shinn",
        // "sin" and "sinn".
        List<CharsRefBuilder> outputs = new ArrayList<>();
        for (CharsRef cref : candidates) {
          for (CharsRefBuilder pdgOutput : pendingOutputs) {
            CharsRefBuilder buffer = new CharsRefBuilder();
            buffer.copyChars(pdgOutput.chars(), 0, pdgOutput.length());
            buffer.append(cref.chars, cref.offset, cref.length);
            outputs.add(buffer);
          }
        }
        // update the pending outputs
        pendingOutputs = outputs;
      }

      // proceed to the next input position
      pos += matched.keystrokeLen;
    }

    if (pos < input.length) {
      // add the remnants (that cannot be mapped to any romaji) as suffix
      for (CharsRefBuilder output : pendingOutputs) {
        output.append(input.chars, pos, input.length - pos);
      }
    }
    return pendingOutputs.stream().map(CharsRefBuilder::get).collect(Collectors.toList());
  }

  private MatchedKeystroke longestKeystrokeMatch(CharsRef input, int inputOffset) {
    for (int len = Math.min(input.length - inputOffset, keystrokes.length); len > 0; len--) {
      CharsRef ref = new CharsRef(input.chars, inputOffset, len);
      int index = Arrays.binarySearch(keystrokes[len - 1], ref);
      if (index >= 0) {
        return new MatchedKeystroke(len, index);
      }
    }
    // there's no matched keystroke
    return null;
  }

  private static class MatchedKeystroke {
    final int keystrokeLen;
    final int keystrokeIndex;

    MatchedKeystroke(int keystrokeLen, int keystrokeIndex) {
      this.keystrokeLen = keystrokeLen;
      this.keystrokeIndex = keystrokeIndex;
    }
  }
}

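A quick sketch of the romanizer's behaviour (a fragment mirroring TestKatakanaRomanizer added later in this commit; not part of the commit):

import java.util.List;
import org.apache.lucene.analysis.ja.completion.KatakanaRomanizer;
import org.apache.lucene.util.CharsRef;

// "シ" matches the multi-valued rule "シ,si,shi", so both spellings are produced.
List<CharsRef> romaji = KatakanaRomanizer.getInstance().romanize(new CharsRef("ハシ"));
// romaji contains "hasi" and "hashi"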
@@ -0,0 +1,19 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/** Utilities for {@link org.apache.lucene.analysis.ja.JapaneseCompletionFilter} */
package org.apache.lucene.analysis.ja.completion;

@@ -14,6 +14,7 @@
# limitations under the License.

org.apache.lucene.analysis.ja.JapaneseBaseFormFilterFactory
org.apache.lucene.analysis.ja.JapaneseCompletionFilterFactory
org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilterFactory
org.apache.lucene.analysis.ja.JapaneseNumberFilterFactory
org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilterFactory

@@ -0,0 +1,344 @@
# mapping rules of katakana (a unit of keystroke) to a list of acceptable romanizations.
# longest-match is used to find entries in this list.
# covers romanization systems: modified Hepburn-shiki, Kunrei-shiki (Nihon-shiki), and Wāpuro shiki.
# note: this does not strictly comply with the romanization systems listed above,
# but tries to cover possible keystrokes supported by various Input Methods.

ア,a
イ,i
ウ,u
エ,e
オ,o
カ,ka
キ,ki
ク,ku
ケ,ke
コ,ko
キャ,kya
キュ,kyu
キョ,kyo
ガ,ga
ギ,gi
グ,gu
ゲ,ge
ゴ,go
ギャ,gya
ギュ,gyu
ギョ,gyo
サ,sa
シ,si,shi
ス,su
セ,se
ソ,so
シャ,sya,sha
シュ,syu,shu
シェ,sye,she
ショ,syo,sho
ザ,za
ジ,zi,ji
ズ,zu
ゼ,ze
ゾ,zo
ジャ,zya,ja
ジュ,zyu,ju
ジェ,zye,je
ジョ,zyo,jo
タ,ta
チ,ti,chi
ツ,tu,tsu
テ,te
ト,to
チャ,tya,cha,cya
チュ,tyu,chu,cyu
チョ,tyo,cho,cyo
ダ,da
ヂ,di,zi,ji
ヅ,du,zu
デ,de
ド,do
ヂャ,dya,zya,ja
ヂュ,dyu,zyu,ju
ヂョ,dyo,zyo,jo
ナ,na
ニ,ni
ヌ,nu
ネ,ne
ノ,no
ニャ,nya
ニュ,nyu
ニョ,nyo
ハ,ha
ヒ,hi
フ,hu,fu
ヘ,he
ホ,ho
ヒャ,hya
ヒュ,hyu
ヒョ,hyo
バ,ba
ビ,bi
ブ,bu
ベ,be
ボ,bo
ビャ,bya
ビュ,byu
ビョ,byo
パ,pa
ピ,pi
プ,pu
ペ,pe
ポ,po
ピャ,pya
ピュ,pyu
ピョ,pyo
マ,ma
ミ,mi
ム,mu
メ,me
モ,mo
ミャ,mya
ミュ,myu
ミョ,myo
ヤ,ya
ユ,yu
ヨ,yo
ラ,ra
リ,ri
ル,ru
レ,re
ロ,ro
リャ,rya
リュ,ryu
リョ,ryo
ワ,wa
ウィ,wi
ヰ,wi
ウェ,we
ヱ,we
ヲ,wo,o
ン,n,nn

クァ,kwa,kuxa
クィ,kwi,kuxi
クゥ,kwu,kuxu
クェ,kwe,kuxe
クォ,kwo,kuxo
グァ,gwa,guxa
グィ,gwi,guxi
グゥ,gwu,guxu
グェ,gwe,guxe
グォ,gwo,guxo
スァ,swa,suxa
スィ,swi,suxi
スゥ,swu,suxu
スェ,swe,suxe
スォ,swo,suxo
トァ,twa,toxa
トィ,twi,toxi
トゥ,twu,toxu
トェ,twe,toxe
トォ,two,toxo
ドァ,dwa,doxa
ドィ,dwi,
ドゥ,dwu,doxu
ドェ,dwe,doxe
ドォ,dwo,doxo
ファ,hwa,fa,huxa
フィ,hwi,fi,huxi
フェ,hwe,fe,huxe
フォ,hwo,fo,huxo
ヴァ,va,vuxa
ヴィ,vi,vuxi
ヴ,vu
ヴェ,ve,vuxe
ヴォ,vo,vuxo
テァ,tha,texa
ティ,thi,texi
テェ,the,texe
テャ,tha,texya
テュ,thu,texyu
テョ,tho,texyo
フャ,fya,huxya,fuxya
フュ,fyu,huxyu,fuxyu
フョ,fyo,huxyo,fuxyo
ヴャ,vya,vuxya
ヴュ,vyu,vuxyu
ヴョ,vyo,vuxyo

ッカ,kka
ッキ,kki
ック,kku
ッケ,kke
ッコ,kko
ッキャ,kkya
ッキュ,kkyu
ッキョ,kkyo
ッガ,gga
ッギ,ggi
ッグ,ggu
ッゲ,gge
ッゴ,ggo
ッギャ,ggya
ッギュ,ggyu
ッギョ,ggyo
ッサ,ssa
ッシ,ssi
ッス,ssu
ッセ,sse
ッソ,sso
ッシャ,ssya,ssha
ッシュ,ssyu,sshu
ッショ,ssyo,ssho
ッザ,zza
ッジ,zzi,jji
ッズ,zzu
ッゼ,zze
ッゾ,zzo
ッジャ,zzya,jja
ッジュ,zzyu,jju
ッジョ,zzyo,jjo
ッタ,tta
ッチ,tti
ッツ,ttu
ッテ,tte
ット,tto
ッチャ,ttya,ccha,ccya
ッチュ,ttyu,cchu,ccyu
ッチョ,ttyo,ccho,ccyo
ッダ,dda
ッヂ,ddi,
ッヅ,ddu
ッデ,dde
ッド,ddo
ッヂャ,ddya
ッヂュ,ddyu
ッヂョ,ddyo
ッハ,hha
ッヒ,hhi
ッフ,hhu,ffu
ッへ,hhe
ッホ,hho
ッヒャ,hhya
ッヒュ,hhyu
ッヒョ,hhyo
ッバ,bba
ッビ,bbi
ッブ,bbu
ッベ,bbe
ッボ,bbo
ッビャ,bbya
ッビュ,bbyu
ッビョ,bbyo
ッパ,ppa
ッピ,ppi
ップ,ppu
ッペ,ppe
ッポ,ppo
ッピャ,ppya
ッピュ,ppyu
ッピョ,ppyo
ッマ,mma
ッミ,mmi
ッム,mmu
ッメ,mme
ッモ,mmo
ッミャ,mmya
ッミュ,mmyu
ッミョ,mmyo
ッヤ,yya
ッイ,yyi
ッユ,yyu
ッイェ,yye
ッヨ,yyo
ッラ,rra
ッリ,rri
ッル,rru
ッレ,rre
ッロ,rro
ッリャ,rrya
ッリュ,rryu
ッリョ,rryo
ッワ,wwa
ッウィ,wwi
ッウ,wwu
ッウェ,wwe
ッヲ,wwo

ックァ,kkwa,kkuxa
ックィ,kkwi,kkuxi
ックゥ,kkwu,kkuxu
ックェ,kkwe,kkuxe
ックォ,kkwo,kkuxo
ッグァ,ggwa,gguxa
ッグィ,ggwi,gguxi
ッグゥ,ggwu,gguxu
ッグェ,ggwe,gguxe
ッグォ,ggwo,gguxo
ッスァ,sswa,ssuxa
ッスィ,sswi,ssuxi
ッスゥ,sswu,ssuxu
ッスェ,sswe,ssuxe
ッスォ,sswo,suxo
ットァ,ttwa,ttoxa
ットィ,ttwi,ttoxi
ットゥ,ttwu,ttoxu
ットェ,ttwe,ttoxe
ットォ,ttwo,ttoxo
ッドァ,ddwa,ddoxa
ッドィ,ddwi,ddoxi
ッドゥ,ddwu,ddoxi
ッドェ,ddwe,ddoxe
ッドォ,ddwo,ddoxo
ッファ,hhwa,ffa,hhuxa,ffuxa
ッフィ,hhwi,ffi,hhuxi,ffuxi
ッフェ,hhwe,ffe,hhuxe,ffuxe
ッフォ,hhwo,ffo,hhuxo,ffuxo
ッヴァ,vva,vvuxa
ッヴィ,vvi,vvuxi
ッヴ,vvu
ッヴェ,vve,vvuxe
ッヴォ,vvo,vvuxo
ッテァ,ttha,ttexa
ッティ,tthi,ttexi
ッテェ,tthe,ttexe
ッテャ,ttha,ttexya
ッテュ,tthu,ttexyu
ッテョ,ttho,ttexyo
ッフャ,ffya,hhuxya,ffuxya
ッフュ,ffyu,hhuxyu,ffuxyu
ッフョ,ffyo,hhuxyo,ffuxyo
ッヴャ,vvya,vvuxya
ッヴュ,vvyu,vvuxyu
ッヴョ,vvyo,vvuxyo

ァ,xa
ィ,xi
ゥ,xu
ェ,xe
ォ,xo
ヵ,xka
ヶ,xke
ッ,xtu
ャ,xya
ュ,xyu
ョ,xyo

ッk,kk
ッg,gg
ッs,ss
ッz,zz
ッt,tt
ッd,dd
ッh,hh
ッb,bb
ッp,pp
ッm,mm
ッy,yy
ッr,rr
ッw,ww

# below are characters that should be kept but have no explicit romanization rules.
# Chōonpu (Katakana-Hiragana Prolonged Sound Mark)
ー,ー
# Interpunct (Middle Dot)
・,・

@@ -0,0 +1,72 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ja;

import java.io.IOException;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.junit.Test;

public class TestJapaneseCompletionAnalyzer extends BaseTokenStreamTestCase {

  @Test
  public void testCompletionDefault() throws IOException {
    // mode=INDEX (default)
    Analyzer analyzer = new JapaneseCompletionAnalyzer();
    assertAnalyzesTo(
        analyzer,
        "東京",
        new String[] {"東京", "toukyou"},
        new int[] {0, 0},
        new int[] {2, 2},
        new int[] {1, 0});
    analyzer.close();
  }

  @Test
  public void testCompletionQuery() throws IOException {
    // mode=QUERY
    Analyzer analyzer = new JapaneseCompletionAnalyzer(null, JapaneseCompletionFilter.Mode.QUERY);
    assertAnalyzesTo(
        analyzer,
        "東京t",
        new String[] {"東京t", "toukyout"},
        new int[] {0, 0},
        new int[] {3, 3},
        new int[] {1, 0});
    analyzer.close();
  }

  /** blast random strings against the analyzer */
  @Test
  public void testRandom() throws IOException {
    Random random = random();
    final Analyzer a = new JapaneseCompletionAnalyzer();
    checkRandomData(random, a, atLeast(100));
    a.close();
  }

  /** blast some random large strings through the analyzer */
  @Test
  public void testRandomHugeStrings() throws Exception {
    Random random = random();
    final Analyzer a = new JapaneseCompletionAnalyzer();
    checkRandomData(random, a, 2 * RANDOM_MULTIPLIER, 8192);
    a.close();
  }
}

@@ -0,0 +1,271 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ja;

import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKWidthCharFilter;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.util.IOUtils;
import org.junit.Test;

public class TestJapaneseCompletionFilter extends BaseTokenStreamTestCase {
  private Analyzer indexAnalyzer;
  private Analyzer queryAnalyzer;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    indexAnalyzer =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.NORMAL);
            return new TokenStreamComponents(
                tokenizer,
                new JapaneseCompletionFilter(tokenizer, JapaneseCompletionFilter.Mode.INDEX));
          }

          @Override
          protected Reader initReader(String fieldName, Reader reader) {
            return new CJKWidthCharFilter(reader);
          }

          @Override
          protected Reader initReaderForNormalization(String fieldName, Reader reader) {
            return new CJKWidthCharFilter(reader);
          }
        };
    queryAnalyzer =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.NORMAL);
            return new TokenStreamComponents(
                tokenizer,
                new JapaneseCompletionFilter(tokenizer, JapaneseCompletionFilter.Mode.QUERY));
          }

          @Override
          protected Reader initReader(String fieldName, Reader reader) {
            return new CJKWidthCharFilter(reader);
          }

          @Override
          protected Reader initReaderForNormalization(String fieldName, Reader reader) {
            return new CJKWidthCharFilter(reader);
          }
        };
  }

  @Override
  public void tearDown() throws Exception {
    IOUtils.close(indexAnalyzer);
    IOUtils.close(queryAnalyzer);
    super.tearDown();
  }

  @Test
  public void testCompletionIndex() throws IOException {
    assertAnalyzesTo(
        indexAnalyzer,
        "東京",
        new String[] {"東京", "toukyou"},
        new int[] {0, 0},
        new int[] {2, 2},
        new int[] {1, 0});

    assertAnalyzesTo(
        indexAnalyzer,
        "東京都",
        new String[] {"東京", "toukyou", "都", "to"},
        new int[] {0, 0, 2, 2},
        new int[] {2, 2, 3, 3},
        new int[] {1, 0, 1, 0});

    assertAnalyzesTo(
        indexAnalyzer,
        "ドラえもん",
        new String[] {"ドラえもん", "doraemon", "doraemonn"},
        new int[] {0, 0, 0},
        new int[] {5, 5, 5},
        new int[] {1, 0, 0});

    assertAnalyzesTo(
        indexAnalyzer,
        "ソースコード",
        new String[] {"ソース", "soーsu", "コード", "koーdo"},
        new int[] {0, 0, 3, 3},
        new int[] {3, 3, 6, 6},
        new int[] {1, 0, 1, 0});

    assertAnalyzesTo(
        indexAnalyzer,
        "反社会的勢力",
        new String[] {"反", "han", "hann", "社会", "syakai", "shakai", "的", "teki", "勢力", "seiryoku"},
        new int[] {0, 0, 0, 1, 1, 1, 3, 3, 4, 4},
        new int[] {1, 1, 1, 3, 3, 3, 4, 4, 6, 6},
        new int[] {1, 0, 0, 1, 0, 0, 1, 0, 1, 0});

    assertAnalyzesTo(
        indexAnalyzer, "々", new String[] {"々"}, new int[] {0}, new int[] {1}, new int[] {1});

    assertAnalyzesTo(
        indexAnalyzer,
        "是々",
        new String[] {"是", "ze", "々"},
        new int[] {0, 0, 1},
        new int[] {1, 1, 2},
        new int[] {1, 0, 1});

    assertAnalyzesTo(
        indexAnalyzer,
        "是々の",
        new String[] {"是", "ze", "々", "の", "no"},
        new int[] {0, 0, 1, 2, 2},
        new int[] {1, 1, 2, 3, 3},
        new int[] {1, 0, 1, 1, 0});
  }

  @Test
  public void testCompletionQuery() throws IOException {
    assertAnalyzesTo(
        queryAnalyzer,
        "東京",
        new String[] {"東京", "toukyou"},
        new int[] {0, 0},
        new int[] {2, 2},
        new int[] {1, 0});

    assertAnalyzesTo(
        queryAnalyzer,
        "東京都",
        new String[] {"東京", "toukyou", "都", "to"},
        new int[] {0, 0, 2, 2},
        new int[] {2, 2, 3, 3},
        new int[] {1, 0, 1, 0});

    assertAnalyzesTo(
        queryAnalyzer,
        "ドラえもん",
        new String[] {"ドラえもん", "doraemon", "doraemonn"},
        new int[] {0, 0, 0},
        new int[] {5, 5, 5},
        new int[] {1, 0, 0});

    assertAnalyzesTo(
        queryAnalyzer,
        "ソースコード",
        new String[] {"ソースコード", "soーsukoーdo"},
        new int[] {0, 0},
        new int[] {6, 6},
        new int[] {1, 0});

    assertAnalyzesTo(
        queryAnalyzer,
        "反社会的勢力",
        new String[] {"反", "han", "hann", "社会", "syakai", "shakai", "的", "teki", "勢力", "seiryoku"},
        new int[] {0, 0, 0, 1, 1, 1, 3, 3, 4, 4},
        new int[] {1, 1, 1, 3, 3, 3, 4, 4, 6, 6},
        new int[] {1, 0, 0, 1, 0, 0, 1, 0, 1, 0});

    assertAnalyzesTo(
        queryAnalyzer, "々", new String[] {"々"}, new int[] {0}, new int[] {1}, new int[] {1});

    assertAnalyzesTo(
        queryAnalyzer,
        "是々",
        new String[] {"是", "ze", "々"},
        new int[] {0, 0, 1},
        new int[] {1, 1, 2},
        new int[] {1, 0, 1});

    assertAnalyzesTo(
        indexAnalyzer,
        "是々の",
        new String[] {"是", "ze", "々", "の", "no"},
        new int[] {0, 0, 1, 2, 2},
        new int[] {1, 1, 2, 3, 3},
        new int[] {1, 0, 1, 1, 0});

    assertAnalyzesTo(
        queryAnalyzer,
        "東京t",
        new String[] {"東京t", "toukyout"},
        new int[] {0, 0},
        new int[] {3, 3},
        new int[] {1, 0});

    assertAnalyzesTo(
        queryAnalyzer,
        "サッk",
        new String[] {"サッk", "sakk"},
        new int[] {0, 0},
        new int[] {3, 3},
        new int[] {1, 0});

    assertAnalyzesTo(
        queryAnalyzer,
        "反sy",
        new String[] {"反sy", "hansy", "hannsy"},
        new int[] {0, 0, 0},
        new int[] {3, 3, 3},
        new int[] {1, 0, 0});

    assertAnalyzesTo(
        queryAnalyzer,
        "さーきゅr",
        new String[] {"さーきゅr", "saーkyur"},
        new int[] {0, 0},
        new int[] {5, 5},
        new int[] {1, 0});

    assertAnalyzesTo(
        queryAnalyzer,
        "是々h",
        new String[] {"是", "ze", "々h"},
        new int[] {0, 0, 1},
        new int[] {1, 1, 3},
        new int[] {1, 0, 1});
  }

  public void testEnglish() throws IOException {
    assertAnalyzesTo(indexAnalyzer, "this atest", new String[] {"this", "atest"});
    assertAnalyzesTo(queryAnalyzer, "this atest", new String[] {"this", "atest"});
  }

  public void testRandomStrings() throws IOException {
    checkRandomData(random(), indexAnalyzer, atLeast(200));
    checkRandomData(random(), queryAnalyzer, atLeast(200));
  }

  public void testEmptyTerm() throws IOException {
    Analyzer a =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new KeywordTokenizer();
            return new TokenStreamComponents(tokenizer, new JapaneseCompletionFilter(tokenizer));
          }
        };
    checkOneTerm(a, "", "");
    a.close();
  }
}

@@ -0,0 +1,59 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ja;

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKWidthFilterFactory;
import org.junit.Test;

public class TestJapaneseCompletionFilterFactory extends BaseTokenStreamFactoryTestCase {
  @Test
  public void testCompletion() throws IOException {
    JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap<>());
    TokenStream tokenStream = tokenizerFactory.create();
    ((Tokenizer) tokenStream).setReader(new StringReader("東京t"));
    CJKWidthFilterFactory cjkWidthFactory = new CJKWidthFilterFactory(new HashMap<>());
    tokenStream = cjkWidthFactory.create(tokenStream);
    Map<String, String> map = new HashMap<>();
    map.put("mode", "QUERY");
    JapaneseCompletionFilterFactory filterFactory = new JapaneseCompletionFilterFactory(map);
    assertTokenStreamContents(filterFactory.create(tokenStream), new String[] {"東京t", "toukyout"});
  }

  /** Test that bogus arguments result in exception */
  @Test
  public void testBogusArguments() throws Exception {
    IllegalArgumentException expected =
        expectThrows(
            IllegalArgumentException.class,
            () -> {
              new JapaneseCompletionFilterFactory(
                  new HashMap<String, String>() {
                    {
                      put("bogusArg", "bogusValue");
                    }
                  });
            });
    assertTrue(expected.getMessage().contains("Unknown parameters"));
  }
}

@@ -0,0 +1,68 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ja.completion;

import java.util.List;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;

public class TestKatakanaRomanizer extends LuceneTestCase {
  private final KatakanaRomanizer romanizer = KatakanaRomanizer.getInstance();

  @Test
  public void testRomanize() {
    assertCharsRefListEqualsUnordered(
        List.of(new CharsRef("hasi"), new CharsRef("hashi")),
        romanizer.romanize(new CharsRef("ハシ")));
    assertCharsRefListEqualsUnordered(
        List.of(new CharsRef("yuukyuu")), romanizer.romanize(new CharsRef("ユウキュウ")));
    assertCharsRefListEqualsUnordered(
        List.of(new CharsRef("yakyuu")), romanizer.romanize(new CharsRef("ヤキュウ")));
    assertCharsRefListEqualsUnordered(
        List.of(new CharsRef("toukyou")), romanizer.romanize(new CharsRef("トウキョウ")));
    assertCharsRefListEqualsUnordered(
        List.of(new CharsRef("toーkyoー")), romanizer.romanize(new CharsRef("トーキョー")));
    assertCharsRefListEqualsUnordered(
        List.of(new CharsRef("sakka")), romanizer.romanize(new CharsRef("サッカ")));
    assertCharsRefListEqualsUnordered(
        List.of(new CharsRef("hyakkaten"), new CharsRef("hyakkatenn")),
        romanizer.romanize(new CharsRef("ヒャッカテン")));
    assertCharsRefListEqualsUnordered(
        List.of(new CharsRef("voruteーru"), new CharsRef("vuxoruteーru")),
        romanizer.romanize(new CharsRef("ヴォルテール")));
  }

  @Test
  public void testRomanizeWithAlphabets() {
    assertCharsRefListEqualsUnordered(
        List.of(new CharsRef("toukyout")), romanizer.romanize(new CharsRef("トウキョウt")));
    assertCharsRefListEqualsUnordered(
        List.of(new CharsRef("kodakk")), romanizer.romanize(new CharsRef("コダッk")));
    assertCharsRefListEqualsUnordered(
        List.of(new CharsRef("syousy"), new CharsRef("shousy")),
        romanizer.romanize(new CharsRef("ショウsy")));
  }

  private static void assertCharsRefListEqualsUnordered(
      List<CharsRef> expected, List<CharsRef> actual) {
    assertEquals(expected.size(), actual.size());
    for (CharsRef ref : expected) {
      assertTrue(ref.toString() + " is not contained in " + actual, actual.contains(ref));
    }
  }
}