mirror of https://github.com/apache/lucene.git
LUCENE-10102: Add JapaneseCompletionFilter for Input Method-aware auto-completion (#297)
Co-authored-by: Robert Muir <rmuir@apache.org>
parent de45b68c90, commit 4e86df96c0

@@ -21,6 +21,8 @@ New Features

* LUCENE-10096: Add TamilAnalyzer based on the snowball stemmer. (Robert Muir)

* LUCENE-10102: Add JapaneseCompletionFilter for Input Method-aware auto-completion (Tomoko Uchida, Robert Muir, Jun Ohtani)

System Requirements

* LUCENE-8738: Move to Java 11 as minimum Java version.
@@ -0,0 +1,65 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ja;
|
||||
|
||||
import java.io.Reader;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.cjk.CJKWidthCharFilter;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.ja.dict.UserDictionary;
|
||||
|
||||
/**
|
||||
* Analyzer for Japanese completion suggester.
|
||||
*
|
||||
* @see JapaneseCompletionFilter
|
||||
*/
|
||||
public class JapaneseCompletionAnalyzer extends Analyzer {
|
||||
private final JapaneseCompletionFilter.Mode mode;
|
||||
private final UserDictionary userDict;
|
||||
|
||||
/** Creates a new {@code JapaneseCompletionAnalyzer} with default configurations */
|
||||
public JapaneseCompletionAnalyzer() {
|
||||
this(null, JapaneseCompletionFilter.Mode.INDEX);
|
||||
}
|
||||
|
||||
/** Creates a new {@code JapaneseCompletionAnalyzer} */
|
||||
public JapaneseCompletionAnalyzer(UserDictionary userDict, JapaneseCompletionFilter.Mode mode) {
|
||||
this.userDict = userDict;
|
||||
this.mode = mode;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer =
|
||||
new JapaneseTokenizer(userDict, true, true, JapaneseTokenizer.Mode.NORMAL);
|
||||
TokenStream stream = new JapaneseCompletionFilter(tokenizer, mode);
|
||||
stream = new LowerCaseFilter(stream);
|
||||
return new TokenStreamComponents(tokenizer, stream);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReader(String fieldName, Reader reader) {
|
||||
return new CJKWidthCharFilter(reader);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReaderForNormalization(String fieldName, Reader reader) {
|
||||
return new CJKWidthCharFilter(reader);
|
||||
}
|
||||
}
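A minimal usage sketch (not part of this commit) showing how the analyzer's output could be consumed; the field name "suggest" and the sample text are illustrative assumptions, and the expected tokens follow the tests added in this change.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseCompletionAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CompletionAnalyzerExample {
  public static void main(String[] args) throws IOException {
    try (Analyzer analyzer = new JapaneseCompletionAnalyzer();
        TokenStream ts = analyzer.tokenStream("suggest", "東京")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // prints "東京" and then "toukyou" (surface form plus romanized completion token)
        System.out.println(term.toString());
      }
      ts.end();
    }
  }
}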
|
|
@@ -0,0 +1,267 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ja;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.ja.completion.CharSequenceUtils;
|
||||
import org.apache.lucene.analysis.ja.completion.KatakanaRomanizer;
|
||||
import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.CharsRefBuilder;
|
||||
|
||||
/**
 * A {@link org.apache.lucene.analysis.TokenFilter} that adds Japanese romanized tokens to the term
 * attribute. Also, this keeps original tokens (surface forms). The main use of this filter is query
 * auto-completion.
 *
 * <p>Supported romanization forms: (modified) Hepburn-shiki, Kunrei-shiki (Nihon-shiki) and Wāpuro
 * shiki.
 *
 * <p>This does NOT support some romaji forms which are official but not commonly used with
 * Japanese <a href="https://en.wikipedia.org/wiki/Input_method">Input Methods</a>. e.g.: circumflex
 * or macron representing <a href="https://en.wikipedia.org/wiki/Ch%C5%8Donpu">Chōonpu (長音符)</a> are
 * not supported.
 *
 * <p>The romanization behaviour changes according to its {@link Mode}. The default mode is {@link
 * Mode#INDEX}.
 *
 * <p>Note: This filter must be applied AFTER half-width and full-width normalization. Please ensure
 * that a width normalizer such as {@link org.apache.lucene.analysis.cjk.CJKWidthCharFilter} or
 * {@link org.apache.lucene.analysis.cjk.CJKWidthFilter} is included in your analysis chain. IF THE
 * WIDTH NORMALIZATION IS NOT PERFORMED, THIS DOES NOT WORK AS EXPECTED. See also: {@link
 * JapaneseCompletionAnalyzer}.
 */
|
||||
public final class JapaneseCompletionFilter extends TokenFilter {
|
||||
public static final Mode DEFAULT_MODE = Mode.INDEX;
|
||||
|
||||
private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
|
||||
private final ReadingAttribute readingAttr = addAttribute(ReadingAttribute.class);
|
||||
private final PositionIncrementAttribute posIncAtt =
|
||||
addAttribute(PositionIncrementAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
||||
private final CompletionTokenGenerator tokenGenerator;
|
||||
|
||||
/** Completion mode */
|
||||
public enum Mode {
|
||||
/** Simple romanization. Expected to be used when indexing. */
|
||||
INDEX,
|
||||
/** Input Method aware romanization. Expected to be used when querying. */
|
||||
QUERY
|
||||
}
|
||||
|
||||
/** Creates a new {@code JapaneseCompletionFilter} with default configurations */
|
||||
public JapaneseCompletionFilter(TokenStream input) {
|
||||
this(input, DEFAULT_MODE);
|
||||
}
|
||||
|
||||
/** Creates a new {@code JapaneseCompletionFilter} */
|
||||
public JapaneseCompletionFilter(TokenStream input, Mode mode) {
|
||||
super(input);
|
||||
this.tokenGenerator = new CompletionTokenGenerator(mode);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
tokenGenerator.reset();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
mayIncrementToken();
|
||||
if (tokenGenerator.hasNext()) {
|
||||
clearAttributes();
|
||||
CompletionToken token = tokenGenerator.next();
|
||||
termAttr.setEmpty().append(token.term);
|
||||
if (token.isFirst) {
|
||||
posIncAtt.setPositionIncrement(1);
|
||||
} else {
|
||||
posIncAtt.setPositionIncrement(0);
|
||||
}
|
||||
offsetAtt.setOffset(token.startOffset, token.endOffset);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
private void mayIncrementToken() throws IOException {
|
||||
while (!tokenGenerator.hasNext()) {
|
||||
if (input.incrementToken()) {
|
||||
String surface = termAttr.toString();
|
||||
String reading = readingAttr.getReading();
|
||||
int startOffset = offsetAtt.startOffset();
|
||||
int endOffset = offsetAtt.endOffset();
|
||||
if (reading == null && CharSequenceUtils.isKana(surface)) {
|
||||
// use the surface form as reading when possible.
|
||||
reading = CharSequenceUtils.toKatakana(surface);
|
||||
}
|
||||
tokenGenerator.addToken(surface, reading, startOffset, endOffset);
|
||||
} else {
|
||||
if (tokenGenerator.hasPendingToken()) {
|
||||
// a pending token remains.
|
||||
tokenGenerator.finish();
|
||||
} else {
|
||||
// already consumed all tokens. there's no next token to output.
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static class CompletionToken {
|
||||
final String term;
|
||||
final boolean isFirst;
|
||||
final int startOffset;
|
||||
final int endOffset;
|
||||
|
||||
CompletionToken(String term, boolean isFirst, int startOffset, int endOffset) {
|
||||
this.term = term;
|
||||
this.isFirst = isFirst;
|
||||
this.startOffset = startOffset;
|
||||
this.endOffset = endOffset;
|
||||
}
|
||||
}
|
||||
|
||||
private static class CompletionTokenGenerator implements Iterator<CompletionToken> {
|
||||
|
||||
private final Mode mode;
|
||||
|
||||
private List<CompletionToken> outputs;
|
||||
|
||||
private CharsRefBuilder pdgSurface;
|
||||
private CharsRefBuilder pdgReading;
|
||||
private int pdgStartOffset;
|
||||
private int pdgEndOffset;
|
||||
|
||||
CompletionTokenGenerator(Mode mode) {
|
||||
this.mode = mode;
|
||||
outputs = new ArrayList<>();
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
clearPendingToken();
|
||||
outputs.clear();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return outputs.size() > 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public CompletionToken next() {
|
||||
return outputs.remove(0);
|
||||
}
|
||||
|
||||
void addToken(String surface, String reading, int startOffset, int endOffset) {
|
||||
assert surface != null : "surface must not be null.";
|
||||
|
||||
if (hasPendingToken()) {
|
||||
if (mode == Mode.QUERY
|
||||
&& pdgReading != null
|
||||
&& !CharSequenceUtils.isLowercaseAlphabets(pdgSurface.get())
|
||||
&& CharSequenceUtils.isLowercaseAlphabets(surface)) {
|
||||
// words that are in mid-IME composition are split into two tokens by JapaneseTokenizer;
|
||||
// should be recovered when querying.
|
||||
// Note: in this case, the reading attribute is null; use the surface form in place of the
|
||||
// reading.
|
||||
// e.g.: "サッ" + "k" => "サッk", "反" + "sy" => "反sy"
|
||||
pdgSurface.append(surface);
|
||||
pdgReading.append(surface);
|
||||
pdgEndOffset = endOffset;
|
||||
generateOutputs();
|
||||
clearPendingToken();
|
||||
} else if (mode == Mode.QUERY
|
||||
&& CharSequenceUtils.isKana(pdgSurface.get())
|
||||
&& CharSequenceUtils.isKana(surface)) {
|
||||
// words that are composed only of Katakana or Hiragana should be concatenated when
|
||||
// querying.
|
||||
// e.g.: "こい" + "ぬ" => "こいぬ"
|
||||
pdgSurface.append(surface);
|
||||
pdgReading.append(reading);
|
||||
pdgEndOffset = endOffset;
|
||||
} else {
|
||||
generateOutputs();
|
||||
resetPendingToken(surface, reading, startOffset, endOffset);
|
||||
}
|
||||
} else {
|
||||
resetPendingToken(surface, reading, startOffset, endOffset);
|
||||
}
|
||||
}
|
||||
|
||||
void finish() {
|
||||
generateOutputs();
|
||||
clearPendingToken();
|
||||
}
|
||||
|
||||
private void generateOutputs() {
|
||||
// preserve original surface form as an output.
|
||||
outputs.add(new CompletionToken(pdgSurface.toString(), true, pdgStartOffset, pdgEndOffset));
|
||||
// skip readings that cannot be translated to romaji.
|
||||
if (pdgReading == null
|
||||
|| pdgReading.length() == 0
|
||||
|| !CharSequenceUtils.isKatakanaOrHWAlphabets(pdgReading.get())) {
|
||||
return;
|
||||
}
|
||||
// translate the reading to romaji.
|
||||
List<CharsRef> romaji = KatakanaRomanizer.getInstance().romanize(pdgReading.get());
|
||||
for (CharsRef ref : romaji) {
|
||||
// set the same start/end offset as the original surface form for romanized tokens.
|
||||
outputs.add(new CompletionToken(ref.toString(), false, pdgStartOffset, pdgEndOffset));
|
||||
}
|
||||
}
|
||||
|
||||
boolean hasPendingToken() {
|
||||
return pdgSurface != null;
|
||||
}
|
||||
|
||||
void resetPendingToken(
|
||||
CharSequence surface, CharSequence reading, int startOffset, int endOffset) {
|
||||
if (this.pdgSurface == null) {
|
||||
this.pdgSurface = new CharsRefBuilder();
|
||||
} else {
|
||||
this.pdgSurface.clear();
|
||||
}
|
||||
this.pdgSurface.append(surface);
|
||||
if (this.pdgReading == null) {
|
||||
this.pdgReading = new CharsRefBuilder();
|
||||
} else {
|
||||
this.pdgReading.clear();
|
||||
}
|
||||
this.pdgReading.append(reading);
|
||||
this.pdgStartOffset = startOffset;
|
||||
this.pdgEndOffset = endOffset;
|
||||
}
|
||||
|
||||
void clearPendingToken() {
|
||||
this.pdgSurface = null;
|
||||
this.pdgReading = null;
|
||||
this.pdgStartOffset = 0;
|
||||
this.pdgEndOffset = 0;
|
||||
}
|
||||
}
|
||||
}
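As a complement to the javadoc note on width normalization, here is a hedged sketch (not part of this commit) of a query-side analysis chain wired by hand; it mirrors JapaneseCompletionAnalyzer and the test fixtures in this change, with the width normalization applied in initReader before the completion filter sees the text.

import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKWidthCharFilter;
import org.apache.lucene.analysis.ja.JapaneseCompletionFilter;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;

public class QueryCompletionAnalyzerSketch extends Analyzer {
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    // NORMAL segmentation, as used by JapaneseCompletionAnalyzer and the tests in this commit
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.NORMAL);
    TokenStream stream =
        new JapaneseCompletionFilter(tokenizer, JapaneseCompletionFilter.Mode.QUERY);
    return new TokenStreamComponents(tokenizer, stream);
  }

  @Override
  protected Reader initReader(String fieldName, Reader reader) {
    // half-width/full-width normalization must run before romanization
    return new CJKWidthCharFilter(reader);
  }
}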
|
|
@@ -0,0 +1,66 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ja;
|
||||
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import org.apache.lucene.analysis.TokenFilterFactory;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
* Factory for {@link JapaneseCompletionFilter}.
|
||||
*
|
||||
* <p>Supported attributes:
|
||||
*
|
||||
* <ul>
|
||||
* <li>mode: Completion mode. See {@link JapaneseCompletionFilter.Mode}.
|
||||
* </ul>
|
||||
*
|
||||
* @lucene.spi {@value #NAME}
|
||||
*/
|
||||
public class JapaneseCompletionFilterFactory extends TokenFilterFactory {
|
||||
|
||||
/** SPI name */
|
||||
public static final String NAME = "japaneseCompletion";
|
||||
|
||||
private static final String MODE_PARAM = "mode";
|
||||
private final JapaneseCompletionFilter.Mode mode;
|
||||
|
||||
/** Creates a new {@code JapaneseCompletionFilterFactory} */
|
||||
public JapaneseCompletionFilterFactory(Map<String, String> args) {
|
||||
super(args);
|
||||
mode =
|
||||
JapaneseCompletionFilter.Mode.valueOf(
|
||||
get(
|
||||
args,
|
||||
MODE_PARAM,
|
||||
JapaneseCompletionFilter.DEFAULT_MODE.name().toUpperCase(Locale.ROOT)));
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
}
|
||||
|
||||
/** Default ctor for compatibility with SPI */
|
||||
public JapaneseCompletionFilterFactory() {
|
||||
throw defaultCtorException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new JapaneseCompletionFilter(input, mode);
|
||||
}
|
||||
}
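A hedged sketch (not part of this commit) of loading the filter by its SPI name through CustomAnalyzer. "japaneseCompletion" is the NAME defined above; the "japanese" tokenizer and "cjkWidth" filter names are assumptions based on the existing kuromoji and CJK factories.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;

public class CompletionFactoryExample {
  public static Analyzer buildQueryAnalyzer() throws IOException {
    return CustomAnalyzer.builder()
        .withTokenizer("japanese") // assumed SPI name of JapaneseTokenizerFactory
        .addTokenFilter("cjkWidth") // assumed SPI name; width normalization before romanization
        .addTokenFilter("japaneseCompletion", "mode", "QUERY")
        .build();
  }
}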
|
|
@@ -0,0 +1,91 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ja.completion;
|
||||
|
||||
/** Utility functions for {@link org.apache.lucene.analysis.ja.JapaneseCompletionFilter} */
|
||||
public class CharSequenceUtils {
|
||||
|
||||
/** Checks if a char sequence is composed only of lowercase alphabets */
|
||||
public static boolean isLowercaseAlphabets(CharSequence s) {
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
char ch = s.charAt(i);
|
||||
if (!(isHalfWidthLowercaseAlphabet(ch) || isFullWidthLowercaseAlphabet(ch))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/** Checks if a char sequence is composed only of Katakana or hiragana */
|
||||
public static boolean isKana(CharSequence s) {
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
char ch = s.charAt(i);
|
||||
if (!(isHiragana(ch) || isKatakana(ch))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/** Checks if a char sequence is composed only of Katakana or half-width lowercase alphabets */
|
||||
public static boolean isKatakanaOrHWAlphabets(CharSequence ref) {
|
||||
for (int i = 0; i < ref.length(); i++) {
|
||||
char ch = ref.charAt(i);
|
||||
if (!isKatakana(ch) && !isHalfWidthLowercaseAlphabet(ch)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/** Checks if a char is a Hiragana */
|
||||
private static boolean isHiragana(char ch) {
|
||||
return ch >= 0x3040 && ch <= 0x309f;
|
||||
}
|
||||
|
||||
/** Checks if a char is a Katakana */
|
||||
private static boolean isKatakana(char ch) {
|
||||
return ch >= 0x30a0 && ch <= 0x30ff;
|
||||
}
|
||||
|
||||
/** Checks if a char is a half-width lowercase alphabet */
|
||||
private static boolean isHalfWidthLowercaseAlphabet(char ch) {
|
||||
return ch >= 0x61 && ch <= 0x7a;
|
||||
}
|
||||
|
||||
/** Checks if a char is a full-width lowercase alphabet */
|
||||
public static boolean isFullWidthLowercaseAlphabet(char ch) {
|
||||
return ch >= 0xff41 && ch <= 0xff5a;
|
||||
}
|
||||
|
||||
/** Converts all Hiragana in a string into Katakana */
|
||||
public static String toKatakana(CharSequence s) {
|
||||
char[] chars = new char[s.length()];
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
char ch = s.charAt(i);
|
||||
// if the character is from 'ぁ' to 'ゖ', or is 'ゝ' or 'ゞ', it can be converted to katakana.
|
||||
if (ch >= 0x3041 && ch <= 0x3096 || ch == 0x309d || ch == 0x309e) {
|
||||
chars[i] = (char) (ch + 0x60);
|
||||
} else {
|
||||
chars[i] = ch;
|
||||
}
|
||||
}
|
||||
return new String(chars);
|
||||
}
|
||||
|
||||
private CharSequenceUtils() {}
|
||||
}
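A small illustrative sketch (not part of this commit) of the expected behaviour of these utilities; the sample strings are arbitrary.

import org.apache.lucene.analysis.ja.completion.CharSequenceUtils;

public class CharSequenceUtilsExample {
  public static void main(String[] args) {
    System.out.println(CharSequenceUtils.isKana("すし")); // true: all Hiragana
    System.out.println(CharSequenceUtils.isKana("寿司")); // false: contains Kanji
    System.out.println(CharSequenceUtils.toKatakana("すし")); // スシ: each Hiragana code point shifted by 0x60
  }
}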
|
|
@@ -0,0 +1,193 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ja.completion;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.CharsRefBuilder;
|
||||
|
||||
/**
 * Converts a Katakana string to <a
 * href="https://en.wikipedia.org/wiki/Romanization_of_Japanese">Romaji</a> using the pre-defined
 * Katakana-Romaji mapping rules. Internally, this repeatedly performs a prefix match of the given
 * char sequence against the pre-built keystroke arrays until it reaches the end of the sequence,
 * or there are no matched keystrokes.
 */
|
||||
public class KatakanaRomanizer {
|
||||
private static final String ROMAJI_MAP_FILE = "romaji_map.txt";
|
||||
|
||||
private static KatakanaRomanizer INSTANCE;
|
||||
|
||||
static {
|
||||
// Build romaji-map and keystroke arrays from the pre-defined Katakana-Romaji mapping file.
|
||||
try (InputStreamReader is =
|
||||
new InputStreamReader(
|
||||
KatakanaRomanizer.class.getResourceAsStream(ROMAJI_MAP_FILE),
|
||||
Charset.forName("UTF-8"));
|
||||
BufferedReader ir = new BufferedReader(is)) {
|
||||
Map<CharsRef, List<CharsRef>> romajiMap = new HashMap<>();
|
||||
String line;
|
||||
while ((line = ir.readLine()) != null) {
|
||||
if (line.startsWith("#")) {
|
||||
continue;
|
||||
}
|
||||
String[] cols = line.trim().split(",");
|
||||
if (cols.length < 2) {
|
||||
continue;
|
||||
}
|
||||
CharsRef prefix = new CharsRef(cols[0]);
|
||||
romajiMap.put(prefix, new ArrayList<>());
|
||||
for (int i = 1; i < cols.length; i++) {
|
||||
romajiMap.get(prefix).add(new CharsRef(cols[i]));
|
||||
}
|
||||
}
|
||||
|
||||
Set<CharsRef> keystrokeSet = romajiMap.keySet();
|
||||
int maxKeystrokeLength = keystrokeSet.stream().mapToInt(CharsRef::length).max().getAsInt();
|
||||
CharsRef[][] keystrokes = new CharsRef[maxKeystrokeLength][];
|
||||
for (int len = 0; len < maxKeystrokeLength; len++) {
|
||||
final int l = len;
|
||||
keystrokes[l] =
|
||||
keystrokeSet.stream().filter(k -> k.length - 1 == l).toArray(CharsRef[]::new);
|
||||
}
|
||||
for (CharsRef[] ks : keystrokes) {
|
||||
// keystroke array must be sorted in ascending order for binary search.
|
||||
Arrays.sort(ks);
|
||||
}
|
||||
|
||||
INSTANCE = new KatakanaRomanizer(keystrokes, romajiMap);
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private final CharsRef[][] keystrokes;
|
||||
private final Map<CharsRef, List<CharsRef>> romajiMap;
|
||||
|
||||
/** Returns the singleton instance of {@code KatakanaRomanizer} */
|
||||
public static KatakanaRomanizer getInstance() {
|
||||
return INSTANCE;
|
||||
}
|
||||
|
||||
private KatakanaRomanizer(CharsRef[][] keystrokes, Map<CharsRef, List<CharsRef>> romajiMap) {
|
||||
this.keystrokes = keystrokes;
|
||||
this.romajiMap = romajiMap;
|
||||
}
|
||||
|
||||
/**
|
||||
* Translates a sequence of katakana to romaji. An input can produce multiple outputs because a
|
||||
* keystroke can be mapped to multiple romaji forms.
|
||||
*/
|
||||
public List<CharsRef> romanize(CharsRef input) {
|
||||
assert CharSequenceUtils.isKatakanaOrHWAlphabets(input);
|
||||
|
||||
List<CharsRefBuilder> pendingOutputs = new ArrayList<>();
|
||||
int pos = 0;
|
||||
while (pos < input.length) {
|
||||
// Greedily looks up the longest matched keystroke.
|
||||
// e.g.: Consider input="キョウ", then there are two matched keystrokes (romaji mapping rules)
|
||||
// "キ" -> "ki" and "キョ" -> "kyo". Only the longest one "キョ" will be selected.
|
||||
MatchedKeystroke matched = longestKeystrokeMatch(input, pos);
|
||||
if (matched == null) {
|
||||
break;
|
||||
}
|
||||
|
||||
List<CharsRef> candidates =
|
||||
romajiMap.get(keystrokes[matched.keystrokeLen - 1][matched.keystrokeIndex]);
|
||||
|
||||
if (pendingOutputs.size() == 0) {
|
||||
// There is no pending output.
|
||||
// Add the matched keystrokes to pending outputs list.
|
||||
for (CharsRef cref : candidates) {
|
||||
CharsRefBuilder output = new CharsRefBuilder();
|
||||
output.copyChars(cref);
|
||||
pendingOutputs.add(output);
|
||||
}
|
||||
} else if (candidates.size() == 1) {
|
||||
// There are one or more pending output(s) and one matched keystroke.
|
||||
// Append the matched keystroke to all pending outputs.
|
||||
// e.g.: Consider we already have two pending outputs "shi" and "si" and the matched
|
||||
// keystroke "ka";
|
||||
// then results are "shika" and "sika".
|
||||
CharsRef cref = candidates.get(0);
|
||||
for (CharsRefBuilder pdgOutput : pendingOutputs) {
|
||||
pdgOutput.append(cref.chars, 0, cref.length);
|
||||
}
|
||||
} else {
|
||||
// There are one or more pending output(s) and multiple matched keystrokes.
|
||||
// Combine the matched keystrokes to all pending outputs.
|
||||
// e.g.: Consider we already have two pending outputs "shi" and "si" and the matched
|
||||
// keystroke "n" and "nn".
|
||||
// To produce all possible keystroke patterns, result outputs should be "shin", "shinn",
|
||||
// "sin" and "sinn".
|
||||
List<CharsRefBuilder> outputs = new ArrayList<>();
|
||||
for (CharsRef cref : candidates) {
|
||||
for (CharsRefBuilder pdgOutput : pendingOutputs) {
|
||||
CharsRefBuilder buffer = new CharsRefBuilder();
|
||||
buffer.copyChars(pdgOutput.chars(), 0, pdgOutput.length());
|
||||
buffer.append(cref.chars, cref.offset, cref.length);
|
||||
outputs.add(buffer);
|
||||
}
|
||||
}
|
||||
// update the pending outputs
|
||||
pendingOutputs = outputs;
|
||||
}
|
||||
|
||||
// proceed to the next input position
|
||||
pos += matched.keystrokeLen;
|
||||
}
|
||||
|
||||
if (pos < input.length) {
|
||||
// add the remnants (that cannot be mapped to any romaji) as suffix
|
||||
for (CharsRefBuilder output : pendingOutputs) {
|
||||
output.append(input.chars, pos, input.length - pos);
|
||||
}
|
||||
}
|
||||
return pendingOutputs.stream().map(CharsRefBuilder::get).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private MatchedKeystroke longestKeystrokeMatch(CharsRef input, int inputOffset) {
|
||||
for (int len = Math.min(input.length - inputOffset, keystrokes.length); len > 0; len--) {
|
||||
CharsRef ref = new CharsRef(input.chars, inputOffset, len);
|
||||
int index = Arrays.binarySearch(keystrokes[len - 1], ref);
|
||||
if (index >= 0) {
|
||||
return new MatchedKeystroke(len, index);
|
||||
}
|
||||
}
|
||||
// there's no matched keystroke
|
||||
return null;
|
||||
}
|
||||
|
||||
private static class MatchedKeystroke {
|
||||
final int keystrokeLen;
|
||||
final int keystrokeIndex;
|
||||
|
||||
MatchedKeystroke(int keystrokeLen, int keystrokeIndex) {
|
||||
this.keystrokeLen = keystrokeLen;
|
||||
this.keystrokeIndex = keystrokeIndex;
|
||||
}
|
||||
}
|
||||
}
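A hedged sketch (not part of this commit) of how the romanizer is used; the expected outputs follow TestKatakanaRomanizer in this change.

import java.util.List;
import org.apache.lucene.analysis.ja.completion.KatakanaRomanizer;
import org.apache.lucene.util.CharsRef;

public class KatakanaRomanizerExample {
  public static void main(String[] args) {
    KatakanaRomanizer romanizer = KatakanaRomanizer.getInstance();
    // シ maps to both "si" and "shi", so two keystroke candidates come back.
    List<CharsRef> out = romanizer.romanize(new CharsRef("ハシ"));
    System.out.println(out); // contains hasi and hashi
    // longest match: キョ -> kyo is preferred over キ -> ki.
    System.out.println(romanizer.romanize(new CharsRef("トウキョウ"))); // contains toukyou
  }
}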
|
|
@@ -0,0 +1,19 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/** Utilities for {@link org.apache.lucene.analysis.ja.JapaneseCompletionFilter} */
|
||||
package org.apache.lucene.analysis.ja.completion;
|
|
@@ -14,6 +14,7 @@
|
|||
# limitations under the License.
|
||||
|
||||
org.apache.lucene.analysis.ja.JapaneseBaseFormFilterFactory
|
||||
org.apache.lucene.analysis.ja.JapaneseCompletionFilterFactory
|
||||
org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilterFactory
|
||||
org.apache.lucene.analysis.ja.JapaneseNumberFilterFactory
|
||||
org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilterFactory
|
||||
|
|
|
@@ -0,0 +1,344 @@
|
|||
# mapping rules of katakana (a unit of keystroke) to a list of acceptable romanizations.
# longest-match is used to find entries in this list.
# covers romanization systems: modified Hepburn-shiki, Kunrei-shiki (Nihon-shiki), and Wāpuro shiki.
# note: this does not strictly comply with the romanization systems listed above,
# but tries to cover possible keystrokes supported by various Input Methods.
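# example (illustrative, not an additional rule): for the input キョウ the two-character entry
# キョ,kyo is matched before キ,ki, so the romanization is "kyou"; entries with several columns,
# such as シ,si,shi, yield multiple keystroke candidates for the same katakana unit.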
|
||||
|
||||
ア,a
|
||||
イ,i
|
||||
ウ,u
|
||||
エ,e
|
||||
オ,o
|
||||
カ,ka
|
||||
キ,ki
|
||||
ク,ku
|
||||
ケ,ke
|
||||
コ,ko
|
||||
キャ,kya
|
||||
キュ,kyu
|
||||
キョ,kyo
|
||||
ガ,ga
|
||||
ギ,gi
|
||||
グ,gu
|
||||
ゲ,ge
|
||||
ゴ,go
|
||||
ギャ,gya
|
||||
ギュ,gyu
|
||||
ギョ,gyo
|
||||
サ,sa
|
||||
シ,si,shi
|
||||
ス,su
|
||||
セ,se
|
||||
ソ,so
|
||||
シャ,sya,sha
|
||||
シュ,syu,shu
|
||||
シェ,sye,she
|
||||
ショ,syo,sho
|
||||
ザ,za
|
||||
ジ,zi,ji
|
||||
ズ,zu
|
||||
ゼ,ze
|
||||
ゾ,zo
|
||||
ジャ,zya,ja
|
||||
ジュ,zyu,ju
|
||||
ジェ,zye,je
|
||||
ジョ,zyo,jo
|
||||
タ,ta
|
||||
チ,ti,chi
|
||||
ツ,tu,tsu
|
||||
テ,te
|
||||
ト,to
|
||||
チャ,tya,cha,cya
|
||||
チュ,tyu,chu,cyu
|
||||
チョ,tyo,cho,cyo
|
||||
ダ,da
|
||||
ヂ,di,zi,ji
|
||||
ヅ,du,zu
|
||||
デ,de
|
||||
ド,do
|
||||
ヂャ,dya,zya,ja
|
||||
ヂュ,dyu,zyu,ju
|
||||
ヂョ,dyo,zyo,jo
|
||||
ナ,na
|
||||
ニ,ni
|
||||
ヌ,nu
|
||||
ネ,ne
|
||||
ノ,no
|
||||
ニャ,nya
|
||||
ニュ,nyu
|
||||
ニョ,nyo
|
||||
ハ,ha
|
||||
ヒ,hi
|
||||
フ,hu,fu
|
||||
ヘ,he
|
||||
ホ,ho
|
||||
ヒャ,hya
|
||||
ヒュ,hyu
|
||||
ヒョ,hyo
|
||||
バ,ba
|
||||
ビ,bi
|
||||
ブ,bu
|
||||
ベ,be
|
||||
ボ,bo
|
||||
ビャ,bya
|
||||
ビュ,byu
|
||||
ビョ,byo
|
||||
パ,pa
|
||||
ピ,pi
|
||||
プ,pu
|
||||
ペ,pe
|
||||
ポ,po
|
||||
ピャ,pya
|
||||
ピュ,pyu
|
||||
ピョ,pyo
|
||||
マ,ma
|
||||
ミ,mi
|
||||
ム,mu
|
||||
メ,me
|
||||
モ,mo
|
||||
ミャ,mya
|
||||
ミュ,myu
|
||||
ミョ,myo
|
||||
ヤ,ya
|
||||
ユ,yu
|
||||
ヨ,yo
|
||||
ラ,ra
|
||||
リ,ri
|
||||
ル,ru
|
||||
レ,re
|
||||
ロ,ro
|
||||
リャ,rya
|
||||
リュ,ryu
|
||||
リョ,ryo
|
||||
ワ,wa
|
||||
ウィ,wi
|
||||
ヰ,wi
|
||||
ウェ,we
|
||||
ヱ,we
|
||||
ヲ,wo,o
|
||||
ン,n,nn
|
||||
|
||||
クァ,kwa,kuxa
|
||||
クィ,kwi,kuxi
|
||||
クゥ,kwu,kuxu
|
||||
クェ,kwe,kuxe
|
||||
クォ,kwo,kuxo
|
||||
グァ,gwa,guxa
|
||||
グィ,gwi,guxi
|
||||
グゥ,gwu,guxu
|
||||
グェ,gwe,guxe
|
||||
グォ,gwo,guxo
|
||||
スァ,swa,suxa
|
||||
スィ,swi,suxi
|
||||
スゥ,swu,suxu
|
||||
スェ,swe,suxe
|
||||
スォ,swo,suxo
|
||||
トァ,twa,toxa
|
||||
トィ,twi,toxi
|
||||
トゥ,twu,toxu
|
||||
トェ,twe,toxe
|
||||
トォ,two,toxo
|
||||
ドァ,dwa,doxa
|
||||
ドィ,dwi,
|
||||
ドゥ,dwu,doxu
|
||||
ドェ,dwe,doxe
|
||||
ドォ,dwo,doxo
|
||||
ファ,hwa,fa,huxa
|
||||
フィ,hwi,fi,huxi
|
||||
フェ,hwe,fe,huxe
|
||||
フォ,hwo,fo,huxo
|
||||
ヴァ,va,vuxa
|
||||
ヴィ,vi,vuxi
|
||||
ヴ,vu
|
||||
ヴェ,ve,vuxe
|
||||
ヴォ,vo,vuxo
|
||||
テァ,tha,texa
|
||||
ティ,thi,texi
|
||||
テェ,the,texe
|
||||
テャ,tha,texya
|
||||
テュ,thu,texyu
|
||||
テョ,tho,texyo
|
||||
フャ,fya,huxya,fuxya
|
||||
フュ,fyu,huxyu,fuxyu
|
||||
フョ,fyo,huxyo,fuxyo
|
||||
ヴャ,vya,vuxya
|
||||
ヴュ,vyu,vuxyu
|
||||
ヴョ,vyo,vuxyo
|
||||
|
||||
ッカ,kka
|
||||
ッキ,kki
|
||||
ック,kku
|
||||
ッケ,kke
|
||||
ッコ,kko
|
||||
ッキャ,kkya
|
||||
ッキュ,kkyu
|
||||
ッキョ,kkyo
|
||||
ッガ,gga
|
||||
ッギ,ggi
|
||||
ッグ,ggu
|
||||
ッゲ,gge
|
||||
ッゴ,ggo
|
||||
ッギャ,ggya
|
||||
ッギュ,ggyu
|
||||
ッギョ,ggyo
|
||||
ッサ,ssa
|
||||
ッシ,ssi
|
||||
ッス,ssu
|
||||
ッセ,sse
|
||||
ッソ,sso
|
||||
ッシャ,ssya,ssha
|
||||
ッシュ,ssyu,sshu
|
||||
ッショ,ssyo,ssho
|
||||
ッザ,zza
|
||||
ッジ,zzi,jji
|
||||
ッズ,zzu
|
||||
ッゼ,zze
|
||||
ッゾ,zzo
|
||||
ッジャ,zzya,jja
|
||||
ッジュ,zzyu,jju
|
||||
ッジョ,zzyo,jjo
|
||||
ッタ,tta
|
||||
ッチ,tti
|
||||
ッツ,ttu
|
||||
ッテ,tte
|
||||
ット,tto
|
||||
ッチャ,ttya,ccha,ccya
|
||||
ッチュ,ttyu,cchu,ccyu
|
||||
ッチョ,ttyo,ccho,ccyo
|
||||
ッダ,dda
|
||||
ッヂ,ddi,
|
||||
ッヅ,ddu
|
||||
ッデ,dde
|
||||
ッド,ddo
|
||||
ッヂャ,ddya
|
||||
ッヂュ,ddyu
|
||||
ッヂョ,ddyo
|
||||
ッハ,hha
|
||||
ッヒ,hhi
|
||||
ッフ,hhu,ffu
|
||||
ッへ,hhe
|
||||
ッホ,hho
|
||||
ッヒャ,hhya
|
||||
ッヒュ,hhyu
|
||||
ッヒョ,hhyo
|
||||
ッバ,bba
|
||||
ッビ,bbi
|
||||
ッブ,bbu
|
||||
ッベ,bbe
|
||||
ッボ,bbo
|
||||
ッビャ,bbya
|
||||
ッビュ,bbyu
|
||||
ッビョ,bbyo
|
||||
ッパ,ppa
|
||||
ッピ,ppi
|
||||
ップ,ppu
|
||||
ッペ,ppe
|
||||
ッポ,ppo
|
||||
ッピャ,ppya
|
||||
ッピュ,ppyu
|
||||
ッピョ,ppyo
|
||||
ッマ,mma
|
||||
ッミ,mmi
|
||||
ッム,mmu
|
||||
ッメ,mme
|
||||
ッモ,mmo
|
||||
ッミャ,mmya
|
||||
ッミュ,mmyu
|
||||
ッミョ,mmyo
|
||||
ッヤ,yya
|
||||
ッイ,yyi
|
||||
ッユ,yyu
|
||||
ッイェ,yye
|
||||
ッヨ,yyo
|
||||
ッラ,rra
|
||||
ッリ,rri
|
||||
ッル,rru
|
||||
ッレ,rre
|
||||
ッロ,rro
|
||||
ッリャ,rrya
|
||||
ッリュ,rryu
|
||||
ッリョ,rryo
|
||||
ッワ,wwa
|
||||
ッウィ,wwi
|
||||
ッウ,wwu
|
||||
ッウェ,wwe
|
||||
ッヲ,wwo
|
||||
|
||||
ックァ,kkwa,kkuxa
|
||||
ックィ,kkwi,kkuxi
|
||||
ックゥ,kkwu,kkuxu
|
||||
ックェ,kkwe,kkuxe
|
||||
ックォ,kkwo,kkuxo
|
||||
ッグァ,ggwa,gguxa
|
||||
ッグィ,ggwi,gguxi
|
||||
ッグゥ,ggwu,gguxu
|
||||
ッグェ,ggwe,gguxe
|
||||
ッグォ,ggwo,gguxo
|
||||
ッスァ,sswa,ssuxa
|
||||
ッスィ,sswi,ssuxi
|
||||
ッスゥ,sswu,ssuxu
|
||||
ッスェ,sswe,ssuxe
|
||||
ッスォ,sswo,suxo
|
||||
ットァ,ttwa,ttoxa
|
||||
ットィ,ttwi,ttoxi
|
||||
ットゥ,ttwu,ttoxu
|
||||
ットェ,ttwe,ttoxe
|
||||
ットォ,ttwo,ttoxo
|
||||
ッドァ,ddwa,ddoxa
|
||||
ッドィ,ddwi,ddoxi
|
||||
ッドゥ,ddwu,ddoxi
|
||||
ッドェ,ddwe,ddoxe
|
||||
ッドォ,ddwo,ddoxo
|
||||
ッファ,hhwa,ffa,hhuxa,ffuxa
|
||||
ッフィ,hhwi,ffi,hhuxi,ffuxi
|
||||
ッフェ,hhwe,ffe,hhuxe,ffuxe
|
||||
ッフォ,hhwo,ffo,hhuxo,ffuxo
|
||||
ッヴァ,vva,vvuxa
|
||||
ッヴィ,vvi,vvuxi
|
||||
ッヴ,vvu
|
||||
ッヴェ,vve,vvuxe
|
||||
ッヴォ,vvo,vvuxo
|
||||
ッテァ,ttha,ttexa
|
||||
ッティ,tthi,ttexi
|
||||
ッテェ,tthe,ttexe
|
||||
ッテャ,ttha,ttexya
|
||||
ッテュ,tthu,ttexyu
|
||||
ッテョ,ttho,ttexyo
|
||||
ッフャ,ffya,hhuxya,ffuxya
|
||||
ッフュ,ffyu,hhuxyu,ffuxyu
|
||||
ッフョ,ffyo,hhuxyo,ffuxyo
|
||||
ッヴャ,vvya,vvuxya
|
||||
ッヴュ,vvyu,vvuxyu
|
||||
ッヴョ,vvyo,vvuxyo
|
||||
|
||||
ァ,xa
|
||||
ィ,xi
|
||||
ゥ,xu
|
||||
ェ,xe
|
||||
ォ,xo
|
||||
ヵ,xka
|
||||
ヶ,xke
|
||||
ッ,xtu
|
||||
ャ,xya
|
||||
ュ,xyu
|
||||
ョ,xyo
|
||||
|
||||
ッk,kk
|
||||
ッg,gg
|
||||
ッs,ss
|
||||
ッz,zz
|
||||
ッt,tt
|
||||
ッd,dd
|
||||
ッh,hh
|
||||
ッb,bb
|
||||
ッp,pp
|
||||
ッm,mm
|
||||
ッy,yy
|
||||
ッr,rr
|
||||
ッw,ww
|
||||
|
||||
# below are characters that should be kept but have no explicit romanization rules.
|
||||
# Chōonpu (Katakana-Hiragana Prolonged Sound Mark)
|
||||
ー,ー
|
||||
# Interpunct (Middle Dot)
|
||||
・,・
|
|
@@ -0,0 +1,72 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ja;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Random;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.junit.Test;
|
||||
|
||||
public class TestJapaneseCompletionAnalyzer extends BaseTokenStreamTestCase {
|
||||
|
||||
@Test
|
||||
public void testCompletionDefault() throws IOException {
|
||||
// mode=INDEX (default)
|
||||
Analyzer analyzer = new JapaneseCompletionAnalyzer();
|
||||
assertAnalyzesTo(
|
||||
analyzer,
|
||||
"東京",
|
||||
new String[] {"東京", "toukyou"},
|
||||
new int[] {0, 0},
|
||||
new int[] {2, 2},
|
||||
new int[] {1, 0});
|
||||
analyzer.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCompletionQuery() throws IOException {
|
||||
// mode=QUERY
|
||||
Analyzer analyzer = new JapaneseCompletionAnalyzer(null, JapaneseCompletionFilter.Mode.QUERY);
|
||||
assertAnalyzesTo(
|
||||
analyzer,
|
||||
"東京t",
|
||||
new String[] {"東京t", "toukyout"},
|
||||
new int[] {0, 0},
|
||||
new int[] {3, 3},
|
||||
new int[] {1, 0});
|
||||
analyzer.close();
|
||||
}
|
||||
|
||||
/** blast random strings against the analyzer */
|
||||
@Test
|
||||
public void testRandom() throws IOException {
|
||||
Random random = random();
|
||||
final Analyzer a = new JapaneseCompletionAnalyzer();
|
||||
checkRandomData(random, a, atLeast(100));
|
||||
a.close();
|
||||
}
|
||||
|
||||
/** blast some random large strings through the analyzer */
|
||||
@Test
|
||||
public void testRandomHugeStrings() throws Exception {
|
||||
Random random = random();
|
||||
final Analyzer a = new JapaneseCompletionAnalyzer();
|
||||
checkRandomData(random, a, 2 * RANDOM_MULTIPLIER, 8192);
|
||||
a.close();
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,271 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ja;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.cjk.CJKWidthCharFilter;
|
||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.junit.Test;
|
||||
|
||||
public class TestJapaneseCompletionFilter extends BaseTokenStreamTestCase {
|
||||
private Analyzer indexAnalyzer;
|
||||
private Analyzer queryAnalyzer;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
indexAnalyzer =
|
||||
new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.NORMAL);
|
||||
return new TokenStreamComponents(
|
||||
tokenizer,
|
||||
new JapaneseCompletionFilter(tokenizer, JapaneseCompletionFilter.Mode.INDEX));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReader(String fieldName, Reader reader) {
|
||||
return new CJKWidthCharFilter(reader);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReaderForNormalization(String fieldName, Reader reader) {
|
||||
return new CJKWidthCharFilter(reader);
|
||||
}
|
||||
};
|
||||
queryAnalyzer =
|
||||
new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.NORMAL);
|
||||
return new TokenStreamComponents(
|
||||
tokenizer,
|
||||
new JapaneseCompletionFilter(tokenizer, JapaneseCompletionFilter.Mode.QUERY));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReader(String fieldName, Reader reader) {
|
||||
return new CJKWidthCharFilter(reader);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReaderForNormalization(String fieldName, Reader reader) {
|
||||
return new CJKWidthCharFilter(reader);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public void tearDown() throws Exception {
|
||||
IOUtils.close(indexAnalyzer);
|
||||
IOUtils.close(queryAnalyzer);
|
||||
super.tearDown();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCompletionIndex() throws IOException {
|
||||
assertAnalyzesTo(
|
||||
indexAnalyzer,
|
||||
"東京",
|
||||
new String[] {"東京", "toukyou"},
|
||||
new int[] {0, 0},
|
||||
new int[] {2, 2},
|
||||
new int[] {1, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
indexAnalyzer,
|
||||
"東京都",
|
||||
new String[] {"東京", "toukyou", "都", "to"},
|
||||
new int[] {0, 0, 2, 2},
|
||||
new int[] {2, 2, 3, 3},
|
||||
new int[] {1, 0, 1, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
indexAnalyzer,
|
||||
"ドラえもん",
|
||||
new String[] {"ドラえもん", "doraemon", "doraemonn"},
|
||||
new int[] {0, 0, 0},
|
||||
new int[] {5, 5, 5},
|
||||
new int[] {1, 0, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
indexAnalyzer,
|
||||
"ソースコード",
|
||||
new String[] {"ソース", "soーsu", "コード", "koーdo"},
|
||||
new int[] {0, 0, 3, 3},
|
||||
new int[] {3, 3, 6, 6},
|
||||
new int[] {1, 0, 1, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
indexAnalyzer,
|
||||
"反社会的勢力",
|
||||
new String[] {"反", "han", "hann", "社会", "syakai", "shakai", "的", "teki", "勢力", "seiryoku"},
|
||||
new int[] {0, 0, 0, 1, 1, 1, 3, 3, 4, 4},
|
||||
new int[] {1, 1, 1, 3, 3, 3, 4, 4, 6, 6},
|
||||
new int[] {1, 0, 0, 1, 0, 0, 1, 0, 1, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
indexAnalyzer, "々", new String[] {"々"}, new int[] {0}, new int[] {1}, new int[] {1});
|
||||
|
||||
assertAnalyzesTo(
|
||||
indexAnalyzer,
|
||||
"是々",
|
||||
new String[] {"是", "ze", "々"},
|
||||
new int[] {0, 0, 1},
|
||||
new int[] {1, 1, 2},
|
||||
new int[] {1, 0, 1});
|
||||
|
||||
assertAnalyzesTo(
|
||||
indexAnalyzer,
|
||||
"是々の",
|
||||
new String[] {"是", "ze", "々", "の", "no"},
|
||||
new int[] {0, 0, 1, 2, 2},
|
||||
new int[] {1, 1, 2, 3, 3},
|
||||
new int[] {1, 0, 1, 1, 0});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCompletionQuery() throws IOException {
|
||||
assertAnalyzesTo(
|
||||
queryAnalyzer,
|
||||
"東京",
|
||||
new String[] {"東京", "toukyou"},
|
||||
new int[] {0, 0},
|
||||
new int[] {2, 2},
|
||||
new int[] {1, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
queryAnalyzer,
|
||||
"東京都",
|
||||
new String[] {"東京", "toukyou", "都", "to"},
|
||||
new int[] {0, 0, 2, 2},
|
||||
new int[] {2, 2, 3, 3},
|
||||
new int[] {1, 0, 1, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
queryAnalyzer,
|
||||
"ドラえもん",
|
||||
new String[] {"ドラえもん", "doraemon", "doraemonn"},
|
||||
new int[] {0, 0, 0},
|
||||
new int[] {5, 5, 5},
|
||||
new int[] {1, 0, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
queryAnalyzer,
|
||||
"ソースコード",
|
||||
new String[] {"ソースコード", "soーsukoーdo"},
|
||||
new int[] {0, 0},
|
||||
new int[] {6, 6},
|
||||
new int[] {1, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
queryAnalyzer,
|
||||
"反社会的勢力",
|
||||
new String[] {"反", "han", "hann", "社会", "syakai", "shakai", "的", "teki", "勢力", "seiryoku"},
|
||||
new int[] {0, 0, 0, 1, 1, 1, 3, 3, 4, 4},
|
||||
new int[] {1, 1, 1, 3, 3, 3, 4, 4, 6, 6},
|
||||
new int[] {1, 0, 0, 1, 0, 0, 1, 0, 1, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
queryAnalyzer, "々", new String[] {"々"}, new int[] {0}, new int[] {1}, new int[] {1});
|
||||
|
||||
assertAnalyzesTo(
|
||||
queryAnalyzer,
|
||||
"是々",
|
||||
new String[] {"是", "ze", "々"},
|
||||
new int[] {0, 0, 1},
|
||||
new int[] {1, 1, 2},
|
||||
new int[] {1, 0, 1});
|
||||
|
||||
assertAnalyzesTo(
|
||||
queryAnalyzer,
|
||||
"是々の",
|
||||
new String[] {"是", "ze", "々", "の", "no"},
|
||||
new int[] {0, 0, 1, 2, 2},
|
||||
new int[] {1, 1, 2, 3, 3},
|
||||
new int[] {1, 0, 1, 1, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
queryAnalyzer,
|
||||
"東京t",
|
||||
new String[] {"東京t", "toukyout"},
|
||||
new int[] {0, 0},
|
||||
new int[] {3, 3},
|
||||
new int[] {1, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
queryAnalyzer,
|
||||
"サッk",
|
||||
new String[] {"サッk", "sakk"},
|
||||
new int[] {0, 0},
|
||||
new int[] {3, 3},
|
||||
new int[] {1, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
queryAnalyzer,
|
||||
"反sy",
|
||||
new String[] {"反sy", "hansy", "hannsy"},
|
||||
new int[] {0, 0, 0},
|
||||
new int[] {3, 3, 3},
|
||||
new int[] {1, 0, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
queryAnalyzer,
|
||||
"さーきゅr",
|
||||
new String[] {"さーきゅr", "saーkyur"},
|
||||
new int[] {0, 0},
|
||||
new int[] {5, 5},
|
||||
new int[] {1, 0});
|
||||
|
||||
assertAnalyzesTo(
|
||||
queryAnalyzer,
|
||||
"是々h",
|
||||
new String[] {"是", "ze", "々h"},
|
||||
new int[] {0, 0, 1},
|
||||
new int[] {1, 1, 3},
|
||||
new int[] {1, 0, 1});
|
||||
}
|
||||
|
||||
public void testEnglish() throws IOException {
|
||||
assertAnalyzesTo(indexAnalyzer, "this atest", new String[] {"this", "atest"});
|
||||
assertAnalyzesTo(queryAnalyzer, "this atest", new String[] {"this", "atest"});
|
||||
}
|
||||
|
||||
public void testRandomStrings() throws IOException {
|
||||
checkRandomData(random(), indexAnalyzer, atLeast(200));
|
||||
checkRandomData(random(), queryAnalyzer, atLeast(200));
|
||||
}
|
||||
|
||||
public void testEmptyTerm() throws IOException {
|
||||
Analyzer a =
|
||||
new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new KeywordTokenizer();
|
||||
return new TokenStreamComponents(tokenizer, new JapaneseCompletionFilter(tokenizer));
|
||||
}
|
||||
};
|
||||
checkOneTerm(a, "", "");
|
||||
a.close();
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,59 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ja;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.cjk.CJKWidthFilterFactory;
|
||||
import org.junit.Test;
|
||||
|
||||
public class TestJapaneseCompletionFilterFactory extends BaseTokenStreamFactoryTestCase {
|
||||
@Test
|
||||
public void testCompletion() throws IOException {
|
||||
JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap<>());
|
||||
TokenStream tokenStream = tokenizerFactory.create();
|
||||
((Tokenizer) tokenStream).setReader(new StringReader("東京t"));
|
||||
CJKWidthFilterFactory cjkWidthFactory = new CJKWidthFilterFactory(new HashMap<>());
|
||||
tokenStream = cjkWidthFactory.create(tokenStream);
|
||||
Map<String, String> map = new HashMap<>();
|
||||
map.put("mode", "QUERY");
|
||||
JapaneseCompletionFilterFactory filterFactory = new JapaneseCompletionFilterFactory(map);
|
||||
assertTokenStreamContents(filterFactory.create(tokenStream), new String[] {"東京t", "toukyout"});
|
||||
}
|
||||
|
||||
/** Test that bogus arguments result in exception */
|
||||
@Test
|
||||
public void testBogusArguments() throws Exception {
|
||||
IllegalArgumentException expected =
|
||||
expectThrows(
|
||||
IllegalArgumentException.class,
|
||||
() -> {
|
||||
new JapaneseCompletionFilterFactory(
|
||||
new HashMap<String, String>() {
|
||||
{
|
||||
put("bogusArg", "bogusValue");
|
||||
}
|
||||
});
|
||||
});
|
||||
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,68 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ja.completion;
|
||||
|
||||
import java.util.List;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.junit.Test;
|
||||
|
||||
public class TestKatakanaRomanizer extends LuceneTestCase {
|
||||
private final KatakanaRomanizer romanizer = KatakanaRomanizer.getInstance();
|
||||
|
||||
@Test
|
||||
public void testRomanize() {
|
||||
assertCharsRefListEqualsUnordered(
|
||||
List.of(new CharsRef("hasi"), new CharsRef("hashi")),
|
||||
romanizer.romanize(new CharsRef("ハシ")));
|
||||
assertCharsRefListEqualsUnordered(
|
||||
List.of(new CharsRef("yuukyuu")), romanizer.romanize(new CharsRef("ユウキュウ")));
|
||||
assertCharsRefListEqualsUnordered(
|
||||
List.of(new CharsRef("yakyuu")), romanizer.romanize(new CharsRef("ヤキュウ")));
|
||||
assertCharsRefListEqualsUnordered(
|
||||
List.of(new CharsRef("toukyou")), romanizer.romanize(new CharsRef("トウキョウ")));
|
||||
assertCharsRefListEqualsUnordered(
|
||||
List.of(new CharsRef("toーkyoー")), romanizer.romanize(new CharsRef("トーキョー")));
|
||||
assertCharsRefListEqualsUnordered(
|
||||
List.of(new CharsRef("sakka")), romanizer.romanize(new CharsRef("サッカ")));
|
||||
assertCharsRefListEqualsUnordered(
|
||||
List.of(new CharsRef("hyakkaten"), new CharsRef("hyakkatenn")),
|
||||
romanizer.romanize(new CharsRef("ヒャッカテン")));
|
||||
assertCharsRefListEqualsUnordered(
|
||||
List.of(new CharsRef("voruteーru"), new CharsRef("vuxoruteーru")),
|
||||
romanizer.romanize(new CharsRef("ヴォルテール")));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRomanizeWithAlphabets() {
|
||||
assertCharsRefListEqualsUnordered(
|
||||
List.of(new CharsRef("toukyout")), romanizer.romanize(new CharsRef("トウキョウt")));
|
||||
assertCharsRefListEqualsUnordered(
|
||||
List.of(new CharsRef("kodakk")), romanizer.romanize(new CharsRef("コダッk")));
|
||||
assertCharsRefListEqualsUnordered(
|
||||
List.of(new CharsRef("syousy"), new CharsRef("shousy")),
|
||||
romanizer.romanize(new CharsRef("ショウsy")));
|
||||
}
|
||||
|
||||
private static void assertCharsRefListEqualsUnordered(
|
||||
List<CharsRef> expected, List<CharsRef> actual) {
|
||||
assertEquals(expected.size(), actual.size());
|
||||
for (CharsRef ref : expected) {
|
||||
assertTrue(ref.toString() + " is not contained in " + actual, actual.contains(ref));
|
||||
}
|
||||
}
|
||||
}