fix typo analysis-kuromoji (#12047)

This commit is contained in:
twosom 2023-01-02 00:58:50 +09:00 committed by GitHub
parent 4eab1d74e8
commit 4676a735c1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 9 additions and 9 deletions

View File

@ -39,9 +39,9 @@ import org.apache.lucene.util.IgnoreRandomChains;
* <p>Notice that this analyzer uses a token composition scheme and relies on punctuation tokens
* being found in the token stream. Please make sure your {@link JapaneseTokenizer} has {@code
* discardPunctuation} set to false. In case punctuation characters, such as ． (U+FF0E FULLWIDTH
* FULL STOP), is removed from the token stream, this filter would find input tokens tokens and 2千
* and give outputs 3 and 2000 instead of 3200, which is likely not the intended result. If you want
* to remove punctuation characters from your index that are not part of normalized numbers, add a
* FULL STOP), are removed from the token stream, this filter would find input tokens 3 and 2千 and
* give outputs 3 and 2000 instead of 3200, which is likely not the intended result. If you want to
* remove punctuation characters from your index that are not part of normalized numbers, add a
* {@link org.apache.lucene.analysis.StopFilter} with the punctuation you wish to remove after
* {@link JapaneseNumberFilter} in your analyzer chain.
*
@ -59,8 +59,8 @@ import org.apache.lucene.util.IgnoreRandomChains;
* <li>15,7 becomes 157 (be aware of this weakness)
* </ul>
*
* <p>Tokens preceded by a token with {@link PositionIncrementAttribute} of zero are left left
* untouched and emitted as-is.
* <p>Tokens preceded by a token with {@link PositionIncrementAttribute} of zero are left untouched
* and emitted as-is.
*
* <p>This filter does not use any part-of-speech information for its normalization and the
* motivation for this is to also support n-grammed token streams in the future.

View File

@ -72,7 +72,7 @@ public class CharSequenceUtils {
return ch >= 0xff41 && ch <= 0xff5a;
}
/** Convert all hiragana in a string into kanataka */
/** Convert all hiragana in a string into Katakana */
public static String toKatakana(CharSequence s) {
char[] chars = new char[s.length()];
for (int i = 0; i < s.length(); i++) {

View File

@ -1,8 +1,8 @@
# mapping rules of katakana (an unit of keystroke) to list of acceptable romanizations.
# mapping rules of katakana (a unit of keystroke) to list of acceptable romanizations.
# longest-match is used to find entries in this list.
# covers romanization systems: modified Hepburn-shiki, Kunrei-shiki (Nihon-shiki), and Wāpuro shiki.
# note: this does not strictly comply with the romanization systems listed above,
# but tries to cover possible keystoroke supported by various Input Methods.
# but tries to cover possible keystrokes supported by various Input Methods.
ア,a
イ,i
@ -341,4 +341,4 @@
# Chōonpu (Katakana-Hiragana Prolonged Sound Mark)
ー,ー
# Interpunct (Middle Dot)
・,・
・,・