mirror of https://github.com/apache/lucene.git
fix typo analysis-kuromoji (#12047)
This commit is contained in:
parent
4eab1d74e8
commit
4676a735c1
|
@ -39,9 +39,9 @@ import org.apache.lucene.util.IgnoreRandomChains;
|
|||
* <p>Notice that this analyzer uses a token composition scheme and relies on punctuation tokens
|
||||
* being found in the token stream. Please make sure your {@link JapaneseTokenizer} has {@code
|
||||
* discardPunctuation} set to false. In case punctuation characters, such as ． (U+FF0E FULLWIDTH
|
||||
* FULL STOP), is removed from the token stream, this filter would find input tokens tokens 3 and 2千
|
||||
* and give outputs 3 and 2000 instead of 3200, which is likely not the intended result. If you want
|
||||
* to remove punctuation characters from your index that are not part of normalized numbers, add a
|
||||
* FULL STOP), are removed from the token stream, this filter would find input tokens 3 and 2千 and
|
||||
* give outputs 3 and 2000 instead of 3200, which is likely not the intended result. If you want to
|
||||
* remove punctuation characters from your index that are not part of normalized numbers, add a
|
||||
* {@link org.apache.lucene.analysis.StopFilter} with the punctuation you wish to remove after
|
||||
* {@link JapaneseNumberFilter} in your analyzer chain.
|
||||
*
|
||||
|
@ -59,8 +59,8 @@ import org.apache.lucene.util.IgnoreRandomChains;
|
|||
* <li>15,7 becomes 157 (be aware of this weakness)
|
||||
* </ul>
|
||||
*
|
||||
* <p>Tokens preceded by a token with {@link PositionIncrementAttribute} of zero are left left
|
||||
* untouched and emitted as-is.
|
||||
* <p>Tokens preceded by a token with {@link PositionIncrementAttribute} of zero are left untouched
|
||||
* and emitted as-is.
|
||||
*
|
||||
* <p>This filter does not use any part-of-speech information for its normalization and the
|
||||
* motivation for this is to also support n-grammed token streams in the future.
|
||||
|
|
|
@ -72,7 +72,7 @@ public class CharSequenceUtils {
|
|||
return ch >= 0xff41 && ch <= 0xff5a;
|
||||
}
|
||||
|
||||
/** Convert all hiragana in a string into kanataka */
|
||||
/** Convert all hiragana in a string into Katakana */
|
||||
public static String toKatakana(CharSequence s) {
|
||||
char[] chars = new char[s.length()];
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
# mapping rules of katakana (an unit of keystroke) to list of acceptable romanizations.
|
||||
# mapping rules of katakana (a unit of keystroke) to list of acceptable romanizations.
|
||||
# longest-match is used to find entries in this list.
|
||||
# covers romanization systems: modified Hepburn-shiki, Kunrei-shiki (Nihon-shiki), and Wāpuro shiki.
|
||||
# note: this does not strictly comply with the romanization systems listed above,
|
||||
# but tries to cover possible keystoroke supported by various Input Methods.
|
||||
# but tries to cover possible keystrokes supported by various Input Methods.
|
||||
|
||||
ア,a
|
||||
イ,i
|
||||
|
@ -341,4 +341,4 @@
|
|||
# Chōonpu (Katakana-Hiragana Prolonged Sound Mark)
|
||||
ー,ー
|
||||
# Interpunct (Middle Dot)
|
||||
・,・
|
||||
・,・
|
||||
|
|
Loading…
Reference in New Issue