mirror of https://github.com/apache/lucene.git
LUCENE-3745: add proper Japanese stopping
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1240714 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
009608d9f2
commit
03497e7595
|
@ -1,6 +1,14 @@
|
||||||
# set of default stop tags:
|
#
|
||||||
# uncomment a part of speech to treat those words as stopwords.
|
# This file defines a Japanese stoptag set for KuromojiPartOfSpeechStopFilter.
|
||||||
# the entire tagset is provided here for convenience.
|
#
|
||||||
|
# Any token with a part-of-speech tag that exactly matches those defined in this
|
||||||
|
# file is removed from the token stream.
|
||||||
|
#
|
||||||
|
# Set your own stoptags by uncommenting the lines below. Note that comments are
|
||||||
|
# not allowed on the same line as a stoptag. See LUCENE-3745 for frequency lists,
|
||||||
|
# etc. that can be useful for building your own stoptag set.
|
||||||
|
#
|
||||||
|
# The entire possible tagset is provided below for convenience.
|
||||||
#
|
#
|
||||||
#####
|
#####
|
||||||
# noun: unclassified nouns
|
# noun: unclassified nouns
|
||||||
|
@ -188,25 +196,25 @@
|
||||||
#
|
#
|
||||||
#####
|
#####
|
||||||
# prefix: unclassified prefixes
|
# prefix: unclassified prefixes
|
||||||
接頭詞
|
#接頭詞
|
||||||
#
|
#
|
||||||
# prefix-nominal: Prefixes that attach to nouns (including adjective stem forms)
|
# prefix-nominal: Prefixes that attach to nouns (including adjective stem forms)
|
||||||
# excluding numerical expressions.
|
# excluding numerical expressions.
|
||||||
# e.g. お (水), 某 (氏), 同 (社), 故 (~氏), 高 (品質), お (見事), ご (立派)
|
# e.g. お (水), 某 (氏), 同 (社), 故 (~氏), 高 (品質), お (見事), ご (立派)
|
||||||
接頭詞-名詞接続
|
#接頭詞-名詞接続
|
||||||
#
|
#
|
||||||
# prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb
|
# prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb
|
||||||
# in conjunctive form followed by なる/なさる/くださる.
|
# in conjunctive form followed by なる/なさる/くださる.
|
||||||
# e.g. お (読みなさい), お (座り)
|
# e.g. お (読みなさい), お (座り)
|
||||||
接頭詞-動詞接続
|
#接頭詞-動詞接続
|
||||||
#
|
#
|
||||||
# prefix-adjectival: Prefixes that attach to adjectives.
|
# prefix-adjectival: Prefixes that attach to adjectives.
|
||||||
# e.g. お (寒いですねえ), バカ (でかい)
|
# e.g. お (寒いですねえ), バカ (でかい)
|
||||||
接頭詞-形容詞接続
|
#接頭詞-形容詞接続
|
||||||
#
|
#
|
||||||
# prefix-numerical: Prefixes that attach to numerical expressions.
|
# prefix-numerical: Prefixes that attach to numerical expressions.
|
||||||
# e.g. 約, およそ, 毎時
|
# e.g. 約, およそ, 毎時
|
||||||
接頭詞-数接続
|
#接頭詞-数接続
|
||||||
#
|
#
|
||||||
#####
|
#####
|
||||||
# verb: unclassified verbs
|
# verb: unclassified verbs
|
||||||
|
@ -216,7 +224,7 @@
|
||||||
#動詞-自立
|
#動詞-自立
|
||||||
#
|
#
|
||||||
# verb-auxiliary:
|
# verb-auxiliary:
|
||||||
動詞-非自立
|
#動詞-非自立
|
||||||
#
|
#
|
||||||
# verb-suffix:
|
# verb-suffix:
|
||||||
#動詞-接尾
|
#動詞-接尾
|
||||||
|
@ -351,11 +359,11 @@
|
||||||
# interjection: Greetings and other exclamations.
|
# interjection: Greetings and other exclamations.
|
||||||
# e.g. おはよう, おはようございます, こんにちは, こんばんは, ありがとう, どうもありがとう, ありがとうございます,
|
# e.g. おはよう, おはようございます, こんにちは, こんばんは, ありがとう, どうもありがとう, ありがとうございます,
|
||||||
# いただきます, ごちそうさま, さよなら, さようなら, はい, いいえ, ごめん, ごめんなさい
|
# いただきます, ごちそうさま, さよなら, さようなら, はい, いいえ, ごめん, ごめんなさい
|
||||||
感動詞
|
#感動詞
|
||||||
#
|
#
|
||||||
#####
|
#####
|
||||||
# symbol: unclassified Symbols.
|
# symbol: unclassified Symbols.
|
||||||
#記号
|
記号
|
||||||
#
|
#
|
||||||
# symbol-misc: A general symbol not in one of the categories below.
|
# symbol-misc: A general symbol not in one of the categories below.
|
||||||
# e.g. [○◎@$〒→+]
|
# e.g. [○◎@$〒→+]
|
||||||
|
@ -408,3 +416,5 @@
|
||||||
#####
|
#####
|
||||||
# unknown: unknown part of speech.
|
# unknown: unknown part of speech.
|
||||||
#未知語
|
#未知語
|
||||||
|
#
|
||||||
|
##### End of file
|
|
@ -1,13 +1,122 @@
|
||||||
# short set of japanese stopwords
|
#
|
||||||
いう
|
# This file defines a stopword set for Japanese.
|
||||||
する
|
#
|
||||||
人物
|
# The set is made up of hand-picked frequent terms taken from segmented Japanese
|
||||||
さま
|
# Wikipedia. Punctuation characters and frequent kanji have mostly been left out.
|
||||||
すること
|
#
|
||||||
ため
|
# There is an overlap between these stopwords and the terms removed when used in
|
||||||
もの
|
# combination with the KuromojiPartOfSpeechStopFilter. When editing this file, note
|
||||||
おいて
|
# that comments are not allowed on the same line as stopwords.
|
||||||
なる
|
#
|
||||||
できる
|
# See LUCENE-3745 for frequency lists, etc. that can be useful for making your own set.
|
||||||
おく
|
#
|
||||||
|
の
|
||||||
|
に
|
||||||
|
は
|
||||||
|
を
|
||||||
|
た
|
||||||
|
が
|
||||||
|
で
|
||||||
|
て
|
||||||
|
と
|
||||||
|
し
|
||||||
|
れ
|
||||||
|
さ
|
||||||
ある
|
ある
|
||||||
|
いる
|
||||||
|
も
|
||||||
|
する
|
||||||
|
から
|
||||||
|
な
|
||||||
|
こと
|
||||||
|
として
|
||||||
|
い
|
||||||
|
や
|
||||||
|
れる
|
||||||
|
など
|
||||||
|
なっ
|
||||||
|
ない
|
||||||
|
この
|
||||||
|
ため
|
||||||
|
その
|
||||||
|
あっ
|
||||||
|
よう
|
||||||
|
また
|
||||||
|
もの
|
||||||
|
という
|
||||||
|
あり
|
||||||
|
まで
|
||||||
|
られ
|
||||||
|
なる
|
||||||
|
へ
|
||||||
|
か
|
||||||
|
だ
|
||||||
|
これ
|
||||||
|
によって
|
||||||
|
により
|
||||||
|
おり
|
||||||
|
より
|
||||||
|
による
|
||||||
|
ず
|
||||||
|
なり
|
||||||
|
られる
|
||||||
|
において
|
||||||
|
ば
|
||||||
|
なかっ
|
||||||
|
なく
|
||||||
|
しかし
|
||||||
|
について
|
||||||
|
せ
|
||||||
|
だっ
|
||||||
|
その後
|
||||||
|
できる
|
||||||
|
それ
|
||||||
|
う
|
||||||
|
ので
|
||||||
|
なお
|
||||||
|
のみ
|
||||||
|
でき
|
||||||
|
き
|
||||||
|
つ
|
||||||
|
における
|
||||||
|
および
|
||||||
|
いう
|
||||||
|
さらに
|
||||||
|
でも
|
||||||
|
ら
|
||||||
|
たり
|
||||||
|
その他
|
||||||
|
に関する
|
||||||
|
たち
|
||||||
|
ます
|
||||||
|
ん
|
||||||
|
なら
|
||||||
|
に対して
|
||||||
|
特に
|
||||||
|
せる
|
||||||
|
及び
|
||||||
|
これら
|
||||||
|
とき
|
||||||
|
では
|
||||||
|
にて
|
||||||
|
ほか
|
||||||
|
ながら
|
||||||
|
うち
|
||||||
|
そして
|
||||||
|
とともに
|
||||||
|
ただし
|
||||||
|
かつて
|
||||||
|
それぞれ
|
||||||
|
または
|
||||||
|
お
|
||||||
|
ほど
|
||||||
|
ものの
|
||||||
|
に対する
|
||||||
|
ほとんど
|
||||||
|
と共に
|
||||||
|
といった
|
||||||
|
です
|
||||||
|
とも
|
||||||
|
ところ
|
||||||
|
ここ
|
||||||
|
##### End of file
|
||||||
|
|
Loading…
Reference in New Issue