FIX: Use the same mode for Chinese search when indexing and querying. (#14780)
The `白名单` term becomes `名单 白名单` after it is processed by cppjieba in :query mode. However, `白名单` is not tokenized that way by cppjieba when it appears in a string of text. Therefore, this may lead to failed matches, as the search data generated while indexing may not contain all of the terms generated by :query mode. We've decided to maintain parity for now, such that both indexing and querying use the same :mix mode. This may lead to less accurate search, but our plan is to properly support CJK search in the future.
This commit is contained in:
parent
a059c7251f
commit
a03c48b720
|
@ -69,19 +69,17 @@ class Search
|
||||||
SiteSetting.search_tokenize_chinese_japanese_korean
|
SiteSetting.search_tokenize_chinese_japanese_korean
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.prepare_data(search_data, purpose = :query)
|
def self.prepare_data(search_data, purpose = nil)
|
||||||
purpose ||= :query
|
|
||||||
|
|
||||||
data = search_data.dup
|
data = search_data.dup
|
||||||
data.force_encoding("UTF-8")
|
data.force_encoding("UTF-8")
|
||||||
|
|
||||||
if purpose != :topic
|
if purpose != :topic
|
||||||
# TODO cppjieba_rb is designed for chinese, we need something else for Japanese
|
# TODO cppjieba_rb is designed for chinese, we need something else for Japanese
|
||||||
# Korean appears to be safe cause words are already space separated
|
# Korean appears to be safe cause words are already space separated
|
||||||
# For Japanese we should investigate using kakasi
|
# For Japanese we should investigate using kakasi
|
||||||
if segment_cjk?
|
if segment_cjk?
|
||||||
require 'cppjieba_rb' unless defined? CppjiebaRb
|
require 'cppjieba_rb' unless defined? CppjiebaRb
|
||||||
mode = (purpose == :query ? :query : :mix)
|
data = CppjiebaRb.segment(search_data, mode: :mix)
|
||||||
data = CppjiebaRb.segment(search_data, mode: mode)
|
|
||||||
|
|
||||||
# TODO: we still want to tokenize here but the current stopword list is too wide
|
# TODO: we still want to tokenize here but the current stopword list is too wide
|
||||||
# in cppjieba leading to words such as volume to be skipped. PG already has an English
|
# in cppjieba leading to words such as volume to be skipped. PG already has an English
|
||||||
|
|
|
@ -1107,7 +1107,7 @@ describe Search do
|
||||||
it 'splits English / Chinese and filter out stop words' do
|
it 'splits English / Chinese and filter out stop words' do
|
||||||
SiteSetting.default_locale = 'zh_CN'
|
SiteSetting.default_locale = 'zh_CN'
|
||||||
data = Search.prepare_data(sentence).split(' ')
|
data = Search.prepare_data(sentence).split(' ')
|
||||||
expect(data).to eq(["Discourse", "中国", "基础", "设施", "基础设施", "网络", "正在", "组装"])
|
expect(data).to eq(["Discourse", "中国", "基础设施", "网络", "正在", "组装"])
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'splits for indexing and filter out stop words' do
|
it 'splits for indexing and filter out stop words' do
|
||||||
|
@ -1119,12 +1119,6 @@ describe Search do
|
||||||
it 'splits English / Traditional Chinese and filter out stop words' do
|
it 'splits English / Traditional Chinese and filter out stop words' do
|
||||||
SiteSetting.default_locale = 'zh_TW'
|
SiteSetting.default_locale = 'zh_TW'
|
||||||
data = Search.prepare_data(sentence_t).split(' ')
|
data = Search.prepare_data(sentence_t).split(' ')
|
||||||
expect(data).to eq(["Discourse", "太平", "平山", "太平山", "森林", "遊樂區"])
|
|
||||||
end
|
|
||||||
|
|
||||||
it 'splits for indexing and filter out stop words' do
|
|
||||||
SiteSetting.default_locale = 'zh_TW'
|
|
||||||
data = Search.prepare_data(sentence_t, :index).split(' ')
|
|
||||||
expect(data).to eq(["Discourse", "太平山", "森林", "遊樂區"])
|
expect(data).to eq(["Discourse", "太平山", "森林", "遊樂區"])
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue