FIX: remove superfluous spaces from CJK blurbs (#12629)
Previously, blurbs were generated from the indexed raw data even when the content was Chinese, Japanese, or Korean text. This caused superfluous spaces to show up in excerpts.
This commit is contained in:
parent
5e93730375
commit
5b342ae505
|
@ -64,6 +64,11 @@ class Search
|
|||
end
|
||||
end
|
||||
|
||||
# Reports whether search data should go through CJK segmentation.
#
# Returns true when the site's default locale is zh_TW, zh_CN or ja;
# otherwise returns the value of the
# search_tokenize_chinese_japanese_korean site setting.
def self.segment_cjk?
  return true if %w[zh_TW zh_CN ja].include?(SiteSetting.default_locale)

  SiteSetting.search_tokenize_chinese_japanese_korean
end
|
||||
|
||||
def self.prepare_data(search_data, purpose = :query)
|
||||
purpose ||= :query
|
||||
|
||||
|
@ -73,7 +78,7 @@ class Search
|
|||
# TODO: cppjieba_rb is designed for Chinese; we need something else for Japanese
|
||||
# Korean appears to be safe because words are already space-separated
|
||||
# For Japanese we should investigate using kakasi
|
||||
if ['zh_TW', 'zh_CN', 'ja'].include?(SiteSetting.default_locale) || SiteSetting.search_tokenize_chinese_japanese_korean
|
||||
if segment_cjk?
|
||||
require 'cppjieba_rb' unless defined? CppjiebaRb
|
||||
mode = (purpose == :query ? :query : :mix)
|
||||
data = CppjiebaRb.segment(search_data, mode: mode)
|
||||
|
|
|
@ -87,7 +87,7 @@ class Search
|
|||
blurb_length: @blurb_length
|
||||
}
|
||||
|
||||
if post.post_search_data.version > SearchIndexer::MIN_POST_REINDEX_VERSION
|
||||
if post.post_search_data.version > SearchIndexer::MIN_POST_REINDEX_VERSION && !Search.segment_cjk?
|
||||
if SiteSetting.use_pg_headlines_for_excerpt
|
||||
scrubbed_headline = post.headline.gsub(SCRUB_HEADLINE_REGEXP, '\1')
|
||||
prefix_omission = scrubbed_headline.start_with?(post.leading_raw_data) ? '' : OMISSION
|
||||
|
|
|
@ -1791,6 +1791,27 @@ describe Search do
|
|||
end
|
||||
end
|
||||
|
||||
# With CJK tokenization enabled, a blurb must still contain the searched
# phrase as one contiguous run of characters.
context 'CJK segmentation' do
  # Settings are applied before the post below is fabricated; keep this
  # hook ahead of the let! so that ordering is preserved.
  before do
    SiteSetting.search_tokenize_chinese_japanese_korean = true
    SiteSetting.min_search_term_length = 1
  end

  let!(:post1) { Fabricate(:post, raw: '場サアマネ織企ういかせ竹域ヱイマ穂基ホ神3予読ずねいぱ松査ス禁多サウ提懸イふ引小43改こょドめ。深とつぐ主思料農ぞかル者杯検める活分えほづぼ白犠') }

  it 'does not include superflous spaces in blurbs' do
    search = Search.execute('ういかせ竹域', type_filter: 'topic')
    expect(search.posts.length).to eq(1)

    # The term only matches here if no spaces were inserted between its characters.
    excerpt = search.blurb(search.posts.first)
    expect(excerpt).to include('ういかせ竹域')
  end
end
|
||||
|
||||
context 'include_diacritics' do
|
||||
before { SiteSetting.search_ignore_accents = false }
|
||||
let!(:post1) { Fabricate(:post, raw: 'สวัสดี Régis hello') }
|
||||
|
|
Loading…
Reference in New Issue