FEATURE: allow CJK to be tokenized on non-CJK sites.

This means a mixed English/Chinese site can still have a functioning search.
This commit is contained in:
Sam 2015-11-27 16:35:27 +11:00
parent b0905bee15
commit f74a6457ee
4 changed files with 16 additions and 1 deletions

View File

@ -792,6 +792,7 @@ en:
max_topic_title_length: "Maximum allowed topic title length in characters" max_topic_title_length: "Maximum allowed topic title length in characters"
min_private_message_title_length: "Minimum allowed title length for a message in characters" min_private_message_title_length: "Minimum allowed title length for a message in characters"
min_search_term_length: "Minimum valid search term length in characters" min_search_term_length: "Minimum valid search term length in characters"
search_tokenize_chinese_japanese_korean: "Force search to tokenize Chinese/Japanese/Korean even on non CJK sites"
allow_uncategorized_topics: "Allow topics to be created without a category. WARNING: If there are any uncategorized topics, you must recategorize them before turning this off." allow_uncategorized_topics: "Allow topics to be created without a category. WARNING: If there are any uncategorized topics, you must recategorize them before turning this off."
uncategorized_description: "The description of the uncategorized category. Leave blank for no description." uncategorized_description: "The description of the uncategorized category. Leave blank for no description."
allow_duplicate_topic_titles: "Allow topics with identical, duplicate titles." allow_duplicate_topic_titles: "Allow topics with identical, duplicate titles."

View File

@ -887,6 +887,8 @@ uncategorized:
min_search_term_length: min_search_term_length:
client: true client: true
default: 3 default: 3
search_tokenize_chinese_japanese_korean: false
max_similar_results: 5 max_similar_results: 5
minimum_topics_similar: 50 minimum_topics_similar: 50

View File

@ -76,7 +76,7 @@ class Search
def self.prepare_data(search_data) def self.prepare_data(search_data)
data = search_data.squish data = search_data.squish
# TODO rmmseg is designed for chinese, we need something else for Korean / Japanese # TODO rmmseg is designed for chinese, we need something else for Korean / Japanese
if ['zh_TW', 'zh_CN', 'ja', 'ko'].include?(SiteSetting.default_locale) if ['zh_TW', 'zh_CN', 'ja', 'ko'].include?(SiteSetting.default_locale) || SiteSetting.search_tokenize_chinese_japanese_korean
unless defined? RMMSeg unless defined? RMMSeg
require 'rmmseg' require 'rmmseg'
RMMSeg::Dictionary.load_dictionaries RMMSeg::Dictionary.load_dictionaries

View File

@ -392,6 +392,18 @@ describe Search do
expect(Search.execute('社區指南').posts.first.id).to eq(post.id) expect(Search.execute('社區指南').posts.first.id).to eq(post.id)
expect(Search.execute('指南').posts.first.id).to eq(post.id) expect(Search.execute('指南').posts.first.id).to eq(post.id)
end end
# With CJK tokenization forced through the site setting, a topic whose title
# mixes English and Chinese must be discoverable by its Chinese terms.
it 'finds chinese topic based on title if tokenization is forced' do
  skip("skipped until pg app installs the db correctly") if RbConfig::CONFIG["arch"] =~ /darwin/

  SiteSetting.search_tokenize_chinese_japanese_korean = true

  mixed_title_topic = Fabricate(:topic, title: 'My Title Discourse社區指南')
  first_post = Fabricate(:post, topic: mixed_title_topic)

  # Both the full Chinese phrase and its trailing sub-term should rank the
  # fabricated post first.
  ['社區指南', '指南'].each do |term|
    top_hit = Search.execute(term).posts.first
    expect(top_hit.id).to eq(first_post.id)
  end
end
end end
describe 'Advanced search' do describe 'Advanced search' do