FIX: do not remove stop words when using English locale

PG already handles English stop words. The stop word list in cppjieba
is bigger than the list PG uses, which causes confusion because
words such as "volume" are stripped by the cppjieba stop word list.

We will follow up with another commit to apply the Chinese
stop words, but for now, to eliminate the confusion, we are
skipping the cppjieba stop word list when the dictionary in PG is
English.
This commit is contained in:
Sam Saffron 2020-05-18 10:54:56 +10:00
parent ea63fa7de7
commit 862773ec83
No known key found for this signature in database
GPG Key ID: B9606168D2FFD9F5
2 changed files with 26 additions and 1 deletions

View File

@ -72,7 +72,18 @@ class Search
require 'cppjieba_rb' unless defined? CppjiebaRb
mode = (purpose == :query ? :query : :mix)
data = CppjiebaRb.segment(search_data, mode: mode)
data = CppjiebaRb.filter_stop_word(data).join(' ')
# TODO: we still want to tokenize here but the current stopword list is too wide
# in cppjieba leading to words such as volume to be skipped. PG already has an English
# stopword list so use that vs relying on cppjieba
if ts_config != 'english'
data = CppjiebaRb.filter_stop_word(data)
else
data = data.filter { |s| s.present? }
end
data = data.join(' ')
else
data.squish!
end

View File

@ -4,6 +4,20 @@ require 'rails_helper'
describe Search do
# Specs for Search.prepare_data with Chinese/Japanese/Korean tokenization enabled.
context "#prepare_data" do
it "does not remove English stop words in mixed mode" do
SiteSetting.search_tokenize_chinese_japanese_korean = true
# Before the locale is changed: the CJK run is segmented ("吃香蕉" -> "吃 香蕉")
# and the English words "in", "a" and "volume" are all kept.
tokenized = Search.prepare_data("monkey 吃香蕉 in a loud volume")
expect(tokenized).to eq("monkey 吃 香蕉 in a loud volume")
# After switching the default locale to zh_CN, the same input loses
# "in", "a" and "volume".
# NOTE(review): presumably because the cppjieba stop word filter applies
# once the Postgres dictionary is no longer English - confirm against
# Search.prepare_data.
SiteSetting.default_locale = 'zh_CN'
tokenized = Search.prepare_data("monkey 吃香蕉 in a loud volume")
expect(tokenized).to eq("monkey 吃 香蕉 loud")
end
end
context "#ts_config" do
it "maps locales to correct Postgres dictionaries" do
expect(Search.ts_config).to eq("english")