FIX: Return proper results when searching for a topic in Japanese
Currently, when the default locale is Japanese, the search for a topic using its URL, path or ID doesn’t work as expected. It will either return wrong results or no result at all. The problem lies with how we process the provided terms in Japanese mode. For example, if `http://localhost/t/-/55` is provided, currently this will result in `http localhost t 5 5` to be searched for. This patch addresses the issue by checking whether the provided term needs segmenting. If the provided term is a number, or a path or a full URL, then it doesn’t need segmenting. When that happens we skip the processing we normally apply for Japanese, making the search return the expected results.
This commit is contained in:
parent
cc873977ec
commit
9b4b5b5028
|
@ -117,7 +117,7 @@ class Search
|
|||
data.force_encoding("UTF-8")
|
||||
data = clean_term(data)
|
||||
|
||||
if purpose != :topic
|
||||
if purpose != :topic && need_segmenting?(data)
|
||||
if segment_chinese?
|
||||
require "cppjieba_rb" unless defined?(CppjiebaRb)
|
||||
|
||||
|
@ -226,6 +226,13 @@ class Search
|
|||
end
|
||||
end
|
||||
|
||||
def self.need_segmenting?(data)
|
||||
return false if data.match?(/\A\d+\z/)
|
||||
!URI.parse(data).path.start_with?("/")
|
||||
rescue URI::InvalidURIError
|
||||
true
|
||||
end
|
||||
|
||||
attr_accessor :term
|
||||
attr_reader :clean_term, :guardian
|
||||
|
||||
|
|
|
@ -9,6 +9,36 @@ RSpec.describe Search do
|
|||
Jobs.run_immediately!
|
||||
end
|
||||
|
||||
describe ".need_segmenting?" do
|
||||
subject(:search) { described_class }
|
||||
|
||||
context "when data only contains digits" do
|
||||
let(:data) { "510" }
|
||||
|
||||
it { is_expected.not_to be_need_segmenting(data) }
|
||||
end
|
||||
|
||||
context "when data does not only contain digits" do
|
||||
context "when data is a full URL" do
|
||||
let(:data) { "http://localhost/t/-/510" }
|
||||
|
||||
it { is_expected.not_to be_need_segmenting(data) }
|
||||
end
|
||||
|
||||
context "when data is a path" do
|
||||
let(:data) { "/t/-/510" }
|
||||
|
||||
it { is_expected.not_to be_need_segmenting(data) }
|
||||
end
|
||||
|
||||
context "when data is something else" do
|
||||
let(:data) { "text" }
|
||||
|
||||
it { is_expected.to be_need_segmenting(data) }
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe "#ts_config" do
|
||||
it "maps locales to correct Postgres dictionaries" do
|
||||
expect(Search.ts_config).to eq("english")
|
||||
|
@ -1597,10 +1627,38 @@ RSpec.describe Search do
|
|||
let!(:post_2) { Fabricate(:post, topic: topic_2) }
|
||||
|
||||
describe ".prepare_data" do
|
||||
it "removes punctuations" do
|
||||
SiteSetting.search_tokenize_japanese = true
|
||||
subject(:prepared_data) { Search.prepare_data(data) }
|
||||
|
||||
expect(Search.prepare_data(post.raw)).to eq("This is some japanese text 日本 が 大好き です")
|
||||
let(:data) { post.raw }
|
||||
|
||||
before { SiteSetting.search_tokenize_japanese = true }
|
||||
|
||||
it "removes punctuations" do
|
||||
expect(prepared_data).to eq("This is some japanese text 日本 が 大好き です")
|
||||
end
|
||||
|
||||
context "when providing only an URL" do
|
||||
let(:data) { "http://localhost/t/-/51" }
|
||||
|
||||
it "does not change it" do
|
||||
expect(prepared_data).to eq(data)
|
||||
end
|
||||
end
|
||||
|
||||
context "when providing only a path" do
|
||||
let(:data) { "/t/-/51" }
|
||||
|
||||
it "does not change it" do
|
||||
expect(prepared_data).to eq(data)
|
||||
end
|
||||
end
|
||||
|
||||
context "when providing only an ID" do
|
||||
let(:data) { "51" }
|
||||
|
||||
it "does not change it" do
|
||||
expect(prepared_data).to eq(data)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -1616,23 +1674,24 @@ RSpec.describe Search do
|
|||
SiteSetting.refresh!
|
||||
end
|
||||
|
||||
it "finds posts containing Japanese text if tokenization is forced" do
|
||||
SiteSetting.search_tokenize_japanese = true
|
||||
context "when tokenization is forced" do
|
||||
before { SiteSetting.search_tokenize_japanese = true }
|
||||
|
||||
it "finds posts containing Japanese text" do
|
||||
expect(Search.execute("日本").posts.map(&:id)).to eq([post_2.id, post.id])
|
||||
expect(Search.execute("日").posts.map(&:id)).to eq([post_2.id, post.id])
|
||||
end
|
||||
end
|
||||
|
||||
it "find posts containing search term when site's locale is set to Japanese" do
|
||||
SiteSetting.default_locale = "ja"
|
||||
context "when default locale is set to Japanese" do
|
||||
before { SiteSetting.default_locale = "ja" }
|
||||
|
||||
it "find posts containing search term" do
|
||||
expect(Search.execute("日本").posts.map(&:id)).to eq([post_2.id, post.id])
|
||||
expect(Search.execute("日").posts.map(&:id)).to eq([post_2.id, post.id])
|
||||
end
|
||||
|
||||
it "does not include superfluous spaces in blurbs" do
|
||||
SiteSetting.default_locale = "ja"
|
||||
|
||||
post.update!(
|
||||
raw: "場サアマネ織企ういかせ竹域ヱイマ穂基ホ神3予読ずねいぱ松査ス禁多サウ提懸イふ引小43改こょドめ。深とつぐ主思料農ぞかル者杯検める活分えほづぼ白犠",
|
||||
)
|
||||
|
@ -1642,6 +1701,42 @@ RSpec.describe Search do
|
|||
expect(results.posts.length).to eq(1)
|
||||
expect(results.blurb(results.posts.first)).to include("ういかせ竹域")
|
||||
end
|
||||
|
||||
context "when searching for a topic in particular" do
|
||||
subject(:results) do
|
||||
described_class.execute(
|
||||
term,
|
||||
guardian: Discourse.system_user.guardian,
|
||||
type_filter: "topic",
|
||||
search_for_id: true,
|
||||
)
|
||||
end
|
||||
|
||||
context "when searching by topic ID" do
|
||||
let(:term) { topic.id }
|
||||
|
||||
it "finds the proper post" do
|
||||
expect(results.posts.first).to have_attributes(topic: topic, post_number: 1)
|
||||
end
|
||||
end
|
||||
|
||||
context "when searching by topic URL" do
|
||||
let(:term) { "http://#{Discourse.current_hostname}/t/-/#{topic.id}" }
|
||||
|
||||
it "finds the proper post" do
|
||||
expect(results.posts.first).to have_attributes(topic: topic, post_number: 1)
|
||||
end
|
||||
end
|
||||
|
||||
context "when searching by topic path" do
|
||||
let(:term) { "/t/-/#{topic.id}" }
|
||||
|
||||
it "finds the proper post" do
|
||||
expect(results.posts.first).to have_attributes(topic: topic, post_number: 1)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
|
Loading…
Reference in New Issue