FIX: Return proper results when searching for a topic in Japanese

Currently, when the default locale is Japanese, the search for a topic
using its URL, path or ID doesn’t work as expected. It will either
return wrong results or no result at all.

The problem lies in how we process the provided terms in Japanese
mode. For example, if `http://localhost/t/-/55` is provided, this
currently results in `http localhost t 5 5` being searched for.

This patch addresses the issue by checking whether the provided term
needs segmenting. If the provided term is a number, or a path or a full
URL, then it doesn’t need segmenting. When that happens we skip the
processing we normally apply for Japanese, making the search return the
expected results.
This commit is contained in:
Loïc Guitaut 2024-08-30 15:48:44 +02:00 committed by Loïc Guitaut
parent cc873977ec
commit 9b4b5b5028
2 changed files with 123 additions and 21 deletions

View File

@ -117,7 +117,7 @@ class Search
data.force_encoding("UTF-8")
data = clean_term(data)
if purpose != :topic
if purpose != :topic && need_segmenting?(data)
if segment_chinese?
require "cppjieba_rb" unless defined?(CppjiebaRb)
@ -226,6 +226,13 @@ class Search
end
end
# Tells whether a search term should go through the word-segmenting step
# that the caller guards with this predicate.
#
# A term made only of digits (an ID), an absolute path, or a URL whose path
# starts with "/" is returned as not needing segmenting, so it reaches the
# search untouched. Everything else is treated as regular text.
#
# @param data [String] the search term to inspect
# @return [Boolean] false for digit-only terms, paths and URLs; true otherwise
def self.need_segmenting?(data)
  return false if data.match?(/\A\d+\z/)

  # `URI#path` is nil for opaque URIs such as "user:sam" or
  # "mailto:a@example.com". Those are not topic paths, so use safe
  # navigation and let them fall through as regular text instead of
  # raising NoMethodError.
  path = URI.parse(data).path
  !path&.start_with?("/")
rescue URI::InvalidURIError
  # Terms that cannot be parsed as a URI (spaces, non-ASCII text, ...) are
  # plain text.
  true
end
attr_accessor :term
attr_reader :clean_term, :guardian

View File

@ -9,6 +9,36 @@ RSpec.describe Search do
Jobs.run_immediately!
end
# Covers Search.need_segmenting? for the three kinds of terms that must be
# left alone (digits, full URL, path) and for ordinary text.
describe ".need_segmenting?" do
  subject(:segmenting_needed) { described_class.need_segmenting?(data) }

  context "when data only contains digits" do
    let(:data) { "510" }

    it { is_expected.to be_falsey }
  end

  context "when data does not only contain digits" do
    context "when data is a full URL" do
      let(:data) { "http://localhost/t/-/510" }

      it { is_expected.to be_falsey }
    end

    context "when data is a path" do
      let(:data) { "/t/-/510" }

      it { is_expected.to be_falsey }
    end

    context "when data is something else" do
      let(:data) { "text" }

      it { is_expected.to be_truthy }
    end
  end
end
describe "#ts_config" do
it "maps locales to correct Postgres dictionaries" do
expect(Search.ts_config).to eq("english")
@ -1597,10 +1627,38 @@ RSpec.describe Search do
let!(:post_2) { Fabricate(:post, topic: topic_2) }
describe ".prepare_data" do
it "removes punctuations" do
SiteSetting.search_tokenize_japanese = true
subject(:prepared_data) { Search.prepare_data(data) }
expect(Search.prepare_data(post.raw)).to eq("This is some japanese text 日本 が 大好き です")
let(:data) { post.raw }
before { SiteSetting.search_tokenize_japanese = true }
it "removes punctuations" do
expect(prepared_data).to eq("This is some japanese text 日本 が 大好き です")
end
context "when providing only an URL" do
let(:data) { "http://localhost/t/-/51" }
it "does not change it" do
expect(prepared_data).to eq(data)
end
end
context "when providing only a path" do
let(:data) { "/t/-/51" }
it "does not change it" do
expect(prepared_data).to eq(data)
end
end
context "when providing only an ID" do
let(:data) { "51" }
it "does not change it" do
expect(prepared_data).to eq(data)
end
end
end
@ -1616,23 +1674,24 @@ RSpec.describe Search do
SiteSetting.refresh!
end
it "finds posts containing Japanese text if tokenization is forced" do
SiteSetting.search_tokenize_japanese = true
context "when tokenization is forced" do
before { SiteSetting.search_tokenize_japanese = true }
it "finds posts containing Japanese text" do
expect(Search.execute("日本").posts.map(&:id)).to eq([post_2.id, post.id])
expect(Search.execute("").posts.map(&:id)).to eq([post_2.id, post.id])
end
end
it "find posts containing search term when site's locale is set to Japanese" do
SiteSetting.default_locale = "ja"
context "when default locale is set to Japanese" do
before { SiteSetting.default_locale = "ja" }
it "find posts containing search term" do
expect(Search.execute("日本").posts.map(&:id)).to eq([post_2.id, post.id])
expect(Search.execute("").posts.map(&:id)).to eq([post_2.id, post.id])
end
it "does not include superfluous spaces in blurbs" do
SiteSetting.default_locale = "ja"
post.update!(
raw: "場サアマネ織企ういかせ竹域ヱイマ穂基ホ神3予読ずねいぱ松査ス禁多サウ提懸イふ引小43改こょドめ。深とつぐ主思料農ぞかル者杯検める活分えほづぼ白犠",
)
@ -1642,6 +1701,42 @@ RSpec.describe Search do
expect(results.posts.length).to eq(1)
expect(results.blurb(results.posts.first)).to include("ういかせ竹域")
end
# Runs a topic-only search (`type_filter: "topic"`, `search_for_id: true`) as
# the system user, and checks that an ID, a full URL and a path each resolve
# to the first post of the topic.
# NOTE(review): `topic` is defined outside this block; presumably this context
# is nested under the Japanese-locale context above — confirm.
context "when searching for a topic in particular" do
subject(:results) do
described_class.execute(
term,
guardian: Discourse.system_user.guardian,
type_filter: "topic",
search_for_id: true,
)
end
context "when searching by topic ID" do
# NOTE(review): the term here is the raw `topic.id`, not a String —
# presumably Search.execute coerces it; confirm.
let(:term) { topic.id }
it "finds the proper post" do
expect(results.posts.first).to have_attributes(topic: topic, post_number: 1)
end
end
context "when searching by topic URL" do
# Full URL built from the current hostname, with "-" in the slug position.
let(:term) { "http://#{Discourse.current_hostname}/t/-/#{topic.id}" }
it "finds the proper post" do
expect(results.posts.first).to have_attributes(topic: topic, post_number: 1)
end
end
context "when searching by topic path" do
# Same as the URL case, without scheme and host.
let(:term) { "/t/-/#{topic.id}" }
it "finds the proper post" do
expect(results.posts.first).to have_attributes(topic: topic, post_number: 1)
end
end
end
end
end
end