From 9b4b5b5028b6d205dddca1048aacb001bc999883 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Guitaut?= Date: Fri, 30 Aug 2024 15:48:44 +0200 Subject: [PATCH] FIX: Return proper results when searching for a topic in Japanese MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, when the default locale is Japanese, the search for a topic using its URL, path or ID doesn’t work as expected. It will either return wrong results or no result at all. The problem lies with how we process the provided terms in Japanese mode. For example, if `http://localhost/t/-/55` is provided, currently this will result in `http localhost t 5 5` to be searched for. This patch addresses the issue by checking whether the provided term needs segmenting. If the provided term is a number, or a path or a full URL, then it doesn’t need segmenting. When that happens we skip the processing we normally apply for Japanese, making the search return the expected results. --- lib/search.rb | 9 ++- spec/lib/search_spec.rb | 135 ++++++++++++++++++++++++++++++++++------ 2 files changed, 123 insertions(+), 21 deletions(-) diff --git a/lib/search.rb b/lib/search.rb index 258298c4fe7..1c2574fde08 100644 --- a/lib/search.rb +++ b/lib/search.rb @@ -117,7 +117,7 @@ class Search data.force_encoding("UTF-8") data = clean_term(data) - if purpose != :topic + if purpose != :topic && need_segmenting?(data) if segment_chinese? require "cppjieba_rb" unless defined?(CppjiebaRb) @@ -226,6 +226,13 @@ class Search end end + def self.need_segmenting?(data) + return false if data.match?(/\A\d+\z/) + !URI.parse(data).path.start_with?("/") + rescue URI::InvalidURIError + true + end + attr_accessor :term attr_reader :clean_term, :guardian diff --git a/spec/lib/search_spec.rb b/spec/lib/search_spec.rb index f63e4712a57..c7c104dfb0f 100644 --- a/spec/lib/search_spec.rb +++ b/spec/lib/search_spec.rb @@ -9,6 +9,36 @@ RSpec.describe Search do Jobs.run_immediately! end + describe ".need_segmenting?" do + subject(:search) { described_class } + + context "when data only contains digits" do + let(:data) { "510" } + + it { is_expected.not_to be_need_segmenting(data) } + end + + context "when data does not only contain digits" do + context "when data is a full URL" do + let(:data) { "http://localhost/t/-/510" } + + it { is_expected.not_to be_need_segmenting(data) } + end + + context "when data is a path" do + let(:data) { "/t/-/510" } + + it { is_expected.not_to be_need_segmenting(data) } + end + + context "when data is something else" do + let(:data) { "text" } + + it { is_expected.to be_need_segmenting(data) } + end + end + end + describe "#ts_config" do it "maps locales to correct Postgres dictionaries" do expect(Search.ts_config).to eq("english") @@ -1597,10 +1627,38 @@ RSpec.describe Search do let!(:post_2) { Fabricate(:post, topic: topic_2) } describe ".prepare_data" do - it "removes punctuations" do - SiteSetting.search_tokenize_japanese = true + subject(:prepared_data) { Search.prepare_data(data) } - expect(Search.prepare_data(post.raw)).to eq("This is some japanese text 日本 が 大好き です") + let(:data) { post.raw } + + before { SiteSetting.search_tokenize_japanese = true } + + it "removes punctuations" do + expect(prepared_data).to eq("This is some japanese text 日本 が 大好き です") + end + + context "when providing only an URL" do + let(:data) { "http://localhost/t/-/51" } + + it "does not change it" do + expect(prepared_data).to eq(data) + end + end + + context "when providing only a path" do + let(:data) { "/t/-/51" } + + it "does not change it" do + expect(prepared_data).to eq(data) + end + end + + context "when providing only an ID" do + let(:data) { "51" } + + it "does not change it" do + expect(prepared_data).to eq(data) + end end end @@ -1616,31 +1674,68 @@ RSpec.describe Search do SiteSetting.refresh! end - it "finds posts containing Japanese text if tokenization is forced" do - SiteSetting.search_tokenize_japanese = true + context "when tokenization is forced" do + before { SiteSetting.search_tokenize_japanese = true } - expect(Search.execute("日本").posts.map(&:id)).to eq([post_2.id, post.id]) - expect(Search.execute("日").posts.map(&:id)).to eq([post_2.id, post.id]) + it "finds posts containing Japanese text" do + expect(Search.execute("日本").posts.map(&:id)).to eq([post_2.id, post.id]) + expect(Search.execute("日").posts.map(&:id)).to eq([post_2.id, post.id]) + end end - it "find posts containing search term when site's locale is set to Japanese" do - SiteSetting.default_locale = "ja" + context "when default locale is set to Japanese" do + before { SiteSetting.default_locale = "ja" } - expect(Search.execute("日本").posts.map(&:id)).to eq([post_2.id, post.id]) - expect(Search.execute("日").posts.map(&:id)).to eq([post_2.id, post.id]) - end + it "find posts containing search term" do + expect(Search.execute("日本").posts.map(&:id)).to eq([post_2.id, post.id]) + expect(Search.execute("日").posts.map(&:id)).to eq([post_2.id, post.id]) + end - it "does not include superfluous spaces in blurbs" do - SiteSetting.default_locale = "ja" + it "does not include superfluous spaces in blurbs" do + post.update!( + raw: "場サアマネ織企ういかせ竹域ヱイマ穂基ホ神3予読ずねいぱ松査ス禁多サウ提懸イふ引小43改こょドめ。深とつぐ主思料農ぞかル者杯検める活分えほづぼ白犠", + ) - post.update!( - raw: "場サアマネ織企ういかせ竹域ヱイマ穂基ホ神3予読ずねいぱ松査ス禁多サウ提懸イふ引小43改こょドめ。深とつぐ主思料農ぞかル者杯検める活分えほづぼ白犠", - ) + results = Search.execute("ういかせ竹域", type_filter: "topic") - results = Search.execute("ういかせ竹域", type_filter: "topic") + expect(results.posts.length).to eq(1) + expect(results.blurb(results.posts.first)).to include("ういかせ竹域") + end - expect(results.posts.length).to eq(1) - expect(results.blurb(results.posts.first)).to include("ういかせ竹域") + context "when searching for a topic in particular" do + subject(:results) do + described_class.execute( + term, + guardian: Discourse.system_user.guardian, + type_filter: "topic", + search_for_id: true, + ) + end + + context "when searching by topic ID" do + let(:term) { topic.id } + + it "finds the proper post" do + expect(results.posts.first).to have_attributes(topic: topic, post_number: 1) + end + end + + context "when searching by topic URL" do + let(:term) { "http://#{Discourse.current_hostname}/t/-/#{topic.id}" } + + it "finds the proper post" do + expect(results.posts.first).to have_attributes(topic: topic, post_number: 1) + end + end + + context "when searching by topic path" do + let(:term) { "/t/-/#{topic.id}" } + + it "finds the proper post" do + expect(results.posts.first).to have_attributes(topic: topic, post_number: 1) + end + end + end end end end