From 597d542c3369fc30e1169720d6c24416dab35f8c Mon Sep 17 00:00:00 2001
From: Guo Xiang Tan
Date: Tue, 28 Jul 2020 11:53:25 +0800
Subject: [PATCH] FIX: Improve `Topic.similar_to` with better `Topic#title` matches.

This changes PG text search to only match the given title against lexemes that
are formed from the title. Likewise, the given raw will only be matched against
lexemes that are formed from the post's raw.
---
 app/models/topic.rb       | 18 +++++++++++++-----
 lib/search.rb             | 16 ++++++++++++++--
 spec/models/topic_spec.rb | 35 ++++++++++++++++++++++-------------
 3 files changed, 49 insertions(+), 20 deletions(-)

diff --git a/app/models/topic.rb b/app/models/topic.rb
index de3034e3249..883a35c4651 100644
--- a/app/models/topic.rb
+++ b/app/models/topic.rb
@@ -581,9 +581,17 @@ class Topic < ActiveRecord::Base
     return [] if title.blank?
     raw = raw.presence || ""
 
-    search_data = "#{title} #{raw[0...MAX_SIMILAR_BODY_LENGTH]}".strip
-    filter_words = Search.prepare_data(search_data)
-    ts_query = Search.ts_query(term: filter_words, joiner: "|")
+    title_tsquery = Search.set_tsquery_weight_filter(
+      Search.prepare_data(title.strip),
+      'A'
+    )
+
+    raw_tsquery = Search.set_tsquery_weight_filter(
+      Search.prepare_data(raw[0...MAX_SIMILAR_BODY_LENGTH].strip),
+      'B'
+    )
+
+    tsquery = Search.to_tsquery(term: "#{title_tsquery} & #{raw_tsquery}", joiner: "|")
 
     candidates = Topic
       .visible
@@ -591,9 +599,9 @@ class Topic < ActiveRecord::Base
       .secured(Guardian.new(user))
       .joins("JOIN topic_search_data s ON topics.id = s.topic_id")
       .joins("LEFT JOIN categories c ON topics.id = c.topic_id")
-      .where("search_data @@ #{ts_query}")
+      .where("search_data @@ #{tsquery}")
       .where("c.topic_id IS NULL")
-      .order("ts_rank(search_data, #{ts_query}) DESC")
+      .order("ts_rank(search_data, #{tsquery}) DESC")
       .limit(SiteSetting.max_similar_results * 3)
 
     candidate_ids = candidates.pluck(:id)
diff --git a/lib/search.rb b/lib/search.rb
index 02462cded77..ade1ee3dae0 100644
--- a/lib/search.rb
+++ b/lib/search.rb
@@ -1025,13 +1025,25 @@ class Search
   end
 
   def self.ts_query(term: , ts_config: nil, joiner: nil, weight_filter: nil)
+    to_tsquery(
+      ts_config: ts_config,
+      term: set_tsquery_weight_filter(term, weight_filter),
+      joiner: joiner
+    )
+  end
+
+  def self.to_tsquery(ts_config: nil, term:, joiner: nil)
     ts_config = ActiveRecord::Base.connection.quote(ts_config) if ts_config
-    term = term.gsub("'", "''")
-    tsquery = "TO_TSQUERY(#{ts_config || default_ts_config}, '''#{PG::Connection.escape_string(term)}'':*#{weight_filter}')"
+    tsquery = "TO_TSQUERY(#{ts_config || default_ts_config}, '#{term}')"
     tsquery = "REPLACE(#{tsquery}::text, '&', '#{PG::Connection.escape_string(joiner)}')::tsquery" if joiner
     tsquery
   end
 
+  def self.set_tsquery_weight_filter(term, weight_filter)
+    term = term.gsub("'", "''")
+    "''#{PG::Connection.escape_string(term)}'':*#{weight_filter}"
+  end
+
   def ts_query(ts_config = nil, weight_filter: nil)
     @ts_query_cache ||= {}
     @ts_query_cache["#{ts_config || default_ts_config} #{@term} #{weight_filter}"] ||=
diff --git a/spec/models/topic_spec.rb b/spec/models/topic_spec.rb
index b8be9f34684..29af224906e 100644
--- a/spec/models/topic_spec.rb
+++ b/spec/models/topic_spec.rb
@@ -502,37 +502,46 @@ describe Topic do
     end
   end
 
-  context 'similar_to' do
+  context '.similar_to' do
+    fab!(:category) { Fabricate(:category_with_definition) }
 
-    it 'returns blank with nil params' do
-      expect(Topic.similar_to(nil, nil)).to be_blank
+    it 'returns an empty array with nil params' do
+      expect(Topic.similar_to(nil, nil)).to eq([])
     end
 
     context "with a category definition" do
-      let!(:category) { Fabricate(:category_with_definition) }
-
       it "excludes the category definition topic from similar_to" do
-        expect(Topic.similar_to('category definition for', "no body")).to be_blank
+        expect(Topic.similar_to('category definition for', "no body")).to eq([])
       end
     end
 
     context 'with a similar topic' do
-      let!(:topic) {
+      fab!(:post) {
         SearchIndexer.enable
-        post = create_post(title: "Evil trout is the dude who posted this topic")
-        post.topic
+        create_post(title: "Evil trout is the dude who posted this topic")
       }
 
+      let(:topic) { post.topic }
+
+      before do
+        SearchIndexer.enable
+      end
+
       it 'returns the similar topic if the title is similar' do
         expect(Topic.similar_to("has evil trout made any topics?", "i am wondering has evil trout made any topics?")).to eq([topic])
       end
 
-      context "secure categories" do
-        fab!(:category) { Fabricate(:category_with_definition, read_restricted: true) }
+      it 'matches title against title and raw against raw when searching for topics' do
+        topic.update!(title: '1 2 3 numbered titles')
+        post.update!(raw: 'random toy poodle')
 
+        expect(Topic.similar_to("unrelated term", "1 2 3 poddle")).to eq([])
+      end
+
+      context "secure categories" do
         before do
-          topic.category = category
-          topic.save
+          category.update!(read_restricted: true)
+          topic.update!(category: category)
         end
 
         it "doesn't return topics from private categories" do