FIX: Improve `Topic.similar_to` with better `Topic#title` matches.

This changes the PostgreSQL full-text search used by `Topic.similar_to` so
that the given title is only matched against lexemes formed from topic titles
(tsquery weight filter `A`). Likewise, the given raw is only matched against
lexemes formed from the posts' raw (tsquery weight filter `B`).
This commit is contained in:
Guo Xiang Tan 2020-07-28 11:53:25 +08:00
parent 14003abc37
commit 597d542c33
No known key found for this signature in database
GPG Key ID: FBD110179AAC1F20
3 changed files with 49 additions and 20 deletions

View File

@ -581,9 +581,17 @@ class Topic < ActiveRecord::Base
return [] if title.blank? return [] if title.blank?
raw = raw.presence || "" raw = raw.presence || ""
search_data = "#{title} #{raw[0...MAX_SIMILAR_BODY_LENGTH]}".strip title_tsquery = Search.set_tsquery_weight_filter(
filter_words = Search.prepare_data(search_data) Search.prepare_data(title.strip),
ts_query = Search.ts_query(term: filter_words, joiner: "|") 'A'
)
raw_tsquery = Search.set_tsquery_weight_filter(
Search.prepare_data(raw[0...MAX_SIMILAR_BODY_LENGTH].strip),
'B'
)
tsquery = Search.to_tsquery(term: "#{title_tsquery} & #{raw_tsquery}", joiner: "|")
candidates = Topic candidates = Topic
.visible .visible
@ -591,9 +599,9 @@ class Topic < ActiveRecord::Base
.secured(Guardian.new(user)) .secured(Guardian.new(user))
.joins("JOIN topic_search_data s ON topics.id = s.topic_id") .joins("JOIN topic_search_data s ON topics.id = s.topic_id")
.joins("LEFT JOIN categories c ON topics.id = c.topic_id") .joins("LEFT JOIN categories c ON topics.id = c.topic_id")
.where("search_data @@ #{ts_query}") .where("search_data @@ #{tsquery}")
.where("c.topic_id IS NULL") .where("c.topic_id IS NULL")
.order("ts_rank(search_data, #{ts_query}) DESC") .order("ts_rank(search_data, #{tsquery}) DESC")
.limit(SiteSetting.max_similar_results * 3) .limit(SiteSetting.max_similar_results * 3)
candidate_ids = candidates.pluck(:id) candidate_ids = candidates.pluck(:id)

View File

@ -1025,13 +1025,25 @@ class Search
end end
# Builds the SQL text of a tsquery expression for +term+.
#
# The term is first wrapped by set_tsquery_weight_filter(term, weight_filter)
# and the result is handed to to_tsquery together with ts_config and joiner,
# which are passed through unchanged.
#
# term          - search text (required keyword).
# ts_config     - optional text search configuration name, forwarded as-is.
# joiner        - optional operator string, forwarded as-is.
# weight_filter - optional weight label(s) applied to the wrapped term.
#
# Returns whatever to_tsquery returns.
#
# NOTE(review): the header row below repeats the signature because the diff
# view fused its left and right columns; both copies are identical.
def self.ts_query(term: , ts_config: nil, joiner: nil, weight_filter: nil) def self.ts_query(term: , ts_config: nil, joiner: nil, weight_filter: nil)
to_tsquery(
ts_config: ts_config,
# Quote the term and attach the weight filter before it reaches to_tsquery.
term: set_tsquery_weight_filter(term, weight_filter),
joiner: joiner
)
end
# Builds the SQL text of a TO_TSQUERY(config, 'term') call.
#
# ts_config - optional configuration name; when given it is quoted through
#             ActiveRecord::Base.connection.quote, otherwise
#             default_ts_config is interpolated instead.
# term      - text placed inside the single-quoted SQL literal (required).
# joiner    - optional string; when given, the query is cast to text, every
#             '&' in it is replaced with the escaped joiner, and the result
#             is cast back to tsquery.
#
# Returns the SQL fragment as a String.
#
# NOTE(review): the rows below fuse the removed (left) and added (right)
# columns of the diff. In the added column, term is interpolated into the
# literal without any escaping here, so it presumably must already be quoted
# and escaped by the caller (e.g. via set_tsquery_weight_filter) - confirm.
# NOTE(review): REPLACE acts on the whole rendered query text, so a literal
# '&' inside a lexeme would presumably be rewritten too - confirm.
def self.to_tsquery(ts_config: nil, term:, joiner: nil)
ts_config = ActiveRecord::Base.connection.quote(ts_config) if ts_config ts_config = ActiveRecord::Base.connection.quote(ts_config) if ts_config
term = term.gsub("'", "''") tsquery = "TO_TSQUERY(#{ts_config || default_ts_config}, '#{term}')"
tsquery = "TO_TSQUERY(#{ts_config || default_ts_config}, '''#{PG::Connection.escape_string(term)}'':*#{weight_filter}')"
tsquery = "REPLACE(#{tsquery}::text, '&', '#{PG::Connection.escape_string(joiner)}')::tsquery" if joiner tsquery = "REPLACE(#{tsquery}::text, '&', '#{PG::Connection.escape_string(joiner)}')::tsquery" if joiner
tsquery tsquery
end end
# Formats +term+ as one quoted tsquery operand with prefix matching (+:*+)
# and an optional weight filter.
#
# Single quotes inside +term+ are doubled once here and the result is then
# passed through PG::Connection.escape_string. The operand itself is wrapped
# in doubled single quotes, so the returned text is meant to sit inside a
# single-quoted SQL string literal.
#
# term          - String to wrap.
# weight_filter - weight label(s) appended right after +:*+ (e.g. 'A');
#                 nil appends nothing.
#
# Returns the operand as a String.
def self.set_tsquery_weight_filter(term, weight_filter)
  doubled_quotes = term.gsub("'", "''")
  sql_safe = PG::Connection.escape_string(doubled_quotes)

  "''" + sql_safe + "'':*#{weight_filter}"
end
def ts_query(ts_config = nil, weight_filter: nil) def ts_query(ts_config = nil, weight_filter: nil)
@ts_query_cache ||= {} @ts_query_cache ||= {}
@ts_query_cache["#{ts_config || default_ts_config} #{@term} #{weight_filter}"] ||= @ts_query_cache["#{ts_config || default_ts_config} #{@term} #{weight_filter}"] ||=

View File

@ -502,37 +502,46 @@ describe Topic do
end end
end end
context 'similar_to' do context '.similar_to' do
fab!(:category) { Fabricate(:category_with_definition) }
it 'returns blank with nil params' do it 'returns an empty array with nil params' do
expect(Topic.similar_to(nil, nil)).to be_blank expect(Topic.similar_to(nil, nil)).to eq([])
end end
context "with a category definition" do context "with a category definition" do
let!(:category) { Fabricate(:category_with_definition) }
it "excludes the category definition topic from similar_to" do it "excludes the category definition topic from similar_to" do
expect(Topic.similar_to('category definition for', "no body")).to be_blank expect(Topic.similar_to('category definition for', "no body")).to eq([])
end end
end end
context 'with a similar topic' do context 'with a similar topic' do
let!(:topic) { fab!(:post) {
SearchIndexer.enable SearchIndexer.enable
post = create_post(title: "Evil trout is the dude who posted this topic") create_post(title: "Evil trout is the dude who posted this topic")
post.topic
} }
let(:topic) { post.topic }
before do
SearchIndexer.enable
end
it 'returns the similar topic if the title is similar' do it 'returns the similar topic if the title is similar' do
expect(Topic.similar_to("has evil trout made any topics?", "i am wondering has evil trout made any topics?")).to eq([topic]) expect(Topic.similar_to("has evil trout made any topics?", "i am wondering has evil trout made any topics?")).to eq([topic])
end end
context "secure categories" do it 'matches title against title and raw against raw when searching for topics' do
fab!(:category) { Fabricate(:category_with_definition, read_restricted: true) } topic.update!(title: '1 2 3 numbered titles')
post.update!(raw: 'random toy poodle')
expect(Topic.similar_to("unrelated term", "1 2 3 poddle")).to eq([])
end
context "secure categories" do
before do before do
topic.category = category category.update!(read_restricted: true)
topic.save topic.update!(category: category)
end end
it "doesn't return topics from private categories" do it "doesn't return topics from private categories" do