FIX: Improve `Topic.similar_to` with better `Topic#title` matches.

This changes the PostgreSQL full-text search so that the given title is
matched only against lexemes derived from a topic's title. Likewise, the
given raw is matched only against lexemes derived from the post's raw.
This commit is contained in:
Guo Xiang Tan 2020-07-28 11:53:25 +08:00
parent 14003abc37
commit 597d542c33
No known key found for this signature in database
GPG Key ID: FBD110179AAC1F20
3 changed files with 49 additions and 20 deletions

View File

@ -581,9 +581,17 @@ class Topic < ActiveRecord::Base
return [] if title.blank?
raw = raw.presence || ""
search_data = "#{title} #{raw[0...MAX_SIMILAR_BODY_LENGTH]}".strip
filter_words = Search.prepare_data(search_data)
ts_query = Search.ts_query(term: filter_words, joiner: "|")
title_tsquery = Search.set_tsquery_weight_filter(
Search.prepare_data(title.strip),
'A'
)
raw_tsquery = Search.set_tsquery_weight_filter(
Search.prepare_data(raw[0...MAX_SIMILAR_BODY_LENGTH].strip),
'B'
)
tsquery = Search.to_tsquery(term: "#{title_tsquery} & #{raw_tsquery}", joiner: "|")
candidates = Topic
.visible
@ -591,9 +599,9 @@ class Topic < ActiveRecord::Base
.secured(Guardian.new(user))
.joins("JOIN topic_search_data s ON topics.id = s.topic_id")
.joins("LEFT JOIN categories c ON topics.id = c.topic_id")
.where("search_data @@ #{ts_query}")
.where("search_data @@ #{tsquery}")
.where("c.topic_id IS NULL")
.order("ts_rank(search_data, #{ts_query}) DESC")
.order("ts_rank(search_data, #{tsquery}) DESC")
.limit(SiteSetting.max_similar_results * 3)
candidate_ids = candidates.pluck(:id)

View File

@ -1025,13 +1025,25 @@ class Search
end
# Builds a TO_TSQUERY SQL fragment for +term+.
#
# The term is first wrapped as a quoted prefix-match lexeme (optionally
# restricted to +weight_filter+) by +set_tsquery_weight_filter+, and the
# result is handed to +to_tsquery+ together with +ts_config+ and +joiner+.
def self.ts_query(term: , ts_config: nil, joiner: nil, weight_filter: nil)
  weighted_term = set_tsquery_weight_filter(term, weight_filter)
  to_tsquery(term: weighted_term, ts_config: ts_config, joiner: joiner)
end
# Wraps an already-prepared tsquery expression in a TO_TSQUERY(...) SQL
# fragment.
#
# term      - tsquery text that is interpolated verbatim inside a single-quoted
#             SQL literal. It is NOT escaped here, so it must already be escaped
#             for that context (set_tsquery_weight_filter emits such text).
# ts_config - optional text search configuration name; quoted through the
#             ActiveRecord connection. Falls back to default_ts_config.
# joiner    - optional operator text; when given, every '&' in the textual
#             form of the resulting tsquery is replaced with it via REPLACE.
#
# Returns the SQL fragment as a String.
def self.to_tsquery(ts_config: nil, term:, joiner: nil)
  ts_config = ActiveRecord::Base.connection.quote(ts_config) if ts_config
  # No quote-doubling on `term` here: it arrives pre-escaped, and escaping it a
  # second time would corrupt the ''...'' lexeme wrappers.
  tsquery = "TO_TSQUERY(#{ts_config || default_ts_config}, '#{term}')"
  tsquery = "REPLACE(#{tsquery}::text, '&', '#{PG::Connection.escape_string(joiner)}')::tsquery" if joiner
  tsquery
end
# Formats +term+ as a quoted prefix-match tsquery lexeme, followed by
# +weight_filter+ (nothing is appended after ":*" when it is nil).
#
# Single quotes in the term are doubled and the result is then passed through
# PG::Connection.escape_string. The surrounding quotes are emitted doubled
# (''...''), i.e. in the form needed inside a single-quoted SQL literal.
def self.set_tsquery_weight_filter(term, weight_filter)
  escaped_lexeme = PG::Connection.escape_string(term.gsub("'", "''"))
  "''#{escaped_lexeme}'':*#{weight_filter}"
end
def ts_query(ts_config = nil, weight_filter: nil)
@ts_query_cache ||= {}
@ts_query_cache["#{ts_config || default_ts_config} #{@term} #{weight_filter}"] ||=

View File

@ -502,37 +502,46 @@ describe Topic do
end
end
context 'similar_to' do
context '.similar_to' do
fab!(:category) { Fabricate(:category_with_definition) }
it 'returns blank with nil params' do
expect(Topic.similar_to(nil, nil)).to be_blank
it 'returns an empty array with nil params' do
expect(Topic.similar_to(nil, nil)).to eq([])
end
context "with a category definition" do
let!(:category) { Fabricate(:category_with_definition) }
it "excludes the category definition topic from similar_to" do
expect(Topic.similar_to('category definition for', "no body")).to be_blank
expect(Topic.similar_to('category definition for', "no body")).to eq([])
end
end
context 'with a similar topic' do
let!(:topic) {
fab!(:post) {
SearchIndexer.enable
post = create_post(title: "Evil trout is the dude who posted this topic")
post.topic
create_post(title: "Evil trout is the dude who posted this topic")
}
let(:topic) { post.topic }
before do
SearchIndexer.enable
end
it 'returns the similar topic if the title is similar' do
expect(Topic.similar_to("has evil trout made any topics?", "i am wondering has evil trout made any topics?")).to eq([topic])
end
context "secure categories" do
fab!(:category) { Fabricate(:category_with_definition, read_restricted: true) }
it 'matches title against title and raw against raw when searching for topics' do
topic.update!(title: '1 2 3 numbered titles')
post.update!(raw: 'random toy poodle')
expect(Topic.similar_to("unrelated term", "1 2 3 poddle")).to eq([])
end
context "secure categories" do
before do
topic.category = category
topic.save
category.update!(read_restricted: true)
topic.update!(category: category)
end
it "doesn't return topics from private categories" do