From d8c796bc44d84a9e177737153793e5acc276adec Mon Sep 17 00:00:00 2001 From: Guo Xiang Tan Date: Tue, 7 Jul 2020 15:36:57 +0800 Subject: [PATCH] FIX: Ensure that aggregating search shows the post with the higest rank. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, we would only take either the `MIN` or `MAX` for `post_number` during aggregation meaning that the ranking is not considered. ``` require 'benchmark/ips' Benchmark.ips do |x| x.config(time: 10, warmup: 2) x.report("current aggregate search query") do DB.exec <<~SQL SELECT "posts"."id", "posts"."user_id", "posts"."topic_id", "posts"."post_number", "posts"."raw", "posts"."cooked", "posts"."created_at", "posts"."updated_at", "posts"."reply_to_post_number", "posts"."reply_count", "posts"."quote_count", "posts"."deleted_at", "posts"."off_topic_count", "posts"."like_count", "posts"."incoming_link_count", "posts"."bookmark_count", "posts"."score", "posts"."reads", "posts"."post_type", "posts"."sort_order", "posts"."last_editor_id", "posts"."hidden", "posts"."hidden_reason_id", "posts"."notify_moderators_count", "posts"."spam_count", "posts"."illegal_count", "posts"."inappropriate_count", "posts"."last_version_at", "posts"."user_deleted", "posts"."reply_to_user_id", "posts"."percent_rank", "posts"."notify_user_count", "posts"."like_score", "posts"."deleted_by_id", "posts"."edit_reason", "posts"."word_count", "posts"."version", "posts"."cook_method", "posts"."wiki", "posts"."baked_at", "posts"."baked_version", "posts"."hidden_at", "posts"."self_edits", "posts"."reply_quoted", "posts"."via_email", "posts"."raw_email", "posts"."public_version", "posts"."action_code", "posts"."locked_by_id", "posts"."image_upload_id" FROM "posts" JOIN (SELECT *, row_number() over() row_number FROM (SELECT topics.id, min(posts.post_number) post_number FROM "posts" INNER JOIN "post_search_data" ON "post_search_data"."post_id" = "posts"."id" INNER JOIN "topics" ON "topics"."id" = "posts"."topic_id" AND ("topics"."deleted_at" IS NULL) LEFT JOIN categories ON categories.id = topics.category_id WHERE ("posts"."deleted_at" IS NULL) AND "posts"."post_type" IN (1, 2, 3, 4) AND (topics.visible) AND (topics.archetype <> 'private_message') AND (post_search_data.search_data @@ TO_TSQUERY('english', '''postgres'':*ABCD')) AND (categories.id NOT IN ( SELECT categories.id WHERE categories.search_priority = 1 ) ) AND ((categories.id IS NULL) OR (NOT categories.read_restricted)) GROUP BY topics.id ORDER BY MAX(( TS_RANK_CD( post_search_data.search_data, TO_TSQUERY('english', '''postgres'':*ABCD'), 1|32 ) * ( CASE categories.search_priority WHEN 2 THEN 0.6 WHEN 3 THEN 0.8 WHEN 4 THEN 1.2 WHEN 5 THEN 1.4 ELSE CASE WHEN topics.closed THEN 0.9 ELSE 1 END END ) ) ) DESC, topics.bumped_at DESC LIMIT 51 OFFSET 0) xxx) x ON x.id = posts.topic_id AND x.post_number = posts.post_number WHERE ("posts"."deleted_at" IS NULL) ORDER BY row_number; SQL end x.report("current aggregate search query with proper ranking") do DB.exec <<~SQL SELECT "posts"."id", "posts"."user_id", "posts"."topic_id", "posts"."post_number", "posts"."raw", "posts"."cooked", "posts"."created_at", "posts"."updated_at", "posts"."reply_to_post_number", "posts"."reply_count", "posts"."quote_count", "posts"."deleted_at", "posts"."off_topic_count", "posts"."like_count", "posts"."incoming_link_count", "posts"."bookmark_count", "posts"."score", "posts"."reads", "posts"."post_type", "posts"."sort_order", "posts"."last_editor_id", "posts"."hidden", "posts"."hidden_reason_id", "posts"."notify_moderators_count", "posts"."spam_count", "posts"."illegal_count", "posts"."inappropriate_count", "posts"."last_version_at", "posts"."user_deleted", "posts"."reply_to_user_id", "posts"."percent_rank", "posts"."notify_user_count", "posts"."like_score", "posts"."deleted_by_id", "posts"."edit_reason", "posts"."word_count", "posts"."version", "posts"."cook_method", "posts"."wiki", "posts"."baked_at", "posts"."baked_version", "posts"."hidden_at", "posts"."self_edits", "posts"."reply_quoted", "posts"."via_email", "posts"."raw_email", "posts"."public_version", "posts"."action_code", "posts"."locked_by_id", "posts"."image_upload_id" FROM "posts" JOIN (SELECT *, row_number() over() row_number FROM (SELECT subquery.topic_id id, (ARRAY_AGG(subquery.post_number))[1] post_number, MAX(subquery.rank) rank, MAX(subquery.bumped_at) bumped_at FROM (SELECT "posts"."id", "posts"."user_id", "posts"."topic_id", "posts"."post_number", "posts"."raw", "posts"."cooked", "posts"."created_at", "posts"."updated_at", "posts"."reply_to_post_number", "posts"."reply_count", "posts"."quote_count", "posts"."deleted_at", "posts"."off_topic_count", "posts"."like_count", "posts"."incoming_link_count", "posts"."bookmark_count", "posts"."score", "posts"."reads", "posts"."post_type", "posts"."sort_order", "posts"."last_editor_id", "posts"."hidden", "posts"."hidden_reason_id", "posts"."notify_moderators_count", "posts"."spam_count", "posts"."illegal_count", "posts"."inappropriate_count", "posts"."last_version_at", "posts"."user_deleted", "posts"."reply_to_user_id", "posts"."percent_rank", "posts"."notify_user_count", "posts"."like_score", "posts"."deleted_by_id", "posts"."edit_reason", "posts"."word_count", "posts"."version", "posts"."cook_method", "posts"."wiki", "posts"."baked_at", "posts"."baked_version", "posts"."hidden_at", "posts"."self_edits", "posts"."reply_quoted", "posts"."via_email", "posts"."raw_email", "posts"."public_version", "posts"."action_code", "posts"."locked_by_id", "posts"."image_upload_id", ( TS_RANK_CD( post_search_data.search_data, TO_TSQUERY('english', '''postgres'':*ABCD'), 1|32 ) * ( CASE categories.search_priority WHEN 2 THEN 0.6 WHEN 3 THEN 0.8 WHEN 4 THEN 1.2 WHEN 5 THEN 1.4 ELSE CASE WHEN topics.closed THEN 0.9 ELSE 1 END END ) ) rank, topics.bumped_at bumped_at FROM "posts" INNER JOIN "post_search_data" ON "post_search_data"."post_id" = "posts"."id" INNER JOIN "topics" ON "topics"."id" = "posts"."topic_id" AND ("topics"."deleted_at" IS NULL) LEFT JOIN categories ON categories.id = topics.category_id WHERE ("posts"."deleted_at" IS NULL) AND "posts"."post_type" IN (1, 2, 3, 4) AND (topics.visible) AND (topics.archetype <> 'private_message') AND (post_search_data.search_data @@ TO_TSQUERY('english', '''postgres'':*ABCD')) AND (categories.id NOT IN ( SELECT categories.id WHERE categories.search_priority = 1 ) ) AND ((categories.id IS NULL) OR (NOT categories.read_restricted))) subquery GROUP BY subquery.topic_id ORDER BY rank DESC, bumped_at DESC LIMIT 51 OFFSET 0) xxx) x ON x.id = posts.topic_id AND x.post_number = posts.post_number WHERE ("posts"."deleted_at" IS NULL) ORDER BY row_number; SQL end x.compare! end ``` ``` Warming up -------------------------------------- current aggregate search query 1.000 i/100ms current aggregate search query with proper ranking 1.000 i/100ms Calculating ------------------------------------- current aggregate search query 17.726 (± 0.0%) i/s - 178.000 in 10.045107s current aggregate search query with proper ranking 17.802 (± 0.0%) i/s - 178.000 in 10.002230s Comparison: current aggregate search query with proper ranking: 17.8 i/s current aggregate search query: 17.7 i/s - 1.00x (± 0.00) slower ``` --- lib/search.rb | 194 ++++++++++++++++-------------- spec/components/search_spec.rb | 213 +++++++++++++++++++++------------ 2 files changed, 241 insertions(+), 166 deletions(-) diff --git a/lib/search.rb b/lib/search.rb index 564c13466a3..d17b35a008f 100644 --- a/lib/search.rb +++ b/lib/search.rb @@ -794,9 +794,7 @@ class Search PHRASE_MATCH_REGEXP_PATTERN = '"([^"]+)"' - def posts_query(limit, opts = nil) - opts ||= {} - + def posts_query(limit, type_filter: nil, aggregate_search: false) posts = Post.where(post_type: Topic.visible_post_types(@guardian.user)) .joins(:post_search_data, :topic) .joins("LEFT JOIN categories ON categories.id = topics.category_id") @@ -805,13 +803,13 @@ class Search posts = posts.where("topics.visible") unless is_topic_search - if opts[:type_filter] === "private_messages" || (is_topic_search && @search_context.private_message?) + if type_filter === "private_messages" || (is_topic_search && @search_context.private_message?) posts = posts.where("topics.archetype = ?", Archetype.private_message) unless @guardian.is_admin? posts = posts.private_posts_for_user(@guardian.user) end - elsif opts[:type_filter] === "all_topics" + elsif type_filter === "all_topics" private_posts = posts.where("topics.archetype = ?", Archetype.private_message) private_posts = private_posts.private_posts_for_user(@guardian.user) @@ -861,7 +859,7 @@ class Search posts = if @search_context.present? if @search_context.is_a?(User) - if opts[:type_filter] === "private_messages" + if type_filter === "private_messages" @direct_pms_only ? posts : posts.private_posts_for_user(@search_context) else posts.where("posts.user_id = #{@search_context.id}") @@ -886,14 +884,58 @@ class Search posts = categories_ignored(posts) unless @category_filter_matched posts end + + if aggregate_search + aggregate_relation = Post.unscoped + .select( + "subquery.topic_id id", + "(ARRAY_AGG(subquery.post_number))[1] post_number", + ) + .group("subquery.topic_id") + + posts = posts.select(posts.arel.projections) + end + if @order == :latest - if opts[:aggregate_search] - posts = posts.order("MAX(posts.created_at) DESC") - else - posts = posts.reorder("posts.created_at DESC") + posts = posts.reorder("posts.created_at DESC") + + if aggregate_search + aggregate_relation = aggregate_relation + .select("MAX(subquery.created_at) created_at") + .order("created_at DESC") end - elsif @term.blank? && !@order - data_ranking = <<~SQL + elsif @order == :latest_topic + posts = posts.order("topic_created_at DESC") + + if aggregate_search + posts = posts.select("topics.created_at topic_created_at") + + aggregate_relation = aggregate_relation + .select("MAX(subquery.topic_created_at) topic_created_at") + .order("topic_created_at DESC") + end + elsif @order == :views + posts = posts.order("topic_views DESC") + + if aggregate_search + posts = posts.select("topics.views topic_views") + + aggregate_relation = aggregate_relation + .select("MAX(subquery.topic_views) topic_views") + .order("topic_views DESC") + end + elsif @order == :likes + posts = posts.order("posts.like_count DESC") + else + rank = <<~SQL + TS_RANK_CD( + post_search_data.search_data, + #{@term.blank? ? '' : ts_query(weight_filter: weights)}, + #{SiteSetting.search_ranking_normalization}|32 + ) + SQL + + category_priority_weights = <<~SQL ( CASE categories.search_priority WHEN #{Searchable::PRIORITIES[:very_low]} @@ -912,64 +954,24 @@ class Search END ) SQL - if opts[:aggregate_search] - posts = posts.order("MAX(#{data_ranking}) DESC").order("MAX(posts.created_at) DESC") - else - posts = posts.order("#{data_ranking} DESC").order("posts.created_at DESC") - end - elsif @order == :latest_topic - if opts[:aggregate_search] - posts = posts.order("MAX(topics.created_at) DESC") - else - posts = posts.order("topics.created_at DESC") - end - elsif @order == :views - if opts[:aggregate_search] - posts = posts.order("MAX(topics.views) DESC") - else - posts = posts.order("topics.views DESC") - end - elsif @order == :likes - if opts[:aggregate_search] - posts = posts.order("MAX(posts.like_count) DESC") - else - posts = posts.order("posts.like_count DESC") - end - else - data_ranking = <<~SQL - ( - TS_RANK_CD( - post_search_data.search_data, - #{ts_query(weight_filter: weights)}, - #{SiteSetting.search_ranking_normalization}|32 - ) * - ( - CASE categories.search_priority - WHEN #{Searchable::PRIORITIES[:very_low]} - THEN #{SiteSetting.category_search_priority_very_low_weight} - WHEN #{Searchable::PRIORITIES[:low]} - THEN #{SiteSetting.category_search_priority_low_weight} - WHEN #{Searchable::PRIORITIES[:high]} - THEN #{SiteSetting.category_search_priority_high_weight} - WHEN #{Searchable::PRIORITIES[:very_high]} - THEN #{SiteSetting.category_search_priority_very_high_weight} - ELSE - CASE WHEN topics.closed - THEN 0.9 - ELSE 1 - END - END - ) - ) - SQL - if opts[:aggregate_search] - posts = posts.order("MAX(#{data_ranking}) DESC") - else - posts = posts.order("#{data_ranking} DESC") - end + data_ranking = + if @term.blank? + "(#{category_priority_weights})" + else + "(#{rank} * #{category_priority_weights})" + end - posts = posts.order("topics.bumped_at DESC") + if aggregate_search + posts = posts.select("#{data_ranking} rank", "topics.bumped_at topic_bumped_at") + .order("rank DESC", "topic_bumped_at DESC") + + aggregate_relation = aggregate_relation + .select("MAX(subquery.rank) rank", "MAX(subquery.topic_bumped_at) topic_bumped_at") + .order("rank DESC", "topic_bumped_at DESC") + else + posts = posts.order("#{data_ranking} DESC", "topics.bumped_at DESC") + end end if secure_category_ids.present? @@ -978,6 +980,11 @@ class Search posts = posts.where("(categories.id IS NULL) OR (NOT categories.read_restricted)").references(:categories) end + if aggregate_search + posts = yield(posts) if block_given? + posts = aggregate_relation.from(posts) + end + posts = posts.offset(offset) posts.limit(limit) end @@ -1032,29 +1039,42 @@ class Search end def aggregate_post_sql(opts) - min_or_max = @order == :latest ? "max" : "min" - - query = - if @order == :likes - # likes are a pain to aggregate so skip - posts_query(limit, type_filter: opts[:type_filter]) - .select('topics.id', "posts.post_number") - else - posts_query(limit, aggregate_search: true, type_filter: opts[:type_filter]) - .select('topics.id', "#{min_or_max}(posts.post_number) post_number") - .group('topics.id') - end + default_opts = { + type_filter: opts[:type_filter] + } min_id = Search.min_post_id - if min_id > 0 - low_set = query.dup.where("post_search_data.post_id < #{min_id}") - high_set = query.where("post_search_data.post_id >= #{min_id}") - return { default: wrap_rows(high_set), remaining: wrap_rows(low_set) } + if @order == :likes + # likes are a pain to aggregate so skip + query = posts_query(limit, **default_opts).select('topics.id', 'posts.post_number') + + if min_id > 0 + low_set = query.dup.where("post_search_data.post_id < #{min_id}") + high_set = query.where("post_search_data.post_id >= #{min_id}") + + { default: wrap_rows(high_set), remaining: wrap_rows(low_set) } + else + { default: wrap_rows(query) } + end + else + query = posts_query(limit, **default_opts, aggregate_search: true) do |posts| + if min_id > 0 + posts.select("post_search_data.post_id post_search_data_post_id") + else + posts + end + end + + if min_id > 0 + low_set = query.dup.where("subquery.post_search_data_post_id < #{min_id}") + high_set = query.where("subquery.post_search_data_post_id >= #{min_id}") + + { default: wrap_rows(high_set), remaining: wrap_rows(low_set) } + else + { default: wrap_rows(query) } + end end - - # double wrapping so we get correct row numbers - { default: wrap_rows(query) } end def aggregate_posts(post_sql) diff --git a/spec/components/search_spec.rb b/spec/components/search_spec.rb index 936b43340d6..09769c38ba2 100644 --- a/spec/components/search_spec.rb +++ b/spec/components/search_spec.rb @@ -175,7 +175,6 @@ describe Search do raw: 'hello from mars, we just landed') } it 'searches correctly' do - expect do Search.execute('mars', type_filter: 'private_messages') end.to raise_error(Discourse::InvalidAccess) @@ -368,6 +367,121 @@ describe Search do end end + context 'posts' do + let(:post) { Fabricate(:post) } + let(:topic) { post.topic } + + let!(:reply) do + Fabricate(:post_with_long_raw_content, + topic: topic, + user: topic.user, + ).tap { |post| post.update!(raw: "#{post.raw} elephant") } + end + + let(:expected_blurb) do + "...quire content longer than the typical test post raw content. It really is some long content, folks. elephant" + end + + it 'returns the post' do + result = Search.execute('elephant', + type_filter: 'topic', + include_blurbs: true + ) + + expect(result.posts).to contain_exactly(reply) + expect(result.blurb(reply)).to eq(expected_blurb) + end + + it 'returns the right post and blurb for searches with phrase' do + result = Search.execute('"elephant"', + type_filter: 'topic', + include_blurbs: true + ) + + expect(result.posts).to contain_exactly(reply) + expect(result.blurb(reply)).to eq(expected_blurb) + end + + it 'does not allow a post with repeated words to dominate the ranking' do + category = Fabricate(:category_with_definition, name: "winter is coming") + + post = Fabricate(:post, + raw: "I think winter will end soon", + topic: Fabricate(:topic, + title: "dragon john snow winter", + category: category + ) + ) + + post2 = Fabricate(:post, + raw: "I think #{'winter' * 20} will end soon", + topic: Fabricate(:topic, title: "dragon john snow summer", category: category) + ) + + result = Search.execute('winter') + + expect(result.posts.pluck(:id)).to eq([ + post.id, category.topic.first_post.id, post2.id + ]) + end + + it 'applies a small penalty to closed topic when ranking' do + post = Fabricate(:post, + raw: "My weekly update", + topic: Fabricate(:topic, + title: "A topic that will be closed", + closed: true + ) + ) + + post2 = Fabricate(:post, + raw: "My weekly update", + topic: Fabricate(:topic, + title: "A topic that will be open" + ) + ) + + result = Search.execute('weekly update') + expect(result.posts.pluck(:id)).to eq([post2.id, post.id]) + end + + it 'aggregates searches in a topic by returning the post with the highest rank' do + post = Fabricate(:post, topic: topic, raw: "this is a play post") + post2 = Fabricate(:post, topic: topic, raw: "play playing played") + post3 = Fabricate(:post, raw: "this is a play post") + + results = Search.execute('play') + + expect(results.posts.map(&:id)).to eq([ + post2.id, + post3.id + ]) + end + + it "allows the configuration of search to prefer recent posts" do + SiteSetting.search_prefer_recent_posts = true + SiteSetting.search_recent_posts_size = 1 + post = Fabricate(:post, topic: topic, raw: "this is a play post") + + results = Search.execute('play post') + + expect(results.posts.map(&:id)).to eq([ + post.id + ]) + + post2 = Fabricate(:post, raw: "this is a play post") + + results = Search.execute('play post') + + expect(results.posts.map(&:id)).to eq([ + post2.id, + post.id + ]) + ensure + Discourse.cache.clear + end + end + context 'topics' do let(:post) { Fabricate(:post) } let(:topic) { post.topic } @@ -435,80 +549,6 @@ describe Search do end end - context 'searching for a post' do - let!(:reply) do - Fabricate(:post_with_long_raw_content, - topic: topic, - user: topic.user, - ).tap { |post| post.update!(raw: "#{post.raw} elephant") } - end - - let(:expected_blurb) do - "...quire content longer than the typical test post raw content. It really is some long content, folks. elephant" - end - - it 'returns the post' do - result = Search.execute('elephant', - type_filter: 'topic' - ) - - expect(result.posts).to contain_exactly(reply) - expect(result.blurb(reply)).to eq(expected_blurb) - end - - it 'returns the right post and blurb for searches with phrase' do - result = Search.execute('"elephant"', - type_filter: 'topic' - ) - - expect(result.posts).to contain_exactly(reply) - expect(result.blurb(reply)).to eq(expected_blurb) - end - - it 'does not allow a post with repeated words to dominate the ranking' do - category = Fabricate(:category_with_definition, name: "winter is coming") - - post = Fabricate(:post, - raw: "I think winter will end soon", - topic: Fabricate(:topic, - title: "dragon john snow winter", - category: category - ) - ) - - post2 = Fabricate(:post, - raw: "I think #{'winter' * 20} will end soon", - topic: Fabricate(:topic, title: "dragon john snow summer", category: category) - ) - - result = Search.execute('winter') - - expect(result.posts.pluck(:id)).to eq([ - post.id, category.topic.first_post.id, post2.id - ]) - end - - it 'applies a small penalty to closed topic when ranking' do - post = Fabricate(:post, - raw: "My weekly update", - topic: Fabricate(:topic, - title: "A topic that will be closed", - closed: true - ) - ) - - post2 = Fabricate(:post, - raw: "My weekly update", - topic: Fabricate(:topic, - title: "A topic that will be open" - ) - ) - - result = Search.execute('weekly update') - expect(result.posts.pluck(:id)).to eq([post2.id, post.id]) - end - end - context 'searching for quoted title' do it "can find quoted title" do create_post(raw: "this is the raw body", title: "I am a title yeah") @@ -681,18 +721,19 @@ describe Search do it "should return posts in the right order" do raw = "The pure genuine evian" - post = freeze_time(10.seconds.from_now) { Fabricate(:post, topic: category.topic, raw: raw) } - post2 = freeze_time(20.seconds.from_now) { Fabricate(:post, topic: category2.topic, raw: raw) } + post = Fabricate(:post, topic: category.topic, raw: raw) + post2 = Fabricate(:post, topic: category2.topic, raw: raw) + post2.topic.update!(bumped_at: 10.seconds.from_now) search = Search.execute(raw) - expect(search.posts).to eq([post2, post]) + expect(search.posts.map(&:id)).to eq([post2.id, post.id]) category.update!(search_priority: Searchable::PRIORITIES[:high]) search = Search.execute(raw) - expect(search.posts).to eq([post, post2]) + expect(search.posts.map(&:id)).to eq([post.id, post2.id]) end end @@ -1192,6 +1233,18 @@ describe Search do ]) end + it 'can order by topic views' do + topic = Fabricate(:topic, views: 1) + topic2 = Fabricate(:topic, views: 2) + post = Fabricate(:post, raw: 'Topic', topic: topic) + post2 = Fabricate(:post, raw: 'Topic', topic: topic2) + + expect(Search.execute('Topic order:views').posts.map(&:id)).to eq([ + post2.id, + post.id + ]) + end + it 'can tokenize dots' do post = Fabricate(:post, raw: 'Will.2000 Will.Bob.Bill...') expect(Search.execute('bill').posts.map(&:id)).to eq([post.id]) @@ -1343,8 +1396,10 @@ describe Search do expect(Search.execute('bakey tags:lunch order:latest').posts.map(&:id)) .to eq([post8.id, post7.id]) + expect(Search.execute('#food tags:lunch order:latest').posts.map(&:id)) .to eq([post8.id, post7.id]) + expect(Search.execute('#food tags:lunch order:likes').posts.map(&:id)) .to eq([post7.id, post8.id]) end