From bd32912c5ed7eebcda32721849a1e006d02b119b Mon Sep 17 00:00:00 2001 From: Sam Date: Wed, 10 May 2023 11:47:58 +1000 Subject: [PATCH] FIX: do not allow title stuffing to dominate search (#21464) We were giving topics with repeated words extra weight in the search index. This meant that it was trivial to stuff words into a title to dominate search, given that we search for exact title matches first. The following tweak means that `invite invited invites` and `invite some stuff` both rank the same for title searching. Titles are short and punchy; duplicating words should not give special weight. Requires a full reindex to take effect. --- app/services/search_indexer.rb | 4 +++- spec/lib/search_spec.rb | 31 +++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb index f0af956e85b..75ae8e99821 100644 --- a/app/services/search_indexer.rb +++ b/app/services/search_indexer.rb @@ -118,7 +118,9 @@ class SearchIndexer .each do |index| family = nil family = index[-1] if index[-1].match?(/[A-D]/) - if (family_counts[family] += 1) <= max_dupes + # title dupes can completely dominate the index + # so we limit them to 1 + if (family_counts[family] += 1) <= (family == "A" ? 
1 : max_dupes) new_index_array << index end end diff --git a/spec/lib/search_spec.rb b/spec/lib/search_spec.rb index 72ed4718f04..aa889550163 100644 --- a/spec/lib/search_spec.rb +++ b/spec/lib/search_spec.rb @@ -2658,6 +2658,37 @@ RSpec.describe Search do end end + context "when some categories are prioritized" do + before { SearchIndexer.enable } + after { SearchIndexer.disable } + + it "correctly ranks topics with prioritized categories and stuffed topic terms" do + topic1 = Fabricate(:topic, title: "invite invited invites testing stuff with things") + post1 = + Fabricate( + :post, + topic: topic1, + raw: "this topic is a story about some person invites are fun", + ) + + category = Fabricate(:category, search_priority: Searchable::PRIORITIES[:high]) + + topic2 = Fabricate(:topic, title: "invite is the bestest", category: category) + post2 = + Fabricate( + :post, + topic: topic2, + raw: "this topic is a story about some other person invites are fun", + ) + + result = Search.execute("invite") + expect(result.posts.length).to eq(2) + + # title match should win cause we limited duplication + expect(result.posts.pluck(:id)).to eq([post2.id, post1.id]) + end + end + context "when max_duplicate_search_index_terms limits duplication" do before { SearchIndexer.enable }