FEATURE: allow restricting duplication in search index (#20062)

* FEATURE: allow restricting duplication in search index

This introduces the site setting `max_duplicate_search_index_terms`.
When set to a positive number, it caps how many positions are stored for each
term (per weight class) in our search index, limiting duplication.

This allows us to weight title matches more accurately, so bloated posts
are not unfairly bumped to the top of search results.

This feature is completely disabled by default and sits behind a hidden site setting.

We will experiment with it first. Note that the entire search index must be
rebuilt for it to take effect.


---------

Co-authored-by: Alan Guo Xiang Tan <gxtan1990@gmail.com>
This commit is contained in:
Sam 2023-01-31 12:41:31 +11:00 committed by GitHub
parent c5c72a74b7
commit 07679888c8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 69 additions and 0 deletions

View File

@ -66,6 +66,28 @@ class SearchIndexer
tsvector = "#{tsvector} #{additional_lexemes.join(" ")}"
# Optionally cap how many positions are kept for each lexeme so that a post
# repeating a word many times cannot outweigh a title match. The cap is applied
# separately per weight label (A-D); positions without a label share one bucket.
# Disabled unless the site setting is a positive number.
if (max_dupes = SiteSetting.max_duplicate_search_index_terms) > 0
  reduced = []
  tsvector
    # PostgreSQL prints each lexeme single-quoted, with embedded quotes doubled,
    # so match the whole quoted lexeme rather than "anything up to a colon":
    # lexemes such as 'http://foo' or '12:30' contain colons themselves and
    # would otherwise be split or truncated. An unquoted `term:positions` form
    # is still accepted as a fallback.
    .scan(/((?:'(?:''|[^'])*'|[^\s':]+):)((?:[0-9]+[A-D]?,?)+)/)
    .each do |term, indexes|
      family_counts = Hash.new(0)
      kept_indexes =
        indexes
          .split(",")
          .select do |index|
            # Trailing weight letter, or nil for unlabelled positions.
            family = index[/[A-D]\z/]
            (family_counts[family] += 1) <= max_dupes
          end
      reduced << "#{term}#{kept_indexes.join(",")}"
    end
  tsvector = reduced.join(" ")
end
indexed_data =
if table.to_s == "post"
clean_post_raw_data!(search_data[:d])

View File

@ -2192,6 +2192,9 @@ backups:
client: true
search:
# Caps how many positions are kept per term (and per weight label) when the
# search indexer builds its tsvector. The indexer only applies the cap when the
# value is greater than 0, so the default of -1 leaves the feature disabled.
# The search index must be rebuilt for a change to take effect.
max_duplicate_search_index_terms:
default: -1
hidden: true
use_pg_headlines_for_excerpt:
default: false
hidden: true

View File

@ -2602,4 +2602,34 @@ RSpec.describe Search do
expect(result.categories.length).to eq(0)
end
end
context "when max_duplicate_search_index_terms limits duplication" do
  before { SearchIndexer.enable }
  after { SearchIndexer.disable }

  it "correctly ranks topics" do
    SiteSetting.max_duplicate_search_index_terms = 5

    # "sam" appears only in this topic's title.
    titled_topic = Fabricate(:topic, title: "this is a topic about sam")
    titled_post =
      Fabricate(:post, topic: titled_topic, raw: "this topic is a story about some person")

    # "sam" is absent from this title but repeated a hundred times in the body.
    stuffed_topic = Fabricate(:topic, title: "this is a topic about bob")
    stuffed_post =
      Fabricate(
        :post,
        topic: stuffed_topic,
        raw: "this topic is a story about some person #{"sam " * 100}",
      )

    [titled_post, stuffed_post].each do |indexable|
      SearchIndexer.index(indexable, force: true)
    end

    result = Search.execute("sam")

    expect(result.posts.length).to eq(2)
    # With duplication capped, the title match must outrank the repeated body term.
    expect(result.posts.pluck(:id)).to eq([titled_post.id, stuffed_post.id])
  end
end
end

View File

@ -303,6 +303,20 @@ RSpec.describe SearchIndexer do
"unca",
)
end
it "limits number of repeated terms when max_duplicate_search_index_terms site setting has been configured" do
  SiteSetting.max_duplicate_search_index_terms = 5

  # The body repeats "sam" ten times; the expectation below pins that only
  # five of those positions end up in the stored search data.
  post.update!(raw: "I am #{"sam " * 10}")

  indexed = post.post_search_data
  indexed.reload

  expect(indexed.search_data).to eq(
    "'sam':12,13,14,15,16 'test':8A 'titl':4A 'uncategor':9B",
  )
end
end
describe ".queue_post_reindex" do