FEATURE: allow restricting duplication in search index (#20062)
* FEATURE: allow restricting duplication in search index. This introduces the site setting `max_duplicate_search_index_terms`. Using this number, we limit the amount of duplication in our search index. This allows us to weight title searches more correctly, so bloated posts don't unfairly bump to the top of search results. This feature is completely disabled by default and sits behind a site setting; we will experiment with it first. Note that the entire search index must be rebuilt for it to take effect. --------- Co-authored-by: Alan Guo Xiang Tan <gxtan1990@gmail.com>
This commit is contained in:
parent
c5c72a74b7
commit
07679888c8
|
@ -66,6 +66,28 @@ class SearchIndexer
|
|||
|
||||
tsvector = "#{tsvector} #{additional_lexemes.join(" ")}"
|
||||
|
||||
# Optionally cap how many positions are kept for each lexeme so that posts
# repeating a word many times cannot outrank title matches. The cap is only
# applied when the site setting is greater than zero.
if (max_dupes = SiteSetting.max_duplicate_search_index_terms) > 0
  # Each entry has the form `'lexeme':1A,2,3B`. The lexeme is matched as one
  # whole quoted token (an embedded quote is written doubled, `''`), so a
  # lexeme that itself contains `:` stays intact instead of being cut at its
  # first colon and re-emitted as a malformed fragment.
  tsvector =
    tsvector
      .scan(/('(?:[^']|'')*'):(\d+[A-D]?(?:,\d+[A-D]?)*)/)
      .map do |lexeme, positions|
        # Occurrences are counted separately per weight letter; a position
        # without a trailing A-D letter is counted under the nil key.
        seen = Hash.new(0)
        kept =
          positions
            .split(",")
            .select do |position|
              weight = position[-1].match?(/[A-D]/) ? position[-1] : nil
              (seen[weight] += 1) <= max_dupes
            end

        "#{lexeme}:#{kept.join(",")}"
      end
      .join(" ")
end
|
||||
|
||||
indexed_data =
|
||||
if table.to_s == "post"
|
||||
clean_post_raw_data!(search_data[:d])
|
||||
|
|
|
@ -2192,6 +2192,9 @@ backups:
|
|||
client: true
|
||||
|
||||
search:
|
||||
# Caps how many positions the search indexer keeps per term (counted
# separately per weight). The indexer only applies the cap when the value is
# greater than 0, so the default of -1 leaves it off.
max_duplicate_search_index_terms:
|
||||
default: -1
|
||||
hidden: true
|
||||
use_pg_headlines_for_excerpt:
|
||||
default: false
|
||||
hidden: true
|
||||
|
|
|
@ -2602,4 +2602,34 @@ RSpec.describe Search do
|
|||
expect(result.categories.length).to eq(0)
|
||||
end
|
||||
end
|
||||
|
||||
# Checks that capping repeated terms in the index keeps a post that repeats
# the search word many times from outranking a topic whose title matches.
context "when max_duplicate_search_index_terms limits duplication" do
  before { SearchIndexer.enable }

  after { SearchIndexer.disable }

  it "correctly ranks topics" do
    SiteSetting.max_duplicate_search_index_terms = 5

    # "sam" appears only in this topic's title.
    title_match_topic = Fabricate(:topic, title: "this is a topic about sam")
    title_match_post =
      Fabricate(:post, topic: title_match_topic, raw: "this topic is a story about some person")

    # "sam" is absent from this title but repeated 100 times in the post body.
    stuffed_topic = Fabricate(:topic, title: "this is a topic about bob")
    stuffed_post =
      Fabricate(
        :post,
        topic: stuffed_topic,
        raw: "this topic is a story about some person #{"sam " * 100}",
      )

    [title_match_post, stuffed_post].each { |indexed| SearchIndexer.index(indexed, force: true) }

    found = Search.execute("sam").posts
    expect(found.length).to eq(2)

    # title match should win cause we limited duplication
    expect(found.pluck(:id)).to eq([title_match_post.id, stuffed_post.id])
  end
end
|
||||
end
|
||||
|
|
|
@ -303,6 +303,20 @@ RSpec.describe SearchIndexer do
|
|||
"unca",
|
||||
)
|
||||
end
|
||||
|
||||
it "limits number of repeated terms when max_duplicate_search_index_terms site setting has been configured" do
  SiteSetting.max_duplicate_search_index_terms = 5

  # "sam" occurs ten times in the raw text; the expectation below allows
  # only five positions for it.
  post.update!(raw: "I am #{"sam " * 10}")

  indexed = post.post_search_data.reload

  expect(indexed.search_data).to eq(
    "'sam':12,13,14,15,16 'test':8A 'titl':4A 'uncategor':9B",
  )
end
|
||||
end
|
||||
|
||||
describe ".queue_post_reindex" do
|
||||
|
|
Loading…
Reference in New Issue