mirror of
https://github.com/discourse/discourse.git
synced 2025-03-07 19:59:33 +00:00
FIX: search index duplicate parser matching is too restrictive (#20129)
The previous regex did not allow for cases where a lexeme contains a `:` (colon), which can happen when parsing URLs. The new algorithm allows for this. The test was amended to more clearly call out index problems.
This commit is contained in:
parent
41f265ae46
commit
4570118a63
@ -54,7 +54,7 @@ class SearchIndexer
|
|||||||
loop do
|
loop do
|
||||||
count += 1
|
count += 1
|
||||||
break if count >= 10 # Safeguard here to prevent infinite loop when a term has many dots
|
break if count >= 10 # Safeguard here to prevent infinite loop when a term has many dots
|
||||||
term, _, remaining = lexeme.partition(".")
|
_term, _, remaining = lexeme.partition(".")
|
||||||
break if remaining.blank?
|
break if remaining.blank?
|
||||||
array << "'#{remaining}':#{positions}"
|
array << "'#{remaining}':#{positions}"
|
||||||
lexeme = remaining
|
lexeme = remaining
|
||||||
@ -69,8 +69,8 @@ class SearchIndexer
|
|||||||
if (max_dupes = SiteSetting.max_duplicate_search_index_terms) > 0
|
if (max_dupes = SiteSetting.max_duplicate_search_index_terms) > 0
|
||||||
reduced = []
|
reduced = []
|
||||||
tsvector
|
tsvector
|
||||||
.scan(/([^\:]+\:)(([0-9]+[A-D]?,?)+)/)
|
.scan(/('([^']*|'')*'\:)(([0-9]+[A-D]?,?)+)/)
|
||||||
.each do |term, indexes|
|
.each do |term, _, indexes|
|
||||||
family_counts = Hash.new(0)
|
family_counts = Hash.new(0)
|
||||||
new_index_array = []
|
new_index_array = []
|
||||||
|
|
||||||
|
@ -307,15 +307,22 @@ RSpec.describe SearchIndexer do
|
|||||||
it "limits number of repeated terms when max_duplicate_search_index_terms site setting has been configured" do
|
it "limits number of repeated terms when max_duplicate_search_index_terms site setting has been configured" do
|
||||||
SiteSetting.max_duplicate_search_index_terms = 5
|
SiteSetting.max_duplicate_search_index_terms = 5
|
||||||
|
|
||||||
contents = "I am #{"sam " * 10}"
|
contents = <<~TEXT
|
||||||
|
#{"sam " * 10}
|
||||||
|
<a href="https://something.com/path:path'path?term='hello'">url</a>
|
||||||
|
TEXT
|
||||||
|
|
||||||
post.update!(raw: contents)
|
post.update!(raw: contents)
|
||||||
|
|
||||||
post_search_data = post.post_search_data
|
post_search_data = post.post_search_data
|
||||||
post_search_data.reload
|
post_search_data.reload
|
||||||
|
|
||||||
expect(post_search_data.search_data).to eq(
|
terms =
|
||||||
"'sam':12,13,14,15,16 'test':8A 'titl':4A 'uncategor':9B",
|
"'/path:path''path':22 'com':21 'sam':10,11,12,13,14 'something.com':21 'something.com/path:path''path':20 'test':8A 'titl':4A 'uncategor':9B 'url':23".split(
|
||||||
)
|
" ",
|
||||||
|
).sort
|
||||||
|
|
||||||
|
expect(post_search_data.search_data.split(" ").sort).to contain_exactly(*terms)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user