FIX: search index duplicate parser matching is too restrictive (#20129)
The previous regex did not allow for cases where a lexeme contains a : (colon), which can happen when parsing URLs. The new algorithm allows for this. The test was amended to more clearly call out index problems.
This commit is contained in:
parent
41f265ae46
commit
4570118a63
|
@ -54,7 +54,7 @@ class SearchIndexer
|
|||
loop do
|
||||
count += 1
|
||||
break if count >= 10 # Safeguard here to prevent infinite loop when a term has many dots
|
||||
term, _, remaining = lexeme.partition(".")
|
||||
_term, _, remaining = lexeme.partition(".")
|
||||
break if remaining.blank?
|
||||
array << "'#{remaining}':#{positions}"
|
||||
lexeme = remaining
|
||||
|
@ -69,8 +69,8 @@ class SearchIndexer
|
|||
if (max_dupes = SiteSetting.max_duplicate_search_index_terms) > 0
|
||||
reduced = []
|
||||
tsvector
|
||||
.scan(/([^\:]+\:)(([0-9]+[A-D]?,?)+)/)
|
||||
.each do |term, indexes|
|
||||
.scan(/('([^']*|'')*'\:)(([0-9]+[A-D]?,?)+)/)
|
||||
.each do |term, _, indexes|
|
||||
family_counts = Hash.new(0)
|
||||
new_index_array = []
|
||||
|
||||
|
|
|
@ -307,15 +307,22 @@ RSpec.describe SearchIndexer do
|
|||
it "limits number of repeated terms when max_duplicate_search_index_terms site setting has been configured" do
|
||||
SiteSetting.max_duplicate_search_index_terms = 5
|
||||
|
||||
contents = "I am #{"sam " * 10}"
|
||||
contents = <<~TEXT
|
||||
#{"sam " * 10}
|
||||
<a href="https://something.com/path:path'path?term='hello'">url</a>
|
||||
TEXT
|
||||
|
||||
post.update!(raw: contents)
|
||||
|
||||
post_search_data = post.post_search_data
|
||||
post_search_data.reload
|
||||
|
||||
expect(post_search_data.search_data).to eq(
|
||||
"'sam':12,13,14,15,16 'test':8A 'titl':4A 'uncategor':9B",
|
||||
)
|
||||
terms =
|
||||
"'/path:path''path':22 'com':21 'sam':10,11,12,13,14 'something.com':21 'something.com/path:path''path':20 'test':8A 'titl':4A 'uncategor':9B 'url':23".split(
|
||||
" ",
|
||||
).sort
|
||||
|
||||
expect(post_search_data.search_data.split(" ").sort).to contain_exactly(*terms)
|
||||
end
|
||||
end
|
||||
|
||||
|
|
Loading…
Reference in New Issue