FIX: search index duplicate parser matching is too restrictive (#20129)

The previous regex did not allow for cases where a lexeme contains a : (colon). This can happen when parsing URLs. The new algorithm allows for this. The test was amended to more clearly call out index problems.
2023-02-02 12:17:19 +11:00 · 2023-02-02 12:17:19 +11:00 · 4570118a63
parent 41f265ae46
commit 4570118a63
2 changed files with 14 additions and 7 deletions
--- a/app/services/search_indexer.rb
+++ b/app/services/search_indexer.rb
@ -54,7 +54,7 @@ class SearchIndexer
          loop do
            count += 1
            break if count >= 10 # Safeguard here to prevent infinite loop when a term has many dots
-            term, _, remaining = lexeme.partition(".")
+            _term, _, remaining = lexeme.partition(".")
            break if remaining.blank?
            array << "'#{remaining}':#{positions}"
            lexeme = remaining
@ -69,8 +69,8 @@ class SearchIndexer
    if (max_dupes = SiteSetting.max_duplicate_search_index_terms) > 0
      reduced = []
      tsvector
-        .scan(/([^\:]+\:)(([0-9]+[A-D]?,?)+)/)
+        .scan(/('([^']*|'')*'\:)(([0-9]+[A-D]?,?)+)/)
-        .each do |term, indexes|
+        .each do |term, _, indexes|
          family_counts = Hash.new(0)
          new_index_array = []
--- a/spec/services/search_indexer_spec.rb
+++ b/spec/services/search_indexer_spec.rb
@ -307,15 +307,22 @@ RSpec.describe SearchIndexer do
    it "limits number of repeated terms when max_duplicate_search_index_terms site setting has been configured" do
      SiteSetting.max_duplicate_search_index_terms = 5
-      contents = "I am #{"sam " * 10}"
+      contents = <<~TEXT
        #{"sam " * 10}
        <a href="https://something.com/path:path'path?term='hello'">url</a>
      TEXT
      post.update!(raw: contents)
      post_search_data = post.post_search_data
      post_search_data.reload
-      expect(post_search_data.search_data).to eq(
+      terms =
-        "'sam':12,13,14,15,16 'test':8A 'titl':4A 'uncategor':9B",
+        "'/path:path''path':22 'com':21 'sam':10,11,12,13,14 'something.com':21 'something.com/path:path''path':20 'test':8A 'titl':4A 'uncategor':9B 'url':23".split(
-      )
+          " ",
        ).sort
      expect(post_search_data.search_data.split(" ").sort).to contain_exactly(*terms)
    end
  end