Truncate long words before building the search index.

SearchIndexer.update_index now keeps its four weighted inputs in a hash
keyed :a..:d instead of a positional array. Two versions of the prepared
text are derived from it:

  * search_data  - output of Search.prepare_data; still used for the
                   stored raw data / excerpts.
  * indexed_data - search_data with every run of non-whitespace characters
                   cut to SiteSetting.search_max_indexed_word_length
                   characters; this is what is passed to the tsvector query.

A hidden site setting, search_max_indexed_word_length (default 100), is
added, together with a spec that sets it to 4 and checks the resulting
lexemes while raw_data stays untruncated.

NOTE(review): the name indexed_data introduced in the first hunk is assigned
again in the third hunk's context ("indexed_data = if table.to_s ...").
That appears to be the same method, so the hash is replaced by a string
after the tsvector query has run - confirm, and consider a distinct name.

NOTE(review): indentation inside the hunks was reconstructed from the
nesting of the code; verify against the target tree before applying.

diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb
index c91519fa6d1..1885a4c6f47 100644
--- a/app/services/search_indexer.rb
+++ b/app/services/search_indexer.rb
@@ -18,12 +18,25 @@ class SearchIndexer
   end
 
   def self.update_index(table: , id: , a_weight: nil, b_weight: nil, c_weight: nil, d_weight: nil)
-    raw_data = [a_weight, b_weight, c_weight, d_weight]
+    raw_data = {
+      a: a_weight,
+      b: b_weight,
+      c: c_weight,
+      d: d_weight,
+    }
 
-    search_data = raw_data.map do |data|
+    # The version used in excerpts
+    search_data = raw_data.transform_values do |data|
       Search.prepare_data(data || "", :index)
     end
 
+    # The version used to build the index
+    indexed_data = search_data.transform_values do |data|
+      data.gsub(/\S+/) { |word|
+        word[0...SiteSetting.search_max_indexed_word_length]
+      }
+    end
+
     table_name = "#{table}_search_data"
     foreign_key = "#{table}_id"
 
@@ -37,14 +50,7 @@ class SearchIndexer
       setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:d,''))")}, 'D')
     SQL
 
-    ranked_params = {
-      a: search_data[0],
-      b: search_data[1],
-      c: search_data[2],
-      d: search_data[3],
-    }
-
-    tsvector = DB.query_single("SELECT #{ranked_index}", ranked_params)[0]
+    tsvector = DB.query_single("SELECT #{ranked_index}", indexed_data)[0]
     additional_lexemes = []
 
     tsvector.scan(/'(([a-zA-Z0-9]+\.)+[a-zA-Z0-9]+)'\:([\w+,]+)/).reduce(additional_lexemes) do |array, (lexeme, _, positions)|
@@ -68,9 +74,9 @@ class SearchIndexer
 
     indexed_data =
       if table.to_s == "post"
-        clean_post_raw_data!(ranked_params[:d])
+        clean_post_raw_data!(search_data[:d])
       else
-        search_data.select { |d| d.length > 0 }.join(' ')
+        search_data.values.select { |d| d.length > 0 }.join(' ')
       end
 
     params = {
diff --git a/config/site_settings.yml b/config/site_settings.yml
index 7f84f542636..9ebfb5efa25 100644
--- a/config/site_settings.yml
+++ b/config/site_settings.yml
@@ -2028,6 +2028,9 @@ search:
     default: false
     hidden: true
     client: true
+  search_max_indexed_word_length:
+    default: 100
+    hidden: true
   search_ranking_normalization:
     default: "0"
     hidden: true
diff --git a/spec/services/search_indexer_spec.rb b/spec/services/search_indexer_spec.rb
index 542ecf78aa3..180d1d0be82 100644
--- a/spec/services/search_indexer_spec.rb
+++ b/spec/services/search_indexer_spec.rb
@@ -269,6 +269,22 @@ describe SearchIndexer do
       expect(post.post_search_data.search_data).to include('œuvr')
       expect(post.post_search_data.search_data).to include('oeuvr')
     end
+
+    it 'truncates long words in the index' do
+      SiteSetting.search_max_indexed_word_length = 4
+      title = 'A title that is long enough'
+      contents = 'I am the best beige object http://example.com/long/url'
+
+      topic.update!(title: title)
+      post.update!(raw: contents)
+      post_search_data = post.post_search_data
+      post_search_data.reload
+
+      expect(post_search_data.raw_data).to eq(contents)
+
+      words = post_search_data.search_data.scan(/'([^']*)'/).map { |match| match[0] }
+      expect(words).to contain_exactly('best', 'beig', 'obj', 'http', 'titl', 'long', 'enou', 'unca')
+    end
   end
 
   describe '.queue_post_reindex' do