From cd247d532236659d1c4346ac49148c2c58ec76e0 Mon Sep 17 00:00:00 2001 From: Sam Date: Mon, 20 Feb 2023 11:53:35 +1100 Subject: [PATCH] FEATURE: Roll out new search optimisations (#20364) - Reduce duplication of terms in post index from unlimited to 6. This will result in reduced index size and reduced weighting for posts containing a huge number of duplicate terms. (E.g., a post containing "sam sam sam sam sam sam sam sam" will be indexed as "sam sam sam sam sam sam", only including the word up to 6 times.) This corrects a flaw where title weighting could be ignored. - Prioritize exact matches of words in titles. Our search always performs a prefix match. However, we want to give special weight to exact title matches, meaning that a search for "sum" will favor topics such as "the sum of us" over "summer in spring". - Pick up fixes to our search algorithm which are missing from old indexes. Specifically, pick up the fix that indexes URLs properly. (`https://happy.com` was stemmed to `happi` in keywords and then was not searchable.) See also: https://meta.discourse.org/t/refinements-to-search-being-tested-on-meta/254158 Indexing will take a while and will run in batches in the background. 
--- app/services/search_indexer.rb | 9 ++++++--- config/site_settings.yml | 4 ++-- lib/search/grouped_search_results.rb | 2 +- spec/jobs/reindex_search_spec.rb | 2 +- spec/lib/search_spec.rb | 2 ++ 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb index 96d750a281a..40104c5b13c 100644 --- a/app/services/search_indexer.rb +++ b/app/services/search_indexer.rb @@ -1,12 +1,15 @@ # frozen_string_literal: true class SearchIndexer - POST_INDEX_VERSION = 4 - MIN_POST_REINDEX_VERSION = 3 - TOPIC_INDEX_VERSION = 3 + MIN_POST_BLURB_INDEX_VERSION = 4 + + POST_INDEX_VERSION = 5 + TOPIC_INDEX_VERSION = 4 CATEGORY_INDEX_VERSION = 3 USER_INDEX_VERSION = 3 TAG_INDEX_VERSION = 3 + + # version to apply when issuing a background reindex REINDEX_VERSION = 0 TS_VECTOR_PARSE_REGEX = /('([^']*|'')*'\:)(([0-9]+[A-D]?,?)+)/ diff --git a/config/site_settings.yml b/config/site_settings.yml index 1dd04eeca65..679b6771bfa 100644 --- a/config/site_settings.yml +++ b/config/site_settings.yml @@ -2191,10 +2191,10 @@ search: default: false hidden: true prioritize_exact_search_title_match: - default: false + default: true hidden: true max_duplicate_search_index_terms: - default: -1 + default: 6 hidden: true use_pg_headlines_for_excerpt: default: false diff --git a/lib/search/grouped_search_results.rb b/lib/search/grouped_search_results.rb index f3470963eb5..ccb9c84b014 100644 --- a/lib/search/grouped_search_results.rb +++ b/lib/search/grouped_search_results.rb @@ -71,7 +71,7 @@ class Search def blurb(post) opts = { term: @blurb_term, blurb_length: @blurb_length } - if post.post_search_data.version > SearchIndexer::MIN_POST_REINDEX_VERSION && + if post.post_search_data.version >= SearchIndexer::MIN_POST_BLURB_INDEX_VERSION && !Search.segment_chinese? && !Search.segment_japanese? 
if SiteSetting.use_pg_headlines_for_excerpt scrubbed_headline = post.headline.gsub(SCRUB_HEADLINE_REGEXP, '\1') diff --git a/spec/jobs/reindex_search_spec.rb b/spec/jobs/reindex_search_spec.rb index ae63f64d055..a9885598d2c 100644 --- a/spec/jobs/reindex_search_spec.rb +++ b/spec/jobs/reindex_search_spec.rb @@ -70,7 +70,7 @@ RSpec.describe Jobs::ReindexSearch do end it "should not reindex posts with a developmental version" do - post = Fabricate(:post, version: SearchIndexer::MIN_POST_REINDEX_VERSION + 1) + Fabricate(:post, version: SearchIndexer::POST_INDEX_VERSION + 1) subject.rebuild_posts(indexer: FakeIndexer) diff --git a/spec/lib/search_spec.rb b/spec/lib/search_spec.rb index feca2de42eb..5608da93402 100644 --- a/spec/lib/search_spec.rb +++ b/spec/lib/search_spec.rb @@ -122,6 +122,8 @@ RSpec.describe Search do before do SearchIndexer.enable + SiteSetting.max_duplicate_search_index_terms = -1 + SiteSetting.prioritize_exact_search_title_match = false [post1, post2].each { |post| SearchIndexer.index(post, force: true) } end