diff --git a/app/jobs/scheduled/reindex_search.rb b/app/jobs/scheduled/reindex_search.rb
index e354b36d007..0e9768d2922 100644
--- a/app/jobs/scheduled/reindex_search.rb
+++ b/app/jobs/scheduled/reindex_search.rb
@@ -1,7 +1,7 @@
 module Jobs
   # if locale changes or search algorithm changes we may want to reindex stuff
   class ReindexSearch < Jobs::Scheduled
-    every 1.day
+    every 2.hours
 
     def execute(args)
       rebuild_problem_topics
@@ -38,13 +38,14 @@ module Jobs
       end
     end
 
-    def rebuild_problem_posts(limit = 10000)
+    def rebuild_problem_posts(limit = 20000)
       post_ids = load_problem_post_ids(limit)
 
       post_ids.each do |id|
-        post = Post.find_by(id: id) # could be deleted while iterating through batch
-        SearchIndexer.index(post, force: true) if post
+        if post = Post.find_by(id: id)
+          SearchIndexer.index(post, force: true)
+        end
       end
     end
 
@@ -67,6 +68,7 @@ module Jobs
           WHERE pd.post_id IS NULL
         )', SiteSetting.default_locale, Search::INDEX_VERSION)
         .limit(limit)
+        .order('posts.id DESC')
         .pluck(:id)
     end
 
diff --git a/app/models/topic.rb b/app/models/topic.rb
index ac6af153922..c49b60625a6 100644
--- a/app/models/topic.rb
+++ b/app/models/topic.rb
@@ -223,10 +223,15 @@ class Topic < ActiveRecord::Base
       ApplicationController.banner_json_cache.clear
     end
 
-    if tags_changed
-      TagUser.auto_watch(topic_id: id)
-      TagUser.auto_track(topic_id: id)
-      self.tags_changed = false
+    if tags_changed || saved_change_to_attribute?(:category_id)
+
+      SearchIndexer.queue_post_reindex(self.id)
+
+      if tags_changed
+        TagUser.auto_watch(topic_id: id)
+        TagUser.auto_track(topic_id: id)
+        self.tags_changed = false
+      end
     end
 
     SearchIndexer.index(self)
@@ -474,7 +479,7 @@ class Topic < ActiveRecord::Base
     search_data = "#{title} #{raw[0...MAX_SIMILAR_BODY_LENGTH]}".strip
     filter_words = Search.prepare_data(search_data)
-    ts_query = Search.ts_query(filter_words, nil, "|")
+    ts_query = Search.ts_query(term: filter_words, joiner: "|")
 
     candidates = Topic
       .visible
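The Topic#after_save hunk above ties category moves and tag changes into search reindexing. A minimal sketch of the resulting flow, assuming a persisted topic and a target category id (both placeholders, not taken from this patch):

    # Sketch only: topic and new_category_id are illustrative placeholders.
    topic.update!(category_id: new_category_id)
    # after_save sees saved_change_to_attribute?(:category_id) and calls
    # SearchIndexer.queue_post_reindex(topic.id), which resets
    # post_search_data.version to 0 for the topic's posts; the scheduled
    # Jobs::ReindexSearch (now every 2 hours) then rebuilds those rows.
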
diff --git a/app/models/user_search.rb b/app/models/user_search.rb
index 8a2013b5b50..a9eceda1e9a 100644
--- a/app/models/user_search.rb
+++ b/app/models/user_search.rb
@@ -49,7 +49,7 @@ class UserSearch
 
     if @term.present?
       if SiteSetting.enable_names? && @term !~ /[_\.-]/
-        query = Search.ts_query(@term, "simple")
+        query = Search.ts_query(term: @term, ts_config: "simple")
 
         users = users.includes(:user_search_data)
           .references(:user_search_data)
diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb
index fdf6b506690..645866db8be 100644
--- a/app/services/search_indexer.rb
+++ b/app/services/search_indexer.rb
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 require_dependency 'search'
 
 class SearchIndexer
@@ -14,111 +15,152 @@ class SearchIndexer
     HtmlScrubber.scrub(html)
   end
 
-  def self.update_index(table, id, raw_data)
-    raw_data = Search.prepare_data(raw_data, :index)
-
-    table_name = "#{table}_search_data"
-    foreign_key = "#{table}_id"
-
+  def self.inject_extra_terms(raw)
     # insert some extra words for I.am.a.word so "word" is tokenized
     # I.am.a.word becomes I.am.a.word am a word
-    search_data = raw_data.gsub(/[^[:space:]]*[\.]+[^[:space:]]*/) do |with_dot|
+    raw.gsub(/[^[:space:]]*[\.]+[^[:space:]]*/) do |with_dot|
       split = with_dot.split(".")
       if split.length > 1
-        with_dot + (" " << split[1..-1].join(" "))
+        with_dot + ((+" ") << split[1..-1].join(" "))
       else
         with_dot
       end
     end
+  end
+
+  def self.update_index(table: , id: , raw_data:)
+    search_data = raw_data.map do |data|
+      inject_extra_terms(Search.prepare_data(data || "", :index))
+    end
+
+    table_name = "#{table}_search_data"
+    foreign_key = "#{table}_id"
 
     # for user login and name use "simple" lowercase stemmer
     stemmer = table == "user" ? "simple" : Search.ts_config
 
+    ranked_index = <<~SQL
+      setweight(to_tsvector('#{stemmer}', coalesce(:a,'')), 'A') ||
+      setweight(to_tsvector('#{stemmer}', coalesce(:b,'')), 'B') ||
+      setweight(to_tsvector('#{stemmer}', coalesce(:c,'')), 'C') ||
+      setweight(to_tsvector('#{stemmer}', coalesce(:d,'')), 'D')
+    SQL
+
+    indexed_data = search_data.select { |d| d.length > 0 }.join(' ')
+
+    params = {
+      a: search_data[0],
+      b: search_data[1],
+      c: search_data[2],
+      d: search_data[3],
+      raw_data: indexed_data,
+      id: id,
+      locale: SiteSetting.default_locale,
+      version: Search::INDEX_VERSION
+    }
+
     # Would be nice to use AR here but not sure how to execute Postgres functions
     # when inserting data like this.
-    rows = Post.exec_sql_row_count("UPDATE #{table_name}
-                                    SET
-                                      raw_data = :raw_data,
-                                      locale = :locale,
-                                      search_data = TO_TSVECTOR('#{stemmer}', :search_data),
-                                      version = :version
-                                    WHERE #{foreign_key} = :id",
-                                    raw_data: raw_data,
-                                    search_data: search_data,
-                                    id: id,
-                                    locale: SiteSetting.default_locale,
-                                    version: Search::INDEX_VERSION)
+    rows = Post.exec_sql_row_count(<<~SQL, params)
+      UPDATE #{table_name}
+      SET
+        raw_data = :raw_data,
+        locale = :locale,
+        search_data = #{ranked_index},
+        version = :version
+      WHERE #{foreign_key} = :id
+    SQL
+
     if rows == 0
-      Post.exec_sql("INSERT INTO #{table_name}
-                     (#{foreign_key}, search_data, locale, raw_data, version)
-                     VALUES (:id, TO_TSVECTOR('#{stemmer}', :search_data), :locale, :raw_data, :version)",
-                    raw_data: raw_data,
-                    search_data: search_data,
-                    id: id,
-                    locale: SiteSetting.default_locale,
-                    version: Search::INDEX_VERSION)
+      Post.exec_sql(<<~SQL, params)
+        INSERT INTO #{table_name}
+        (#{foreign_key}, search_data, locale, raw_data, version)
+        VALUES (:id, #{ranked_index}, :locale, :raw_data, :version)
+      SQL
     end
   rescue
-    # don't allow concurrency to mess up saving a post
+    # TODO is there any way we can safely avoid this?
+    # best way is probably pushing search indexer into a dedicated process so it no longer happens on save
+    # instead in the post processor
   end
 
   def self.update_topics_index(topic_id, title, cooked)
-    search_data = title.dup << " " << scrub_html_for_search(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]
-    update_index('topic', topic_id, search_data)
+    scrubbed_cooked = scrub_html_for_search(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]
+
+    # a bit inconsistent that we use title as A and body as B when in
+    # the post index body is C
+    update_index(table: 'topic', id: topic_id, raw_data: [title, scrubbed_cooked])
   end
 
-  def self.update_posts_index(post_id, cooked, title, category)
-    search_data = scrub_html_for_search(cooked) << " " << title.dup.force_encoding('UTF-8')
-    search_data << " " << category if category
-    update_index('post', post_id, search_data)
+  def self.update_posts_index(post_id, title, category, tags, cooked)
+    update_index(table: 'post', id: post_id, raw_data: [title, category, tags, scrub_html_for_search(cooked)])
   end
 
   def self.update_users_index(user_id, username, name)
-    search_data = username.dup << " " << (name || "")
-    update_index('user', user_id, search_data)
+    update_index(table: 'user', id: user_id, raw_data: [username, name])
   end
 
   def self.update_categories_index(category_id, name)
-    update_index('category', category_id, name)
+    update_index(table: 'category', id: category_id, raw_data: [name])
   end
 
   def self.update_tags_index(tag_id, name)
-    update_index('tag', tag_id, name)
+    update_index(table: 'tag', id: tag_id, raw_data: [name])
+  end
+
+  def self.queue_post_reindex(topic_id)
+    return if @disabled
+
+    ActiveRecord::Base.exec_sql(<<~SQL, topic_id: topic_id)
+      UPDATE post_search_data
+      SET version = 0
+      WHERE post_id IN (SELECT id FROM posts WHERE topic_id = :topic_id)
+    SQL
   end
 
   def self.index(obj, force: false)
     return if @disabled
 
-    if obj.class == Post && (obj.saved_change_to_cooked? || force)
-      if obj.topic
-        category_name = obj.topic.category.name if obj.topic.category
-        SearchIndexer.update_posts_index(obj.id, obj.cooked, obj.topic.title, category_name)
-        SearchIndexer.update_topics_index(obj.topic_id, obj.topic.title, obj.cooked) if obj.is_first_post?
+    category_name, tag_names = nil
+    topic = nil
+
+    if Topic === obj
+      topic = obj
+    elsif Post === obj
+      topic = obj.topic
+    end
+
+    category_name = topic.category&.name if topic
+    tag_names = topic.tags.pluck(:name).join(' ') if topic
+
+    if Post === obj && (obj.saved_change_to_cooked? || force)
+      if topic
+        SearchIndexer.update_posts_index(obj.id, topic.title, category_name, tag_names, obj.cooked)
+        SearchIndexer.update_topics_index(topic.id, topic.title, obj.cooked) if obj.is_first_post?
       else
         Rails.logger.warn("Orphan post skipped in search_indexer, topic_id: #{obj.topic_id} post_id: #{obj.id} raw: #{obj.raw}")
      end
     end
 
-    if obj.class == User && (obj.saved_change_to_username? || obj.saved_change_to_name? || force)
+    if User === obj && (obj.saved_change_to_username? || obj.saved_change_to_name? || force)
       SearchIndexer.update_users_index(obj.id, obj.username_lower || '', obj.name ? obj.name.downcase : '')
     end
 
-    if obj.class == Topic && (obj.saved_change_to_title? || force)
+    if Topic === obj && (obj.saved_change_to_title? || force)
       if obj.posts
         post = obj.posts.find_by(post_number: 1)
         if post
-          category_name = obj.category.name if obj.category
-          SearchIndexer.update_posts_index(post.id, post.cooked, obj.title, category_name)
+          SearchIndexer.update_posts_index(post.id, obj.title, category_name, tag_names, post.cooked)
           SearchIndexer.update_topics_index(obj.id, obj.title, post.cooked)
         end
       end
     end
 
-    if obj.class == Category && (obj.saved_change_to_name? || force)
+    if Category === obj && (obj.saved_change_to_name? || force)
       SearchIndexer.update_categories_index(obj.id, obj.name)
     end
 
-    if obj.class == Tag && (obj.saved_change_to_name? || force)
+    if Tag === obj && (obj.saved_change_to_name? || force)
       SearchIndexer.update_tags_index(obj.id, obj.name)
     end
   end
@@ -127,14 +169,14 @@ class SearchIndexer
     attr_reader :scrubbed
 
     def initialize
-      @scrubbed = ""
+      @scrubbed = +""
     end
 
     def self.scrub(html)
       me = new
       parser = Nokogiri::HTML::SAX::Parser.new(me)
       begin
-        copy = "
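
The weighted tsvector that update_index now builds only affects ordering once the query side ranks matches. A minimal psql sketch of that effect, assuming an 'english' search config and ts_rank's default weights; the query and the term 'ruby' are illustrative and not taken from this patch:

    -- With ts_rank's default weights {0.1, 0.2, 0.4, 1.0} for {D, C, B, A},
    -- a hit in the title (weight A) ranks above the same hit in category or
    -- tag text (B/C), which in turn ranks above a hit in the post body (D).
    SELECT post_id,
           ts_rank(search_data, to_tsquery('english', 'ruby')) AS rank
      FROM post_search_data
     WHERE search_data @@ to_tsquery('english', 'ruby')
     ORDER BY rank DESC
     LIMIT 10;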