discourse/app/services/search_indexer.rb

require_dependency 'search'

class SearchIndexer

  # Switches search indexing off: while the flag is set, SearchIndexer.index
  # returns before inspecting the object it was given.
  #
  # Returns true.
  def self.disable
    @disabled = true
  end

  # Switches search indexing back on by clearing the flag that
  # SearchIndexer.index checks before doing any work.
  #
  # Returns false.
  def self.enable
    @disabled = false
  end

  # Reduces HTML markup to the text that should be indexed.
  #
  # html - markup to scrub; handed unchanged to HtmlScrubber.scrub.
  #
  # Returns whatever HtmlScrubber.scrub returns for the given markup.
  def self.scrub_html_for_search(html)
    HtmlScrubber.scrub(html)
  end

  # Writes the full-text search row for a single record, updating the existing
  # row when there is one and inserting a new row otherwise.
  #
  # table    - base name of the indexed model ("post", "topic", "user" or
  #            "category" for the callers in this file). The row lives in
  #            "<table>_search_data" and is keyed by "<table>_id".
  # id       - id of the record being indexed.
  # raw_data - text to index; it is first run through Search.prepare_data.
  #
  # Indexing is best-effort: any StandardError is logged as a warning and
  # swallowed so that a failure here never aborts the caller.
  #
  # Returns nil when an existing row was updated or when indexing failed,
  # otherwise the result of Post.exec_sql for the INSERT.
  def self.update_index(table, id, raw_data)
    raw_data = Search.prepare_data(raw_data, :index)

    table_name = "#{table}_search_data"
    foreign_key = "#{table}_id"

    # insert some extra words for I.am.a.word so "word" is tokenized
    search_data = raw_data.gsub(/\p{L}*\.\p{L}*/) do |with_dot|
      _head, *tail = with_dot.split(".")
      tail.empty? ? with_dot : "#{with_dot} #{tail.join(' ')}"
    end

    # for user login and name use "simple" lowercase stemmer
    stemmer = table == "user" ? "simple" : Search.ts_config

    # Bind values shared by the UPDATE and the fallback INSERT.
    params = {
      raw_data: raw_data,
      search_data: search_data,
      id: id,
      locale: SiteSetting.default_locale
    }

    # Would be nice to use AR here but not sure how to execute Postgres functions
    # when inserting data like this.
    rows = Post.exec_sql_row_count("UPDATE #{table_name}
                                   SET
                                      raw_data = :raw_data,
                                      locale = :locale,
                                      search_data = TO_TSVECTOR('#{stemmer}', :search_data)
                                   WHERE #{foreign_key} = :id",
                                   params)

    # No row matched the UPDATE, so this record has not been indexed yet.
    if rows == 0
      Post.exec_sql("INSERT INTO #{table_name}
                    (#{foreign_key}, search_data, locale, raw_data)
                    VALUES (:id, TO_TSVECTOR('#{stemmer}', :search_data), :locale, :raw_data)",
                    params)
    end
  rescue => e
    # don't allow concurrency to mess up saving a post; leave a trace in the
    # log so that failures are not completely invisible.
    Rails.logger.warn("SearchIndexer.update_index failed for #{table} id #{id}: #{e.class}: #{e.message}")
    nil
  end

  # Indexes a topic under its title followed by the leading part of the
  # scrubbed body of its post.
  #
  # topic_id - id of the topic row to index.
  # title    - topic title; it is copied, never modified.
  # cooked   - HTML of the post; scrubbed and then cut to at most
  #            Topic::MAX_SIMILAR_BODY_LENGTH characters.
  #
  # Returns the result of update_index.
  def self.update_topics_index(topic_id, title, cooked)
    indexed_text = title.dup << " "
    indexed_text << scrub_html_for_search(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]
    update_index('topic', topic_id, indexed_text)
  end

  # Indexes a post under its scrubbed body, the title of its topic and, when
  # one is given, the name of its category.
  #
  # post_id  - id of the post row to index.
  # cooked   - HTML of the post; scrubbed before indexing.
  # title    - topic title; a copy is re-tagged as UTF-8 before being appended.
  # category - category name, or nil to leave it out.
  #
  # Returns the result of update_index.
  def self.update_posts_index(post_id, cooked, title, category)
    indexed_text = scrub_html_for_search(cooked)
    utf8_title = title.dup.force_encoding('UTF-8')
    indexed_text << " " << utf8_title
    if category
      indexed_text << " " << category
    end
    update_index('post', post_id, indexed_text)
  end

  # Indexes a user under the username followed by the display name.
  #
  # user_id  - id of the user row to index.
  # username - username; it is copied, never modified.
  # name     - display name, or nil (treated as an empty string).
  #
  # Returns the result of update_index.
  def self.update_users_index(user_id, username, name)
    indexed_text = username.dup
    indexed_text << " " << (name || "")
    update_index('user', user_id, indexed_text)
  end

  # Indexes a category under its name alone.
  #
  # category_id - id of the category row to index.
  # name        - category name, passed to update_index as the text to index.
  #
  # Returns the result of update_index.
  def self.update_categories_index(category_id, name)
    update_index('category', category_id, name)
  end

  # Refreshes the search index for a record whose searchable fields changed.
  #
  # obj - the record to look at. Only objects whose class is exactly Post,
  #       User, Topic or Category are acted on, and only when the relevant
  #       attribute reports a change:
  #         Post     - cooked changed: the post is re-indexed, and the topic
  #                    too when this is its first post. A post without a
  #                    topic is skipped with a warning in the log.
  #         User     - username or name changed.
  #         Topic    - title changed: post number 1 and the topic are
  #                    re-indexed, provided that post exists.
  #         Category - name changed.
  #
  # Does nothing while indexing is disabled.
  #
  # Returns the result of update_categories_index when a category was
  # re-indexed, nil otherwise.
  def self.index(obj)
    return if @disabled

    record_class = obj.class

    if record_class == Post && obj.cooked_changed?
      topic = obj.topic
      if topic
        category = topic.category
        category_name = category.name if category
        SearchIndexer.update_posts_index(obj.id, obj.cooked, topic.title, category_name)
        if obj.is_first_post?
          SearchIndexer.update_topics_index(obj.topic_id, topic.title, obj.cooked)
        end
      else
        Rails.logger.warn("Orphan post skipped in search_indexer, topic_id: #{obj.topic_id} post_id: #{obj.id} raw: #{obj.raw}")
      end
    end

    if record_class == User && (obj.username_changed? || obj.name_changed?)
      full_name = obj.name
      lowered_name = full_name ? full_name.downcase : ''
      SearchIndexer.update_users_index(obj.id, obj.username_lower || '', lowered_name)
    end

    if record_class == Topic && obj.title_changed?
      posts = obj.posts
      first_post = posts && posts.find_by(post_number: 1)
      if first_post
        category = obj.category
        category_name = category.name if category
        SearchIndexer.update_posts_index(first_post.id, first_post.cooked, obj.title, category_name)
        SearchIndexer.update_topics_index(obj.id, obj.title, first_post.cooked)
      end
    end

    # Kept last: its value is what the method returns.
    if record_class == Category && obj.name_changed?
      SearchIndexer.update_categories_index(obj.id, obj.name)
    end
  end

  # SAX handler that boils HTML down to the text worth indexing: every run of
  # character data plus the values of "alt" and "title" attributes. Each
  # fragment is padded with a space on both sides so that neighbouring
  # fragments never fuse into a single word.
  class HtmlScrubber < Nokogiri::XML::SAX::Document
    # Attributes whose values are indexed, in the order they are emitted.
    INDEXED_ATTRIBUTES = %w[alt title].freeze

    # The text collected so far.
    attr_reader :scrubbed

    def initialize
      @scrubbed = ""
    end

    # Runs the given markup through an HTML SAX parser driven by a fresh
    # scrubber.
    #
    # html - markup String, or nil.
    #
    # Returns the collected text; an empty String when html is nil, because
    # nil is never handed to the parser.
    def self.scrub(html)
      me = new
      Nokogiri::HTML::SAX::Parser.new(me).parse(html) unless html.nil?
      me.scrubbed
    end

    # SAX callback for an opening tag: collects the "alt" and "title" values.
    #
    # name       - tag name (unused).
    # attributes - the tag's attributes as name/value pairs; flattened first
    #              so nested pairs and a flat name, value, ... list both work.
    def start_element(name, attributes = [])
      attributes = Hash[*attributes.flatten]
      INDEXED_ATTRIBUTES.each do |key|
        value = attributes[key]
        scrubbed << " " << value << " " if value
      end
    end

    # SAX callback for character data: collects the text as is.
    def characters(string)
      scrubbed << " " << string << " "
    end
  end
end
Fix locale dependend stemmer for FTS Fix locale dependend stemmer for FTS to improve search relevance on non English languages. 2013-07-22 19:07:59 -04:00			`require_dependency 'search'`

Remove SearchObserver, aim is to remove all observers rails-observers gem is mostly unmaintained and is a pain to carry forward new implementation contains significantly less magic as a bonus 2016-12-21 21:13:14 -05:00			`class SearchIndexer`

			`def self.disable`
			`@disabled = true`
			`end`

			`def self.enable`
			`@disabled = false`
			`end`
Initial release of Discourse 2013-02-05 14:16:51 -05:00
			`def self.scrub_html_for_search(html)`
			`HtmlScrubber.scrub(html)`
			`end`

shuffle code around so excerpt is not messed up 2016-07-25 03:12:01 -04:00			`def self.update_index(table, id, raw_data)`
Replace rmmseg gem for cppjieba_rb since better dictionary (#5006) * Rename locale to ts config in search module to make it clear * Replace rmmese-cpp for cppjieba_rb 2017-07-31 15:28:48 -04:00			`raw_data = Search.prepare_data(raw_data, :index)`
BUGFIX: Chinese search was broken BUGFIX: User locale was used index data BUGFIX: missing Norwegian fulltext config FEATURE: store the text used to index stuff in fulltext (for diagnostics / in page search) FEATURE: re-index posts when locale changes (in bg job) FEATURE: allow reindexing by trucating post_search_data Note: I removed japanese specific config cause it requires custom pg config, happy to add it once our base docker config ships with it 2014-06-24 03:10:56 -04:00
Search Refactor: Remove some manual SQL, make search data tables more idomatic Rails/AR 2013-05-22 15:33:33 -04:00			`table_name = "#{table}_search_data"`
			`foreign_key = "#{table}_id"`

shuffle code around so excerpt is not messed up 2016-07-25 03:12:01 -04:00			`# insert some extra words for I.am.a.word so "word" is tokenized`
			`search_data = raw_data.gsub(/\p{L}\.\p{L}/) do \|with_dot\|`
			`split = with_dot.split(".")`
			`if split.length > 1`
			`with_dot + (" " << split[1..-1].join(" "))`
			`else`
			`with_dot`
			`end`
			`end`

Fix locale dependend stemmer for FTS Fix locale dependend stemmer for FTS to improve search relevance on non English languages. 2013-07-22 19:07:59 -04:00			`# for user login and name use "simple" lowercase stemmer`
Replace rmmseg gem for cppjieba_rb since better dictionary (#5006) * Rename locale to ts config in search module to make it clear * Replace rmmese-cpp for cppjieba_rb 2017-07-31 15:28:48 -04:00			`stemmer = table == "user" ? "simple" : Search.ts_config`
Fix locale dependend stemmer for FTS Fix locale dependend stemmer for FTS to improve search relevance on non English languages. 2013-07-22 19:07:59 -04:00
Search Refactor: Remove some manual SQL, make search data tables more idomatic Rails/AR 2013-05-22 15:33:33 -04:00			`# Would be nice to use AR here but not sure how to execut Postgres functions`
			`# when inserting data like this.`
BUGFIX: Chinese search was broken BUGFIX: User locale was used index data BUGFIX: missing Norwegian fulltext config FEATURE: store the text used to index stuff in fulltext (for diagnostics / in page search) FEATURE: re-index posts when locale changes (in bg job) FEATURE: allow reindexing by trucating post_search_data Note: I removed japanese specific config cause it requires custom pg config, happy to add it once our base docker config ships with it 2014-06-24 03:10:56 -04:00			`rows = Post.exec_sql_row_count("UPDATE #{table_name}`
			`SET`
shuffle code around so excerpt is not messed up 2016-07-25 03:12:01 -04:00			`raw_data = :raw_data,`
BUGFIX: Chinese search was broken BUGFIX: User locale was used index data BUGFIX: missing Norwegian fulltext config FEATURE: store the text used to index stuff in fulltext (for diagnostics / in page search) FEATURE: re-index posts when locale changes (in bg job) FEATURE: allow reindexing by trucating post_search_data Note: I removed japanese specific config cause it requires custom pg config, happy to add it once our base docker config ships with it 2014-06-24 03:10:56 -04:00			`locale = :locale,`
			`search_data = TO_TSVECTOR('#{stemmer}', :search_data)`
			`WHERE #{foreign_key} = :id",`
shuffle code around so excerpt is not messed up 2016-07-25 03:12:01 -04:00			`raw_data: raw_data,`
			`search_data: search_data,`
			`id: id,`
			`locale: SiteSetting.default_locale)`
Search Refactor: Remove some manual SQL, make search data tables more idomatic Rails/AR 2013-05-22 15:33:33 -04:00			`if rows == 0`
BUGFIX: Chinese search was broken BUGFIX: User locale was used index data BUGFIX: missing Norwegian fulltext config FEATURE: store the text used to index stuff in fulltext (for diagnostics / in page search) FEATURE: re-index posts when locale changes (in bg job) FEATURE: allow reindexing by trucating post_search_data Note: I removed japanese specific config cause it requires custom pg config, happy to add it once our base docker config ships with it 2014-06-24 03:10:56 -04:00			`Post.exec_sql("INSERT INTO #{table_name}`
			`(#{foreign_key}, search_data, locale, raw_data)`
shuffle code around so excerpt is not messed up 2016-07-25 03:12:01 -04:00			`VALUES (:id, TO_TSVECTOR('#{stemmer}', :search_data), :locale, :raw_data)",`
			`raw_data: raw_data,`
			`search_data: search_data,`
			`id: id,`
			`locale: SiteSetting.default_locale)`
Initial release of Discourse 2013-02-05 14:16:51 -05:00			`end`
Search Refactor: Remove some manual SQL, make search data tables more idomatic Rails/AR 2013-05-22 15:33:33 -04:00			`rescue`
			`# don't allow concurrency to mess up saving a post`
Initial release of Discourse 2013-02-05 14:16:51 -05:00			`end`

PERF: new table used for title similarity search 2014-08-08 01:50:26 -04:00			`def self.update_topics_index(topic_id, title, cooked)`
Revert "Allowing poll feed with UTF-8 title" This reverts commit 63704c5cee0cdbaa34eedf4180bb3e4519182551. 2015-04-04 02:33:01 -04:00			`search_data = title.dup << " " << scrub_html_for_search(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]`
PERF: new table used for title similarity search 2014-08-08 01:50:26 -04:00			`update_index('topic', topic_id, search_data)`
			`end`

Initial release of Discourse 2013-02-05 14:16:51 -05:00			`def self.update_posts_index(post_id, cooked, title, category)`
due to travis error message: can't modify frozen string 2014-10-06 05:45:11 -04:00			`search_data = scrub_html_for_search(cooked) << " " << title.dup.force_encoding('UTF-8')`
Search Refactor: Remove some manual SQL, make search data tables more idomatic Rails/AR 2013-05-22 15:33:33 -04:00			`search_data << " " << category if category`
			`update_index('post', post_id, search_data)`
Initial release of Discourse 2013-02-05 14:16:51 -05:00			`end`

			`def self.update_users_index(user_id, username, name)`
Search Refactor: Remove some manual SQL, make search data tables more idomatic Rails/AR 2013-05-22 15:33:33 -04:00			`search_data = username.dup << " " << (name \|\| "")`
			`update_index('user', user_id, search_data)`
Initial release of Discourse 2013-02-05 14:16:51 -05:00			`end`
Fix all the trailing whitespace 2013-02-07 10:45:24 -05:00
Initial release of Discourse 2013-02-05 14:16:51 -05:00			`def self.update_categories_index(category_id, name)`
Search Refactor: Remove some manual SQL, make search data tables more idomatic Rails/AR 2013-05-22 15:33:33 -04:00			`update_index('category', category_id, name)`
Initial release of Discourse 2013-02-05 14:16:51 -05:00			`end`

BUGFIX: Chinese search was broken BUGFIX: User locale was used index data BUGFIX: missing Norwegian fulltext config FEATURE: store the text used to index stuff in fulltext (for diagnostics / in page search) FEATURE: re-index posts when locale changes (in bg job) FEATURE: allow reindexing by trucating post_search_data Note: I removed japanese specific config cause it requires custom pg config, happy to add it once our base docker config ships with it 2014-06-24 03:10:56 -04:00			`def self.index(obj)`
Remove SearchObserver, aim is to remove all observers rails-observers gem is mostly unmaintained and is a pain to carry forward new implementation contains significantly less magic as a bonus 2016-12-21 21:13:14 -05:00			`return if @disabled`

Initial release of Discourse 2013-02-05 14:16:51 -05:00			`if obj.class == Post && obj.cooked_changed?`
More logging, less problems 2014-05-06 22:35:26 -04:00			`if obj.topic`
			`category_name = obj.topic.category.name if obj.topic.category`
Remove SearchObserver, aim is to remove all observers rails-observers gem is mostly unmaintained and is a pain to carry forward new implementation contains significantly less magic as a bonus 2016-12-21 21:13:14 -05:00			`SearchIndexer.update_posts_index(obj.id, obj.cooked, obj.topic.title, category_name)`
			`SearchIndexer.update_topics_index(obj.topic_id, obj.topic.title, obj.cooked) if obj.is_first_post?`
More logging, less problems 2014-05-06 22:35:26 -04:00			`else`
Remove SearchObserver, aim is to remove all observers rails-observers gem is mostly unmaintained and is a pain to carry forward new implementation contains significantly less magic as a bonus 2016-12-21 21:13:14 -05:00			`Rails.logger.warn("Orphan post skipped in search_indexer, topic_id: #{obj.topic_id} post_id: #{obj.id} raw: #{obj.raw}")`
More logging, less problems 2014-05-06 22:35:26 -04:00			`end`
Initial release of Discourse 2013-02-05 14:16:51 -05:00			`end`
			`if obj.class == User && (obj.username_changed? \|\| obj.name_changed?)`
Remove SearchObserver, aim is to remove all observers rails-observers gem is mostly unmaintained and is a pain to carry forward new implementation contains significantly less magic as a bonus 2016-12-21 21:13:14 -05:00			`SearchIndexer.update_users_index(obj.id, obj.username_lower \|\| '', obj.name ? obj.name.downcase : '')`
Initial release of Discourse 2013-02-05 14:16:51 -05:00			`end`

			`if obj.class == Topic && obj.title_changed?`
			`if obj.posts`
Perform the where(...).first to find_by(...) refactoring. This refactoring was automated using the command: bundle exec "ruby refactorings/where_dot_first_to_find_by/app.rb" 2014-05-06 09:41:59 -04:00			`post = obj.posts.find_by(post_number: 1)`
Initial release of Discourse 2013-02-05 14:16:51 -05:00			`if post`
			`category_name = obj.category.name if obj.category`
Remove SearchObserver, aim is to remove all observers rails-observers gem is mostly unmaintained and is a pain to carry forward new implementation contains significantly less magic as a bonus 2016-12-21 21:13:14 -05:00			`SearchIndexer.update_posts_index(post.id, post.cooked, obj.title, category_name)`
			`SearchIndexer.update_topics_index(obj.id, obj.title, post.cooked)`
Initial release of Discourse 2013-02-05 14:16:51 -05:00			`end`
			`end`
			`end`

Fix all the trailing whitespace 2013-02-07 10:45:24 -05:00			`if obj.class == Category && obj.name_changed?`
Remove SearchObserver, aim is to remove all observers rails-observers gem is mostly unmaintained and is a pain to carry forward new implementation contains significantly less magic as a bonus 2016-12-21 21:13:14 -05:00			`SearchIndexer.update_categories_index(obj.id, obj.name)`
Initial release of Discourse 2013-02-05 14:16:51 -05:00			`end`
			`end`

			`class HtmlScrubber < Nokogiri::XML::SAX::Document`
			`attr_reader :scrubbed`

			`def initialize`
			`@scrubbed = ""`
			`end`

			`def self.scrub(html)`
minor cleanup, using AR querying DSL over raw SQL in some places 2013-02-28 13:54:12 -05:00			`me = new`
Initial release of Discourse 2013-02-05 14:16:51 -05:00			`parser = Nokogiri::HTML::SAX::Parser.new(me)`
Fix all the trailing whitespace 2013-02-07 10:45:24 -05:00			`begin`
			`copy = "<div>"`
Initial release of Discourse 2013-02-05 14:16:51 -05:00			`copy << html unless html.nil?`
			`copy << "</div>"`
			`parser.parse(html) unless html.nil?`
			`end`
			`me.scrubbed`
			`end`

Add rubocop to our build. (#5004) 2017-07-27 21:20:09 -04:00			`def start_element(name, attributes = [])`
Initial release of Discourse 2013-02-05 14:16:51 -05:00			`attributes = Hash[*attributes.flatten]`
			`if attributes["alt"]`
			`scrubbed << " "`
			`scrubbed << attributes["alt"]`
			`scrubbed << " "`
			`end`
			`if attributes["title"]`
			`scrubbed << " "`
Fix all the trailing whitespace 2013-02-07 10:45:24 -05:00			`scrubbed << attributes["title"]`
Initial release of Discourse 2013-02-05 14:16:51 -05:00			`scrubbed << " "`
			`end`
			`end`

			`def characters(string)`
			`scrubbed << " "`
shuffle code around so excerpt is not messed up 2016-07-25 03:12:01 -04:00			`scrubbed << string`
Initial release of Discourse 2013-02-05 14:16:51 -05:00			`scrubbed << " "`
			`end`
			`end`
			`end`