require_dependency 'search'

# Keeps the "<table>_search_data" tables in sync with posts, topics, users and
# categories so they can be queried through Postgres full-text search.
class SearchIndexer

  # Turns indexing off; while disabled, SearchIndexer.index is a no-op.
  def self.disable
    @disabled = true
  end

  # Turns indexing back on after a call to SearchIndexer.disable.
  def self.enable
    @disabled = false
  end

  # Reduces an HTML fragment to the text that should be indexed
  # (delegates to HtmlScrubber.scrub).
  def self.scrub_html_for_search(html)
    HtmlScrubber.scrub(html)
  end

  # Writes the search row for one record: tries an UPDATE first and falls
  # back to an INSERT when no row was updated.
  #
  # table    - singular table prefix ("post", "topic", "user", "category");
  #            used to build "<table>_search_data" and "<table>_id".
  # id       - value of the "<table>_id" foreign key column.
  # raw_data - text to index; passed through Search.prepare_data first.
  #
  # Any StandardError raised while preparing or saving is swallowed on purpose
  # (see the rescue clause below).
  def self.update_index(table, id, raw_data)
    raw_data = Search.prepare_data(raw_data, :index)

    table_name = "#{table}_search_data"
    foreign_key = "#{table}_id"

    # insert some extra words for I.am.a.word so "word" is tokenized
    # I.am.a.word becomes I.am.a.word am a word
    # uses \p{L} which matches a single code point in category letter
    # uses \p{N} which matches a single code point in category number
    search_data = raw_data.gsub(/(\p{L}|\p{N}|_|-|\.)*\.(\p{L}|\p{N}|_|-|\.)*/) do |with_dot|
      segments = with_dot.split(".")
      if segments.length > 1
        "#{with_dot} #{segments[1..-1].join(' ')}"
      else
        with_dot
      end
    end

    # for user login and name use "simple" lowercase stemmer
    stemmer = table == "user" ? "simple" : Search.ts_config

    # Bind parameters shared by the UPDATE and the INSERT below.
    params = {
      raw_data: raw_data,
      search_data: search_data,
      id: id,
      locale: SiteSetting.default_locale,
      version: Search::INDEX_VERSION
    }

    # Would be nice to use AR here but not sure how to execute Postgres functions
    # when inserting data like this.
    rows = Post.exec_sql_row_count("UPDATE #{table_name} SET raw_data = :raw_data, locale = :locale, search_data = TO_TSVECTOR('#{stemmer}', :search_data), version = :version WHERE #{foreign_key} = :id", **params)

    if rows == 0
      Post.exec_sql("INSERT INTO #{table_name} (#{foreign_key}, search_data, locale, raw_data, version) VALUES (:id, TO_TSVECTOR('#{stemmer}', :search_data), :locale, :raw_data, :version)", **params)
    end
  rescue
    # don't allow concurrency to mess up saving a post
  end

  # Indexes a topic as its title followed by the scrubbed body, where the
  # scrubbed body is cut to Topic::MAX_SIMILAR_BODY_LENGTH characters.
  def self.update_topics_index(topic_id, title, cooked)
    body = scrub_html_for_search(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]
    search_data = title.dup << " " << body
    update_index('topic', topic_id, search_data)
  end

  # Indexes a post as its scrubbed body, then the topic title, then the
  # category name when one is given.
  def self.update_posts_index(post_id, cooked, title, category)
    search_data = scrub_html_for_search(cooked) << " " << title.dup.force_encoding('UTF-8')
    search_data << " " << category if category
    update_index('post', post_id, search_data)
  end

  # Indexes a user as "<username> <name>"; a nil name is treated as "".
  def self.update_users_index(user_id, username, name)
    search_data = username.dup << " " << (name || "")
    update_index('user', user_id, search_data)
  end

  # Indexes a category by its name.
  def self.update_categories_index(category_id, name)
    update_index('category', category_id, name)
  end

  # Re-indexes obj when the attribute that feeds its search data has changed,
  # or unconditionally when force is true. Does nothing while indexing is
  # disabled or when obj's class is not exactly Post, User, Topic or Category.
  def self.index(obj, force: false)
    return if @disabled

    if obj.class == Post && (obj.cooked_changed? || force)
      if obj.topic
        category_name = obj.topic.category.name if obj.topic.category
        SearchIndexer.update_posts_index(obj.id, obj.cooked, obj.topic.title, category_name)
        # The first post also supplies the body text for the topic's own row.
        SearchIndexer.update_topics_index(obj.topic_id, obj.topic.title, obj.cooked) if obj.is_first_post?
      else
        Rails.logger.warn("Orphan post skipped in search_indexer, topic_id: #{obj.topic_id} post_id: #{obj.id} raw: #{obj.raw}")
      end
    end

    if obj.class == User && (obj.username_changed? || obj.name_changed? || force)
      SearchIndexer.update_users_index(obj.id, obj.username_lower || '', obj.name ? obj.name.downcase : '')
    end

    if obj.class == Topic && (obj.title_changed? || force)
      if obj.posts
        # A title change affects both the topic row and the first post's row.
        post = obj.posts.find_by(post_number: 1)
        if post
          category_name = obj.category.name if obj.category
          SearchIndexer.update_posts_index(post.id, post.cooked, obj.title, category_name)
          SearchIndexer.update_topics_index(obj.id, obj.title, post.cooked)
        end
      end
    end

    if obj.class == Category && (obj.name_changed? || force)
      SearchIndexer.update_categories_index(obj.id, obj.name)
    end
  end

  # SAX handler that collects the text content of an HTML document together
  # with the values of "alt" and "title" attributes, each padded with spaces.
  class HtmlScrubber < Nokogiri::XML::SAX::Document
    # Attributes whose values are added to the scrubbed text, in this order.
    INDEXED_ATTRIBUTES = %w(alt title)

    attr_reader :scrubbed

    def initialize
      @scrubbed = ""
    end

    # Parses html with a fresh scrubber and returns the accumulated text.
    # A nil html is not parsed and yields an empty string.
    def self.scrub(html)
      me = new
      parser = Nokogiri::HTML::SAX::Parser.new(me)
      parser.parse(html) unless html.nil?
      me.scrubbed
    end

    # SAX callback: appends the element's alt/title attribute values, if any.
    def start_element(name, attributes = [])
      attributes = Hash[*attributes.flatten]
      INDEXED_ATTRIBUTES.each do |key|
        value = attributes[key]
        next unless value
        scrubbed << " " << value << " "
      end
    end

    # SAX callback: appends a run of character data.
    def characters(string)
      scrubbed << " " << string << " "
    end
  end
end