# discourse/app/services/search_indexer.rb

require_dependency 'search'
class SearchIndexer
# Switches search indexing off: sets the class-level @disabled flag that
# `index` consults before doing any work.
def self.disable
  @disabled = true
end
# Switches search indexing back on by clearing the class-level @disabled flag.
def self.enable
  @disabled = false
end
# Converts an HTML fragment into the text used for indexing.
#
# html - the HTML String to scrub (passed straight to HtmlScrubber.scrub).
#
# Returns whatever HtmlScrubber.scrub returns for the given input.
def self.scrub_html_for_search(html)
  HtmlScrubber.scrub(html)
end
# Writes (or refreshes) the search-index row for a single record.
#
# table    - base table name as a String ("post", "topic", "user", ...). The
#            row is stored in "<table>_search_data" and keyed by "<table>_id".
# id       - id of the record being indexed.
# raw_data - text to index; it is first run through
#            Search.prepare_data(raw_data, :index).
#
# An UPDATE is attempted first; when it reports zero affected rows an INSERT
# is issued instead. Any StandardError raised along the way is deliberately
# swallowed so that a race between concurrent writers cannot abort the save
# of a post.
def self.update_index(table, id, raw_data)
  raw_data = Search.prepare_data(raw_data, :index)

  table_name = "#{table}_search_data"
  foreign_key = "#{table}_id"

  # insert some extra words for I.am.a.word so "word" is tokenized
  # I.am.a.word becomes I.am.a.word am a word
  # \p{L} matches a single code point in category letter
  # \p{N} matches a single code point in category number
  search_data = raw_data.gsub(/[\p{L}\p{N}_.-]*\.[\p{L}\p{N}_.-]*/) do |with_dot|
    parts = with_dot.split(".")
    parts.length > 1 ? "#{with_dot} #{parts[1..-1].join(' ')}" : with_dot
  end

  # for user login and name use "simple" lowercase stemmer
  stemmer = table == "user" ? "simple" : Search.ts_config

  # Bind parameters shared by the UPDATE and the fallback INSERT.
  params = {
    raw_data: raw_data,
    search_data: search_data,
    id: id,
    locale: SiteSetting.default_locale,
    version: Search::INDEX_VERSION
  }

  # Would be nice to use AR here but not sure how to execute Postgres functions
  # when inserting data like this.
  rows = Post.exec_sql_row_count("UPDATE #{table_name}
    SET
      raw_data = :raw_data,
      locale = :locale,
      search_data = TO_TSVECTOR('#{stemmer}', :search_data),
      version = :version
    WHERE #{foreign_key} = :id", **params)

  if rows == 0
    Post.exec_sql("INSERT INTO #{table_name}
      (#{foreign_key}, search_data, locale, raw_data, version)
      VALUES (:id, TO_TSVECTOR('#{stemmer}', :search_data), :locale, :raw_data, :version)", **params)
  end
rescue
  # don't allow concurrency to mess up saving a post
end
# Indexes a topic under its title followed by a bounded excerpt of its body.
#
# topic_id - id of the topic row to index.
# title    - topic title String (copied, never mutated).
# cooked   - HTML of the post body; it is scrubbed and then cut to the first
#            Topic::MAX_SIMILAR_BODY_LENGTH characters.
def self.update_topics_index(topic_id, title, cooked)
  search_data = title.dup
  search_data << " "
  search_data << scrub_html_for_search(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]
  update_index('topic', topic_id, search_data)
end
# Indexes a post under its scrubbed body, its topic title and, when present,
# its category name.
#
# post_id  - id of the post row to index.
# cooked   - HTML of the post body; scrubbed before indexing.
# title    - topic title String; a copy is re-tagged as UTF-8 before being
#            appended, so the caller's string is left untouched.
# category - category name String, or nil when there is none.
def self.update_posts_index(post_id, cooked, title, category)
  search_data = scrub_html_for_search(cooked)
  search_data << " " << title.dup.force_encoding('UTF-8')
  search_data << " " << category if category
  update_index('post', post_id, search_data)
end
# Indexes a user under "<username> <name>".
#
# user_id  - id of the user row to index.
# username - username String (copied, never mutated).
# name     - display name String, or nil (treated as an empty string).
def self.update_users_index(user_id, username, name)
  search_data = username.dup << " " << (name || "")
  update_index('user', user_id, search_data)
end
# Indexes a category under its name.
#
# category_id - id of the category row to index.
# name        - category name String used as the indexed text.
def self.update_categories_index(category_id, name)
  update_index('category', category_id, name)
end
# Indexes a tag under its name.
#
# tag_id - id of the tag row to index.
# name   - tag name String used as the indexed text.
def self.update_tags_index(tag_id, name)
  update_index('tag', tag_id, name)
end
# Entry point: re-indexes `obj` when one of its searchable attributes changed.
#
# obj   - a Post, User, Topic, Category or Tag. The class must match exactly
#         (`obj.class ==`); any other object is ignored.
# force - when true, re-index even if the relevant `*_changed?` predicate
#         reports no change.
#
# Does nothing while indexing is disabled (@disabled is truthy).
def self.index(obj, force: false)
  return if @disabled

  if obj.class == Post && (obj.cooked_changed? || force)
    topic = obj.topic
    if topic
      category = topic.category
      category_name = category.name if category
      SearchIndexer.update_posts_index(obj.id, obj.cooked, topic.title, category_name)
      # The first post's body is also what the topic itself is indexed under.
      SearchIndexer.update_topics_index(obj.topic_id, topic.title, obj.cooked) if obj.is_first_post?
    else
      Rails.logger.warn("Orphan post skipped in search_indexer, topic_id: #{obj.topic_id} post_id: #{obj.id} raw: #{obj.raw}")
    end
  end

  if obj.class == User && (obj.username_changed? || obj.name_changed? || force)
    SearchIndexer.update_users_index(obj.id, obj.username_lower || '', obj.name ? obj.name.downcase : '')
  end

  if obj.class == Topic && (obj.title_changed? || force)
    # A title change affects both the topic row and its first post's row,
    # because the post index includes the topic title.
    first_post = obj.posts && obj.posts.find_by(post_number: 1)
    if first_post
      category = obj.category
      category_name = category.name if category
      SearchIndexer.update_posts_index(first_post.id, first_post.cooked, obj.title, category_name)
      SearchIndexer.update_topics_index(obj.id, obj.title, first_post.cooked)
    end
  end

  if obj.class == Category && (obj.name_changed? || force)
    SearchIndexer.update_categories_index(obj.id, obj.name)
  end

  if obj.class == Tag && (obj.name_changed? || force)
    SearchIndexer.update_tags_index(obj.id, obj.name)
  end
end
# SAX handler that flattens HTML into plain text for the search index.
#
# Text nodes are collected as-is, and the values of `alt` and `title`
# attributes are collected too. Every collected piece is padded with a space
# on each side so adjacent pieces never run together.
class HtmlScrubber < Nokogiri::XML::SAX::Document
  # The text accumulated so far (a mutable String appended to by the callbacks).
  attr_reader :scrubbed

  def initialize
    @scrubbed = ""
  end

  # Parses `html` with Nokogiri's HTML SAX parser and returns the collected
  # text. Returns an empty String when `html` is nil.
  #
  # NOTE(review): the raw html is parsed as-is. A "<div>"-wrapped copy used to
  # be built here but was never handed to the parser, so it has been dropped.
  def self.scrub(html)
    me = new
    parser = Nokogiri::HTML::SAX::Parser.new(me)
    parser.parse(html) unless html.nil?
    me.scrubbed
  end

  # SAX callback for an opening tag: collects the `alt` and `title` attribute
  # values (in that order) when they are present.
  def start_element(name, attributes = [])
    # Flatten first so both [[name, value], ...] and [name, value, ...] shapes
    # turn into a Hash.
    attributes = Hash[*attributes.flatten]

    %w[alt title].each do |key|
      value = attributes[key]
      scrubbed << " " << value << " " if value
    end
  end

  # SAX callback for a text node: collects the text.
  def characters(string)
    scrubbed << " " << string << " "
  end
end
end