2018-02-19 22:41:00 -05:00
|
|
|
# frozen_string_literal: true
|
2013-07-22 19:07:59 -04:00
|
|
|
|
2016-12-21 21:13:14 -05:00
|
|
|
class SearchIndexer
|
2020-07-17 04:27:30 -04:00
|
|
|
# Per-table index versions. update_index stamps the matching value on every
# "<table>_search_data" row it writes (looked up as "<TABLE>_INDEX_VERSION").
POST_INDEX_VERSION = 4
# NOTE(review): not referenced in the visible part of this file; presumably the
# lowest post index version that does not need a reindex — confirm with callers.
MIN_POST_REINDEX_VERSION = 3
TOPIC_INDEX_VERSION = 3
CATEGORY_INDEX_VERSION = 3
USER_INDEX_VERSION = 3
TAG_INDEX_VERSION = 3
# Version written by the queue_*_reindex helpers onto rows that should be
# indexed again.
REINDEX_VERSION = 0
|
2016-12-21 21:13:14 -05:00
|
|
|
|
|
|
|
# Switches indexing off. While the flag is set, SearchIndexer.index and the
# queue_*_reindex helpers return immediately.
def self.disable
  @disabled = true
end
|
|
|
|
|
|
|
|
# Switches indexing back on after a call to SearchIndexer.disable.
def self.enable
  @disabled = false
end
|
2013-02-05 14:16:51 -05:00
|
|
|
|
2018-09-17 04:31:15 -04:00
|
|
|
# Turns cooked HTML into the plain text that gets indexed, by delegating to
# HtmlScrubber.scrub.
#
# html             - HTML string to scrub.
# strip_diacritics - forwarded to HtmlScrubber; defaults to the
#                    search_ignore_accents site setting.
#
# Returns whatever HtmlScrubber.scrub returns (the scrubbed String).
def self.scrub_html_for_search(html, strip_diacritics: SiteSetting.search_ignore_accents)
  HtmlScrubber.scrub(html, strip_diacritics: strip_diacritics)
end
|
|
|
|
|
2018-02-19 22:41:00 -05:00
|
|
|
# Builds the weighted tsvector for one record and writes it to
# "<table>_search_data" (UPDATE first, INSERT when no row was updated).
#
# table    - record type used to derive the table name, the foreign key and
#            the version constant; String or Symbol ("post", "topic", "user",
#            "category" and "tag" are what the callers in this file pass).
# id       - id of the indexed record.
# raw_data - Array of up to four strings (nil allowed); they are mapped, in
#            order, to the tsvector weights A, B, C and D.
#
# Never raises: any StandardError is logged and swallowed so that an indexing
# failure cannot abort the caller. Returns nil in that case; otherwise the
# return value is that of the last DB.exec and should not be relied upon.
def self.update_index(table:, id:, raw_data:)
  # Normalise once so String and Symbol callers behave the same everywhere below.
  table = table.to_s

  search_data = raw_data.map { |data| Search.prepare_data(data || "", :index) }

  table_name = "#{table}_search_data"
  foreign_key = "#{table}_id"

  # for user login and name use "simple" lowercase stemmer
  stemmer = table == "user" ? "simple" : Search.ts_config

  ranked_index = <<~SQL
    setweight(to_tsvector('#{stemmer}', coalesce(:a,'')), 'A') ||
    setweight(to_tsvector('#{stemmer}', coalesce(:b,'')), 'B') ||
    setweight(to_tsvector('#{stemmer}', coalesce(:c,'')), 'C') ||
    setweight(to_tsvector('#{stemmer}', coalesce(:d,'')), 'D')
  SQL

  ranked_params = {
    a: search_data[0],
    b: search_data[1],
    c: search_data[2],
    d: search_data[3],
  }

  tsvector = DB.query_single("SELECT #{ranked_index}", ranked_params)[0]
  additional_lexemes = []

  # For every dotted lexeme in the tsvector (e.g. 'meta.discourse.org') also
  # index each dot-separated suffix ('discourse.org', 'org') at the same
  # positions. Terms that look like version numbers (1.2.3) are left alone.
  tsvector.scan(/'(([a-zA-Z0-9]+\.)+[a-zA-Z0-9]+)'\:([\w+,]+)/) do |lexeme, _, positions|
    next if lexeme =~ /^(\d+\.)?(\d+\.)*(\*|\d+)$/

    # Bounded on purpose: at most nine suffixes are emitted for a single term,
    # however many dots it contains.
    9.times do
      remaining = lexeme.partition(".").last
      break if remaining.empty?

      additional_lexemes << "'#{remaining}':#{positions}"
      lexeme = remaining
    end
  end

  tsvector = "#{tsvector} #{additional_lexemes.join(' ')}"

  indexed_data =
    if table == "post"
      # Only the post body (weight D) is stored as raw data, with media URLs
      # replaced by placeholders. This mutates the string in place, which is
      # safe here because the tsvector has already been computed.
      clean_post_raw_data!(ranked_params[:d])
    else
      search_data.reject(&:empty?).join(' ')
    end

  params = {
    raw_data: indexed_data,
    id: id,
    locale: SiteSetting.default_locale,
    version: const_get("#{table.upcase}_INDEX_VERSION"),
    tsvector: tsvector,
  }

  # Would be nice to use AR here but not sure how to execute Postgres functions
  # when inserting data like this.
  rows = DB.exec(<<~SQL, params)
    UPDATE #{table_name}
    SET
      raw_data = :raw_data,
      locale = :locale,
      search_data = (:tsvector)::tsvector,
      version = :version
    WHERE #{foreign_key} = :id
  SQL

  if rows == 0
    DB.exec(<<~SQL, params)
      INSERT INTO #{table_name}
      (#{foreign_key}, search_data, locale, raw_data, version)
      VALUES (:id, (:tsvector)::tsvector, :locale, :raw_data, :version)
    SQL
  end
rescue => e
  # TODO is there any way we can safely avoid this?
  # best way is probably pushing search indexer into a dedicated process so it no longer happens on save
  # instead in the post processor
  #
  # Still best-effort, but leave a trace instead of failing silently.
  Rails.logger.warn("SearchIndexer.update_index failed for #{table} #{id}: #{e.class}: #{e.message}")
  nil
end
|
|
|
|
|
2014-08-08 01:50:26 -04:00
|
|
|
# Indexes a topic: the title gets weight A and the scrubbed cooked HTML,
# cut to Topic::MAX_SIMILAR_BODY_LENGTH characters, gets weight B.
def self.update_topics_index(topic_id, title, cooked)
  scrubbed_cooked = scrub_html_for_search(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]

  # a bit inconsistent that we use title as A and body as B when in
  # the post index body is D
  update_index(table: 'topic', id: topic_id, raw_data: [title, scrubbed_cooked])
end
|
|
|
|
|
2020-07-16 23:12:31 -04:00
|
|
|
# Indexes a post. Weights: topic title (A), category name (B), topic tags (C)
# and the scrubbed cooked HTML of the post (D).
def self.update_posts_index(post_id, topic_title, category_name, topic_tags, cooked)
  update_index(
    table: 'post',
    id: post_id,
    raw_data: [topic_title, category_name, topic_tags, scrub_html_for_search(cooked)]
  )
end
|
|
|
|
|
|
|
|
# Indexes a user: username gets weight A, display name weight B.
def self.update_users_index(user_id, username, name)
  update_index(table: 'user', id: user_id, raw_data: [username, name])
end
|
2013-02-07 10:45:24 -05:00
|
|
|
|
2013-02-05 14:16:51 -05:00
|
|
|
# Indexes a category by its name (weight A).
def self.update_categories_index(category_id, name)
  update_index(table: 'category', id: category_id, raw_data: [name])
end
|
|
|
|
|
2017-08-25 11:52:18 -04:00
|
|
|
# Indexes a tag by its lower-cased name (weight A).
def self.update_tags_index(tag_id, name)
  update_index(table: 'tag', id: tag_id, raw_data: [name.downcase])
end
|
|
|
|
|
2020-07-16 23:12:31 -04:00
|
|
|
# Flags every indexed post of the given category for reindexing by setting the
# version of its post_search_data row to REINDEX_VERSION.
# Does nothing while indexing is disabled.
def self.queue_category_posts_reindex(category_id)
  return if @disabled

  DB.exec(<<~SQL, category_id: category_id, version: REINDEX_VERSION)
    UPDATE post_search_data
    SET version = :version
    FROM posts
    INNER JOIN topics ON posts.topic_id = topics.id
    INNER JOIN categories ON topics.category_id = categories.id
    WHERE post_search_data.post_id = posts.id
    AND categories.id = :category_id
  SQL
end
|
|
|
|
|
2018-02-19 22:41:00 -05:00
|
|
|
# Flags every indexed post of the given topic for reindexing by setting the
# version of its post_search_data row to REINDEX_VERSION.
# Does nothing while indexing is disabled.
def self.queue_post_reindex(topic_id)
  return if @disabled

  DB.exec(<<~SQL, topic_id: topic_id, version: REINDEX_VERSION)
    UPDATE post_search_data
    SET version = :version
    FROM posts
    WHERE post_search_data.post_id = posts.id
    AND posts.topic_id = :topic_id
  SQL
end
|
|
|
|
|
2017-08-16 07:38:34 -04:00
|
|
|
# Entry point: (re)indexes obj when one of its searchable attributes was
# changed by the last save, or unconditionally when force is true.
#
# obj   - a Post, Topic, User, Category or Tag; anything else is ignored.
# force - index even if no relevant attribute changed.
#
#   Post     - needs a non-blank raw, a topic, and a change to cooked or
#              topic_id; the first post also refreshes the topic index.
#   Topic    - on title change; reindexes its first post and the topic itself.
#   User     - on username or name change.
#   Category - on name change.
#   Tag      - on name change.
#
# Does nothing while indexing is disabled. The return value is not meaningful.
def self.index(obj, force: false)
  return if @disabled

  topic =
    case obj
    when Topic then obj
    when Post then obj.topic
    end

  category_name = nil
  tag_names = nil

  if topic
    category_name = topic.category&.name
    tags = topic.tags.select(:id, :name).to_a

    if tags.present?
      # Besides the topic's own tags, include the names of tags whose
      # target_tag_id points at one of them (presumably tag synonyms).
      tag_names = (tags.map(&:name) + Tag.where(target_tag_id: tags.map(&:id)).pluck(:name)).join(' ')
    end
  end

  case obj
  when Post
    changed = obj.saved_change_to_cooked? || obj.saved_change_to_topic_id? || force

    if obj.raw.present? && changed && topic
      SearchIndexer.update_posts_index(obj.id, topic.title, category_name, tag_names, obj.cooked)
      SearchIndexer.update_topics_index(topic.id, topic.title, obj.cooked) if obj.is_first_post?
    end
  when User
    if obj.saved_change_to_username? || obj.saved_change_to_name? || force
      SearchIndexer.update_users_index(obj.id, obj.username_lower || '', obj.name ? obj.name.downcase : '')
    end
  when Topic
    if obj.saved_change_to_title? || force
      first_post = obj.posts&.find_by(post_number: 1)

      if first_post
        SearchIndexer.update_posts_index(first_post.id, obj.title, category_name, tag_names, first_post.cooked)
        SearchIndexer.update_topics_index(obj.id, obj.title, first_post.cooked)
      end
    end
  when Category
    SearchIndexer.update_categories_index(obj.id, obj.name) if obj.saved_change_to_name? || force
  when Tag
    SearchIndexer.update_tags_index(obj.id, obj.name) if obj.saved_change_to_name? || force
  end
end
|
|
|
|
|
2020-08-06 00:25:03 -04:00
|
|
|
def self.clean_post_raw_data!(raw_data)
|
|
|
|
urls = Set.new
|
|
|
|
raw_data.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
|
|
|
|
|
|
|
|
urls.each do |url|
|
|
|
|
begin
|
|
|
|
case File.extname(URI(url).path || "")
|
|
|
|
when Oneboxer::VIDEO_REGEX
|
|
|
|
raw_data.gsub!(url, I18n.t("search.video"))
|
|
|
|
when Oneboxer::AUDIO_REGEX
|
|
|
|
raw_data.gsub!(url, I18n.t("search.audio"))
|
|
|
|
end
|
|
|
|
rescue URI::InvalidURIError
|
|
|
|
end
|
|
|
|
end
|
|
|
|
|
|
|
|
raw_data
|
|
|
|
end
|
|
|
|
# Keep the URL-cleaning helper out of SearchIndexer's public interface.
private_class_method :clean_post_raw_data!
|
|
|
|
|
2013-02-05 14:16:51 -05:00
|
|
|
# SAX handler that flattens cooked HTML into the plain text fed to the search
# index. Use HtmlScrubber.scrub; the instance only accumulates text while the
# document is being parsed.
class HtmlScrubber < Nokogiri::XML::SAX::Document
  # An <a> whose class attribute is exactly one of these has its href dropped.
  MENTION_CLASSES ||= %w{mention mention-group}
  # Attributes whose values are emitted as text next to the element content.
  ATTRIBUTES ||= %w{alt title href data-youtube-title}

  attr_reader :scrubbed

  # strip_diacritics - when true, text is passed through
  #                    Search.strip_diacritics before being collected.
  def initialize(strip_diacritics: false)
    @scrubbed = +""
    @strip_diacritics = strip_diacritics
  end

  # Extracts the indexable text of an HTML fragment.
  #
  # html             - cooked HTML; blank input yields an empty String.
  # strip_diacritics - see #initialize.
  #
  # Returns a new String with whitespace squished.
  def self.scrub(html, strip_diacritics: false)
    return +"" if html.blank?

    document = Nokogiri::HTML5("<div>#{html}</div>", nil, Encoding::UTF_8.to_s)

    # Inside lightbox wrappers keep only <a> and <img> elements, and strip the
    # indexed attributes from the links.
    lightboxes = document.css("div.#{CookedPostProcessor::LIGHTBOX_WRAPPER_CSS_CLASS}")

    lightboxes.each do |wrapper|
      wrapper.traverse do |child|
        next if child == wrapper

        case child.name
        when "a"
          ATTRIBUTES.each { |attribute| child.remove_attribute(attribute) }
        when "img"
          # kept as is
        else
          child.remove
        end
      end
    end

    # Drop the alt text of images whose class attribute is exactly "emoji".
    document.css("img[class='emoji']").each do |image|
      image.remove_attribute("alt")
    end

    # Do not index an href that merely repeats the link text, nor the href of
    # a mention link.
    document.css("a[href]").each do |link|
      if link["href"] == link.text || MENTION_CLASSES.include?(link["class"])
        link.remove_attribute("href")
      end
    end

    scrubber = new(strip_diacritics: strip_diacritics)
    Nokogiri::HTML::SAX::Parser.new(scrubber).parse(document.to_html)
    scrubber.scrubbed.squish
  end

  # SAX callback: emits the value of every present attribute listed in
  # ATTRIBUTES as text, except an href that UrlHelper.is_local accepts.
  def start_element(_name, attributes = [])
    attributes = Hash[*attributes.flatten]

    ATTRIBUTES.each do |attribute_name|
      value = attributes[attribute_name]
      next unless value.present?
      next if attribute_name == "href" && UrlHelper.is_local(value)

      characters(value)
    end
  end

  # SAX callback: appends str to the collected text, padded with spaces so
  # that adjacent nodes do not run together.
  def characters(str)
    str = Search.strip_diacritics(str) if @strip_diacritics
    scrubbed << " #{str} "
  end
end
|
|
|
|
end
|