diff --git a/app/jobs/scheduled/reindex_search.rb b/app/jobs/scheduled/reindex_search.rb
index e354b36d007..0e9768d2922 100644
--- a/app/jobs/scheduled/reindex_search.rb
+++ b/app/jobs/scheduled/reindex_search.rb
@@ -1,7 +1,7 @@
 module Jobs
   # if locale changes or search algorithm changes we may want to reindex stuff
   class ReindexSearch < Jobs::Scheduled
-    every 1.day
+    every 2.hours

     def execute(args)
       rebuild_problem_topics
@@ -38,13 +38,14 @@ module Jobs
       end
     end

-    def rebuild_problem_posts(limit = 10000)
+    def rebuild_problem_posts(limit = 20000)
       post_ids = load_problem_post_ids(limit)

       post_ids.each do |id|
-        post = Post.find_by(id: id) # could be deleted while iterating through batch
-        SearchIndexer.index(post, force: true) if post
+        if post = Post.find_by(id: id)
+          SearchIndexer.index(post, force: true)
+        end
       end
     end

@@ -67,6 +68,7 @@ module Jobs
         WHERE pd.post_id IS NULL
       )', SiteSetting.default_locale, Search::INDEX_VERSION)
         .limit(limit)
+        .order('posts.id DESC')
         .pluck(:id)
     end

diff --git a/app/models/topic.rb b/app/models/topic.rb
index ac6af153922..c49b60625a6 100644
--- a/app/models/topic.rb
+++ b/app/models/topic.rb
@@ -223,10 +223,15 @@ class Topic < ActiveRecord::Base
       ApplicationController.banner_json_cache.clear
     end

-    if tags_changed
-      TagUser.auto_watch(topic_id: id)
-      TagUser.auto_track(topic_id: id)
-      self.tags_changed = false
+    if tags_changed || saved_change_to_attribute?(:category_id)
+
+      SearchIndexer.queue_post_reindex(self.id)
+
+      if tags_changed
+        TagUser.auto_watch(topic_id: id)
+        TagUser.auto_track(topic_id: id)
+        self.tags_changed = false
+      end
     end

     SearchIndexer.index(self)
@@ -474,7 +479,7 @@ class Topic < ActiveRecord::Base
     search_data = "#{title} #{raw[0...MAX_SIMILAR_BODY_LENGTH]}".strip
     filter_words = Search.prepare_data(search_data)
-    ts_query = Search.ts_query(filter_words, nil, "|")
+    ts_query = Search.ts_query(term: filter_words, joiner: "|")

     candidates = Topic
       .visible
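A note on the Topic change above: when tags or the category change, queue_post_reindex only marks the topic's post_search_data rows as stale (version = 0); the actual rebuild is deferred to the ReindexSearch job, which now runs every two hours. A rough console sketch of that flow, with illustrative records (not part of the patch):

    # changing the category fires the save callback patched above
    topic.update!(category_id: other_category.id)

    # the rows stay stale until the scheduled job (or a manual run) rebuilds them
    PostSearchData.where(post_id: topic.posts.pluck(:id)).pluck(:version)  # => [0, 0, ...]
    Jobs::ReindexSearch.new.rebuild_problem_posts
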
diff --git a/app/models/user_search.rb b/app/models/user_search.rb
index 8a2013b5b50..a9eceda1e9a 100644
--- a/app/models/user_search.rb
+++ b/app/models/user_search.rb
@@ -49,7 +49,7 @@ class UserSearch

     if @term.present?
       if SiteSetting.enable_names? && @term !~ /[_\.-]/
-        query = Search.ts_query(@term, "simple")
+        query = Search.ts_query(term: @term, ts_config: "simple")

         users = users.includes(:user_search_data)
           .references(:user_search_data)
diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb
index fdf6b506690..645866db8be 100644
--- a/app/services/search_indexer.rb
+++ b/app/services/search_indexer.rb
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 require_dependency 'search'

 class SearchIndexer
@@ -14,111 +15,152 @@ class SearchIndexer
     HtmlScrubber.scrub(html)
   end

-  def self.update_index(table, id, raw_data)
-    raw_data = Search.prepare_data(raw_data, :index)
-
-    table_name = "#{table}_search_data"
-    foreign_key = "#{table}_id"
-
+  def self.inject_extra_terms(raw)
     # insert some extra words for I.am.a.word so "word" is tokenized
     # I.am.a.word becomes I.am.a.word am a word
-    search_data = raw_data.gsub(/[^[:space:]]*[\.]+[^[:space:]]*/) do |with_dot|
+    raw.gsub(/[^[:space:]]*[\.]+[^[:space:]]*/) do |with_dot|
       split = with_dot.split(".")
       if split.length > 1
-        with_dot + (" " << split[1..-1].join(" "))
+        with_dot + ((+" ") << split[1..-1].join(" "))
       else
         with_dot
       end
     end
+  end
+
+  def self.update_index(table: , id: , raw_data:)
+    search_data = raw_data.map do |data|
+      inject_extra_terms(Search.prepare_data(data || "", :index))
+    end
+
+    table_name = "#{table}_search_data"
+    foreign_key = "#{table}_id"

     # for user login and name use "simple" lowercase stemmer
     stemmer = table == "user" ? "simple" : Search.ts_config

+    ranked_index = <<~SQL
+      setweight(to_tsvector('#{stemmer}', coalesce(:a,'')), 'A') ||
+      setweight(to_tsvector('#{stemmer}', coalesce(:b,'')), 'B') ||
+      setweight(to_tsvector('#{stemmer}', coalesce(:c,'')), 'C') ||
+      setweight(to_tsvector('#{stemmer}', coalesce(:d,'')), 'D')
+    SQL
+
+    indexed_data = search_data.select { |d| d.length > 0 }.join(' ')
+
+    params = {
+      a: search_data[0],
+      b: search_data[1],
+      c: search_data[2],
+      d: search_data[3],
+      raw_data: indexed_data,
+      id: id,
+      locale: SiteSetting.default_locale,
+      version: Search::INDEX_VERSION
+    }
+
     # Would be nice to use AR here but not sure how to execute Postgres functions
     # when inserting data like this.
-    rows = Post.exec_sql_row_count("UPDATE #{table_name}
-                                    SET
-                                       raw_data = :raw_data,
-                                       locale = :locale,
-                                       search_data = TO_TSVECTOR('#{stemmer}', :search_data),
-                                       version = :version
-                                    WHERE #{foreign_key} = :id",
-                                    raw_data: raw_data,
-                                    search_data: search_data,
-                                    id: id,
-                                    locale: SiteSetting.default_locale,
-                                    version: Search::INDEX_VERSION)
+    rows = Post.exec_sql_row_count(<<~SQL, params)
+      UPDATE #{table_name}
+      SET
+        raw_data = :raw_data,
+        locale = :locale,
+        search_data = #{ranked_index},
+        version = :version
+      WHERE #{foreign_key} = :id
+    SQL
+
     if rows == 0
-      Post.exec_sql("INSERT INTO #{table_name}
-                     (#{foreign_key}, search_data, locale, raw_data, version)
-                     VALUES (:id, TO_TSVECTOR('#{stemmer}', :search_data), :locale, :raw_data, :version)",
-                     raw_data: raw_data,
-                     search_data: search_data,
-                     id: id,
-                     locale: SiteSetting.default_locale,
-                     version: Search::INDEX_VERSION)
+      Post.exec_sql(<<~SQL, params)
+        INSERT INTO #{table_name}
+        (#{foreign_key}, search_data, locale, raw_data, version)
+        VALUES (:id, #{ranked_index}, :locale, :raw_data, :version)
+      SQL
     end
   rescue
-    # don't allow concurrency to mess up saving a post
+    # TODO is there any way we can safely avoid this?
+    # best way is probably pushing the search indexer into a dedicated process
+    # so it no longer happens on save and instead runs in the post processor
   end

   def self.update_topics_index(topic_id, title, cooked)
-    search_data = title.dup << " " << scrub_html_for_search(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]
-    update_index('topic', topic_id, search_data)
+    scrubbed_cooked = scrub_html_for_search(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]
+
+    # a bit inconsistent that we use title as A and body as B when in
+    # the post index body is C
+    update_index(table: 'topic', id: topic_id, raw_data: [title, scrubbed_cooked])
   end

-  def self.update_posts_index(post_id, cooked, title, category)
-    search_data = scrub_html_for_search(cooked) << " " << title.dup.force_encoding('UTF-8')
-    search_data << " " << category if category
-    update_index('post', post_id, search_data)
+  def self.update_posts_index(post_id, title, category, tags, cooked)
+    update_index(table: 'post', id: post_id, raw_data: [title, category, tags, scrub_html_for_search(cooked)])
   end

   def self.update_users_index(user_id, username, name)
-    search_data = username.dup << " " << (name || "")
-    update_index('user', user_id, search_data)
+    update_index(table: 'user', id: user_id, raw_data: [username, name])
   end

   def self.update_categories_index(category_id, name)
-    update_index('category', category_id, name)
+    update_index(table: 'category', id: category_id, raw_data: [name])
   end

   def self.update_tags_index(tag_id, name)
-    update_index('tag', tag_id, name)
+    update_index(table: 'tag', id: tag_id, raw_data: [name])
+  end
+
+  def self.queue_post_reindex(topic_id)
+    return if @disabled
+
+    ActiveRecord::Base.exec_sql(<<~SQL, topic_id: topic_id)
+      UPDATE post_search_data
+      SET version = 0
+      WHERE post_id IN (SELECT id FROM posts WHERE topic_id = :topic_id)
+    SQL
   end

   def self.index(obj, force: false)
     return if @disabled

-    if obj.class == Post && (obj.saved_change_to_cooked? || force)
-      if obj.topic
-        category_name = obj.topic.category.name if obj.topic.category
-        SearchIndexer.update_posts_index(obj.id, obj.cooked, obj.topic.title, category_name)
-        SearchIndexer.update_topics_index(obj.topic_id, obj.topic.title, obj.cooked) if obj.is_first_post?
+    category_name, tag_names = nil
+    topic = nil
+
+    if Topic === obj
+      topic = obj
+    elsif Post === obj
+      topic = obj.topic
+    end
+
+    category_name = topic.category&.name if topic
+    tag_names = topic.tags.pluck(:name).join(' ') if topic
+
+    if Post === obj && (obj.saved_change_to_cooked? || force)
+      if topic
+        SearchIndexer.update_posts_index(obj.id, topic.title, category_name, tag_names, obj.cooked)
+        SearchIndexer.update_topics_index(topic.id, topic.title, obj.cooked) if obj.is_first_post?
       else
         Rails.logger.warn("Orphan post skipped in search_indexer, topic_id: #{obj.topic_id} post_id: #{obj.id} raw: #{obj.raw}")
       end
     end

-    if obj.class == User && (obj.saved_change_to_username? || obj.saved_change_to_name? || force)
+    if User === obj && (obj.saved_change_to_username? || obj.saved_change_to_name? || force)
       SearchIndexer.update_users_index(obj.id, obj.username_lower || '', obj.name ? obj.name.downcase : '')
     end

-    if obj.class == Topic && (obj.saved_change_to_title? || force)
+    if Topic === obj && (obj.saved_change_to_title? || force)
       if obj.posts
         post = obj.posts.find_by(post_number: 1)
         if post
-          category_name = obj.category.name if obj.category
-          SearchIndexer.update_posts_index(post.id, post.cooked, obj.title, category_name)
+          SearchIndexer.update_posts_index(post.id, obj.title, category_name, tag_names, post.cooked)
           SearchIndexer.update_topics_index(obj.id, obj.title, post.cooked)
         end
       end
     end

-    if obj.class == Category && (obj.saved_change_to_name? || force)
+    if Category === obj && (obj.saved_change_to_name? || force)
       SearchIndexer.update_categories_index(obj.id, obj.name)
     end

-    if obj.class == Tag && (obj.saved_change_to_name? || force)
+    if Tag === obj && (obj.saved_change_to_name? || force)
       SearchIndexer.update_tags_index(obj.id, obj.name)
     end
   end
@@ -127,14 +169,14 @@ class SearchIndexer
     attr_reader :scrubbed

     def initialize
-      @scrubbed = ""
+      @scrubbed = +""
     end

     def self.scrub(html)
       me = new
       parser = Nokogiri::HTML::SAX::Parser.new(me)
       begin
-        copy = "<div>"
+        copy = +"<div>"
         copy << html unless html.nil?
         copy << "</div>"
         parser.parse(html) unless html.nil?
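The ranked_index fragment above is what attaches a PostgreSQL weight to each field: title as A, category as B, tags as C, cooked body as D. A minimal sketch of what it evaluates to, runnable from a console through the same Post.exec_sql helper this file already uses (the sample strings and the 'english' config are illustrative):

    Post.exec_sql(<<~SQL).values
      SELECT setweight(to_tsvector('english', 'weighted search'), 'A') ||
             setweight(to_tsvector('english', 'meta'), 'B') ||
             setweight(to_tsvector('english', 'feature search'), 'C') ||
             setweight(to_tsvector('english', 'the body mentions search too'), 'D')
    SQL
    # => one tsvector in which every lexeme carries its weight, roughly:
    #    'bodi':7D 'featur':4C 'mention':8D 'meta':3B 'search':2A,5C,9D 'weight':1A
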
diff --git a/lib/search.rb b/lib/search.rb
index 30c5b00b996..a391ebbd5a3 100644
--- a/lib/search.rb
+++ b/lib/search.rb
@@ -1,7 +1,7 @@
 require_dependency 'search/grouped_search_results'

 class Search
-  INDEX_VERSION = 1.freeze
+  INDEX_VERSION = 2.freeze

   def self.per_facet
     5
@@ -409,7 +409,7 @@ class Search
       if match.to_s.length >= SiteSetting.min_search_term_length
         posts.where("posts.id IN (
           SELECT post_id FROM post_search_data pd1
-          WHERE pd1.search_data @@ #{Search.ts_query("##{match}")})")
+          WHERE pd1.search_data @@ #{Search.ts_query(term: "##{match}")})")
       end
     end
   end
@@ -511,12 +511,17 @@ class Search
       end
     end

+    @in_title = false
+
     if word == 'order:latest' || word == 'l'
       @order = :latest
       nil
     elsif word == 'order:latest_topic'
       @order = :latest_topic
       nil
+    elsif word == 'in:title'
+      @in_title = true
+      nil
     elsif word =~ /topic:(\d+)/
       topic_id = $1.to_i
       if topic_id > 1
@@ -681,7 +686,12 @@ class Search
         posts = posts.joins('JOIN users u ON u.id = posts.user_id')
         posts = posts.where("posts.raw || ' ' || u.username || ' ' || COALESCE(u.name, '') ilike ?", "%#{term_without_quote}%")
       else
-        posts = posts.where("post_search_data.search_data @@ #{ts_query}")
+        # A is for title
+        # B is for category
+        # C is for tags
+        # D is for cooked
+        weights = @in_title ? 'A' : (SiteSetting.tagging_enabled ? 'ABCD' : 'ABD')
+        posts = posts.where("post_search_data.search_data @@ #{ts_query(weight_filter: weights)}")
         exact_terms = @term.scan(/"([^"]+)"/).flatten
         exact_terms.each do |exact|
           posts = posts.where("posts.raw ilike ?", "%#{exact}%")
@@ -743,11 +753,9 @@ class Search
           posts = posts.order("posts.like_count DESC")
         end
       else
-        posts = posts.order("TS_RANK_CD(TO_TSVECTOR(#{default_ts_config}, topics.title), #{ts_query}) DESC")
-
         data_ranking = "TS_RANK_CD(post_search_data.search_data, #{ts_query})"
         if opts[:aggregate_search]
-          posts = posts.order("SUM(#{data_ranking}) DESC")
+          posts = posts.order("MAX(#{data_ranking}) DESC")
         else
           posts = posts.order("#{data_ranking} DESC")
         end
@@ -772,7 +780,7 @@ class Search
     self.class.default_ts_config
   end

-  def self.ts_query(term, ts_config = nil, joiner = "&")
+  def self.ts_query(term: , ts_config: nil, joiner: "&", weight_filter: nil)

     data = Post.exec_sql("SELECT TO_TSVECTOR(:config, :term)",
                          config: 'simple',
@@ -786,16 +794,17 @@ class Search

     query = ActiveRecord::Base.connection.quote(
       all_terms
-        .map { |t| "'#{PG::Connection.escape_string(t)}':*" }
+        .map { |t| "'#{PG::Connection.escape_string(t)}':*#{weight_filter}" }
         .join(" #{joiner} ")
     )

     "TO_TSQUERY(#{ts_config || default_ts_config}, #{query})"
   end

-  def ts_query(ts_config = nil)
+  def ts_query(ts_config = nil, weight_filter: nil)
     @ts_query_cache ||= {}
-    @ts_query_cache["#{ts_config || default_ts_config} #{@term}"] ||= Search.ts_query(@term, ts_config)
+    @ts_query_cache["#{ts_config || default_ts_config} #{@term} #{weight_filter}"] ||=
+      Search.ts_query(term: @term, ts_config: ts_config, weight_filter: weight_filter)
   end

   def wrap_rows(query)
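On the query side, ts_query now appends the weight letters to every prefix term; that is how in:title restricts matches to title (A) lexemes and how tag (C) matches are skipped when tagging is disabled. Roughly what the helper returns for a one-word term, assuming the default English config (the exact quoting comes from connection.quote):

    Search.ts_query(term: "ruby", weight_filter: "A")
    # => "TO_TSQUERY('english', '''ruby'':*A')"      # title-only lexemes, i.e. in:title
    Search.ts_query(term: "ruby", weight_filter: "ABD")
    # => "TO_TSQUERY('english', '''ruby'':*ABD')"    # everything except tags
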
" + copy = +"
" copy << html unless html.nil? copy << "
" parser.parse(html) unless html.nil? diff --git a/lib/search.rb b/lib/search.rb index 30c5b00b996..a391ebbd5a3 100644 --- a/lib/search.rb +++ b/lib/search.rb @@ -1,7 +1,7 @@ require_dependency 'search/grouped_search_results' class Search - INDEX_VERSION = 1.freeze + INDEX_VERSION = 2.freeze def self.per_facet 5 @@ -409,7 +409,7 @@ class Search if match.to_s.length >= SiteSetting.min_search_term_length posts.where("posts.id IN ( SELECT post_id FROM post_search_data pd1 - WHERE pd1.search_data @@ #{Search.ts_query("##{match}")})") + WHERE pd1.search_data @@ #{Search.ts_query(term: "##{match}")})") end end end @@ -511,12 +511,17 @@ class Search end end + @in_title = false + if word == 'order:latest' || word == 'l' @order = :latest nil elsif word == 'order:latest_topic' @order = :latest_topic nil + elsif word == 'in:title' + @in_title = true + nil elsif word =~ /topic:(\d+)/ topic_id = $1.to_i if topic_id > 1 @@ -681,7 +686,12 @@ class Search posts = posts.joins('JOIN users u ON u.id = posts.user_id') posts = posts.where("posts.raw || ' ' || u.username || ' ' || COALESCE(u.name, '') ilike ?", "%#{term_without_quote}%") else - posts = posts.where("post_search_data.search_data @@ #{ts_query}") + # A is for title + # B is for category + # C is for tags + # D is for cooked + weights = @in_title ? 'A' : (SiteSetting.tagging_enabled ? 'ABCD' : 'ABD') + posts = posts.where("post_search_data.search_data @@ #{ts_query(weight_filter: weights)}") exact_terms = @term.scan(/"([^"]+)"/).flatten exact_terms.each do |exact| posts = posts.where("posts.raw ilike ?", "%#{exact}%") @@ -743,11 +753,9 @@ class Search posts = posts.order("posts.like_count DESC") end else - posts = posts.order("TS_RANK_CD(TO_TSVECTOR(#{default_ts_config}, topics.title), #{ts_query}) DESC") - data_ranking = "TS_RANK_CD(post_search_data.search_data, #{ts_query})" if opts[:aggregate_search] - posts = posts.order("SUM(#{data_ranking}) DESC") + posts = posts.order("MAX(#{data_ranking}) DESC") else posts = posts.order("#{data_ranking} DESC") end @@ -772,7 +780,7 @@ class Search self.class.default_ts_config end - def self.ts_query(term, ts_config = nil, joiner = "&") + def self.ts_query(term: , ts_config: nil, joiner: "&", weight_filter: nil) data = Post.exec_sql("SELECT TO_TSVECTOR(:config, :term)", config: 'simple', @@ -786,16 +794,17 @@ class Search query = ActiveRecord::Base.connection.quote( all_terms - .map { |t| "'#{PG::Connection.escape_string(t)}':*" } + .map { |t| "'#{PG::Connection.escape_string(t)}':*#{weight_filter}" } .join(" #{joiner} ") ) "TO_TSQUERY(#{ts_config || default_ts_config}, #{query})" end - def ts_query(ts_config = nil) + def ts_query(ts_config = nil, weight_filter: nil) @ts_query_cache ||= {} - @ts_query_cache["#{ts_config || default_ts_config} #{@term}"] ||= Search.ts_query(@term, ts_config) + @ts_query_cache["#{ts_config || default_ts_config} #{@term} #{weight_filter}"] ||= + Search.ts_query(term: @term, ts_config: ts_config, weight_filter: weight_filter) end def wrap_rows(query) diff --git a/lib/tasks/search.rake b/lib/tasks/search.rake index 7d1ce3ab30c..bb609dff673 100644 --- a/lib/tasks/search.rake +++ b/lib/tasks/search.rake @@ -5,52 +5,37 @@ end def reindex_search(db = RailsMultisite::ConnectionManagement.current_db) puts "Reindexing '#{db}'" puts "" - puts "Posts:" - Post.exec_sql("select p.id, p.cooked, c.name category, t.title, p.post_number, t.id topic_id from - posts p - join topics t on t.id = p.topic_id - left join categories c on c.id = t.category_id - ").each do |p| - post_id = p["id"] - cooked 
= p["cooked"] - title = p["title"] - category = p["cat"] - post_number = p["post_number"].to_i - topic_id = p["topic_id"].to_i - - SearchIndexer.update_posts_index(post_id, cooked, title, category) - SearchIndexer.update_topics_index(topic_id, title , cooked) if post_number == 1 - + puts "Posts" + Post.includes(topic: [:category, :tags]).find_each do |p| + if p.post_number == 1 + SearchIndexer.index(p.topic, force: true) + else + SearchIndexer.index(p, force: true) + end putc "." end puts - puts "Users:" - User.exec_sql("select id, name, username from users").each do |u| - id = u["id"] - name = u["name"] - username = u["username"] - SearchIndexer.update_users_index(id, username, name) - + puts "Users" + User.find_each do |u| + SearchIndexer.index(u, force: true) putc "." end puts puts "Categories" - Category.exec_sql("select id, name from categories").each do |c| - id = c["id"] - name = c["name"] - SearchIndexer.update_categories_index(id, name) - - putc '.' + Category.find_each do |c| + SearchIndexer.index(c, force: true) + putc "." end - puts '', 'Tags' + puts + puts "Tags" - Tag.exec_sql('select id, name from tags').each do |t| - SearchIndexer.update_tags_index(t['id'], t['name']) - putc '.' + Tag.find_each do |t| + SearchIndexer.index(t, force: true) + putc "." end puts diff --git a/spec/components/search_spec.rb b/spec/components/search_spec.rb index 3f2228a990f..ed54be8c1da 100644 --- a/spec/components/search_spec.rb +++ b/spec/components/search_spec.rb @@ -395,6 +395,27 @@ describe Search do let(:tag_group) { Fabricate(:tag_group) } let(:category) { Fabricate(:category) } + context 'post searching' do + it 'can find posts with tags' do + SiteSetting.tagging_enabled = true + + post = Fabricate(:post, raw: 'I am special post') + DiscourseTagging.tag_topic_by_names(post.topic, Guardian.new(Fabricate.build(:admin)), [tag.name]) + post.topic.save + + # we got to make this index (it is deferred) + Jobs::ReindexSearch.new.rebuild_problem_posts + + result = Search.execute(tag.name) + expect(result.posts.length).to eq(1) + + SiteSetting.tagging_enabled = false + + result = Search.execute(tag.name) + expect(result.posts.length).to eq(0) + end + end + context 'tagging is disabled' do before { SiteSetting.tagging_enabled = false } @@ -856,7 +877,7 @@ describe Search do str = " grigio:babel deprecated? 
" str << "page page on Atmosphere](https://atmospherejs.com/grigio/babel)xxx: aaa.js:222 aaa'\"bbb" - ts_query = Search.ts_query(str, "simple") + ts_query = Search.ts_query(term: str, ts_config: "simple") Post.exec_sql("SELECT to_tsvector('bbb') @@ " << ts_query) end @@ -923,6 +944,19 @@ describe Search do end end + context 'in:title' do + it 'allows for search in title' do + topic = Fabricate(:topic, title: 'I am testing a title search') + _post = Fabricate(:post, topic_id: topic.id, raw: 'this is the first post') + + results = Search.execute('title in:title') + expect(results.posts.length).to eq(1) + + results = Search.execute('first in:title') + expect(results.posts.length).to eq(0) + end + end + context 'pagination' do let(:number_of_results) { 2 } let!(:post1) { Fabricate(:post, raw: 'hello hello hello hello hello') } diff --git a/spec/services/search_indexer_spec.rb b/spec/services/search_indexer_spec.rb index efbfce82c1b..fb73ea2fbb4 100644 --- a/spec/services/search_indexer_spec.rb +++ b/spec/services/search_indexer_spec.rb @@ -7,7 +7,7 @@ describe SearchIndexer do data = "你好世界" expect(data.split(" ").length).to eq(1) - SearchIndexer.update_posts_index(post_id, "你好世界", "", nil) + SearchIndexer.update_posts_index(post_id, "你好世界", "", "", nil) raw_data = PostSearchData.where(post_id: post_id).pluck(:raw_data)[0] expect(raw_data.split(' ').length).to eq(2) @@ -15,18 +15,18 @@ describe SearchIndexer do it 'correctly indexes a post according to version' do # Preparing so that they can be indexed to right version - SearchIndexer.update_posts_index(post_id, "dummy", "", nil) + SearchIndexer.update_posts_index(post_id, "dummy", "", nil, nil) PostSearchData.find_by(post_id: post_id).update_attributes!(version: -1) data = "This is a test" - SearchIndexer.update_posts_index(post_id, data, "", nil) + SearchIndexer.update_posts_index(post_id, "", "", nil, data) raw_data, locale, version = PostSearchData.where(post_id: post_id).pluck(:raw_data, :locale, :version)[0] expect(raw_data).to eq("This is a test") expect(locale).to eq("en") expect(version).to eq(Search::INDEX_VERSION) - SearchIndexer.update_posts_index(post_id, "tester", "", nil) + SearchIndexer.update_posts_index(post_id, "tester", "", nil, nil) raw_data = PostSearchData.where(post_id: post_id).pluck(:raw_data)[0] expect(raw_data).to eq("tester")