FIX: reindex_search job should work on model with no search data (#11819)

Lots of changes but it's mostly a refactoring.

The interesting part that was fix are the 'load_problem_<model>_ids' methods.
They will now return records with no search data associated so they can be properly indexed for the search.
This "bad" state usually happens after a migration.
This commit is contained in:
Régis Hanol 2021-01-25 11:23:36 +01:00 committed by GitHub
parent fcbb6c4143
commit aa1138ff71
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 204 additions and 268 deletions

View File

@ -1041,18 +1041,14 @@ class UsersController < ApplicationController
def search_users
term = params[:term].to_s.strip
topic_id = params[:topic_id]
topic_id = topic_id.to_i if topic_id
category_id = params[:category_id].to_i if category_id.present?
topic_id = params[:topic_id].to_i if params[:topic_id].present?
category_id = params[:category_id].to_i if params[:category_id].present?
topic_allowed_users = params[:topic_allowed_users] || false
group_names = params[:groups] || []
group_names << params[:group] if params[:group]
if group_names.present?
@groups = Group.where(name: group_names)
end
@groups = Group.where(name: group_names) if group_names.present?
options = {
topic_allowed_users: topic_allowed_users,
@ -1060,13 +1056,8 @@ class UsersController < ApplicationController
groups: @groups
}
if topic_id
options[:topic_id] = topic_id
end
if category_id
options[:category_id] = category_id
end
options[:topic_id] = topic_id if topic_id
options[:category_id] = category_id if category_id
results = UserSearch.new(term, options).search
@ -1075,8 +1066,10 @@ class UsersController < ApplicationController
to_render = { users: results.as_json(only: user_fields, methods: [:avatar_template]) }
# blank term is only handy for in-topic search of users after @
# we do not want group results ever if term is blank
groups =
if current_user
if term.present? && current_user
if params[:include_groups] == 'true'
Group.visible_groups(current_user)
elsif params[:include_mentionable_groups] == 'true'
@ -1086,10 +1079,6 @@ class UsersController < ApplicationController
end
end
# blank term is only handy for in-topic search of users after @
# we do not want group results ever if term is blank
groups = nil if term.blank?
if groups
groups = Group.search_groups(term, groups: groups)
groups = groups.order('groups.name asc')

View File

@ -1,201 +1,162 @@
# frozen_string_literal: true
module Jobs
# if locale changes or search algorithm changes we may want to reindex stuff
class ReindexSearch < ::Jobs::Scheduled
every 2.hours
CLEANUP_GRACE_PERIOD = 1.day.ago
def execute(args)
@verbose = true if args && Hash === args && args[:verbose]
@verbose = args[:verbose]
@cleanup_grace_period = 1.day.ago
rebuild_problem_topics
rebuild_problem_posts
rebuild_problem_categories
rebuild_problem_users
rebuild_problem_tags
clean_post_search_data
clean_topic_search_data
rebuild_categories
rebuild_tags
rebuild_topics
rebuild_posts
rebuild_users
@verbose = nil
clean_topics
clean_posts
end
def rebuild_problem_categories(limit: 500)
def rebuild_categories(limit: 500, indexer: SearchIndexer)
category_ids = load_problem_category_ids(limit)
if @verbose
puts "rebuilding #{category_ids.length} categories"
end
puts "rebuilding #{category_ids.size} categories" if @verbose
category_ids.each do |id|
category = Category.find_by(id: id)
SearchIndexer.index(category, force: true) if category
indexer.index(category, force: true) if category
end
end
def rebuild_problem_users(limit: 10000)
user_ids = load_problem_user_ids(limit)
if @verbose
puts "rebuilding #{user_ids.length} users"
end
user_ids.each do |id|
user = User.find_by(id: id)
SearchIndexer.index(user, force: true) if user
end
end
def rebuild_problem_topics(limit: 10000)
topic_ids = load_problem_topic_ids(limit)
if @verbose
puts "rebuilding #{topic_ids.length} topics"
end
topic_ids.each do |id|
topic = Topic.find_by(id: id)
SearchIndexer.index(topic, force: true) if topic
end
end
def rebuild_problem_posts(limit: 20000, indexer: SearchIndexer, verbose: false)
post_ids = load_problem_post_ids(limit)
verbose ||= @verbose
if verbose
puts "rebuilding #{post_ids.length} posts"
end
i = 0
post_ids.each do |id|
# could be deleted while iterating through batch
if post = Post.find_by(id: id)
indexer.index(post, force: true)
i += 1
if verbose && i % 1000 == 0
puts "#{i} posts reindexed"
end
end
end
end
def rebuild_problem_tags(limit: 10000)
def rebuild_tags(limit: 1_000, indexer: SearchIndexer)
tag_ids = load_problem_tag_ids(limit)
if @verbose
puts "rebuilding #{tag_ids.length} tags"
end
puts "rebuilding #{tag_ids.size} tags" if @verbose
tag_ids.each do |id|
tag = Tag.find_by(id: id)
SearchIndexer.index(tag, force: true) if tag
indexer.index(tag, force: true) if tag
end
end
private
def rebuild_topics(limit: 10_000, indexer: SearchIndexer)
topic_ids = load_problem_topic_ids(limit)
def clean_post_search_data
puts "cleaning up post search data" if @verbose
puts "rebuilding #{topic_ids.size} topics" if @verbose
PostSearchData
.joins("LEFT JOIN posts p ON p.id = post_search_data.post_id")
.where("p.raw = ''")
.delete_all
DB.exec(<<~SQL, deleted_at: CLEANUP_GRACE_PERIOD)
DELETE FROM post_search_data
WHERE post_id IN (
SELECT post_id
FROM post_search_data
LEFT JOIN posts ON post_search_data.post_id = posts.id
INNER JOIN topics ON posts.topic_id = topics.id
WHERE (topics.deleted_at IS NOT NULL
AND topics.deleted_at <= :deleted_at) OR (
posts.deleted_at IS NOT NULL AND
posts.deleted_at <= :deleted_at
)
)
SQL
topic_ids.each do |id|
topic = Topic.find_by(id: id)
indexer.index(topic, force: true) if topic
end
end
def clean_topic_search_data
def rebuild_posts(limit: 20_000, indexer: SearchIndexer)
post_ids = load_problem_post_ids(limit)
puts "rebuilding #{post_ids.size} posts" if @verbose
post_ids.each do |id|
post = Post.find_by(id: id)
indexer.index(post, force: true) if post
end
end
def rebuild_users(limit: 5_000, indexer: SearchIndexer)
user_ids = load_problem_user_ids(limit)
puts "rebuilding #{user_ids.size} users" if @verbose
user_ids.each do |id|
user = User.find_by(id: id)
indexer.index(user, force: true) if user
end
end
def clean_topics
puts "cleaning up topic search data" if @verbose
DB.exec(<<~SQL, deleted_at: CLEANUP_GRACE_PERIOD)
DELETE FROM topic_search_data
WHERE topic_id IN (
SELECT topic_id
FROM topic_search_data
INNER JOIN topics ON topic_search_data.topic_id = topics.id
WHERE topics.deleted_at IS NOT NULL
AND topics.deleted_at <= :deleted_at
)
# remove search data from deleted topics
DB.exec(<<~SQL, deleted_at: @cleanup_grace_period)
DELETE FROM topic_search_data
WHERE topic_id IN (
SELECT topic_id
FROM topic_search_data
LEFT JOIN topics ON topic_id = topics.id
WHERE topics.id IS NULL
OR (deleted_at IS NOT NULL AND deleted_at <= :deleted_at)
)
SQL
end
def load_problem_post_ids(limit)
params = {
locale: SiteSetting.default_locale,
version: SearchIndexer::MIN_POST_REINDEX_VERSION,
limit: limit
}
def clean_posts
puts "cleaning up post search data" if @verbose
DB.query_single(<<~SQL, params)
SELECT
posts.id
FROM posts
JOIN topics ON topics.id = posts.topic_id
LEFT JOIN post_search_data pd
ON pd.locale = :locale
AND pd.version >= :version
AND pd.post_id = posts.id
WHERE pd.post_id IS NULL
AND posts.deleted_at IS NULL
AND topics.deleted_at IS NULL
AND posts.raw != ''
ORDER BY posts.id DESC
LIMIT :limit
# remove search data from deleted/empty posts
DB.exec(<<~SQL, deleted_at: @cleanup_grace_period)
DELETE FROM post_search_data
WHERE post_id IN (
SELECT post_id
FROM post_search_data
LEFT JOIN posts ON post_id = posts.id
JOIN topics ON posts.topic_id = topics.id
WHERE posts.id IS NULL
OR posts.raw = ''
OR (posts.deleted_at IS NOT NULL AND posts.deleted_at <= :deleted_at)
OR (topics.deleted_at IS NOT NULL AND topics.deleted_at <= :deleted_at)
)
SQL
end
def load_problem_category_ids(limit)
Category.joins(:category_search_data)
.where('category_search_data.locale != ?
OR category_search_data.version != ?', SiteSetting.default_locale, SearchIndexer::CATEGORY_INDEX_VERSION)
.order('categories.id asc')
.limit(limit)
.pluck(:id)
end
def load_problem_topic_ids(limit)
Topic.joins(:topic_search_data)
.where('topic_search_data.locale != ?
OR topic_search_data.version != ?', SiteSetting.default_locale, SearchIndexer::TOPIC_INDEX_VERSION)
.order('topics.id desc')
.limit(limit)
.pluck(:id)
end
def load_problem_user_ids(limit)
User.joins(:user_search_data)
.where('user_search_data.locale != ?
OR user_search_data.version != ?', SiteSetting.default_locale, SearchIndexer::USER_INDEX_VERSION)
.order('users.id asc')
Category
.joins("LEFT JOIN category_search_data ON category_id = categories.id")
.where("category_search_data.locale IS NULL OR category_search_data.locale != ? OR category_search_data.version != ?", SiteSetting.default_locale, SearchIndexer::CATEGORY_INDEX_VERSION)
.order("categories.id ASC")
.limit(limit)
.pluck(:id)
end
def load_problem_tag_ids(limit)
Tag.joins(:tag_search_data)
.where('tag_search_data.locale != ?
OR tag_search_data.version != ?', SiteSetting.default_locale, SearchIndexer::TAG_INDEX_VERSION)
.order('tags.id asc')
Tag
.joins("LEFT JOIN tag_search_data ON tag_id = tags.id")
.where("tag_search_data.locale IS NULL OR tag_search_data.locale != ? OR tag_search_data.version != ?", SiteSetting.default_locale, SearchIndexer::TAG_INDEX_VERSION)
.order("tags.id ASC")
.limit(limit)
.pluck(:id)
end
def load_problem_topic_ids(limit)
Topic
.joins("LEFT JOIN topic_search_data ON topic_id = topics.id")
.where("topic_search_data.locale IS NULL OR topic_search_data.locale != ? OR topic_search_data.version != ?", SiteSetting.default_locale, SearchIndexer::TOPIC_INDEX_VERSION)
.order("topics.id DESC")
.limit(limit)
.pluck(:id)
end
def load_problem_post_ids(limit)
Post
.joins(:topic)
.joins("LEFT JOIN post_search_data ON post_id = posts.id")
.where("posts.raw != ''")
.where("topics.deleted_at IS NULL")
.where("post_search_data.locale IS NULL OR post_search_data.locale != ? OR post_search_data.version != ?", SiteSetting.default_locale, SearchIndexer::POST_INDEX_VERSION)
.order("posts.id DESC")
.limit(limit)
.pluck(:id)
end
def load_problem_user_ids(limit)
User
.joins("LEFT JOIN user_search_data ON user_id = users.id")
.where("user_search_data.locale IS NULL OR user_search_data.locale != ? OR user_search_data.version != ?", SiteSetting.default_locale, SearchIndexer::USER_INDEX_VERSION)
.order("users.id ASC")
.limit(limit)
.pluck(:id)
end
end
end

View File

@ -1,13 +1,12 @@
# frozen_string_literal: true
# Searches for a user by username or full text or name (if enabled in SiteSettings)
class UserSearch
MAX_SIZE_PRIORITY_MENTION ||= 500
def initialize(term, opts = {})
@term = term
@term_like = "#{term.downcase.gsub("_", "\\_")}%"
@term = term.downcase
@term_like = @term.gsub("_", "\\_") + "%"
@topic_id = opts[:topic_id]
@category_id = opts[:category_id]
@topic_allowed_users = opts[:topic_allowed_users]
@ -15,131 +14,125 @@ class UserSearch
@include_staged_users = opts[:include_staged_users] || false
@limit = opts[:limit] || 20
@groups = opts[:groups]
@topic = Topic.find(@topic_id) if @topic_id
@category = Category.find(@category_id) if @category_id
@guardian = Guardian.new(@searching_user)
@guardian.ensure_can_see_groups_members! @groups if @groups
@guardian.ensure_can_see_category! Category.find(@category_id) if @category_id
@guardian.ensure_can_see_topic! Topic.find(@topic_id) if @topic_id
@guardian.ensure_can_see_groups_members!(@groups) if @groups
@guardian.ensure_can_see_category!(@category) if @category
@guardian.ensure_can_see_topic!(@topic) if @topic
end
def scoped_users
users = User.where(active: true)
users = users.where(staged: false) unless @include_staged_users
users = users.not_suspended unless @searching_user&.staff?
if @groups
users = users.joins("INNER JOIN group_users ON group_users.user_id = users.id")
users = users
.joins(:group_users)
.where("group_users.group_id IN (?)", @groups.map(&:id))
end
unless @searching_user && @searching_user.staff?
users = users.not_suspended
end
# Only show users who have access to private topic
if @topic_id && @topic_allowed_users == "true"
topic = Topic.find_by(id: @topic_id)
if topic.category && topic.category.read_restricted
users = users.includes(:secure_categories)
.where("users.admin = TRUE OR categories.id = ?", topic.category.id)
.references(:categories)
end
end
users.limit(@limit)
end
def filtered_by_term_users
users = scoped_users
if @term.present?
if SiteSetting.enable_names? && @term !~ /[_\.-]/
query = Search.ts_query(term: @term, ts_config: "simple")
users = users.includes(:user_search_data)
.references(:user_search_data)
.where("user_search_data.search_data @@ #{query}")
.order(DB.sql_fragment("CASE WHEN username_lower LIKE ? THEN 0 ELSE 1 END ASC", @term_like))
else
users = users.where("username_lower LIKE :term_like", term_like: @term_like)
end
if @topic_allowed_users == "true" && @topic&.category&.read_restricted
users = users
.references(:categories)
.includes(:secure_categories)
.where("users.admin OR categories.id = ?", @topic.category_id)
end
users
end
def filtered_by_term_users
if @term.blank?
scoped_users
elsif SiteSetting.enable_names? && @term !~ /[_\.-]/
query = Search.ts_query(term: @term, ts_config: "simple")
scoped_users
.includes(:user_search_data)
.where("user_search_data.search_data @@ #{query}")
.order(DB.sql_fragment("CASE WHEN username_lower LIKE ? THEN 0 ELSE 1 END ASC", @term_like))
else
scoped_users.where("username_lower LIKE :term_like", term_like: @term_like)
end
end
def search_ids
users = Set.new
# 1. exact username matches
if @term.present?
scoped_users.where(username_lower: @term.downcase)
scoped_users
.where(username_lower: @term)
.limit(@limit)
.pluck(:id)
.each { |id| users << id }
end
return users.to_a if users.length >= @limit
return users.to_a if users.size >= @limit
# 2. in topic
if @topic_id
in_topic = filtered_by_term_users
.where('users.id IN (SELECT p.user_id FROM posts p WHERE topic_id = ?)', @topic_id)
.where('users.id IN (SELECT user_id FROM posts WHERE topic_id = ?)', @topic_id)
if @searching_user.present?
in_topic = in_topic.where('users.id <> ?', @searching_user.id)
end
in_topic
.order('last_seen_at DESC')
.limit(@limit - users.length)
.order('last_seen_at DESC NULLS LAST')
.limit(@limit - users.size)
.pluck(:id)
.each { |id| users << id }
end
return users.to_a if users.length >= @limit
return users.to_a if users.size >= @limit
secure_category_id = nil
# 3. in category
secure_category_id =
if @category_id
DB.query_single(<<~SQL, @category_id).first
SELECT id
FROM categories
WHERE read_restricted
AND id = ?
SQL
elsif @topic_id
DB.query_single(<<~SQL, @topic_id).first
SELECT id
FROM categories
WHERE read_restricted
AND id IN (SELECT category_id FROM topics WHERE id = ?)
SQL
end
if @category_id
secure_category_id = DB.query_single(<<~SQL, @category_id).first
SELECT id FROM categories
WHERE read_restricted AND id = ?
SQL
elsif @topic_id
secure_category_id = DB.query_single(<<~SQL, @topic_id).first
SELECT id FROM categories
WHERE read_restricted AND id IN (
SELECT category_id FROM topics
WHERE id = ?
)
SQL
end
# 3. category matches
if secure_category_id
@searching_user.present?
category_groups = Group.where(<<~SQL, secure_category_id, MAX_SIZE_PRIORITY_MENTION)
groups.id IN (
SELECT group_id FROM category_groups
JOIN groups g ON group_id = g.id
WHERE
category_id = ? AND
user_count < ?
SELECT group_id
FROM category_groups
JOIN groups g ON group_id = g.id
WHERE category_id = ?
AND user_count < ?
)
SQL
category_groups = category_groups.members_visible_groups(@searching_user)
if @searching_user.present?
category_groups = category_groups.members_visible_groups(@searching_user)
end
in_category = filtered_by_term_users
.where(<<~SQL, category_groups.pluck(:id))
users.id IN (
SELECT gu.user_id
FROM group_users gu
WHERE group_id IN (?)
LIMIT 200
FROM group_users gu
WHERE group_id IN (?)
LIMIT 200
)
SQL
@ -148,17 +141,19 @@ class UserSearch
end
in_category
.order('last_seen_at DESC')
.limit(@limit - users.length)
.order('last_seen_at DESC NULLS LAST')
.limit(@limit - users.size)
.pluck(:id)
.each { |id| users << id }
end
return users.to_a if users.length >= @limit
return users.to_a if users.size >= @limit
# 4. global matches
if @term.present?
filtered_by_term_users.order('last_seen_at DESC')
.limit(@limit - users.length)
filtered_by_term_users
.order('last_seen_at DESC NULLS LAST')
.limit(@limit - users.size)
.pluck(:id)
.each { |id| users << id }
end

View File

@ -961,7 +961,7 @@ describe Search do
it 'can find posts with tags' do
# we got to make this index (it is deferred)
Jobs::ReindexSearch.new.rebuild_problem_posts
Jobs::ReindexSearch.new.rebuild_posts
result = Search.execute(tag.name)
expect(result.posts.length).to eq(1)
@ -977,7 +977,7 @@ describe Search do
it 'can find posts with tag synonyms' do
synonym = Fabricate(:tag, name: 'synonym', target_tag: tag)
Jobs::ReindexSearch.new.rebuild_problem_posts
Jobs::ReindexSearch.new.rebuild_posts
result = Search.execute(synonym.name)
expect(result.posts.length).to eq(1)
end

View File

@ -34,7 +34,7 @@ describe Jobs::ReindexSearch do
end
end
describe 'rebuild_problem_posts' do
describe 'rebuild_posts' do
class FakeIndexer
def self.index(post, force:)
get_posts.push(post)
@ -59,10 +59,7 @@ describe Jobs::ReindexSearch do
FakeIndexer.reset
end
it (
'should not reindex posts that belong to a deleted topic ' \
'or have been trashed'
) do
it "should not reindex posts that belong to a deleted topic or have been trashed" do
post = Fabricate(:post)
post2 = Fabricate(:post)
post3 = Fabricate(:post)
@ -70,7 +67,7 @@ describe Jobs::ReindexSearch do
post2.topic.trash!
post3.trash!
subject.rebuild_problem_posts(indexer: FakeIndexer)
subject.rebuild_posts(indexer: FakeIndexer)
expect(FakeIndexer.posts).to contain_exactly(post)
end
@ -78,7 +75,7 @@ describe Jobs::ReindexSearch do
it 'should not reindex posts with a developmental version' do
post = Fabricate(:post, version: SearchIndexer::MIN_POST_REINDEX_VERSION + 1)
subject.rebuild_problem_posts(indexer: FakeIndexer)
subject.rebuild_posts(indexer: FakeIndexer)
expect(FakeIndexer.posts).to eq([])
end
@ -94,7 +91,7 @@ describe Jobs::ReindexSearch do
post2.save!(validate: false)
subject.rebuild_problem_posts(indexer: FakeIndexer)
subject.rebuild_posts(indexer: FakeIndexer)
expect(FakeIndexer.posts).to contain_exactly(post)
end
@ -107,9 +104,7 @@ describe Jobs::ReindexSearch do
[topic, topic2].each { |t| SearchIndexer.index(t, force: true) }
freeze_time(described_class::CLEANUP_GRACE_PERIOD) do
topic.trash!
end
freeze_time(1.day.ago) { topic.trash! }
expect { subject.execute({}) }.to change { TopicSearchData.count }.by(-1)
expect(Topic.pluck(:id)).to contain_exactly(topic2.id)
@ -119,11 +114,7 @@ describe Jobs::ReindexSearch do
)
end
it(
"should clean up post_search_data of posts with empty raw or posts from " \
"trashed topics"
) do
it "should clean up post_search_data of posts with empty raw or posts from trashed topics" do
post = Fabricate(:post)
post2 = Fabricate(:post, post_type: Post.types[:small_action])
post2.raw = ""
@ -132,7 +123,7 @@ describe Jobs::ReindexSearch do
post3.topic.trash!
post4, post5, post6 = nil
freeze_time(described_class::CLEANUP_GRACE_PERIOD) do
freeze_time(1.day.ago) do
post4 = Fabricate(:post)
post4.topic.trash!

View File

@ -17,7 +17,7 @@ describe SimilarTopicsController do
def reindex_posts
SearchIndexer.enable
Jobs::ReindexSearch.new.rebuild_problem_posts
Jobs::ReindexSearch.new.rebuild_posts
end
it "requires a title param" do