From c52191d49e321f23d088285b6eef8fab6ccfe4cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9gis=20Hanol?= Date: Tue, 16 Jun 2020 19:55:08 +0200 Subject: [PATCH] PERF: improve loading a imported_ids in bulk imports - Stream the queries that load the imported_ids - Use an array instead of a hash for keeping the mapping between imported_ids and new ids - Ensure we always treat the imported_ids as integers instead of strings --- script/bulk_import/base.rb | 65 ++++++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 23 deletions(-) diff --git a/script/bulk_import/base.rb b/script/bulk_import/base.rb index 9fc7b1cd17a..7ac31295b9b 100644 --- a/script/bulk_import/base.rb +++ b/script/bulk_import/base.rb @@ -77,7 +77,6 @@ class BulkImport::Base db = ActiveRecord::Base.connection_config @encoder = PG::TextEncoder::CopyRow.new @raw_connection = PG.connect(dbname: db[:database], host: db[:host_names]&.first, port: db[:port]) - # @raw_connection = PG.connect(dbname: db[:database], host: db[:host_names]&.first, port: db[:port], password: "discourse") @uploader = ImportScripts::Uploader.new @html_entities = HTMLEntities.new @encoding = CHARSET_MAP[charset] @@ -128,28 +127,44 @@ class BulkImport::Base SQL end + def imported_ids(name) + map = [] + ids = [] + + @raw_connection.send_query("SELECT value, #{name}_id FROM #{name}_custom_fields WHERE name = 'import_id'") + @raw_connection.set_single_row_mode + + @raw_connection.get_result.stream_each do |row| + id = row["value"].to_i + ids << id + map[id] = row["#{name}_id"] + end + + @raw_connection.get_result + + [map, ids] + end + def load_imported_ids puts "Loading imported group ids..." - @groups = GroupCustomField.where(name: "import_id").pluck(:value, :group_id).to_h - @last_imported_group_id = @groups.keys.map(&:to_i).max || -1 + @groups, imported_group_ids = imported_ids("group") + @last_imported_group_id = imported_group_ids.max || -1 puts "Loading imported user ids..." - @users = UserCustomField.where(name: "import_id").pluck(:value, :user_id).to_h - @last_imported_user_id = @users.keys.map(&:to_i).max || -1 + @users, imported_user_ids = imported_ids("user") + @last_imported_user_id = imported_user_ids.max || -1 puts "Loading imported category ids..." - @categories = CategoryCustomField.where(name: "import_id").pluck(:value, :category_id).to_h - @last_imported_category_id = @categories.keys.map(&:to_i).max || -1 + @categories, imported_category_ids = imported_ids("category") + @last_imported_category_id = imported_category_ids.max || -1 puts "Loading imported topic ids..." - @topics = TopicCustomField.where(name: "import_id").pluck(:value, :topic_id).to_h - imported_topic_ids = @topics.keys.map(&:to_i) + @topics, imported_topic_ids = imported_ids("topic") @last_imported_topic_id = imported_topic_ids.select { |id| id < PRIVATE_OFFSET }.max || -1 @last_imported_private_topic_id = imported_topic_ids.select { |id| id > PRIVATE_OFFSET }.max || (PRIVATE_OFFSET - 1) puts "Loading imported post ids..." - @posts = PostCustomField.where(name: "import_id").pluck(:value, :post_id).to_h - imported_post_ids = @posts.keys.map(&:to_i) + @posts, imported_post_ids = imported_ids("post") @last_imported_post_id = imported_post_ids.select { |id| id < PRIVATE_OFFSET }.max || -1 @last_imported_private_post_id = imported_post_ids.select { |id| id > PRIVATE_OFFSET }.max || (PRIVATE_OFFSET - 1) end @@ -208,19 +223,23 @@ class BulkImport::Base end def group_id_from_imported_id(id) - @groups[id.to_s] + @groups[id.to_i] end + def user_id_from_imported_id(id) - @users[id.to_s] + @users[id.to_i] end + def category_id_from_imported_id(id) - @categories[id.to_s] + @categories[id.to_i] end + def topic_id_from_imported_id(id) - @topics[id.to_s] + @topics[id.to_i] end + def post_id_from_imported_id(id) - @posts[id.to_s] + @posts[id.to_i] end def post_number_from_imported_id(id) @@ -337,7 +356,7 @@ class BulkImport::Base end def process_group(group) - @groups[group[:imported_id].to_s] = group[:id] = @last_group_id += 1 + @groups[group[:imported_id].to_i] = group[:id] = @last_group_id += 1 group[:name] = fix_name(group[:name]) @@ -356,7 +375,7 @@ class BulkImport::Base end def process_user(user) - @users[user[:imported_id].to_s] = user[:id] = @last_user_id += 1 + @users[user[:imported_id].to_i] = user[:id] = @last_user_id += 1 imported_username = user[:username].dup @@ -392,7 +411,7 @@ class BulkImport::Base def process_user_email(user_email) user_email[:id] = @last_user_email_id += 1 - user_email[:user_id] = @users[user_email[:imported_user_id].to_s] + user_email[:user_id] = @users[user_email[:imported_user_id].to_i] user_email[:primary] = true user_email[:created_at] ||= NOW user_email[:updated_at] ||= user_email[:created_at] @@ -403,7 +422,7 @@ class BulkImport::Base end def process_user_stat(user_stat) - user_stat[:user_id] = @users[user_stat[:imported_user_id].to_s] + user_stat[:user_id] = @users[user_stat[:imported_user_id].to_i] user_stat[:topic_reply_count] = user_stat[:post_count] - user_stat[:topic_count] user_stat[:topics_entered] ||= 0 user_stat[:time_read] ||= 0 @@ -434,7 +453,7 @@ class BulkImport::Base def process_category(category) category[:id] ||= @last_category_id += 1 - @categories[category[:imported_id].to_s] ||= category[:id] + @categories[category[:imported_id].to_i] ||= category[:id] category[:name] = category[:name][0...50].scrub.strip # TODO: unique name category[:name_lower] = category[:name].downcase @@ -447,7 +466,7 @@ class BulkImport::Base end def process_topic(topic) - @topics[topic[:imported_id].to_s] = topic[:id] = @last_topic_id += 1 + @topics[topic[:imported_id].to_i] = topic[:id] = @last_topic_id += 1 topic[:archetype] ||= Archetype.default topic[:title] = topic[:title][0...255].scrub.strip topic[:fancy_title] ||= pre_fancy(topic[:title]) @@ -465,7 +484,7 @@ class BulkImport::Base end def process_post(post) - @posts[post[:imported_id].to_s] = post[:id] = @last_post_id += 1 + @posts[post[:imported_id].to_i] = post[:id] = @last_post_id += 1 post[:user_id] ||= Discourse::SYSTEM_USER_ID post[:last_editor_id] = post[:user_id] @highest_post_number_by_topic_id[post[:topic_id]] ||= 0