From 0955d9ece9158755e078090ef355e37382354f32 Mon Sep 17 00:00:00 2001 From: Blake Erickson Date: Sun, 2 Jun 2019 20:17:22 -0600 Subject: [PATCH] create answerhub importer (#7671) --- script/import_scripts/answerhub.rb | 444 +++++++++++++++++++++++++++++ 1 file changed, 444 insertions(+) create mode 100644 script/import_scripts/answerhub.rb diff --git a/script/import_scripts/answerhub.rb b/script/import_scripts/answerhub.rb new file mode 100644 index 00000000000..a81877755ae --- /dev/null +++ b/script/import_scripts/answerhub.rb @@ -0,0 +1,444 @@ +# frozen_string_literal: true + +# AnswerHub Importer +# +# Based on having access to a mysql dump. +# Pass in the ENV variables listed below before runing the script. + +require_relative 'base' +require 'mysql2' +require 'open-uri' + +class ImportScripts::AnswerHub < ImportScripts::Base + + DB_NAME ||= ENV['DB_NAME'] || "answerhub" + DB_PASS ||= ENV['DB_PASS'] || "answerhub" + DB_USER ||= ENV['DB_USER'] || "answerhub" + TABLE_PREFIX ||= ENV['TABLE_PREFIX'] || "network1" + BATCH_SIZE ||= ENV['BATCH_SIZE'].to_i || 1000 + ATTACHMENT_DIR = ENV['ATTACHMENT_DIR'] || '' + PROCESS_UPLOADS = ENV['PROCESS_UPLOADS'].to_i || 0 + ANSWERHUB_DOMAIN = ENV['ANSWERHUB_DOMAIN'] + AVATAR_DIR = ENV['AVATAR_DIR'] || "" + SITE_ID = ENV['SITE_ID'].to_i || 0 + CATEGORY_MAP_FROM = ENV['CATEGORY_MAP_FROM'].to_i || 0 + CATEGORY_MAP_TO = ENV['CATEGORY_MAP_TO'].to_i || 0 + SCRAPE_AVATARS = ENV['SCRAPE_AVATARS'].to_i || 0 + + def initialize + super + @client = Mysql2::Client.new( + host: "localhost", + username: DB_USER, + password: DB_PASS, + database: DB_NAME + ) + @skip_updates = true + SiteSetting.tagging_enabled = true + SiteSetting.max_tags_per_topic = 10 + end + + def execute + puts "Now starting the AnswerHub Import" + puts "DB Name: #{DB_NAME}" + puts "Table Prefix: #{TABLE_PREFIX}" + puts + import_users + import_categories + import_topics + import_posts + import_groups + add_users_to_groups + add_moderators + add_admins + import_avatars + end + + def import_users + puts '', "creating users" + + query = + "SELECT count(*) count + FROM #{TABLE_PREFIX}_authoritables + WHERE c_type = 'user' + AND c_active = 1 + AND c_system <> 1;" + total_count = @client.query(query).first['count'] + puts "Total count: #{total_count}" + @last_user_id = -1 + + batches(BATCH_SIZE) do |offset| + query = "SELECT c_id, c_creation_date, c_name, c_primaryEmail, c_last_seen, c_description + FROM #{TABLE_PREFIX}_authoritables + WHERE c_type='user' + AND c_active = 1 + AND c_system <> 1 + AND c_id > #{@last_user_id} + LIMIT #{BATCH_SIZE};" + + results = @client.query(query) + break if results.size < 1 + @last_user_id = results.to_a.last['c_id'] + + create_users(results, total: total_count, offset: offset) do |user| + puts user['c_id'].to_s + ' ' + user['c_name'] + next if @lookup.user_id_from_imported_user_id(user['c_id']) + { id: user['c_id'], + email: "#{SecureRandom.hex}@invalid.invalid", + username: user['c_name'], + created_at: user['c_creation_date'], + bio_raw: user['c_description'], + last_seen_at: user['c_last_seen'], + } + end + end + end + + def import_categories + puts "", "importing categories..." + + # Import parent categories first + query = "SELECT c_id, c_name, c_plug, c_parent + FROM containers + WHERE c_type = 'space' + AND c_active = 1 + AND c_parent = 7 OR c_parent IS NULL" + results = @client.query(query) + + create_categories(results) do |c| + { + id: c['c_id'], + name: c['c_name'], + parent_category_id: check_parent_id(c['c_parent']), + } + end + + # Import sub-categories + query = "SELECT c_id, c_name, c_plug, c_parent + FROM containers + WHERE c_type = 'space' + AND c_active = 1 + AND c_parent != 7 AND c_parent IS NOT NULL" + results = @client.query(query) + + create_categories(results) do |c| + puts c.inspect + { + id: c['c_id'], + name: c['c_name'], + parent_category_id: category_id_from_imported_category_id(check_parent_id(c['c_parent'])), + } + end + end + + def import_topics + puts "", "importing topics..." + + count_query = + "SELECT count(*) count + FROM #{TABLE_PREFIX}_nodes + WHERE c_visibility <> 'deleted' + AND (c_type = 'question' + OR c_type = 'kbentry');" + total_count = @client.query(count_query).first['count'] + + @last_topic_id = -1 + + batches(BATCH_SIZE) do |offset| + # Let's start with just question types + query = + "SELECT * + FROM #{TABLE_PREFIX}_nodes + WHERE c_id > #{@last_topic_id} + AND c_visibility <> 'deleted' + AND (c_type = 'question' + OR c_type = 'kbentry') + ORDER BY c_id ASC + LIMIT #{BATCH_SIZE};" + topics = @client.query(query) + + break if topics.size < 1 + @last_topic_id = topics.to_a.last['c_id'] + + create_posts(topics, total: total_count, offset: offset) do |t| + user_id = user_id_from_imported_user_id(t['c_author']) || Discourse::SYSTEM_USER_ID + if PROCESS_UPLOADS == 1 + body = process_uploads(t['c_body'], user_id) + else + body = t['c_body'] + end + markdown_body = HtmlToMarkdown.new(body).to_markdown + { + id: t['c_id'], + user_id: user_id, + title: t['c_title'], + category: category_id_from_imported_category_id(t['c_primaryContainer']), + raw: markdown_body, + created_at: t['c_creation_date'], + post_create_action: proc do |post| + tag_names = t['c_topic_names'].split(',') + DiscourseTagging.tag_topic_by_names(post.topic, staff_guardian, tag_names) + end + } + end + end + end + + def import_posts + puts "", "importing posts..." + + count_query = + "SELECT count(*) count + FROM #{TABLE_PREFIX}_nodes + WHERE c_visibility <> 'deleted' + AND (c_type = 'answer' + OR c_type = 'comment' + OR c_type = 'kbentry');" + total_count = @client.query(count_query).first['count'] + + @last_post_id = -1 + + batches(BATCH_SIZE) do |offset| + query = + "SELECT * + FROM #{TABLE_PREFIX}_nodes + WHERE c_id > #{@last_post_id} + AND c_visibility <> 'deleted' + AND (c_type = 'answer' + OR c_type = 'comment' + OR c_type = 'kbentry') + ORDER BY c_id ASC + LIMIT #{BATCH_SIZE};" + posts = @client.query(query) + next if all_records_exist? :posts, posts.map { |p| p['c_id'] } + + break if posts.size < 1 + @last_post_id = posts.to_a.last['c_id'] + + create_posts(posts, total: total_count, offset: offset) do |p| + t = topic_lookup_from_imported_post_id(p['c_parent']) + next unless t + user_id = user_id_from_imported_user_id(p['c_author']) || Discourse::SYSTEM_USER_ID + if PROCESS_UPLOADS == 1 + body = process_uploads(p['c_body'], user_id) + else + body = t['c_body'] + end + markdown_body = HtmlToMarkdown.new(body).to_markdown + { + id: p['c_id'], + user_id: user_id, + topic_id: t[:topic_id], + reply_to_post_number: t[:post_number], + raw: markdown_body, + created_at: p['c_creation_date'], + post_create_action: proc do |post_info| + begin + if p['c_type'] == 'answer' && p['c_marked'] == 1 + post = Post.find(post_info[:id]) + if post + user_id = user_id_from_imported_user_id(p['c_author']) || Discourse::SYSTEM_USER_ID + current_user = User.find(user_id) + solved = DiscourseSolved.accept_answer!(post, current_user) + puts "SOLVED: #{solved}" + end + end + rescue ActiveRecord::RecordInvalid + puts "SOLVED: Skipped post_id: #{post.id} because invalid" + end + end + } + end + end + end + + def import_groups + puts "", "importing groups..." + + query = + "SELECT c_id, c_name + FROM network6_authoritables + WHERE c_type='group' + AND c_id > 6;" # Ignore Anonymous, Users, Moderators, etc. + groups = @client.query(query) + + create_groups(groups) do |group| + { + id: group["c_id"], + name: group["c_name"], + visibility_level: 1 + } + end + end + + def add_users_to_groups + puts "", "adding users to groups..." + + query = + "SELECT c_id, c_name + FROM network6_authoritables + WHERE c_type='group' + AND c_id > 6;" # Ignore Anonymous, Users, Moderators, etc. + groups = @client.query(query) + + members_query = + "SELECT * + FROM network6_authoritable_groups;" + group_members = @client.query(members_query) + + total_count = groups.count + progress_count = 0 + start_time = Time.now + + group_members.map + groups.each do |group| + dgroup = find_group_by_import_id(group['c_id']) + + next if dgroup.custom_fields['import_users_added'] + + group_member_ids = group_members.map { |m| user_id_from_imported_user_id(m["c_members"]) if m["c_groups"] == group['c_id'] }.compact + + # add members + dgroup.bulk_add(group_member_ids) + + # reload group + dgroup.reload + + dgroup.custom_fields['import_users_added'] = true + dgroup.save + + progress_count += 1 + print_status(progress_count, total_count, start_time) + end + end + + def add_moderators + puts "", "adding moderators..." + + query = + "SELECT * + FROM network6_authoritable_groups + WHERE c_groups = 4;" + moderators = @client.query(query) + + moderator_ids = moderators.map { |m| user_id_from_imported_user_id(m["c_members"]) }.compact + + moderator_ids.each do |id| + user = User.find(id) + user.grant_moderation! + end + end + + def add_admins + puts "", "adding admins..." + + query = + "SELECT * + FROM network6_authoritable_groups + WHERE c_groups = 5 OR c_groups = 6;" # Super Users, Network Administrators + admins = @client.query(query) + + admin_ids = admins.map { |a| user_id_from_imported_user_id(a["c_members"]) }.compact + + admin_ids.each do |id| + user = User.find(id) + user.grant_admin! + end + end + + def import_avatars + puts "", "importing user avatars" + query = + "SELECT * + FROM network6_user_preferences + WHERE c_key = 'avatarImage'" + avatars = @client.query(query) + + avatars.each do |a| + begin + user_id = user_id_from_imported_user_id(a['c_user']) + user = User.find(user_id) + if user + filename = "avatar-#{user_id}.png" + path = File.join(AVATAR_DIR, filename) + next if !File.exists?(path) + + # Scrape Avatars - Avatars are saved in the db, but it might be easier to just scrape them + if SCRAPE_AVATARS == 1 + File.open(path, 'wb') { |f| + f << open("https://#{ANSWERHUB_DOMAIN}/forums/users/#{a['c_user']}/photo/view.html?s=240").read + } + end + + upload = @uploader.create_upload(user.id, path, filename) + + if upload.persisted? + user.import_mode = false + user.create_user_avatar + user.import_mode = true + user.user_avatar.update(custom_upload_id: upload.id) + user.update(uploaded_avatar_id: upload.id) + else + Rails.logger.error("Could not persist avatar for user #{user.username}") + end + end + rescue ActiveRecord::RecordNotFound + puts "Could not find User for user_id: #{a['c_user']}" + end + end + end + + def process_uploads(body, user_id) + if body.match(//) + # There could be multiple images in a post + images = body.scan(//) + + images.each do |image| + filepath = File.basename(image).split('"')[0] + filepath = File.join(ATTACHMENT_DIR, filepath) + + if File.exists?(filepath) + filename = File.basename(filepath) + upload = create_upload(user_id, filepath, filename) + image_html = html_for_upload(upload, filename) + original_image_html = '' + body.sub!(original_image_html, image_html) + end + end + end + # Non-images + if body.match(//) + # There could be multiple files in a post + files = body.scan(//) + + files.each do |file| + filepath = File.basename(file).split('"')[0] + filepath = File.join(ATTACHMENT_DIR, filepath) + + if File.exists?(filepath) + filename = File.basename(filepath) + upload = create_upload(user_id, filepath, filename) + file_html = html_for_upload(upload, filename) + original_file_html = '' + body.sub!(original_file_html, file_html) + end + end + end + + body + end + + def staff_guardian + @_staff_guardian ||= Guardian.new(Discourse.system_user) + end + + # Some category parent id's need to be adjusted + def check_parent_id(id) + return nil if SITE_ID > 0 && id == SITE_ID + return CATEGORY_MAP_TO if CATEGORY_MAP_FROM > 0 && id == CATEGORY_MAP_FROM + id + end + +end + +ImportScripts::AnswerHub.new.perform