From c36c9c2ee5069b7c0b8df907642084218ffb76bd Mon Sep 17 00:00:00 2001 From: Gerhard Schlager Date: Thu, 28 Feb 2019 21:59:36 +0100 Subject: [PATCH] FEATURE: Import script for AnswerBase Improves the generic database used by some import scripts: * Adds additional columns for users * Adds support for attachments * Allows setting the data type for keys (numeric or string) to ensure correct sorting --- Gemfile | 3 +- script/import_scripts/answerbase.rb | 341 ++++++++++++++++++ .../import_scripts/base/generic_database.rb | 145 ++++++-- script/import_scripts/zendesk.rb | 16 +- script/import_scripts/zendesk_api.rb | 6 +- 5 files changed, 471 insertions(+), 40 deletions(-) create mode 100644 script/import_scripts/answerbase.rb diff --git a/Gemfile b/Gemfile index 4030e93e97b..54797b5cf8c 100644 --- a/Gemfile +++ b/Gemfile @@ -202,10 +202,11 @@ gem 'rchardet', require: false if ENV["IMPORT"] == "1" gem 'mysql2' gem 'redcarpet' - gem 'sqlite3', '~> 1.3.13' + gem 'sqlite3', '~> 1.3', '>= 1.3.13' gem 'ruby-bbcode-to-md', git: 'https://github.com/nlalonde/ruby-bbcode-to-md' gem 'reverse_markdown' gem 'tiny_tds' + gem 'csv', '~> 3.0' end gem 'webpush', require: false diff --git a/script/import_scripts/answerbase.rb b/script/import_scripts/answerbase.rb new file mode 100644 index 00000000000..88fa1914775 --- /dev/null +++ b/script/import_scripts/answerbase.rb @@ -0,0 +1,341 @@ +require 'csv' +require 'reverse_markdown' +require_relative 'base' +require_relative 'base/generic_database' + +# Call it like this: +# RAILS_ENV=production bundle exec ruby script/import_scripts/answerbase.rb DIRNAME +class ImportScripts::Answerbase < ImportScripts::Base + OLD_DOMAIN = "http://answerbase.example.com" # without trailing slash + NEW_DOMAIN = "https://discourse.example.com" + AVATAR_DIRECTORY = "User Images" + ANSWER_ATTACHMENT_DIRECTORY = "Answer Attachments" + ANSWER_IMAGE_DIRECTORY = "Answer Images" + QUESTION_ATTACHMENT_DIRECTORY = "Question Attachments" + QUESTION_IMAGE_DIRECTORY = 
"Question Images" + EMBEDDED_IMAGE_REGEX = /<a[^>]*href="[^"]*relativeUrl=(?<path>[^"\&]*)[^"]*"[^>]*>\s*<img[^>]*>\s*<\/a>/i + QUESTION_LINK_REGEX = /<a[^>]*?href="#{Regexp.escape(OLD_DOMAIN)}\/[^"]*?(?:q|questionid=)(?<id>\d+)[^"]*?"[^>]*>(?<text>.*?)<\/a>/i + TOPIC_LINK_NORMALIZATION = '/.*?-(q\d+).*/\1' + BATCH_SIZE = 1000 + + def initialize(path) + super() + + @path = path + @db = ImportScripts::GenericDatabase.new( + @path, + batch_size: BATCH_SIZE, + recreate: true, + numeric_keys: true + ) + end + + def execute + read_csv_files + + add_permalink_normalizations + import_categories + import_users + import_topics + import_posts + end + + def read_csv_files + puts "", "reading CSV files..." + + category_position = 0 + csv_parse("categories") do |row| + @db.insert_category( + id: row[:id], + name: row[:name], + position: category_position += 1 + ) + end + + csv_parse("users") do |row| + @db.insert_user( + id: row[:id], + email: row[:email], + username: row[:username], + bio: row[:description], + avatar_path: row[:profile_image], + created_at: parse_date(row[:createtime]), + active: true + ) + end + + last_topic_id = nil + csv_parse("questions-answers-comments") do |row| + next if row[:published] == "No" + user_id = @db.get_user_id(row[:username]) + created_at = parse_datetime(row[:createtime]) + + begin + if row[:type] == "Question" + attachments = parse_filenames(row[:attachments], QUESTION_ATTACHMENT_DIRECTORY) + + parse_filenames(row[:images], QUESTION_IMAGE_DIRECTORY) + + @db.insert_topic( + id: row[:id], + title: row[:title], + raw: row[:text], + category_id: row[:categorylist], + user_id: user_id, + created_at: created_at, + attachments: attachments + ) + last_topic_id = row[:id] + else + attachments = parse_filenames(row[:attachments], ANSWER_ATTACHMENT_DIRECTORY) + + parse_filenames(row[:images], ANSWER_IMAGE_DIRECTORY) + + @db.insert_post( + id: row[:id], + raw: row[:text], + topic_id: last_topic_id, + user_id: user_id, + created_at: created_at, + attachments: attachments + ) + end + 
rescue + p row + raise + end + end + end + + def parse_filenames(text, directory) + return [] if text.blank? + + text + .split(';') + .map { |filename| File.join(@path, directory, filename.strip) } + end + + def parse_date(text) + return nil if text.blank? + DateTime.strptime(text, "%m/%d/%y") + end + + def parse_datetime(text) + return nil if text.blank? + # DateTime.strptime(text, "%m/%d/%Y %H:%M") + DateTime.parse(text).utc.to_datetime + end + + def import_categories + puts "", "creating categories" + rows = @db.fetch_categories + + create_categories(rows) do |row| + { + id: row['id'], + name: row['name'], + description: row['description'], + position: row['position'] + } + end + end + + def batches + super(BATCH_SIZE) + end + + def import_users + puts "", "creating users" + total_count = @db.count_users + last_id = 0 + + batches do |offset| + rows, last_id = @db.fetch_users(last_id) + break if rows.empty? + + next if all_records_exist?(:users, rows.map { |row| row['id'] }) + + create_users(rows, total: total_count, offset: offset) do |row| + { + id: row['id'], + email: row['email'], + username: row['username'], + bio_raw: row['bio'], + created_at: row['created_at'], + active: row['active'] == 1, + post_create_action: proc do |user| + create_avatar(user, row['avatar_path']) + end + } + end + end + end + + def create_avatar(user, avatar_path) + return if avatar_path.blank? + avatar_path = File.join(@path, AVATAR_DIRECTORY, avatar_path) + + if File.exist?(avatar_path) + @uploader.create_avatar(user, avatar_path) + else + STDERR.puts "Could not find avatar: #{avatar_path}" + end + end + + def import_topics + puts "", "creating topics" + total_count = @db.count_topics + last_id = 0 + + batches do |offset| + rows, last_id = @db.fetch_topics(last_id) + break if rows.empty? 
+ + next if all_records_exist?(:posts, rows.map { |row| row['id'] }) + + create_posts(rows, total: total_count, offset: offset) do |row| + attachments = @db.fetch_topic_attachments(row['id']) if row['upload_count'] > 0 + user_id = user_id_from_imported_user_id(row['user_id']) || Discourse.system_user.id + + { + id: row['id'], + title: row['title'], + raw: raw_with_attachments(row['raw'].presence || row['title'], attachments, user_id), + category: category_id_from_imported_category_id(row['category_id']), + user_id: user_id, + created_at: row['created_at'], + closed: row['closed'] == 1, + post_create_action: proc do |post| + url = "q#{row['id']}" + Permalink.create(url: url, topic_id: post.topic.id) unless permalink_exists?(url) + end + } + end + end + end + + def import_posts + puts "", "creating posts" + total_count = @db.count_posts + last_row_id = 0 + + batches do |offset| + rows, last_row_id = @db.fetch_posts(last_row_id) + break if rows.empty? + + next if all_records_exist?(:posts, rows.map { |row| row['id'] }) + + create_posts(rows, total: total_count, offset: offset) do |row| + topic = topic_lookup_from_imported_post_id(row['topic_id']) + attachments = @db.fetch_post_attachments(row['id']) if row['upload_count'] > 0 + user_id = user_id_from_imported_user_id(row['user_id']) || Discourse.system_user.id + + { + id: row['id'], + raw: raw_with_attachments(row['raw'], attachments, user_id), + user_id: user_id, + topic_id: topic[:topic_id], + created_at: row['created_at'] + } + end + end + end + + def raw_with_attachments(raw, attachments, user_id) + raw, embedded_paths, upload_ids = replace_embedded_attachments(raw, user_id) + raw = replace_question_links(raw) + raw = ReverseMarkdown.convert(raw) || "" + + attachments&.each do |attachment| + path = attachment['path'] + next if embedded_paths.include?(path) + + if File.exist?(path) + filename = File.basename(path) + upload = @uploader.create_upload(user_id, path, filename) + + if upload.present? 
&& upload.persisted? && !upload_ids.include?(upload.id) + raw << "\n" << @uploader.html_for_upload(upload, filename) + end + else + STDERR.puts "Could not find file: #{path}" + end + end + + raw + end + + def replace_embedded_attachments(raw, user_id) + paths = [] + upload_ids = [] + + raw = raw.gsub(EMBEDDED_IMAGE_REGEX) do + path = File.join(@path, Regexp.last_match['path']) + filename = File.basename(path) + path = find_image_path(filename) + + if path + upload = @uploader.create_upload(user_id, path, filename) + + if upload.present? && upload.persisted? + paths << path + upload_ids << upload.id + @uploader.html_for_upload(upload, filename) + end + else + STDERR.puts "Could not find file: #{path}" + end + end + + [raw, paths, upload_ids] + end + + def find_image_path(filename) + [QUESTION_IMAGE_DIRECTORY, ANSWER_IMAGE_DIRECTORY].each do |directory| + path = File.join(@path, directory, filename) + return path if File.exist?(path) + end + end + + def replace_question_links(raw) + raw.gsub(QUESTION_LINK_REGEX) do + topic_id = Regexp.last_match("id") + topic = topic_lookup_from_imported_post_id(topic_id) + return Regexp.last_match.to_s unless topic + + url = File.join(NEW_DOMAIN, topic[:url]) + text = Regexp.last_match("text") + text.include?(OLD_DOMAIN) ? url : "<a href='#{url}'>#{text}</a>" + end + end + + def add_permalink_normalizations + normalizations = SiteSetting.permalink_normalizations + normalizations = normalizations.blank? ? 
[] : normalizations.split('|') + + add_normalization(normalizations, TOPIC_LINK_NORMALIZATION) + + SiteSetting.permalink_normalizations = normalizations.join('|') + end + + def add_normalization(normalizations, normalization) + normalizations << normalization unless normalizations.include?(normalization) + end + + def permalink_exists?(url) + Permalink.find_by(url: url) + end + + def csv_parse(table_name) + CSV.foreach(File.join(@path, "#{table_name}.csv"), + headers: true, + header_converters: :symbol, + skip_blanks: true, + encoding: 'bom|utf-8') { |row| yield row } + end +end + +unless ARGV[0] && Dir.exist?(ARGV[0]) + puts "", "Usage:", "", "bundle exec ruby script/import_scripts/answerbase.rb DIRNAME", "" + exit 1 +end + +ImportScripts::Answerbase.new(ARGV[0]).perform diff --git a/script/import_scripts/base/generic_database.rb b/script/import_scripts/base/generic_database.rb index 39284a4d345..c0f046779fc 100644 --- a/script/import_scripts/base/generic_database.rb +++ b/script/import_scripts/base/generic_database.rb @@ -2,12 +2,13 @@ require 'sqlite3' module ImportScripts class GenericDatabase - def initialize(directory, batch_size:, recreate: false) + def initialize(directory, batch_size:, recreate: false, numeric_keys: false) filename = "#{directory}/index.db" File.delete(filename) if recreate && File.exists?(filename) @db = SQLite3::Database.new(filename, results_as_hash: true) @batch_size = batch_size + @numeric_keys = numeric_keys configure_database create_category_table @@ -25,36 +26,72 @@ module ImportScripts def insert_user(user) @db.execute(<<-SQL, prepare(user)) - INSERT OR REPLACE INTO user (id, email, username, name, created_at, last_seen_at, active) - VALUES (:id, :email, :username, :name, :created_at, :last_seen_at, :active) + INSERT OR REPLACE + INTO user (id, email, username, name, bio, avatar_path, created_at, last_seen_at, active) + VALUES (:id, :email, :username, :name, :bio, :avatar_path, :created_at, :last_seen_at, :active) SQL end def 
insert_topic(topic) + attachments = topic.delete(:attachments) + topic[:upload_count] = attachments&.size || 0 + @db.execute(<<-SQL, prepare(topic)) - INSERT OR REPLACE INTO topic (id, title, raw, category_id, closed, user_id, created_at, url) - VALUES (:id, :title, :raw, :category_id, :closed, :user_id, :created_at, :url) + INSERT OR REPLACE INTO topic (id, title, raw, category_id, closed, user_id, created_at, url, upload_count) + VALUES (:id, :title, :raw, :category_id, :closed, :user_id, :created_at, :url, :upload_count) SQL + + attachments&.each do |attachment| + @db.execute(<<-SQL, topic_id: topic[:id], path: attachment) + INSERT OR REPLACE INTO topic_upload (topic_id, path) + VALUES (:topic_id, :path) + SQL + end end def insert_post(post) + attachments = post.delete(:attachments) + post[:upload_count] = attachments&.size || 0 + @db.execute(<<-SQL, prepare(post)) - INSERT OR REPLACE INTO post (id, raw, topic_id, user_id, created_at, reply_to_post_id, url) - VALUES (:id, :raw, :topic_id, :user_id, :created_at, :reply_to_post_id, :url) + INSERT OR REPLACE INTO post (id, raw, topic_id, user_id, created_at, reply_to_post_id, url, upload_count) + VALUES (:id, :raw, :topic_id, :user_id, :created_at, :reply_to_post_id, :url, :upload_count) SQL + + attachments&.each do |attachment| + @db.execute(<<-SQL, post_id: post[:id], path: attachment) + INSERT OR REPLACE INTO post_upload (post_id, path) + VALUES (:post_id, :path) + SQL + end end def sort_posts_by_created_at @db.execute 'DELETE FROM post_order' @db.execute <<-SQL - INSERT INTO post_order (id) + INSERT INTO post_order (post_id) SELECT id FROM post ORDER BY created_at, topic_id, id SQL end + def delete_unused_users + @db.execute <<~SQL + DELETE FROM user + WHERE NOT EXISTS( + SELECT 1 + FROM topic + WHERE topic.user_id = user.id + ) AND NOT EXISTS( + SELECT 1 + FROM post + WHERE post.user_id = user.id + ) + SQL + end + def fetch_categories @db.execute(<<-SQL) SELECT * @@ -82,6 +119,14 @@ module ImportScripts 
add_last_column_value(rows, 'id') end + def get_user_id(username) + @db.get_first_value(<<-SQL, username) + SELECT id + FROM user + WHERE username = :username + SQL + end + def count_topics @db.get_first_value(<<-SQL) SELECT COUNT(*) @@ -101,6 +146,14 @@ module ImportScripts add_last_column_value(rows, 'id') end + def fetch_topic_attachments(topic_id) + @db.execute(<<-SQL, topic_id) + SELECT path + FROM topic_upload + WHERE topic_id = :topic_id + SQL + end + def count_posts @db.get_first_value(<<-SQL) SELECT COUNT(*) @@ -110,9 +163,21 @@ module ImportScripts def fetch_posts(last_row_id) rows = @db.execute(<<-SQL, last_row_id) - SELECT o.ROWID, p.* + SELECT ROWID AS rowid, * + FROM post + WHERE ROWID > :last_row_id + ORDER BY ROWID + LIMIT #{@batch_size} + SQL + + add_last_column_value(rows, 'rowid') + end + + def fetch_sorted_posts(last_row_id) + rows = @db.execute(<<-SQL, last_row_id) + SELECT o.ROWID AS rowid, p.* FROM post p - JOIN post_order o USING (id) + JOIN post_order o ON (p.id = o.post_id) WHERE o.ROWID > :last_row_id ORDER BY o.ROWID LIMIT #{@batch_size} @@ -121,6 +186,14 @@ module ImportScripts add_last_column_value(rows, 'rowid') end + def fetch_post_attachments(post_id) + @db.execute(<<-SQL, post_id) + SELECT path + FROM post_upload + WHERE post_id = :post_id + SQL + end + def execute_sql(sql) @db.execute(sql) end @@ -136,10 +209,14 @@ module ImportScripts @db.execute 'PRAGMA locking_mode = EXCLUSIVE' end + def key_data_type + @numeric_keys ? 
'INTEGER' : 'TEXT' + end + def create_category_table @db.execute <<-SQL CREATE TABLE IF NOT EXISTS category ( - id TEXT NOT NULL PRIMARY KEY, + id #{key_data_type} NOT NULL PRIMARY KEY, name TEXT NOT NULL, description TEXT, position INTEGER, @@ -151,44 +228,59 @@ module ImportScripts def create_user_table @db.execute <<-SQL CREATE TABLE IF NOT EXISTS user ( - id TEXT NOT NULL PRIMARY KEY, + id #{key_data_type} NOT NULL PRIMARY KEY, email TEXT, username TEXT, name TEXT, + bio TEXT, + avatar_path TEXT, created_at DATETIME, last_seen_at DATETIME, active BOOLEAN NOT NULL DEFAULT true ) SQL + + @db.execute 'CREATE INDEX IF NOT EXISTS user_by_username ON user (username)' end def create_topic_table @db.execute <<-SQL CREATE TABLE IF NOT EXISTS topic ( - id TEXT NOT NULL PRIMARY KEY, + id #{key_data_type} NOT NULL PRIMARY KEY, title TEXT, raw TEXT, - category_id TEXT NOT NULL, + category_id #{key_data_type}, closed BOOLEAN NOT NULL DEFAULT false, - user_id TEXT NOT NULL, + user_id #{key_data_type} NOT NULL, created_at DATETIME, - url TEXT + url TEXT, + upload_count INTEGER DEFAULT 0 ) SQL @db.execute 'CREATE INDEX IF NOT EXISTS topic_by_user_id ON topic (user_id)' + + @db.execute <<-SQL + CREATE TABLE IF NOT EXISTS topic_upload ( + topic_id #{key_data_type} NOT NULL, + path TEXT NOT NULL + ) + SQL + + @db.execute 'CREATE UNIQUE INDEX IF NOT EXISTS topic_upload_unique ON topic_upload(topic_id, path)' end def create_post_table @db.execute <<-SQL CREATE TABLE IF NOT EXISTS post ( - id TEXT NOT NULL PRIMARY KEY, + id #{key_data_type} NOT NULL PRIMARY KEY, raw TEXT, - topic_id TEXT NOT NULL, - user_id TEXT NOT NULL, + topic_id #{key_data_type} NOT NULL, + user_id #{key_data_type} NOT NULL, created_at DATETIME, - reply_to_post_id TEXT, - url TEXT + reply_to_post_id #{key_data_type}, + url TEXT, + upload_count INTEGER DEFAULT 0 ) SQL @@ -196,9 +288,18 @@ module ImportScripts @db.execute <<-SQL CREATE TABLE IF NOT EXISTS post_order ( - id TEXT NOT NULL PRIMARY KEY + post_id 
#{key_data_type} NOT NULL PRIMARY KEY ) SQL + + @db.execute <<-SQL + CREATE TABLE IF NOT EXISTS post_upload ( + post_id #{key_data_type} NOT NULL, + path TEXT NOT NULL + ) + SQL + + @db.execute 'CREATE UNIQUE INDEX IF NOT EXISTS post_upload_unique ON post_upload(post_id, path)' end def prepare(hash) diff --git a/script/import_scripts/zendesk.rb b/script/import_scripts/zendesk.rb index 15d44282b2c..a0eb30f9f0c 100644 --- a/script/import_scripts/zendesk.rb +++ b/script/import_scripts/zendesk.rb @@ -82,19 +82,7 @@ class ImportScripts::Zendesk < ImportScripts::Base ) end - @db.execute_sql(<<~SQL) - DELETE FROM user - WHERE NOT EXISTS( - SELECT 1 - FROM topic - WHERE topic.user_id = user.id - ) AND NOT EXISTS( - SELECT 1 - FROM post - WHERE post.user_id = user.id - ) - SQL - + @db.delete_unused_users @db.sort_posts_by_created_at end @@ -188,7 +176,7 @@ class ImportScripts::Zendesk < ImportScripts::Base last_row_id = 0 batches do |offset| - rows, last_row_id = @db.fetch_posts(last_row_id) + rows, last_row_id = @db.fetch_sorted_posts(last_row_id) break if rows.empty? next if all_records_exist?(:posts, rows.map { |row| row['id'] }) diff --git a/script/import_scripts/zendesk_api.rb b/script/import_scripts/zendesk_api.rb index ddea6f1cb5f..00896a78f79 100644 --- a/script/import_scripts/zendesk_api.rb +++ b/script/import_scripts/zendesk_api.rb @@ -8,7 +8,7 @@ require_relative 'base/generic_database' # Call it like this: # RAILS_ENV=production bundle exec ruby script/import_scripts/zendesk_api.rb SOURCE_URL DIRNAME AUTH_EMAIL AUTH_TOKEN -class ImportScripts::Zendesk < ImportScripts::Base +class ImportScripts::ZendeskApi < ImportScripts::Base BATCH_SIZE = 1000 def initialize(source_url, path, auth_email, auth_token) @@ -193,7 +193,7 @@ class ImportScripts::Zendesk < ImportScripts::Base last_row_id = 0 batches do |offset| - rows, last_row_id = @db.fetch_posts(last_row_id) + rows, last_row_id = @db.fetch_sorted_posts(last_row_id) break if rows.empty? 
create_posts(rows, total: total_count, offset: offset) do |row| @@ -303,4 +303,4 @@ unless ARGV.length == 4 && Dir.exist?(ARGV[1]) exit 1 end -ImportScripts::Zendesk.new(ARGV[0], ARGV[1], ARGV[2], ARGV[3]).perform +ImportScripts::ZendeskApi.new(ARGV[0], ARGV[1], ARGV[2], ARGV[3]).perform