require 'csv'
require 'reverse_markdown'
require_relative 'base'
require_relative 'base/generic_database'

# Imports the CSV files and attachment directories found in DIRNAME
# (categories.csv, users.csv, questions-answers-comments.csv).
#
# Call it like this:
#   RAILS_ENV=production bundle exec ruby script/import_scripts/answerbase.rb DIRNAME
class ImportScripts::Answerbase < ImportScripts::Base
  OLD_DOMAIN = "http://answerbase.example.com" # without trailing slash
  NEW_DOMAIN = "https://discourse.example.com"

  # Sub-directories of DIRNAME that hold the exported files.
  AVATAR_DIRECTORY = "User Images"
  ANSWER_ATTACHMENT_DIRECTORY = "Answer Attachments"
  ANSWER_IMAGE_DIRECTORY = "Answer Images"
  QUESTION_ATTACHMENT_DIRECTORY = "Question Attachments"
  QUESTION_IMAGE_DIRECTORY = "Question Images"

  # Matches an <a> wrapping an <img> whose href carries a `relativeUrl=` query
  # parameter; the parameter value is captured as `path`.
  EMBEDDED_IMAGE_REGEX = /<a[^>]*href="[^"]*relativeUrl=(?<path>[^"\&]*)[^"]*"[^>]*>\s*<img[^>]*>\s*<\/a>/i

  # Matches an <a> pointing at a question on OLD_DOMAIN; captures the numeric
  # question id as `id` and the link text as `text`.
  QUESTION_LINK_REGEX = /<a[^>]*?href="#{Regexp.escape(OLD_DOMAIN)}\/[^"]*?(?:q|questionid=)(?<id>\d+)[^"]*?"[^>]*>(?<text>.*?)<\/a>/i

  # Permalink normalization that reduces an old question URL to "q<ID>",
  # which is the URL format stored by import_topics.
  TOPIC_LINK_NORMALIZATION = '/.*?-(q\d+).*/\1'

  BATCH_SIZE = 1000

  # path - directory containing the CSV files and attachment directories.
  def initialize(path)
    super()

    @path = path
    @db = ImportScripts::GenericDatabase.new(
      @path,
      batch_size: BATCH_SIZE,
      recreate: true,
      numeric_keys: true
    )
  end

  # Runs the import: loads the CSV files into the intermediate database,
  # registers the permalink normalization, then creates categories, users,
  # topics and posts in that order.
  def execute
    read_csv_files

    add_permalink_normalizations
    import_categories
    import_users
    import_topics
    import_posts
  end

  # Reads the three CSV files and stores their rows in @db.
  def read_csv_files
    puts "", "reading CSV files..."

    category_position = 0
    csv_parse("categories") do |row|
      @db.insert_category(
        id: row[:id],
        name: row[:name],
        position: category_position += 1
      )
    end

    csv_parse("users") do |row|
      @db.insert_user(
        id: row[:id],
        email: row[:email],
        username: row[:username],
        bio: row[:description],
        avatar_path: row[:profile_image],
        created_at: parse_date(row[:createtime]),
        active: true
      )
    end

    # Non-question rows are attached to the most recently seen question row.
    # NOTE(review): this assumes every question precedes its answers/comments
    # in the CSV, and an unpublished question does not update last_topic_id,
    # so any published rows following it would attach to the previous
    # question -- confirm against real export data.
    last_topic_id = nil

    csv_parse("questions-answers-comments") do |row|
      next if row[:published] == "No"

      user_id = @db.get_user_id(row[:username])
      created_at = parse_datetime(row[:createtime])

      begin
        if row[:type] == "Question"
          attachments = parse_filenames(row[:attachments], QUESTION_ATTACHMENT_DIRECTORY) +
            parse_filenames(row[:images], QUESTION_IMAGE_DIRECTORY)

          @db.insert_topic(
            id: row[:id],
            title: row[:title],
            raw: row[:text],
            category_id: row[:categorylist],
            user_id: user_id,
            created_at: created_at,
            attachments: attachments
          )
          last_topic_id = row[:id]
        else
          attachments = parse_filenames(row[:attachments], ANSWER_ATTACHMENT_DIRECTORY) +
            parse_filenames(row[:images], ANSWER_IMAGE_DIRECTORY)

          @db.insert_post(
            id: row[:id],
            raw: row[:text],
            topic_id: last_topic_id,
            user_id: user_id,
            created_at: created_at,
            attachments: attachments
          )
        end
      rescue
        # Print the offending row to ease debugging, then re-raise.
        p row
        raise
      end
    end
  end

  # Splits a ';'-separated list of filenames and returns the full path of
  # each one inside `directory`. Returns [] for blank input.
  def parse_filenames(text, directory)
    return [] if text.blank?

    text
      .split(';')
      .map { |filename| File.join(@path, directory, filename.strip) }
  end

  # Parses a "month/day/two-digit-year" date; returns nil for blank input.
  def parse_date(text)
    return nil if text.blank?
    DateTime.strptime(text, "%m/%d/%y")
  end

  # Parses a free-form timestamp and converts it to UTC; returns nil for
  # blank input.
  def parse_datetime(text)
    return nil if text.blank?
    # DateTime.strptime(text, "%m/%d/%Y %H:%M")
    DateTime.parse(text).utc.to_datetime
  end

  # Creates categories from the rows stored in @db.
  def import_categories
    puts "", "creating categories"
    rows = @db.fetch_categories

    create_categories(rows) do |row|
      {
        id: row['id'],
        name: row['name'],
        description: row['description'],
        position: row['position']
      }
    end
  end

  # Delegates to the parent implementation with this script's batch size.
  def batches
    super(BATCH_SIZE)
  end

  # Creates users batch by batch; each user's avatar is uploaded from
  # post_create_action.
  def import_users
    puts "", "creating users"
    total_count = @db.count_users
    last_id = 0

    batches do |offset|
      rows, last_id = @db.fetch_users(last_id)
      break if rows.empty?

      next if all_records_exist?(:users, rows.map { |row| row['id'] })

      create_users(rows, total: total_count, offset: offset) do |row|
        {
          id: row['id'],
          email: row['email'],
          username: row['username'],
          bio_raw: row['bio'],
          created_at: row['created_at'],
          active: row['active'] == 1,
          post_create_action: proc do |user|
            create_avatar(user, row['avatar_path'])
          end
        }
      end
    end
  end

  # Uploads the avatar file for `user`, or logs to STDERR when the file is
  # missing. Does nothing when avatar_path is blank.
  def create_avatar(user, avatar_path)
    return if avatar_path.blank?
    avatar_path = File.join(@path, AVATAR_DIRECTORY, avatar_path)

    if File.exist?(avatar_path)
      @uploader.create_avatar(user, avatar_path)
    else
      STDERR.puts "Could not find avatar: #{avatar_path}"
    end
  end

  # Creates topics batch by batch and registers a "q<ID>" permalink for each.
  def import_topics
    puts "", "creating topics"
    total_count = @db.count_topics
    last_id = 0

    batches do |offset|
      rows, last_id = @db.fetch_topics(last_id)
      break if rows.empty?

      next if all_records_exist?(:posts, rows.map { |row| row['id'] })

      create_posts(rows, total: total_count, offset: offset) do |row|
        attachments = @db.fetch_topic_attachments(row['id']) if row['upload_count'] > 0
        user_id = user_id_from_imported_user_id(row['user_id']) || Discourse.system_user.id

        {
          id: row['id'],
          title: row['title'],
          # Fall back to the title when the question has no body text.
          raw: raw_with_attachments(row['raw'].presence || row['title'], attachments, user_id),
          category: category_id_from_imported_category_id(row['category_id']),
          user_id: user_id,
          created_at: row['created_at'],
          closed: row['closed'] == 1,
          post_create_action: proc do |post|
            url = "q#{row['id']}"
            Permalink.create(url: url, topic_id: post.topic.id) unless permalink_exists?(url)
          end
        }
      end
    end
  end

  # Creates replies batch by batch.
  def import_posts
    puts "", "creating posts"
    total_count = @db.count_posts
    last_row_id = 0

    batches do |offset|
      rows, last_row_id = @db.fetch_posts(last_row_id)
      break if rows.empty?

      next if all_records_exist?(:posts, rows.map { |row| row['id'] })

      create_posts(rows, total: total_count, offset: offset) do |row|
        topic = topic_lookup_from_imported_post_id(row['topic_id'])
        attachments = @db.fetch_post_attachments(row['id']) if row['upload_count'] > 0
        user_id = user_id_from_imported_user_id(row['user_id']) || Discourse.system_user.id

        {
          id: row['id'],
          raw: raw_with_attachments(row['raw'], attachments, user_id),
          user_id: user_id,
          topic_id: topic[:topic_id],
          created_at: row['created_at']
        }
      end
    end
  end

  # Converts the exported HTML to Markdown and appends the attachments.
  #
  # raw         - HTML body of the question/answer.
  # attachments - rows responding to ['path'], or nil when there are none.
  # user_id     - id of the user who owns the uploads.
  #
  # Attachments already embedded in the body (same path or same upload id)
  # are not appended a second time. Returns the resulting String.
  def raw_with_attachments(raw, attachments, user_id)
    raw, embedded_paths, upload_ids = replace_embedded_attachments(raw, user_id)
    raw = replace_question_links(raw)
    raw = ReverseMarkdown.convert(raw) || ""

    attachments&.each do |attachment|
      path = attachment['path']
      next if embedded_paths.include?(path)

      if File.exist?(path)
        filename = File.basename(path)
        upload = @uploader.create_upload(user_id, path, filename)

        if upload.present? && upload.persisted? && !upload_ids.include?(upload.id)
          raw << "\n" << @uploader.html_for_upload(upload, filename)
        end
      else
        STDERR.puts "Could not find file: #{path}"
      end
    end

    raw
  end

  # Replaces every embedded image link with the markup of its upload.
  #
  # Links whose file cannot be found, or whose upload is not persisted, are
  # replaced by an empty string (the block yields nil for them).
  #
  # Returns [raw, paths, upload_ids] where `paths` and `upload_ids` describe
  # the successfully embedded uploads.
  def replace_embedded_attachments(raw, user_id)
    paths = []
    upload_ids = []

    raw = raw.gsub(EMBEDDED_IMAGE_REGEX) do
      exported_path = File.join(@path, Regexp.last_match['path'])
      filename = File.basename(exported_path)
      path = find_image_path(filename)

      if path
        upload = @uploader.create_upload(user_id, path, filename)

        if upload.present? && upload.persisted?
          paths << path
          upload_ids << upload.id
          @uploader.html_for_upload(upload, filename)
        end
      else
        # Report the path taken from the link; `path` is nil at this point.
        STDERR.puts "Could not find file: #{exported_path}"
        nil
      end
    end

    [raw, paths, upload_ids]
  end

  # Looks for `filename` in the question and answer image directories.
  # Returns the first existing path, or nil when the file is in neither.
  def find_image_path(filename)
    [QUESTION_IMAGE_DIRECTORY, ANSWER_IMAGE_DIRECTORY]
      .map { |directory| File.join(@path, directory, filename) }
      .find { |path| File.exist?(path) }
  end

  # Rewrites links to questions on OLD_DOMAIN so that they point at the
  # imported topic on NEW_DOMAIN. Links to questions that were not imported
  # are left untouched. Returns the rewritten String.
  def replace_question_links(raw)
    raw.gsub(QUESTION_LINK_REGEX) do
      match = Regexp.last_match
      topic = topic_lookup_from_imported_post_id(match[:id])

      # `next` (not `return`) so that only this match is left as-is instead
      # of aborting the whole substitution.
      next match.to_s unless topic

      url = File.join(NEW_DOMAIN, topic[:url])
      text = match[:text]
      # When the link text is itself an old URL, emit the bare new URL.
      text.include?(OLD_DOMAIN) ? url : "<a href='#{url}'>#{text}</a>"
    end
  end

  # Adds TOPIC_LINK_NORMALIZATION to the '|'-separated
  # permalink_normalizations site setting unless it is already present.
  def add_permalink_normalizations
    normalizations = SiteSetting.permalink_normalizations
    normalizations = normalizations.blank? ? [] : normalizations.split('|')

    add_normalization(normalizations, TOPIC_LINK_NORMALIZATION)

    SiteSetting.permalink_normalizations = normalizations.join('|')
  end

  # Appends `normalization` to `normalizations` unless already included.
  def add_normalization(normalizations, normalization)
    normalizations << normalization unless normalizations.include?(normalization)
  end

  # Returns the Permalink with the given URL (truthy) or nil.
  def permalink_exists?(url)
    Permalink.find_by(url: url)
  end

  # Yields every row of "<table_name>.csv" in @path. Headers are converted to
  # symbols, blank lines are skipped and a UTF-8 BOM is tolerated.
  def csv_parse(table_name)
    CSV.foreach(File.join(@path, "#{table_name}.csv"),
                headers: true,
                header_converters: :symbol,
                skip_blanks: true,
                encoding: 'bom|utf-8') { |row| yield row }
  end
end

unless ARGV[0] && Dir.exist?(ARGV[0])
  puts "", "Usage:", "", "bundle exec ruby script/import_scripts/answerbase.rb DIRNAME", ""
  exit 1
end

ImportScripts::Answerbase.new(ARGV[0]).perform