2019-05-02 18:17:27 -04:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
2019-02-28 15:59:36 -05:00
|
|
|
require "csv"
|
|
|
|
require "reverse_markdown"
|
|
|
|
require_relative "base"
|
|
|
|
require_relative "base/generic_database"
|
|
|
|
|
|
|
|
# Call it like this:
|
|
|
|
# RAILS_ENV=production bundle exec ruby script/import_scripts/answerbase.rb DIRNAME
|
|
|
|
class ImportScripts::Answerbase < ImportScripts::Base
|
|
|
|
# Base URL of the old Answerbase site. Links starting with this URL are
# candidates for rewriting (see QUESTION_LINK_REGEX).
OLD_DOMAIN = "http://answerbase.example.com" # without trailing slash
# Base URL that rewritten question links are joined onto.
NEW_DOMAIN = "https://discourse.example.com"

# Sub-directories of the export directory in which files are looked up.
AVATAR_DIRECTORY = "User Images"
ANSWER_ATTACHMENT_DIRECTORY = "Answer Attachments"
ANSWER_IMAGE_DIRECTORY = "Answer Images"
QUESTION_ATTACHMENT_DIRECTORY = "Question Attachments"
QUESTION_IMAGE_DIRECTORY = "Question Images"

# Matches an <a> element that wraps a single <img> and whose href carries a
# `relativeUrl=` parameter; the parameter value is captured as `path`.
EMBEDDED_IMAGE_REGEX = %r{<a[^>]*href="[^"]*relativeUrl=(?<path>[^"\&]*)[^"]*"[^>]*>\s*<img[^>]*>\s*</a>}i

# Matches an <a> element pointing at OLD_DOMAIN whose URL contains `q<digits>`
# or `questionid=<digits>`; captures the digits as `id` and the link body as
# `text`.
QUESTION_LINK_REGEX = %r{<a[^>]*?href="#{Regexp.escape(OLD_DOMAIN)}/[^"]*?(?:q|questionid=)(?<id>\d+)[^"]*?"[^>]*>(?<text>.*?)</a>}i

# Permalink normalization (regex/replacement pair) that reduces an old
# question URL to its `q<digits>` part.
TOPIC_LINK_NORMALIZATION = '/.*?-(q\d+).*/\1'

# Batch size handed to the database helper and to the batching loop.
BATCH_SIZE = 1_000
|
|
|
|
|
|
|
|
# Sets up the importer for one Answerbase export.
#
# @param path [String] directory that holds the exported CSV files and the
#   attachment/image sub-directories; it is also handed to the intermediate
#   database helper.
def initialize(path)
  super()

  @path = path

  db_options = { batch_size: BATCH_SIZE, recreate: true, numeric_keys: true }
  @db = ImportScripts::GenericDatabase.new(@path, **db_options)
end
|
|
|
|
|
|
|
|
# Runs the whole import, step by step.
# NOTE(review): presumably invoked by the base class' `perform` - confirm.
def execute
  # Stage 1: load the CSV export into the intermediate database.
  read_csv_files

  # Stage 2: register the permalink normalization for old question URLs.
  add_permalink_normalizations

  # Stage 3: create the records, in dependency order.
  import_categories
  import_users
  import_topics
  import_posts
end
|
|
|
|
|
|
|
|
# Reads the three export files (categories, users and
# questions-answers-comments) and stores their rows in the intermediate
# database.
#
# Rows of the questions-answers-comments file carry no reference to their
# parent question; every non-question row is attached to the most recent
# question row that preceded it.
#
# @raise [StandardError] re-raises any error hit while storing a
#   question/answer/comment row, after printing the offending row.
def read_csv_files
  puts "", "reading CSV files..."

  # Categories get their position from the order in which they appear.
  category_position = 0
  csv_parse("categories") do |row|
    @db.insert_category(id: row[:id], name: row[:name], position: category_position += 1)
  end

  csv_parse("users") do |row|
    @db.insert_user(
      id: row[:id],
      email: row[:email],
      username: row[:username],
      bio: row[:description],
      avatar_path: row[:profile_image],
      created_at: parse_date(row[:createtime]),
      active: true,
    )
  end

  last_topic_id = nil
  csv_parse("questions-answers-comments") do |row|
    is_question = row[:type] == "Question"

    if row[:published] == "No"
      # An unpublished question is not imported. Forget the current parent so
      # that the rows following it are not attached to the previous question.
      last_topic_id = nil if is_question
      next
    end

    # A reply without an imported parent question cannot be attached anywhere.
    next if !is_question && last_topic_id.nil?

    begin
      # Looked up inside the `begin` so that a failure here also prints the row.
      user_id = @db.get_user_id(row[:username])
      created_at = parse_datetime(row[:createtime])

      if is_question
        attachments =
          parse_filenames(row[:attachments], QUESTION_ATTACHMENT_DIRECTORY) +
            parse_filenames(row[:images], QUESTION_IMAGE_DIRECTORY)

        @db.insert_topic(
          id: row[:id],
          title: row[:title],
          raw: row[:text],
          category_id: row[:categorylist],
          user_id: user_id,
          created_at: created_at,
          attachments: attachments,
        )
        last_topic_id = row[:id]
      else
        attachments =
          parse_filenames(row[:attachments], ANSWER_ATTACHMENT_DIRECTORY) +
            parse_filenames(row[:images], ANSWER_IMAGE_DIRECTORY)

        @db.insert_post(
          id: row[:id],
          raw: row[:text],
          topic_id: last_topic_id,
          user_id: user_id,
          created_at: created_at,
          attachments: attachments,
        )
      end
    rescue StandardError
      # Show which row could not be stored before aborting the import.
      p row
      raise
    end
  end
end
|
|
|
|
|
|
|
|
# Turns a semicolon-separated list of file names into full paths below
# `<export directory>/<directory>`.
#
# @param text [String, nil] file names separated by ";"
# @param directory [String] sub-directory of the export directory
# @return [Array<String>] one path per listed file name; empty when `text` is
#   blank
def parse_filenames(text, directory)
  if text.blank?
    []
  else
    names = text.split(";")
    names.map { |name| File.join(@path, directory, name.strip) }
  end
end
|
|
|
|
|
|
|
|
# Parses a date written as month/day/two-digit-year (e.g. "05/02/19").
#
# @param text [String, nil]
# @return [DateTime, nil] nil when `text` is blank
def parse_date(text)
  DateTime.strptime(text, "%m/%d/%y") unless text.blank?
end
|
|
|
|
|
|
|
|
# Parses a free-form timestamp and converts it to UTC.
#
# @param text [String, nil]
# @return [DateTime, nil] nil when `text` is blank
def parse_datetime(text)
  return if text.blank?

  # An explicit format was tried before and is kept for reference:
  # DateTime.strptime(text, "%m/%d/%Y %H:%M")
  parsed = DateTime.parse(text)
  parsed.utc.to_datetime
end
|
|
|
|
|
|
|
|
# Creates a category for every category row stored in the intermediate
# database.
def import_categories
  puts "", "creating categories"

  create_categories(@db.fetch_categories) do |category|
    {
      id: category["id"],
      name: category["name"],
      description: category["description"],
      position: category["position"],
    }
  end
end
|
|
|
|
|
|
|
|
# Delegates to the superclass' `batches` with this importer's BATCH_SIZE,
# forwarding the caller's block.
def batches(&block)
  super(BATCH_SIZE, &block)
end
|
|
|
|
|
|
|
|
# Creates users from the rows stored in the intermediate database, one batch
# at a time. A batch is skipped when all of its ids are reported as already
# imported.
def import_users
  puts "", "creating users"
  total_count = @db.count_users
  last_id = 0

  batches do |offset|
    users, last_id = @db.fetch_users(last_id)
    break if users.empty?

    imported_ids = users.map { |user| user["id"] }
    next if all_records_exist?(:users, imported_ids)

    create_users(users, total: total_count, offset: offset) do |user|
      {
        id: user["id"],
        email: user["email"],
        username: user["username"],
        bio_raw: user["bio"],
        created_at: user["created_at"],
        # The flag is compared against the integer 1.
        active: user["active"] == 1,
        # Upload the avatar once the user record has been created.
        post_create_action: proc { |created_user| create_avatar(created_user, user["avatar_path"]) },
      }
    end
  end
end
|
|
|
|
|
|
|
|
# Uploads the avatar file of a freshly created user.
#
# @param user the user the avatar belongs to; passed through to the uploader
# @param avatar_path [String, nil] file name relative to AVATAR_DIRECTORY
# @return whatever the uploader returns, or nil when there is no avatar or the
#   file is missing (a message is written to STDERR in the latter case)
def create_avatar(user, avatar_path)
  return if avatar_path.blank?

  full_path = File.join(@path, AVATAR_DIRECTORY, avatar_path)

  unless File.exist?(full_path)
    STDERR.puts "Could not find avatar: #{full_path}"
    return
  end

  @uploader.create_avatar(user, full_path)
end
|
|
|
|
|
|
|
|
# Creates a topic (first post) for every stored question, one batch at a
# time, and registers a `q<id>` permalink for each created topic.
def import_topics
  puts "", "creating topics"
  total_count = @db.count_topics
  last_id = 0

  batches do |offset|
    topics, last_id = @db.fetch_topics(last_id)
    break if topics.empty?

    imported_ids = topics.map { |topic| topic["id"] }
    next if all_records_exist?(:posts, imported_ids)

    create_posts(topics, total: total_count, offset: offset) do |topic|
      # Attachments are only fetched when the row reports uploads.
      attachments = topic["upload_count"] > 0 ? @db.fetch_topic_attachments(topic["id"]) : nil
      # Topics of users that were not imported are assigned to the system user.
      user_id = user_id_from_imported_user_id(topic["user_id"]) || Discourse.system_user.id
      # A question without text uses its title as body.
      body = topic["raw"].presence || topic["title"]

      {
        id: topic["id"],
        title: topic["title"],
        raw: raw_with_attachments(body, attachments, user_id),
        category: category_id_from_imported_category_id(topic["category_id"]),
        user_id: user_id,
        created_at: topic["created_at"],
        closed: topic["closed"] == 1,
        post_create_action:
          proc do |post|
            permalink_url = "q#{topic["id"]}"
            unless permalink_exists?(permalink_url)
              Permalink.create(url: permalink_url, topic_id: post.topic.id)
            end
          end,
      }
    end
  end
end
|
|
|
|
|
|
|
|
# Creates a post for every stored answer/comment, one batch at a time.
#
# Rows whose parent topic cannot be found among the imported topics are
# skipped by returning nil from the `create_posts` block.
# NOTE(review): this relies on `create_posts` treating a nil block result as
# "skip this row" - confirm against the base class.
def import_posts
  puts "", "creating posts"
  total_count = @db.count_posts
  last_row_id = 0

  batches do |offset|
    rows, last_row_id = @db.fetch_posts(last_row_id)
    break if rows.empty?

    next if all_records_exist?(:posts, rows.map { |row| row["id"] })

    create_posts(rows, total: total_count, offset: offset) do |row|
      topic = topic_lookup_from_imported_post_id(row["topic_id"])
      # Without a parent topic `topic[:topic_id]` would raise NoMethodError
      # and abort the whole import.
      next unless topic

      # Attachments are only fetched when the row reports uploads.
      attachments = @db.fetch_post_attachments(row["id"]) if row["upload_count"] > 0
      # Posts of users that were not imported are assigned to the system user.
      user_id = user_id_from_imported_user_id(row["user_id"]) || Discourse.system_user.id

      {
        id: row["id"],
        raw: raw_with_attachments(row["raw"], attachments, user_id),
        user_id: user_id,
        topic_id: topic[:topic_id],
        created_at: row["created_at"],
      }
    end
  end
end
|
|
|
|
|
|
|
|
# Builds the final post text: uploads embedded images, rewrites links to
# other questions, converts the HTML with ReverseMarkdown and appends markup
# for every remaining attachment.
#
# @param raw [String, nil] HTML body from the export; nil is treated as an
#   empty body
# @param attachments [Array<Hash>, nil] rows with a "path" entry; nil means
#   no attachments
# @param user_id [Integer] owner of the created uploads
# @return [String] the converted text
def raw_with_attachments(raw, attachments, user_id)
  # A post without text would otherwise fail on `nil.gsub` further down.
  raw, embedded_paths, upload_ids = replace_embedded_attachments(raw.to_s, user_id)
  raw = replace_question_links(raw)
  raw = ReverseMarkdown.convert(raw) || ""

  attachments&.each do |attachment|
    path = attachment["path"]
    # Already inserted inline while replacing embedded images.
    next if embedded_paths.include?(path)

    unless File.exist?(path)
      STDERR.puts "Could not find file: #{path}"
      next
    end

    filename = File.basename(path)
    upload = @uploader.create_upload(user_id, path, filename)
    # Skip failed uploads and uploads that are already embedded in the text.
    next unless upload.present? && upload.persisted? && !upload_ids.include?(upload.id)

    raw = "#{raw}\n#{@uploader.html_for_upload(upload, filename)}"
  end

  raw
end
|
|
|
|
|
|
|
|
# Replaces every embedded image link (see EMBEDDED_IMAGE_REGEX) with the
# markup of a newly created upload.
#
# Links whose image file cannot be found, or whose upload could not be
# created, are replaced with an empty string.
#
# @param raw [String] HTML body
# @param user_id [Integer] owner of the created uploads
# @return [Array(String, Array<String>, Array)] the rewritten text, the file
#   paths that were embedded and the ids of the created uploads
def replace_embedded_attachments(raw, user_id)
  paths = []
  upload_ids = []

  raw =
    raw.gsub(EMBEDDED_IMAGE_REGEX) do
      linked_path = File.join(@path, Regexp.last_match["path"])
      filename = File.basename(linked_path)
      # Only the file name is used; the file is searched for in the image
      # directories.
      image_path = find_image_path(filename)

      # Anything other than a String means that no existing file was found.
      unless image_path.is_a?(String)
        # Report the path taken from the link; the lookup result carries no
        # usable path here.
        STDERR.puts "Could not find file: #{linked_path}"
        next ""
      end

      upload = @uploader.create_upload(user_id, image_path, filename)
      next "" unless upload.present? && upload.persisted?

      paths << image_path
      upload_ids << upload.id
      @uploader.html_for_upload(upload, filename)
    end

  [raw, paths, upload_ids]
end
|
|
|
|
|
|
|
|
# Looks for an image file in the question and answer image directories, in
# that order.
#
# @param filename [String] bare file name of the image
# @return [String, nil] path of the first existing file, or nil when the file
#   is in neither directory (`each` would return the directory list instead,
#   which callers treat as a hit)
def find_image_path(filename)
  [QUESTION_IMAGE_DIRECTORY, ANSWER_IMAGE_DIRECTORY]
    .map { |directory| File.join(@path, directory, filename) }
    .find { |path| File.exist?(path) }
end
|
|
|
|
|
|
|
|
# Rewrites links to questions on the old site (see QUESTION_LINK_REGEX) so
# that they point at the imported topics on NEW_DOMAIN.
#
# Links to questions that cannot be found among the imported topics are left
# untouched. A link whose text contains OLD_DOMAIN is replaced with the bare
# new URL; any other link keeps its text.
#
# @param raw [String] HTML body
# @return [String] the text with rewritten links
def replace_question_links(raw)
  raw.gsub(QUESTION_LINK_REGEX) do
    match = Regexp.last_match
    topic = topic_lookup_from_imported_post_id(match[:id])
    # `next`, not `return`: returning from inside the block would leave the
    # whole method and hand back only this one link instead of the full text.
    next match.to_s unless topic

    url = File.join(NEW_DOMAIN, topic[:url])
    text = match[:text]
    text.include?(OLD_DOMAIN) ? url : "<a href='#{url}'>#{text}</a>"
  end
end
|
|
|
|
|
|
|
|
# Makes sure TOPIC_LINK_NORMALIZATION is part of the pipe-separated
# `permalink_normalizations` site setting, keeping the existing entries.
def add_permalink_normalizations
  current = SiteSetting.permalink_normalizations

  normalizations = []
  normalizations = current.split("|") unless current.blank?

  add_normalization(normalizations, TOPIC_LINK_NORMALIZATION)

  SiteSetting.permalink_normalizations = normalizations.join("|")
end
|
|
|
|
|
|
|
|
# Appends `normalization` to `normalizations` (in place) unless it is already
# in the list.
#
# @param normalizations [Array<String>] list that is modified
# @param normalization [String] entry to add
# @return [Array<String>, nil] the list when the entry was added, nil
#   otherwise
def add_normalization(normalizations, normalization)
  return if normalizations.include?(normalization)

  normalizations << normalization
end
|
|
|
|
|
|
|
|
# Tells whether a permalink with the given URL has already been created.
#
# Uses `exists?` so that a boolean is returned and no Permalink record has to
# be loaded just to test for presence.
#
# @param url [String] permalink URL, e.g. "q123"
# @return [Boolean]
def permalink_exists?(url)
  Permalink.exists?(url: url)
end
|
|
|
|
|
|
|
|
# Reads `<export directory>/<table_name>.csv` and yields every row.
#
# Headers are converted to symbols, blank lines are skipped and the file is
# opened with the "bom|utf-8" encoding.
#
# @param table_name [String] file name without the ".csv" extension
# @yieldparam row [CSV::Row]
def csv_parse(table_name)
  csv_path = File.join(@path, "#{table_name}.csv")
  options = { headers: true, header_converters: :symbol, skip_blanks: true, encoding: "bom|utf-8" }

  CSV.foreach(csv_path, **options) { |row| yield row }
end
|
|
|
|
end
|
|
|
|
|
|
|
|
# Script entry point: the first command line argument must name an existing
# directory that contains the Answerbase export.
import_directory = ARGV[0]

if import_directory.nil? || !Dir.exist?(import_directory)
  puts "", "Usage:", "", "bundle exec ruby script/import_scripts/answerbase.rb DIRNAME", ""
  exit 1
end

ImportScripts::Answerbase.new(import_directory).perform
|