# frozen_string_literal: true require_relative 'base' require 'tiny_tds' # Import script for Telligent communities # # It's really hard to find all attachments, but the script tries to do it anyway. # # You can supply a JSON file if you need to map and ignore categories during the import # by providing the path to the file in the `CATEGORY_MAPPING` environment variable. # You can also add tags to remapped categories and remap multiple old forums into one # category. Here's an example of such a `mapping.json` file: # # { # "ignored_forum_ids": [41, 360, 378], # # "mapping": [ # { # "category": ["New Category 1"], # "forums": [ # { "id": 348, "tag": "some_tag" }, # { "id": 347, "tag": "another_tag" } # ] # }, # { # "category": ["New Category 2"], # "forums": [ # { "id": 9 } # ] # }, # { # "category": ["Nested", "Category"], # "forums": [ # { "id": 322 } # ] # } # ] # } class ImportScripts::Telligent < ImportScripts::Base BATCH_SIZE ||= 1000 LOCAL_AVATAR_REGEX ||= /\A~\/.*(?communityserver-components-(?:selectable)?avatars)\/(?[^\/]+)\/(?.+)/i REMOTE_AVATAR_REGEX ||= /\Ahttps?:\/\//i ATTACHMENT_REGEXES ||= [ /]*\shref="[^"]*?\/cfs-file(?:systemfile)?(?:\.ashx)?\/__key\/(?[^\/]+)\/(?[^\/]+)\/(?.+?)".*?>.*?<\/a>/i, /]*\ssrc="[^"]*?\/cfs-file(?:systemfile)?(?:\.ashx)?\/__key\/(?[^\/]+)\/(?[^\/]+)\/(?.+?)".*?>/i, /\[View:[^\]]*?\/cfs-file(?:systemfile)?(?:\.ashx)?\/__key\/(?[^\/]+)\/(?[^\/]+)\/(?.+?)(?:\:[:\d\s]*?)?\]/i, /\[(?img|url)\][^\[]*?cfs-file(?:systemfile)?(?:\.ashx)?\/__key\/(?[^\/]+)\/(?[^\/]+)\/(?.+?)\[\/\k\]/i, /\[(?img|url)=[^\[]*?cfs-file(?:systemfile)?(?:\.ashx)?\/__key\/(?[^\/]+)\/(?[^\/]+)\/(?.+?)\][^\[]*?\[\/\k\]/i ] PROPERTY_NAMES_REGEX ||= /(?\w+):S:(?\d+):(?\d+):/ CATEGORY_LINK_NORMALIZATION = '/.*?(f\/\d+)$/\1' TOPIC_LINK_NORMALIZATION = '/.*?(f\/\d+\/t\/\d+)$/\1' UNICODE_REPLACEMENTS = { "5F00" => "_", "2800" => "(", "2900" => ")", "2D00" => "-", "2C00" => ",", "2700" => "'", "5B00" => "[", "5D00" => "]", "3D00" => "=", "2600" => "&", "2100" => "!", "2300" => "#", "7E00" => "~", "2500" => "%", "2E00" => ".", "4000" => "@", "2B00" => "+", "2400" => "$", "1920" => "’", "E900" => "é", "E000" => "à", "F300" => "ó", "1C20" => "“", "1D20" => "”", "B000" => "°", "0003" => ["0300".to_i(16)].pack("U"), "0103" => ["0301".to_i(16)].pack("U") } def initialize super() @client = TinyTds::Client.new( host: ENV["DB_HOST"], username: ENV["DB_USERNAME"], password: ENV["DB_PASSWORD"], database: ENV["DB_NAME"], timeout: 60 # the user query is very slow ) @filestore_root_directory = ENV["FILE_BASE_DIR"] @files = {} SiteSetting.tagging_enabled = true end def execute add_permalink_normalizations index_filestore import_categories import_users import_topics import_posts mark_topics_as_solved end def index_filestore puts "", "Indexing filestore..." index_directory(@filestore_root_directory) end def import_users puts "", "Importing users..." user_conditions = <<~SQL ( EXISTS(SELECT 1 FROM te_Forum_Threads t WHERE t.UserId = u.UserID) OR EXISTS(SELECT 1 FROM te_Forum_ThreadReplies r WHERE r.UserId = u.UserID) ) SQL last_user_id = -1 total_count = count(<<~SQL) SELECT COUNT(1) AS count FROM cs_Users u WHERE #{user_conditions} SQL import_count = 0 loop do rows = query(<<~SQL) SELECT TOP #{BATCH_SIZE} u.UserID, u.Email, u.UserName, u.CreateDate, ap.PropertyNames AP_PropertyNames, ap.PropertyValuesString AS AP_PropertyValues, up.PropertyNames UP_PropertyNames, up.PropertyValues AS UP_PropertyValues FROM cs_Users u LEFT OUTER JOIN aspnet_Profile ap ON ap.UserId = u.MembershipID LEFT OUTER JOIN cs_UserProfile up ON up.UserID = u.UserID WHERE u.UserID > #{last_user_id} AND #{user_conditions} ORDER BY UserID SQL break if rows.blank? last_user_id = rows[-1]["UserID"] if all_records_exist?(:users, rows.map { |row| row["UserID"] }) import_count += rows.size next end create_users(rows, total: total_count, offset: import_count) do |row| ap_properties = parse_properties(row["AP_PropertyNames"], row["AP_PropertyValues"]) up_properties = parse_properties(row["UP_PropertyNames"], row["UP_PropertyValues"]) { id: row["UserID"], email: row["Email"], username: row["UserName"], name: ap_properties["commonName"], created_at: row["CreateDate"], bio_raw: html_to_markdown(ap_properties["bio"]), location: ap_properties["location"], website: ap_properties["webAddress"], post_create_action: proc do |user| import_avatar(user, up_properties["avatarUrl"]) suspend_user(user, up_properties["BannedUntil"], up_properties["UserBanReason"]) end } end import_count += rows.size end end # TODO move into base importer (create_user) and use consistent error handling def import_avatar(user, avatar_url) return if @filestore_root_directory.blank? || avatar_url.blank? || avatar_url.include?("anonymous") if match_data = avatar_url.match(LOCAL_AVATAR_REGEX) avatar_path = File.join(@filestore_root_directory, match_data[:directory].gsub("-", "."), match_data[:path].split("-"), match_data[:filename]) if File.file?(avatar_path) @uploader.create_avatar(user, avatar_path) else STDERR.puts "Could not find avatar: #{avatar_path}" end elsif avatar_url.match?(REMOTE_AVATAR_REGEX) UserAvatar.import_url_for_user(avatar_url, user) rescue nil end end def suspend_user(user, banned_until, ban_reason) return if banned_until.blank? if banned_until = DateTime.parse(banned_until) > DateTime.now user.suspended_till = banned_until user.suspended_at = DateTime.now user.save! StaffActionLogger.new(Discourse.system_user).log_user_suspend(user, ban_reason) end end def import_categories if ENV['CATEGORY_MAPPING'] import_mapped_forums_as_categories else import_groups_and_forums_as_categories end end def import_mapped_forums_as_categories puts "", "Importing categories..." json = JSON.parse(File.read(ENV['CATEGORY_MAPPING'])) categories = [] @forum_ids_to_tags = {} @ignored_forum_ids = json["ignored_forum_ids"] json["mapping"].each do |m| parent_id = nil last_index = m["category"].size - 1 forum_ids = [] m["forums"].each do |f| forum_ids << f["id"] @forum_ids_to_tags[f["id"]] = f["tag"] if f["tag"].present? end m["category"].each_with_index do |name, index| id = Digest::MD5.hexdigest(name) categories << { id: id, name: name, parent_id: parent_id, forum_ids: index == last_index ? forum_ids : nil } parent_id = id end end create_categories(categories) do |c| if category_id = category_id_from_imported_category_id(c[:id]) map_forum_ids(category_id, c[:forum_ids]) nil else { id: c[:id], name: c[:name], parent_category_id: category_id_from_imported_category_id(c[:parent_id]), post_create_action: proc do |category| map_forum_ids(category.id, c[:forum_ids]) end } end end end def map_forum_ids(category_id, forum_ids) return if forum_ids.blank? forum_ids.each do |id| url = "f/#{id}" Permalink.create(url: url, category_id: category_id) unless Permalink.exists?(url: url) add_category(id, Category.find_by_id(category_id)) end end def import_groups_and_forums_as_categories puts "", "Importing parent categories..." parent_categories = query(<<~SQL) SELECT GroupID, Name, HtmlDescription, DateCreated, SortOrder FROM cs_Groups g WHERE (SELECT COUNT(1) FROM te_Forum_Forums f WHERE f.GroupId = g.GroupID) > 1 ORDER BY SortOrder, Name SQL create_categories(parent_categories) do |row| { id: "G#{row['GroupID']}", name: clean_category_name(row["Name"]), description: html_to_markdown(row["HtmlDescription"]), position: row["SortOrder"] } end puts "", "Importing child categories..." child_categories = query(<<~SQL) SELECT ForumId, GroupId, Name, Description, DateCreated, SortOrder FROM te_Forum_Forums ORDER BY GroupId, SortOrder, Name SQL create_categories(child_categories) do |row| parent_category_id = parent_category_id_for(row) if category_id = replace_with_category_id(child_categories, parent_category_id) add_category(row['ForumId'], Category.find_by_id(category_id)) url = "f/#{row['ForumId']}" Permalink.create(url: url, category_id: category_id) unless Permalink.exists?(url: url) nil else { id: row['ForumId'], parent_category_id: parent_category_id, name: clean_category_name(row["Name"]), description: html_to_markdown(row["Description"]), position: row["SortOrder"], post_create_action: proc do |category| url = "f/#{row['ForumId']}" Permalink.create(url: url, category_id: category.id) unless Permalink.exists?(url: url) end } end end end def parent_category_id_for(row) category_id_from_imported_category_id("G#{row['GroupId']}") if row.key?("GroupId") end def replace_with_category_id(child_categories, parent_category_id) parent_category_id if only_child?(child_categories, parent_category_id) end def only_child?(child_categories, parent_category_id) count = 0 child_categories.each do |row| count += 1 if parent_category_id_for(row) == parent_category_id end count == 1 end def clean_category_name(name) CGI.unescapeHTML(name) .strip end def import_topics puts "", "Importing topics..." last_topic_id = -1 total_count = count("SELECT COUNT(1) AS count FROM te_Forum_Threads t WHERE #{ignored_forum_sql_condition}") batches do |offset| rows = query(<<~SQL) SELECT TOP #{BATCH_SIZE} t.ThreadId, t.ForumId, t.UserId, t.TotalViews, t.ContentID AS TopicContentId, t.Subject, t.Body, t.DateCreated, t.IsLocked, t.StickyDate, a.ApplicationTypeId, a.ApplicationId, a.ApplicationContentTypeId, a.ContentId, a.FileName, a.IsRemote FROM te_Forum_Threads t LEFT JOIN te_Attachments a ON (a.ApplicationId = t.ForumId AND a.ApplicationTypeId = 0 AND a.ContentId = t.ThreadId AND a.ApplicationContentTypeId = 0) WHERE t.ThreadId > #{last_topic_id} AND #{ignored_forum_sql_condition} ORDER BY t.ThreadId SQL break if rows.blank? last_topic_id = rows[-1]["ThreadId"] next if all_records_exist?(:post, rows.map { |row| import_topic_id(row["ThreadId"]) }) create_posts(rows, total: total_count, offset: offset) do |row| user_id = user_id_from_imported_user_id(row["UserId"]) || Discourse::SYSTEM_USER_ID post = { id: import_topic_id(row["ThreadId"]), title: CGI.unescapeHTML(row["Subject"]), raw: raw_with_attachment(row, user_id, :topic), category: category_id_from_imported_category_id(row["ForumId"]), user_id: user_id, created_at: row["DateCreated"], closed: row["IsLocked"], views: row["TotalViews"], post_create_action: proc do |action_post| topic = action_post.topic Jobs.enqueue_at(topic.pinned_until, :unpin_topic, topic_id: topic.id) if topic.pinned_until url = "f/#{row['ForumId']}/t/#{row['ThreadId']}" Permalink.create(url: url, topic_id: topic.id) unless Permalink.exists?(url: url) import_topic_views(topic, row["TopicContentId"]) end } if row["StickyDate"] > Time.now post[:pinned_until] = row["StickyDate"] post[:pinned_at] = row["DateCreated"] end post end end end def import_topic_id(topic_id) "T#{topic_id}" end def import_topic_views(topic, content_id) last_user_id = -1 batches do |_| rows = query(<<~SQL) SELECT TOP #{BATCH_SIZE} UserId, MAX(CreatedUtcDate) AS ViewDate FROM te_Content_Views WHERE ContentId = '#{content_id}' AND UserId > #{last_user_id} GROUP BY UserId ORDER BY UserId SQL break if rows.blank? last_user_id = rows[-1]["UserId"] rows.each do |row| user_id = user_id_from_imported_user_id(row["UserId"]) TopicViewItem.add(topic.id, "127.0.0.1", user_id, row["ViewDate"], true) if user_id end end end def ignored_forum_sql_condition @ignored_forum_sql_condition ||= @ignored_forum_ids.present? \ ? "t.ForumId NOT IN (#{@ignored_forum_ids.join(',')})" \ : "1 = 1" end def import_posts puts "", "Importing posts..." last_post_id = -1 total_count = count(<<~SQL) SELECT COUNT(1) AS count FROM te_Forum_ThreadReplies tr JOIN te_Forum_Threads t ON (tr.ThreadId = t.ThreadId) WHERE #{ignored_forum_sql_condition} SQL batches do |offset| rows = query(<<~SQL) SELECT TOP #{BATCH_SIZE} tr.ThreadReplyId, tr.ThreadId, tr.UserId, pr.ThreadReplyId AS ParentReplyId, tr.Body, tr.ThreadReplyDate, CONVERT(BIT, CASE WHEN tr.AnswerVerifiedUtcDate IS NOT NULL AND NOT EXISTS( SELECT 1 FROM te_Forum_ThreadReplies x WHERE x.ThreadId = tr.ThreadId AND x.ThreadReplyId < tr.ThreadReplyId AND x.AnswerVerifiedUtcDate IS NOT NULL ) THEN 1 ELSE 0 END) AS IsFirstVerifiedAnswer, a.ApplicationTypeId, a.ApplicationId, a.ApplicationContentTypeId, a.ContentId, a.FileName, a.IsRemote FROM te_Forum_ThreadReplies tr JOIN te_Forum_Threads t ON (tr.ThreadId = t.ThreadId) LEFT JOIN te_Forum_ThreadReplies pr ON (tr.ParentReplyId = pr.ThreadReplyId AND tr.ParentReplyId < tr.ThreadReplyId AND tr.ThreadId = pr.ThreadId) LEFT JOIN te_Attachments a ON (a.ApplicationId = t.ForumId AND a.ApplicationTypeId = 0 AND a.ContentId = tr.ThreadReplyId AND a.ApplicationContentTypeId = 1) WHERE tr.ThreadReplyId > #{last_post_id} AND #{ignored_forum_sql_condition} ORDER BY tr.ThreadReplyId SQL break if rows.blank? last_post_id = rows[-1]["ThreadReplyId"] next if all_records_exist?(:post, rows.map { |row| row["ThreadReplyId"] }) create_posts(rows, total: total_count, offset: offset) do |row| imported_parent_id = row["ParentReplyId"]&.nonzero? ? row["ParentReplyId"] : import_topic_id(row["ThreadId"]) parent_post = topic_lookup_from_imported_post_id(imported_parent_id) user_id = user_id_from_imported_user_id(row["UserId"]) || Discourse::SYSTEM_USER_ID if parent_post post = { id: row["ThreadReplyId"], raw: raw_with_attachment(row, user_id, :post), user_id: user_id, topic_id: parent_post[:topic_id], created_at: row["ThreadReplyDate"], reply_to_post_number: parent_post[:post_number] } post[:custom_fields] = { is_accepted_answer: "true" } if row["IsFirstVerifiedAnswer"] post else puts "Failed to import post #{row['ThreadReplyId']}. Parent was not found." end end end end def index_directory(root_directory) Dir.foreach(root_directory) do |directory_name| next if directory_name == "." || directory_name == ".." path = File.join(root_directory, directory_name) if File.directory?(path) index_directory(path) else path.delete_prefix!(@filestore_root_directory) path.delete_prefix!("/") @files[path.downcase] = path end end end def raw_with_attachment(row, user_id, type) raw, embedded_paths, upload_ids = replace_embedded_attachments(row["Body"], user_id) raw = html_to_markdown(raw) || "" filename = row["FileName"] return raw if @filestore_root_directory.blank? || filename.blank? if row["IsRemote"] return "#{raw}\n#{filename}" end path = File.join( "telligent.evolution.components.attachments", "%02d" % row["ApplicationTypeId"], "%02d" % row["ApplicationId"], "%02d" % row["ApplicationContentTypeId"], ("%010d" % row["ContentId"]).scan(/.{2}/) ) path = fix_attachment_path(path, filename) if path && !embedded_paths.include?(path) if File.file?(path) upload = @uploader.create_upload(user_id, path, filename) if upload.present? && upload.persisted? && !upload_ids.include?(upload.id) raw = "#{raw}\n#{@uploader.html_for_upload(upload, filename)}" end else id = type == :topic ? row['ThreadId'] : row['ThreadReplyId'] STDERR.puts "Could not find file for #{type} #{id}: #{path}" end end raw end def replace_embedded_attachments(raw, user_id) paths = [] upload_ids = [] return [raw, paths, upload_ids] if @filestore_root_directory.blank? ATTACHMENT_REGEXES.each do |regex| raw = raw.gsub(regex) do match_data = Regexp.last_match path = File.join(match_data[:directory], match_data[:path]) fixed_path = fix_attachment_path(path, match_data[:filename]) if fixed_path && File.file?(fixed_path) filename = File.basename(fixed_path) upload = @uploader.create_upload(user_id, fixed_path, filename) if upload.present? && upload.persisted? paths << fixed_path upload_ids << upload.id @uploader.html_for_upload(upload, filename) end else path = File.join(path, match_data[:filename]) STDERR.puts "Could not find file: #{path}" match_data[0] end end end [raw, paths, upload_ids] end def fix_attachment_path(base_path, filename) path = find_correct_path(base_path, filename) return path if attachment_exists?(path) base_path.downcase! path = find_correct_path(base_path, filename) return path if attachment_exists?(path) filename = CGI.unescapeHTML(filename) path = find_correct_path(base_path, filename) return path if attachment_exists?(path) filename.gsub!("-", " ") filename.strip! path = find_correct_path(base_path, filename) return path if attachment_exists?(path) directories = base_path.split(File::SEPARATOR) first_directory = directories.shift first_directory.gsub!("-", ".") base_path = File.join(first_directory, directories) path = find_correct_path(base_path, filename) return path if attachment_exists?(path) directories.map! { |d| File.join(d.split(/[\.\-]/).map(&:strip)) } base_path = File.join(first_directory, directories) path = find_correct_path(base_path, filename) return path if attachment_exists?(path) directories = base_path.split(File::SEPARATOR) directories.map! { |d| d.gsub("+", " ").strip } base_path = File.join(directories) path = find_correct_path(base_path, filename) return path if attachment_exists?(path) replace_codes!(filename) path = find_correct_path(base_path, filename) return path if attachment_exists?(path) replace_codes!(base_path) path = find_correct_path(base_path, filename) return path if attachment_exists?(path) filename.gsub!(/(?:\:\d+)+$/, "") path = find_correct_path(base_path, filename) return path if attachment_exists?(path) path = File.join(base_path, filename) path_regex = Regexp.new("^#{Regexp.escape(path)}-\\d+x\\d+\\.\\w+$", Regexp::IGNORECASE) path = find_correct_path_with_regex(path_regex) return path if attachment_exists?(path) nil end def find_correct_path(base_path, filename) path = File.join(base_path, filename) path = @files[path.downcase] path ? File.join(@filestore_root_directory, path) : nil end def find_correct_path_with_regex(regex) keys = @files.keys.filter { |key| regex =~ key } keys.size == 1 ? File.join(@filestore_root_directory, @files[keys.first]) : nil end def attachment_exists?(path) path.present? && File.file?(path) end def replace_codes!(text) text.gsub!(/_(\h{4}+)_/i) do codes = Regexp.last_match[1].upcase.scan(/.{4}/) mapped_codes = codes.map { |c| UNICODE_REPLACEMENTS[c] } mapped_codes.any? { |c| c.nil? } ? Regexp.last_match[0] : mapped_codes.join("") end end def html_to_markdown(html) return html if html.blank? md = HtmlToMarkdown.new(html).to_markdown md.gsub!(/\[quote.*?\]/, "\n" + '\0' + "\n") md.gsub!(/(?